From 7df65e002fb8f2e3cdc3912601efc723b0f3e3e9 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 28 Apr 2025 11:41:26 +0100 Subject: [PATCH 01/16] Add a device_id string to the is_available_on_device method --- src/psyclone/psyir/nodes/call.py | 6 +- src/psyclone/psyir/nodes/intrinsic_call.py | 76 ++++++++++--------- .../tests/psyir/nodes/intrinsic_call_test.py | 20 ++++- src/psyclone/transformations.py | 14 +++- 4 files changed, 76 insertions(+), 40 deletions(-) diff --git a/src/psyclone/psyir/nodes/call.py b/src/psyclone/psyir/nodes/call.py index 6dd0d44aba..739b4c5dab 100644 --- a/src/psyclone/psyir/nodes/call.py +++ b/src/psyclone/psyir/nodes/call.py @@ -389,12 +389,14 @@ def is_pure(self): return self.routine.symbol.is_pure return None - def is_available_on_device(self): + def is_available_on_device(self, device_string: str = "") -> bool: ''' + :param device_string: optional string to identify the offloading + device (or its compiler-platform family). :returns: whether this call is available on an accelerated device. - :rtype: bool ''' + # pylint: disable=unused-argument return False @property diff --git a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index 970fffe26e..c030202d54 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -770,44 +770,24 @@ def intrinsic(self): ''' return self.routine.symbol.intrinsic - # This is not part of the intrinsic enum, because its ValueError could - # change for different devices, and in the future we may want to pass - # a device/arch/compiler parameter or look at the configuration file. - # Currently it is inspired from: https://docs.nvidia.com/hpc-sdk/ - # compilers/hpc-compilers-user-guide/#acc-fort-intrin-sum - # But that list is incomplete (e.g. SUM is supported and not listed) - def is_available_on_device(self): + def is_available_on_device(self, device_string: str = "") -> bool: ''' + :param device_string: optional string to identify the offloading + device (or its compiler-platform family). :returns: whether this intrinsic is available on an accelerated device. - :rtype: bool ''' - return self.intrinsic in ( - IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS, - IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT, - IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN, - IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS, - IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE, - IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP, - IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR, - IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR, - IntrinsicCall.Intrinsic.LOG, - IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN, - IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT, - IntrinsicCall.Intrinsic.NOT, - IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN, - IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT, - IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH, - IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE, - # The ones below can be offloaded but provide numerical differences - # even with the -gpu=uniform_math flag, ideally it should be - # configurable if these are allowed or not. - # IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL, - # The one below are not documented on nvidia compiler - IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, - IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, - IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL, - IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE) + if not device_string: + device_string = "nvfortran-all" + + if device_string == "nvfortran-all": + return self.intrinsic in NVFORTRAN_ALL + if device_string == "nvfortran-repr": + return self.intrinsic in NVFORTRAN_REPRODUCIBLE + + raise ValueError( + f"Unsupported device_string value '{device_string}', the supported" + " values are '' (default), 'nvfortran-all', 'nvfortran-repr'") @classmethod def create(cls, intrinsic, arguments=()): @@ -972,6 +952,34 @@ def is_inquiry(self): return self.intrinsic.is_inquiry +# Intrinsics available on nvidia gpus with reproducible results when +# compiled with the nvfortran "-gpu=uniform_math" flag +NVFORTRAN_REPRODUCIBLE = ( + IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS, + IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT, + IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN, + IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS, + IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE, + IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP, + IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR, + IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR, + IntrinsicCall.Intrinsic.LOG, IntrinsicCall.Intrinsic.NOT, + IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN, + IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT, + IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN, + IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT, + IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH, + IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE, + IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, + IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, + IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL, + IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE +) + +# All nvfortran intrinsics available on GPUs +NVFORTRAN_ALL = NVFORTRAN_REPRODUCIBLE + ( + IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL) + # TODO #658 this can be removed once we have support for determining the # type of a PSyIR expression. # Intrinsics that perform a reduction on an array. diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index 6eec6ca32c..824287e847 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -137,11 +137,11 @@ def test_intrinsiccall_is_inquiry(): (IntrinsicCall.Intrinsic.INT, True), (IntrinsicCall.Intrinsic.IOR, True), (IntrinsicCall.Intrinsic.LOG, True), - (IntrinsicCall.Intrinsic.LOG10, False), + (IntrinsicCall.Intrinsic.LOG10, True), (IntrinsicCall.Intrinsic.MOD, True), (IntrinsicCall.Intrinsic.NINT, True), (IntrinsicCall.Intrinsic.NOT, True), - (IntrinsicCall.Intrinsic.REAL, False), + (IntrinsicCall.Intrinsic.REAL, True), (IntrinsicCall.Intrinsic.SIGN, True), (IntrinsicCall.Intrinsic.SIN, True), (IntrinsicCall.Intrinsic.SINH, True), @@ -158,6 +158,22 @@ def test_intrinsiccall_is_available_on_device(intrinsic, result): assert intrinsic_call.is_available_on_device() is result +def test_intrinsiccall_is_available_on_device_with_device_string(): + '''Tests that the is_available_on_device() method with a device_string + argument provides different results with the 'nvfortran-reprod' + ''' + intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.LOG10) + assert not intrinsic_call.is_available_on_device("nvfortran-repr") + intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.REAL) + assert not intrinsic_call.is_available_on_device("nvfortran-repr") + + with pytest.raises(ValueError) as err: + assert not intrinsic_call.is_available_on_device("invalid") + assert ("Unsupported device_string value 'invalid', the supported values" + " are '' (default), 'nvfortran-all', 'nvfortran-repr'" + in str(err.value)) + + def test_intrinsiccall_alloc_create(): '''Tests the create() method supports various forms of 'allocate'. diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 93c813e012..ff7d5c44fc 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -378,6 +378,8 @@ def validate_it_can_run_on_gpu(self, node, options): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param bool options["device_string"]: provide a compiler-platform + identifier strign. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. @@ -393,6 +395,7 @@ def validate_it_can_run_on_gpu(self, node, options): routines. ''' force = options.get("force", False) if options else False + device_string = options.get("device_string", "") if options else "" if not isinstance(node, (Kern, Routine)): raise TransformationError( @@ -489,7 +492,7 @@ def validate_it_can_run_on_gpu(self, node, options): calls = kernel_schedule.walk(Call) for call in calls: - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): call_str = call.debug_string().rstrip("\n") raise TransformationError( f"{k_or_r} '{node.name}' calls another routine " @@ -1658,9 +1661,16 @@ def validate(self, node_list, options=None): f"The provided 'default_present' option must be a " f"boolean, but found '{options['default_present']}'." ) + if options is not None and "device_string" in options: + if not isinstance(options["device_string"], str): + raise TransformationError( + f"The provided 'device_string' option must be a " + f"boolean, but found '{options['device_string']}'." + ) + device_string = options.get("device_string", "") if options else "" for node in node_list: for call in node.walk(Call): - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): raise TransformationError( f"'{call.routine.name}' is not available on the " f"accelerator device, and therefore it cannot " From 56db78dcadabd4b46d768ac860507b89b1168e2f Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 3 Jun 2025 15:25:29 +0100 Subject: [PATCH 02/16] #2856 Add REPRODUCIBLE configuration in NEMO OpenMP script --- examples/nemo/scripts/omp_gpu_trans.py | 9 +++++++-- examples/nemo/scripts/utils.py | 8 +++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 8077255505..a27bb7ac88 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -54,6 +54,9 @@ # By default, we don't do module inlining as it's still under development. INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) +# By default, we allow all device intrinsics (not only the reproducible ones) +REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) + # This environment variable informs if this is targeting NEMOv4, in which case # array privatisation is disabled and some more files excluded NEMOV4 = os.environ.get('NEMOV4', False) @@ -198,7 +201,8 @@ def trans(psyir): region_directive_trans=omp_target_trans, loop_directive_trans=omp_gpu_loop_trans, collapse=True, - privatise_arrays=False + privatise_arrays=False, + reproducible_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") @@ -207,7 +211,8 @@ def trans(psyir): region_directive_trans=omp_target_trans, loop_directive_trans=omp_gpu_loop_trans, collapse=True, - privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES) + privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES), + reproducible_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES: # This have issues offloading, but we can still do OpenMP threading diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8084e2e637..8b086acb7f 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -411,6 +411,7 @@ def insert_explicit_loop_parallelism( loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, + reproducible_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive as an ancestor, attempt to insert the given region and loop directives. @@ -429,6 +430,8 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. + :param reproducible_intrinsics_only: if True it prevent offloading loops + with non-reproducible device intrinsics. ''' if schedule.name == "ts_wgt": @@ -439,7 +442,10 @@ def insert_explicit_loop_parallelism( continue # Skip if an outer loop is already parallelised opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, - "verbose": True, "nowait": True} + "verbose": True, "nowait": False} + + if reproducible_intrinsics_only: + opts["device_sring"] = "nvfortran-repr" routine_name = loop.ancestor(Routine).name From 8fc3122e0840af66db439158f22da27036138bb3 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 3 Jun 2025 15:49:46 +0100 Subject: [PATCH 03/16] #2856 Test PROFILING and non-reproducible NEMOv5 BENCH builds --- .github/workflows/nemo_v5_tests.yml | 25 ++++++++++++--- .../scripts/KGOs/arch-linux_spack_profile.fcm | 31 +++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 41e759604f..be2cadfd67 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -215,21 +215,38 @@ jobs: export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS - # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. + # We test BENCH with injecting the profiling hooks cd $NEMO_DIR - cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profiling.fcm arch/arch-linux_spack_profiling.fcm + export ENABLE_PROFILING=1 + # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" - + export REPRODUCIBLE=1 + # Clean up and compile rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ -j 4 -v 1 - # Run test + # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg OMP_NUM_THREADS=4 mpirun -np 1 ./nemo diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat + + # Now do a fast-build (without reproducible or profiling option, which have a big impact + # for BENCH due to some inner-loop REAL intrinsics) + unset REPRODUCIBLE + unset ENABLE_PROFILING + export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" + rm -rf tests/${TEST_DIR} + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + -j 4 -v 1 + + # Run non-reproducible test + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + OMP_NUM_THREADS=4 mpirun -np 1 ./nemo export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm new file mode 100644 index 0000000000..2e6c8df745 --- /dev/null +++ b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm @@ -0,0 +1,31 @@ +# This fcm file is intended to be used with the psyclone-spack nemo-build-environment recipe +# which will populate all environment variables but PSYCLONE_HOME and FCFLAGS, which should +# be populated manually for the desired target. For example, using: +# $ spack load nemo-build-environment%nvhpc +# $ export PSYCLONE_HOME=${PWD}/.venv +# $ export FCFLAGS="-i4 -Mr8 -O3 -Minline -Mcray=pointer -Mpre -mp" + +%PSYCLONE_HOME ${PSYCLONE_HOME} +%NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include +%NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf + +%PROFILE_INC -I${PROFILING_DIR} +%PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt + + + +%CPP cpp -Dkey_nosignedzero +%FC ${MPIF90} -c +%FCFLAGS ${FCFLAGS} +%FFLAGS %FCFLAGS +%LD ${MPIF90} +%LDFLAGS ${FCFLAGS} +%FPPFLAGS -P -traditional +%AR ar +%ARFLAGS rs +%MK make +%USER_INC %NCDF_INC %PROFILE_INC +%USER_LIB %NCDF_LIB %PROFILE_LIB + +%CC ${CC} +%CFLAGS -O2 From 04fa5e018f6f289cb70c5699dcdb551ae43f4d8e Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 3 Jun 2025 16:36:25 +0100 Subject: [PATCH 04/16] #2856 Fix issues with device_string option --- examples/nemo/scripts/utils.py | 5 +++-- src/psyclone/psyir/transformations/omp_target_trans.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8b086acb7f..deb07b9690 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -445,7 +445,7 @@ def insert_explicit_loop_parallelism( "verbose": True, "nowait": False} if reproducible_intrinsics_only: - opts["device_sring"] = "nvfortran-repr" + opts["device_string"] = "nvfortran-repr" routine_name = loop.ancestor(Routine).name @@ -493,7 +493,8 @@ def insert_explicit_loop_parallelism( # And if successful, the region directive on top. if region_directive_trans: - region_directive_trans.apply(loop.parent.parent) + import pdb; pdb.set_trace() + region_directive_trans.apply(loop.parent.parent, options=opts) except TransformationError: # This loop cannot be transformed, proceed to next loop. # The parallelisation restrictions will be explained with a comment diff --git a/src/psyclone/psyir/transformations/omp_target_trans.py b/src/psyclone/psyir/transformations/omp_target_trans.py index 5c5d3a5221..071bc274ab 100644 --- a/src/psyclone/psyir/transformations/omp_target_trans.py +++ b/src/psyclone/psyir/transformations/omp_target_trans.py @@ -158,11 +158,12 @@ def validate(self, node, options=None): :raises TransformationError: if its a function and the target region attempts to enclose the assingment setting the return value. ''' + device_string = options.get("device_string", "") if options else "" node_list = self.get_node_list(node) super().validate(node, options) for node in node_list: for call in node.walk(Call): - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): raise TransformationError( f"'{call.routine.name}' is not available on the " f"accelerator device, and therefore it cannot " From b89ca8033fc786d39a69ef16c2fb7e015897bdfb Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 3 Jun 2025 16:37:12 +0100 Subject: [PATCH 05/16] #2856 Remove debug statement --- examples/nemo/scripts/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index deb07b9690..e1fa1cf384 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -493,7 +493,6 @@ def insert_explicit_loop_parallelism( # And if successful, the region directive on top. if region_directive_trans: - import pdb; pdb.set_trace() region_directive_trans.apply(loop.parent.parent, options=opts) except TransformationError: # This loop cannot be transformed, proceed to next loop. From 5c3b22ab92bf941ac67ef661bc51c015aa9f3d42 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 09:28:36 +0100 Subject: [PATCH 06/16] #2856 Fix CI action mistake --- .github/workflows/nemo_v5_tests.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index be2cadfd67..29090012fc 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -214,28 +214,29 @@ jobs: export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC - # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS - # We test BENCH with injecting the profiling hooks + # Set up FCM: PATHs are loaded from SPACK, we only need to set the + # environemnt variables to chose the flags and enable PROFILING hooks + # and prevent non-reporducible intrinsics cd $NEMO_DIR - cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profiling.fcm arch/arch-linux_spack_profiling.fcm + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profile.fcm arch/arch-linux_spack_profile.fcm export ENABLE_PROFILING=1 # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 - # Clean up and compile rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ -j 4 -v 1 # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg OMP_NUM_THREADS=4 mpirun -np 1 ./nemo + # We can compare all digits for this build diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat - # Now do a fast-build (without reproducible or profiling option, which have a big impact - # for BENCH due to some inner-loop REAL intrinsics) + # Now do a fast-build (without reproducible or profiling options, which have a + # big impact for BENCH due to some inner-loop REAL intrinsics) unset REPRODUCIBLE unset ENABLE_PROFILING export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" From d435ee2cddb89729aea98496eebc7cad5b1d133c Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 09:58:00 +0100 Subject: [PATCH 07/16] #2856 NEMO integration tests builds the profiling lib --- .github/workflows/nemo_v5_tests.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 29090012fc..6a10acdda0 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -214,9 +214,14 @@ jobs: export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC - # Set up FCM: PATHs are loaded from SPACK, we only need to set the - # environemnt variables to chose the flags and enable PROFILING hooks - # and prevent non-reporducible intrinsics + # Make sure the profiling wrapper is compiled with the same compiler + export PROFILING_DIR=${GITHUB_WORKSPACE}/lib/profiling/nvidia/ + cd $PROFILING_DIR + make clean + F90=$MPIF90 make + + # First do a debug-build: set the environemnt variables to use flags and intrinsics + # with numerically reproducible results and enable PROFILING hooks cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profile.fcm arch/arch-linux_spack_profile.fcm export ENABLE_PROFILING=1 From 3de5e8b4a4657dca4cb886369fd6c33f240934fd Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 10:34:46 +0100 Subject: [PATCH 08/16] #2856 Fix NEMO integration test --- .github/workflows/nemo_v5_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 6a10acdda0..18c9a1081d 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -242,6 +242,7 @@ jobs: # Now do a fast-build (without reproducible or profiling options, which have a # big impact for BENCH due to some inner-loop REAL intrinsics) + cd $NEMO_DIR unset REPRODUCIBLE unset ENABLE_PROFILING export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" From c74e509934381ec5d0e8831c8dd9ae6d009452bd Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 12:08:14 +0100 Subject: [PATCH 09/16] #2856 Exclude one more nemo file --- examples/nemo/scripts/omp_gpu_trans.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index a27bb7ac88..89ccdfd32b 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -100,6 +100,7 @@ "trczdf.f90", "trcice_pisces.f90", "dtatsd.f90", + "trcatf.f90", ] From 78a06be6b7773f1c04916e92c4234a1b70594c80 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 16:29:43 +0100 Subject: [PATCH 10/16] #2856 Add tests for the device_id transformation options --- .../kernel_transformation_test.py | 33 +++++++++++++++++++ .../transformations/omp_target_trans_test.py | 21 ++++++++++++ 2 files changed, 54 insertions(+) diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index de6e9d2e55..0b9aae6d02 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -454,6 +454,39 @@ def test_kernel_gpu_annotation_trans(rtrans, expected_directive, assert expected_directive in code +@pytest.mark.parametrize( + "rtrans", + [ACCRoutineTrans(), OMPDeclareTargetTrans()]) +def test_kernel_gpu_annotation_device_id(rtrans, fortran_reader): + ''' Check that the GPU annotation transformations validations + check the intrinsics using the provided device id. ''' + + code = ''' + function myfunc(a) + integer :: a + real :: myfunc + myfunc = REAL(a) + end function + ''' + psyir = fortran_reader.psyir_from_source(code) + routine = psyir.children[0] + # The routine is valid + rtrans.validate(routine) + # But not if we are targeting "nvidia-repr" or an invalid device + with pytest.raises(TransformationError) as err: + rtrans.validate(routine, options={'device_string': + 'nvfortran-repr'}) + assert ("routine 'myfunc' calls another routine 'REAL(a)' which is not " + "available on the accelerator device" + in str(err.value)) + with pytest.raises(ValueError) as err: + rtrans.validate(routine, options={'device_string': + 'unknown-device'}) + assert ("Unsupported device_string value 'unknown-device', the supported " + "values are '' (default), 'nvfortran-all', 'nvfortran-repr'" + in str(err.value)) + + def test_1kern_trans(kernel_outputdir): ''' Check that we generate the correct code when an invoke contains the same kernel more than once but only one of them is transformed. ''' diff --git a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py index 21a256a6e8..53ea44f293 100644 --- a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py +++ b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py @@ -129,6 +129,11 @@ def test_omptargettrans_validate(fortran_reader): char = 'a' // 'b' end do end do + do i = 1, 10 + do j = 1, 10 + A(i, j) = LOG10(3) + end do + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) @@ -151,6 +156,22 @@ def test_omptargettrans_validate(fortran_reader): assert ("Nodes of type 'CodeBlock' cannot be enclosed by a OMPTarget" "Trans transformation" in str(err.value)) + # The last loop is valid + omptargettrans.validate(loops[3]) + # But not if we are targeting "nvidia-repr" or an invalid device + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[3], options={'device_string': + 'nvfortran-repr'}) + assert ("'LOG10' is not available on the accelerator device, and therefore" + " it cannot be called from within an OMP Target region" + in str(err.value)) + with pytest.raises(ValueError) as err: + omptargettrans.validate(loops[3], options={'device_string': + 'unknown-device'}) + assert ("Unsupported device_string value 'unknown-device', the supported " + "values are '' (default), 'nvfortran-all', 'nvfortran-repr'" + in str(err.value)) + def test_omptargetrans_apply_nowait(fortran_reader, fortran_writer): '''Test the behaviour of the OMPTargetTrans apply function is as From 69e61a64ab734ed2e0280b92e81532be3e0846aa Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 4 Jun 2025 16:49:20 +0100 Subject: [PATCH 11/16] #2856 Remove unnecessary check --- src/psyclone/transformations.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 0b8d18a5d8..23eec4a2ad 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -1769,12 +1769,6 @@ def validate(self, node_list, options=None): f"The provided 'default_present' option must be a " f"boolean, but found '{options['default_present']}'." ) - if options is not None and "device_string" in options: - if not isinstance(options["device_string"], str): - raise TransformationError( - f"The provided 'device_string' option must be a " - f"boolean, but found '{options['device_string']}'." - ) device_string = options.get("device_string", "") if options else "" for node in node_list: for call in node.walk(Call): From f3b5430e1da1499e2b82407e780a93c1b4abcd47 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 5 Jun 2025 09:20:09 +0100 Subject: [PATCH 12/16] #2856 Fix NEMO tests and add a separate DEFAULT_DEVICE_INTRINSICS global --- .github/workflows/nemo_v5_tests.yml | 2 ++ src/psyclone/psyir/nodes/intrinsic_call.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 18c9a1081d..787fbdc3e9 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -280,6 +280,7 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} @@ -320,6 +321,7 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} diff --git a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index d036c05c08..9ada095a85 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -778,8 +778,7 @@ def is_available_on_device(self, device_string: str = "") -> bool: ''' if not device_string: - device_string = "nvfortran-all" - + return self.intrinsic in DEFAULT_DEVICE_INTRINISCS if device_string == "nvfortran-all": return self.intrinsic in NVFORTRAN_ALL if device_string == "nvfortran-repr": @@ -979,6 +978,9 @@ def is_inquiry(self): NVFORTRAN_ALL = NVFORTRAN_REPRODUCIBLE + ( IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL) +# For now the default intrinsics availabe on GPU are the same as nvfortran-all +DEFAULT_DEVICE_INTRINISCS = NVFORTRAN_ALL + # TODO #658 this can be removed once we have support for determining the # type of a PSyIR expression. # Intrinsics that perform a reduction on an array. From f9a6d7719f9e5ee82ed7f2f3473a30afc221d85b Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Thu, 5 Jun 2025 10:21:39 +0100 Subject: [PATCH 13/16] #2856 Fix missing coverage --- src/psyclone/tests/psyir/nodes/intrinsic_call_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index c78f102036..19470f965d 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -154,7 +154,9 @@ def test_intrinsiccall_is_inquiry(): def test_intrinsiccall_is_available_on_device(intrinsic, result): '''Tests that the is_available_on_device() method works as expected.''' intrinsic_call = IntrinsicCall(intrinsic) + # For now default and nvfortran-all are the same assert intrinsic_call.is_available_on_device() is result + assert intrinsic_call.is_available_on_device('nvfortran-all') is result def test_intrinsiccall_is_available_on_device_with_device_string(): From 6211fcaa3b6b4d090e1a38e205380ff5f9c8d19b Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 9 Jun 2025 15:02:35 +0100 Subject: [PATCH 14/16] #2856 Improve device_string implementation --- examples/nemo/scripts/omp_gpu_trans.py | 4 +-- examples/nemo/scripts/utils.py | 8 ++--- src/psyclone/psyir/nodes/intrinsic_call.py | 15 +++++---- .../psyir/transformations/omp_target_trans.py | 4 +++ .../tests/psyir/nodes/intrinsic_call_test.py | 8 ++--- .../kernel_transformation_test.py | 17 +++++----- .../transformations/omp_target_trans_test.py | 4 +-- src/psyclone/transformations.py | 31 +++++++++++++++++-- 8 files changed, 61 insertions(+), 30 deletions(-) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 89ccdfd32b..89740cad9c 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -203,7 +203,7 @@ def trans(psyir): loop_directive_trans=omp_gpu_loop_trans, collapse=True, privatise_arrays=False, - reproducible_intrinsics_only=REPRODUCIBLE, + uniform_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") @@ -213,7 +213,7 @@ def trans(psyir): loop_directive_trans=omp_gpu_loop_trans, collapse=True, privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES), - reproducible_intrinsics_only=REPRODUCIBLE, + uniform_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES: # This have issues offloading, but we can still do OpenMP threading diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index e1fa1cf384..4f1cd73054 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -411,7 +411,7 @@ def insert_explicit_loop_parallelism( loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, - reproducible_intrinsics_only: bool = False, + uniform_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive as an ancestor, attempt to insert the given region and loop directives. @@ -430,7 +430,7 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. - :param reproducible_intrinsics_only: if True it prevent offloading loops + :param uniform_intrinsics_only: if True it prevent offloading loops with non-reproducible device intrinsics. ''' @@ -444,8 +444,8 @@ def insert_explicit_loop_parallelism( opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, "verbose": True, "nowait": False} - if reproducible_intrinsics_only: - opts["device_string"] = "nvfortran-repr" + if uniform_intrinsics_only: + opts["device_string"] = "nvfortran-uniform" routine_name = loop.ancestor(Routine).name diff --git a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index 9ada095a85..bbfc39d7d8 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -776,17 +776,20 @@ def is_available_on_device(self, device_string: str = "") -> bool: device (or its compiler-platform family). :returns: whether this intrinsic is available on an accelerated device. + :raises ValueError: if the provided 'device_string' is not one of the + supported values. + ''' if not device_string: return self.intrinsic in DEFAULT_DEVICE_INTRINISCS if device_string == "nvfortran-all": return self.intrinsic in NVFORTRAN_ALL - if device_string == "nvfortran-repr": - return self.intrinsic in NVFORTRAN_REPRODUCIBLE + if device_string == "nvfortran-uniform": + return self.intrinsic in NVFORTRAN_UNIFORM raise ValueError( f"Unsupported device_string value '{device_string}', the supported" - " values are '' (default), 'nvfortran-all', 'nvfortran-repr'") + " values are '' (default), 'nvfortran-all', 'nvfortran-uniform'") @classmethod def create(cls, intrinsic, arguments=()): @@ -950,9 +953,9 @@ def is_inquiry(self): return self.intrinsic.is_inquiry -# Intrinsics available on nvidia gpus with reproducible results when +# Intrinsics available on nvidia gpus with uniform (CPU and GPU) results when # compiled with the nvfortran "-gpu=uniform_math" flag -NVFORTRAN_REPRODUCIBLE = ( +NVFORTRAN_UNIFORM = ( IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS, IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT, IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN, @@ -975,7 +978,7 @@ def is_inquiry(self): ) # All nvfortran intrinsics available on GPUs -NVFORTRAN_ALL = NVFORTRAN_REPRODUCIBLE + ( +NVFORTRAN_ALL = NVFORTRAN_UNIFORM + ( IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL) # For now the default intrinsics availabe on GPU are the same as nvfortran-all diff --git a/src/psyclone/psyir/transformations/omp_target_trans.py b/src/psyclone/psyir/transformations/omp_target_trans.py index 071bc274ab..059fb45c75 100644 --- a/src/psyclone/psyir/transformations/omp_target_trans.py +++ b/src/psyclone/psyir/transformations/omp_target_trans.py @@ -152,6 +152,8 @@ def validate(self, node, options=None): :type node: List[:py:class:`psyclone.psyir.nodes.Node`] :param options: a dictionary with options for transformations. :type options: Optional[Dict[str, Any]] + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if it contains calls to routines that are not available in the accelerator device. @@ -190,6 +192,8 @@ def apply(self, node, options=None): :type options: Optional[Dict[str,Any]] :param bool options["nowait"]: whether to add a nowait clause and a corresponding barrier to enable asynchronous execution. + :param str options["device_string"]: provide a compiler-platform + identifier. ''' if not options: diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index 19470f965d..8dbc5c8a1f 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -161,17 +161,17 @@ def test_intrinsiccall_is_available_on_device(intrinsic, result): def test_intrinsiccall_is_available_on_device_with_device_string(): '''Tests that the is_available_on_device() method with a device_string - argument provides different results with the 'nvfortran-reprod' + argument provides different results with the 'nvfortran-uniformod' ''' intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.LOG10) - assert not intrinsic_call.is_available_on_device("nvfortran-repr") + assert not intrinsic_call.is_available_on_device("nvfortran-uniform") intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.REAL) - assert not intrinsic_call.is_available_on_device("nvfortran-repr") + assert not intrinsic_call.is_available_on_device("nvfortran-uniform") with pytest.raises(ValueError) as err: assert not intrinsic_call.is_available_on_device("invalid") assert ("Unsupported device_string value 'invalid', the supported values" - " are '' (default), 'nvfortran-all', 'nvfortran-repr'" + " are '' (default), 'nvfortran-all', 'nvfortran-uniform'" in str(err.value)) diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index 0b9aae6d02..965b2b1ae5 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -429,10 +429,9 @@ def test_gpumixin_validate_no_call(): IntrinsicCall.create(IntrinsicCall.Intrinsic.GET_COMMAND)) with pytest.raises(TransformationError) as err: rtrans.validate(kernel) - assert ("Kernel 'testkern_with_call_code' calls another routine " - "'GET_COMMAND()' which is not available on the accelerator device " - "and therefore cannot have ACCRoutineTrans applied to it " - "(TODO #342)." + assert ("Kernel 'testkern_with_call_code' calls intrinsic 'GET_COMMAND' " + "which is not available by default. Use the 'device_string' " + "option to specify a different device." in str(err.value)) @@ -475,15 +474,15 @@ def test_kernel_gpu_annotation_device_id(rtrans, fortran_reader): # But not if we are targeting "nvidia-repr" or an invalid device with pytest.raises(TransformationError) as err: rtrans.validate(routine, options={'device_string': - 'nvfortran-repr'}) - assert ("routine 'myfunc' calls another routine 'REAL(a)' which is not " - "available on the accelerator device" - in str(err.value)) + 'nvfortran-uniform'}) + assert ("routine 'myfunc' calls intrinsic 'REAL' which is not available in" + " 'nvfortran-uniform'. Use the 'device_string' option to specify a" + " different device." in str(err.value)) with pytest.raises(ValueError) as err: rtrans.validate(routine, options={'device_string': 'unknown-device'}) assert ("Unsupported device_string value 'unknown-device', the supported " - "values are '' (default), 'nvfortran-all', 'nvfortran-repr'" + "values are '' (default), 'nvfortran-all', 'nvfortran-uniform'" in str(err.value)) diff --git a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py index 53ea44f293..fdc91a0e07 100644 --- a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py +++ b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py @@ -161,7 +161,7 @@ def test_omptargettrans_validate(fortran_reader): # But not if we are targeting "nvidia-repr" or an invalid device with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[3], options={'device_string': - 'nvfortran-repr'}) + 'nvfortran-uniform'}) assert ("'LOG10' is not available on the accelerator device, and therefore" " it cannot be called from within an OMP Target region" in str(err.value)) @@ -169,7 +169,7 @@ def test_omptargettrans_validate(fortran_reader): omptargettrans.validate(loops[3], options={'device_string': 'unknown-device'}) assert ("Unsupported device_string value 'unknown-device', the supported " - "values are '' (default), 'nvfortran-all', 'nvfortran-repr'" + "values are '' (default), 'nvfortran-all', 'nvfortran-uniform'" in str(err.value)) diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 9e092ae286..dcbcf0a1d0 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -66,7 +66,7 @@ OMPDeclareTargetDirective, OMPDirective, OMPMasterDirective, OMPParallelDirective, OMPParallelDoDirective, OMPSerialDirective, OMPSingleDirective, OMPTaskloopDirective, PSyDataNode, Return, - Routine, Schedule) + Routine, Schedule, IntrinsicCall) from psyclone.psyir.nodes.acc_mixins import ACCAsyncMixin from psyclone.psyir.nodes.array_mixin import ArrayMixin from psyclone.psyir.nodes.structure_member import StructureMember @@ -378,8 +378,8 @@ def validate_it_can_run_on_gpu(self, node, options): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. - :param bool options["device_string"]: provide a compiler-platform - identifier strign. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. @@ -492,6 +492,15 @@ def validate_it_can_run_on_gpu(self, node, options): calls = kernel_schedule.walk(Call) for call in calls: if not call.is_available_on_device(device_string): + if isinstance(call, IntrinsicCall): + device_str = (f"in '{device_string}'" if device_string else + "by default") + raise TransformationError( + f"{k_or_r} '{node.name}' calls intrinsic " + f"'{call.intrinsic.name}' which is not available " + f"{device_str}. Use the 'device_string' option to " + f"specify a different device." + ) call_str = call.debug_string().rstrip("\n") raise TransformationError( f"{k_or_r} '{node.name}' calls another routine " @@ -554,6 +563,8 @@ def apply(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. ''' self.validate(node, options) @@ -584,6 +595,8 @@ def validate(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. @@ -1773,6 +1786,14 @@ def validate(self, node_list, options=None): for node in node_list: for call in node.walk(Call): if not call.is_available_on_device(device_string): + if isinstance(call, IntrinsicCall): + dstr = (f"in '{device_string}'" if device_string else + "by default") + raise TransformationError( + f"{call.intrinsic.name} is not available " + f"{dstr}. Use the 'device_string' option to " + f"specify a different device." + ) raise TransformationError( f"'{call.routine.name}' is not available on the " f"accelerator device, and therefore it cannot " @@ -2772,6 +2793,8 @@ def apply(self, node, options=None): :param str options["parallelism"]: the level of parallelism that the target routine (or a callee) exposes. One of "seq" (the default), "vector", "worker" or "gang". + :param str options["device_string"]: provide a compiler-platform + identifier. ''' # Check that we can safely apply this transformation @@ -2807,6 +2830,8 @@ def validate(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. From c74108db4b0150d54903555da1b5b2c0109a1623 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Wed, 11 Jun 2025 16:18:54 +0100 Subject: [PATCH 15/16] #2856 Improve configurable intr error message and improve coverage --- .../psyir/transformations/omp_target_trans.py | 6 ++++-- .../tests/psyir/nodes/intrinsic_call_test.py | 2 +- .../kernel_transformation_test.py | 10 +++++----- .../transformations/omp_target_trans_test.py | 10 ++++++---- .../transformations/transformations_test.py | 18 ++++++++++++++++++ src/psyclone/transformations.py | 16 ++++++++++------ 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/psyclone/psyir/transformations/omp_target_trans.py b/src/psyclone/psyir/transformations/omp_target_trans.py index 059fb45c75..5221595c68 100644 --- a/src/psyclone/psyir/transformations/omp_target_trans.py +++ b/src/psyclone/psyir/transformations/omp_target_trans.py @@ -168,8 +168,10 @@ def validate(self, node, options=None): if not call.is_available_on_device(device_string): raise TransformationError( f"'{call.routine.name}' is not available on the " - f"accelerator device, and therefore it cannot " - f"be called from within an OMP Target region.") + f"default accelerator device, and therefore it cannot " + f"be called from within an OMP Target region. Use " + f"the 'device_string' option to specify a different " + f"device.") routine = node.ancestor(Routine) if routine and routine.return_symbol: # if it is a function, the target must not include its return sym diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index 8dbc5c8a1f..249706d77b 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -161,7 +161,7 @@ def test_intrinsiccall_is_available_on_device(intrinsic, result): def test_intrinsiccall_is_available_on_device_with_device_string(): '''Tests that the is_available_on_device() method with a device_string - argument provides different results with the 'nvfortran-uniformod' + argument provides different results with the 'nvfortran-uniform' ''' intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.LOG10) assert not intrinsic_call.is_available_on_device("nvfortran-uniform") diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index 965b2b1ae5..051a3248c7 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -430,8 +430,8 @@ def test_gpumixin_validate_no_call(): with pytest.raises(TransformationError) as err: rtrans.validate(kernel) assert ("Kernel 'testkern_with_call_code' calls intrinsic 'GET_COMMAND' " - "which is not available by default. Use the 'device_string' " - "option to specify a different device." + "which is not available on the default accelerator device. Use " + "the 'device_string' option to specify a different device." in str(err.value)) @@ -475,9 +475,9 @@ def test_kernel_gpu_annotation_device_id(rtrans, fortran_reader): with pytest.raises(TransformationError) as err: rtrans.validate(routine, options={'device_string': 'nvfortran-uniform'}) - assert ("routine 'myfunc' calls intrinsic 'REAL' which is not available in" - " 'nvfortran-uniform'. Use the 'device_string' option to specify a" - " different device." in str(err.value)) + assert ("routine 'myfunc' calls intrinsic 'REAL' which is not available on" + " the 'nvfortran-uniform' device. Use the 'device_string' option " + "to specify a different device." in str(err.value)) with pytest.raises(ValueError) as err: rtrans.validate(routine, options={'device_string': 'unknown-device'}) diff --git a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py index fdc91a0e07..3c5028c3c3 100644 --- a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py +++ b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py @@ -147,8 +147,9 @@ def test_omptargettrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[1]) - assert ("'myfunc' is not available on the accelerator device, and " - "therefore it cannot be called from within an OMP Target region." + assert ("'myfunc' is not available on the default accelerator device, and " + "therefore it cannot be called from within an OMP Target region. " + "Use the 'device_string' option to specify a different device." in str(err.value)) with pytest.raises(TransformationError) as err: @@ -162,8 +163,9 @@ def test_omptargettrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[3], options={'device_string': 'nvfortran-uniform'}) - assert ("'LOG10' is not available on the accelerator device, and therefore" - " it cannot be called from within an OMP Target region" + assert ("'LOG10' is not available on the default accelerator device, and " + "therefore it cannot be called from within an OMP Target region. " + "Use the 'device_string' option to specify a different device." in str(err.value)) with pytest.raises(ValueError) as err: omptargettrans.validate(loops[3], options={'device_string': diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index 3051fc0375..09eafed49e 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -138,6 +138,11 @@ def test_accparalleltrans_validate(fortran_reader): char = 'a' // 'b' end do end do + do i = 1, 10 + do j = 1, 10 + A(i,j) = GET_COMMAND(2) + end do + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) @@ -154,6 +159,19 @@ def test_accparalleltrans_validate(fortran_reader): assert ("Nodes of type 'CodeBlock' cannot be enclosed by a ACCParallel" "Trans transformation" in str(err.value)) + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[2]) + assert ("'GET_COMMAND' is not available on the default accelerator " + "device. Use the 'device_string' option to specify a different " + "device." in str(err.value)) + + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[2], options={'device_string': + 'nvfortran-all'}) + assert ("'GET_COMMAND' is not available on the 'nvfortran-all' device. " + "Use the 'device_string' option to specify a different device." + in str(err.value)) + def test_accenterdata(): ''' Generic tests for the ACCEnterDataTrans class ''' diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index dcbcf0a1d0..efed9eb84e 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -493,8 +493,10 @@ def validate_it_can_run_on_gpu(self, node, options): for call in calls: if not call.is_available_on_device(device_string): if isinstance(call, IntrinsicCall): - device_str = (f"in '{device_string}'" if device_string else - "by default") + if device_string: + device_str = f"on the '{device_string}' device" + else: + device_str = "on the default accelerator device" raise TransformationError( f"{k_or_r} '{node.name}' calls intrinsic " f"'{call.intrinsic.name}' which is not available " @@ -1787,11 +1789,13 @@ def validate(self, node_list, options=None): for call in node.walk(Call): if not call.is_available_on_device(device_string): if isinstance(call, IntrinsicCall): - dstr = (f"in '{device_string}'" if device_string else - "by default") + if device_string: + device_str = f"on the '{device_string}' device" + else: + device_str = "on the default accelerator device" raise TransformationError( - f"{call.intrinsic.name} is not available " - f"{dstr}. Use the 'device_string' option to " + f"'{call.intrinsic.name}' is not available " + f"{device_str}. Use the 'device_string' option to " f"specify a different device." ) raise TransformationError( From a3faca6b8ac7fb88927fca9155cfb34460be7e7a Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Mon, 16 Jun 2025 10:38:34 +0100 Subject: [PATCH 16/16] #2856 Fix device_string error messages --- .../psyir/transformations/omp_target_trans.py | 11 ++++++----- .../kernel_transformation_test.py | 5 +++-- .../transformations/omp_target_trans_test.py | 16 ++++++++-------- .../transformations/transformations_test.py | 6 +++--- src/psyclone/transformations.py | 6 ++++-- 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/psyclone/psyir/transformations/omp_target_trans.py b/src/psyclone/psyir/transformations/omp_target_trans.py index 5221595c68..5c8637d3ff 100644 --- a/src/psyclone/psyir/transformations/omp_target_trans.py +++ b/src/psyclone/psyir/transformations/omp_target_trans.py @@ -166,12 +166,13 @@ def validate(self, node, options=None): for node in node_list: for call in node.walk(Call): if not call.is_available_on_device(device_string): + device_str = device_string if device_string else "default" raise TransformationError( - f"'{call.routine.name}' is not available on the " - f"default accelerator device, and therefore it cannot " - f"be called from within an OMP Target region. Use " - f"the 'device_string' option to specify a different " - f"device.") + f"'{call.routine.name}' is not available on the" + f" '{device_str}' accelerator device, and therefore " + f"it cannot be called from within an OMP Target " + f"region. Use the 'device_string' option to specify a " + f"different device.") routine = node.ancestor(Routine) if routine and routine.return_symbol: # if it is a function, the target must not include its return sym diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index 051a3248c7..7b087a0906 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -476,8 +476,9 @@ def test_kernel_gpu_annotation_device_id(rtrans, fortran_reader): rtrans.validate(routine, options={'device_string': 'nvfortran-uniform'}) assert ("routine 'myfunc' calls intrinsic 'REAL' which is not available on" - " the 'nvfortran-uniform' device. Use the 'device_string' option " - "to specify a different device." in str(err.value)) + " the 'nvfortran-uniform' accelerator device. Use the " + "'device_string' option to specify a different device." + in str(err.value)) with pytest.raises(ValueError) as err: rtrans.validate(routine, options={'device_string': 'unknown-device'}) diff --git a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py index 3c5028c3c3..aa76d8f11f 100644 --- a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py +++ b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py @@ -147,10 +147,10 @@ def test_omptargettrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[1]) - assert ("'myfunc' is not available on the default accelerator device, and " - "therefore it cannot be called from within an OMP Target region. " - "Use the 'device_string' option to specify a different device." - in str(err.value)) + assert ("'myfunc' is not available on the 'default' accelerator device, " + "and therefore it cannot be called from within an OMP Target " + "region. Use the 'device_string' option to specify a different " + "device." in str(err.value)) with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[2]) @@ -163,10 +163,10 @@ def test_omptargettrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[3], options={'device_string': 'nvfortran-uniform'}) - assert ("'LOG10' is not available on the default accelerator device, and " - "therefore it cannot be called from within an OMP Target region. " - "Use the 'device_string' option to specify a different device." - in str(err.value)) + assert ("'LOG10' is not available on the 'nvfortran-uniform' accelerator " + "device, and therefore it cannot be called from within an OMP " + "Target region. Use the 'device_string' option to specify a " + "different device." in str(err.value)) with pytest.raises(ValueError) as err: omptargettrans.validate(loops[3], options={'device_string': 'unknown-device'}) diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index 09eafed49e..e1f003b40a 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -168,9 +168,9 @@ def test_accparalleltrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[2], options={'device_string': 'nvfortran-all'}) - assert ("'GET_COMMAND' is not available on the 'nvfortran-all' device. " - "Use the 'device_string' option to specify a different device." - in str(err.value)) + assert ("'GET_COMMAND' is not available on the 'nvfortran-all' accelerator" + " device. Use the 'device_string' option to specify a different " + "device." in str(err.value)) def test_accenterdata(): diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index efed9eb84e..8ea4d49d97 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -494,7 +494,8 @@ def validate_it_can_run_on_gpu(self, node, options): if not call.is_available_on_device(device_string): if isinstance(call, IntrinsicCall): if device_string: - device_str = f"on the '{device_string}' device" + device_str = (f"on the '{device_string}' accelerator " + f"device") else: device_str = "on the default accelerator device" raise TransformationError( @@ -1790,7 +1791,8 @@ def validate(self, node_list, options=None): if not call.is_available_on_device(device_string): if isinstance(call, IntrinsicCall): if device_string: - device_str = f"on the '{device_string}' device" + device_str = (f"on the '{device_string}' " + f"accelerator device") else: device_str = "on the default accelerator device" raise TransformationError(