diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 41e759604f..787fbdc3e9 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -214,22 +214,46 @@ jobs: export NEMO_DIR=${HOME}/${NEMODIR_NAME} export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC - # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS - # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. + # Make sure the profiling wrapper is compiled with the same compiler + export PROFILING_DIR=${GITHUB_WORKSPACE}/lib/profiling/nvidia/ + cd $PROFILING_DIR + make clean + F90=$MPIF90 make + + # First do a debug-build: set the environment variables to use flags and intrinsics + # with numerically reproducible results and enable PROFILING hooks cd $NEMO_DIR - cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profile.fcm arch/arch-linux_spack_profile.fcm + export ENABLE_PROFILING=1 + # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" - + export REPRODUCIBLE=1 # Clean up and compile rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ -j 4 -v 1 - # Run test + # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg OMP_NUM_THREADS=4 mpirun -np 1 ./nemo + # We can compare all digits for this build diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat + + # Now do a fast-build (without reproducible or profiling options, which have a + # big impact for BENCH due to some inner-loop REAL intrinsics) + cd $NEMO_DIR + unset REPRODUCIBLE + unset ENABLE_PROFILING + export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" + rm -rf tests/${TEST_DIR} + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + -j 4 -v 1 + + # Run non-reproducible test + cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 + cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg + OMP_NUM_THREADS=4 mpirun -np 1 ./nemo export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s) ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ @@ -256,6 +280,7 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} @@ -296,6 +321,7 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} 
diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm new file mode 100644 index 0000000000..2e6c8df745 --- /dev/null +++ b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm @@ -0,0 +1,31 @@ +# This fcm file is intended to be used with the psyclone-spack nemo-build-environment recipe +# which will populate all environment variables but PSYCLONE_HOME and FCFLAGS, which should +# be populated manually for the desired target. For example, using: +# $ spack load nemo-build-environment%nvhpc +# $ export PSYCLONE_HOME=${PWD}/.venv +# $ export FCFLAGS="-i4 -Mr8 -O3 -Minline -Mcray=pointer -Mpre -mp" + +%PSYCLONE_HOME ${PSYCLONE_HOME} +%NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include +%NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf + +%PROFILE_INC -I${PROFILING_DIR} +%PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt + + + +%CPP cpp -Dkey_nosignedzero +%FC ${MPIF90} -c +%FCFLAGS ${FCFLAGS} +%FFLAGS %FCFLAGS +%LD ${MPIF90} +%LDFLAGS ${FCFLAGS} +%FPPFLAGS -P -traditional +%AR ar +%ARFLAGS rs +%MK make +%USER_INC %NCDF_INC %PROFILE_INC +%USER_LIB %NCDF_LIB %PROFILE_LIB + +%CC ${CC} +%CFLAGS -O2 diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py index 8077255505..89740cad9c 100755 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ b/examples/nemo/scripts/omp_gpu_trans.py @@ -54,6 +54,9 @@ # By default, we don't do module inlining as it's still under development. 
INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) +# By default, we allow all device intrinsics (not only the reproducible ones) +REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) + # This environment variable informs if this is targeting NEMOv4, in which case # array privatisation is disabled and some more files excluded NEMOV4 = os.environ.get('NEMOV4', False) @@ -97,6 +100,7 @@ "trczdf.f90", "trcice_pisces.f90", "dtatsd.f90", + "trcatf.f90", ] @@ -198,7 +202,8 @@ def trans(psyir): region_directive_trans=omp_target_trans, loop_directive_trans=omp_gpu_loop_trans, collapse=True, - privatise_arrays=False + privatise_arrays=False, + uniform_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") @@ -207,7 +212,8 @@ def trans(psyir): region_directive_trans=omp_target_trans, loop_directive_trans=omp_gpu_loop_trans, collapse=True, - privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES) + privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES), + uniform_intrinsics_only=REPRODUCIBLE, ) elif psyir.name not in PARALLELISATION_ISSUES: # This have issues offloading, but we can still do OpenMP threading diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8084e2e637..4f1cd73054 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -411,6 +411,7 @@ def insert_explicit_loop_parallelism( loop_directive_trans=None, collapse: bool = True, privatise_arrays: bool = False, + uniform_intrinsics_only: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive as an ancestor, attempt to insert the given region and loop directives. @@ -429,6 +430,8 @@ def insert_explicit_loop_parallelism( many nested loops as possible. :param privatise_arrays: whether to attempt to privatise arrays that cause write-write race conditions. 
+ :param uniform_intrinsics_only: if True it prevents offloading loops + with non-reproducible device intrinsics. ''' if schedule.name == "ts_wgt": @@ -439,7 +442,10 @@ def insert_explicit_loop_parallelism( continue # Skip if an outer loop is already parallelised opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, - "verbose": True, "nowait": True} + "verbose": True, "nowait": False} + + if uniform_intrinsics_only: + opts["device_string"] = "nvfortran-uniform" routine_name = loop.ancestor(Routine).name @@ -487,7 +493,7 @@ def insert_explicit_loop_parallelism( # And if successful, the region directive on top. if region_directive_trans: - region_directive_trans.apply(loop.parent.parent) + region_directive_trans.apply(loop.parent.parent, options=opts) except TransformationError: # This loop cannot be transformed, proceed to next loop. # The parallelisation restrictions will be explained with a comment diff --git a/src/psyclone/psyir/nodes/call.py b/src/psyclone/psyir/nodes/call.py index ac4f65c6a4..8859342e0d 100644 --- a/src/psyclone/psyir/nodes/call.py +++ b/src/psyclone/psyir/nodes/call.py @@ -390,12 +390,14 @@ def is_pure(self): return self.routine.symbol.is_pure return None - def is_available_on_device(self): + def is_available_on_device(self, device_string: str = "") -> bool: ''' + :param device_string: optional string to identify the offloading + device (or its compiler-platform family). :returns: whether this call is available on an accelerated device. 
- :rtype: bool ''' + # pylint: disable=unused-argument return False @property diff --git a/src/psyclone/psyir/nodes/intrinsic_call.py b/src/psyclone/psyir/nodes/intrinsic_call.py index 58dc21c5d0..bbfc39d7d8 100644 --- a/src/psyclone/psyir/nodes/intrinsic_call.py +++ b/src/psyclone/psyir/nodes/intrinsic_call.py @@ -770,44 +770,26 @@ def intrinsic(self): ''' return self.routine.symbol.intrinsic - # This is not part of the intrinsic enum, because its ValueError could - # change for different devices, and in the future we may want to pass - # a device/arch/compiler parameter or look at the configuration file. - # Currently it is inspired from: https://docs.nvidia.com/hpc-sdk/ - # compilers/hpc-compilers-user-guide/#acc-fort-intrin-sum - # But that list is incomplete (e.g. SUM is supported and not listed) - def is_available_on_device(self): + def is_available_on_device(self, device_string: str = "") -> bool: ''' + :param device_string: optional string to identify the offloading + device (or its compiler-platform family). :returns: whether this intrinsic is available on an accelerated device. - :rtype: bool + + :raises ValueError: if the provided 'device_string' is not one of the + supported values. 
''' - return self.intrinsic in ( - IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS, - IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT, - IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN, - IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS, - IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE, - IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP, - IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR, - IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR, - IntrinsicCall.Intrinsic.LOG, - IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN, - IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT, - IntrinsicCall.Intrinsic.NOT, - IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN, - IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT, - IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH, - IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE, - # The ones below can be offloaded but provide numerical differences - # even with the -gpu=uniform_math flag, ideally it should be - # configurable if these are allowed or not. 
- # IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL, - # The one below are not documented on nvidia compiler - IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, - IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, - IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL, - IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE) + if not device_string: + return self.intrinsic in DEFAULT_DEVICE_INTRINISCS + if device_string == "nvfortran-all": + return self.intrinsic in NVFORTRAN_ALL + if device_string == "nvfortran-uniform": + return self.intrinsic in NVFORTRAN_UNIFORM + + raise ValueError( + f"Unsupported device_string value '{device_string}', the supported" + " values are '' (default), 'nvfortran-all', 'nvfortran-uniform'") @classmethod def create(cls, intrinsic, arguments=()): @@ -971,6 +953,37 @@ def is_inquiry(self): return self.intrinsic.is_inquiry +# Intrinsics available on nvidia gpus with uniform (CPU and GPU) results when +# compiled with the nvfortran "-gpu=math_uniform" flag +NVFORTRAN_UNIFORM = ( + IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS, + IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT, + IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN, + IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS, + IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE, + IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP, + IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR, + IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR, + IntrinsicCall.Intrinsic.LOG, IntrinsicCall.Intrinsic.NOT, + IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN, + IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT, + IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN, + IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT, + IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH, + IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE, + 
IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE, + IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND, + IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL, + IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE +) + +# All nvfortran intrinsics available on GPUs +NVFORTRAN_ALL = NVFORTRAN_UNIFORM + ( + IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL) + +# For now the default intrinsics available on GPU are the same as nvfortran-all +DEFAULT_DEVICE_INTRINISCS = NVFORTRAN_ALL + # TODO #658 this can be removed once we have support for determining the # type of a PSyIR expression. # Intrinsics that perform a reduction on an array. diff --git a/src/psyclone/psyir/transformations/omp_target_trans.py b/src/psyclone/psyir/transformations/omp_target_trans.py index 5c5d3a5221..5c8637d3ff 100644 --- a/src/psyclone/psyir/transformations/omp_target_trans.py +++ b/src/psyclone/psyir/transformations/omp_target_trans.py @@ -152,21 +152,27 @@ def validate(self, node, options=None): :type node: List[:py:class:`psyclone.psyir.nodes.Node`] :param options: a dictionary with options for transformations. :type options: Optional[Dict[str, Any]] + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if it contains calls to routines that are not available in the accelerator device. :raises TransformationError: if its a function and the target region attempts to enclose the assingment setting the return value. 
''' + device_string = options.get("device_string", "") if options else "" node_list = self.get_node_list(node) super().validate(node, options) for node in node_list: for call in node.walk(Call): - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): + device_str = device_string if device_string else "default" raise TransformationError( - f"'{call.routine.name}' is not available on the " - f"accelerator device, and therefore it cannot " - f"be called from within an OMP Target region.") + f"'{call.routine.name}' is not available on the" + f" '{device_str}' accelerator device, and therefore " + f"it cannot be called from within an OMP Target " + f"region. Use the 'device_string' option to specify a " + f"different device.") routine = node.ancestor(Routine) if routine and routine.return_symbol: # if it is a function, the target must not include its return sym @@ -189,6 +195,8 @@ def apply(self, node, options=None): :type options: Optional[Dict[str,Any]] :param bool options["nowait"]: whether to add a nowait clause and a corresponding barrier to enable asynchronous execution. + :param str options["device_string"]: provide a compiler-platform + identifier. 
''' if not options: diff --git a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py index 8e628d13c7..249706d77b 100644 --- a/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py +++ b/src/psyclone/tests/psyir/nodes/intrinsic_call_test.py @@ -136,11 +136,11 @@ def test_intrinsiccall_is_inquiry(): (IntrinsicCall.Intrinsic.INT, True), (IntrinsicCall.Intrinsic.IOR, True), (IntrinsicCall.Intrinsic.LOG, True), - (IntrinsicCall.Intrinsic.LOG10, False), + (IntrinsicCall.Intrinsic.LOG10, True), (IntrinsicCall.Intrinsic.MOD, True), (IntrinsicCall.Intrinsic.NINT, True), (IntrinsicCall.Intrinsic.NOT, True), - (IntrinsicCall.Intrinsic.REAL, False), + (IntrinsicCall.Intrinsic.REAL, True), (IntrinsicCall.Intrinsic.SIGN, True), (IntrinsicCall.Intrinsic.SIN, True), (IntrinsicCall.Intrinsic.SINH, True), @@ -154,7 +154,25 @@ def test_intrinsiccall_is_inquiry(): def test_intrinsiccall_is_available_on_device(intrinsic, result): '''Tests that the is_available_on_device() method works as expected.''' intrinsic_call = IntrinsicCall(intrinsic) + # For now default and nvfortran-all are the same assert intrinsic_call.is_available_on_device() is result + assert intrinsic_call.is_available_on_device('nvfortran-all') is result + + +def test_intrinsiccall_is_available_on_device_with_device_string(): + '''Tests that the is_available_on_device() method with a device_string + argument provides different results with the 'nvfortran-uniform' + ''' + intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.LOG10) + assert not intrinsic_call.is_available_on_device("nvfortran-uniform") + intrinsic_call = IntrinsicCall(IntrinsicCall.Intrinsic.REAL) + assert not intrinsic_call.is_available_on_device("nvfortran-uniform") + + with pytest.raises(ValueError) as err: + assert not intrinsic_call.is_available_on_device("invalid") + assert ("Unsupported device_string value 'invalid', the supported values" + " are '' (default), 'nvfortran-all', 
'nvfortran-uniform'" + in str(err.value)) def test_intrinsiccall_alloc_create(): diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index de6e9d2e55..7b087a0906 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -429,10 +429,9 @@ def test_gpumixin_validate_no_call(): IntrinsicCall.create(IntrinsicCall.Intrinsic.GET_COMMAND)) with pytest.raises(TransformationError) as err: rtrans.validate(kernel) - assert ("Kernel 'testkern_with_call_code' calls another routine " - "'GET_COMMAND()' which is not available on the accelerator device " - "and therefore cannot have ACCRoutineTrans applied to it " - "(TODO #342)." + assert ("Kernel 'testkern_with_call_code' calls intrinsic 'GET_COMMAND' " + "which is not available on the default accelerator device. Use " + "the 'device_string' option to specify a different device." in str(err.value)) @@ -454,6 +453,40 @@ def test_kernel_gpu_annotation_trans(rtrans, expected_directive, assert expected_directive in code +@pytest.mark.parametrize( + "rtrans", + [ACCRoutineTrans(), OMPDeclareTargetTrans()]) +def test_kernel_gpu_annotation_device_id(rtrans, fortran_reader): + ''' Check that the GPU annotation transformations validations + check the intrinsics using the provided device id. 
''' + + code = ''' + function myfunc(a) + integer :: a + real :: myfunc + myfunc = REAL(a) + end function + ''' + psyir = fortran_reader.psyir_from_source(code) + routine = psyir.children[0] + # The routine is valid + rtrans.validate(routine) + # But not if we are targeting "nvfortran-uniform" or an invalid device + with pytest.raises(TransformationError) as err: + rtrans.validate(routine, options={'device_string': + 'nvfortran-uniform'}) + assert ("routine 'myfunc' calls intrinsic 'REAL' which is not available on" + " the 'nvfortran-uniform' accelerator device. Use the " + "'device_string' option to specify a different device." + in str(err.value)) + with pytest.raises(ValueError) as err: + rtrans.validate(routine, options={'device_string': + 'unknown-device'}) + assert ("Unsupported device_string value 'unknown-device', the supported " + "values are '' (default), 'nvfortran-all', 'nvfortran-uniform'" + in str(err.value)) + + def test_1kern_trans(kernel_outputdir): ''' Check that we generate the correct code when an invoke contains the same kernel more than once but only one of them is transformed. ''' diff --git a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py index 21a256a6e8..aa76d8f11f 100644 --- a/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py +++ b/src/psyclone/tests/psyir/transformations/omp_target_trans_test.py @@ -129,6 +129,11 @@ def test_omptargettrans_validate(fortran_reader): char = 'a' // 'b' end do end do + do i = 1, 10 + do j = 1, 10 + A(i, j) = LOG10(3) + end do + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) @@ -142,15 +147,33 @@ def test_omptargettrans_validate(fortran_reader): with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[1]) - assert ("'myfunc' is not available on the accelerator device, and " - "therefore it cannot be called from within an OMP Target region." 
- in str(err.value)) + assert ("'myfunc' is not available on the 'default' accelerator device, " + "and therefore it cannot be called from within an OMP Target " + "region. Use the 'device_string' option to specify a different " + "device." in str(err.value)) with pytest.raises(TransformationError) as err: omptargettrans.validate(loops[2]) assert ("Nodes of type 'CodeBlock' cannot be enclosed by a OMPTarget" "Trans transformation" in str(err.value)) + # The last loop is valid + omptargettrans.validate(loops[3]) + # But not if we are targeting "nvfortran-uniform" or an invalid device + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[3], options={'device_string': + 'nvfortran-uniform'}) + assert ("'LOG10' is not available on the 'nvfortran-uniform' accelerator " + "device, and therefore it cannot be called from within an OMP " + "Target region. Use the 'device_string' option to specify a " + "different device." in str(err.value)) + with pytest.raises(ValueError) as err: + omptargettrans.validate(loops[3], options={'device_string': + 'unknown-device'}) + assert ("Unsupported device_string value 'unknown-device', the supported " + "values are '' (default), 'nvfortran-all', 'nvfortran-uniform'" + in str(err.value)) + def test_omptargetrans_apply_nowait(fortran_reader, fortran_writer): '''Test the behaviour of the OMPTargetTrans apply function is as diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index d7f55161d8..4097edd199 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -139,6 +139,11 @@ def test_accparalleltrans_validate(fortran_reader): char = 'a' // 'b' end do end do + do i = 1, 10 + do j = 1, 10 + A(i,j) = GET_COMMAND(2) + end do + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) @@ -155,6 +160,19 @@ def 
test_accparalleltrans_validate(fortran_reader): assert ("Nodes of type 'CodeBlock' cannot be enclosed by a ACCParallel" "Trans transformation" in str(err.value)) + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[2]) + assert ("'GET_COMMAND' is not available on the default accelerator " + "device. Use the 'device_string' option to specify a different " + "device." in str(err.value)) + + with pytest.raises(TransformationError) as err: + omptargettrans.validate(loops[2], options={'device_string': + 'nvfortran-all'}) + assert ("'GET_COMMAND' is not available on the 'nvfortran-all' accelerator" + " device. Use the 'device_string' option to specify a different " + "device." in str(err.value)) + def test_accenterdata(): ''' Generic tests for the ACCEnterDataTrans class ''' diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index 17289d9693..8ea4d49d97 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -66,7 +66,7 @@ OMPDeclareTargetDirective, OMPDirective, OMPMasterDirective, OMPParallelDirective, OMPParallelDoDirective, OMPSerialDirective, OMPSingleDirective, OMPTaskloopDirective, PSyDataNode, Return, - Routine, Schedule) + Routine, Schedule, IntrinsicCall) from psyclone.psyir.nodes.acc_mixins import ACCAsyncMixin from psyclone.psyir.nodes.array_mixin import ArrayMixin from psyclone.psyir.nodes.structure_member import StructureMember @@ -378,6 +378,8 @@ def validate_it_can_run_on_gpu(self, node, options): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. @@ -393,6 +395,7 @@ def validate_it_can_run_on_gpu(self, node, options): routines. 
''' force = options.get("force", False) if options else False + device_string = options.get("device_string", "") if options else "" if not isinstance(node, (Kern, Routine)): raise TransformationError( @@ -488,7 +491,19 @@ def validate_it_can_run_on_gpu(self, node, options): calls = kernel_schedule.walk(Call) for call in calls: - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): + if isinstance(call, IntrinsicCall): + if device_string: + device_str = (f"on the '{device_string}' accelerator " + f"device") + else: + device_str = "on the default accelerator device" + raise TransformationError( + f"{k_or_r} '{node.name}' calls intrinsic " + f"'{call.intrinsic.name}' which is not available " + f"{device_str}. Use the 'device_string' option to " + f"specify a different device." + ) call_str = call.debug_string().rstrip("\n") raise TransformationError( f"{k_or_r} '{node.name}' calls another routine " @@ -551,6 +566,8 @@ def apply(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. ''' self.validate(node, options) @@ -581,6 +598,8 @@ def validate(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel. @@ -1766,9 +1785,21 @@ def validate(self, node_list, options=None): f"The provided 'default_present' option must be a " f"boolean, but found '{options['default_present']}'." 
) + device_string = options.get("device_string", "") if options else "" for node in node_list: for call in node.walk(Call): - if not call.is_available_on_device(): + if not call.is_available_on_device(device_string): + if isinstance(call, IntrinsicCall): + if device_string: + device_str = (f"on the '{device_string}' " + f"accelerator device") + else: + device_str = "on the default accelerator device" + raise TransformationError( + f"'{call.intrinsic.name}' is not available " + f"{device_str}. Use the 'device_string' option to " + f"specify a different device." + ) raise TransformationError( f"'{call.routine.name}' is not available on the " f"accelerator device, and therefore it cannot " @@ -2768,6 +2799,8 @@ def apply(self, node, options=None): :param str options["parallelism"]: the level of parallelism that the target routine (or a callee) exposes. One of "seq" (the default), "vector", "worker" or "gang". + :param str options["device_string"]: provide a compiler-platform + identifier. ''' # Check that we can safely apply this transformation @@ -2803,6 +2836,8 @@ def validate(self, node, options=None): :type options: Optional[Dict[str, Any]] :param bool options["force"]: whether to allow routines with CodeBlocks to run on the GPU. + :param str options["device_string"]: provide a compiler-platform + identifier. :raises TransformationError: if the node is not a kernel or a routine. :raises TransformationError: if the target is a built-in kernel.