Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7df65e0
Add a device_id string to the is_available_on_device method
sergisiso Apr 28, 2025
4cd1199
Merge remote-tracking branch 'origin/master' into configurable_intr_a…
sergisiso Jun 2, 2025
56db78d
#2856 Add REPRODUCIBLE configuration in NEMO OpenMP script
sergisiso Jun 3, 2025
371f596
Merge remote-tracking branch 'origin/master' into 2856_configurable_i…
sergisiso Jun 3, 2025
8fc3122
#2856 Test PROFILING and non-reproducible NEMOv5 BENCH builds
sergisiso Jun 3, 2025
04fa5e0
#2856 Fix issues with device_string option
sergisiso Jun 3, 2025
b89ca80
#2856 Remove debug statement
sergisiso Jun 3, 2025
5c3b22a
#2856 Fix CI action mistake
sergisiso Jun 4, 2025
d435ee2
#2856 NEMO integration tests builds the profiling lib
sergisiso Jun 4, 2025
3de5e8b
#2856 Fix NEMO integration test
sergisiso Jun 4, 2025
c74e509
#2856 Exclude one more nemo file
sergisiso Jun 4, 2025
78a06be
#2856 Add tests for the device_id transformation options
sergisiso Jun 4, 2025
69e61a6
#2856 Remove unnecessary check
sergisiso Jun 4, 2025
f3b5430
#2856 Fix NEMO tests and add a separate DEFAULT_DEVICE_INTRINSICS global
sergisiso Jun 5, 2025
f9a6d77
#2856 Fix missing coverage
sergisiso Jun 5, 2025
8cb19fc
Merge remote-tracking branch 'origin/master' into 2856_configurable_i…
sergisiso Jun 5, 2025
a21fce1
Merge branch 'master' into 2856_configurable_intr_available_on_device
arporter Jun 6, 2025
6211fca
#2856 Improve device_string implementation
sergisiso Jun 9, 2025
c74108d
#2856 Improve configurable intr error message and improve coverage
sergisiso Jun 11, 2025
a3faca6
#2856 Fix device_string error messages
sergisiso Jun 16, 2025
ccdb102
Merge remote-tracking branch 'origin/master' into 2856_configurable_i…
sergisiso Jun 16, 2025
d07e62c
Merge branch 'master' into 2856_configurable_intr_available_on_device
arporter Jun 23, 2025
96035a8
Merge branch 'master' into 2856_configurable_intr_available_on_device
arporter Jun 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions .github/workflows/nemo_v5_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -214,22 +214,46 @@ jobs:
export NEMO_DIR=${HOME}/${NEMODIR_NAME}
export TEST_DIR=BENCH_OMP_OFFLOAD_NVHPC

# Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS
# We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results.
# Make sure the profiling wrapper is compiled with the same compiler
export PROFILING_DIR=${GITHUB_WORKSPACE}/lib/profiling/nvidia/
cd $PROFILING_DIR
make clean
F90=$MPIF90 make

# First do a debug-build: set the environment variables to use flags and intrinsics
# with numerically reproducible results and enable PROFILING hooks
cd $NEMO_DIR
cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm
cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack_profile.fcm arch/arch-linux_spack_profile.fcm
export ENABLE_PROFILING=1
# We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results.
export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform"

export REPRODUCIBLE=1
# Clean up and compile
rm -rf tests/${TEST_DIR}
./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \
./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \
-j 4 -v 1

# Run test
# Run reproducible test
cd $NEMO_DIR/tests/${TEST_DIR}/EXP00
cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg
OMP_NUM_THREADS=4 mpirun -np 1 ./nemo
# We can compare all digits for this build
diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.bench.nvhpc.small.10steps run.stat

# Now do a fast-build (without reproducible or profiling options, which have a
# big impact for BENCH due to some inner-loop REAL intrinsics)
cd $NEMO_DIR
unset REPRODUCIBLE
unset ENABLE_PROFILING
export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed"
rm -rf tests/${TEST_DIR}
./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \
-j 4 -v 1

# Run non-reproducible test
cd $NEMO_DIR/tests/${TEST_DIR}/EXP00
cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg
OMP_NUM_THREADS=4 mpirun -np 1 ./nemo
export TIME_sec=$(grep "local proces" timing.output | head -n 1 | awk '{print $4}' | tr -d s)
${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \
"mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \
Expand All @@ -256,6 +280,7 @@ jobs:
cd $NEMO_DIR
cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm
export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform"
export REPRODUCIBLE=1

# Clean up and compile
rm -rf cfgs/${TEST_DIR}
Expand Down Expand Up @@ -296,6 +321,7 @@ jobs:
cd $NEMO_DIR
cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm
export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform"
export REPRODUCIBLE=1

# Clean up and compile
rm -rf cfgs/${TEST_DIR}
Expand Down
31 changes: 31 additions & 0 deletions examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# This fcm file is intended to be used with the psyclone-spack nemo-build-environment recipe
# which will populate all environment variables but PSYCLONE_HOME and FCFLAGS, which should
# be populated manually for the desired target. For example, using:
# $ spack load nemo-build-environment%nvhpc
# $ export PSYCLONE_HOME=${PWD}/.venv
# $ export FCFLAGS="-i4 -Mr8 -O3 -Minline -Mcray=pointer -Mpre -mp"

%PSYCLONE_HOME ${PSYCLONE_HOME}
%NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include
%NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf

%PROFILE_INC -I${PROFILING_DIR}
%PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt



%CPP cpp -Dkey_nosignedzero
%FC ${MPIF90} -c
%FCFLAGS ${FCFLAGS}
%FFLAGS %FCFLAGS
%LD ${MPIF90}
%LDFLAGS ${FCFLAGS}
%FPPFLAGS -P -traditional
%AR ar
%ARFLAGS rs
%MK make
%USER_INC %NCDF_INC %PROFILE_INC
%USER_LIB %NCDF_LIB %PROFILE_LIB

%CC ${CC}
%CFLAGS -O2
10 changes: 8 additions & 2 deletions examples/nemo/scripts/omp_gpu_trans.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
# By default, we don't do module inlining as it's still under development.
INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False)

# By default, we allow all device intrinsics (not only the reproducible ones)
REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False)

# This environment variable informs if this is targeting NEMOv4, in which case
# array privatisation is disabled and some more files excluded
NEMOV4 = os.environ.get('NEMOV4', False)
Expand Down Expand Up @@ -97,6 +100,7 @@
"trczdf.f90",
"trcice_pisces.f90",
"dtatsd.f90",
"trcatf.f90",
Comment thread
arporter marked this conversation as resolved.
]


Expand Down Expand Up @@ -198,7 +202,8 @@ def trans(psyir):
region_directive_trans=omp_target_trans,
loop_directive_trans=omp_gpu_loop_trans,
collapse=True,
privatise_arrays=False
privatise_arrays=False,
uniform_intrinsics_only=REPRODUCIBLE,
)
elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES:
print(f"Adding OpenMP offloading to subroutine: {subroutine.name}")
Expand All @@ -207,7 +212,8 @@ def trans(psyir):
region_directive_trans=omp_target_trans,
loop_directive_trans=omp_gpu_loop_trans,
collapse=True,
privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES)
privatise_arrays=(psyir.name not in PRIVATISATION_ISSUES),
uniform_intrinsics_only=REPRODUCIBLE,
)
elif psyir.name not in PARALLELISATION_ISSUES:
# This has issues offloading, but we can still do OpenMP threading
Expand Down
10 changes: 8 additions & 2 deletions examples/nemo/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ def insert_explicit_loop_parallelism(
loop_directive_trans=None,
collapse: bool = True,
privatise_arrays: bool = False,
uniform_intrinsics_only: bool = False,
):
''' For each loop in the schedule that doesn't already have a Directive
as an ancestor, attempt to insert the given region and loop directives.
Expand All @@ -429,6 +430,8 @@ def insert_explicit_loop_parallelism(
many nested loops as possible.
:param privatise_arrays: whether to attempt to privatise arrays that cause
write-write race conditions.
:param uniform_intrinsics_only: if True, it prevents offloading loops
with non-reproducible device intrinsics.

'''
if schedule.name == "ts_wgt":
Expand All @@ -439,7 +442,10 @@ def insert_explicit_loop_parallelism(
continue # Skip if an outer loop is already parallelised

opts = {"collapse": collapse, "privatise_arrays": privatise_arrays,
"verbose": True, "nowait": True}
"verbose": True, "nowait": False}

if uniform_intrinsics_only:
opts["device_string"] = "nvfortran-uniform"

routine_name = loop.ancestor(Routine).name

Expand Down Expand Up @@ -487,7 +493,7 @@ def insert_explicit_loop_parallelism(

# And if successful, the region directive on top.
if region_directive_trans:
region_directive_trans.apply(loop.parent.parent)
region_directive_trans.apply(loop.parent.parent, options=opts)
except TransformationError:
# This loop cannot be transformed, proceed to next loop.
# The parallelisation restrictions will be explained with a comment
Expand Down
6 changes: 4 additions & 2 deletions src/psyclone/psyir/nodes/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,12 +390,14 @@ def is_pure(self):
return self.routine.symbol.is_pure
return None

def is_available_on_device(self):
def is_available_on_device(self, device_string: str = "") -> bool:
'''
:param device_string: optional string to identify the offloading
device (or its compiler-platform family).
:returns: whether this call is available on an accelerated device.
:rtype: bool

'''
# pylint: disable=unused-argument
return False

@property
Expand Down
81 changes: 47 additions & 34 deletions src/psyclone/psyir/nodes/intrinsic_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,44 +770,26 @@ def intrinsic(self):
'''
return self.routine.symbol.intrinsic

# This is not part of the intrinsic enum, because its ValueError could
# change for different devices, and in the future we may want to pass
# a device/arch/compiler parameter or look at the configuration file.
# Currently it is inspired from: https://docs.nvidia.com/hpc-sdk/
# compilers/hpc-compilers-user-guide/#acc-fort-intrin-sum
# But that list is incomplete (e.g. SUM is supported and not listed)
def is_available_on_device(self):
def is_available_on_device(self, device_string: str = "") -> bool:
'''
:param device_string: optional string to identify the offloading
device (or its compiler-platform family).
:returns: whether this intrinsic is available on an accelerated device.
:rtype: bool

Comment thread
arporter marked this conversation as resolved.
:raises ValueError: if the provided 'device_string' is not one of the
supported values.

'''
return self.intrinsic in (
IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS,
IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT,
IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN,
IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS,
IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE,
IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP,
IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR,
IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR,
IntrinsicCall.Intrinsic.LOG,
IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN,
IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT,
IntrinsicCall.Intrinsic.NOT,
IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN,
IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT,
IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH,
IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE,
# The ones below can be offloaded but provide numerical differences
# even with the -gpu=uniform_math flag, ideally it should be
# configurable if these are allowed or not.
# IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL,
# The one below are not documented on nvidia compiler
IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE,
IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND,
IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL,
IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE)
if not device_string:
return self.intrinsic in DEFAULT_DEVICE_INTRINISCS
if device_string == "nvfortran-all":
return self.intrinsic in NVFORTRAN_ALL
if device_string == "nvfortran-uniform":
return self.intrinsic in NVFORTRAN_UNIFORM

raise ValueError(
f"Unsupported device_string value '{device_string}', the supported"
" values are '' (default), 'nvfortran-all', 'nvfortran-uniform'")

@classmethod
def create(cls, intrinsic, arguments=()):
Expand Down Expand Up @@ -971,6 +953,37 @@ def is_inquiry(self):
return self.intrinsic.is_inquiry


# Intrinsics available on NVIDIA GPUs that give uniform (matching CPU and GPU)
# results when compiled with the nvfortran "-gpu=math_uniform" flag.
NVFORTRAN_UNIFORM = (
IntrinsicCall.Intrinsic.ABS, IntrinsicCall.Intrinsic.ACOS,
IntrinsicCall.Intrinsic.AINT, IntrinsicCall.Intrinsic.ANINT,
IntrinsicCall.Intrinsic.ASIN, IntrinsicCall.Intrinsic.ATAN,
IntrinsicCall.Intrinsic.ATAN2, IntrinsicCall.Intrinsic.COS,
IntrinsicCall.Intrinsic.COSH, IntrinsicCall.Intrinsic.DBLE,
IntrinsicCall.Intrinsic.DPROD, IntrinsicCall.Intrinsic.EXP,
IntrinsicCall.Intrinsic.IAND, IntrinsicCall.Intrinsic.IEOR,
IntrinsicCall.Intrinsic.INT, IntrinsicCall.Intrinsic.IOR,
IntrinsicCall.Intrinsic.LOG, IntrinsicCall.Intrinsic.NOT,
IntrinsicCall.Intrinsic.MAX, IntrinsicCall.Intrinsic.MIN,
IntrinsicCall.Intrinsic.MOD, IntrinsicCall.Intrinsic.NINT,
IntrinsicCall.Intrinsic.SIGN, IntrinsicCall.Intrinsic.SIN,
IntrinsicCall.Intrinsic.SINH, IntrinsicCall.Intrinsic.SQRT,
IntrinsicCall.Intrinsic.TAN, IntrinsicCall.Intrinsic.TANH,
IntrinsicCall.Intrinsic.UBOUND, IntrinsicCall.Intrinsic.MERGE,
IntrinsicCall.Intrinsic.PRODUCT, IntrinsicCall.Intrinsic.SIZE,
IntrinsicCall.Intrinsic.SUM, IntrinsicCall.Intrinsic.LBOUND,
IntrinsicCall.Intrinsic.MAXVAL, IntrinsicCall.Intrinsic.MINVAL,
IntrinsicCall.Intrinsic.TINY, IntrinsicCall.Intrinsic.HUGE
)

# All nvfortran intrinsics available on GPUs: the uniform set above plus LOG10
# and REAL, which can be offloaded but give numerical differences even when
# the "-gpu=math_uniform" flag is used.
NVFORTRAN_ALL = NVFORTRAN_UNIFORM + (
IntrinsicCall.Intrinsic.LOG10, IntrinsicCall.Intrinsic.REAL)

# For now the default intrinsics available on GPU are the same as nvfortran-all
# NOTE(review): the name is misspelt ('INTRINISCS'); renaming it also requires
# updating its use in IntrinsicCall.is_available_on_device().
DEFAULT_DEVICE_INTRINISCS = NVFORTRAN_ALL

# TODO #658 this can be removed once we have support for determining the
# type of a PSyIR expression.
# Intrinsics that perform a reduction on an array.
Expand Down
16 changes: 12 additions & 4 deletions src/psyclone/psyir/transformations/omp_target_trans.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,21 +152,27 @@ def validate(self, node, options=None):
:type node: List[:py:class:`psyclone.psyir.nodes.Node`]
:param options: a dictionary with options for transformations.
:type options: Optional[Dict[str, Any]]
:param str options["device_string"]: provide a compiler-platform
identifier.

:raises TransformationError: if it contains calls to routines that
are not available in the accelerator device.
:raises TransformationError: if it's a function and the target region
attempts to enclose the assignment setting the return value.
'''
device_string = options.get("device_string", "") if options else ""
Comment thread
arporter marked this conversation as resolved.
node_list = self.get_node_list(node)
super().validate(node, options)
for node in node_list:
for call in node.walk(Call):
if not call.is_available_on_device():
if not call.is_available_on_device(device_string):
device_str = device_string if device_string else "default"
raise TransformationError(
f"'{call.routine.name}' is not available on the "
f"accelerator device, and therefore it cannot "
f"be called from within an OMP Target region.")
f"'{call.routine.name}' is not available on the"
f" '{device_str}' accelerator device, and therefore "
f"it cannot be called from within an OMP Target "
f"region. Use the 'device_string' option to specify a "
f"different device.")
routine = node.ancestor(Routine)
if routine and routine.return_symbol:
# if it is a function, the target must not include its return sym
Expand All @@ -189,6 +195,8 @@ def apply(self, node, options=None):
:type options: Optional[Dict[str,Any]]
:param bool options["nowait"]: whether to add a nowait clause and a
corresponding barrier to enable asynchronous execution.
:param str options["device_string"]: provide a compiler-platform
identifier.

'''
if not options:
Expand Down
22 changes: 20 additions & 2 deletions src/psyclone/tests/psyir/nodes/intrinsic_call_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,11 @@ def test_intrinsiccall_is_inquiry():
(IntrinsicCall.Intrinsic.INT, True),
(IntrinsicCall.Intrinsic.IOR, True),
(IntrinsicCall.Intrinsic.LOG, True),
(IntrinsicCall.Intrinsic.LOG10, False),
(IntrinsicCall.Intrinsic.LOG10, True),
(IntrinsicCall.Intrinsic.MOD, True),
(IntrinsicCall.Intrinsic.NINT, True),
(IntrinsicCall.Intrinsic.NOT, True),
(IntrinsicCall.Intrinsic.REAL, False),
(IntrinsicCall.Intrinsic.REAL, True),
(IntrinsicCall.Intrinsic.SIGN, True),
(IntrinsicCall.Intrinsic.SIN, True),
(IntrinsicCall.Intrinsic.SINH, True),
Expand All @@ -154,7 +154,25 @@ def test_intrinsiccall_is_inquiry():
def test_intrinsiccall_is_available_on_device(intrinsic, result):
'''Tests that the is_available_on_device() method works as expected.'''
intrinsic_call = IntrinsicCall(intrinsic)
# For now default and nvfortran-all are the same
assert intrinsic_call.is_available_on_device() is result
assert intrinsic_call.is_available_on_device('nvfortran-all') is result


def test_intrinsiccall_is_available_on_device_with_device_string():
    '''Tests that the is_available_on_device() method honours its
    device_string argument: the 'nvfortran-uniform' value rejects the
    LOG10 and REAL intrinsics, and an unsupported value raises a
    ValueError.

    '''
    # LOG10 and REAL are accepted by default but not by 'nvfortran-uniform'
    for intrinsic in (IntrinsicCall.Intrinsic.LOG10,
                      IntrinsicCall.Intrinsic.REAL):
        intrinsic_call = IntrinsicCall(intrinsic)
        assert not intrinsic_call.is_available_on_device("nvfortran-uniform")

    # The call itself is expected to raise, so it is not wrapped in an
    # assert (the assertion could never be evaluated).
    with pytest.raises(ValueError) as err:
        intrinsic_call.is_available_on_device("invalid")
    assert ("Unsupported device_string value 'invalid', the supported values"
            " are '' (default), 'nvfortran-all', 'nvfortran-uniform'"
            in str(err.value))


def test_intrinsiccall_alloc_create():
Expand Down
Loading
Loading