diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
new file mode 100644
index 00000000000..911f4385a6a
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
@@ -0,0 +1,169 @@
+/* Copyright (C) 2026 Free Software Foundation, Inc.
+ Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+ This file is part of GDB.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see . */
+
+/* This program exercises a single-device cooperative kernel launch.
+ The kernel uses cooperative_groups::this_grid ().sync (), which is
+ implemented on AMD GPUs using the Global Wave Sync (GWS) hardware
+ mechanism. It is launched through hipLaunchCooperativeKernel so
+ the whole grid is co-resident and can synchronize at the grid
+ level.
+
+ The companion .exp file uses this program to exercise the debugger
+ while a kernel is running with GWS-based grid synchronization. */
+
+#include
+#include
+#include
+#include
+
+#include "rocm-test-utils.h"
+
+namespace cg = cooperative_groups;
+
+/* Use small but non-trivial launch dimensions so the test runs quickly
+ while still creating waves from multiple workgroups participating
+ in the GWS barrier. Two workgroups of 64 threads each give 128
+ threads in total.
+
+ N is exactly one slot per thread so Phase 2 has no inter-thread
+ write conflict and the host-side expected values can be computed
+ straightforwardly. */
+
+constexpr unsigned int group_size = 64;
+constexpr unsigned int num_groups = 2;
+constexpr unsigned int total_threads = group_size * num_groups;
+constexpr unsigned int N = total_threads;
+
+/* Two-phase grid-cooperative kernel.
+
+ Phase 1: every thread writes its slot in in_buf.
+ GWS : the whole grid synchronizes via cooperative_groups::this_grid ().
+ Phase 2: every thread reads a slot owned by a thread in a *different*
+ workgroup and stores it into out_buf.
+
+ Because the Phase 2 read targets a slot written by a different
+ workgroup during Phase 1, this is only correct if grid.sync ()
+ actually synchronized every wave in the grid. */
+
+__global__ void
+coop_grid_sync_kernel (int *in_buf, int *out_buf)
+{
+ cg::grid_group grid = cg::this_grid ();
+
+ unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ /* Phase 1: each thread writes its own value. */
+ in_buf[tid] = (int) (tid + 1); /* before-sync line */
+
+ /* Grid-wide synchronization (GWS). */
+ grid.sync (); /* sync line */
+
+ /* Phase 2: read the slot owned by a thread in a different workgroup
+ and store it in our own slot of the output buffer. */
+ unsigned int peer = (tid + blockDim.x) % total_threads;
+ int v = in_buf[peer]; /* after-sync line */
+ out_buf[tid] = v;
+}
+
+int
+main ()
+{
+ int n_devices = 0;
+ CHECK (hipGetDeviceCount (&n_devices));
+ if (n_devices <= 0)
+ {
+ printf ("No HIP devices found, skipping.\n");
+ return 0;
+ }
+
+ int device_id = -1;
+ hipDeviceProp_t props;
+ for (int id = 0; id < n_devices; ++id)
+ {
+ CHECK (hipGetDeviceProperties (&props, id));
+ if (props.cooperativeLaunch)
+ {
+ device_id = id;
+ break;
+ }
+ }
+
+ if (device_id < 0)
+ {
+ printf ("None of the %d HIP device(s) support cooperative launch, "
+ "skipping.\n",
+ n_devices);
+ return 0;
+ }
+
+ CHECK (hipSetDevice (device_id));
+
+ int *in_d = nullptr;
+ int *out_d = nullptr;
+ CHECK (hipMalloc ((void **) &in_d, N * sizeof (int)));
+ CHECK (hipMalloc ((void **) &out_d, N * sizeof (int)));
+ CHECK (hipMemset (in_d, 0, N * sizeof (int)));
+ CHECK (hipMemset (out_d, 0, N * sizeof (int)));
+
+ dim3 grid_dim (num_groups, 1, 1);
+ dim3 block_dim (group_size, 1, 1);
+
+ void *kernel_args[2];
+ kernel_args[0] = (void *) &in_d;
+ kernel_args[1] = (void *) &out_d;
+
+ /* Launch the kernel cooperatively. This is the API that enables
+ grid.sync () support via GWS. The trailing 0, 0 arguments are
+ sharedMem and stream respectively. */
+ CHECK (hipLaunchCooperativeKernel
+ (reinterpret_cast (coop_grid_sync_kernel),
+ grid_dim, block_dim, kernel_args, 0, 0));
+
+ CHECK (hipDeviceSynchronize ());
+
+ int out_h[N];
+ CHECK (hipMemcpy (out_h, out_d, N * sizeof (int), hipMemcpyDeviceToHost));
+
+ CHECK (hipFree (in_d));
+ CHECK (hipFree (out_d));
+
+ /* Each thread tid stores in_buf[(tid + group_size) % total_threads]
+ into out_buf[tid]. After Phase 1, in_buf[k] == k + 1, so the
+ expected value at out_h[tid] is ((tid + group_size) %
+ total_threads) + 1. */
+ int errors = 0;
+ for (unsigned int tid = 0; tid < N; tid++)
+ {
+ unsigned int peer = (tid + group_size) % total_threads;
+ int expected = (int) (peer + 1);
+ if (out_h[tid] != expected)
+ {
+ fprintf (stderr, "mismatch at %u: got %d, expected %d\n",
+ tid, out_h[tid], expected);
+ errors++;
+ }
+ }
+
+ if (errors != 0)
+ {
+ fprintf (stderr, "FAILED: %d mismatches\n", errors);
+ return EXIT_FAILURE;
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
new file mode 100644
index 00000000000..2d18f94e0fc
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
@@ -0,0 +1,189 @@
+# Copyright (C) 2026 Free Software Foundation, Inc.
+# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+# Tests for debugging a single-device cooperative kernel that uses
+# cooperative_groups::this_grid ().sync (), implemented on AMD GPUs
+# using the Global Wave Sync (GWS) hardware mechanism.
+#
+# The test verifies that GDB can:
+# - hit a breakpoint placed before grid.sync () in a cooperatively
+# launched kernel (i.e. when the dispatch was created with
+# hipLaunchCooperativeKernel),
+# - list the cooperative dispatch via "info dispatches",
+# - observe multiple GPU waves participating in the grid sync,
+# - hit a breakpoint placed after grid.sync (), and
+# - resume the kernel to normal completion.
+
+load_lib rocm.exp
+
+require allow_hipcc_tests
+require supports_cooperative_groups
+
+standard_testfile .cpp
+
+if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} {
+ return
+}
+
+# Test: hit a breakpoint inside a cooperative kernel before grid.sync (),
+# inspect the GPU state, then continue past grid.sync () and to program
+# exit.
+
+proc_with_prefix test_break_around_grid_sync {} {
+ clean_restart $::testfile
+
+ with_rocm_gpu_lock {
+ if {![runto_main]} {
+ return
+ }
+
+ # Place a breakpoint before grid.sync (). Use "allow-pending"
+ # because the kernel is loaded as device code at dispatch time.
+ gdb_breakpoint \
+ [gdb_get_line_number "before-sync line"] allow-pending temporary
+
+ # Continue into the kernel. If the program self-skips (no device
+ # advertises cooperativeLaunch) it exits cleanly; treat that as
+ # UNSUPPORTED rather than a FAIL.
+ set reached 0
+ gdb_test_multiple "continue" "stop before grid.sync" {
+ -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" {
+ set reached 1
+ pass $gdb_test_name
+ }
+ -re -wrap "$::inferior_exited_re normally.*" {
+ unsupported "cooperative launch not available"
+ }
+ }
+ if { !$reached } {
+ return
+ }
+
+ # Verify that waves from multiple workgroups are stopped at the
+ # pre-sync breakpoint. Counting waves alone is wave-size
+ # dependent (1 wave per workgroup on wave64 vs 2 on wave32) and
+ # would let the test pass on wave32 even if all visible waves
+ # happened to come from a single workgroup. Instead, collect
+ # the distinct workgroup (block) coordinates from the AMDGPU
+ # Wave entries in "info threads" and require at least two
+ # distinct workgroups, which directly verifies that the
+ # cooperative dispatch's multi-workgroup property is exercised.
+ set blocks {}
+ gdb_test_multiple "info threads" "blocks present pre-sync" -lbl {
+ -re "AMDGPU Wave \[^(\r\n\]*\\((\[^()\r\n\]+)\\)" {
+ if {[lsearch -exact $blocks $expect_out(1,string)] == -1} {
+ lappend blocks $expect_out(1,string)
+ }
+ exp_continue
+ }
+ -re -wrap "" {
+ gdb_assert {[llength $blocks] >= 2} \
+ "waves from at least 2 distinct workgroups are present pre-sync"
+ }
+ }
+
+ # "info dispatches" should show the cooperative dispatch.
+ gdb_test "info dispatches" \
+ ".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \
+ "info dispatches lists cooperative dispatch"
+
+ # Place a breakpoint after grid.sync (). It fires once all waves
+ # have crossed the GWS barrier, which proves GWS-protected code
+ # can be debugged across the barrier.
+ gdb_breakpoint \
+ [gdb_get_line_number "after-sync line"] allow-pending temporary
+
+ gdb_continue_to_breakpoint "stop after grid.sync"
+
+ # The kernel should be able to run to completion afterwards.
+ gdb_continue_to_end "continue to program end" "continue" 1
+ }
+}
+
+# Test: stop inside the cooperative kernel with several co-resident
+# waves parked at the barrier and check that gdb reports correct,
+# distinct per-wave and per-lane state. This is the debugger
+# behaviour of interest: that gdb selects the right register context
+# per wave and the right SIMT state per lane, not merely that the
+# waves are in the kernel.
+
+proc_with_prefix test_threads_in_coop_kernel {} {
+ clean_restart $::testfile
+
+ with_rocm_gpu_lock {
+ if {![runto_main]} {
+ return
+ }
+
+ gdb_breakpoint \
+ [gdb_get_line_number "before-sync line"] allow-pending temporary
+
+ # Continue into the kernel. If the program self-skips (no device
+ # advertises cooperativeLaunch) it exits cleanly; treat that as
+ # UNSUPPORTED rather than a FAIL.
+ set reached 0
+ gdb_test_multiple "continue" "stop in cooperative kernel" {
+ -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" {
+ set reached 1
+ pass $gdb_test_name
+ }
+ -re -wrap "$::inferior_exited_re normally.*" {
+ unsupported "cooperative launch not available"
+ }
+ }
+ if { !$reached } {
+ return
+ }
+
+ set waves [info_thread_get_wave_list]
+ gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed"
+
+ # Per-wave register context: switching to each wave and reading
+ # blockIdx.x must yield that wave's owning workgroup, and across
+ # all waves we must observe more than one distinct workgroup.
+ set blocks {}
+ foreach wave $waves {
+ with_test_prefix "wave $wave" {
+ gdb_test "thread $wave" "Switching to thread $wave.*" \
+ "switch to wave $wave"
+ set bidx [get_integer_valueof "blockIdx.x" -1 "blockIdx.x"]
+ if {[lsearch -exact $blocks $bidx] == -1} {
+ lappend blocks $bidx
+ }
+ }
+ }
+ gdb_assert {[llength $blocks] >= 2} \
+ "distinct workgroups observed across waves"
+
+ # Per-lane (SIMT) state: within a single wave, different lanes map
+ # to different threads, so threadIdx.x must differ between lanes.
+ gdb_test "thread [lindex $waves 0]" "Switching to thread .*" \
+ "switch to first wave for lane inspection"
+ gdb_test "lane 0" ".*" "switch to lane 0"
+ set tid0 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 0"]
+ gdb_test "lane 1" ".*" "switch to lane 1"
+ set tid1 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 1"]
+ gdb_assert {$tid0 != $tid1} "distinct threadIdx.x across lanes"
+
+ # Resume to completion.
+ gdb_continue_to_end "continue to program end" "continue" 1
+ }
+}
+
+test_break_around_grid_sync
+test_threads_in_coop_kernel
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
new file mode 100644
index 00000000000..e29b3da8d56
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
@@ -0,0 +1,207 @@
+/* Copyright (C) 2026 Free Software Foundation, Inc.
+ Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+ This file is part of GDB.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see . */
+
+/* This program exercises a multi-device cooperative kernel launch via
+ hipLaunchCooperativeKernelMultiDevice. The kernel uses both
+ cooperative_groups::this_grid ().sync () (intra-device GWS) and
+ cooperative_groups::this_multi_grid ().sync () (cross-device sync).
+
+ When run on a system with fewer than two GPUs that each support
+ cooperative multi-device launch, the program prints a message and
+ exits cleanly. */
+
+#include
+#include
+#include
+#include
+
+#include "rocm-test-utils.h"
+
+namespace cg = cooperative_groups;
+
+constexpr int N_USED_GPUS = 2;
+constexpr unsigned int N_PER_DEVICE = 256;
+constexpr unsigned int group_size = 64;
+constexpr unsigned int num_groups = 2;
+
+/* Two-phase cooperative kernel running on every device.
+
+ Phase 1: each thread writes (grid_rank + 1) * (i + 1) into its slot.
+ GWS : intra-device grid synchronization via this_grid ().sync ().
+ MGWS : cross-device synchronization via this_multi_grid ().sync ().
+ Phase 2: thread 0 of each grid writes its partial sum into
+ result[grid_rank + 1] (a slot in the shared host-coherent buffer).
+ Then thread 0 of grid 0 aggregates the partials into result[0]. */
+
+__global__ void
+coop_multi_grid_sync_kernel (int *data, unsigned int n_elements,
+ long *result)
+{
+ cg::grid_group grid = cg::this_grid ();
+ cg::multi_grid_group mgrid = cg::this_multi_grid ();
+
+ unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int stride = gridDim.x * blockDim.x;
+
+ unsigned int grid_rank = mgrid.grid_rank ();
+
+ /* Phase 1: each thread writes data into its slot. */
+ for (unsigned int i = tid; i < n_elements; i += stride)
+ data[i] = (int) ((grid_rank + 1) * (i + 1)); /* before-grid-sync line */
+
+ /* Intra-device grid sync (GWS). */
+ grid.sync (); /* grid-sync line */
+
+ /* Each grid computes its partial sum and stores it in the shared
+ buffer at slot (grid_rank + 1). Slot 0 is reserved for the
+ final total. */
+ if (tid == 0)
+ {
+ long sum = 0;
+ for (unsigned int i = 0; i < n_elements; i++)
+ sum += data[i];
+ result[grid_rank + 1] = sum; /* after-grid-sync line */
+ }
+
+ /* Cross-device sync (multi-grid GWS). */
+ mgrid.sync (); /* multi-grid-sync line */
+
+ /* Grid 0 aggregates partial sums from all grids into result[0]. */
+ if (grid_rank == 0 && tid == 0)
+ {
+ long total = 0;
+ unsigned int n_grids = mgrid.num_grids ();
+ for (unsigned int g = 0; g < n_grids; g++)
+ total += result[g + 1];
+ result[0] = total; /* after-multi-grid-sync line */
+ }
+}
+
+int
+main ()
+{
+ int n_devices = 0;
+ CHECK (hipGetDeviceCount (&n_devices));
+
+ /* Pick the first N_USED_GPUS devices that support
+ cooperativeMultiDeviceLaunch. In a mixed-architecture system not
+ every GPU has GWS support, so we ignore unsupported ones rather
+ than failing the whole test. This loop runs unconditionally (it
+ is bounded by n_devices) so the marker below is always reached,
+ even when too few devices are available; the companion .exp reads
+ n_gpus there to decide whether to skip. */
+ int selected[N_USED_GPUS];
+ int n_gpus = 0;
+ for (int id = 0; id < n_devices && n_gpus < N_USED_GPUS; id++)
+ {
+ hipDeviceProp_t p;
+ CHECK (hipGetDeviceProperties (&p, id));
+ if (p.cooperativeMultiDeviceLaunch)
+ {
+ selected[n_gpus] = id;
+ n_gpus++;
+ }
+ }
+ if (n_gpus < N_USED_GPUS) /* n-gpus-final line */
+ {
+ printf ("Fewer than %d of the %d HIP device(s) support cooperative"
+ " multi-device launch, skipping.\n",
+ N_USED_GPUS, n_devices);
+ return 0;
+ }
+
+ int *data_d[N_USED_GPUS] = {};
+ hipStream_t stream[N_USED_GPUS] = {};
+
+ /* The shared result buffer is host-coherent so every device can
+ read/write it without explicit memcpys. Layout:
+ result[0] -- final total (written by grid 0 only)
+ result[1 .. n_gpus] -- per-grid partial sums. */
+ long *result_h = nullptr;
+
+ for (int i = 0; i < n_gpus; i++)
+ {
+ CHECK (hipSetDevice (selected[i]));
+ CHECK (hipMalloc ((void **) &data_d[i], N_PER_DEVICE * sizeof (int)));
+ CHECK (hipMemset (data_d[i], 0, N_PER_DEVICE * sizeof (int)));
+ CHECK (hipStreamCreate (&stream[i]));
+ }
+
+ CHECK (hipSetDevice (selected[0]));
+ CHECK (hipHostMalloc ((void **) &result_h, (n_gpus + 1) * sizeof (long),
+ hipHostMallocCoherent));
+ for (int i = 0; i < n_gpus + 1; i++)
+ result_h[i] = 0;
+
+ /* Build per-device launch parameters. Each device gets the same
+ grid/block dims and a pointer to *its own* data buffer, but the
+ result pointer is the single shared host-coherent buffer. */
+ hipLaunchParams launch_params[N_USED_GPUS];
+ void *args[N_USED_GPUS][3];
+ unsigned int n_elements = N_PER_DEVICE;
+
+ for (int i = 0; i < n_gpus; i++)
+ {
+ args[i][0] = (void *) &data_d[i];
+ args[i][1] = (void *) &n_elements;
+ args[i][2] = (void *) &result_h;
+
+ launch_params[i].func
+ = reinterpret_cast (coop_multi_grid_sync_kernel);
+ launch_params[i].gridDim = dim3 (num_groups, 1, 1);
+ launch_params[i].blockDim = dim3 (group_size, 1, 1);
+ launch_params[i].sharedMem = 0;
+ launch_params[i].stream = stream[i];
+ launch_params[i].args = args[i];
+ }
+
+ CHECK (hipLaunchCooperativeKernelMultiDevice (launch_params, n_gpus, 0));
+
+ for (int i = 0; i < n_gpus; i++)
+ {
+ CHECK (hipSetDevice (selected[i]));
+ CHECK (hipDeviceSynchronize ());
+ }
+
+ long total = result_h[0];
+
+ for (int i = 0; i < n_gpus; i++)
+ {
+ CHECK (hipSetDevice (selected[i]));
+ CHECK (hipFree (data_d[i]));
+ CHECK (hipStreamDestroy (stream[i]));
+ }
+ CHECK (hipSetDevice (selected[0]));
+ CHECK (hipHostFree (result_h));
+
+ /* Compute the expected total: for grid g (0-based), the thread that
+ writes slot i stores (g + 1) * (i + 1). Per grid, the sum is
+ (g + 1) * N_PER_DEVICE * (N_PER_DEVICE + 1) / 2. */
+ long per_grid = (long) N_PER_DEVICE * (N_PER_DEVICE + 1) / 2;
+ long expected = 0;
+ for (int g = 0; g < n_gpus; g++)
+ expected += (long) (g + 1) * per_grid;
+
+ if (total != expected)
+ {
+ fprintf (stderr, "FAILED: total %ld, expected %ld\n", total, expected);
+ return EXIT_FAILURE;
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
new file mode 100644
index 00000000000..f192ddc0d28
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
@@ -0,0 +1,131 @@
+# Copyright (C) 2026 Free Software Foundation, Inc.
+# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+# Tests for debugging a multi-device cooperative kernel that uses
+# cooperative_groups::this_grid ().sync () within each device and
+# cooperative_groups::this_multi_grid ().sync () across devices. Both
+# barriers are implemented on AMD GPUs using the GWS hardware
+# mechanism.
+#
+# The test verifies that GDB can:
+# - read the number of participating GPUs from the inferior,
+# - hit a kernel breakpoint inside the cooperative dispatch on
+# *every* participating GPU (matched via the per-device
+# "Breakpoint ." subscript that
+# hipLaunchCooperativeKernelMultiDevice produces), and
+# - resume the cross-device dispatch through both the intra-device
+# this_grid ().sync () and the cross-device this_multi_grid ().sync ()
+# to normal program completion.
+
+load_lib rocm.exp
+
+require allow_hipcc_tests
+require hip_devices_support_debug_multi_process
+require supports_cooperative_groups
+
+standard_testfile .cpp
+
+if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} {
+ return
+}
+
+# Test: hit a breakpoint inside the cooperative kernel on every
+# participating GPU, then resume the dispatch through both barriers
+# and to program completion.
+
+proc_with_prefix test_break_in_multi_coop_kernel {} {
+ # Multi-device cooperative dispatches involve waves on more than
+ # one device making forward progress before the multi-grid sync
+ # releases. Run the inferior in non-stop mode so the debugger can
+ # observe per-device stops independently.
+ save_vars { ::GDBFLAGS } {
+ append ::GDBFLAGS " -ex \"set non-stop on\""
+ clean_restart $::testfile
+ }
+
+ with_rocm_gpu_lock {
+ if {![runto_main]} {
+ return
+ }
+
+ # Break where the inferior has finalized "n_gpus" (the count of
+ # GPUs it will use for the cooperative dispatch). This line runs
+ # on every execution, before the inferior's own eligibility
+ # check, so reading "n_gpus" from gdb lets us decide here whether
+ # to proceed. The require gates above cover architecture-level
+ # support; per-device runtime gating (>= 2 devices that advertise
+ # cooperativeMultiDeviceLaunch) can only be known at this point.
+ gdb_breakpoint [gdb_get_line_number "n-gpus-final line"] temporary
+ gdb_continue_to_breakpoint "advance to n-gpus-final"
+
+ set n_gpus [get_integer_valueof "n_gpus" 0]
+ if { $n_gpus < 2 } {
+ unsupported "fewer than two cooperative-capable GPUs"
+ return
+ }
+
+ # Replace the marker breakpoint with the kernel breakpoint and
+ # run all threads in the background so the host can finish
+ # launching kernels on every device.
+ delete_breakpoints
+ gdb_breakpoint \
+ [gdb_get_line_number "before-grid-sync line"] allow-pending
+
+ gdb_test_multiple "continue -a &" "background continue" {
+ -re "Continuing\\.\r\n$::gdb_prompt " {
+ pass $gdb_test_name
+ }
+ }
+
+ # Check that gdb resolves the single source breakpoint to one
+ # child location per participating GPU ("Breakpoint .")
+ # and, in non-stop mode, reports each device-side stop
+ # independently. Collect distinct values until a stop has
+ # been seen for every GPU.
+ array unset gpu_seen
+ set distinct_gpus 0
+ gdb_test_multiple "" "breakpoint hit on every participating GPU" {
+ -re "\r\nThread $::decimal\[^\r\n\]* hit Breakpoint $::decimal\\.($::decimal),\[^\r\n\]*\r\n" {
+ set inst $expect_out(1,string)
+ if {![info exists gpu_seen($inst)]} {
+ set gpu_seen($inst) 1
+ incr distinct_gpus
+ }
+ if {$distinct_gpus >= $n_gpus} {
+ pass $gdb_test_name
+ } else {
+ exp_continue
+ }
+ }
+ }
+
+ # All participating GPUs have been observed stopping at the
+ # kernel breakpoint. Remove the breakpoint and let the
+ # dispatch run through this_grid ().sync () and
+ # this_multi_grid ().sync () to clean program exit.
+ delete_breakpoints
+
+ gdb_test_multiple "continue -a" "continue all to end" {
+ -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " {
+ pass $gdb_test_name
+ }
+ }
+ }
+}
+
+test_break_in_multi_coop_kernel
diff --git a/gdb/testsuite/lib/rocm.exp b/gdb/testsuite/lib/rocm.exp
index c937f92e24e..44273e40832 100644
--- a/gdb/testsuite/lib/rocm.exp
+++ b/gdb/testsuite/lib/rocm.exp
@@ -412,6 +412,46 @@ proc hip_devices_support_debug_multi_process {} {
return 1
}
+# Helpers below gate cooperative-group debugging tests, i.e. tests that
+# debug kernels launched with hipLaunchCooperativeKernel /
+# hipLaunchCooperativeKernelMultiDevice and synchronized via
+# cooperative_groups::this_grid ().sync () / this_multi_grid ().sync ().
+#
+# This is a debugger-side gate, distinct from what the HIP runtime
+# advertises via hipDeviceProp_t::cooperativeLaunch /
+# cooperativeMultiDeviceLaunch: the Global Wave Sync (GWS) hardware
+# implementing those barriers exists on architectures where the kernel
+# runs correctly outside the debugger but rocgdb cannot yet step or
+# breakpoint through the cooperative dispatch.
+
+# Return true if TARGET (a gfx name like "gfx90a") is an architecture
+# where rocgdb supports debugging cooperative-group kernels.
+
+proc target_supports_cooperative_groups { target } {
+ set unsupported_targets {
+ gfx1100 gfx1101 gfx1102 gfx1103
+ }
+
+ return [expr {[lsearch -exact $unsupported_targets $target] == -1}]
+}
+
+# Return true if every AMDGPU device on the system is an architecture
+# where rocgdb supports debugging cooperative-group kernels.
+
+proc supports_cooperative_groups {} {
+ set targets [find_amdgpu_devices]
+ if { [llength $targets] == 0 } {
+ return 0
+ }
+
+ foreach target $targets {
+ if { ![target_supports_cooperative_groups $target] } {
+ return 0
+ }
+ }
+ return 1
+}
+
# Return true if the current device's version is less than VERSION.
#
# VERSION must be the "gfx" name of the device, such as gfx906 or gfx90a.