From 714c354a12c90d05ec0adcdaa6ec7752190e48c8 Mon Sep 17 00:00:00 2001
From: Sarang Patrange <spatrang@amd.com>
Date: Thu, 4 Jun 2026 09:23:44 -0500
Subject: [PATCH 1/2] gdb, testsuite: add gdb.rocm tests for cooperative group
 GWS debugging

Add dejagnu coverage for debugging AMD GPU cooperative-group kernels
(hipLaunchCooperativeKernel / hipLaunchCooperativeKernelMultiDevice),
which synchronize at the grid / multi-grid level via Global Wave Sync
(GWS).  Previously covered only by out-of-tree tests.

New tests:
  * gdb.rocm/coop-group-grid-sync.{cpp,exp}
    Single-device, this_grid ().sync ().
  * gdb.rocm/coop-group-multi-grid-sync.{cpp,exp}
    Multi-device, this_grid ().sync () + this_multi_grid ().sync ();
    runs in non-stop mode.

Host-side post-conditions validate the cooperative semantics
numerically, so any regression in GWS behaviour under the debugger
surfaces as a test failure rather than a silent miscompare.  The
tests pick a debugger-supported device at runtime and self-skip with
UNSUPPORTED when the configuration is insufficient.

Two helpers added in gdb/testsuite/lib/rocm.exp:
target_supports_cooperative_groups <target> (per-target gate, returns
false on gfx1100/1101/1102/1103 per amd-dbgapi.h) and
supports_cooperative_groups (require-gate wrapper used by both .exp
files).  This is a debugger-side gate, distinct from the runtime's
cooperativeLaunch / cooperativeMultiDeviceLaunch flags.
---
 .../gdb.rocm/coop-group-grid-sync.cpp         | 168 ++++++++++++++
 .../gdb.rocm/coop-group-grid-sync.exp         | 152 +++++++++++++
 .../gdb.rocm/coop-group-multi-grid-sync.cpp   | 211 ++++++++++++++++++
 .../gdb.rocm/coop-group-multi-grid-sync.exp   | 147 ++++++++++++
 gdb/testsuite/lib/rocm.exp                    |  40 ++++
 5 files changed, 718 insertions(+)
 create mode 100644 gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
 create mode 100644 gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
 create mode 100644 gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
 create mode 100644 gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
new file mode 100644
index 00000000000..14d395e1fd8
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
@@ -0,0 +1,168 @@
+/* Copyright (C) 2026 Free Software Foundation, Inc.
+   Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* This program exercises a single-device cooperative kernel launch.
+   The kernel uses cooperative_groups::this_grid ().sync (), which is
+   implemented on AMD GPUs using the Global Wave Sync (GWS) hardware
+   mechanism.  It is launched through hipLaunchCooperativeKernel so
+   the whole grid is co-resident and can synchronize at the grid
+   level.
+
+   The companion .exp file uses this program to exercise the debugger
+   while a kernel is running with GWS-based grid synchronization.  */
+
+#include <hip/hip_cooperative_groups.h>
+#include <hip/hip_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "rocm-test-utils.h"
+
+namespace cg = cooperative_groups;
+
+/* Use small but non-trivial launch dimensions so the test runs quickly
+   while still creating waves from multiple workgroups participating
+   in the GWS barrier.  Two workgroups of 64 threads each give 128
+   threads in total.
+
+   N is exactly one slot per thread so Phase 2 has no intra-thread race
+   and the host-side expected values can be computed straightforwardly.  */
+
+constexpr unsigned int group_size = 64;
+constexpr unsigned int num_groups = 2;
+constexpr unsigned int total_threads = group_size * num_groups;
+constexpr unsigned int N = total_threads;
+
+/* Two-phase grid-cooperative kernel.
+
+   Phase 1: every thread writes its slot in in_buf.
+   GWS    : the whole grid synchronizes via cooperative_groups::this_grid ().
+   Phase 2: every thread reads a slot owned by a thread in a *different*
+   workgroup and stores it into out_buf.
+
+   Because the Phase 2 read targets a slot written by a different
+   workgroup during Phase 1, this is only correct if grid.sync ()
+   actually synchronized every wave in the grid.  */
+
+__global__ void
+coop_grid_sync_kernel (int *in_buf, int *out_buf)
+{
+  cg::grid_group grid = cg::this_grid ();
+
+  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  /* Phase 1: each thread writes its own value.  */
+  in_buf[tid] = (int) (tid + 1);	/* before-sync line */
+
+  /* Grid-wide synchronization (GWS).  */
+  grid.sync ();				/* sync line */
+
+  /* Phase 2: read the slot owned by a thread in a different workgroup
+     and store it in our own slot of the output buffer.  */
+  unsigned int peer = (tid + blockDim.x) % total_threads;
+  int v = in_buf[peer];			/* after-sync line */
+  out_buf[tid] = v;
+}
+
+int
+main ()
+{
+  int n_devices = 0;
+  CHECK (hipGetDeviceCount (&n_devices));
+  if (n_devices <= 0)
+    {
+      printf ("No HIP devices found, skipping.\n");
+      return 0;
+    }
+
+  int device_id = -1;
+  hipDeviceProp_t props;
+  for (int id = 0; id < n_devices; ++id)
+    {
+      CHECK (hipGetDeviceProperties (&props, id));
+      if (props.cooperativeLaunch)
+	{
+	  device_id = id;
+	  break;
+	}
+    }
+
+  if (device_id < 0)
+    {
+      printf ("None of the %d HIP device(s) support cooperative launch, "
+	      "skipping.\n",
+	      n_devices);
+      return 0;
+    }
+
+  CHECK (hipSetDevice (device_id));
+
+  int *in_d = nullptr;
+  int *out_d = nullptr;
+  CHECK (hipMalloc ((void **) &in_d, N * sizeof (int)));
+  CHECK (hipMalloc ((void **) &out_d, N * sizeof (int)));
+  CHECK (hipMemset (in_d, 0, N * sizeof (int)));
+  CHECK (hipMemset (out_d, 0, N * sizeof (int)));
+
+  dim3 grid_dim (num_groups, 1, 1);
+  dim3 block_dim (group_size, 1, 1);
+
+  void *kernel_args[2];
+  kernel_args[0] = (void *) &in_d;
+  kernel_args[1] = (void *) &out_d;
+
+  /* Launch the kernel cooperatively.  This is the API that enables
+     grid.sync () support via GWS.  The trailing 0, 0 arguments are
+     sharedMem and stream respectively.  */
+  CHECK (hipLaunchCooperativeKernel
+	   (reinterpret_cast<void *> (coop_grid_sync_kernel),
+	    grid_dim, block_dim, kernel_args, 0, 0));
+
+  CHECK (hipDeviceSynchronize ());
+
+  int out_h[N];
+  CHECK (hipMemcpy (out_h, out_d, N * sizeof (int), hipMemcpyDeviceToHost));
+
+  CHECK (hipFree (in_d));
+  CHECK (hipFree (out_d));
+
+  /* Each thread tid stores in_buf[(tid + group_size) % total_threads]
+     into out_buf[tid].  After Phase 1, in_buf[k] == k + 1, so the
+     expected value at out_h[tid] is ((tid + group_size) %
+     total_threads) + 1.  */
+  int errors = 0;
+  for (unsigned int tid = 0; tid < N; tid++)
+    {
+      unsigned int peer = (tid + group_size) % total_threads;
+      int expected = (int) (peer + 1);
+      if (out_h[tid] != expected)
+	{
+	  fprintf (stderr, "mismatch at %u: got %d, expected %d\n",
+		   tid, out_h[tid], expected);
+	  errors++;
+	}
+    }
+
+  if (errors != 0)
+    {
+      fprintf (stderr, "FAILED: %d mismatches\n", errors);
+      return EXIT_FAILURE;
+    }
+
+  return EXIT_SUCCESS;
+}
diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
new file mode 100644
index 00000000000..b2ef18ccd36
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
@@ -0,0 +1,152 @@
+# Copyright (C) 2026 Free Software Foundation, Inc.
+# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for debugging a single-device cooperative kernel that uses
+# cooperative_groups::this_grid ().sync (), implemented on AMD GPUs
+# using the Global Wave Sync (GWS) hardware mechanism.
+#
+# The test verifies that GDB can:
+#   - hit a breakpoint placed before grid.sync () in a cooperatively
+#     launched kernel (i.e. when the dispatch was created with
+#     hipLaunchCooperativeKernel),
+#   - list the cooperative dispatch via "info dispatches",
+#   - observe multiple GPU waves participating in the grid sync,
+#   - hit a breakpoint placed after grid.sync (), and
+#   - resume the kernel to normal completion.
+
+load_lib rocm.exp
+
+require allow_hipcc_tests
+require supports_cooperative_groups
+
+standard_testfile .cpp
+
+if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} {
+    return
+}
+
+# Test: hit a breakpoint inside a cooperative kernel before grid.sync (),
+# inspect the GPU state, then continue past grid.sync () and to program
+# exit.
+
+proc_with_prefix test_break_around_grid_sync {} {
+    clean_restart $::testfile
+
+    with_rocm_gpu_lock {
+	if {![runto_main]} {
+	    return
+	}
+
+	# Place a breakpoint before grid.sync ().  Use "allow-pending"
+	# because the kernel is loaded as device code at dispatch time.
+	gdb_breakpoint \
+	    [gdb_get_line_number "before-sync line"] allow-pending
+
+	# The kernel must hit the pre-sync breakpoint.
+	if {[gdb_continue_to_breakpoint "stop before grid.sync"] != 0} {
+	    return
+	}
+
+	# Verify that waves from multiple workgroups are stopped at the
+	# pre-sync breakpoint.  Counting waves alone is wave-size
+	# dependent (1 wave per workgroup on wave64 vs 2 on wave32) and
+	# would let the test pass on wave32 even if all visible waves
+	# happened to come from a single workgroup.  Instead, collect
+	# the distinct workgroup (block) coordinates from the AMDGPU
+	# Wave entries in "info threads" and require at least two
+	# distinct workgroups, which directly verifies that the
+	# cooperative dispatch's multi-workgroup property is exercised.
+	set blocks {}
+	gdb_test_multiple "info threads" "blocks present pre-sync" -lbl {
+	    -re "AMDGPU Wave \[^(\r\n\]*\\((\[^()\r\n\]+)\\)" {
+		if {[lsearch -exact $blocks $expect_out(1,string)] == -1} {
+		    lappend blocks $expect_out(1,string)
+		}
+		exp_continue
+	    }
+	    -re -wrap "" {
+		gdb_assert {[llength $blocks] >= 2} \
+		    "waves from at least 2 distinct workgroups are present pre-sync"
+	    }
+	}
+
+	# "info dispatches" should show the cooperative dispatch.
+	gdb_test "info dispatches" \
+	    ".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \
+	    "info dispatches lists cooperative dispatch"
+
+	# Replace the pre-sync breakpoint with one after grid.sync ().
+	# The after-sync breakpoint fires once all waves have crossed
+	# the GWS barrier, which proves GWS-protected code can be
+	# debugged across the barrier.
+	delete_breakpoints
+	gdb_breakpoint \
+	    [gdb_get_line_number "after-sync line"] allow-pending
+
+	gdb_continue_to_breakpoint "stop after grid.sync"
+
+	# The kernel should be able to run to completion afterwards.
+	delete_breakpoints
+	gdb_continue_to_end "continue to program end" "continue" 1
+    }
+}
+
+# Test: place a breakpoint inside the cooperative kernel, then iterate
+# over every stopped GPU wave and verify that each one has a backtrace
+# pointing into coop_grid_sync_kernel.  This exercises the wave
+# organization a debugger sees when several waves participate in a GWS
+# barrier.
+
+proc_with_prefix test_threads_in_coop_kernel {} {
+    clean_restart $::testfile
+
+    with_rocm_gpu_lock {
+	if {![runto_main]} {
+	    return
+	}
+
+	gdb_breakpoint \
+	    [gdb_get_line_number "before-sync line"] allow-pending
+
+	if {[gdb_continue_to_breakpoint "stop in cooperative kernel"] != 0} {
+	    return
+	}
+
+	# Collect the list of AMDGPU waves and check that every one of
+	# them has a backtrace pointing into coop_grid_sync_kernel.
+	set waves [info_thread_get_wave_list]
+	gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed"
+
+	foreach wave $waves {
+	    with_test_prefix "wave $wave" {
+		gdb_test "thread $wave" "Switching to thread $wave.*" \
+		    "switch to wave $wave"
+		gdb_test "bt 1" \
+		    "#0\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*" \
+		    "backtrace inside coop_grid_sync_kernel"
+	    }
+	}
+
+	# Resume to completion.
+	delete_breakpoints
+	gdb_continue_to_end "continue to program end" "continue" 1
+    }
+}
+
+test_break_around_grid_sync
+test_threads_in_coop_kernel
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
new file mode 100644
index 00000000000..b392de257d5
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
@@ -0,0 +1,211 @@
+/* Copyright (C) 2026 Free Software Foundation, Inc.
+   Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* This program exercises a multi-device cooperative kernel launch via
+   hipLaunchCooperativeKernelMultiDevice.  The kernel uses both
+   cooperative_groups::this_grid ().sync () (intra-device GWS) and
+   cooperative_groups::this_multi_grid ().sync () (cross-device sync).
+
+   When run on a system with fewer than two GPUs that each support
+   cooperative multi-device launch, the program prints a message and
+   exits cleanly.  */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "rocm-test-utils.h"
+
+namespace cg = cooperative_groups;
+
+constexpr int N_USED_GPUS = 2;
+constexpr unsigned int N_PER_DEVICE = 256;
+constexpr unsigned int group_size = 64;
+constexpr unsigned int num_groups = 2;
+
+/* Two-phase cooperative kernel running on every device.
+
+   Phase 1: each thread writes (grid_rank + 1) * (i + 1) into its slot.
+   GWS    : intra-device grid synchronization via this_grid ().sync ().
+   MGWS   : cross-device synchronization via this_multi_grid ().sync ().
+   Phase 2: thread 0 of each grid writes its partial sum into
+   result[grid_rank + 1] (a slot in the shared host-coherent buffer).
+   Then thread 0 of grid 0 aggregates the partials into result[0].  */
+
+__global__ void
+coop_multi_grid_sync_kernel (int *data, unsigned int n_elements,
+			     long *result)
+{
+  cg::grid_group grid = cg::this_grid ();
+  cg::multi_grid_group mgrid = cg::this_multi_grid ();
+
+  unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int stride = gridDim.x * blockDim.x;
+
+  unsigned int grid_rank = mgrid.grid_rank ();
+
+  /* Phase 1: each thread writes data into its slot.  */
+  for (unsigned int i = tid; i < n_elements; i += stride)
+    data[i] = (int) ((grid_rank + 1) * (i + 1));   /* before-grid-sync line */
+
+  /* Intra-device grid sync (GWS).  */
+  grid.sync ();			       /* grid-sync line */
+
+  /* Each grid computes its partial sum and stores it in the shared
+     buffer at slot (grid_rank + 1).  Slot 0 is reserved for the
+     final total.  */
+  if (tid == 0)
+    {
+      long sum = 0;
+      for (unsigned int i = 0; i < n_elements; i++)
+	sum += data[i];
+      result[grid_rank + 1] = sum;     /* after-grid-sync line */
+    }
+
+  /* Cross-device sync (multi-grid GWS).  */
+  mgrid.sync ();		       /* multi-grid-sync line */
+
+  /* Grid 0 aggregates partial sums from all grids into result[0].  */
+  if (grid_rank == 0 && tid == 0)
+    {
+      long total = 0;
+      unsigned int n_grids = mgrid.num_grids ();
+      for (unsigned int g = 0; g < n_grids; g++)
+	total += result[g + 1];
+      result[0] = total;	       /* after-multi-grid-sync line */
+    }
+}
+
+int
+main ()
+{
+  int n_devices = 0;
+  CHECK (hipGetDeviceCount (&n_devices));
+  if (n_devices < N_USED_GPUS)
+    {
+      printf ("Multi-device cooperative test needs >= %d GPUs"
+	      " (found %d), skipping.\n", N_USED_GPUS, n_devices);
+      return 0;
+    }
+
+  /* Pick the first N_USED_GPUS devices that support
+     cooperativeMultiDeviceLaunch.  In a mixed-architecture system not
+     every GPU has GWS support, so we ignore unsupported ones rather
+     than failing the whole test.  */
+  int selected[N_USED_GPUS];
+  int n_gpus = 0;
+  for (int id = 0; id < n_devices && n_gpus < N_USED_GPUS; id++)
+    {
+      hipDeviceProp_t p;
+      CHECK (hipGetDeviceProperties (&p, id));
+      if (p.cooperativeMultiDeviceLaunch)
+	{
+	  selected[n_gpus] = id;
+	  n_gpus++;
+	}
+    }
+  if (n_gpus < N_USED_GPUS)
+    {
+      printf ("Fewer than %d devices in 0..%d support cooperative"
+	      " multi-device launch, skipping.\n",
+	      N_USED_GPUS, n_devices - 1);
+      return 0;
+    }
+
+  /* n-gpus-final line.  */
+  int *data_d[N_USED_GPUS] = {};
+  hipStream_t stream[N_USED_GPUS] = {};
+
+  /* The shared result buffer is host-coherent so every device can
+     read/write it without explicit memcpys.  Layout:
+       result[0]            -- final total (written by grid 0 only)
+       result[1 .. n_gpus]  -- per-grid partial sums.  */
+  long *result_h = nullptr;
+
+  for (int i = 0; i < n_gpus; i++)
+    {
+      CHECK (hipSetDevice (selected[i]));
+      CHECK (hipMalloc ((void **) &data_d[i], N_PER_DEVICE * sizeof (int)));
+      CHECK (hipMemset (data_d[i], 0, N_PER_DEVICE * sizeof (int)));
+      CHECK (hipStreamCreate (&stream[i]));
+    }
+
+  CHECK (hipSetDevice (selected[0]));
+  CHECK (hipHostMalloc ((void **) &result_h, (n_gpus + 1) * sizeof (long),
+			hipHostMallocCoherent));
+  for (int i = 0; i < n_gpus + 1; i++)
+    result_h[i] = 0;
+
+  /* Build per-device launch parameters.  Each device gets the same
+     grid/block dims and a pointer to *its own* data buffer, but the
+     result pointer is the single shared host-coherent buffer.  */
+  hipLaunchParams launch_params[N_USED_GPUS];
+  void *args[N_USED_GPUS][3];
+  unsigned int n_elements = N_PER_DEVICE;
+
+  for (int i = 0; i < n_gpus; i++)
+    {
+      args[i][0] = (void *) &data_d[i];
+      args[i][1] = (void *) &n_elements;
+      args[i][2] = (void *) &result_h;
+
+      launch_params[i].func
+	= reinterpret_cast<void *> (coop_multi_grid_sync_kernel);
+      launch_params[i].gridDim = dim3 (num_groups, 1, 1);
+      launch_params[i].blockDim = dim3 (group_size, 1, 1);
+      launch_params[i].sharedMem = 0;
+      launch_params[i].stream = stream[i];
+      launch_params[i].args = args[i];
+    }
+
+  CHECK (hipLaunchCooperativeKernelMultiDevice (launch_params, n_gpus, 0));
+
+  for (int i = 0; i < n_gpus; i++)
+    {
+      CHECK (hipSetDevice (selected[i]));
+      CHECK (hipDeviceSynchronize ());
+    }
+
+  long total = result_h[0];
+
+  for (int i = 0; i < n_gpus; i++)
+    {
+      CHECK (hipSetDevice (selected[i]));
+      CHECK (hipFree (data_d[i]));
+      CHECK (hipStreamDestroy (stream[i]));
+    }
+  CHECK (hipSetDevice (selected[0]));
+  CHECK (hipHostFree (result_h));
+
+  /* Compute the expected total: for grid g (0-based), the thread that
+     writes slot i stores (g + 1) * (i + 1).  Per grid, the sum is
+     (g + 1) * N_PER_DEVICE * (N_PER_DEVICE + 1) / 2.  */
+  long per_grid = (long) N_PER_DEVICE * (N_PER_DEVICE + 1) / 2;
+  long expected = 0;
+  for (int g = 0; g < n_gpus; g++)
+    expected += (long) (g + 1) * per_grid;
+
+  if (total != expected)
+    {
+      fprintf (stderr, "FAILED: total %ld, expected %ld\n", total, expected);
+      return EXIT_FAILURE;
+    }
+
+  return EXIT_SUCCESS;
+}
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
new file mode 100644
index 00000000000..53111e9da62
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
@@ -0,0 +1,147 @@
+# Copyright (C) 2026 Free Software Foundation, Inc.
+# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for debugging a multi-device cooperative kernel that uses
+# cooperative_groups::this_grid ().sync () within each device and
+# cooperative_groups::this_multi_grid ().sync () across devices.  Both
+# barriers are implemented on AMD GPUs using the GWS hardware
+# mechanism.
+#
+# The test verifies that GDB can:
+#   - read the number of participating GPUs from the inferior,
+#   - hit a kernel breakpoint inside the cooperative dispatch on
+#     *every* participating GPU (matched via the per-device
+#     "Breakpoint <id>.<instance>" subscript that hipLaunchCooperative
+#     KernelMultiDevice produces), and
+#   - resume the cross-device dispatch through both the intra-device
+#     this_grid ().sync () and the cross-device this_multi_grid ().sync ()
+#     to normal program completion.
+
+load_lib rocm.exp
+
+require allow_hipcc_tests
+require hip_devices_support_debug_multi_process
+require supports_cooperative_groups
+
+standard_testfile .cpp
+
+if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} {
+    return
+}
+
+# Test: hit a breakpoint inside the cooperative kernel on every
+# participating GPU, then resume the dispatch through both barriers
+# and to program completion.
+
+proc_with_prefix test_break_in_multi_coop_kernel {} {
+    # Multi-device cooperative dispatches involve waves on more than
+    # one device making forward progress before the multi-grid sync
+    # releases.  Run the inferior in non-stop mode so the debugger can
+    # observe per-device stops independently.
+    save_vars { ::GDBFLAGS } {
+	append ::GDBFLAGS " -ex \"set non-stop on\""
+	clean_restart $::testfile
+    }
+
+    with_rocm_gpu_lock {
+	if {![runto_main]} {
+	    return
+	}
+
+	# Advance to the point where the inferior has finalized "n_gpus"
+	# (the count of GPUs it will actually use for the cooperative
+	# dispatch) so we can read it from gdb.  The require gates above
+	# cover architecture-level support; per-device runtime gating
+	# (>= 2 devices that advertise cooperativeMultiDeviceLaunch) is
+	# done inside main () in the .cpp, which self-skips with a clean
+	# exit when those preconditions are not met.  Treat that clean
+	# exit as UNSUPPORTED rather than a FAIL.
+	gdb_breakpoint \
+	    [gdb_get_line_number "n-gpus-final line"] allow-pending
+
+	set eligible 0
+	gdb_test_multiple "continue" "advance to n-gpus-final" {
+	    -re "hit Breakpoint $::decimal,\[^\r\n\]*main \\(\\)\[^\r\n\]*\r\n\[^\r\n\]*\r\n$::gdb_prompt " {
+		set eligible 1
+		pass $gdb_test_name
+	    }
+	    -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " {
+		unsupported "multi-device cooperative launch not available"
+	    }
+	}
+	if { !$eligible } {
+	    return
+	}
+
+	set n_gpus [get_integer_valueof "n_gpus" 0]
+	gdb_assert {$n_gpus >= 2} "n_gpus is at least 2"
+	if { $n_gpus < 2 } {
+	    return
+	}
+
+	# Replace the marker breakpoint with the kernel breakpoint and
+	# run all threads in the background so the host can finish
+	# launching kernels on every device.
+	delete_breakpoints
+	gdb_breakpoint \
+	    [gdb_get_line_number "before-grid-sync line"] allow-pending
+
+	gdb_test_multiple "continue -a &" "background continue" {
+	    -re "Continuing\\.\r\n$::gdb_prompt " {
+		pass $gdb_test_name
+	    }
+	}
+
+	# In non-stop mode, hipLaunchCooperativeKernelMultiDevice
+	# produces one child breakpoint instance per participating GPU
+	# ("Breakpoint <id>.<inst>").  Collect distinct <inst> values
+	# until we have observed a stop on every GPU; only then is it
+	# safe to delete the breakpoint and let the dispatch run
+	# through both grid syncs.
+	array unset gpu_seen
+	set distinct_gpus 0
+	gdb_test_multiple "" "breakpoint hit on every participating GPU" {
+	    -re "\r\nThread $::decimal\[^\r\n\]* hit Breakpoint $::decimal\\.($::decimal),\[^\r\n\]*\r\n" {
+		set inst $expect_out(1,string)
+		if {![info exists gpu_seen($inst)]} {
+		    set gpu_seen($inst) 1
+		    incr distinct_gpus
+		}
+		if {$distinct_gpus >= $n_gpus} {
+		    pass $gdb_test_name
+		} else {
+		    exp_continue
+		}
+	    }
+	}
+
+	# All participating GPUs have been observed stopping at the
+	# kernel breakpoint.  Remove the breakpoint and let the
+	# dispatch run through this_grid ().sync () and
+	# this_multi_grid ().sync () to clean program exit.
+	delete_breakpoints
+
+	gdb_test_multiple "continue -a" "continue all to end" {
+	    -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " {
+		pass $gdb_test_name
+	    }
+	}
+    }
+}
+
+test_break_in_multi_coop_kernel
diff --git a/gdb/testsuite/lib/rocm.exp b/gdb/testsuite/lib/rocm.exp
index c937f92e24e..44273e40832 100644
--- a/gdb/testsuite/lib/rocm.exp
+++ b/gdb/testsuite/lib/rocm.exp
@@ -412,6 +412,46 @@ proc hip_devices_support_debug_multi_process {} {
     return 1
 }
 
+# Helpers below gate cooperative-group debugging tests, i.e. tests that
+# debug kernels launched with hipLaunchCooperativeKernel /
+# hipLaunchCooperativeKernelMultiDevice and synchronized via
+# cooperative_groups::this_grid ().sync () / this_multi_grid ().sync ().
+#
+# This is a debugger-side gate, distinct from what the HIP runtime
+# advertises via hipDeviceProp_t::cooperativeLaunch /
+# cooperativeMultiDeviceLaunch: the Global Wave Sync (GWS) hardware
+# implementing those barriers exists on architectures where the kernel
+# runs correctly outside the debugger but rocgdb cannot yet step or
+# breakpoint through the cooperative dispatch.
+
+# Return true if TARGET (a gfx name like "gfx90a") is an architecture
+# where rocgdb supports debugging cooperative-group kernels.
+
+proc target_supports_cooperative_groups { target } {
+    set unsupported_targets {
+	gfx1100 gfx1101 gfx1102 gfx1103
+    }
+
+    return [expr {[lsearch -exact $unsupported_targets $target] == -1}]
+}
+
+# Return true if every AMDGPU device on the system is an architecture
+# where rocgdb supports debugging cooperative-group kernels.
+
+proc supports_cooperative_groups {} {
+    set targets [find_amdgpu_devices]
+    if { [llength $targets] == 0 } {
+	return 0
+    }
+
+    foreach target $targets {
+	if { ![target_supports_cooperative_groups $target] } {
+	    return 0
+	}
+    }
+    return 1
+}
+
 # Return true if the current device's version is less than VERSION.
 #
 # VERSION must be the "gfx" name of the device, such as gfx906 or gfx90a.

From a00cfb32cfcb9088fe05caa7e6a3f305a8075545 Mon Sep 17 00:00:00 2001
From: Sarang Patrange <spatrang@amd.com>
Date: Tue, 9 Jun 2026 05:23:55 -0500
Subject: [PATCH 2/2] gdb, testsuite: improve gdb.rocm cooperative group GWS
 tests

Refine the cooperative-group GWS tests for robustness and to make the
debugger behaviour under test more explicit:

  * coop-group-grid-sync.exp: use temporary breakpoints for the
    in-kernel locations (still pending, since the device code is
    loaded at dispatch time) and drop the redundant delete_breakpoints
    calls.

  * coop-group-grid-sync.exp: report UNSUPPORTED instead of FAIL when
    the inferior self-skips because no device advertises
    cooperativeLaunch.

  * coop-group-grid-sync.exp: have test_threads_in_coop_kernel check
    distinct per-wave blockIdx.x (per-wave register context) and
    per-lane threadIdx.x divergence (per-lane SIMT state), instead of
    only confirming that the backtrace names the kernel.

  * coop-group-multi-grid-sync.{cpp,exp}: read n_gpus from a marker
    line that is reached on every execution (no early return before
    it), so that when fewer than two cooperative-capable GPUs are
    available -- including when a parallel test run restricts the
    visible GPUs -- the test reports UNSUPPORTED rather than FAILing.

  * coop-group-{grid,multi-grid}-sync.{cpp,exp}: minor comment and
    GNU-style cleanups -- tab-align the in-kernel marker comment, keep
    hipLaunchCooperativeKernelMultiDevice on a single line, and
    clarify the Phase 2 data-dependency comment.

  * coop-group-{grid,multi-grid}-sync.{cpp,exp}: give the per-wave and
    per-lane value reads explicit, unique test names, and keep the
    "n-gpus-final" marker string unique so gdb_get_line_number
    resolves the intended line.

Tested on gfx942 with an in-tree build and the 7.14 nightly rocgdb,
both with all GPUs visible and with the visible set restricted to one.
---
 .../gdb.rocm/coop-group-grid-sync.cpp         |  5 +-
 .../gdb.rocm/coop-group-grid-sync.exp         | 87 +++++++++++++------
 .../gdb.rocm/coop-group-multi-grid-sync.cpp   | 20 ++---
 .../gdb.rocm/coop-group-multi-grid-sync.exp   | 50 ++++-------
 4 files changed, 90 insertions(+), 72 deletions(-)

diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
index 14d395e1fd8..911f4385a6a 100644
--- a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
@@ -40,8 +40,9 @@ namespace cg = cooperative_groups;
    in the GWS barrier.  Two workgroups of 64 threads each give 128
    threads in total.
 
-   N is exactly one slot per thread so Phase 2 has no intra-thread race
-   and the host-side expected values can be computed straightforwardly.  */
+   N is exactly one slot per thread so Phase 2 has no inter-thread
+   write conflict and the host-side expected values can be computed
+   straightforwardly.  */
 
 constexpr unsigned int group_size = 64;
 constexpr unsigned int num_groups = 2;
diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
index b2ef18ccd36..2d18f94e0fc 100644
--- a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
+++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
@@ -55,10 +55,22 @@ proc_with_prefix test_break_around_grid_sync {} {
 	# Place a breakpoint before grid.sync ().  Use "allow-pending"
 	# because the kernel is loaded as device code at dispatch time.
 	gdb_breakpoint \
-	    [gdb_get_line_number "before-sync line"] allow-pending
-
-	# The kernel must hit the pre-sync breakpoint.
-	if {[gdb_continue_to_breakpoint "stop before grid.sync"] != 0} {
+	    [gdb_get_line_number "before-sync line"] allow-pending temporary
+
+	# Continue into the kernel.  If the program self-skips (no device
+	# advertises cooperativeLaunch) it exits cleanly; treat that as
+	# UNSUPPORTED rather than a FAIL.
+	set reached 0
+	gdb_test_multiple "continue" "stop before grid.sync" {
+	    -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" {
+		set reached 1
+		pass $gdb_test_name
+	    }
+	    -re -wrap "$::inferior_exited_re normally.*" {
+		unsupported "cooperative launch not available"
+	    }
+	}
+	if { !$reached } {
 	    return
 	}
 
@@ -90,27 +102,25 @@ proc_with_prefix test_break_around_grid_sync {} {
 	    ".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \
 	    "info dispatches lists cooperative dispatch"
 
-	# Replace the pre-sync breakpoint with one after grid.sync ().
-	# The after-sync breakpoint fires once all waves have crossed
-	# the GWS barrier, which proves GWS-protected code can be
-	# debugged across the barrier.
-	delete_breakpoints
+	# Place a breakpoint after grid.sync ().  It fires once all waves
+	# have crossed the GWS barrier, which proves GWS-protected code
+	# can be debugged across the barrier.
 	gdb_breakpoint \
-	    [gdb_get_line_number "after-sync line"] allow-pending
+	    [gdb_get_line_number "after-sync line"] allow-pending temporary
 
 	gdb_continue_to_breakpoint "stop after grid.sync"
 
 	# The kernel should be able to run to completion afterwards.
-	delete_breakpoints
 	gdb_continue_to_end "continue to program end" "continue" 1
     }
 }
 
-# Test: place a breakpoint inside the cooperative kernel, then iterate
-# over every stopped GPU wave and verify that each one has a backtrace
-# pointing into coop_grid_sync_kernel.  This exercises the wave
-# organization a debugger sees when several waves participate in a GWS
-# barrier.
+# Test: stop inside the cooperative kernel with several co-resident
+# waves parked at the barrier and check that gdb reports correct,
+# distinct per-wave and per-lane state.  This is the debugger
+# behaviour of interest: that gdb selects the right register context
+# per wave and the right SIMT state per lane, not merely that the
+# waves are in the kernel.
 
 proc_with_prefix test_threads_in_coop_kernel {} {
     clean_restart $::testfile
@@ -121,29 +131,56 @@ proc_with_prefix test_threads_in_coop_kernel {} {
 	}
 
 	gdb_breakpoint \
-	    [gdb_get_line_number "before-sync line"] allow-pending
-
-	if {[gdb_continue_to_breakpoint "stop in cooperative kernel"] != 0} {
+	    [gdb_get_line_number "before-sync line"] allow-pending temporary
+
+	# Continue into the kernel.  If the program self-skips (no device
+	# advertises cooperativeLaunch) it exits cleanly; treat that as
+	# UNSUPPORTED rather than a FAIL.
+	set reached 0
+	gdb_test_multiple "continue" "stop in cooperative kernel" {
+	    -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" {
+		set reached 1
+		pass $gdb_test_name
+	    }
+	    -re -wrap "$::inferior_exited_re normally.*" {
+		unsupported "cooperative launch not available"
+	    }
+	}
+	if { !$reached } {
 	    return
 	}
 
-	# Collect the list of AMDGPU waves and check that every one of
-	# them has a backtrace pointing into coop_grid_sync_kernel.
 	set waves [info_thread_get_wave_list]
 	gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed"
 
+	# Per-wave register context: switching to each wave and reading
+	# blockIdx.x must yield that wave's owning workgroup, and across
+	# all waves we must observe more than one distinct workgroup.
+	set blocks {}
 	foreach wave $waves {
 	    with_test_prefix "wave $wave" {
 		gdb_test "thread $wave" "Switching to thread $wave.*" \
 		    "switch to wave $wave"
-		gdb_test "bt 1" \
-		    "#0\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*" \
-		    "backtrace inside coop_grid_sync_kernel"
+		set bidx [get_integer_valueof "blockIdx.x" -1 "blockIdx.x"]
+		if {[lsearch -exact $blocks $bidx] == -1} {
+		    lappend blocks $bidx
+		}
 	    }
 	}
+	gdb_assert {[llength $blocks] >= 2} \
+	    "distinct workgroups observed across waves"
+
+	# Per-lane (SIMT) state: within a single wave, different lanes map
+	# to different threads, so threadIdx.x must differ between lanes.
+	gdb_test "thread [lindex $waves 0]" "Switching to thread .*" \
+	    "switch to first wave for lane inspection"
+	gdb_test "lane 0" ".*" "switch to lane 0"
+	set tid0 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 0"]
+	gdb_test "lane 1" ".*" "switch to lane 1"
+	set tid1 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 1"]
+	gdb_assert {$tid0 != $tid1} "distinct threadIdx.x across lanes"
 
 	# Resume to completion.
-	delete_breakpoints
 	gdb_continue_to_end "continue to program end" "continue" 1
     }
 }
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
index b392de257d5..e29b3da8d56 100644
--- a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp
@@ -62,7 +62,7 @@ coop_multi_grid_sync_kernel (int *data, unsigned int n_elements,
 
   /* Phase 1: each thread writes data into its slot.  */
   for (unsigned int i = tid; i < n_elements; i += stride)
-    data[i] = (int) ((grid_rank + 1) * (i + 1));   /* before-grid-sync line */
+    data[i] = (int) ((grid_rank + 1) * (i + 1));	/* before-grid-sync line */
 
   /* Intra-device grid sync (GWS).  */
   grid.sync ();			       /* grid-sync line */
@@ -97,17 +97,14 @@ main ()
 {
   int n_devices = 0;
   CHECK (hipGetDeviceCount (&n_devices));
-  if (n_devices < N_USED_GPUS)
-    {
-      printf ("Multi-device cooperative test needs >= %d GPUs"
-	      " (found %d), skipping.\n", N_USED_GPUS, n_devices);
-      return 0;
-    }
 
   /* Pick the first N_USED_GPUS devices that support
      cooperativeMultiDeviceLaunch.  In a mixed-architecture system not
      every GPU has GWS support, so we ignore unsupported ones rather
-     than failing the whole test.  */
+     than failing the whole test.  This loop runs unconditionally (it
+     is bounded by n_devices) so the marker below is always reached,
+     even when too few devices are available; the companion .exp reads
+     n_gpus there to decide whether to skip.  */
   int selected[N_USED_GPUS];
   int n_gpus = 0;
   for (int id = 0; id < n_devices && n_gpus < N_USED_GPUS; id++)
@@ -120,15 +117,14 @@ main ()
 	  n_gpus++;
 	}
     }
-  if (n_gpus < N_USED_GPUS)
+  if (n_gpus < N_USED_GPUS)	/* n-gpus-final line */
     {
-      printf ("Fewer than %d devices in 0..%d support cooperative"
+      printf ("Fewer than %d of the %d HIP device(s) support cooperative"
 	      " multi-device launch, skipping.\n",
-	      N_USED_GPUS, n_devices - 1);
+	      N_USED_GPUS, n_devices);
       return 0;
     }
 
-  /* n-gpus-final line.  */
   int *data_d[N_USED_GPUS] = {};
   hipStream_t stream[N_USED_GPUS] = {};
 
diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
index 53111e9da62..f192ddc0d28 100644
--- a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
+++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp
@@ -26,8 +26,8 @@
 #   - read the number of participating GPUs from the inferior,
 #   - hit a kernel breakpoint inside the cooperative dispatch on
 #     *every* participating GPU (matched via the per-device
-#     "Breakpoint <id>.<instance>" subscript that hipLaunchCooperative
-#     KernelMultiDevice produces), and
+#     "Breakpoint <id>.<instance>" subscript that
+#     hipLaunchCooperativeKernelMultiDevice produces), and
 #   - resume the cross-device dispatch through both the intra-device
 #     this_grid ().sync () and the cross-device this_multi_grid ().sync ()
 #     to normal program completion.
@@ -63,34 +63,19 @@ proc_with_prefix test_break_in_multi_coop_kernel {} {
 	    return
 	}
 
-	# Advance to the point where the inferior has finalized "n_gpus"
-	# (the count of GPUs it will actually use for the cooperative
-	# dispatch) so we can read it from gdb.  The require gates above
-	# cover architecture-level support; per-device runtime gating
-	# (>= 2 devices that advertise cooperativeMultiDeviceLaunch) is
-	# done inside main () in the .cpp, which self-skips with a clean
-	# exit when those preconditions are not met.  Treat that clean
-	# exit as UNSUPPORTED rather than a FAIL.
-	gdb_breakpoint \
-	    [gdb_get_line_number "n-gpus-final line"] allow-pending
-
-	set eligible 0
-	gdb_test_multiple "continue" "advance to n-gpus-final" {
-	    -re "hit Breakpoint $::decimal,\[^\r\n\]*main \\(\\)\[^\r\n\]*\r\n\[^\r\n\]*\r\n$::gdb_prompt " {
-		set eligible 1
-		pass $gdb_test_name
-	    }
-	    -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " {
-		unsupported "multi-device cooperative launch not available"
-	    }
-	}
-	if { !$eligible } {
-	    return
-	}
+	# Break where the inferior has finalized "n_gpus" (the count of
+	# GPUs it will use for the cooperative dispatch).  This line runs
+	# on every execution, before the inferior's own eligibility
+	# check, so reading "n_gpus" from gdb lets us decide here whether
+	# to proceed.  The require gates above cover architecture-level
+	# support; per-device runtime gating (>= 2 devices that advertise
+	# cooperativeMultiDeviceLaunch) can only be known at this point.
+	gdb_breakpoint [gdb_get_line_number "n-gpus-final line"] temporary
+	gdb_continue_to_breakpoint "advance to n-gpus-final"
 
 	set n_gpus [get_integer_valueof "n_gpus" 0]
-	gdb_assert {$n_gpus >= 2} "n_gpus is at least 2"
 	if { $n_gpus < 2 } {
+	    unsupported "fewer than two cooperative-capable GPUs"
 	    return
 	}
 
@@ -107,12 +92,11 @@ proc_with_prefix test_break_in_multi_coop_kernel {} {
 	    }
 	}
 
-	# In non-stop mode, hipLaunchCooperativeKernelMultiDevice
-	# produces one child breakpoint instance per participating GPU
-	# ("Breakpoint <id>.<inst>").  Collect distinct <inst> values
-	# until we have observed a stop on every GPU; only then is it
-	# safe to delete the breakpoint and let the dispatch run
-	# through both grid syncs.
+	# Check that gdb resolves the single source breakpoint to one
+	# child location per participating GPU ("Breakpoint <id>.<inst>")
+	# and, in non-stop mode, reports each device-side stop
+	# independently.  Collect distinct <inst> values until a stop has
+	# been seen for every GPU.
 	array unset gpu_seen
 	set distinct_gpus 0
 	gdb_test_multiple "" "breakpoint hit on every participating GPU" {