From 714c354a12c90d05ec0adcdaa6ec7752190e48c8 Mon Sep 17 00:00:00 2001 From: Sarang Patrange Date: Thu, 4 Jun 2026 09:23:44 -0500 Subject: [PATCH 1/2] gdb, testsuite: add gdb.rocm tests for cooperative group GWS debugging Add dejagnu coverage for debugging AMD GPU cooperative-group kernels (hipLaunchCooperativeKernel / hipLaunchCooperativeKernelMultiDevice), which synchronize at the grid / multi-grid level via Global Wave Sync (GWS). Previously covered only by out-of-tree tests. New tests: * gdb.rocm/coop-group-grid-sync.{cpp,exp} Single-device, this_grid ().sync (). * gdb.rocm/coop-group-multi-grid-sync.{cpp,exp} Multi-device, this_grid ().sync () + this_multi_grid ().sync (); runs in non-stop mode. Host-side post-conditions validate the cooperative semantics numerically, so any regression in GWS behaviour under the debugger surfaces as a test failure rather than a silent miscompare. The tests pick a debugger-supported device at runtime and self-skip with UNSUPPORTED when the configuration is insufficient. Two helpers added in gdb/testsuite/lib/rocm.exp: target_supports_cooperative_groups (per-target gate, returns false on gfx1100/1101/1102/1103 per amd-dbgapi.h) and supports_cooperative_groups (require-gate wrapper used by both .exp files). This is a debugger-side gate, distinct from the runtime's cooperativeLaunch / cooperativeMultiDeviceLaunch flags. --- .../gdb.rocm/coop-group-grid-sync.cpp | 168 ++++++++++++++ .../gdb.rocm/coop-group-grid-sync.exp | 152 +++++++++++++ .../gdb.rocm/coop-group-multi-grid-sync.cpp | 211 ++++++++++++++++++ .../gdb.rocm/coop-group-multi-grid-sync.exp | 147 ++++++++++++ gdb/testsuite/lib/rocm.exp | 40 ++++ 5 files changed, 718 insertions(+) create mode 100644 gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp create mode 100644 gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp create mode 100644 gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp create mode 100644 gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp new file mode 100644 index 00000000000..14d395e1fd8 --- /dev/null +++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp @@ -0,0 +1,168 @@ +/* Copyright (C) 2026 Free Software Foundation, Inc. + Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + + This file is part of GDB. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* This program exercises a single-device cooperative kernel launch. + The kernel uses cooperative_groups::this_grid ().sync (), which is + implemented on AMD GPUs using the Global Wave Sync (GWS) hardware + mechanism. It is launched through hipLaunchCooperativeKernel so + the whole grid is co-resident and can synchronize at the grid + level. + + The companion .exp file uses this program to exercise the debugger + while a kernel is running with GWS-based grid synchronization. */ + +#include +#include +#include +#include + +#include "rocm-test-utils.h" + +namespace cg = cooperative_groups; + +/* Use small but non-trivial launch dimensions so the test runs quickly + while still creating waves from multiple workgroups participating + in the GWS barrier. Two workgroups of 64 threads each give 128 + threads in total. + + N is exactly one slot per thread so Phase 2 has no intra-thread race + and the host-side expected values can be computed straightforwardly. */ + +constexpr unsigned int group_size = 64; +constexpr unsigned int num_groups = 2; +constexpr unsigned int total_threads = group_size * num_groups; +constexpr unsigned int N = total_threads; + +/* Two-phase grid-cooperative kernel. + + Phase 1: every thread writes its slot in in_buf. + GWS : the whole grid synchronizes via cooperative_groups::this_grid (). + Phase 2: every thread reads a slot owned by a thread in a *different* + workgroup and stores it into out_buf. + + Because the Phase 2 read targets a slot written by a different + workgroup during Phase 1, this is only correct if grid.sync () + actually synchronized every wave in the grid. */ + +__global__ void +coop_grid_sync_kernel (int *in_buf, int *out_buf) +{ + cg::grid_group grid = cg::this_grid (); + + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + /* Phase 1: each thread writes its own value. */ + in_buf[tid] = (int) (tid + 1); /* before-sync line */ + + /* Grid-wide synchronization (GWS). */ + grid.sync (); /* sync line */ + + /* Phase 2: read the slot owned by a thread in a different workgroup + and store it in our own slot of the output buffer. */ + unsigned int peer = (tid + blockDim.x) % total_threads; + int v = in_buf[peer]; /* after-sync line */ + out_buf[tid] = v; +} + +int +main () +{ + int n_devices = 0; + CHECK (hipGetDeviceCount (&n_devices)); + if (n_devices <= 0) + { + printf ("No HIP devices found, skipping.\n"); + return 0; + } + + int device_id = -1; + hipDeviceProp_t props; + for (int id = 0; id < n_devices; ++id) + { + CHECK (hipGetDeviceProperties (&props, id)); + if (props.cooperativeLaunch) + { + device_id = id; + break; + } + } + + if (device_id < 0) + { + printf ("None of the %d HIP device(s) support cooperative launch, " + "skipping.\n", + n_devices); + return 0; + } + + CHECK (hipSetDevice (device_id)); + + int *in_d = nullptr; + int *out_d = nullptr; + CHECK (hipMalloc ((void **) &in_d, N * sizeof (int))); + CHECK (hipMalloc ((void **) &out_d, N * sizeof (int))); + CHECK (hipMemset (in_d, 0, N * sizeof (int))); + CHECK (hipMemset (out_d, 0, N * sizeof (int))); + + dim3 grid_dim (num_groups, 1, 1); + dim3 block_dim (group_size, 1, 1); + + void *kernel_args[2]; + kernel_args[0] = (void *) &in_d; + kernel_args[1] = (void *) &out_d; + + /* Launch the kernel cooperatively. This is the API that enables + grid.sync () support via GWS. The trailing 0, 0 arguments are + sharedMem and stream respectively. */ + CHECK (hipLaunchCooperativeKernel + (reinterpret_cast (coop_grid_sync_kernel), + grid_dim, block_dim, kernel_args, 0, 0)); + + CHECK (hipDeviceSynchronize ()); + + int out_h[N]; + CHECK (hipMemcpy (out_h, out_d, N * sizeof (int), hipMemcpyDeviceToHost)); + + CHECK (hipFree (in_d)); + CHECK (hipFree (out_d)); + + /* Each thread tid stores in_buf[(tid + group_size) % total_threads] + into out_buf[tid]. After Phase 1, in_buf[k] == k + 1, so the + expected value at out_h[tid] is ((tid + group_size) % + total_threads) + 1. */ + int errors = 0; + for (unsigned int tid = 0; tid < N; tid++) + { + unsigned int peer = (tid + group_size) % total_threads; + int expected = (int) (peer + 1); + if (out_h[tid] != expected) + { + fprintf (stderr, "mismatch at %u: got %d, expected %d\n", + tid, out_h[tid], expected); + errors++; + } + } + + if (errors != 0) + { + fprintf (stderr, "FAILED: %d mismatches\n", errors); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp new file mode 100644 index 00000000000..b2ef18ccd36 --- /dev/null +++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp @@ -0,0 +1,152 @@ +# Copyright (C) 2026 Free Software Foundation, Inc. +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + +# This file is part of GDB. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Tests for debugging a single-device cooperative kernel that uses +# cooperative_groups::this_grid ().sync (), implemented on AMD GPUs +# using the Global Wave Sync (GWS) hardware mechanism. +# +# The test verifies that GDB can: +# - hit a breakpoint placed before grid.sync () in a cooperatively +# launched kernel (i.e. when the dispatch was created with +# hipLaunchCooperativeKernel), +# - list the cooperative dispatch via "info dispatches", +# - observe multiple GPU waves participating in the grid sync, +# - hit a breakpoint placed after grid.sync (), and +# - resume the kernel to normal completion. + +load_lib rocm.exp + +require allow_hipcc_tests +require supports_cooperative_groups + +standard_testfile .cpp + +if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} { + return +} + +# Test: hit a breakpoint inside a cooperative kernel before grid.sync (), +# inspect the GPU state, then continue past grid.sync () and to program +# exit. + +proc_with_prefix test_break_around_grid_sync {} { + clean_restart $::testfile + + with_rocm_gpu_lock { + if {![runto_main]} { + return + } + + # Place a breakpoint before grid.sync (). Use "allow-pending" + # because the kernel is loaded as device code at dispatch time. + gdb_breakpoint \ + [gdb_get_line_number "before-sync line"] allow-pending + + # The kernel must hit the pre-sync breakpoint. + if {[gdb_continue_to_breakpoint "stop before grid.sync"] != 0} { + return + } + + # Verify that waves from multiple workgroups are stopped at the + # pre-sync breakpoint. Counting waves alone is wave-size + # dependent (1 wave per workgroup on wave64 vs 2 on wave32) and + # would let the test pass on wave32 even if all visible waves + # happened to come from a single workgroup. Instead, collect + # the distinct workgroup (block) coordinates from the AMDGPU + # Wave entries in "info threads" and require at least two + # distinct workgroups, which directly verifies that the + # cooperative dispatch's multi-workgroup property is exercised. + set blocks {} + gdb_test_multiple "info threads" "blocks present pre-sync" -lbl { + -re "AMDGPU Wave \[^(\r\n\]*\\((\[^()\r\n\]+)\\)" { + if {[lsearch -exact $blocks $expect_out(1,string)] == -1} { + lappend blocks $expect_out(1,string) + } + exp_continue + } + -re -wrap "" { + gdb_assert {[llength $blocks] >= 2} \ + "waves from at least 2 distinct workgroups are present pre-sync" + } + } + + # "info dispatches" should show the cooperative dispatch. + gdb_test "info dispatches" \ + ".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \ + "info dispatches lists cooperative dispatch" + + # Replace the pre-sync breakpoint with one after grid.sync (). + # The after-sync breakpoint fires once all waves have crossed + # the GWS barrier, which proves GWS-protected code can be + # debugged across the barrier. + delete_breakpoints + gdb_breakpoint \ + [gdb_get_line_number "after-sync line"] allow-pending + + gdb_continue_to_breakpoint "stop after grid.sync" + + # The kernel should be able to run to completion afterwards. + delete_breakpoints + gdb_continue_to_end "continue to program end" "continue" 1 + } +} + +# Test: place a breakpoint inside the cooperative kernel, then iterate +# over every stopped GPU wave and verify that each one has a backtrace +# pointing into coop_grid_sync_kernel. This exercises the wave +# organization a debugger sees when several waves participate in a GWS +# barrier. + +proc_with_prefix test_threads_in_coop_kernel {} { + clean_restart $::testfile + + with_rocm_gpu_lock { + if {![runto_main]} { + return + } + + gdb_breakpoint \ + [gdb_get_line_number "before-sync line"] allow-pending + + if {[gdb_continue_to_breakpoint "stop in cooperative kernel"] != 0} { + return + } + + # Collect the list of AMDGPU waves and check that every one of + # them has a backtrace pointing into coop_grid_sync_kernel. + set waves [info_thread_get_wave_list] + gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed" + + foreach wave $waves { + with_test_prefix "wave $wave" { + gdb_test "thread $wave" "Switching to thread $wave.*" \ + "switch to wave $wave" + gdb_test "bt 1" \ + "#0\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*" \ + "backtrace inside coop_grid_sync_kernel" + } + } + + # Resume to completion. + delete_breakpoints + gdb_continue_to_end "continue to program end" "continue" 1 + } +} + +test_break_around_grid_sync +test_threads_in_coop_kernel diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp new file mode 100644 index 00000000000..b392de257d5 --- /dev/null +++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp @@ -0,0 +1,211 @@ +/* Copyright (C) 2026 Free Software Foundation, Inc. + Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + + This file is part of GDB. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* This program exercises a multi-device cooperative kernel launch via + hipLaunchCooperativeKernelMultiDevice. The kernel uses both + cooperative_groups::this_grid ().sync () (intra-device GWS) and + cooperative_groups::this_multi_grid ().sync () (cross-device sync). + + When run on a system with fewer than two GPUs that each support + cooperative multi-device launch, the program prints a message and + exits cleanly. */ + +#include +#include +#include +#include + +#include "rocm-test-utils.h" + +namespace cg = cooperative_groups; + +constexpr int N_USED_GPUS = 2; +constexpr unsigned int N_PER_DEVICE = 256; +constexpr unsigned int group_size = 64; +constexpr unsigned int num_groups = 2; + +/* Two-phase cooperative kernel running on every device. + + Phase 1: each thread writes (grid_rank + 1) * (i + 1) into its slot. + GWS : intra-device grid synchronization via this_grid ().sync (). + MGWS : cross-device synchronization via this_multi_grid ().sync (). + Phase 2: thread 0 of each grid writes its partial sum into + result[grid_rank + 1] (a slot in the shared host-coherent buffer). + Then thread 0 of grid 0 aggregates the partials into result[0]. */ + +__global__ void +coop_multi_grid_sync_kernel (int *data, unsigned int n_elements, + long *result) +{ + cg::grid_group grid = cg::this_grid (); + cg::multi_grid_group mgrid = cg::this_multi_grid (); + + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int stride = gridDim.x * blockDim.x; + + unsigned int grid_rank = mgrid.grid_rank (); + + /* Phase 1: each thread writes data into its slot. */ + for (unsigned int i = tid; i < n_elements; i += stride) + data[i] = (int) ((grid_rank + 1) * (i + 1)); /* before-grid-sync line */ + + /* Intra-device grid sync (GWS). */ + grid.sync (); /* grid-sync line */ + + /* Each grid computes its partial sum and stores it in the shared + buffer at slot (grid_rank + 1). Slot 0 is reserved for the + final total. */ + if (tid == 0) + { + long sum = 0; + for (unsigned int i = 0; i < n_elements; i++) + sum += data[i]; + result[grid_rank + 1] = sum; /* after-grid-sync line */ + } + + /* Cross-device sync (multi-grid GWS). */ + mgrid.sync (); /* multi-grid-sync line */ + + /* Grid 0 aggregates partial sums from all grids into result[0]. */ + if (grid_rank == 0 && tid == 0) + { + long total = 0; + unsigned int n_grids = mgrid.num_grids (); + for (unsigned int g = 0; g < n_grids; g++) + total += result[g + 1]; + result[0] = total; /* after-multi-grid-sync line */ + } +} + +int +main () +{ + int n_devices = 0; + CHECK (hipGetDeviceCount (&n_devices)); + if (n_devices < N_USED_GPUS) + { + printf ("Multi-device cooperative test needs >= %d GPUs" + " (found %d), skipping.\n", N_USED_GPUS, n_devices); + return 0; + } + + /* Pick the first N_USED_GPUS devices that support + cooperativeMultiDeviceLaunch. In a mixed-architecture system not + every GPU has GWS support, so we ignore unsupported ones rather + than failing the whole test. */ + int selected[N_USED_GPUS]; + int n_gpus = 0; + for (int id = 0; id < n_devices && n_gpus < N_USED_GPUS; id++) + { + hipDeviceProp_t p; + CHECK (hipGetDeviceProperties (&p, id)); + if (p.cooperativeMultiDeviceLaunch) + { + selected[n_gpus] = id; + n_gpus++; + } + } + if (n_gpus < N_USED_GPUS) + { + printf ("Fewer than %d devices in 0..%d support cooperative" + " multi-device launch, skipping.\n", + N_USED_GPUS, n_devices - 1); + return 0; + } + + /* n-gpus-final line. */ + int *data_d[N_USED_GPUS] = {}; + hipStream_t stream[N_USED_GPUS] = {}; + + /* The shared result buffer is host-coherent so every device can + read/write it without explicit memcpys. Layout: + result[0] -- final total (written by grid 0 only) + result[1 .. n_gpus] -- per-grid partial sums. */ + long *result_h = nullptr; + + for (int i = 0; i < n_gpus; i++) + { + CHECK (hipSetDevice (selected[i])); + CHECK (hipMalloc ((void **) &data_d[i], N_PER_DEVICE * sizeof (int))); + CHECK (hipMemset (data_d[i], 0, N_PER_DEVICE * sizeof (int))); + CHECK (hipStreamCreate (&stream[i])); + } + + CHECK (hipSetDevice (selected[0])); + CHECK (hipHostMalloc ((void **) &result_h, (n_gpus + 1) * sizeof (long), + hipHostMallocCoherent)); + for (int i = 0; i < n_gpus + 1; i++) + result_h[i] = 0; + + /* Build per-device launch parameters. Each device gets the same + grid/block dims and a pointer to *its own* data buffer, but the + result pointer is the single shared host-coherent buffer. */ + hipLaunchParams launch_params[N_USED_GPUS]; + void *args[N_USED_GPUS][3]; + unsigned int n_elements = N_PER_DEVICE; + + for (int i = 0; i < n_gpus; i++) + { + args[i][0] = (void *) &data_d[i]; + args[i][1] = (void *) &n_elements; + args[i][2] = (void *) &result_h; + + launch_params[i].func + = reinterpret_cast (coop_multi_grid_sync_kernel); + launch_params[i].gridDim = dim3 (num_groups, 1, 1); + launch_params[i].blockDim = dim3 (group_size, 1, 1); + launch_params[i].sharedMem = 0; + launch_params[i].stream = stream[i]; + launch_params[i].args = args[i]; + } + + CHECK (hipLaunchCooperativeKernelMultiDevice (launch_params, n_gpus, 0)); + + for (int i = 0; i < n_gpus; i++) + { + CHECK (hipSetDevice (selected[i])); + CHECK (hipDeviceSynchronize ()); + } + + long total = result_h[0]; + + for (int i = 0; i < n_gpus; i++) + { + CHECK (hipSetDevice (selected[i])); + CHECK (hipFree (data_d[i])); + CHECK (hipStreamDestroy (stream[i])); + } + CHECK (hipSetDevice (selected[0])); + CHECK (hipHostFree (result_h)); + + /* Compute the expected total: for grid g (0-based), the thread that + writes slot i stores (g + 1) * (i + 1). Per grid, the sum is + (g + 1) * N_PER_DEVICE * (N_PER_DEVICE + 1) / 2. */ + long per_grid = (long) N_PER_DEVICE * (N_PER_DEVICE + 1) / 2; + long expected = 0; + for (int g = 0; g < n_gpus; g++) + expected += (long) (g + 1) * per_grid; + + if (total != expected) + { + fprintf (stderr, "FAILED: total %ld, expected %ld\n", total, expected); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp new file mode 100644 index 00000000000..53111e9da62 --- /dev/null +++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp @@ -0,0 +1,147 @@ +# Copyright (C) 2026 Free Software Foundation, Inc. +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + +# This file is part of GDB. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Tests for debugging a multi-device cooperative kernel that uses +# cooperative_groups::this_grid ().sync () within each device and +# cooperative_groups::this_multi_grid ().sync () across devices. Both +# barriers are implemented on AMD GPUs using the GWS hardware +# mechanism. +# +# The test verifies that GDB can: +# - read the number of participating GPUs from the inferior, +# - hit a kernel breakpoint inside the cooperative dispatch on +# *every* participating GPU (matched via the per-device +# "Breakpoint ." subscript that hipLaunchCooperative +# KernelMultiDevice produces), and +# - resume the cross-device dispatch through both the intra-device +# this_grid ().sync () and the cross-device this_multi_grid ().sync () +# to normal program completion. + +load_lib rocm.exp + +require allow_hipcc_tests +require hip_devices_support_debug_multi_process +require supports_cooperative_groups + +standard_testfile .cpp + +if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} { + return +} + +# Test: hit a breakpoint inside the cooperative kernel on every +# participating GPU, then resume the dispatch through both barriers +# and to program completion. + +proc_with_prefix test_break_in_multi_coop_kernel {} { + # Multi-device cooperative dispatches involve waves on more than + # one device making forward progress before the multi-grid sync + # releases. Run the inferior in non-stop mode so the debugger can + # observe per-device stops independently. + save_vars { ::GDBFLAGS } { + append ::GDBFLAGS " -ex \"set non-stop on\"" + clean_restart $::testfile + } + + with_rocm_gpu_lock { + if {![runto_main]} { + return + } + + # Advance to the point where the inferior has finalized "n_gpus" + # (the count of GPUs it will actually use for the cooperative + # dispatch) so we can read it from gdb. The require gates above + # cover architecture-level support; per-device runtime gating + # (>= 2 devices that advertise cooperativeMultiDeviceLaunch) is + # done inside main () in the .cpp, which self-skips with a clean + # exit when those preconditions are not met. Treat that clean + # exit as UNSUPPORTED rather than a FAIL. + gdb_breakpoint \ + [gdb_get_line_number "n-gpus-final line"] allow-pending + + set eligible 0 + gdb_test_multiple "continue" "advance to n-gpus-final" { + -re "hit Breakpoint $::decimal,\[^\r\n\]*main \\(\\)\[^\r\n\]*\r\n\[^\r\n\]*\r\n$::gdb_prompt " { + set eligible 1 + pass $gdb_test_name + } + -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " { + unsupported "multi-device cooperative launch not available" + } + } + if { !$eligible } { + return + } + + set n_gpus [get_integer_valueof "n_gpus" 0] + gdb_assert {$n_gpus >= 2} "n_gpus is at least 2" + if { $n_gpus < 2 } { + return + } + + # Replace the marker breakpoint with the kernel breakpoint and + # run all threads in the background so the host can finish + # launching kernels on every device. + delete_breakpoints + gdb_breakpoint \ + [gdb_get_line_number "before-grid-sync line"] allow-pending + + gdb_test_multiple "continue -a &" "background continue" { + -re "Continuing\\.\r\n$::gdb_prompt " { + pass $gdb_test_name + } + } + + # In non-stop mode, hipLaunchCooperativeKernelMultiDevice + # produces one child breakpoint instance per participating GPU + # ("Breakpoint ."). Collect distinct values + # until we have observed a stop on every GPU; only then is it + # safe to delete the breakpoint and let the dispatch run + # through both grid syncs. + array unset gpu_seen + set distinct_gpus 0 + gdb_test_multiple "" "breakpoint hit on every participating GPU" { + -re "\r\nThread $::decimal\[^\r\n\]* hit Breakpoint $::decimal\\.($::decimal),\[^\r\n\]*\r\n" { + set inst $expect_out(1,string) + if {![info exists gpu_seen($inst)]} { + set gpu_seen($inst) 1 + incr distinct_gpus + } + if {$distinct_gpus >= $n_gpus} { + pass $gdb_test_name + } else { + exp_continue + } + } + } + + # All participating GPUs have been observed stopping at the + # kernel breakpoint. Remove the breakpoint and let the + # dispatch run through this_grid ().sync () and + # this_multi_grid ().sync () to clean program exit. + delete_breakpoints + + gdb_test_multiple "continue -a" "continue all to end" { + -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " { + pass $gdb_test_name + } + } + } +} + +test_break_in_multi_coop_kernel diff --git a/gdb/testsuite/lib/rocm.exp b/gdb/testsuite/lib/rocm.exp index c937f92e24e..44273e40832 100644 --- a/gdb/testsuite/lib/rocm.exp +++ b/gdb/testsuite/lib/rocm.exp @@ -412,6 +412,46 @@ proc hip_devices_support_debug_multi_process {} { return 1 } +# Helpers below gate cooperative-group debugging tests, i.e. tests that +# debug kernels launched with hipLaunchCooperativeKernel / +# hipLaunchCooperativeKernelMultiDevice and synchronized via +# cooperative_groups::this_grid ().sync () / this_multi_grid ().sync (). +# +# This is a debugger-side gate, distinct from what the HIP runtime +# advertises via hipDeviceProp_t::cooperativeLaunch / +# cooperativeMultiDeviceLaunch: the Global Wave Sync (GWS) hardware +# implementing those barriers exists on architectures where the kernel +# runs correctly outside the debugger but rocgdb cannot yet step or +# breakpoint through the cooperative dispatch. + +# Return true if TARGET (a gfx name like "gfx90a") is an architecture +# where rocgdb supports debugging cooperative-group kernels. + +proc target_supports_cooperative_groups { target } { + set unsupported_targets { + gfx1100 gfx1101 gfx1102 gfx1103 + } + + return [expr {[lsearch -exact $unsupported_targets $target] == -1}] +} + +# Return true if every AMDGPU device on the system is an architecture +# where rocgdb supports debugging cooperative-group kernels. + +proc supports_cooperative_groups {} { + set targets [find_amdgpu_devices] + if { [llength $targets] == 0 } { + return 0 + } + + foreach target $targets { + if { ![target_supports_cooperative_groups $target] } { + return 0 + } + } + return 1 +} + # Return true if the current device's version is less than VERSION. # # VERSION must be the "gfx" name of the device, such as gfx906 or gfx90a. From a00cfb32cfcb9088fe05caa7e6a3f305a8075545 Mon Sep 17 00:00:00 2001 From: Sarang Patrange Date: Tue, 9 Jun 2026 05:23:55 -0500 Subject: [PATCH 2/2] gdb, testsuite: improve gdb.rocm cooperative group GWS tests Refine the cooperative-group GWS tests for robustness and to make the debugger behaviour under test more explicit: * coop-group-grid-sync.exp: use temporary breakpoints for the in-kernel locations (still pending, since the device code is loaded at dispatch time) and drop the redundant delete_breakpoints calls. * coop-group-grid-sync.exp: report UNSUPPORTED instead of FAIL when the inferior self-skips because no device advertises cooperativeLaunch. * coop-group-grid-sync.exp: have test_threads_in_coop_kernel check distinct per-wave blockIdx.x (per-wave register context) and per-lane threadIdx.x divergence (per-lane SIMT state), instead of only confirming that the backtrace names the kernel. * coop-group-multi-grid-sync.{cpp,exp}: read n_gpus from a marker line that is reached on every execution (no early return before it), so that when fewer than two cooperative-capable GPUs are available -- including when a parallel test run restricts the visible GPUs -- the test reports UNSUPPORTED rather than FAILing. * coop-group-{grid,multi-grid}-sync.{cpp,exp}: minor comment and GNU-style cleanups -- tab-align the in-kernel marker comment, keep hipLaunchCooperativeKernelMultiDevice on a single line, and clarify the Phase 2 data-dependency comment. * coop-group-{grid,multi-grid}-sync.{cpp,exp}: give the per-wave and per-lane value reads explicit, unique test names, and keep the "n-gpus-final" marker string unique so gdb_get_line_number resolves the intended line. Tested on gfx942 with an in-tree build and the 7.14 nightly rocgdb, both with all GPUs visible and with the visible set restricted to one. --- .../gdb.rocm/coop-group-grid-sync.cpp | 5 +- .../gdb.rocm/coop-group-grid-sync.exp | 87 +++++++++++++------ .../gdb.rocm/coop-group-multi-grid-sync.cpp | 20 ++--- .../gdb.rocm/coop-group-multi-grid-sync.exp | 50 ++++------- 4 files changed, 90 insertions(+), 72 deletions(-) diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp index 14d395e1fd8..911f4385a6a 100644 --- a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp +++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp @@ -40,8 +40,9 @@ namespace cg = cooperative_groups; in the GWS barrier. Two workgroups of 64 threads each give 128 threads in total. - N is exactly one slot per thread so Phase 2 has no intra-thread race - and the host-side expected values can be computed straightforwardly. */ + N is exactly one slot per thread so Phase 2 has no inter-thread + write conflict and the host-side expected values can be computed + straightforwardly. */ constexpr unsigned int group_size = 64; constexpr unsigned int num_groups = 2; diff --git a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp index b2ef18ccd36..2d18f94e0fc 100644 --- a/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp +++ b/gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp @@ -55,10 +55,22 @@ proc_with_prefix test_break_around_grid_sync {} { # Place a breakpoint before grid.sync (). Use "allow-pending" # because the kernel is loaded as device code at dispatch time. gdb_breakpoint \ - [gdb_get_line_number "before-sync line"] allow-pending - - # The kernel must hit the pre-sync breakpoint. - if {[gdb_continue_to_breakpoint "stop before grid.sync"] != 0} { + [gdb_get_line_number "before-sync line"] allow-pending temporary + + # Continue into the kernel. If the program self-skips (no device + # advertises cooperativeLaunch) it exits cleanly; treat that as + # UNSUPPORTED rather than a FAIL. + set reached 0 + gdb_test_multiple "continue" "stop before grid.sync" { + -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" { + set reached 1 + pass $gdb_test_name + } + -re -wrap "$::inferior_exited_re normally.*" { + unsupported "cooperative launch not available" + } + } + if { !$reached } { return } @@ -90,27 +102,25 @@ proc_with_prefix test_break_around_grid_sync {} { ".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \ "info dispatches lists cooperative dispatch" - # Replace the pre-sync breakpoint with one after grid.sync (). - # The after-sync breakpoint fires once all waves have crossed - # the GWS barrier, which proves GWS-protected code can be - # debugged across the barrier. - delete_breakpoints + # Place a breakpoint after grid.sync (). It fires once all waves + # have crossed the GWS barrier, which proves GWS-protected code + # can be debugged across the barrier. gdb_breakpoint \ - [gdb_get_line_number "after-sync line"] allow-pending + [gdb_get_line_number "after-sync line"] allow-pending temporary gdb_continue_to_breakpoint "stop after grid.sync" # The kernel should be able to run to completion afterwards. - delete_breakpoints gdb_continue_to_end "continue to program end" "continue" 1 } } -# Test: place a breakpoint inside the cooperative kernel, then iterate -# over every stopped GPU wave and verify that each one has a backtrace -# pointing into coop_grid_sync_kernel. This exercises the wave -# organization a debugger sees when several waves participate in a GWS -# barrier. +# Test: stop inside the cooperative kernel with several co-resident +# waves parked at the barrier and check that gdb reports correct, +# distinct per-wave and per-lane state. This is the debugger +# behaviour of interest: that gdb selects the right register context +# per wave and the right SIMT state per lane, not merely that the +# waves are in the kernel. proc_with_prefix test_threads_in_coop_kernel {} { clean_restart $::testfile @@ -121,29 +131,56 @@ proc_with_prefix test_threads_in_coop_kernel {} { } gdb_breakpoint \ - [gdb_get_line_number "before-sync line"] allow-pending - - if {[gdb_continue_to_breakpoint "stop in cooperative kernel"] != 0} { + [gdb_get_line_number "before-sync line"] allow-pending temporary + + # Continue into the kernel. If the program self-skips (no device + # advertises cooperativeLaunch) it exits cleanly; treat that as + # UNSUPPORTED rather than a FAIL. + set reached 0 + gdb_test_multiple "continue" "stop in cooperative kernel" { + -re -wrap "Temporary breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel .*" { + set reached 1 + pass $gdb_test_name + } + -re -wrap "$::inferior_exited_re normally.*" { + unsupported "cooperative launch not available" + } + } + if { !$reached } { return } - # Collect the list of AMDGPU waves and check that every one of - # them has a backtrace pointing into coop_grid_sync_kernel. set waves [info_thread_get_wave_list] gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed" + # Per-wave register context: switching to each wave and reading + # blockIdx.x must yield that wave's owning workgroup, and across + # all waves we must observe more than one distinct workgroup. + set blocks {} foreach wave $waves { with_test_prefix "wave $wave" { gdb_test "thread $wave" "Switching to thread $wave.*" \ "switch to wave $wave" - gdb_test "bt 1" \ - "#0\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*" \ - "backtrace inside coop_grid_sync_kernel" + set bidx [get_integer_valueof "blockIdx.x" -1 "blockIdx.x"] + if {[lsearch -exact $blocks $bidx] == -1} { + lappend blocks $bidx + } } } + gdb_assert {[llength $blocks] >= 2} \ + "distinct workgroups observed across waves" + + # Per-lane (SIMT) state: within a single wave, different lanes map + # to different threads, so threadIdx.x must differ between lanes. + gdb_test "thread [lindex $waves 0]" "Switching to thread .*" \ + "switch to first wave for lane inspection" + gdb_test "lane 0" ".*" "switch to lane 0" + set tid0 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 0"] + gdb_test "lane 1" ".*" "switch to lane 1" + set tid1 [get_integer_valueof "threadIdx.x" -1 "threadIdx.x at lane 1"] + gdb_assert {$tid0 != $tid1} "distinct threadIdx.x across lanes" # Resume to completion. - delete_breakpoints gdb_continue_to_end "continue to program end" "continue" 1 } } diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp index b392de257d5..e29b3da8d56 100644 --- a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp +++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.cpp @@ -62,7 +62,7 @@ coop_multi_grid_sync_kernel (int *data, unsigned int n_elements, /* Phase 1: each thread writes data into its slot. */ for (unsigned int i = tid; i < n_elements; i += stride) - data[i] = (int) ((grid_rank + 1) * (i + 1)); /* before-grid-sync line */ + data[i] = (int) ((grid_rank + 1) * (i + 1)); /* before-grid-sync line */ /* Intra-device grid sync (GWS). */ grid.sync (); /* grid-sync line */ @@ -97,17 +97,14 @@ main () { int n_devices = 0; CHECK (hipGetDeviceCount (&n_devices)); - if (n_devices < N_USED_GPUS) - { - printf ("Multi-device cooperative test needs >= %d GPUs" - " (found %d), skipping.\n", N_USED_GPUS, n_devices); - return 0; - } /* Pick the first N_USED_GPUS devices that support cooperativeMultiDeviceLaunch. In a mixed-architecture system not every GPU has GWS support, so we ignore unsupported ones rather - than failing the whole test. */ + than failing the whole test. This loop runs unconditionally (it + is bounded by n_devices) so the marker below is always reached, + even when too few devices are available; the companion .exp reads + n_gpus there to decide whether to skip. */ int selected[N_USED_GPUS]; int n_gpus = 0; for (int id = 0; id < n_devices && n_gpus < N_USED_GPUS; id++) @@ -120,15 +117,14 @@ main () n_gpus++; } } - if (n_gpus < N_USED_GPUS) + if (n_gpus < N_USED_GPUS) /* n-gpus-final line */ { - printf ("Fewer than %d devices in 0..%d support cooperative" + printf ("Fewer than %d of the %d HIP device(s) support cooperative" " multi-device launch, skipping.\n", - N_USED_GPUS, n_devices - 1); + N_USED_GPUS, n_devices); return 0; } - /* n-gpus-final line. */ int *data_d[N_USED_GPUS] = {}; hipStream_t stream[N_USED_GPUS] = {}; diff --git a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp index 53111e9da62..f192ddc0d28 100644 --- a/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp +++ b/gdb/testsuite/gdb.rocm/coop-group-multi-grid-sync.exp @@ -26,8 +26,8 @@ # - read the number of participating GPUs from the inferior, # - hit a kernel breakpoint inside the cooperative dispatch on # *every* participating GPU (matched via the per-device -# "Breakpoint ." subscript that hipLaunchCooperative -# KernelMultiDevice produces), and +# "Breakpoint ." subscript that +# hipLaunchCooperativeKernelMultiDevice produces), and # - resume the cross-device dispatch through both the intra-device # this_grid ().sync () and the cross-device this_multi_grid ().sync () # to normal program completion. @@ -63,34 +63,19 @@ proc_with_prefix test_break_in_multi_coop_kernel {} { return } - # Advance to the point where the inferior has finalized "n_gpus" - # (the count of GPUs it will actually use for the cooperative - # dispatch) so we can read it from gdb. The require gates above - # cover architecture-level support; per-device runtime gating - # (>= 2 devices that advertise cooperativeMultiDeviceLaunch) is - # done inside main () in the .cpp, which self-skips with a clean - # exit when those preconditions are not met. Treat that clean - # exit as UNSUPPORTED rather than a FAIL. - gdb_breakpoint \ - [gdb_get_line_number "n-gpus-final line"] allow-pending - - set eligible 0 - gdb_test_multiple "continue" "advance to n-gpus-final" { - -re "hit Breakpoint $::decimal,\[^\r\n\]*main \\(\\)\[^\r\n\]*\r\n\[^\r\n\]*\r\n$::gdb_prompt " { - set eligible 1 - pass $gdb_test_name - } - -re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " { - unsupported "multi-device cooperative launch not available" - } - } - if { !$eligible } { - return - } + # Break where the inferior has finalized "n_gpus" (the count of + # GPUs it will use for the cooperative dispatch). This line runs + # on every execution, before the inferior's own eligibility + # check, so reading "n_gpus" from gdb lets us decide here whether + # to proceed. The require gates above cover architecture-level + # support; per-device runtime gating (>= 2 devices that advertise + # cooperativeMultiDeviceLaunch) can only be known at this point. + gdb_breakpoint [gdb_get_line_number "n-gpus-final line"] temporary + gdb_continue_to_breakpoint "advance to n-gpus-final" set n_gpus [get_integer_valueof "n_gpus" 0] - gdb_assert {$n_gpus >= 2} "n_gpus is at least 2" if { $n_gpus < 2 } { + unsupported "fewer than two cooperative-capable GPUs" return } @@ -107,12 +92,11 @@ proc_with_prefix test_break_in_multi_coop_kernel {} { } } - # In non-stop mode, hipLaunchCooperativeKernelMultiDevice - # produces one child breakpoint instance per participating GPU - # ("Breakpoint ."). Collect distinct values - # until we have observed a stop on every GPU; only then is it - # safe to delete the breakpoint and let the dispatch run - # through both grid syncs. + # Check that gdb resolves the single source breakpoint to one + # child location per participating GPU ("Breakpoint .") + # and, in non-stop mode, reports each device-side stop + # independently. Collect distinct values until a stop has + # been seen for every GPU. array unset gpu_seen set distinct_gpus 0 gdb_test_multiple "" "breakpoint hit on every participating GPU" {