From 2256ee2f608a879c69aa4e4c8c3733c8b0e1d7ec Mon Sep 17 00:00:00 2001
From: Lawrence Zhang <lawzhang@meta.com>
Date: Fri, 12 Jun 2026 15:06:02 -0700
Subject: [PATCH] Use 256-thread blocks for jagged dense-output kernel (#5848)

Summary:

X-link: https://github.com/facebookresearch/FBGEMM/pull/2766

Initial ROCm profiler thread tracing shows poor utilization for jagged_1d_to_dense.
Previously, the launch used only 16 threads per block when D=1 (the 1d case), which is just 25% utilization of a wavefront.


For 2d (large D) inputs it seems OK to consistently use 256 threads per block. An alternative would be to code a special path for just the D=1 (1d) case; open to suggestions. In general, this shifts the large-D cases from 1024-thread blocks to 256-thread blocks.

These changes impact jagged_1d_to_dense, jagged_2d_to_dense, and jagged_to_padded_dense_forward. The focus is mostly on jagged_1d_to_dense.

Differential Revision: D107571746
---
 fbgemm_gpu/src/jagged_tensor_ops/common.cuh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh
index 33938bec62..bb9742bfbd 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh
+++ b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh
@@ -224,7 +224,13 @@ inline std::tuple<dim3, dim3, StackArray<int64_t>> check_shape_and_partition_(
 
   const int threads_x =
       inner_dense_size >= kWarpSize / 2 ? kWarpSize : inner_dense_size;
+#ifndef USE_ROCM
   const int threads_y = kMaxThreads / kWarpSize;
+#else
+  // AMD: ~256-thread blocks improve wavefront packing for common D=1 shapes.
+  constexpr int kTargetBlockThreads = 256;
+  const int threads_y = kTargetBlockThreads / threads_x;
+#endif
   const dim3 blocks(
       div_round_up(outer_dense_size * jagged_folded_size, threads_y));