From 2256ee2f608a879c69aa4e4c8c3733c8b0e1d7ec Mon Sep 17 00:00:00 2001 From: Lawrence Zhang Date: Fri, 12 Jun 2026 15:06:02 -0700 Subject: [PATCH] Use 256-thread blocks for jagged dense-output kernel (#5848) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2766 Initial ROCm profiler thread tracing shows pretty poor utilization for jagged_1d_to_dense. The previous configuration used only 16 threads, which is only 25% utilization of a wavefront when D=1 (the 1D case). For 2D (large D) shapes it seems OK to just consistently use 256. An alternative would be to code a special path for just D=1 (the 1D case). Open to suggestions. Generally this will shift these kernels from using 1024 threads per block to 256. These changes impact jagged_1d_to_dense, jagged_2d_to_dense, and jagged_to_padded_dense_forward. They are mostly focused on jagged_1d_to_dense. Differential Revision: D107571746 --- fbgemm_gpu/src/jagged_tensor_ops/common.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh index 33938bec62..bb9742bfbd 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/common.cuh +++ b/fbgemm_gpu/src/jagged_tensor_ops/common.cuh @@ -224,7 +224,13 @@ inline std::tuple> check_shape_and_partition_( const int threads_x = inner_dense_size >= kWarpSize / 2 ? kWarpSize : inner_dense_size; +#ifndef USE_ROCM const int threads_y = kMaxThreads / kWarpSize; +#else + // AMD: ~256-thread blocks improve wavefront packing for common D=1 shapes. + constexpr int kTargetBlockThreads = 256; + const int threads_y = kTargetBlockThreads / threads_x; +#endif const dim3 blocks( div_round_up(outer_dense_size * jagged_folded_size, threads_y));