[AMDGPU] Remove unnecessary and broken sign/zero-extension by LU-JOHN · Pull Request #203436 · llvm/llvm-project

LU-JOHN · 2026-06-12T02:31:11Z

When expanding div/rem by using floating-point operations, sign/zero-extending the result from the calculated DivBits input width to 32-bits is unnecessary. CreateFPToSI or CreateFPToUI is called with a 32-bit int type so the conversion instruction will already produce a result with the desired width.

Also it is incorrect. For signed-division DIVBITS_MAX_NEG/-1, the result should be -DIVBITS_MAX_NEG a positive value. Sign-extension will incorrectly return a negative result. For example, for DivBits=4, -8/-1 = 8, but adding code to do a 28-bit sign-extension will incorrectly return -8.

Tested in llvm/llvm-test-suite#423.

llvmorg-github-actions · 2026-06-12T02:31:45Z

@llvm/pr-subscribers-backend-amdgpu

Author: LU-JOHN

Changes

When expanding div/rem by using floating-point operations, sign/zero-extending the result to the calculated DivBits input width is unnecessary. CreateFPToSI or CreateFPToUI already sign/zero-extend to 32-bits.

Also it is incorrect. For signed-division DIVBITS_MAX_NEG/-1, the result should be -DIVBITS_MAX_NEG a positive value. Sign-extension will incorrectly return a negative result. For example, for DivBits=4, -8/-1 = 8, but adding code to do a 28-bit sign-extension will incorrectly return -8.

Patch is 166.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/203436.diff

6 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+1-15)
(modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+648-529)
(modified) llvm/test/CodeGen/AMDGPU/divrem24-assume.ll (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/sdivrem24.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+36-46)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 394c2326a64ef..0d32911dcb67e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1085,7 +1085,7 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloatImpl(
   // Thus, we conservatively restrict expandDivRemToFloatImpl to
   // [-0x40000,0x3FFFFF] for IsSigned
   // [0x000000,0x3FFFFF] for !IsSigned.
-  assert(DivBits <= (IsSigned ? 23 : 22) &&
+  assert(0 < DivBits && DivBits <= (IsSigned ? 23 : 22) &&
          "abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "
          "correctly");
 
@@ -1168,20 +1168,6 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloatImpl(
     Res = Builder.CreateSub(Num, Rem);
   }
 
-  if (DivBits != 0 && DivBits < 32) {
-    // Extend in register from the number of bits this divide really is.
-    if (IsSigned) {
-      int InRegBits = 32 - DivBits;
-
-      Res = Builder.CreateShl(Res, InRegBits);
-      Res = Builder.CreateAShr(Res, InRegBits);
-    } else {
-      ConstantInt *TruncMask
-        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
-      Res = Builder.CreateAnd(Res, TruncMask);
-    }
-  }
-
   return Res;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 322d27934cc6e..d83bb728e47f9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1,11 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
 
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
-; CHECK-LABEL: @udiv_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = uitofp fast i32 [[Y:%.*]] to float
+; CHECK-LABEL: define amdgpu_kernel void @udiv_i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp fast i32 [[Y]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], f0x4F7FFFFE
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
@@ -18,7 +19,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X]] to i64
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
@@ -34,7 +35,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
-; CHECK-NEXT:    store i32 [[TMP29]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP29]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i32:
@@ -68,7 +69,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -103,8 +103,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
-; CHECK-LABEL: @urem_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = uitofp fast i32 [[Y:%.*]] to float
+; CHECK-LABEL: define amdgpu_kernel void @urem_i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp fast i32 [[Y]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], f0x4F7FFFFE
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
@@ -117,7 +118,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X]] to i64
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
@@ -131,7 +132,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
-; CHECK-NEXT:    store i32 [[TMP27]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP27]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i32:
@@ -162,7 +163,6 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -195,9 +195,10 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
-; CHECK-LABEL: @sdiv_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
+; CHECK-LABEL: define amdgpu_kernel void @sdiv_i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X]], 31
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
@@ -234,7 +235,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
-; CHECK-NEXT:    store i32 [[TMP38]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP38]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i32:
@@ -274,7 +275,6 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -315,9 +315,10 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
-; CHECK-LABEL: @srem_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
+; CHECK-LABEL: define amdgpu_kernel void @srem_i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X]], 31
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
@@ -351,7 +352,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[TMP35]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP35]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32:
@@ -387,7 +388,6 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -425,9 +425,10 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
-; CHECK-LABEL: @udiv_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @udiv_i16(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = uitofp fast i32 [[TMP1]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = uitofp fast i32 [[TMP2]] to float
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
@@ -441,9 +442,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
-; CHECK-NEXT:    store i16 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP15]] to i16
+; CHECK-NEXT:    store i16 [[TMP17]], ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i16:
@@ -467,7 +467,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x2c
@@ -494,9 +493,10 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 }
 
 define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
-; CHECK-LABEL: @urem_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @urem_i16(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = uitofp fast i32 [[TMP1]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = uitofp fast i32 [[TMP2]] to float
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
@@ -512,9 +512,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
-; CHECK-NEXT:    store i16 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP17]] to i16
+; CHECK-NEXT:    store i16 [[TMP19]], ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i16:
@@ -540,7 +539,6 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
@@ -569,9 +567,10 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 }
 
 define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
-; CHECK-LABEL: @sdiv_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @sdiv_i16(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
@@ -588,10 +587,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
-; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
-; CHECK-NEXT:    store i16 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP18]] to i16
+; CHECK-NEXT:    store i16 [[TMP21]], ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i16:
@@ -619,7 +616,6 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
@@ -650,9 +646,10 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 }
 
 define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
-; CHECK-LABEL: @srem_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @srem_i16(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
@@ -671,10 +668,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
-; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
-; CHECK-NEXT:    store i16 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP20]] to i16
+; CHECK-NEXT:    store i16 [[TMP23]], ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i16:
@@ -704,7 +699,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
@@ -737,9 +731,10 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 }
 
 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
-; CHECK-LABEL: @udiv_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @udiv_i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i8 [[X:%.*]], i8 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = uitofp fast i32 [[TMP1]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = uitofp fast i32 [[TMP2]] to float
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
@@ -753,9 +748,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
-; CHECK-NEXT:    store i8 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP15]] to i8
+; CHECK-NEXT:    store i8 [[TMP17]], ptr addrspace(1) [[OUT]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i8:
@@ -776,7 +770,6 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
@@ -800,9 +793,10 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 }
 
 define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
-; CHECK-LABEL: @urem_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @urem_i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i8 [[X:%.*]], i8 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = uitofp fast i32 [[TMP1]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = uitofp fast i32 [[TMP2]] to float
 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
@@ -818,9 +812,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
-; CHECK-NEXT:    store i8 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP17]] to i8
+; CHECK-NEXT:    store i8 [[TMP19]], ptr addrspace(1) [[OUT]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i8:
@@ -844,7 +837,6 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
@@ -871,9 +863,10 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 }
 
 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
-; CHECK-LABEL: @sdiv_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-LABEL: define amdgpu_kernel void @sdiv_i8...
[truncated]

Copilot

Pull request overview

This PR updates AMDGPU’s float-based div/rem expansion to stop applying an extra sign/zero-extension step to the computed quotient/remainder, since the FP-to-int conversions already produce 32-bit results and the extra extension can be incorrect for some narrowed signed ranges. The change is accompanied by test updates to reflect the new generated IR/GCN patterns.

Changes:

Remove the post-processing sign/zero-extension (shift/ashr or and-mask) in expandDivRemToFloatImpl.
Tighten the internal precondition check for DivBits and update several AMDGPU CodeGen tests accordingly.
Refresh/adjust FileCheck expectations across multiple div/rem test cases (including CodeGenPrepare output).

Reviewed changes

Copilot reviewed 5 out of 6 changed files in this pull request and generated 1 comment.

Show a summary per file

File	Description
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp	Removes in-register sign/zero-extension after FP-based div/rem expansion; updates `DivBits` assertion.
llvm/test/CodeGen/AMDGPU/srem64.ll	Updates expected GCN/GCN-IR sequences for the 64-bit srem shrink/expansion path.
llvm/test/CodeGen/AMDGPU/sdivrem24.ll	Adjusts expected codegen checks to no longer expect a bitfield extract after div/rem expansion.
llvm/test/CodeGen/AMDGPU/sdiv64.ll	Updates expected codegen checks to drop a `v_bfe_i32` expectation after quotient adjustment.
llvm/test/CodeGen/AMDGPU/divrem24-assume.ll	Updates the opt/IR checks to reflect removal of a masking `and` before zext/GEP indexing.
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll	Regenerates/updates CodeGenPrepare IR checks and many FileCheck patterns to match the new lowering behavior.

+  assert(0 < DivBits && DivBits <= (IsSigned ? 23 : 22) &&
         "abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "
         "correctly");


mgcarrasco · 2026-06-12T11:09:33Z

By any chance, could we add a test for this new edge case in llvm-test-suite?

arsenm · 2026-06-12T11:31:03Z

CreateFPToSI or CreateFPToUI already sign/zero-extend to 32-bits.

This part is unclear. The instructions just use whatever type you give them

LU-JOHN · 2026-06-12T21:40:23Z

By any chance, could we add a test for this new edge case in llvm-test-suite?

Tested in llvm/llvm-test-suite#423.

LU-JOHN · 2026-06-12T21:56:19Z

CreateFPToSI or CreateFPToUI already sign/zero-extend to 32-bits.

This part is unclear. The instructions just use whatever type you give them

Updated explanation for clarity.

jayfoad · 2026-06-15T14:43:25Z

+; CHECK-LABEL: define amdgpu_kernel void @udiv_i32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp fast i32 [[Y]] to float


Are some of these diffs just from rerunning the update script? Can you commit that separately as an NFC?

Are some of these diffs just from rerunning the update script? Can you commit that separately as an NFC?

Updated opt checks with update_test_checks in #203926.

jayfoad · 2026-06-16T10:08:30Z

 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP15]] to i16


Most of these IR changes seem nice but harmless. Is there an example where the IR was actually wrong before, and your patch fixes it?

Most of these IR changes seem nice but harmless. Is there an example where the IR was actually wrong before, and your patch fixes it?

Pre-commit tests that currently generate wrong code in #204155.

Test results in kernel s_test_sdiv22_32 in sdiv.ll and s_test_sdiv22_64 in sdiv64.ll show results that were wrongly sign-extended with v_bfe_i32.

Signed-off-by: John Lu <John.Lu@amd.com>

jayfoad

LGTM

When expanding div/rem by using floating-point operations, sign/zero-extending the result from the calculated DivBits input width to 32-bits is unnecessary. CreateFPToSI or CreateFPToUI is called with a 32-bit int type so the conversion instruction will already produce a result with the desired width. Also it is incorrect. For signed-division `DIVBITS_MAX_NEG/-1`, the result should be `-DIVBITS_MAX_NEG` a positive value. Sign-extension will incorrectly return a negative result. For example, for DivBits=4, `-8/-1 = 8`, but adding code to do a 28-bit sign-extension will incorrectly return `-8`. Tested in llvm/llvm-test-suite#423. --------- Signed-off-by: John Lu <John.Lu@amd.com>

llvmorg-github-actions Bot added the backend:AMDGPU label Jun 12, 2026

LU-JOHN requested a review from Copilot June 12, 2026 02:31

Copilot started reviewing on behalf of LU-JOHN June 12, 2026 02:31 View session

LU-JOHN requested a review from mgcarrasco June 12, 2026 02:32

Copilot AI reviewed Jun 12, 2026

View reviewed changes

Comment thread llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Comment on lines +1088 to 1090

assert(0 < DivBits && DivBits <= (IsSigned ? 23 : 22) &&

"abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "

"correctly");

LU-JOHN mentioned this pull request Jun 12, 2026

[HIP] Test signed 23-bit division and remainder llvm/llvm-test-suite#423

Open

jayfoad reviewed Jun 15, 2026

View reviewed changes

LU-JOHN force-pushed the no_extension branch from 20e41ff to c328b76 Compare June 15, 2026 16:58

LU-JOHN requested a review from jayfoad June 15, 2026 17:22

jayfoad reviewed Jun 16, 2026

View reviewed changes

LU-JOHN added 2 commits June 16, 2026 09:22

Remove unnecessary/broken extension

ff616e6

Signed-off-by: John Lu <John.Lu@amd.com>

Update test results

195cbbf

Signed-off-by: John Lu <John.Lu@amd.com>

LU-JOHN force-pushed the no_extension branch from c328b76 to 195cbbf Compare June 16, 2026 14:35

LU-JOHN requested a review from jayfoad June 16, 2026 14:39

jayfoad approved these changes Jun 17, 2026

View reviewed changes

LU-JOHN merged commit 12d560d into llvm:main Jun 17, 2026
11 checks passed

Uh oh!

Conversation

LU-JOHN commented Jun 12, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmorg-github-actions Bot commented Jun 12, 2026

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull request overview

Reviewed changes

Uh oh!

mgcarrasco commented Jun 12, 2026

Uh oh!

arsenm commented Jun 12, 2026

Uh oh!

LU-JOHN commented Jun 12, 2026

Uh oh!

LU-JOHN commented Jun 12, 2026

Uh oh!

jayfoad Jun 15, 2026

Choose a reason for hiding this comment

Uh oh!

LU-JOHN Jun 15, 2026

Choose a reason for hiding this comment

Uh oh!

jayfoad Jun 16, 2026

Choose a reason for hiding this comment

Uh oh!

LU-JOHN Jun 16, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

LU-JOHN Jun 16, 2026

Choose a reason for hiding this comment

Uh oh!

jayfoad left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

LU-JOHN commented Jun 12, 2026 •

edited

Loading

LU-JOHN Jun 16, 2026 •

edited

Loading