From d067d070678b766f6479f5436972f7ce897060a0 Mon Sep 17 00:00:00 2001
From: Vincenzo Ingrosso <vincenzo@ingrosso.net>
Date: Sun, 28 Jun 2026 17:24:34 +0200
Subject: [PATCH] CUDA: scale q8->f16 cache reserve on >=112 GiB cards

cuda_q8_f16_cache_reserve_bytes() returned a flat 512 MiB reserve once
total VRAM was >= 112 GiB, instead of the 5% / 4 GiB-min rule used below
that size. The q8->f16 dequant cache is filled eagerly and consumes HBM
down to the reserve, so with a large model the session/context graph, which
is allocated after model load, OOMs at session creation even though the
weights themselves fit. WEIGHT_CACHE_LIMIT_GB does not bound this cache, and
loading an MTP model disables the cache, which hides the issue.

Drop the >=112 GiB special case so that every card uses the 5% / 4 GiB-min
rule. This is the CUDA twin of #446 (same bug on the ROCm runtime, q4q2).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01AQVgY7rXrksjtBjPFSCnMH
---
 ds4_cuda.cu | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 188b341ad..61f0fb651 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -514,14 +514,16 @@ static uint64_t cuda_q8_f16_cache_reserve_bytes(uint64_t total_bytes) {
     const uint64_t reserve = cuda_parse_mib_env("DS4_CUDA_Q8_F16_CACHE_RESERVE_MB", &present);
     if (present) return reserve;
 
-    if (total_bytes >= 112ull * 1024ull * 1024ull * 1024ull) {
-        return 512ull * 1048576ull;
-    }
-
     /* The expanded Q8->F16 cache is only an acceleration path.  Keep enough
-     * device memory free for cuBLAS workspaces, transient graph buffers, and
-     * driver bookkeeping instead of letting optional cached weights consume the
-     * last few GiB on 96 GiB cards. */
+     * device memory free for the session/context tensors, cuBLAS workspaces, and
+     * transient graph buffers allocated after model load, instead of letting
+     * optional cached weights consume the last few GiB.
+     *
+     * Do NOT shrink this to a sub-GiB reserve on large (>=112 GiB) cards: a tiny
+     * reserve lets the eager preload fill device memory down to a few hundred MiB
+     * and OOM at session creation on big models (e.g. a ~252 GB GLM-5.2 split
+     * across 2x H200 NVL). Loading an MTP model disables this cache and hides it.
+     * This mirrors PR #446 (same bug on the ROCm runtime, q4q2). */
     const uint64_t min_reserve = 4096ull * 1048576ull;
     const uint64_t pct_reserve = total_bytes / 20u; /* 5% */
     return pct_reserve > min_reserve ? pct_reserve : min_reserve;