diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 188b341ad..61f0fb651 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -514,14 +514,16 @@ static uint64_t cuda_q8_f16_cache_reserve_bytes(uint64_t total_bytes) { const uint64_t reserve = cuda_parse_mib_env("DS4_CUDA_Q8_F16_CACHE_RESERVE_MB", &present); if (present) return reserve; - if (total_bytes >= 112ull * 1024ull * 1024ull * 1024ull) { - return 512ull * 1048576ull; - } - /* The expanded Q8->F16 cache is only an acceleration path. Keep enough - * device memory free for cuBLAS workspaces, transient graph buffers, and - * driver bookkeeping instead of letting optional cached weights consume the - * last few GiB on 96 GiB cards. */ + * device memory free for the session/context tensors, cuBLAS workspaces, and + * transient graph buffers allocated after model load, instead of letting + * optional cached weights consume the last few GiB. + * + * Do NOT shrink this to a sub-GiB reserve on large (>=112 GiB) cards: a tiny + * reserve lets the eager preload fill device memory down to a few hundred MiB + * and OOM at session creation on big models (e.g. a ~252 GB GLM-5.2 split + * across 2x H200 NVL). Loading an MTP model disables this cache and hides the bug. + * This mirrors PR #446 (same bug on the ROCm runtime, q4q2). */ const uint64_t min_reserve = 4096ull * 1048576ull; const uint64_t pct_reserve = total_bytes / 20u; /* 5% */ return pct_reserve > min_reserve ? pct_reserve : min_reserve;