From ad5c55c583b8f52952998cae62011d5103cff4cc Mon Sep 17 00:00:00 2001 From: cattivik66 Date: Fri, 26 Jun 2026 19:02:23 +0200 Subject: [PATCH] ROCm: discrete GPU memory management Add host-mapped expert cache fallback for VRAM-constrained discrete GPUs (e.g. AMD W7800 48GB). When the streaming expert cache is exhausted, copy the expert weights to pinned host memory and map them into the GPU address space via cudaHostRegister/cudaHostGetDevicePointer. Inference proceeds (slower) instead of hard-failing. Add automatic discrete/integrated GPU detection via cudaDeviceGetAttribute so the registered-weights path activates without manual configuration. DS4_ROCM_REGISTERED_WEIGHTS env var overrides detection for edge cases. Add three env var tunables: - DS4_ROCM_STREAM_FREE_RESERVE_GIB: override VRAM free-reserve floor (default 16 GiB). - DS4_CUDA_Q8_F16_CACHE_MB: limit the optional q8->f16 cache to free VRAM for routed-expert residency (e.g. DS4_CUDA_Q8_F16_CACHE_MB=0). - DS4_ROCM_HOST_MAPPED_EXPERT_FALLBACK: set to 0 to disable host-mapped expert overflow (default: enabled). Host-mapped fallback can be disabled with DS4_ROCM_HOST_MAPPED_EXPERT_FALLBACK=0. All defaults preserve upstream behavior. --- ds4_rocm.h | 1 + rocm/ds4_rocm_runtime.cuh | 150 +++++++++++++++++++++++++++++++++----- 2 files changed, 133 insertions(+), 18 deletions(-) diff --git a/ds4_rocm.h b/ds4_rocm.h index 50481b8d5..df5302e74 100644 --- a/ds4_rocm.h +++ b/ds4_rocm.h @@ -26,6 +26,7 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaDevAttrPageableMemoryAccess hipDeviceAttributePageableMemoryAccess #define cudaDevAttrMaxSharedMemoryPerBlockOptin hipDeviceAttributeSharedMemPerBlockOptin +#define cudaDevAttrIntegrated hipDeviceAttributeIntegrated #define cudaFuncAttributeMaxDynamicSharedMemorySize hipFuncAttributeMaxDynamicSharedMemorySize #define cudaFuncSetAttribute(func, attr, value) hipFuncSetAttribute((const void *)(func), (attr), (value)) #define cudaMemLocationTypeDevice hipMemLocationTypeDevice diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh index 3bd786f8e..9aa4f81d9 100644 --- a/rocm/ds4_rocm_runtime.cuh +++ b/rocm/ds4_rocm_runtime.cuh @@ -110,6 +110,8 @@ struct cuda_stream_resident_expert { char *down; uint64_t bytes; uint64_t last_used; + int host_mapped; /* discrete-VRAM fallback: base/host point at a host-registered, mapped buffer */ + void *host_base; /* host buffer passed to cudaHostRegister (free/unregister on evict) */ }; struct cuda_stream_resident_key { @@ -477,7 +479,14 @@ static void cuda_model_image_release_all(void) { static void cuda_stream_resident_cache_release(void) { for (cuda_stream_resident_expert &e : g_stream_resident_experts) { - if (e.base) (void)cudaFree(e.base); + if (e.host_mapped) { + if (e.host_base) { + (void)cudaHostUnregister(e.host_base); + free(e.host_base); + } + } else if (e.base) { + (void)cudaFree(e.base); + } } g_stream_resident_experts.clear(); g_stream_resident_index.clear(); @@ -970,6 +979,16 @@ static int cuda_stream_resident_evict_one( } static uint64_t cuda_stream_resident_free_reserve_bytes(void) { + /* Tunable for discrete GPUs with tight VRAM (e.g. 48 GiB W7800), where the + * upstream 16 GiB floor over-constrains the routed-expert cache and causes + * uncached experts to hard-fail instead of fitting. Default keeps upstream + * behavior. */ + const char *e = getenv("DS4_ROCM_STREAM_FREE_RESERVE_GIB"); + if (e && *e) { + char *end = NULL; + double g = strtod(e, &end); /* fractional GiB allowed, e.g. 0.5 */ + if (end != e && g >= 0) return (uint64_t)(g * 1024.0 * 1024.0 * 1024.0); + } return 16ull * 1024ull * 1024ull * 1024ull; } @@ -1021,33 +1040,82 @@ static int cuda_stream_resident_alloc( } bytes = gate_pair + down_expert_bytes; - if (!cuda_stream_resident_make_room(bytes, layer, selected_ids, n_selected)) { + int make_room_ok = cuda_stream_resident_make_room(bytes, layer, selected_ids, n_selected); + if (!make_room_ok) { fprintf(stderr, DS4_GPU_LOG_PREFIX "streaming expert cache cannot keep %.2f MiB " - "for layer=%u expert=%d while preserving %.2f GiB free\n", + "for layer=%u expert=%d while preserving %.2f GiB free; trying host-mapped fallback\n", (double)bytes / 1048576.0, layer, expert, (double)cuda_stream_resident_free_reserve_bytes() / 1073741824.0); - return -1; } void *base = NULL; - cudaError_t err = cudaMalloc(&base, (size_t)bytes); - while (err != cudaSuccess && cuda_stream_resident_evict_one(layer, selected_ids, n_selected)) { - (void)cudaGetLastError(); - err = cudaMalloc(&base, (size_t)bytes); + int host_mapped = 0; + void *host_base = NULL; + if (make_room_ok) { + cudaError_t err = cudaMalloc(&base, (size_t)bytes); + while (err != cudaSuccess && cuda_stream_resident_evict_one(layer, selected_ids, n_selected)) { + (void)cudaGetLastError(); + err = cudaMalloc(&base, (size_t)bytes); + } + if (err != cudaSuccess) { + (void)cudaGetLastError(); + base = NULL; + } } - if (err != cudaSuccess) { - fprintf(stderr, - DS4_GPU_LOG_PREFIX "streaming expert cache allocation failed " - "for layer=%u expert=%d (%.2f MiB): %s\n", - layer, - expert, - (double)bytes / 1048576.0, - cudaGetErrorString(err)); - (void)cudaGetLastError(); - return -1; + + if (!base) { + const char *hm_env = getenv("DS4_ROCM_HOST_MAPPED_EXPERT_FALLBACK"); + if (hm_env && strcmp(hm_env, "0") == 0) { + fprintf(stderr, + DS4_GPU_LOG_PREFIX "streaming expert cache allocation failed " + "for layer=%u expert=%d (%.2f MiB): VRAM exhausted\n", + layer, + expert, + (double)bytes / 1048576.0); + return -1; + } + /* Discrete-VRAM fallback: build a packed contiguous host copy of the + * expert (gate|up|down) and zero-copy map it into GPU address space over + * PCIe. Lets inference proceed (slower) instead of hard-failing when the + * routed-expert working set exceeds VRAM. */ + void *hbuf = malloc((size_t)bytes); + if (hbuf) { + memcpy((char *)hbuf, + (const char *)model_map + gate_offset, + (size_t)gate_expert_bytes); + memcpy((char *)hbuf + gate_expert_bytes, + (const char *)model_map + up_offset, + (size_t)gate_expert_bytes); + memcpy((char *)hbuf + 2ull * gate_expert_bytes, + (const char *)model_map + down_offset, + (size_t)down_expert_bytes); + cudaError_t err = cudaHostRegister(hbuf, (size_t)bytes, cudaHostRegisterMapped); + if (err == cudaSuccess) { + void *d = NULL; + err = cudaHostGetDevicePointer(&d, hbuf, 0); + if (err == cudaSuccess && d) { + base = (char *)d; + host_mapped = 1; + host_base = hbuf; + } else { + (void)cudaHostUnregister(hbuf); + (void)cudaGetLastError(); + } + } else { + (void)cudaGetLastError(); + } + if (!host_mapped) free(hbuf); + } + if (!base) { + fprintf(stderr, + DS4_GPU_LOG_PREFIX "streaming expert host-mapped fallback FAILED " + "for layer=%u expert=%d (%.2f MiB)\n", + layer, expert, (double)bytes / 1048576.0); + return -1; + } } cuda_stream_resident_expert e; @@ -1066,6 +1134,8 @@ static int cuda_stream_resident_alloc( e.down = e.base + 2u * gate_expert_bytes; e.bytes = bytes; e.last_used = ++g_stream_resident_clock; + e.host_mapped = host_mapped; + e.host_base = host_base; g_stream_resident_experts.push_back(e); g_stream_resident_index[cuda_stream_resident_entry_key(e)] = g_stream_resident_experts.size() - 1u; @@ -3551,6 +3621,14 @@ static const ds4_rocm_runtime_config *cuda_runtime_config(void) { } static uint64_t cuda_q8_f16_cache_limit_bytes(void) { + /* Honor DS4_CUDA_Q8_F16_CACHE_MB (0 disables the optional q8->f16 cache, + * freeing VRAM for routed-expert residency on tight discrete GPUs). */ + const char *e = getenv("DS4_CUDA_Q8_F16_CACHE_MB"); + if (e && *e) { + char *end = NULL; + long mb = strtol(e, &end, 10); + if (end != e && mb >= 0) return (uint64_t)mb * 1048576ull; + } return UINT64_MAX; } @@ -4174,6 +4252,40 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) { return (char *)dev; } +/* Detect whether the active GPU is discrete (separate VRAM) or integrated + * (unified memory, e.g. Strix Halo APU). On discrete GPUs, raw host pointers + * are accessible from kernels but traverse PCIe on every read; the + * cudaHostRegister mapping path is preferred. On integrated GPUs, host memory + * is directly accessible and the raw pointer path is fine. + * + * DS4_ROCM_REGISTERED_WEIGHTS overrides auto-detection: + * "1" = always treat as discrete (force registered path) + * "0" = always treat as integrated (force raw host pointer path) + * + * When the attribute query fails (unsupported ROCm version, etc.) the function + * defaults to discrete, which is the safer choice: the registered path works on + * all GPUs, while the raw pointer path silently degrades performance on + * discrete cards. */ +static int cuda_device_is_discrete(void) { + static int cached = -1; + if (cached >= 0) return cached; + + const char *env = getenv("DS4_ROCM_REGISTERED_WEIGHTS"); + if (env && *env) { + cached = (strcmp(env, "0") == 0) ? 0 : 1; + return cached; + } + + int integrated = 0; + if (cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, 0) == cudaSuccess) { + cached = integrated ? 0 : 1; + } else { + (void)cudaGetLastError(); + cached = 1; + } + return cached; +} + static const char *cuda_model_range_ptr_from_fd( const void *model_map, uint64_t offset, @@ -4183,11 +4295,13 @@ static const char *cuda_model_range_ptr_from_fd( if (g_model_fd_host_base != NULL && model_map != g_model_fd_host_base) return NULL; const uint64_t limit = cuda_model_cache_limit_bytes(); if (g_model_range_bytes > limit || bytes > limit - g_model_range_bytes) { + if (cuda_device_is_discrete()) return NULL; return cuda_model_ptr(model_map, offset); } char *dev = cuda_model_arena_alloc(bytes, what); if (!dev) { + if (cuda_device_is_discrete()) return NULL; return cuda_model_ptr(model_map, offset); } cudaError_t err = cudaSuccess;