antirez · fmolara · Jun 26, 2026
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
@@ -2029,25 +2029,107 @@ static int cuda_stream_expert_cache_load_slot(
         down_offset + (uint64_t)expert * down_expert_bytes;
     const uint64_t gate_dst = (uint64_t)slot * gate_expert_bytes;
     const uint64_t down_dst = (uint64_t)slot * down_expert_bytes;
-    if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
-                                            model_map,
-                                            model_size,
-                                            gate_src,
-                                            gate_expert_bytes,
-                                            "cached moe_gate") ||
-        !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
-                                            model_map,
-                                            model_size,
-                                            up_src,
-                                            gate_expert_bytes,
-                                            "cached moe_up") ||
-        !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
-                                            model_map,
-                                            model_size,
-                                            down_src,
-                                            down_expert_bytes,
-                                            "cached moe_down")) {
-        return 0;
+    int loaded = 0;
+    if (g_model_fd >= 0 &&
+        (g_model_fd_host_base == NULL || model_map == g_model_fd_host_base)) {
+        const uint64_t chunk = cuda_model_copy_chunk_bytes();
+        if (gate_expert_bytes <= chunk && down_expert_bytes <= chunk) {
+            const uint64_t stage_bytes =
+                chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1);
+            if (!cuda_stream_selected_stage_pool_alloc(stage_bytes)) return 0;
+
+            char *dst[3] = {
+                cache->gate_ptr + gate_dst,
+                cache->up_ptr + gate_dst,
+                cache->down_ptr + down_dst
+            };
+            uint64_t src[3] = { gate_src, up_src, down_src };
+            uint64_t bytes[3] = {
+                gate_expert_bytes,
+                gate_expert_bytes,
+                down_expert_bytes
+            };
+            const char *what[3] = {
+                "cached moe_gate",
+                "cached moe_up",
+                "cached moe_down"
+            };
+            int enqueued = 0;
+            for (uint32_t i = 0; i < 3; i++) {
+                const char *payload = NULL;
+                if (!cuda_model_stage_read(g_stream_selected_stage[i],
+                                           g_stream_selected_stage_bytes,
+                                           src[i],
+                                           bytes[i],
+                                           &payload)) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected read failed for %s: %s\n",
+                            what[i],
+                            strerror(errno));
+                    if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                cudaError_t err = cudaMemcpyAsync(dst[i],
+                                                  payload,
+                                                  (size_t)bytes[i],
+                                                  cudaMemcpyHostToDevice,
+                                                  g_stream_selected_upload_stream);
+                if (err != cudaSuccess) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected copy failed for %s: %s\n",
+                            what[i],
+                            cudaGetErrorString(err));
+                    (void)cudaGetLastError();
+                    if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                enqueued = 1;
+                err = cudaEventRecord(g_stream_selected_stage_event[i],
+                                      g_stream_selected_upload_stream);
+                if (err != cudaSuccess) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected staging record failed for %s: %s\n",
+                            what[i],
+                            cudaGetErrorString(err));
+                    (void)cudaGetLastError();
+                    (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                cuda_model_drop_file_pages(src[i], bytes[i]);
+                cuda_model_discard_source_pages(model_map, model_size, src[i], bytes[i]);
+            }
+            cudaError_t err = cudaStreamSynchronize(g_stream_selected_upload_stream);
+            if (err != cudaSuccess) {
+                fprintf(stderr,
+                        "ds4: CUDA streaming selected upload sync failed for cached moe expert: %s\n",
+                        cudaGetErrorString(err));
+                (void)cudaGetLastError();
+                return 0;
+            }
+            loaded = 1;
+        }
+    }
+    if (!loaded) {
+        if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
+                                                model_map,
+                                                model_size,
+                                                gate_src,
+                                                gate_expert_bytes,
+                                                "cached moe_gate") ||
+            !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
+                                                model_map,
+                                                model_size,
+                                                up_src,
+                                                gate_expert_bytes,
+                                                "cached moe_up") ||
+            !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
+                                                model_map,
+                                                model_size,
+                                                down_src,
+                                                down_expert_bytes,
+                                                "cached moe_down")) {
+            return 0;
+        }
     }
     cuda_stream_expert_cache_slot &entry = cache->slots[slot];
     entry.valid = 1;