From 770feb86c2816bfd0a893ab175e98cc6ff8a51cb Mon Sep 17 00:00:00 2001
From: Federico Molara <federico@molara.net>
Date: Fri, 26 Jun 2026 14:05:33 +0200
Subject: [PATCH] cuda: batch selected expert miss uploads

When a selected expert cache miss occurs in the direct-I/O CUDA path, gate/up/down tensors are currently uploaded independently, synchronizing the selected upload stream after each tensor.

Batch the three uploads for one expert and synchronize the upload stream once after all three have been enqueued.

Layouts, cache policy and kernels are unchanged. Unsupported cases transparently fall back to the existing path.
---
 ds4_cuda.cu | 120 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 101 insertions(+), 19 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 188b341ad..a1bafb66e 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -2029,25 +2029,107 @@ static int cuda_stream_expert_cache_load_slot(
         down_offset + (uint64_t)expert * down_expert_bytes;
     const uint64_t gate_dst = (uint64_t)slot * gate_expert_bytes;
     const uint64_t down_dst = (uint64_t)slot * down_expert_bytes;
-    if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
-                                            model_map,
-                                            model_size,
-                                            gate_src,
-                                            gate_expert_bytes,
-                                            "cached moe_gate") ||
-        !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
-                                            model_map,
-                                            model_size,
-                                            up_src,
-                                            gate_expert_bytes,
-                                            "cached moe_up") ||
-        !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
-                                            model_map,
-                                            model_size,
-                                            down_src,
-                                            down_expert_bytes,
-                                            "cached moe_down")) {
-        return 0;
+    int loaded = 0;
+    if (g_model_fd >= 0 &&
+        (g_model_fd_host_base == NULL || model_map == g_model_fd_host_base)) {
+        const uint64_t chunk = cuda_model_copy_chunk_bytes();
+        if (gate_expert_bytes <= chunk && down_expert_bytes <= chunk) {
+            const uint64_t stage_bytes =
+                chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1);
+            if (!cuda_stream_selected_stage_pool_alloc(stage_bytes)) return 0;
+
+            char *dst[3] = {
+                cache->gate_ptr + gate_dst,
+                cache->up_ptr + gate_dst,
+                cache->down_ptr + down_dst
+            };
+            uint64_t src[3] = { gate_src, up_src, down_src };
+            uint64_t bytes[3] = {
+                gate_expert_bytes,
+                gate_expert_bytes,
+                down_expert_bytes
+            };
+            const char *what[3] = {
+                "cached moe_gate",
+                "cached moe_up",
+                "cached moe_down"
+            };
+            int enqueued = 0;
+            for (uint32_t i = 0; i < 3; i++) {
+                const char *payload = NULL;
+                if (!cuda_model_stage_read(g_stream_selected_stage[i],
+                                           g_stream_selected_stage_bytes,
+                                           src[i],
+                                           bytes[i],
+                                           &payload)) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected read failed for %s: %s\n",
+                            what[i],
+                            strerror(errno));
+                    if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                cudaError_t err = cudaMemcpyAsync(dst[i],
+                                                  payload,
+                                                  (size_t)bytes[i],
+                                                  cudaMemcpyHostToDevice,
+                                                  g_stream_selected_upload_stream);
+                if (err != cudaSuccess) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected copy failed for %s: %s\n",
+                            what[i],
+                            cudaGetErrorString(err));
+                    (void)cudaGetLastError();
+                    if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                enqueued = 1;
+                err = cudaEventRecord(g_stream_selected_stage_event[i],
+                                      g_stream_selected_upload_stream);
+                if (err != cudaSuccess) {
+                    fprintf(stderr,
+                            "ds4: CUDA streaming selected staging record failed for %s: %s\n",
+                            what[i],
+                            cudaGetErrorString(err));
+                    (void)cudaGetLastError();
+                    (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
+                    return 0;
+                }
+                cuda_model_drop_file_pages(src[i], bytes[i]);
+                cuda_model_discard_source_pages(model_map, model_size, src[i], bytes[i]);
+            }
+            cudaError_t err = cudaStreamSynchronize(g_stream_selected_upload_stream);
+            if (err != cudaSuccess) {
+                fprintf(stderr,
+                        "ds4: CUDA streaming selected upload sync failed for cached moe expert: %s\n",
+                        cudaGetErrorString(err));
+                (void)cudaGetLastError();
+                return 0;
+            }
+            loaded = 1;
+        }
+    }
+    if (!loaded) {
+        if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
+                                                model_map,
+                                                model_size,
+                                                gate_src,
+                                                gate_expert_bytes,
+                                                "cached moe_gate") ||
+            !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
+                                                model_map,
+                                                model_size,
+                                                up_src,
+                                                gate_expert_bytes,
+                                                "cached moe_up") ||
+            !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
+                                                model_map,
+                                                model_size,
+                                                down_src,
+                                                down_expert_bytes,
+                                                "cached moe_down")) {
+            return 0;
+        }
     }
     cuda_stream_expert_cache_slot &entry = cache->slots[slot];
     entry.valid = 1;