diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 188b341ad..a1bafb66e 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -2029,25 +2029,107 @@ static int cuda_stream_expert_cache_load_slot( down_offset + (uint64_t)expert * down_expert_bytes; const uint64_t gate_dst = (uint64_t)slot * gate_expert_bytes; const uint64_t down_dst = (uint64_t)slot * down_expert_bytes; - if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst, - model_map, - model_size, - gate_src, - gate_expert_bytes, - "cached moe_gate") || - !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst, - model_map, - model_size, - up_src, - gate_expert_bytes, - "cached moe_up") || - !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst, - model_map, - model_size, - down_src, - down_expert_bytes, - "cached moe_down")) { - return 0; + int loaded = 0; + if (g_model_fd >= 0 && + (g_model_fd_host_base == NULL || model_map == g_model_fd_host_base)) { + const uint64_t chunk = cuda_model_copy_chunk_bytes(); + if (gate_expert_bytes <= chunk && down_expert_bytes <= chunk) { + const uint64_t stage_bytes = + chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1); + if (!cuda_stream_selected_stage_pool_alloc(stage_bytes)) return 0; + + char *dst[3] = { + cache->gate_ptr + gate_dst, + cache->up_ptr + gate_dst, + cache->down_ptr + down_dst + }; + uint64_t src[3] = { gate_src, up_src, down_src }; + uint64_t bytes[3] = { + gate_expert_bytes, + gate_expert_bytes, + down_expert_bytes + }; + const char *what[3] = { + "cached moe_gate", + "cached moe_up", + "cached moe_down" + }; + int enqueued = 0; + for (uint32_t i = 0; i < 3; i++) { + const char *payload = NULL; + if (!cuda_model_stage_read(g_stream_selected_stage[i], + g_stream_selected_stage_bytes, + src[i], + bytes[i], + &payload)) { + fprintf(stderr, + "ds4: CUDA streaming selected read failed for %s: %s\n", + what[i], + strerror(errno)); + if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream); + return 0; + } + cudaError_t err = cudaMemcpyAsync(dst[i], + payload, + (size_t)bytes[i], + cudaMemcpyHostToDevice, + g_stream_selected_upload_stream); + if (err != cudaSuccess) { + fprintf(stderr, + "ds4: CUDA streaming selected copy failed for %s: %s\n", + what[i], + cudaGetErrorString(err)); + (void)cudaGetLastError(); + if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream); + return 0; + } + enqueued = 1; + err = cudaEventRecord(g_stream_selected_stage_event[i], + g_stream_selected_upload_stream); + if (err != cudaSuccess) { + fprintf(stderr, + "ds4: CUDA streaming selected staging record failed for %s: %s\n", + what[i], + cudaGetErrorString(err)); + (void)cudaGetLastError(); + (void)cudaStreamSynchronize(g_stream_selected_upload_stream); + return 0; + } + cuda_model_drop_file_pages(src[i], bytes[i]); + cuda_model_discard_source_pages(model_map, model_size, src[i], bytes[i]); + } + cudaError_t err = cudaStreamSynchronize(g_stream_selected_upload_stream); + if (err != cudaSuccess) { + fprintf(stderr, + "ds4: CUDA streaming selected upload sync failed for cached moe expert: %s\n", + cudaGetErrorString(err)); + (void)cudaGetLastError(); + return 0; + } + loaded = 1; + } + } + if (!loaded) { + if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst, + model_map, + model_size, + gate_src, + gate_expert_bytes, + "cached moe_gate") || + !cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst, + model_map, + model_size, + up_src, + gate_expert_bytes, + "cached moe_up") || + !cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst, + model_map, + model_size, + down_src, + down_expert_bytes, + "cached moe_down")) { + return 0; + } } cuda_stream_expert_cache_slot &entry = cache->slots[slot]; entry.valid = 1;