Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 101 additions & 19 deletions ds4_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2029,25 +2029,107 @@ static int cuda_stream_expert_cache_load_slot(
down_offset + (uint64_t)expert * down_expert_bytes;
const uint64_t gate_dst = (uint64_t)slot * gate_expert_bytes;
const uint64_t down_dst = (uint64_t)slot * down_expert_bytes;
if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
model_map,
model_size,
gate_src,
gate_expert_bytes,
"cached moe_gate") ||
!cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
model_map,
model_size,
up_src,
gate_expert_bytes,
"cached moe_up") ||
!cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
model_map,
model_size,
down_src,
down_expert_bytes,
"cached moe_down")) {
return 0;
int loaded = 0;
if (g_model_fd >= 0 &&
(g_model_fd_host_base == NULL || model_map == g_model_fd_host_base)) {
const uint64_t chunk = cuda_model_copy_chunk_bytes();
if (gate_expert_bytes <= chunk && down_expert_bytes <= chunk) {
const uint64_t stage_bytes =
chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1);
if (!cuda_stream_selected_stage_pool_alloc(stage_bytes)) return 0;

char *dst[3] = {
cache->gate_ptr + gate_dst,
cache->up_ptr + gate_dst,
cache->down_ptr + down_dst
};
uint64_t src[3] = { gate_src, up_src, down_src };
uint64_t bytes[3] = {
gate_expert_bytes,
gate_expert_bytes,
down_expert_bytes
};
const char *what[3] = {
"cached moe_gate",
"cached moe_up",
"cached moe_down"
};
int enqueued = 0;
for (uint32_t i = 0; i < 3; i++) {
const char *payload = NULL;
if (!cuda_model_stage_read(g_stream_selected_stage[i],
g_stream_selected_stage_bytes,
src[i],
bytes[i],
&payload)) {
fprintf(stderr,
"ds4: CUDA streaming selected read failed for %s: %s\n",
what[i],
strerror(errno));
if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
return 0;
}
cudaError_t err = cudaMemcpyAsync(dst[i],
payload,
(size_t)bytes[i],
cudaMemcpyHostToDevice,
g_stream_selected_upload_stream);
if (err != cudaSuccess) {
fprintf(stderr,
"ds4: CUDA streaming selected copy failed for %s: %s\n",
what[i],
cudaGetErrorString(err));
(void)cudaGetLastError();
if (enqueued) (void)cudaStreamSynchronize(g_stream_selected_upload_stream);
return 0;
}
enqueued = 1;
err = cudaEventRecord(g_stream_selected_stage_event[i],
g_stream_selected_upload_stream);
if (err != cudaSuccess) {
fprintf(stderr,
"ds4: CUDA streaming selected staging record failed for %s: %s\n",
what[i],
cudaGetErrorString(err));
(void)cudaGetLastError();
(void)cudaStreamSynchronize(g_stream_selected_upload_stream);
return 0;
}
cuda_model_drop_file_pages(src[i], bytes[i]);
cuda_model_discard_source_pages(model_map, model_size, src[i], bytes[i]);
}
cudaError_t err = cudaStreamSynchronize(g_stream_selected_upload_stream);
if (err != cudaSuccess) {
fprintf(stderr,
"ds4: CUDA streaming selected upload sync failed for cached moe expert: %s\n",
cudaGetErrorString(err));
(void)cudaGetLastError();
return 0;
}
loaded = 1;
}
}
if (!loaded) {
if (!cuda_model_copy_to_device_streamed(cache->gate_ptr + gate_dst,
model_map,
model_size,
gate_src,
gate_expert_bytes,
"cached moe_gate") ||
!cuda_model_copy_to_device_streamed(cache->up_ptr + gate_dst,
model_map,
model_size,
up_src,
gate_expert_bytes,
"cached moe_up") ||
!cuda_model_copy_to_device_streamed(cache->down_ptr + down_dst,
model_map,
model_size,
down_src,
down_expert_bytes,
"cached moe_down")) {
return 0;
}
}
cuda_stream_expert_cache_slot &entry = cache->slots[slot];
entry.valid = 1;
Expand Down