-
Notifications
You must be signed in to change notification settings - Fork 242
feat(qwen35moe): pooled chunked prefill + snapshot/restore over KVFlash #430
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
03f2c50
33a7cd9
11cd43b
ce5c0ad
bffe297
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,10 @@ inline void kvflash_qk_chunk_scores( | |
| const float * query, | ||
| const KvFlashQkDims & d, | ||
| std::vector<float> & out, | ||
| float missing_score = -2.0f) { | ||
| float missing_score = -2.0f, | ||
| const float * seeded = nullptr, | ||
| float seeded_sentinel = -std::numeric_limits<float>::infinity(), | ||
| int seeded_n = -1) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P2: Default Prompt for AI agents |
||
| const int group = d.n_q_heads / d.n_kv_heads; | ||
| const int n_chunks = (int)pooled_keys.size(); | ||
| out.assign((size_t)n_chunks, missing_score); | ||
|
|
@@ -83,6 +86,19 @@ inline void kvflash_qk_chunk_scores( | |
| } | ||
| out[(size_t)c] = acc * inv_layers; // layer-MEAN (Phase-0 config) | ||
| } | ||
| // Seeded fallback: for chunks with no pooled key, use the ledger score from | ||
| // a prior turn if it is not the sentinel (i.e. it was actually scored). | ||
| // seeded_n bounds the valid range of the seeded array; chunks beyond it | ||
| // (n_chunks > seeded array length) fall back to missing_score safely. | ||
| if (seeded) { | ||
| const int seeded_limit = (seeded_n >= 0) ? seeded_n : n_chunks; | ||
| for (int c = 0; c < n_chunks; c++) { | ||
| if (!pooled_keys[(size_t)c] && c < seeded_limit && | ||
| seeded[c] != seeded_sentinel) { | ||
| out[(size_t)c] = seeded[c]; | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace dflash::common | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1009,7 +1009,7 @@ static int mmq_safe_sub_batch() { | |
| static const int v = [](){ | ||
| const char * e = std::getenv("DFLASH_MMQ_SUB_BATCH"); | ||
| if (e) return std::max(1, std::atoi(e)); | ||
| return (query_gpu_compute_sm() >= 80) ? 8 : 1; | ||
| return (query_gpu_compute_sm() >= 80) ? 4 : 1; // Q4_K MMVQ cap=4 on sm_86 | ||
| }(); | ||
| return v; | ||
| } | ||
|
|
@@ -1066,6 +1066,27 @@ static bool eval_moe_hybrid_ffn_batched_core( | |
| if (cl >= 0) { cold_sel[i] = cl; cold_wts[i] = selected_weights[i]; fp_has_cold = true; } | ||
| } | ||
| } | ||
| // Dummy slots (wts==0) may alias a real hot expert's local ID per token → | ||
| // ids_to_sorted_host drops entries → ASSERT in slow ggml_mul_mat_id path. | ||
| for (int t = 0; t < n_tokens; ++t) { | ||
| const int base = t * n_used; | ||
| int32_t next = 0; | ||
| for (int s = 0; s < n_used; ++s) { | ||
| if (hot_wts[base + s] > 0.0f) continue; | ||
| // Bounded search: at most n_hot_init probes. If every ID in | ||
|
cubic-dev-ai[bot] marked this conversation as resolved.
|
||
| // [0, n_hot_init) is already taken by another slot we break and | ||
| // keep `next` as-is (duplicate), which is safe — the zero-weight | ||
| // slot is ignored by ids_to_sorted_host anyway. | ||
| int tries = 0; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P2: Complex dummy-slot normalization logic is duplicated between the cached fast path and the inline rebuild path in the same function. This increases maintenance risk: a future bug fix or behavioral tweak to one loop can be missed in the other, producing path-dependent behavior for the same routing inputs. Extract a shared helper and call it from both paths. Prompt for AI agents |
||
| while (tries < n_hot_init && | ||
| [&]{ for (int k=0; k<n_used; ++k) if (k!=s && hot_sel[base+k]==next) return true; return false; }()) { | ||
| if (++next >= n_hot_init) next = 0; | ||
| ++tries; | ||
| } | ||
| hot_sel[base + s] = next++; | ||
| if (next >= n_hot_init) next = 0; | ||
| } | ||
| } | ||
|
|
||
| CachedHotBatchedGraph & hg = storage.hot_batched_mixed[n_tokens]; | ||
| const bool hg_ok = (hg.valid() && hg.n_tokens == n_tokens) | ||
|
|
@@ -1145,6 +1166,23 @@ static bool eval_moe_hybrid_ffn_batched_core( | |
| } | ||
| } | ||
| } | ||
| // Dummy slots (wts==0) may alias a real hot expert's local ID per token → | ||
| // ids_to_sorted_host drops entries → ASSERT in slow ggml_mul_mat_id path. | ||
| for (int t = 0; t < n_tokens; ++t) { | ||
| const int base = t * n_used; | ||
| int32_t next = 0; | ||
| for (int s = 0; s < n_used; ++s) { | ||
| if (hot_wts[base + s] > 0.0f) continue; | ||
| int tries = 0; | ||
| while (tries < n_hot_init && | ||
| [&]{ for (int k=0; k<n_used; ++k) if (k!=s && hot_sel[base+k]==next) return true; return false; }()) { | ||
| if (++next >= n_hot_init) next = 0; | ||
| ++tries; | ||
| } | ||
| hot_sel[base + s] = next++; | ||
| if (next >= n_hot_init) next = 0; | ||
| } | ||
| } | ||
|
|
||
| // ── Step 2: Build and run hot GPU graph (includes shared expert always) ── | ||
| std::vector<float> hot_partial((size_t)n_embd * (size_t)n_tokens, 0.0f); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.