# Source: imp/imp.conf.example (kekzl/imp, main branch).

# imp.conf — central runtime configuration for the imp inference engine.
#
# Loading precedence (first non-empty wins):
#   1. --config <path>              CLI flag
#   2. $IMP_CONFIG                  environment variable
#   3. ./imp.conf                   working-dir relative
#   4. ~/.config/imp/imp.conf       user config directory
#   5. embedded defaults            (no file, all values below)
#
# Per-run overrides on top via:
#   imp --set kv_cache.dtype=fp8 --set runtime.cuda_graphs=never ...
#
# Lines starting with '#' are comments; section headers in [brackets].

[runtime]
# Engine-wide execution settings: determinism, CUDA graph capture, batching,
# warmup, and low-level debug switches.
# Force deterministic GEMM (cuBLASLt no_reduce_split). Slower but reproducible.
# Implied by deterministic = true.
deterministic_gemm   = false
# Full reproducibility mode for temp=0 evals: enables deterministic MoE
# routing / expert-scatter and top-k sampling kernels, and implies
# deterministic_gemm. Greedy output becomes bit-identical across
# runs/contexts (DetEvalE2ETest). For guarantees and known limits
# (ties, top_k>128, typical_p, GDN cross-context) see docs/determinism.md.
# Legacy env: IMP_DETERMINISTIC=1. Costs a little throughput; off by default.
deterministic        = false
# CUDA Graph capture policy:
#   "auto"   - pick per model, based on architecture support
#   "always" - force capture (fails for MoE with D2H routing)
#   "never"  - disable capture
cuda_graphs          = "auto"
# cudaStreamCaptureMode used for graph capture: "relaxed" (default since
# 2026-05-16) | "global" (legacy strict) | "thread_local".
graph_capture_mode   = "relaxed"
# Also capture prefill into a CUDA graph (default ON since 2026-05-17).
# Auto-disabled when the model's prefill path is uncapturable.
prefill_graph        = true
# Max concurrent sequences in a decode batch (server continuous batching).
max_batch_size       = 4
# Run a warmup forward pass at engine init to prime cuBLAS handles + L2 cache.
# Off by default; opt in for production rollouts where first-request TTFT
# matters. In dev / CI / one-shot runs the warmup is pure overhead and can
# also mask first-request calibration bugs.
warmup               = false
# Override the model's max_seq_len (0 = use the model default).
max_seq_len          = 0
# Disable Programmatic Dependent Launch (PDL). Diagnostic; small perf impact.
no_pdl               = false
# Debug: run the naked FP16 path with FP8, NVFP4, CUDA graphs, and warmup all
# forced off.
debug_raw            = false
# Disable CUDA-graph capture for the SigLIP vision encoder (debug).
no_vision_graph      = false

[kv_cache]
# KV-cache storage format and the FP8 / NVFP4 options that depend on it.
# KV-cache element type: fp16 | fp8 | int8 | int4 | nvfp4
dtype                       = "fp16"
# Allow cuBLASLt non-deterministic algorithms with an FP8 KV cache. Slightly
# faster, but reproducibility is lost.
allow_nondeterministic_fp8  = false
# Restore the legacy auto-upgrade-to-FP8 behavior (off by default since
# 2026-04).
fp8_auto_legacy             = false
# BitDecoding TC path for NVFP4 paged-attention QK (experimental).
bitdecoding_qk              = false
# BitDecoding residual FP16 cache for the newest N tokens (0 = disabled;
# typical 4..32). Only applies with dtype = "nvfp4" and bitdecoding_qk enabled.
bitdecoding_residual_tokens = 0

[attention]
# Each "auto" key picks the kernel automatically based on weight type +
# context length. Override only for benchmarking or compatibility tests.
fp8_prefill          = "auto"
# fp8-QK FMHA family: strictly opt-in ("on"). The raw e4m3 Q/K conversion
# compounds per-layer score error on real activations (#511): teacher-forced
# PPL on gemma-3-12b went 16.6 -> 549 when it served prefill. The default
# "never" routes hd!=128 prefill to the FP16 WMMA kernel instead.
fp8_fmha             = "never"
# TODO(doc): no dedicated description; follows the section-level "auto" rule.
fmha_sm120           = "auto"
# Register-resident FA2 prefill kernel for long prefill (F16, head_dim=128).
# Default "on" since PR #478; QK^T runs in f16 unless fa2_fp16qk=never AND
# fp8_fmha=on (#511). Declines to the FP16 WMMA kernel for unsupported configs.
fmha_fa2             = "on"
# FP16-QK FA2 for SHORT prefill (below fmha_prefill_threshold, hd=128):
# replaces the materialized cuBLAS+softmax path (PR #525, +25-35% pp512
# NVFP4). "never" restores the materialized cuBLAS path.
fa2_fp16qk           = "on"
# f16-accumulate QK^T in the fp16-qk FA2 kernel (#597). GeForce sm_120 runs
# f16-src/f32-acc HMMA at 1/4 rate (#606); f16-acc lifts the score MMA to the
# full-rate class: +4.7-5.0% pp4096 NVFP4 prefill (Qwen3-14B / 30B-A3B). PPL
# on a 5.8k teacher-forced corpus: 14B identical, 30B +0.10%, Q8 +0.013%.
# Default true since 2026-06-11; set false to restore f32 accumulate.
fa2_f16acc           = true
# Also f16-accumulate the PV MMA (full-rate HMMA + halved O-fragment
# registers): FA2 kernel −18% pp4096, e2e +9.7% 30B-A3B / +3.7% 14B NVFP4.
# PPL on a 14.8k teacher-forced corpus: 14B −0.06%, 30B −0.30%, Q8 +0.002%
# (noise). Default true since 2026-06-11. Requires fa2_f16acc = true.
fa2_pv_f16acc        = true
# Prefill chunk length (tokens) at/above which attention routes to FMHA
# (tiled, O(n) memory) instead of the materialized cuBLAS S-matrix path.
# -1 = auto (derived from the cuBLAS S-matrix VRAM cap; see attn_scores_mib).
fmha_prefill_threshold = -1
# TODO(doc): no dedicated description; follows the section-level "auto" rule.
mxfp4                = "auto"
# TODO(doc): undocumented. NOTE(review): presumably an FP16 fallback for the
# MXFP4 path; confirm against the config loader.
mxfp4_fp16_fallback  = false
# MXFP4 → FP16 cache pruning policy: "legacy" caches FP16 for every MXFP4
# tensor; "pruned" skips MoE expert weights + LM head (32 GiB-class loads).
mxfp4_fp16_cache_policy = "legacy"
# TODO(doc): the four keys below are undocumented; describe what each one
# toggles and when to change it.
force_cublas_decode  = false
no_qknorm_fused      = false
splitk_pipe          = true
gate_concat          = false
# Max VRAM (MiB) for the materialized cuBLAS-attention S-matrix; caps how
# long prefill stays on the fast cuBLAS path before falling back to FMHA.
attn_scores_mib      = 384

[moe]
# Expected per-expert overhead (% of expert size) used when deciding whether
# to upload all experts or fall back to host-resident experts.
# 10 = aggressive auto-pick, 30 = conservative.
expert_overhead_pct  = 10
# Force the LAST N MoE layers off-GPU regardless of budget (debug path).
# 0 = disabled (auto-pick).
force_host_experts   = 0
# TODO(doc): the three keys below are undocumented; describe what each one
# toggles and when to change it.
skip                 = false
force_fp16_sync      = false
no_expert_cache      = false
# Zero MoE workspace buffers each layer (memory-safety bisect).
zero_workspace       = false
# Skip Gemma-4/Qwen3-Next/3.6 always-active shared MLP branch.
no_shared_mlp        = false
# Skip Qwen3-Next/3.6 sigmoid gate on shared expert output.
no_shexp_gate        = false
# Force-disable CUTLASS 3.x grouped MoE GEMM path (legacy fallback).
no_cutlass3x         = false
# Assert device-mirror == host LRU after every expert-cache mutation
# (CI/regression diagnosis; costly D2H readback, so never enable in perf runs).
expert_cache_debug_parity = false
# Async H2D prefetch of next layer's top-K (proj, expert) pairs under
# host-offload (0 = off; sensible 3..16).
prefetch_top_k       = 0
# Allow CUDA Graphs while experts are host-offloaded (experimental;
# correct only with 1:1 prefetch coverage).
allow_graphs_under_offload = false
# Per-process MoE workspace reserve override in MiB (0 = computed default).
reserve_mib          = 0
# CUTLASS 3.x device-args full path for NVFP4 MoE prefill (default ON
# since 2026-05-14, +11-39% pp512).
nvfp4_device_args    = true
# nvfp4_smallM: opt-in smallM kernel branch for NVFP4 MoE prefill.
# nvfp4_smallM_threshold: the M threshold for that branch.
nvfp4_smallM         = false
nvfp4_smallM_threshold = 64
# Rows-per-block for multi-row NVFP4 MoE decode GEMV (4|8|16|32).
mr_nr                = 8

[gdn]
# fp32_scan / fp32_out: FP32 scan / output for Gated-DeltaNet (Qwen3.5/3.6).
# Slightly slower but eliminates FP16 precision drift at long context.
fp32_scan            = false
fp32_out             = false
# 0 = use the model's rms_norm_eps; otherwise override (diagnostic).
norm_eps_override    = 0.0
# TODO(doc): the two keys below are undocumented; describe what each one
# toggles and when to change it.
ref_kernel           = false
vhead_reorder        = false
# Chunkwise SSD scan (chunk-cached K/Q in smem) instead of the per-token
# loop. Default ON since 2026-05-29 (+16.7% on the GDN scan kernel).
chunkwise_scan       = true
# Override Gated-DeltaNet weight layout (diagnostic; "" = auto).
layout_override      = ""

[gemm]
# dp4a = INT8 4-element packed multiply-accumulate. mmvq = matrix*vec
# quantized GEMM. The no_* keys below are disable switches: setting one to
# true forces the dequant->cuBLAS path (diagnostic).
no_dp4a_gemv         = false
no_dp4a_lm           = false
no_mmvq              = false
no_mmvq_q8_0         = false
# Extend NVFP4 decode cache to Q4_K/Q3_K/Q2_K models (default: only Q8/Q6/Q5).
# Trades VRAM for decode throughput on sub-8-bit models.
# Example: Gemma-3-12B Q4_K_M goes 77 → 141 tok/s (+82%).
nvfp4_decode_all     = false
# Q4_K HMMA tensor-core GEMM scaffold (experimental, default off).
q4k_hmma_enabled     = false
# Quantize a native FP16/BF16 LM head to an NVFP4 decode cache (dense
# models; ~+8-16% decode). Default ON.
nvfp4_lm_head        = true
# Same as nvfp4_lm_head, for GDN/SSM-hybrid models (Qwen3.6-35B: +11.4% decode
# at +2.2% PPL; set false for maximum coherence). Default ON since PR #483.
nvfp4_lm_head_gdn    = true
# Force GGUF-hybrid GDN/SSM in/out projections into the NVFP4 decode cache
# (Qwen3.6-35B Q4_K_M: +53% decode, PPL flat). Opt-in.
nvfp4_ssm_proj       = false
# Quantize recipe-excluded BF16 attention q/k/v/o on native-NVFP4 hybrids
# (Nemotron-3-Nano: +3.8% decode, PPL-neutral). Opt-in.
nvfp4_attn_proj      = false
# Route native-NVFP4 MoE expert decode (M=1) through per-expert GEMV
# kernels instead of grouped CUTLASS (+54-80% MoE decode). Default ON.
nvfp4_moe_decode     = true
# FP16-accumulate cuBLAS prefill GEMMs (CUBLAS_COMPUTE_16F). GeForce sm_120
# runs FP32-accumulate tensor cores at 1/4 rate, so 32F caps prefill at
# ~225 TFLOPS; 16F roughly doubles GEMM throughput (Qwen3-8B Q8_0 pp512
# +24.9% measured 2026-06-07, PPL flat, decode neutral).
#   "auto" (default) - ON per arch except Gemma-3/4 (+0.7% PPL measured) and
#                      gpt-oss (f16-overflow sensitivity)
#   "on"             - force everywhere
#   "off"            - always 32F
# Decode (M=1) is unaffected.
# The value is quoted like every other string-valued key in this file.
cublas_fp16_acc      = "auto"
# INT8-IMMA prefill family (fused dequant on int8 tensor cores, 968 TOPS
# measured). Validated 2026-06-07 with teacher-forced PPL gates on
# Qwen3-8B/30B-MoE/gemma-4-26B (neutral or better) and Qwen3.6-35B (+0.55%).
# q8_imma_enabled: Q8_0 dense prefill GEMMs (Qwen3-8B Q8_0 pp512 +7% over
# fp16-acc cuBLAS).
q8_imma_enabled      = true
# Grouped MoE expert prefill (one launch over all experts). The big one:
# Qwen3-30B-A3B +151% (ABOVE llama.cpp), gemma-4-26B +111%, Qwen3.6-35B +40%.
moe_imma_prefill     = true
# Dense Q4_K via IMMA: stays opt-in (never validated on a clean dense Q4_K
# model; dense Q6_K measured SLOWER than fp16-acc cuBLAS and is not wired).
q4k_imma_prefill     = false

# [gemma4] section moved out of the global RuntimeConfig in Phase 5
# Track A of the architecture refactor. These are now per-model
# overrides on ModelConfig::Overrides::Gemma4 (see src/model/model_config.h);
# the GGUF loader / engine init resolver populates them when the
# loaded model is Gemma-4. There is no user-facing knob today.

[generation]
# TODO(doc): the two keys below are undocumented; describe what each one
# toggles and when to change it.
no_logit_softcap     = false
lm_dequant_fp16      = false
# Max tokens spent in <think>...</think> blocks (0 = unlimited).
think_budget         = 0
# Force-prepend the BOS token even when the tokenizer says no.
force_bos            = false
# Disable the banned-token list (debug).
no_ban               = false
# Disable RoPE inside the MTP draft head (diagnostic).
mtp_no_rope          = false

[server]
# Server-engine features: prefix caching, prefix pinning, Green Contexts.
# Prefix caching: reuse KV blocks for shared prompt prefixes (system
# prompts, multi-turn history). Read by the engine and ORed into the live
# config at init; PrefixCacheE2ETest guards hit==fresh equality.
# Auto-disabled for recurrent (SSM/GDN) models. Library/C-API embedders
# that do not load imp.conf default to OFF.
prefix_cache         = true
# Cap on KV blocks pinned by Anthropic cache_control / "cache_prompt"
# requests, as a percentage of the KV pool. Oldest pins recycle FIFO.
# 0 = pin requests are ignored.
prefix_pin_budget_pct = 25
# Green Contexts / prefill-decode overlap streams in the server engine.
# Opt-in (off by default) because of a suspected memSyncDomain race on sm_120
# fallback streams (gemma-3-12b IMA). Set true to enable.
green_contexts       = false

[bench]
# For imp-cli --bench: also run the Engine::generate() loop for accurate
# timing.
generate             = false

[paths]
# Path to a vision projector .gguf file, for multimodal models
# (Gemma-3, Mistral3).
mmproj               = ""

[speculative]
# n-gram (prompt-lookup) speculative decoding: drafts come from suffix
# matches against the request's own prompt+output, so no draft model is
# needed. Greedy verify accepts the longest matching prefix, so output stays
# a faithful greedy decode. Restrictions: batch-1, greedy sampling,
# dense-attention models only; the async decode graph loop is disabled while
# this is on. Wins on repetition-heavy (agent/code/RAG) workloads; can cost
# a few % on free-form prose.
ngram                = false
# Draft tokens proposed per verify step (verify cost is ~flat in k).
k                    = 16
# min_match / max_match: shortest / longest suffix n-gram match searched.
# A longer min_match gives fewer but far more precise drafts (6 vs 3: +16% on
# code-edit, and the number-table worst case shrinks from -13% to -2%).
min_match            = 6
max_match            = 12
# Give up on speculation (and return to the fast async decode loop) after
# this many consecutive draft misses, or when acceptance stays under 15%.
# 0 = never give up.
give_up_after        = 64
# Burst-hybrid: a request that has given up runs the async loop in bursts of
# this many tokens and re-probes for drafts in between (think models reach
# their draft-rich region only after the reasoning prose).
# 0 = give-up is final.
burst                = 128
# On a draft miss, run the async loop for this many tokens (cheap rearm,
# no graph recapture) instead of stepping eagerly until the next draft.
# 0 = stay eager between drafts.
miss_burst           = 8

[ffn]
# SwiGLU/GeGLU sparsity probe. Instrumentation only: counts skippable
# intermediate rows per threshold without skipping anything
# (~1 µs/layer/token when on).
sparsity_probe       = false
# FFN down_proj row-skipping per Q8_0 block: skip blocks whose
# |silu(gate)*up| amax is below this threshold. 0.0 = disabled, which keeps
# output bit-identical; recommended range 0.005..0.05. Q8_0 decode only.
sparsity_threshold   = 0.0

[diagnostics]
# Verbose per-layer forward dumps. Heavy I/O, so use only for bisecting bugs.
debug_forward        = false
# Print intermediate Jinja2 chat-template rendering steps.
debug_template       = false
# If non-empty, dump per-layer hidden states as .npy files to this directory.
# Special values: "1" or "all" → /tmp.
dump_hidden_dir      = ""
# dump_logits_dir / dump_routing_dir: if non-empty, dump MoE gate logits
# (pre-topk) and routing weights to file. Special value "all" dumps every
# layer (default: only last + selected layers).
# NOTE(review): presumably logits go to dump_logits_dir and routing weights
# to dump_routing_dir; confirm against the config loader.
dump_logits_dir      = ""
dump_routing_dir     = ""
# Print prefill token IDs to stderr (truncated to 20).
dump_tokens          = false
# Stop forward pass after layer N (-1 = run full forward).
exit_layer           = -1
# TODO(doc): the three keys below are undocumented; describe what each one
# toggles and what graph_dump_dir receives.
profile              = false
graph_diag           = false
graph_dump_dir       = ""
# Force NVFP4 dispatch through dequant→FP16 GEMV (M=1 bisection tool).
nvfp4_force_dequant  = false
# Log shape + per-candidate algo for every cuBLASLt algo selection.
log_gemm_algo        = false
# MTP diagnostics.
# mtp_pattern_log: per-step pattern log.
mtp_pattern_log      = false
# mtp_prenorm_h: pass post-RMSNorm hidden to head.
# NOTE(review): the key name says "prenorm" but the description says
# post-RMSNorm; confirm which hidden state is passed.
mtp_prenorm_h        = false
# Audit NVFP4 weight scales at load time.
audit_nvfp4_scales   = false