openinfer-project · xiaguan · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/docs/index.md b/docs/index.md
@@ -30,6 +30,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB, batched greedy sampling (#307), mixed greedy/non-greedy sampling (#284), and pegaflow KV offload (#316) are landed; open set is zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, and YaRN #8 follow-up. |
 | `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. |
 | `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. |
+| `models/qwen3/dflash-speculative-decoding.md` | DFlash speculative decoding behind `--dflash-draft-model-path`, modelled as an optimistic transaction (propose K → verify K+1 span → accept longest argmax prefix + 1 bonus → commit/roll back KV). Lossless up to bf16 tie-flips (bit-identical multi-token accepts; lm-eval gsm8k strict-match identical spec on/off). Single-stream decode 1.82× on 5070 Ti, 1.56× on 5090. Concurrent throughput fixed by batching the draft forward, then a piecewise verify CUDA Graph (dense ops captured, attention eager) closed single-stream: 5090 greedy c1 274 ≈ vLLM 278, c8 1525 > 1240, c16 1834 ≈ 1846 — all batch sizes now ≥ vLLM. Accept measured equal (9.1% vs 8.85%, same drafter); draft-side piecewise graph tracked next. Proposer trait deferred to EAGLE. |
 | `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. |
 | `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. |
 | `models/qwen3/tp-design.md` | Qwen3 tensor-parallel design: `TP=2` milestone scope plus the controller/worker broadcast execution model, request identity, and coarse-grained step protocol for future TP/MoE work. |

diff --git a/docs/models/qwen3/dflash-speculative-decoding.md b/docs/models/qwen3/dflash-speculative-decoding.md
diff --git a/openinfer-core/src/ops.rs b/openinfer-core/src/ops.rs
@@ -15,6 +15,7 @@ pub use attention::{
 pub use openinfer_kernels::ops::{
     GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into,
     add_batch, add_batch_into, argmax, argmax_batch_bf16_into, bf16_hidden_to_f32_into,
+    copy_hidden_rows_into, copy_hidden_token_range_into, dflash_qk_norm_rope_into,
     embedding_decode_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into,
     f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm,
     gemm_graphsafe_into_checked, gemm_graphsafe_ref_into_checked, gemm_into_checked, gemm_lt_tune,
@@ -23,7 +24,8 @@ pub use openinfer_kernels::ops::{
     qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
     rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
     scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
-    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into,
+    scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
+    single_prefill_nhd_noncausal_into, write_vec_into,
 };
 #[cfg(not(feature = "kernel-call-trace"))]
 pub use openinfer_kernels::ops::{

diff --git a/openinfer-core/src/ops/paged_plan.rs b/openinfer-core/src/ops/paged_plan.rs
@@ -147,6 +147,53 @@ impl PrefillPagedPlan {
         })
     }
 
+    /// Pre-allocate a worst-case-sized plan to be refilled in place by
+    /// [`Self::update_batch_with_cta_tile_q`] (graph-stable buffer reuse).
+    pub fn new_preallocated(
+        ctx: &DeviceContext,
+        max_total_tokens: usize,
+        max_total_pages: usize,
+        max_batch: usize,
+        max_tiles: usize,
+    ) -> Result<Self> {
+        Ok(Self {
+            inner: openinfer_kernels::ops::PrefillPagedPlan::new_preallocated(
+                ctx,
+                max_total_tokens,
+                max_total_pages,
+                max_batch,
+                max_tiles,
+            )?,
+        })
+    }
+
+    /// Refill a pre-allocated plan in place (no allocation, pointers unchanged).
+    #[allow(clippy::too_many_arguments)]
+    pub fn update_batch_with_cta_tile_q(
+        &mut self,
+        ctx: &DeviceContext,
+        page_indices: &[Vec<i32>],
+        last_page_lens: &[usize],
+        start_positions: &[usize],
+        seq_lens: &[usize],
+        num_q_heads: usize,
+        num_kv_heads: usize,
+        head_dim: usize,
+        cta_tile_q_override: i32,
+    ) -> Result<()> {
+        self.inner.update_batch_with_cta_tile_q(
+            ctx,
+            page_indices,
+            last_page_lens,
+            start_positions,
+            seq_lens,
+            num_q_heads,
+            num_kv_heads,
+            head_dim,
+            cta_tile_q_override,
+        )
+    }
+
     pub fn page_indices_d(&self) -> &CudaSlice<i32> {
         self.inner.page_indices_d()
     }

diff --git a/openinfer-kernels/csrc/shared/elementwise.cu b/openinfer-kernels/csrc/shared/elementwise.cu
@@ -63,6 +63,43 @@ __global__ void gather_hidden_tokens_kernel(
   }
 }
 
+__global__ void copy_hidden_rows_kernel(
+    const __nv_bfloat16 *__restrict__ src,
+    __nv_bfloat16 *__restrict__ dst,
+    int src_hidden_dim,
+    int dst_hidden_dim,
+    int row_offset,
+    int rows,
+    int seq_len) {
+  int total = rows * seq_len;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < total;
+       idx += gridDim.x * blockDim.x) {
+    int token = idx / rows;
+    int row = idx % rows;
+    dst[(size_t)token * dst_hidden_dim + row_offset + row] =
+        src[(size_t)token * src_hidden_dim + row];
+  }
+}
+
+__global__ void copy_hidden_token_range_kernel(
+    const __nv_bfloat16 *__restrict__ src,
+    __nv_bfloat16 *__restrict__ dst,
+    int hidden_dim,
+    int src_token_offset,
+    int dst_token_offset,
+    int token_count) {
+  int total = hidden_dim * token_count;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < total;
+       idx += gridDim.x * blockDim.x) {
+    int token = idx / hidden_dim;
+    int row = idx % hidden_dim;
+    dst[(size_t)(dst_token_offset + token) * hidden_dim + row] =
+        src[(size_t)(src_token_offset + token) * hidden_dim + row];
+  }
+}
+
 __global__ void scaled_add_rows_indexed_kernel(
     const __nv_bfloat16 *__restrict__ delta,
     float scale,
@@ -305,6 +342,53 @@ CUresult gather_hidden_tokens_cuda(
   return (CUresult)cudaGetLastError();
 }
 
+CUresult copy_hidden_rows_cuda(
+    const __nv_bfloat16 *src,
+    __nv_bfloat16 *dst,
+    int src_hidden_dim,
+    int dst_hidden_dim,
+    int row_offset,
+    int rows,
+    int seq_len,
+    cudaStream_t stream) {
+  if (src == nullptr || dst == nullptr || src_hidden_dim <= 0 ||
+      dst_hidden_dim <= 0 || row_offset < 0 || rows <= 0 || seq_len <= 0 ||
+      rows > src_hidden_dim || row_offset + rows > dst_hidden_dim) {
+    return CUDA_ERROR_INVALID_VALUE;
+  }
+  int total = rows * seq_len;
+  int block = 256;
+  int grid = (total + block - 1) / block;
+  copy_hidden_rows_kernel<<<grid, block, 0, stream>>>(
+      src, dst, src_hidden_dim, dst_hidden_dim, row_offset, rows, seq_len);
+  return (CUresult)cudaGetLastError();
+}
+
+CUresult copy_hidden_token_range_cuda(
+    const __nv_bfloat16 *src,
+    __nv_bfloat16 *dst,
+    int hidden_dim,
+    int src_token_offset,
+    int dst_token_offset,
+    int token_count,
+    int src_seq_len,
+    int dst_seq_len,
+    cudaStream_t stream) {
+  if (src == nullptr || dst == nullptr || hidden_dim <= 0 ||
+      src_token_offset < 0 || dst_token_offset < 0 || token_count <= 0 ||
+      src_seq_len <= 0 || dst_seq_len <= 0 ||
+      src_token_offset + token_count > src_seq_len ||
+      dst_token_offset + token_count > dst_seq_len) {
+    return CUDA_ERROR_INVALID_VALUE;
+  }
+  int total = hidden_dim * token_count;
+  int block = 256;
+  int grid = (total + block - 1) / block;
+  copy_hidden_token_range_kernel<<<grid, block, 0, stream>>>(
+      src, dst, hidden_dim, src_token_offset, dst_token_offset, token_count);
+  return (CUresult)cudaGetLastError();
+}
+
 CUresult scaled_add_rows_indexed_cuda(
     const __nv_bfloat16 *delta,
     float scale,

diff --git a/openinfer-kernels/csrc/shared/paged_attention.cu b/openinfer-kernels/csrc/shared/paged_attention.cu
@@ -607,6 +607,70 @@ int single_prefill_cuda(
             reinterpret_cast<cudaStream_t>(stream)));
 }
 
+int single_prefill_nhd_noncausal_cuda(
+    // Q and output (HiddenStates token-major: [seq_len, q_dim])
+    void*    q,
+    void*    output,
+    // Contiguous KV cache (HiddenStates token-major: [max_seq_len, kv_dim])
+    void*    k_cache,
+    void*    v_cache,
+    int32_t  num_qo_heads,
+    int32_t  num_kv_heads,
+    int32_t  head_dim,
+    int32_t  seq_len,
+    int32_t  kv_len,
+    int32_t  max_seq_len,
+    float    sm_scale,
+    void*    stream)
+{
+    if (q == nullptr || output == nullptr || k_cache == nullptr || v_cache == nullptr ||
+        num_qo_heads <= 0 || num_kv_heads <= 0 || head_dim != 128 ||
+        seq_len <= 0 || kv_len <= 0 || max_seq_len < kv_len) {
+        return static_cast<int>(cudaErrorInvalidValue);
+    }
+
+    uint32_t q_stride_n  = num_qo_heads * head_dim;
+    uint32_t q_stride_h  = head_dim;
+    uint32_t kv_stride_n = num_kv_heads * head_dim;
+    uint32_t kv_stride_h = head_dim;
+
+    PrefillParamsT params(
+        reinterpret_cast<DType*>(q),
+        reinterpret_cast<DType*>(k_cache),
+        reinterpret_cast<DType*>(v_cache),
+        /*maybe_custom_mask=*/nullptr,
+        reinterpret_cast<DType*>(output),
+        /*lse=*/nullptr,
+        /*maybe_alibi_slopes=*/nullptr,
+        num_qo_heads,
+        num_kv_heads,
+        static_cast<uint32_t>(seq_len),
+        static_cast<uint32_t>(kv_len),
+        q_stride_n,
+        q_stride_h,
+        kv_stride_n,
+        kv_stride_h,
+        static_cast<uint32_t>(head_dim),
+        /*window_left=*/-1,
+        /*logits_soft_cap=*/0.0f,
+        sm_scale,
+        /*rope_scale=*/1.0f,
+        /*rope_theta=*/1e6f);
+
+    return static_cast<int>(
+        SinglePrefillWithKVCacheDispatched<
+            /*HEAD_DIM_QK=*/128,
+            /*HEAD_DIM_VO=*/128,
+            PosEncodingMode::kNone,
+            /*USE_FP16_QK_REDUCTION=*/false,
+            MaskMode::kNone,
+            Variant,
+            PrefillParamsT>(
+            params,
+            /*tmp=*/nullptr,
+            reinterpret_cast<cudaStream_t>(stream)));
+}
+
 // ---------------------------------------------------------------------------
 // Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache.
 //

diff --git a/openinfer-kernels/csrc/shared/prefill_attention.cu b/openinfer-kernels/csrc/shared/prefill_attention.cu
@@ -93,6 +93,90 @@ __global__ void prefill_qk_norm_rope_kernel(
     data[offset] = result;
 }
 
+__global__ void dflash_qk_norm_rope_kernel(
+    __nv_bfloat16* __restrict__ q,
+    __nv_bfloat16* __restrict__ k,
+    const __nv_bfloat16* __restrict__ q_norm_weight,
+    const __nv_bfloat16* __restrict__ k_norm_weight,
+    const __nv_bfloat16* __restrict__ cos_cache,
+    const __nv_bfloat16* __restrict__ sin_cache,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int q_len,
+    int k_len,
+    int q_start_pos,
+    int k_start_pos,
+    float eps,
+    int cos_max_pos
+) {
+    int head_global = blockIdx.x;
+    int token = blockIdx.y;
+    int d = threadIdx.x;
+
+    bool is_q = (head_global < num_q_heads);
+    int local_heads = is_q ? num_q_heads : num_kv_heads;
+    int seq_len = is_q ? q_len : k_len;
+    if (token >= seq_len) return;
+
+    int head_local = is_q ? head_global : (head_global - num_q_heads);
+    if (head_local >= local_heads) return;
+
+    __nv_bfloat16* data = is_q ? q : k;
+    int dim_stride = local_heads * head_dim;
+    const __nv_bfloat16* norm_w = is_q ? q_norm_weight : k_norm_weight;
+    int pos = (is_q ? q_start_pos : k_start_pos) + token;
+    if (pos < 0 || pos >= cos_max_pos) __trap();
+
+    int offset = token * dim_stride + head_local * head_dim + d;
+    float val = __bfloat162float(data[offset]);
+
+    float sq = warp_reduce_sum(val * val);
+    int warp_id = d / WARP_SIZE;
+    int lane_id = d % WARP_SIZE;
+    __shared__ float warp_sums[4];
+    if (lane_id == 0) warp_sums[warp_id] = sq;
+    __syncthreads();
+
+    __shared__ float s_inv_rms;
+    if (warp_id == 0) {
+        float v = (lane_id < 4) ? warp_sums[lane_id] : 0.0f;
+        float total = warp_reduce_sum(v);
+        if (lane_id == 0) s_inv_rms = rsqrtf(total / head_dim + eps);
+    }
+    __syncthreads();
+
+    __nv_bfloat16 normed = __float2bfloat16(val * s_inv_rms);
+    float normed_f = __bfloat162float(normed) * __bfloat162float(norm_w[d]);
+
+    __shared__ __nv_bfloat16 smem[HEAD_DIM];
+    smem[d] = __float2bfloat16(normed_f);
+    __syncthreads();
+
+    int half = head_dim / 2;
+    __nv_bfloat16 result;
+    if (d < half) {
+        float lo = __bfloat162float(smem[d]);
+        float hi = __bfloat162float(smem[d + half]);
+        float c = __bfloat162float(cos_cache[pos * head_dim + d]);
+        float s = __bfloat162float(sin_cache[pos * head_dim + d]);
+        float lo_cos = __bfloat162float(__float2bfloat16(lo * c));
+        float hi_sin = __bfloat162float(__float2bfloat16(hi * s));
+        result = __float2bfloat16(lo_cos - hi_sin);
+    } else {
+        int pair_d = d - half;
+        float lo = __bfloat162float(smem[pair_d]);
+        float hi = __bfloat162float(smem[d]);
+        float c = __bfloat162float(cos_cache[pos * head_dim + pair_d]);
+        float s = __bfloat162float(sin_cache[pos * head_dim + pair_d]);
+        float lo_sin = __bfloat162float(__float2bfloat16(lo * s));
+        float hi_cos = __bfloat162float(__float2bfloat16(hi * c));
+        result = __float2bfloat16(lo_sin + hi_cos);
+    }
+
+    data[offset] = result;
+}
+
 extern "C" {
 
 // ============================================================================
@@ -136,4 +220,38 @@ void qk_norm_rope_batched_decode_cuda(
     );
 }
 
+int dflash_qk_norm_rope_cuda(
+    __nv_bfloat16* q,
+    __nv_bfloat16* k,
+    const __nv_bfloat16* q_norm_weight,
+    const __nv_bfloat16* k_norm_weight,
+    const __nv_bfloat16* cos_cache,
+    const __nv_bfloat16* sin_cache,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int q_len,
+    int k_len,
+    int q_start_pos,
+    int k_start_pos,
+    float rms_eps,
+    int cos_max_pos,
+    cudaStream_t stream
+) {
+    if (q == nullptr || k == nullptr || q_norm_weight == nullptr ||
+        k_norm_weight == nullptr || cos_cache == nullptr || sin_cache == nullptr ||
+        num_q_heads <= 0 || num_kv_heads <= 0 || head_dim != HEAD_DIM ||
+        q_len <= 0 || k_len <= 0 || q_start_pos < 0 || k_start_pos < 0 ||
+        q_start_pos + q_len > cos_max_pos || k_start_pos + k_len > cos_max_pos) {
+        return static_cast<int>(cudaErrorInvalidValue);
+    }
+
+    dim3 grid(num_q_heads + num_kv_heads, q_len > k_len ? q_len : k_len);
+    dflash_qk_norm_rope_kernel<<<grid, head_dim, 0, stream>>>(
+        q, k, q_norm_weight, k_norm_weight, cos_cache, sin_cache,
+        num_q_heads, num_kv_heads, head_dim, q_len, k_len,
+        q_start_pos, k_start_pos, rms_eps, cos_max_pos);
+    return static_cast<int>(cudaGetLastError());
+}
+
 } // extern "C"