Skip to content
Open
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
| `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB, batched greedy sampling (#307), mixed greedy/non-greedy sampling (#284), and pegaflow KV offload (#316) are landed; open set is zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, and YaRN #8 follow-up. |
| `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. |
| `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. |
| `models/qwen3/dflash-speculative-decoding.md` | DFlash speculative decoding behind `--dflash-draft-model-path`, modelled as an optimistic transaction (propose K → verify K+1 span → accept longest argmax prefix + 1 bonus → commit/roll back KV). Lossless up to bf16 tie-flips (bit-identical multi-token accepts; lm-eval gsm8k strict-match identical spec on/off). Single-stream decode 1.82× on 5070 Ti, 1.56× on 5090. Concurrent throughput fixed by batching the draft forward, then a piecewise verify CUDA Graph (dense ops captured, attention eager) closed single-stream: 5090 greedy c1 274 ≈ vLLM 278, c8 1525 > 1240, c16 1834 ≈ 1846 — all batch sizes now ≥ vLLM. Accept measured equal (9.1% vs 8.85%, same drafter); draft-side piecewise graph tracked next. Proposer trait deferred to EAGLE. |
| `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. |
| `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. |
| `models/qwen3/tp-design.md` | Qwen3 tensor-parallel design: `TP=2` milestone scope plus the controller/worker broadcast execution model, request identity, and coarse-grained step protocol for future TP/MoE work. |
Expand Down
143 changes: 143 additions & 0 deletions docs/models/qwen3/dflash-speculative-decoding.md

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion openinfer-core/src/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pub use attention::{
pub use openinfer_kernels::ops::{
GEMM_LT_MAX_N, LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into,
add_batch, add_batch_into, argmax, argmax_batch_bf16_into, bf16_hidden_to_f32_into,
copy_hidden_rows_into, copy_hidden_token_range_into, dflash_qk_norm_rope_into,
embedding_decode_into, extract_vec, extract_vec_into, extract_vec_ref, extract_vec_ref_into,
f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm,
gemm_graphsafe_into_checked, gemm_graphsafe_ref_into_checked, gemm_into_checked, gemm_lt_tune,
Expand All @@ -23,7 +24,8 @@ pub use openinfer_kernels::ops::{
qk_norm_partial_rope_batched_decode_hd256_into, rms_norm, rms_norm_batch_offset_into,
rms_norm_gated_batch_into, rms_norm_into, rms_norm_offset_into, scale_f32_in_place,
scaled_add_batch_into, scaled_add_rows_indexed_into, scaled_add_rows_into,
scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into,
scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into,
single_prefill_nhd_noncausal_into, write_vec_into,
};
#[cfg(not(feature = "kernel-call-trace"))]
pub use openinfer_kernels::ops::{
Expand Down
47 changes: 47 additions & 0 deletions openinfer-core/src/ops/paged_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,53 @@ impl PrefillPagedPlan {
})
}

/// Pre-allocate a worst-case-sized plan to be refilled in place by
/// [`Self::update_batch_with_cta_tile_q`] (graph-stable buffer reuse).
pub fn new_preallocated(
ctx: &DeviceContext,
max_total_tokens: usize,
max_total_pages: usize,
max_batch: usize,
max_tiles: usize,
) -> Result<Self> {
Ok(Self {
inner: openinfer_kernels::ops::PrefillPagedPlan::new_preallocated(
ctx,
max_total_tokens,
max_total_pages,
max_batch,
max_tiles,
)?,
})
}

/// Refill a pre-allocated plan in place (no allocation, pointers unchanged).
#[allow(clippy::too_many_arguments)]
pub fn update_batch_with_cta_tile_q(
&mut self,
ctx: &DeviceContext,
page_indices: &[Vec<i32>],
last_page_lens: &[usize],
start_positions: &[usize],
seq_lens: &[usize],
num_q_heads: usize,
num_kv_heads: usize,
head_dim: usize,
cta_tile_q_override: i32,
) -> Result<()> {
self.inner.update_batch_with_cta_tile_q(
ctx,
page_indices,
last_page_lens,
start_positions,
seq_lens,
num_q_heads,
num_kv_heads,
head_dim,
cta_tile_q_override,
)
}

pub fn page_indices_d(&self) -> &CudaSlice<i32> {
self.inner.page_indices_d()
}
Expand Down
84 changes: 84 additions & 0 deletions openinfer-kernels/csrc/shared/elementwise.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,43 @@ __global__ void gather_hidden_tokens_kernel(
}
}

__global__ void copy_hidden_rows_kernel(
const __nv_bfloat16 *__restrict__ src,
__nv_bfloat16 *__restrict__ dst,
int src_hidden_dim,
int dst_hidden_dim,
int row_offset,
int rows,
int seq_len) {
int total = rows * seq_len;
for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < total;
idx += gridDim.x * blockDim.x) {
int token = idx / rows;
int row = idx % rows;
dst[(size_t)token * dst_hidden_dim + row_offset + row] =
src[(size_t)token * src_hidden_dim + row];
}
}

__global__ void copy_hidden_token_range_kernel(
const __nv_bfloat16 *__restrict__ src,
__nv_bfloat16 *__restrict__ dst,
int hidden_dim,
int src_token_offset,
int dst_token_offset,
int token_count) {
int total = hidden_dim * token_count;
for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < total;
idx += gridDim.x * blockDim.x) {
int token = idx / hidden_dim;
int row = idx % hidden_dim;
dst[(size_t)(dst_token_offset + token) * hidden_dim + row] =
src[(size_t)(src_token_offset + token) * hidden_dim + row];
}
}

__global__ void scaled_add_rows_indexed_kernel(
const __nv_bfloat16 *__restrict__ delta,
float scale,
Expand Down Expand Up @@ -305,6 +342,53 @@ CUresult gather_hidden_tokens_cuda(
return (CUresult)cudaGetLastError();
}

CUresult copy_hidden_rows_cuda(
const __nv_bfloat16 *src,
__nv_bfloat16 *dst,
int src_hidden_dim,
int dst_hidden_dim,
int row_offset,
int rows,
int seq_len,
cudaStream_t stream) {
if (src == nullptr || dst == nullptr || src_hidden_dim <= 0 ||
dst_hidden_dim <= 0 || row_offset < 0 || rows <= 0 || seq_len <= 0 ||
rows > src_hidden_dim || row_offset + rows > dst_hidden_dim) {
return CUDA_ERROR_INVALID_VALUE;
}
int total = rows * seq_len;
int block = 256;
int grid = (total + block - 1) / block;
copy_hidden_rows_kernel<<<grid, block, 0, stream>>>(
src, dst, src_hidden_dim, dst_hidden_dim, row_offset, rows, seq_len);
return (CUresult)cudaGetLastError();
}

CUresult copy_hidden_token_range_cuda(
const __nv_bfloat16 *src,
__nv_bfloat16 *dst,
int hidden_dim,
int src_token_offset,
int dst_token_offset,
int token_count,
int src_seq_len,
int dst_seq_len,
cudaStream_t stream) {
if (src == nullptr || dst == nullptr || hidden_dim <= 0 ||
src_token_offset < 0 || dst_token_offset < 0 || token_count <= 0 ||
src_seq_len <= 0 || dst_seq_len <= 0 ||
src_token_offset + token_count > src_seq_len ||
dst_token_offset + token_count > dst_seq_len) {
return CUDA_ERROR_INVALID_VALUE;
}
int total = hidden_dim * token_count;
int block = 256;
int grid = (total + block - 1) / block;
copy_hidden_token_range_kernel<<<grid, block, 0, stream>>>(
src, dst, hidden_dim, src_token_offset, dst_token_offset, token_count);
return (CUresult)cudaGetLastError();
}

CUresult scaled_add_rows_indexed_cuda(
const __nv_bfloat16 *delta,
float scale,
Expand Down
64 changes: 64 additions & 0 deletions openinfer-kernels/csrc/shared/paged_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,70 @@ int single_prefill_cuda(
reinterpret_cast<cudaStream_t>(stream)));
}

int single_prefill_nhd_noncausal_cuda(
// Q and output (HiddenStates token-major: [seq_len, q_dim])
void* q,
void* output,
// Contiguous KV cache (HiddenStates token-major: [max_seq_len, kv_dim])
void* k_cache,
void* v_cache,
int32_t num_qo_heads,
int32_t num_kv_heads,
int32_t head_dim,
int32_t seq_len,
int32_t kv_len,
int32_t max_seq_len,
float sm_scale,
void* stream)
{
if (q == nullptr || output == nullptr || k_cache == nullptr || v_cache == nullptr ||
num_qo_heads <= 0 || num_kv_heads <= 0 || head_dim != 128 ||
seq_len <= 0 || kv_len <= 0 || max_seq_len < kv_len) {
return static_cast<int>(cudaErrorInvalidValue);
}

uint32_t q_stride_n = num_qo_heads * head_dim;
uint32_t q_stride_h = head_dim;
uint32_t kv_stride_n = num_kv_heads * head_dim;
uint32_t kv_stride_h = head_dim;

PrefillParamsT params(
reinterpret_cast<DType*>(q),
reinterpret_cast<DType*>(k_cache),
reinterpret_cast<DType*>(v_cache),
/*maybe_custom_mask=*/nullptr,
reinterpret_cast<DType*>(output),
/*lse=*/nullptr,
/*maybe_alibi_slopes=*/nullptr,
num_qo_heads,
num_kv_heads,
static_cast<uint32_t>(seq_len),
static_cast<uint32_t>(kv_len),
q_stride_n,
q_stride_h,
kv_stride_n,
kv_stride_h,
static_cast<uint32_t>(head_dim),
/*window_left=*/-1,
/*logits_soft_cap=*/0.0f,
sm_scale,
/*rope_scale=*/1.0f,
/*rope_theta=*/1e6f);

return static_cast<int>(
SinglePrefillWithKVCacheDispatched<
/*HEAD_DIM_QK=*/128,
/*HEAD_DIM_VO=*/128,
PosEncodingMode::kNone,
/*USE_FP16_QK_REDUCTION=*/false,
MaskMode::kNone,
Variant,
PrefillParamsT>(
params,
/*tmp=*/nullptr,
reinterpret_cast<cudaStream_t>(stream)));
}

// ---------------------------------------------------------------------------
// Single-request prefill for HEAD_DIM=256 — wraps FlashInfer SinglePrefillWithKVCache.
//
Expand Down
118 changes: 118 additions & 0 deletions openinfer-kernels/csrc/shared/prefill_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,90 @@ __global__ void prefill_qk_norm_rope_kernel(
data[offset] = result;
}

__global__ void dflash_qk_norm_rope_kernel(
__nv_bfloat16* __restrict__ q,
__nv_bfloat16* __restrict__ k,
const __nv_bfloat16* __restrict__ q_norm_weight,
const __nv_bfloat16* __restrict__ k_norm_weight,
const __nv_bfloat16* __restrict__ cos_cache,
const __nv_bfloat16* __restrict__ sin_cache,
int num_q_heads,
int num_kv_heads,
int head_dim,
int q_len,
int k_len,
int q_start_pos,
int k_start_pos,
float eps,
int cos_max_pos
) {
int head_global = blockIdx.x;
int token = blockIdx.y;
int d = threadIdx.x;

bool is_q = (head_global < num_q_heads);
int local_heads = is_q ? num_q_heads : num_kv_heads;
int seq_len = is_q ? q_len : k_len;
if (token >= seq_len) return;

int head_local = is_q ? head_global : (head_global - num_q_heads);
if (head_local >= local_heads) return;

__nv_bfloat16* data = is_q ? q : k;
int dim_stride = local_heads * head_dim;
const __nv_bfloat16* norm_w = is_q ? q_norm_weight : k_norm_weight;
int pos = (is_q ? q_start_pos : k_start_pos) + token;
if (pos < 0 || pos >= cos_max_pos) __trap();

int offset = token * dim_stride + head_local * head_dim + d;
float val = __bfloat162float(data[offset]);

float sq = warp_reduce_sum(val * val);
int warp_id = d / WARP_SIZE;
int lane_id = d % WARP_SIZE;
__shared__ float warp_sums[4];
if (lane_id == 0) warp_sums[warp_id] = sq;
__syncthreads();

__shared__ float s_inv_rms;
if (warp_id == 0) {
float v = (lane_id < 4) ? warp_sums[lane_id] : 0.0f;
float total = warp_reduce_sum(v);
if (lane_id == 0) s_inv_rms = rsqrtf(total / head_dim + eps);
}
__syncthreads();

__nv_bfloat16 normed = __float2bfloat16(val * s_inv_rms);
float normed_f = __bfloat162float(normed) * __bfloat162float(norm_w[d]);

__shared__ __nv_bfloat16 smem[HEAD_DIM];
smem[d] = __float2bfloat16(normed_f);
__syncthreads();

int half = head_dim / 2;
__nv_bfloat16 result;
if (d < half) {
float lo = __bfloat162float(smem[d]);
float hi = __bfloat162float(smem[d + half]);
float c = __bfloat162float(cos_cache[pos * head_dim + d]);
float s = __bfloat162float(sin_cache[pos * head_dim + d]);
float lo_cos = __bfloat162float(__float2bfloat16(lo * c));
float hi_sin = __bfloat162float(__float2bfloat16(hi * s));
result = __float2bfloat16(lo_cos - hi_sin);
} else {
int pair_d = d - half;
float lo = __bfloat162float(smem[pair_d]);
float hi = __bfloat162float(smem[d]);
float c = __bfloat162float(cos_cache[pos * head_dim + pair_d]);
float s = __bfloat162float(sin_cache[pos * head_dim + pair_d]);
float lo_sin = __bfloat162float(__float2bfloat16(lo * s));
float hi_cos = __bfloat162float(__float2bfloat16(hi * c));
result = __float2bfloat16(lo_sin + hi_cos);
}

data[offset] = result;
}

extern "C" {

// ============================================================================
Expand Down Expand Up @@ -136,4 +220,38 @@ void qk_norm_rope_batched_decode_cuda(
);
}

int dflash_qk_norm_rope_cuda(
__nv_bfloat16* q,
__nv_bfloat16* k,
const __nv_bfloat16* q_norm_weight,
const __nv_bfloat16* k_norm_weight,
const __nv_bfloat16* cos_cache,
const __nv_bfloat16* sin_cache,
int num_q_heads,
int num_kv_heads,
int head_dim,
int q_len,
int k_len,
int q_start_pos,
int k_start_pos,
float rms_eps,
int cos_max_pos,
cudaStream_t stream
) {
if (q == nullptr || k == nullptr || q_norm_weight == nullptr ||
k_norm_weight == nullptr || cos_cache == nullptr || sin_cache == nullptr ||
num_q_heads <= 0 || num_kv_heads <= 0 || head_dim != HEAD_DIM ||
q_len <= 0 || k_len <= 0 || q_start_pos < 0 || k_start_pos < 0 ||
q_start_pos + q_len > cos_max_pos || k_start_pos + k_len > cos_max_pos) {
return static_cast<int>(cudaErrorInvalidValue);
}

dim3 grid(num_q_heads + num_kv_heads, q_len > k_len ? q_len : k_len);
dflash_qk_norm_rope_kernel<<<grid, head_dim, 0, stream>>>(
q, k, q_norm_weight, k_norm_weight, cos_cache, sin_cache,
num_q_heads, num_kv_heads, head_dim, q_len, k_len,
q_start_pos, k_start_pos, rms_eps, cos_max_pos);
return static_cast<int>(cudaGetLastError());
}

} // extern "C"
Loading