From 224a9ba98bb0a2174865169b72b79ddc0417388c Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Mon, 4 May 2026 23:12:36 -0400
Subject: [PATCH 01/14] feat(runtime): Added production signals API and SQLite
 storage

Introduce the production-context signal foundation. Add the
internal/signals package with the Signal domain type, validation rules,
single-record POST handler, Store interface, unavailable store fallback, and
retention loop. Signal JSON preserves unknown top-level fields while
server-owned signal_id and received_at are generated only after validation.

Add SQLite-backed signal persistence in coldstore via a new signals table
migration and SignalStore implementation. Signals can be inserted, queried by
service/env/source/reason/type/time window, ordered deterministically, and
pruned by retention cutoff. The storage implementation lives in coldstore so it
can reuse the existing SQLite reader/writer handles and migration ownership.

Wire POST /v1/signals into the ingest server behind write-scope auth. When
SQLITE_PATH is unset, the endpoint returns a structured 503 durability error
without affecting existing v2 read APIs. Add WAYLOG_SIGNAL_RETENTION startup
validation and start a retention janitor only when SQLite-backed signal storage
is available.

Add Prometheus counters for accepted signals, rejected signals by reason, and
retention-pruned signals. Document the new endpoint in OpenAPI and add the new
retention env var to docs/env.md.
---
 cmd/ingest/main.go                            |  14 ++
 docs/env.md                                   |   1 +
 docs/openapi.yaml                             | 118 +++++++++++
 internal/coldstore/migrations/003_signals.sql |  20 ++
 internal/coldstore/signal_store.go            | 188 +++++++++++++++++
 internal/coldstore/signal_store_test.go       |  97 +++++++++
 internal/metrics/metrics.go                   |  25 +++
 internal/signals/handler.go                   | 118 +++++++++++
 internal/signals/handler_test.go              | 143 +++++++++++++
 internal/signals/retention.go                 |  39 ++++
 internal/signals/retention_test.go            |  51 +++++
 internal/signals/store.go                     |  40 ++++
 internal/signals/types.go                     | 190 ++++++++++++++++++
 internal/signals/types_test.go                |  78 +++++++
 internal/signals/validate.go                  |  74 +++++++
 internal/signals/validate_test.go             |  53 +++++
 16 files changed, 1249 insertions(+)
 create mode 100644 internal/coldstore/migrations/003_signals.sql
 create mode 100644 internal/coldstore/signal_store.go
 create mode 100644 internal/coldstore/signal_store_test.go
 create mode 100644 internal/signals/handler.go
 create mode 100644 internal/signals/handler_test.go
 create mode 100644 internal/signals/retention.go
 create mode 100644 internal/signals/retention_test.go
 create mode 100644 internal/signals/store.go
 create mode 100644 internal/signals/types.go
 create mode 100644 internal/signals/types_test.go
 create mode 100644 internal/signals/validate.go
 create mode 100644 internal/signals/validate_test.go

diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 52c700e..599993e 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -32,6 +32,7 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/metrics"
 	otelhttp "github.com/sssmaran/WaylogCLI/internal/otel"
 	"github.com/sssmaran/WaylogCLI/internal/persist"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
 	"github.com/sssmaran/WaylogCLI/internal/tracestore"
 )
@@ -124,6 +125,11 @@ func main() {
 	graphUI := config.GetenvBool("GRAPH_UI", false)
 	otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
 	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
+	signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+	if signalRetention <= 0 {
+		slog.Error("WAYLOG_SIGNAL_RETENTION must be positive", "value", signalRetention)
+		os.Exit(1)
+	}
 
 	causalEnabled := config.GetenvBool("CAUSAL_ENABLED", false)
 	causalInterval := config.GetenvDuration("CAUSAL_INTERVAL", 30*time.Second)
@@ -188,6 +194,7 @@ func main() {
 	// Optional SQLite cold store
 	var coldDB coldstore.ManagedStore
 	var coldWriter *coldstore.BatchWriter
+	var signalStore signals.Store = signals.UnavailableStore{}
 	if sqlitePath != "" {
 		if eventLogDir == "" {
 			slog.Warn("SQLITE_PATH set without EVENT_LOG_DIR — cold store is async-only, " +
@@ -207,6 +214,7 @@ func main() {
 			FlushInterval: config.GetenvDuration("SQLITE_FLUSH_INTERVAL", 500*time.Millisecond),
 		}, m)
 		coldWriter.Start()
+		signalStore = coldstore.NewSignalStore(coldDB.(*coldstore.SQLiteStore))
 
 		slog.Info("coldstore enabled", "path", sqlitePath)
 	}
@@ -355,6 +363,8 @@ func main() {
 	}
 	mux.Handle("/v1/events", writeAuth(http.HandlerFunc(eventsV2.Events)))
 	mux.Handle("/v1/events/validate", writeAuth(http.HandlerFunc(eventsV2.Validate)))
+	signalHandler := signals.NewHandler(signalStore, m)
+	mux.Handle("/v1/signals", writeAuth(http.HandlerFunc(signalHandler.Signals)))
 
 	// OTLP/HTTP traces reuse the same schema-2.0 WAL and projector as the SDK path.
 	if otlpEnabled {
@@ -453,6 +463,10 @@ func main() {
 	)
 	defer stop()
 
+	if _, ok := signalStore.(*coldstore.SignalStore); ok {
+		go signals.RunRetention(ctx, signalStore, signalRetention, 5*time.Minute, m, slog.Default())
+	}
+
 	go func() {
 		slog.Info("ingest listening", "addr", addr, "graph_hot_window", graphHotWindow)
 		if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
diff --git a/docs/env.md b/docs/env.md
index f0c77ea..7ace4cb 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -56,6 +56,7 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
 | `EVENT_LOG_SYNC` | `true` | Per-write fsync. Set `false` for dev/load testing |
 | `EVENT_LOG_MAX_FILE_MB` | `50` | Rotation size. `0` disables rotation |
 | `EVENT_LOG_RETENTION` | `72h` | Event log retention. Must be positive |
+| `WAYLOG_SIGNAL_RETENTION` | `72h` | Production-context signal retention. Must be positive. `/v1/signals` requires `SQLITE_PATH` |
 | `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
 | `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
 | `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 9d874ee..151392b 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -19,6 +19,8 @@ tags:
     description: Schema-2.0 event ingest and validation.
   - name: OTLP
     description: OTLP/HTTP trace ingest converted into schema-2.0 events.
+  - name: Signals
+    description: Production-context facts used by incident triage.
   - name: Events
     description: Direct event lookup and search.
   - name: Traces
@@ -136,6 +138,64 @@ paths:
               schema:
                 $ref: '#/components/schemas/IngestEnvelope'
 
+  /v1/signals:
+    post:
+      tags: [Signals]
+      operationId: ingestSignal
+      summary: Ingest one production-context signal
+      description: |
+        Accepts one low-volume production-context signal, such as a deploy,
+        dependency, runtime, healthcheck, config, or alert fact. Signals require
+        SQLite persistence and are used by the v2.1 incident engine.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/Signal'
+      responses:
+        '201':
+          description: Signal accepted and persisted
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/SignalAccepted'
+        '400':
+          description: Invalid signal JSON or fields
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ReadError'
+        '401':
+          description: Unauthorized
+        '405':
+          description: Method Not Allowed
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ReadError'
+        '413':
+          description: Request body too large
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ReadError'
+        '503':
+          description: Signal storage unavailable; set SQLITE_PATH
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ReadError'
+        '500':
+          description: Internal signal storage error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ReadError'
+
   /v1/otlp/v1/traces:
     post:
       tags: [OTLP]
@@ -911,6 +971,64 @@ components:
           items:
             $ref: '#/components/schemas/ErrorRef'
 
+    Signal:
+      type: object
+      additionalProperties: true
+      required: [type, source, service, env, severity, reason, timestamp]
+      example:
+        type: deploy
+        source: github-actions
+        service: checkout
+        env: prod
+        severity: info
+        reason: RolloutComplete
+        message: checkout 1.18.2 rolled out
+        metadata:
+          deployment_id: deploy_123
+          version: 1.18.2
+        timestamp: "2026-05-02T18:09:40Z"
+      properties:
+        signal_id:
+          type: string
+          readOnly: true
+          description: Server-generated signal id with sig_ prefix.
+        type:
+          type: string
+          enum: [deploy, runtime, healthcheck, dependency, config, alert]
+        source:
+          type: string
+        service:
+          type: string
+        env:
+          type: string
+        severity:
+          type: string
+          enum: [info, warning, critical]
+        reason:
+          type: string
+        message:
+          type: string
+        resource:
+          type: object
+          additionalProperties: true
+        metadata:
+          type: object
+          additionalProperties: true
+        timestamp:
+          type: string
+          format: date-time
+        received_at:
+          type: string
+          format: date-time
+          readOnly: true
+
+    SignalAccepted:
+      type: object
+      required: [signal]
+      properties:
+        signal:
+          $ref: '#/components/schemas/Signal'
+
     Anchor:
       type: object
       required: [step, error_code]
diff --git a/internal/coldstore/migrations/003_signals.sql b/internal/coldstore/migrations/003_signals.sql
new file mode 100644
index 0000000..e33156b
--- /dev/null
+++ b/internal/coldstore/migrations/003_signals.sql
@@ -0,0 +1,20 @@
+-- 003_signals.sql: production-context signal storage.
+
+CREATE TABLE IF NOT EXISTS signals (
+    signal_id   TEXT PRIMARY KEY,
+    type        TEXT NOT NULL,
+    source      TEXT NOT NULL,
+    service     TEXT NOT NULL,
+    env         TEXT NOT NULL,
+    severity    TEXT NOT NULL,
+    reason      TEXT NOT NULL,
+    message     TEXT,
+    resource    TEXT,
+    metadata    TEXT,
+    extra       TEXT,
+    timestamp   TEXT NOT NULL,
+    received_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_signals_service_env_type_ts ON signals (service, env, type, timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_signals_ts ON signals (timestamp);
diff --git a/internal/coldstore/signal_store.go b/internal/coldstore/signal_store.go
new file mode 100644
index 0000000..f9856be
--- /dev/null
+++ b/internal/coldstore/signal_store.go
@@ -0,0 +1,188 @@
+package coldstore
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+)
+
+type SignalStore struct {
+	db *SQLiteStore
+}
+
+func NewSignalStore(db *SQLiteStore) *SignalStore {
+	return &SignalStore{db: db}
+}
+
+func (s *SignalStore) Insert(ctx context.Context, sig *signals.Signal) error {
+	resource, err := marshalMap(sig.Resource)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal resource: %w", err)
+	}
+	metadata, err := marshalMap(sig.Metadata)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal metadata: %w", err)
+	}
+	extra, err := marshalMap(sig.Extra)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal extra: %w", err)
+	}
+	_, err = s.db.writer.ExecContext(ctx, `
+		INSERT INTO signals (
+			signal_id, type, source, service, env, severity, reason, message,
+			resource, metadata, extra, timestamp, received_at
+		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		sig.SignalID, string(sig.Type), sig.Source, sig.Service, sig.Env, string(sig.Severity), sig.Reason, sig.Message,
+		resource, metadata, extra, sig.Timestamp.UTC().Format(tsFormat), sig.ReceivedAt.UTC().Format(tsFormat),
+	)
+	if err != nil {
+		return fmt.Errorf("coldstore insert signal: %w", err)
+	}
+	return nil
+}
+
+func (s *SignalStore) Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error) {
+	if f.Limit <= 0 {
+		f.Limit = 200
+	}
+	if f.Limit > 200 {
+		f.Limit = 200
+	}
+	conds := []string{}
+	args := []any{}
+	if f.Service != "" {
+		conds = append(conds, "service = ?")
+		args = append(args, f.Service)
+	}
+	if f.Env != "" {
+		conds = append(conds, "env = ?")
+		args = append(args, f.Env)
+	}
+	if f.Source != "" {
+		conds = append(conds, "source = ?")
+		args = append(args, f.Source)
+	}
+	if f.Reason != "" {
+		conds = append(conds, "reason = ?")
+		args = append(args, f.Reason)
+	}
+	if len(f.Types) > 0 {
+		placeholders := make([]string, 0, len(f.Types))
+		for _, typ := range f.Types {
+			placeholders = append(placeholders, "?")
+			args = append(args, string(typ))
+		}
+		conds = append(conds, "type IN ("+strings.Join(placeholders, ", ")+")")
+	}
+	if !f.Since.IsZero() {
+		conds = append(conds, "timestamp >= ?")
+		args = append(args, f.Since.UTC().Format(tsFormat))
+	}
+	if !f.Until.IsZero() {
+		conds = append(conds, "timestamp <= ?")
+		args = append(args, f.Until.UTC().Format(tsFormat))
+	}
+	where := ""
+	if len(conds) > 0 {
+		where = "WHERE " + strings.Join(conds, " AND ")
+	}
+	query := fmt.Sprintf(`SELECT signal_id, type, source, service, env, severity, reason,
+		COALESCE(message, ''), COALESCE(resource, ''), COALESCE(metadata, ''), COALESCE(extra, ''),
+		timestamp, received_at
+		FROM signals %s ORDER BY timestamp DESC, signal_id DESC LIMIT ?`, where)
+	args = append(args, f.Limit)
+	rows, err := s.db.reader.QueryContext(ctx, query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("coldstore query signals: %w", err)
+	}
+	defer rows.Close()
+	out := make([]signals.Signal, 0, f.Limit)
+	for rows.Next() {
+		sig, err := scanSignal(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, sig)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (s *SignalStore) PruneOlderThan(ctx context.Context, cutoff time.Time) (int, error) {
+	res, err := s.db.writer.ExecContext(ctx, `DELETE FROM signals WHERE timestamp < ?`, cutoff.UTC().Format(tsFormat))
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune signals: %w", err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune signals rows affected: %w", err)
+	}
+	return int(n), nil
+}
+
+func scanSignal(rows interface {
+	Scan(dest ...any) error
+}) (signals.Signal, error) {
+	var sig signals.Signal
+	var typ, severity, timestamp, receivedAt string
+	var resource, metadata, extra string
+	if err := rows.Scan(
+		&sig.SignalID, &typ, &sig.Source, &sig.Service, &sig.Env, &severity, &sig.Reason,
+		&sig.Message, &resource, &metadata, &extra, &timestamp, &receivedAt,
+	); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore scan signal: %w", err)
+	}
+	sig.Type = signals.Type(typ)
+	sig.Severity = signals.Severity(severity)
+	ts, err := time.Parse(tsFormat, timestamp)
+	if err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal timestamp: %w", err)
+	}
+	sig.Timestamp = ts
+	recv, err := time.Parse(tsFormat, receivedAt)
+	if err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal received_at: %w", err)
+	}
+	sig.ReceivedAt = recv
+	if sig.Resource, err = unmarshalMap(resource); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal resource: %w", err)
+	}
+	if sig.Metadata, err = unmarshalMap(metadata); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal metadata: %w", err)
+	}
+	if sig.Extra, err = unmarshalMap(extra); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal extra: %w", err)
+	}
+	return sig, nil
+}
+
+func marshalMap(m map[string]any) (sql.NullString, error) {
+	if len(m) == 0 {
+		return sql.NullString{}, nil
+	}
+	b, err := json.Marshal(m)
+	if err != nil {
+		return sql.NullString{}, err
+	}
+	return sql.NullString{String: string(b), Valid: true}, nil
+}
+
+func unmarshalMap(raw string) (map[string]any, error) {
+	if raw == "" {
+		return nil, nil
+	}
+	out := map[string]any{}
+	if err := json.Unmarshal([]byte(raw), &out); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+var _ signals.Store = (*SignalStore)(nil)
diff --git a/internal/coldstore/signal_store_test.go b/internal/coldstore/signal_store_test.go
new file mode 100644
index 0000000..7d85850
--- /dev/null
+++ b/internal/coldstore/signal_store_test.go
@@ -0,0 +1,97 @@
+package coldstore
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+)
+
+func TestSignalStoreInsertQueryAndPrune(t *testing.T) {
+	store := newSignalTestStore(t)
+	sigStore := NewSignalStore(store)
+	base := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	rows := []signals.Signal{
+		testSignal("sig_a", signals.TypeDeploy, "github", "checkout", "prod", "RolloutComplete", base.Add(-time.Minute)),
+		testSignal("sig_b", signals.TypeDependency, "statuspage", "payment", "prod", "Provider5xx", base),
+		testSignal("sig_c", signals.TypeDeploy, "github", "checkout", "staging", "RolloutComplete", base.Add(-2*time.Minute)),
+	}
+	for i := range rows {
+		if err := sigStore.Insert(context.Background(), &rows[i]); err != nil {
+			t.Fatal(err)
+		}
+	}
+	got, err := sigStore.Query(context.Background(), signals.Filter{
+		Service: "checkout",
+		Env:     "prod",
+		Source:  "github",
+		Reason:  "RolloutComplete",
+		Types:   []signals.Type{signals.TypeDeploy},
+		Since:   base.Add(-2 * time.Minute),
+		Until:   base,
+		Limit:   10,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 1 || got[0].SignalID != "sig_a" {
+		t.Fatalf("got=%+v", got)
+	}
+	if got[0].Metadata["version"] != "1.2.3" || got[0].Extra["custom_tag"] != "alpha" {
+		t.Fatalf("metadata/extra not round-tripped: %+v", got[0])
+	}
+
+	got, err = sigStore.Query(context.Background(), signals.Filter{Env: "prod", Limit: 10})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].SignalID != "sig_b" || got[1].SignalID != "sig_a" {
+		t.Fatalf("ordering got=%+v", got)
+	}
+
+	deleted, err := sigStore.PruneOlderThan(context.Background(), base.Add(-30*time.Second))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if deleted != 2 {
+		t.Fatalf("deleted=%d want 2", deleted)
+	}
+	got, err = sigStore.Query(context.Background(), signals.Filter{Limit: 10})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 1 || got[0].SignalID != "sig_b" {
+		t.Fatalf("after prune got=%+v", got)
+	}
+}
+
+func newSignalTestStore(t *testing.T) *SQLiteStore {
+	t.Helper()
+	managed, err := Open(":memory:")
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = managed.Close() })
+	store, ok := managed.(*SQLiteStore)
+	if !ok {
+		t.Fatalf("store type=%T", managed)
+	}
+	return store
+}
+
+func testSignal(id string, typ signals.Type, source, service, env, reason string, ts time.Time) signals.Signal {
+	return signals.Signal{
+		SignalID:   id,
+		Type:       typ,
+		Source:     source,
+		Service:    service,
+		Env:        env,
+		Severity:   signals.SeverityInfo,
+		Reason:     reason,
+		Metadata:   map[string]any{"version": "1.2.3"},
+		Extra:      map[string]any{"custom_tag": "alpha"},
+		Timestamp:  ts,
+		ReceivedAt: ts.Add(time.Second),
+	}
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index fd59cf2..a654cf8 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -67,6 +67,10 @@ type Metrics struct {
 	DeployUpsertsTotal prometheus.Counter
 	DeployUpsertErrors prometheus.Counter
 
+	SignalsAccepted       prometheus.Counter
+	SignalsRejected       *prometheus.CounterVec
+	SignalRetentionPruned prometheus.Counter
+
 	CausalRunsTotal   prometheus.Counter
 	CausalRunDuration prometheus.Histogram
 	CausalRunFailures prometheus.Counter
@@ -316,6 +320,26 @@ func New(reg *prometheus.Registry) *Metrics {
 		Help: "Failed deployment upserts (non-env-conflict).",
 	})
 
+	m.SignalsAccepted = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_signals_accepted_total",
+		Help: "Production-context signals accepted into durable storage.",
+	})
+	m.SignalsRejected = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "waylog_signals_rejected_total",
+		Help: "Production-context signals rejected by reason.",
+	}, []string{"reason"})
+	for _, reason := range []string{
+		"invalid_field", "unknown_type", "unknown_severity", "timestamp_too_far_in_future",
+		"body_oversize", "invalid_body", "invalid_json", "unsupported_method",
+		"durability_unavailable", "internal_error",
+	} {
+		m.SignalsRejected.WithLabelValues(reason).Add(0)
+	}
+	m.SignalRetentionPruned = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_signal_retention_pruned_total",
+		Help: "Production-context signals pruned by retention.",
+	})
+
 	m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_causal_runs_total",
 		Help: "Total causal inference runs.",
@@ -392,6 +416,7 @@ func New(reg *prometheus.Registry) *Metrics {
 		m.ToolDirectCallsTotal, m.DedupReplayTotal, m.DedupCacheSize,
 		m.ColdEventsWritten, m.ColdEventsDropped, m.ColdBatchLatency,
 		m.DeployUpsertsTotal, m.DeployUpsertErrors,
+		m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
 		m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
 		m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
 		m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,
diff --git a/internal/signals/handler.go b/internal/signals/handler.go
new file mode 100644
index 0000000..a148d3f
--- /dev/null
+++ b/internal/signals/handler.go
@@ -0,0 +1,118 @@
+package signals
+
+import (
+	"encoding/json"
+	"errors"
+	"io"
+	"net/http"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+)
+
+const defaultMaxBodyBytes int64 = 1 << 20
+
+type Handler struct {
+	store        Store
+	metrics      *metrics.Metrics
+	now          func() time.Time
+	futureSkew   time.Duration
+	maxBodyBytes int64
+}
+
+func NewHandler(store Store, m *metrics.Metrics) *Handler {
+	if store == nil {
+		store = UnavailableStore{}
+	}
+	return &Handler{
+		store:        store,
+		metrics:      m,
+		now:          time.Now,
+		futureSkew:   5 * time.Minute,
+		maxBodyBytes: defaultMaxBodyBytes,
+	}
+}
+
+func (h *Handler) Signals(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		h.reject(w, http.StatusMethodNotAllowed, CodeUnsupportedMethod, "method not allowed", "")
+		return
+	}
+
+	body, ok := h.readBody(w, r)
+	if !ok {
+		return
+	}
+	var signal Signal
+	if err := json.Unmarshal(body, &signal); err != nil {
+		var validation *ValidationError
+		if errors.As(err, &validation) {
+			h.reject(w, http.StatusBadRequest, validation.Code, "invalid signal", validation.Error())
+			return
+		}
+		h.reject(w, http.StatusBadRequest, CodeInvalidJSON, "invalid json", err.Error())
+		return
+	}
+	now := h.now().UTC()
+	if err := Validate(&signal, now, h.futureSkew); err != nil {
+		var validation *ValidationError
+		if errors.As(err, &validation) {
+			h.reject(w, http.StatusBadRequest, validation.Code, "invalid signal", validation.Error())
+			return
+		}
+		h.reject(w, http.StatusBadRequest, CodeInvalidField, "invalid signal", err.Error())
+		return
+	}
+	signal.SignalID = NewSignalID()
+	signal.ReceivedAt = now
+	if err := h.store.Insert(r.Context(), &signal); err != nil {
+		if errors.Is(err, ErrUnavailable) {
+			h.reject(w, http.StatusServiceUnavailable, CodeDurabilityUnavailable, "signals unavailable", "set SQLITE_PATH to enable signals")
+			return
+		}
+		h.reject(w, http.StatusInternalServerError, CodeInternalError, "internal error", "")
+		return
+	}
+	if h.metrics != nil {
+		h.metrics.SignalsAccepted.Inc()
+	}
+	writeJSON(w, http.StatusCreated, map[string]Signal{"signal": signal})
+}
+
+func (h *Handler) readBody(w http.ResponseWriter, r *http.Request) ([]byte, bool) {
+	r.Body = http.MaxBytesReader(w, r.Body, h.maxBodyBytes)
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		var maxErr *http.MaxBytesError
+		if errors.As(err, &maxErr) {
+			h.reject(w, http.StatusRequestEntityTooLarge, CodeBodyOversize, "body too large", "request body exceeds 1 MB")
+			return nil, false
+		}
+		h.reject(w, http.StatusBadRequest, CodeInvalidBody, "invalid body", err.Error())
+		return nil, false
+	}
+	return body, true
+}
+
+func (h *Handler) reject(w http.ResponseWriter, status int, code, message, detail string) {
+	if h.metrics != nil {
+		h.metrics.SignalsRejected.WithLabelValues(code).Inc()
+	}
+	writeJSON(w, status, errorResponse{Error: readError{Code: code, Message: message, Detail: detail}})
+}
+
+type errorResponse struct {
+	Error readError `json:"error"`
+}
+
+type readError struct {
+	Code    string `json:"code"`
+	Message string `json:"message"`
+	Detail  string `json:"detail,omitempty"`
+}
+
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_ = json.NewEncoder(w).Encode(v)
+}
diff --git a/internal/signals/handler_test.go b/internal/signals/handler_test.go
new file mode 100644
index 0000000..f52189a
--- /dev/null
+++ b/internal/signals/handler_test.go
@@ -0,0 +1,143 @@
+package signals
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+func TestHandlerSignals(t *testing.T) {
+	now := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	store := &fakeStore{}
+	h := NewHandler(store, nil)
+	h.now = func() time.Time { return now }
+	body := `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z","custom_tag":"foo"}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body))
+	rec := httptest.NewRecorder()
+	h.Signals(rec, req)
+	if rec.Code != http.StatusCreated {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if len(store.inserted) != 1 {
+		t.Fatalf("inserted=%d", len(store.inserted))
+	}
+	if store.inserted[0].SignalID == "" || store.inserted[0].ReceivedAt.IsZero() {
+		t.Fatalf("server fields not set: %+v", store.inserted[0])
+	}
+	if store.inserted[0].Extra["custom_tag"] != "foo" {
+		t.Fatalf("extra=%+v", store.inserted[0].Extra)
+	}
+	var resp struct {
+		Signal Signal `json:"signal"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatal(err)
+	}
+	if resp.Signal.SignalID != store.inserted[0].SignalID {
+		t.Fatalf("response id=%q inserted=%q", resp.Signal.SignalID, store.inserted[0].SignalID)
+	}
+}
+
+func TestHandlerRejectsInvalidSignals(t *testing.T) {
+	now := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	tests := []struct {
+		name   string
+		body   string
+		status int
+		code   string
+	}{
+		{name: "invalid json", body: `{`, status: 400, code: CodeInvalidJSON},
+		{name: "missing service", body: `{"type":"deploy","source":"github","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: 400, code: CodeInvalidField},
+		{name: "unknown type", body: `{"type":"wrong","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: 400, code: CodeUnknownType},
+		{name: "unknown severity", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"huge","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: 400, code: CodeUnknownSeverity},
+		{name: "future", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T20:00:00Z"}`, status: 400, code: CodeTimestampTooFarInFuture},
+		{name: "non object resource", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z","resource":"bad"}`, status: 400, code: CodeInvalidField},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			h := NewHandler(&fakeStore{}, nil)
+			h.now = func() time.Time { return now }
+			req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(tt.body))
+			rec := httptest.NewRecorder()
+			h.Signals(rec, req)
+			assertError(t, rec, tt.status, tt.code)
+		})
+	}
+}
+
+func TestHandlerRejectsMethod(t *testing.T) {
+	h := NewHandler(UnavailableStore{}, nil)
+	req := httptest.NewRequest(http.MethodGet, "/v1/signals", nil)
+	rec := httptest.NewRecorder()
+	h.Signals(rec, req)
+	assertError(t, rec, http.StatusMethodNotAllowed, CodeUnsupportedMethod)
+}
+
+func TestHandlerRejectsOversizeBody(t *testing.T) {
+	h := NewHandler(UnavailableStore{}, nil)
+	h.maxBodyBytes = 8
+	req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(`{"too":"large"}`))
+	rec := httptest.NewRecorder()
+	h.Signals(rec, req)
+	assertError(t, rec, http.StatusRequestEntityTooLarge, CodeBodyOversize)
+}
+
+func TestHandlerReportsStoreUnavailable(t *testing.T) {
+	h := NewHandler(UnavailableStore{}, nil)
+	h.now = func() time.Time { return time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC) }
+	body := `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body))
+	rec := httptest.NewRecorder()
+	h.Signals(rec, req)
+	assertError(t, rec, http.StatusServiceUnavailable, CodeDurabilityUnavailable)
+}
+
+func TestHandlerReportsStoreError(t *testing.T) {
+	h := NewHandler(&fakeStore{err: errors.New("boom")}, nil)
+	h.now = func() time.Time { return time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC) }
+	body := `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`
+	req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body))
+	rec := httptest.NewRecorder()
+	h.Signals(rec, req)
+	assertError(t, rec, http.StatusInternalServerError, CodeInternalError)
+}
+
+func assertError(t *testing.T, rec *httptest.ResponseRecorder, status int, code string) {
+	t.Helper()
+	if rec.Code != status {
+		t.Fatalf("status=%d want %d body=%s", rec.Code, status, rec.Body.String())
+	}
+	var resp errorResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatal(err)
+	}
+	if resp.Error.Code != code {
+		t.Fatalf("code=%q want %q body=%s", resp.Error.Code, code, rec.Body.String())
+	}
+}
+
+type fakeStore struct {
+	inserted []Signal
+	err      error
+}
+
+func (s *fakeStore) Insert(_ context.Context, sig *Signal) error {
+	if s.err != nil {
+		return s.err
+	}
+	s.inserted = append(s.inserted, *sig)
+	return nil
+}
+
+func (s *fakeStore) Query(context.Context, Filter) ([]Signal, error) {
+	return nil, errors.New("unused")
+}
+
+func (s *fakeStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	return 0, errors.New("unused")
+}
diff --git a/internal/signals/retention.go b/internal/signals/retention.go
new file mode 100644
index 0000000..4246c1a
--- /dev/null
+++ b/internal/signals/retention.go
@@ -0,0 +1,39 @@
+package signals
+
+import (
+	"context"
+	"log/slog"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+)
+
+func RunRetention(ctx context.Context, store Store, retention, interval time.Duration, m *metrics.Metrics, log *slog.Logger) {
+	if store == nil || retention <= 0 || interval <= 0 {
+		return
+	}
+	if log == nil {
+		log = slog.Default()
+	}
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			cutoff := time.Now().UTC().Add(-retention)
+			deleted, err := store.PruneOlderThan(ctx, cutoff)
+			if err != nil {
+				log.Warn("signals retention prune failed", "err", err)
+				continue
+			}
+			if m != nil && deleted > 0 {
+				m.SignalRetentionPruned.Add(float64(deleted))
+			}
+			if deleted > 0 {
+				log.Info("signals retention pruned", "deleted", deleted, "cutoff", cutoff)
+			}
+		}
+	}
+}
diff --git a/internal/signals/retention_test.go b/internal/signals/retention_test.go
new file mode 100644
index 0000000..c56075b
--- /dev/null
+++ b/internal/signals/retention_test.go
@@ -0,0 +1,51 @@
+package signals
+
+import (
+	"context"
+	"log/slog"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+func TestRunRetentionPrunesAndStops(t *testing.T) {
+	store := &retentionStore{}
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan struct{})
+	go func() {
+		RunRetention(ctx, store, time.Minute, time.Millisecond, nil, slog.Default())
+		close(done)
+	}()
+	deadline := time.After(time.Second)
+	for {
+		if store.calls() > 0 {
+			break
+		}
+		select {
+		case <-deadline:
+			t.Fatal("retention did not call prune")
+		default:
+			time.Sleep(time.Millisecond)
+		}
+	}
+	cancel()
+	select {
+	case <-done:
+	case <-time.After(time.Second):
+		t.Fatal("retention did not stop")
+	}
+}
+
+type retentionStore struct {
+	n atomic.Int64
+}
+
+func (s *retentionStore) Insert(context.Context, *Signal) error { return nil }
+func (s *retentionStore) Query(context.Context, Filter) ([]Signal, error) {
+	return nil, nil
+}
+func (s *retentionStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	s.n.Add(1)
+	return 1, nil
+}
+func (s *retentionStore) calls() int { return int(s.n.Load()) }
diff --git a/internal/signals/store.go b/internal/signals/store.go
new file mode 100644
index 0000000..b6fdc18
--- /dev/null
+++ b/internal/signals/store.go
@@ -0,0 +1,40 @@
+package signals
+
+import (
+	"context"
+	"errors"
+	"time"
+)
+
+var ErrUnavailable = errors.New("signals: store unavailable")
+
+type Store interface {
+	Insert(ctx context.Context, s *Signal) error
+	Query(ctx context.Context, f Filter) ([]Signal, error)
+	PruneOlderThan(ctx context.Context, cutoff time.Time) (int, error)
+}
+
+type Filter struct {
+	Service string
+	Env     string
+	Source  string
+	Reason  string
+	Types   []Type
+	Since   time.Time
+	Until   time.Time
+	Limit   int
+}
+
+type UnavailableStore struct{}
+
+func (UnavailableStore) Insert(context.Context, *Signal) error {
+	return ErrUnavailable
+}
+
+func (UnavailableStore) Query(context.Context, Filter) ([]Signal, error) {
+	return nil, ErrUnavailable
+}
+
+func (UnavailableStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	return 0, nil
+}
diff --git a/internal/signals/types.go b/internal/signals/types.go
new file mode 100644
index 0000000..8a27024
--- /dev/null
+++ b/internal/signals/types.go
@@ -0,0 +1,190 @@
+package signals
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/google/uuid"
+)
+
+type Type string
+
+const (
+	TypeDeploy      Type = "deploy"
+	TypeRuntime     Type = "runtime"
+	TypeHealthcheck Type = "healthcheck"
+	TypeDependency  Type = "dependency"
+	TypeConfig      Type = "config"
+	TypeAlert       Type = "alert"
+)
+
+func (t Type) Valid() bool {
+	switch t {
+	case TypeDeploy, TypeRuntime, TypeHealthcheck, TypeDependency, TypeConfig, TypeAlert:
+		return true
+	default:
+		return false
+	}
+}
+
+type Severity string
+
+const (
+	SeverityInfo     Severity = "info"
+	SeverityWarning  Severity = "warning"
+	SeverityCritical Severity = "critical"
+)
+
+func (s Severity) Valid() bool {
+	switch s {
+	case SeverityInfo, SeverityWarning, SeverityCritical:
+		return true
+	default:
+		return false
+	}
+}
+
+type Signal struct {
+	SignalID   string         `json:"signal_id"`
+	Type       Type           `json:"type"`
+	Source     string         `json:"source"`
+	Service    string         `json:"service"`
+	Env        string         `json:"env"`
+	Severity   Severity       `json:"severity"`
+	Reason     string         `json:"reason"`
+	Message    string         `json:"message,omitempty"`
+	Resource   map[string]any `json:"resource,omitempty"`
+	Metadata   map[string]any `json:"metadata,omitempty"`
+	Timestamp  time.Time      `json:"timestamp"`
+	ReceivedAt time.Time      `json:"received_at"`
+	Extra      map[string]any `json:"-"`
+}
+
+func NewSignalID() string {
+	return "sig_" + strings.ReplaceAll(uuid.NewString(), "-", "")
+}
+
+func (s *Signal) UnmarshalJSON(b []byte) error {
+	var raw map[string]json.RawMessage
+	if err := json.Unmarshal(b, &raw); err != nil {
+		return err
+	}
+	*s = Signal{}
+	extra := map[string]any{}
+	for key, value := range raw {
+		switch key {
+		case "signal_id":
+			if err := json.Unmarshal(value, &s.SignalID); err != nil {
+				return fmt.Errorf("signal_id: %w", err)
+			}
+		case "type":
+			var v string
+			if err := json.Unmarshal(value, &v); err != nil {
+				return fmt.Errorf("type: %w", err)
+			}
+			s.Type = Type(v)
+		case "source":
+			if err := json.Unmarshal(value, &s.Source); err != nil {
+				return fmt.Errorf("source: %w", err)
+			}
+		case "service":
+			if err := json.Unmarshal(value, &s.Service); err != nil {
+				return fmt.Errorf("service: %w", err)
+			}
+		case "env":
+			if err := json.Unmarshal(value, &s.Env); err != nil {
+				return fmt.Errorf("env: %w", err)
+			}
+		case "severity":
+			var v string
+			if err := json.Unmarshal(value, &v); err != nil {
+				return fmt.Errorf("severity: %w", err)
+			}
+			s.Severity = Severity(v)
+		case "reason":
+			if err := json.Unmarshal(value, &s.Reason); err != nil {
+				return fmt.Errorf("reason: %w", err)
+			}
+		case "message":
+			if err := json.Unmarshal(value, &s.Message); err != nil {
+				return fmt.Errorf("message: %w", err)
+			}
+		case "resource":
+			resource, err := decodeObject(value)
+			if err != nil {
+				return invalidField("resource", "resource must be an object")
+			}
+			s.Resource = resource
+		case "metadata":
+			metadata, err := decodeObject(value)
+			if err != nil {
+				return invalidField("metadata", "metadata must be an object")
+			}
+			s.Metadata = metadata
+		case "timestamp":
+			if err := json.Unmarshal(value, &s.Timestamp); err != nil {
+				return fmt.Errorf("timestamp: %w", err)
+			}
+		case "received_at":
+			if err := json.Unmarshal(value, &s.ReceivedAt); err != nil {
+				return fmt.Errorf("received_at: %w", err)
+			}
+		default:
+			var v any
+			if err := json.Unmarshal(value, &v); err != nil {
+				return fmt.Errorf("%s: %w", key, err)
+			}
+			extra[key] = v
+		}
+	}
+	if len(extra) > 0 {
+		s.Extra = extra
+	}
+	return nil
+}
+
+func (s Signal) MarshalJSON() ([]byte, error) {
+	out := map[string]any{}
+	for key, value := range s.Extra {
+		out[key] = value
+	}
+	out["signal_id"] = s.SignalID
+	out["type"] = s.Type
+	out["source"] = s.Source
+	out["service"] = s.Service
+	out["env"] = s.Env
+	out["severity"] = s.Severity
+	out["reason"] = s.Reason
+	if s.Message != "" {
+		out["message"] = s.Message
+	}
+	if s.Resource != nil {
+		out["resource"] = s.Resource
+	}
+	if s.Metadata != nil {
+		out["metadata"] = s.Metadata
+	}
+	if !s.Timestamp.IsZero() {
+		out["timestamp"] = s.Timestamp
+	}
+	if !s.ReceivedAt.IsZero() {
+		out["received_at"] = s.ReceivedAt
+	}
+	return json.Marshal(out)
+}
+
+func decodeObject(raw json.RawMessage) (map[string]any, error) {
+	if string(raw) == "null" {
+		return nil, nil
+	}
+	var out map[string]any
+	if err := json.Unmarshal(raw, &out); err != nil {
+		return nil, err
+	}
+	if out == nil {
+		return nil, fmt.Errorf("must be an object")
+	}
+	return out, nil
+}
diff --git a/internal/signals/types_test.go b/internal/signals/types_test.go
new file mode 100644
index 0000000..6970d38
--- /dev/null
+++ b/internal/signals/types_test.go
@@ -0,0 +1,78 @@
+package signals
+
+import (
+	"encoding/json"
+	"errors"
+	"testing"
+	"time"
+)
+
+func TestSignalJSONPreservesExtraAndOverridesServerFields(t *testing.T) {
+	raw := []byte(`{
+		"signal_id":"client",
+		"type":"deploy",
+		"source":"github",
+		"service":"checkout",
+		"env":"prod",
+		"severity":"info",
+		"reason":"RolloutComplete",
+		"metadata":{"version":"1.2.3"},
+		"timestamp":"2026-05-02T18:09:40Z",
+		"received_at":"2026-05-02T18:09:41Z",
+		"custom_tag":"foo"
+	}`)
+	var sig Signal
+	if err := json.Unmarshal(raw, &sig); err != nil {
+		t.Fatal(err)
+	}
+	if got := sig.Extra["custom_tag"]; got != "foo" {
+		t.Fatalf("custom_tag=%v", got)
+	}
+	sig.SignalID = "sig_server"
+	sig.ReceivedAt = time.Date(2026, 5, 2, 18, 9, 42, 0, time.UTC)
+	out, err := json.Marshal(sig)
+	if err != nil {
+		t.Fatal(err)
+	}
+	var decoded map[string]any
+	if err := json.Unmarshal(out, &decoded); err != nil {
+		t.Fatal(err)
+	}
+	if decoded["signal_id"] != "sig_server" {
+		t.Fatalf("signal_id=%v", decoded["signal_id"])
+	}
+	if decoded["custom_tag"] != "foo" {
+		t.Fatalf("custom_tag=%v", decoded["custom_tag"])
+	}
+}
+
+func TestTypeAndSeverityValidity(t *testing.T) {
+	for _, typ := range []Type{TypeDeploy, TypeRuntime, TypeHealthcheck, TypeDependency, TypeConfig, TypeAlert} {
+		if !typ.Valid() {
+			t.Fatalf("%q should be valid", typ)
+		}
+	}
+	if Type("bad").Valid() {
+		t.Fatal("bad type should be invalid")
+	}
+	for _, severity := range []Severity{SeverityInfo, SeverityWarning, SeverityCritical} {
+		if !severity.Valid() {
+			t.Fatalf("%q should be valid", severity)
+		}
+	}
+	if Severity("huge").Valid() {
+		t.Fatal("bad severity should be invalid")
+	}
+}
+
+func TestSignalJSONRejectsNonObjectResource(t *testing.T) {
+	var sig Signal
+	err := json.Unmarshal([]byte(`{"resource":"bad"}`), &sig)
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	var validation *ValidationError
+	if !errors.As(err, &validation) || validation.Code != CodeInvalidField {
+		t.Fatalf("err=%T %[1]v", err)
+	}
+}
diff --git a/internal/signals/validate.go b/internal/signals/validate.go
new file mode 100644
index 0000000..b8dcb2f
--- /dev/null
+++ b/internal/signals/validate.go
@@ -0,0 +1,74 @@
+package signals
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+const (
+	CodeInvalidField            = "invalid_field"
+	CodeUnknownType             = "unknown_type"
+	CodeUnknownSeverity         = "unknown_severity"
+	CodeTimestampTooFarInFuture = "timestamp_too_far_in_future"
+	CodeBodyOversize            = "body_oversize"
+	CodeInvalidBody             = "invalid_body"
+	CodeInvalidJSON             = "invalid_json"
+	CodeUnsupportedMethod       = "unsupported_method"
+	CodeDurabilityUnavailable   = "durability_unavailable"
+	CodeInternalError           = "internal_error"
+)
+
+type ValidationError struct {
+	Code   string
+	Field  string
+	Detail string
+}
+
+func (e *ValidationError) Error() string {
+	if e.Field == "" {
+		return e.Detail
+	}
+	return e.Field + ": " + e.Detail
+}
+
+func Validate(s *Signal, now time.Time, futureSkew time.Duration) error {
+	if s == nil {
+		return invalidField("signal", "signal is required")
+	}
+	if strings.TrimSpace(string(s.Type)) == "" {
+		return invalidField("type", "type is required")
+	}
+	if !s.Type.Valid() {
+		return &ValidationError{Code: CodeUnknownType, Field: "type", Detail: fmt.Sprintf("unknown type %q", s.Type)}
+	}
+	if strings.TrimSpace(s.Source) == "" {
+		return invalidField("source", "source is required")
+	}
+	if strings.TrimSpace(s.Service) == "" {
+		return invalidField("service", "service is required")
+	}
+	if strings.TrimSpace(s.Env) == "" {
+		return invalidField("env", "env is required")
+	}
+	if strings.TrimSpace(string(s.Severity)) == "" {
+		return invalidField("severity", "severity is required")
+	}
+	if !s.Severity.Valid() {
+		return &ValidationError{Code: CodeUnknownSeverity, Field: "severity", Detail: fmt.Sprintf("unknown severity %q", s.Severity)}
+	}
+	if strings.TrimSpace(s.Reason) == "" {
+		return invalidField("reason", "reason is required")
+	}
+	if s.Timestamp.IsZero() {
+		return invalidField("timestamp", "timestamp is required")
+	}
+	if futureSkew > 0 && s.Timestamp.After(now.UTC().Add(futureSkew)) {
+		return &ValidationError{Code: CodeTimestampTooFarInFuture, Field: "timestamp", Detail: "timestamp is too far in the future"}
+	}
+	return nil
+}
+
+func invalidField(field, detail string) *ValidationError {
+	return &ValidationError{Code: CodeInvalidField, Field: field, Detail: detail}
+}
diff --git a/internal/signals/validate_test.go b/internal/signals/validate_test.go
new file mode 100644
index 0000000..580c5e8
--- /dev/null
+++ b/internal/signals/validate_test.go
@@ -0,0 +1,53 @@
+package signals
+
+import (
+	"errors"
+	"testing"
+	"time"
+)
+
+func TestValidate(t *testing.T) {
+	now := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	valid := Signal{
+		Type:      TypeDeploy,
+		Source:    "github",
+		Service:   "checkout",
+		Env:       "prod",
+		Severity:  SeverityInfo,
+		Reason:    "RolloutComplete",
+		Timestamp: now,
+	}
+	tests := []struct {
+		name string
+		edit func(*Signal)
+		code string
+	}{
+		{name: "valid"},
+		{name: "missing service", edit: func(s *Signal) { s.Service = "" }, code: CodeInvalidField},
+		{name: "unknown type", edit: func(s *Signal) { s.Type = "wrong" }, code: CodeUnknownType},
+		{name: "unknown severity", edit: func(s *Signal) { s.Severity = "huge" }, code: CodeUnknownSeverity},
+		{name: "future timestamp", edit: func(s *Signal) { s.Timestamp = now.Add(2 * time.Hour) }, code: CodeTimestampTooFarInFuture},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			sig := valid
+			if tt.edit != nil {
+				tt.edit(&sig)
+			}
+			err := Validate(&sig, now, 5*time.Minute)
+			if tt.code == "" {
+				if err != nil {
+					t.Fatal(err)
+				}
+				return
+			}
+			var validation *ValidationError
+			if !errors.As(err, &validation) {
+				t.Fatalf("err=%T %[1]v", err)
+			}
+			if validation.Code != tt.code {
+				t.Fatalf("code=%q want %q", validation.Code, tt.code)
+			}
+		})
+	}
+}

From f196c573c1cdce6e524e694a63436a0b556917d6 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Tue, 5 May 2026 02:45:51 -0400
Subject: [PATCH 02/14] feat: Implemented the incident engine over the
 schema-2.0 read path.

Adds internal/incidents with the incident domain model, stable incident IDs,
fixed-rule classification, evidence normalization, next-check templates,
snapshot rendering, HTTP handlers, in-memory test store, and engine lifecycle.
The engine derives incidents from v2 error-family spikes, enriches them with
signals and deployment context, persists stable samples, and transitions
active -> recovering -> resolved.

Adds SQLite incident persistence via coldstore migration 004_incidents.sql and
IncidentStore with upsert, get, active listing, and resolved pruning support.

Wires cmd/ingest so incidents start only when SQLITE_PATH is set,
WAYLOG_V2_READS=true, and WAYLOG_INCIDENTS_ENABLED=true. Bootstrap failure is
fatal under those conditions. The legacy detector continues as fallback when
incidents are unavailable or disabled, and is disabled only when the new engine
is running. /v1/insight now projects the top active incident when the v2.1
engine is active.

Adds read-auth incident routes:
- GET /v1/incidents/active
- GET /v1/incidents/{id}
- GET /v1/incidents/{id}/snapshot

Adds incident Prometheus metrics and updates OpenAPI/env docs for the new
incident surface and configuration.

Verification:
- go test ./internal/incidents ./internal/coldstore ./internal/ingest/v2 ./internal/ingest ./cmd/ingest
- go test ./...
- go test -race ./internal/incidents ./internal/coldstore
- go vet ./...
- bash scripts/check-doc-links.sh
- git diff --check
---
 cmd/ingest/main.go                            | 161 +++++-
 docs/env.md                                   |   8 +
 docs/openapi.yaml                             | 259 +++++++++
 internal/coldstore/incident_store.go          | 236 +++++++++
 internal/coldstore/incident_store_test.go     |  85 +++
 .../coldstore/migrations/004_incidents.sql    |  34 ++
 internal/incidents/classifier.go              | 269 ++++++++++
 internal/incidents/classifier_test.go         |  66 +++
 internal/incidents/engine.go                  | 497 ++++++++++++++++++
 internal/incidents/engine_test.go             | 105 ++++
 internal/incidents/handler.go                 |  92 ++++
 internal/incidents/handler_test.go            |  61 +++
 internal/incidents/id.go                      |  19 +
 internal/incidents/id_test.go                 |  25 +
 internal/incidents/interfaces.go              |  36 ++
 internal/incidents/nextchecks.go              |  30 ++
 internal/incidents/render.go                  |  48 ++
 internal/incidents/store.go                   | 104 ++++
 internal/incidents/test_helpers_test.go       |  60 +++
 internal/incidents/types.go                   | 103 ++++
 internal/metrics/metrics.go                   |  45 ++
 21 files changed, 2341 insertions(+), 2 deletions(-)
 create mode 100644 internal/coldstore/incident_store.go
 create mode 100644 internal/coldstore/incident_store_test.go
 create mode 100644 internal/coldstore/migrations/004_incidents.sql
 create mode 100644 internal/incidents/classifier.go
 create mode 100644 internal/incidents/classifier_test.go
 create mode 100644 internal/incidents/engine.go
 create mode 100644 internal/incidents/engine_test.go
 create mode 100644 internal/incidents/handler.go
 create mode 100644 internal/incidents/handler_test.go
 create mode 100644 internal/incidents/id.go
 create mode 100644 internal/incidents/id_test.go
 create mode 100644 internal/incidents/interfaces.go
 create mode 100644 internal/incidents/nextchecks.go
 create mode 100644 internal/incidents/render.go
 create mode 100644 internal/incidents/store.go
 create mode 100644 internal/incidents/test_helpers_test.go
 create mode 100644 internal/incidents/types.go

diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 599993e..650f58d 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -26,6 +26,7 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/graph/causal"
 	"github.com/sssmaran/WaylogCLI/internal/graph/core"
 	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
 	"github.com/sssmaran/WaylogCLI/internal/ingest"
 	ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
 	"github.com/sssmaran/WaylogCLI/internal/mcp/stdio"
@@ -35,6 +36,8 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/signals"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
 	"github.com/sssmaran/WaylogCLI/internal/tracestore"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
 )
 
 var graphStore *graphstore.Store
@@ -126,6 +129,16 @@ func main() {
 	otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
 	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
 	signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+	incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
+	incidentCfg := incidents.Config{
+		TickInterval:            config.GetenvDuration("WAYLOG_INCIDENT_TICK_INTERVAL", 30*time.Second),
+		Window:                  config.GetenvDuration("WAYLOG_INCIDENT_WINDOW", 10*time.Minute),
+		MinCount:                config.GetenvInt("WAYLOG_INCIDENT_MIN_COUNT", 5),
+		MinLift:                 config.GetenvFloat("WAYLOG_INCIDENT_MIN_LIFT", 3.0),
+		ResolveAfter:            config.GetenvDuration("WAYLOG_INCIDENT_RESOLVE_AFTER", 2*time.Minute),
+		DeployCorrelationWindow: config.GetenvDuration("WAYLOG_DEPLOY_CORRELATION_WINDOW", 15*time.Minute),
+		SampleLimit:             config.GetenvInt("WAYLOG_INCIDENT_SAMPLE_LIMIT", 5),
+	}
 	if signalRetention <= 0 {
 		slog.Error("WAYLOG_SIGNAL_RETENTION must be positive", "value", signalRetention)
 		os.Exit(1)
@@ -380,8 +393,11 @@ func main() {
 			func(w http.ResponseWriter, r *http.Request) { inner.ServeHTTP(w, r) }))
 	}
 	mux.Handle("/v1/overview", readCORS(ingestServer.Overview))
+	var v2Reader *ingestv2.Reader
+	var incidentEngine *incidents.Engine
+	incidentRunning := false
 	if v2ReadsEnabled {
-		v2Reader := ingestv2.NewReader(v2Index)
+		v2Reader = ingestv2.NewReader(v2Index)
 		v2ReadHandler := ingestv2.NewReadHandler(v2Reader, m, graphHotWindow)
 		mux.Handle("/v1/events/search", readCORS(v2ReadHandler.EventSearch))
 		mux.Handle("/v1/errors", readCORS(v2ReadHandler.Errors))
@@ -393,6 +409,32 @@ func main() {
 		mux.Handle("/v1/events/", readCORS(v2ReadHandler.EventByID))
 		mux.Handle("/v1/traces/", readCORS(v2ReadHandler.TraceByID))
 		slog.Info("v2 read endpoints enabled")
+		if incidentsEnabled {
+			if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok {
+				incidentStore := coldstore.NewIncidentStore(sqlite)
+				incidentEngine = incidents.NewEngine(
+					incidentReaderAdapter{reader: v2Reader},
+					signalStore,
+					coldDeployAdapter{store: sqlite},
+					incidentStore,
+					incidentCfg,
+					m,
+					slog.Default(),
+				)
+				if err := incidentEngine.Bootstrap(context.Background()); err != nil {
+					slog.Error("incident engine bootstrap failed", "err", err)
+					os.Exit(1)
+				}
+				incidentHandler := incidents.NewHandler(incidentEngine)
+				mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
+				mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
+				ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine})
+				incidentRunning = true
+				slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
+			} else {
+				slog.Info("incident engine disabled: SQLITE_PATH is not set")
+			}
+		}
 	} else {
 		mux.Handle("/v1/traces/story", readCORS(ingestServer.TraceStory))
 		mux.Handle("/v1/blast_radius", readCORS(ingestServer.BlastRadius))
@@ -466,6 +508,9 @@ func main() {
 	if _, ok := signalStore.(*coldstore.SignalStore); ok {
 		go signals.RunRetention(ctx, signalStore, signalRetention, 5*time.Minute, m, slog.Default())
 	}
+	if incidentRunning {
+		go incidentEngine.Run(ctx)
+	}
 
 	go func() {
 		slog.Info("ingest listening", "addr", addr, "graph_hot_window", graphHotWindow)
@@ -609,7 +654,7 @@ func main() {
 	// ---------------- Anomaly detection ticker ----------------
 
 	detectCfg := detect.ParseConfig()
-	if detectCfg.Enabled {
+	if detectCfg.Enabled && !incidentRunning {
 		var deploySrc detect.DeploySource
 		if coldDB != nil {
 			deploySrc = coldDB
@@ -617,6 +662,8 @@ func main() {
 		detector := detect.NewDetector(detectCfg, graphStore, traceStore, deploySrc)
 		ingestServer.SetDetector(detector)
 		go detector.Run(ctx)
+	} else if incidentRunning {
+		slog.Info("legacy anomaly detector disabled because v2.1 incident engine is running")
 	}
 
 	// ---------------- Causal inference ticker ----------------
@@ -794,6 +841,116 @@ func printHelp() {
 	os.Stdout.WriteString("\n\033[2mnotes: MCP stdio: run with MCP_STDIO=1\033[0m\n")
 }
 
+type coldDeployAdapter struct {
+	store *coldstore.SQLiteStore
+}
+
+func (a coldDeployAdapter) DeploymentsInWindow(ctx context.Context, start, end time.Time, serviceFilter string) ([]incidents.Deployment, error) {
+	rows, err := a.store.DeploymentsInWindow(ctx, start, end, serviceFilter)
+	if err != nil {
+		return nil, err
+	}
+	out := make([]incidents.Deployment, 0, len(rows))
+	for _, row := range rows {
+		out = append(out, incidents.Deployment{
+			ID:        row.ID,
+			Service:   row.Service,
+			Version:   row.Version,
+			Env:       row.Env,
+			FirstSeen: row.FirstSeen,
+			LastSeen:  row.LastSeen,
+			Metadata:  row.Metadata,
+		})
+	}
+	return out, nil
+}
+
+type incidentReaderAdapter struct {
+	reader *ingestv2.Reader
+}
+
+func (a incidentReaderAdapter) Errors(f incidents.SearchFilter, limit int) incidents.ErrorsResult {
+	res := a.reader.Errors(toV2SearchFilter(f), nil, limit)
+	return incidents.ErrorsResult{Rows: res.Rows}
+}
+
+func (a incidentReaderAdapter) BlastRadius(f incidents.SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse {
+	return a.reader.BlastRadius(toV2SearchFilter(f), ingestv2.BlastKeyMode{Key: key})
+}
+
+func (a incidentReaderAdapter) SearchEvents(f incidents.SearchFilter, limit int) []*eventv2.Event {
+	res := a.reader.SearchEvents(toV2SearchFilter(f), nil, limit)
+	return res.Events
+}
+
+func toV2SearchFilter(f incidents.SearchFilter) ingestv2.SearchFilter {
+	return ingestv2.SearchFilter{
+		Service:   f.Service,
+		Statuses:  f.Statuses,
+		ErrorCode: f.ErrorCode,
+		Since:     f.Since,
+		Until:     f.Until,
+	}
+}
+
+type incidentInsightAdapter struct {
+	engine *incidents.Engine
+}
+
+func (a incidentInsightAdapter) Current() *detect.Insight {
+	if a.engine == nil {
+		return nil
+	}
+	inc, err := a.engine.TopActive(context.Background())
+	if err != nil || inc == nil {
+		return nil
+	}
+	return projectIncidentInsight(*inc, time.Now().UTC())
+}
+
+func projectIncidentInsight(inc incidents.Incident, detectedAt time.Time) *detect.Insight {
+	affectedUsers := 0
+	if inc.AffectedUsers != nil {
+		affectedUsers = *inc.AffectedUsers
+	}
+	out := &detect.Insight{
+		DetectedAt:       detectedAt,
+		TopErrorCode:     inc.ErrorFamily.ErrorCode,
+		Lift:             inc.Lift,
+		CurrentCount:     inc.CurrentCount,
+		BaselineCount:    inc.BaselineCount,
+		AffectedRequests: inc.AffectedRequests,
+		AffectedUsers:    affectedUsers,
+		Services:         append([]string(nil), inc.TopServices...),
+		SeverityScore:    float64(inc.Severity),
+	}
+	if len(out.Services) == 0 {
+		out.Services = []string{inc.Service}
+	}
+	for _, ev := range inc.Evidence {
+		if ev.Kind == incidents.EvidenceDeployment && ev.DeployID != "" {
+			out.DeployCorrelation = &detect.DeployCorrelation{
+				DeploymentID: ev.DeployID,
+				Service:      ev.Service,
+				Confidence:   incidentConfidenceFloat(inc.Confidence),
+			}
+			break
+		}
+	}
+	return out
+}
+
+func incidentConfidenceFloat(c incidents.Confidence) float64 {
+	switch c {
+	case incidents.ConfidenceHigh:
+		return 0.9
+	case incidents.ConfidenceMedium:
+		return 0.65
+	default:
+		return 0.35
+	}
+}
+
 func parseSlogLevel(s string) slog.Level {
 	switch strings.ToLower(s) {
 	case "debug":
diff --git a/docs/env.md b/docs/env.md
index 7ace4cb..db9afac 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -57,6 +57,14 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
 | `EVENT_LOG_MAX_FILE_MB` | `50` | Rotation size. `0` disables rotation |
 | `EVENT_LOG_RETENTION` | `72h` | Event log retention. Must be positive |
 | `WAYLOG_SIGNAL_RETENTION` | `72h` | Production-context signal retention. Must be positive. `/v1/signals` requires `SQLITE_PATH` |
+| `WAYLOG_INCIDENTS_ENABLED` | `true` | Enable the v2.1 incident engine when `SQLITE_PATH` is set and `WAYLOG_V2_READS=true` |
+| `WAYLOG_INCIDENT_TICK_INTERVAL` | `30s` | Incident engine evaluation interval |
+| `WAYLOG_INCIDENT_WINDOW` | `10m` | Current error-family spike window |
+| `WAYLOG_INCIDENT_MIN_COUNT` | `5` | Minimum current-window failures needed to open an incident |
+| `WAYLOG_INCIDENT_MIN_LIFT` | `3.0` | Minimum current-vs-baseline lift when the family already exists in the baseline window |
+| `WAYLOG_INCIDENT_RESOLVE_AFTER` | `2m` | Time without renewed matching failures before a recovering incident resolves |
+| `WAYLOG_DEPLOY_CORRELATION_WINDOW` | `15m` | Window used to attach deploy signals and deployment records as incident evidence |
+| `WAYLOG_INCIDENT_SAMPLE_LIMIT` | `5` | Maximum persisted sample traces per incident |
 | `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
 | `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
 | `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 151392b..ce503dc 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -435,6 +435,79 @@ paths:
         '400':
           $ref: '#/components/responses/ReadBadRequest'
 
+  /v1/incidents/active:
+    get:
+      tags: [Triage]
+      operationId: listActiveIncidents
+      summary: Active v2.1 incidents
+      description: Returns active and recovering incidents derived from v2 error-family spikes, signals, and deployment context.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      responses:
+        '200':
+          description: Active incidents
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IncidentListResponse'
+        '401':
+          description: Unauthorized
+        '405':
+          description: Method Not Allowed
+
+  /v1/incidents/{id}:
+    get:
+      tags: [Triage]
+      operationId: getIncident
+      summary: Get one incident
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      parameters:
+        - $ref: '#/components/parameters/IncidentID'
+      responses:
+        '200':
+          description: Incident detail
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IncidentDetailResponse'
+        '401':
+          description: Unauthorized
+        '404':
+          $ref: '#/components/responses/ReadNotFound'
+        '405':
+          description: Method Not Allowed
+
+  /v1/incidents/{id}/snapshot:
+    get:
+      tags: [Triage]
+      operationId: getIncidentSnapshot
+      summary: Render an incident snapshot
+      description: Defaults to text/plain. Send Accept: application/json to receive the snapshot text plus the incident object.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      parameters:
+        - $ref: '#/components/parameters/IncidentID'
+      responses:
+        '200':
+          description: Incident snapshot
+          content:
+            text/plain:
+              schema:
+                type: string
+            application/json:
+              schema:
+                $ref: '#/components/schemas/IncidentSnapshotResponse'
+        '401':
+          description: Unauthorized
+        '404':
+          $ref: '#/components/responses/ReadNotFound'
+        '405':
+          description: Method Not Allowed
+
   /v1/capabilities:
     get:
       tags: [Capabilities]
@@ -710,6 +783,12 @@ components:
       required: true
       schema:
         type: string
+    IncidentID:
+      name: id
+      in: path
+      required: true
+      schema:
+        type: string
     TraceIDQuery:
       name: trace_id
       in: query
@@ -1430,6 +1509,186 @@ components:
           items:
             type: string
 
+    IncidentEvidence:
+      type: object
+      required: [kind, title, occurred_at]
+      properties:
+        kind:
+          type: string
+          enum: [signal, deployment, trace, metric]
+        title:
+          type: string
+        detail:
+          type: string
+        service:
+          type: string
+        signal_id:
+          type: string
+        deployment_id:
+          type: string
+        trace_id:
+          type: string
+        occurred_at:
+          type: string
+          format: date-time
+        fields:
+          type: object
+          additionalProperties: true
+
+    Incident:
+      type: object
+      required:
+        - incident_id
+        - env
+        - service
+        - error_family
+        - status
+        - cause
+        - confidence
+        - severity
+        - started_at
+        - updated_at
+        - last_seen_at
+        - affected_requests
+        - affected_services
+        - top_services
+        - sample_traces
+        - evidence
+        - next_checks
+        - lift
+        - baseline_count
+        - current_count
+      example:
+        incident_id: inc_7d0b0b3d5a52d891
+        env: prod
+        service: checkout
+        error_family:
+          service: checkout
+          step: payment.charge
+          error_code: PMT_502
+        status: active
+        cause: dependency
+        confidence: medium
+        severity: 8
+        started_at: '2026-05-04T16:00:00Z'
+        updated_at: '2026-05-04T16:02:00Z'
+        last_seen_at: '2026-05-04T16:02:00Z'
+        affected_requests: 12
+        affected_users: 8
+        affected_services: 3
+        top_services: [api-gateway, checkout, payment]
+        sample_traces: [7f3a2b9c000000000000000000000001]
+        evidence:
+          - kind: trace
+            title: First failing trace sample
+            detail: payment.charge/PMT_502
+            service: checkout
+            trace_id: 7f3a2b9c000000000000000000000001
+            occurred_at: '2026-05-04T16:00:00Z'
+        next_checks:
+          - Check the downstream service health and recent deploys.
+        lift: 6
+        baseline_count: 2
+        current_count: 12
+      properties:
+        incident_id:
+          type: string
+        env:
+          type: string
+        service:
+          type: string
+        error_family:
+          $ref: '#/components/schemas/ErrorFamily'
+        status:
+          type: string
+          enum: [active, recovering, resolved]
+        cause:
+          type: string
+          enum: [deploy, app, dependency, unknown]
+        confidence:
+          type: string
+          enum: [high, medium, low]
+        severity:
+          type: integer
+          minimum: 1
+          maximum: 10
+        started_at:
+          type: string
+          format: date-time
+        updated_at:
+          type: string
+          format: date-time
+        last_seen_at:
+          type: string
+          format: date-time
+        recovering_at:
+          type: string
+          format: date-time
+          nullable: true
+        resolved_at:
+          type: string
+          format: date-time
+          nullable: true
+        affected_requests:
+          type: integer
+        affected_users:
+          type: integer
+          nullable: true
+        affected_services:
+          type: integer
+        top_services:
+          type: array
+          items:
+            type: string
+        sample_traces:
+          type: array
+          items:
+            type: string
+        evidence:
+          type: array
+          items:
+            $ref: '#/components/schemas/IncidentEvidence'
+        next_checks:
+          type: array
+          items:
+            type: string
+        instrumentation_warnings:
+          type: array
+          items:
+            type: string
+        lift:
+          type: number
+          format: double
+        baseline_count:
+          type: integer
+        current_count:
+          type: integer
+
+    IncidentListResponse:
+      type: object
+      required: [incidents]
+      properties:
+        incidents:
+          type: array
+          items:
+            $ref: '#/components/schemas/Incident'
+
+    IncidentDetailResponse:
+      type: object
+      required: [incident]
+      properties:
+        incident:
+          $ref: '#/components/schemas/Incident'
+
+    IncidentSnapshotResponse:
+      type: object
+      required: [snapshot, incident]
+      properties:
+        snapshot:
+          type: string
+        incident:
+          $ref: '#/components/schemas/Incident'
+
     CapabilitiesResponse:
       type: object
       example:
diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go
new file mode 100644
index 0000000..b6a3160
--- /dev/null
+++ b/internal/coldstore/incident_store.go
@@ -0,0 +1,236 @@
+package coldstore
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+)
+
+type IncidentStore struct {
+	db *SQLiteStore
+}
+
+func NewIncidentStore(db *SQLiteStore) *IncidentStore {
+	return &IncidentStore{db: db}
+}
+
+func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) error {
+	topServices, err := jsonText(inc.TopServices)
+	if err != nil {
+		return fmt.Errorf("coldstore incident top services: %w", err)
+	}
+	samples, err := jsonText(inc.SampleTraces)
+	if err != nil {
+		return fmt.Errorf("coldstore incident samples: %w", err)
+	}
+	evidence, err := jsonText(inc.Evidence)
+	if err != nil {
+		return fmt.Errorf("coldstore incident evidence: %w", err)
+	}
+	nextChecks, err := jsonText(inc.NextChecks)
+	if err != nil {
+		return fmt.Errorf("coldstore incident next checks: %w", err)
+	}
+	warnings, err := jsonText(inc.InstrumentationWarnings)
+	if err != nil {
+		return fmt.Errorf("coldstore incident warnings: %w", err)
+	}
+	_, err = s.db.writer.ExecContext(ctx, `
+		INSERT INTO incidents (
+			incident_id, env, service, error_service, error_step, error_code,
+			status, cause, confidence, severity, started_at, updated_at, last_seen_at,
+			recovering_at, resolved_at, affected_requests, affected_users, affected_services,
+			top_services, sample_traces, evidence, next_checks, instrumentation_warnings,
+			lift, baseline_count, current_count
+		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+		ON CONFLICT(incident_id) DO UPDATE SET
+			status = excluded.status,
+			cause = excluded.cause,
+			confidence = excluded.confidence,
+			severity = excluded.severity,
+			updated_at = excluded.updated_at,
+			last_seen_at = excluded.last_seen_at,
+			recovering_at = excluded.recovering_at,
+			resolved_at = excluded.resolved_at,
+			affected_requests = excluded.affected_requests,
+			affected_users = excluded.affected_users,
+			affected_services = excluded.affected_services,
+			top_services = excluded.top_services,
+			sample_traces = excluded.sample_traces,
+			evidence = excluded.evidence,
+			next_checks = excluded.next_checks,
+			instrumentation_warnings = excluded.instrumentation_warnings,
+			lift = excluded.lift,
+			baseline_count = excluded.baseline_count,
+			current_count = excluded.current_count`,
+		inc.IncidentID, inc.Env, inc.Service, inc.ErrorFamily.Service, inc.ErrorFamily.Step, inc.ErrorFamily.ErrorCode,
+		string(inc.Status), string(inc.Cause), string(inc.Confidence), inc.Severity,
+		formatTime(inc.StartedAt), formatTime(inc.UpdatedAt), formatTime(inc.LastSeenAt),
+		nullableTime(inc.RecoveringAt), nullableTime(inc.ResolvedAt),
+		inc.AffectedRequests, nullableInt(inc.AffectedUsers), inc.AffectedServices,
+		topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount,
+	)
+	if err != nil {
+		return fmt.Errorf("coldstore upsert incident: %w", err)
+	}
+	return nil
+}
+
+func (s *IncidentStore) Get(ctx context.Context, id string) (incidents.Incident, error) {
+	row := s.db.reader.QueryRowContext(ctx, incidentSelectSQL()+` WHERE incident_id = ?`, id)
+	inc, err := scanIncident(row)
+	if errors.Is(err, sql.ErrNoRows) {
+		return incidents.Incident{}, incidents.ErrNotFound
+	}
+	return inc, err
+}
+
+func (s *IncidentStore) ListActive(ctx context.Context) ([]incidents.Incident, error) {
+	rows, err := s.db.reader.QueryContext(ctx, incidentSelectSQL()+` WHERE status != ? ORDER BY severity DESC, started_at DESC, incident_id ASC`, string(incidents.StatusResolved))
+	if err != nil {
+		return nil, fmt.Errorf("coldstore list active incidents: %w", err)
+	}
+	defer rows.Close()
+	var out []incidents.Incident
+	for rows.Next() {
+		inc, err := scanIncident(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, inc)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (s *IncidentStore) PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error) {
+	res, err := s.db.writer.ExecContext(ctx, `DELETE FROM incidents WHERE status = ? AND resolved_at IS NOT NULL AND resolved_at < ?`, string(incidents.StatusResolved), formatTime(cutoff))
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune incidents: %w", err)
+	}
+	n, err := res.RowsAffected()
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune incidents rows affected: %w", err)
+	}
+	return int(n), nil
+}
+
+func incidentSelectSQL() string {
+	return `SELECT incident_id, env, service, error_service, error_step, error_code,
+		status, cause, confidence, severity, started_at, updated_at, last_seen_at,
+		COALESCE(recovering_at, ''), COALESCE(resolved_at, ''),
+		affected_requests, affected_users, affected_services,
+		COALESCE(top_services, ''), COALESCE(sample_traces, ''), COALESCE(evidence, ''),
+		COALESCE(next_checks, ''), COALESCE(instrumentation_warnings, ''),
+		lift, baseline_count, current_count
+		FROM incidents`
+}
+
+func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, error) {
+	var inc incidents.Incident
+	var status, cause, confidence string
+	var startedAt, updatedAt, lastSeenAt, recoveringAt, resolvedAt string
+	var affectedUsers sql.NullInt64
+	var topServices, samples, evidence, nextChecks, warnings string
+	err := row.Scan(
+		&inc.IncidentID, &inc.Env, &inc.Service, &inc.ErrorFamily.Service, &inc.ErrorFamily.Step, &inc.ErrorFamily.ErrorCode,
+		&status, &cause, &confidence, &inc.Severity, &startedAt, &updatedAt, &lastSeenAt,
+		&recoveringAt, &resolvedAt, &inc.AffectedRequests, &affectedUsers, &inc.AffectedServices,
+		&topServices, &samples, &evidence, &nextChecks, &warnings, &inc.Lift, &inc.BaselineCount, &inc.CurrentCount,
+	)
+	if err != nil {
+		return incidents.Incident{}, err
+	}
+	inc.Status = incidents.Status(status)
+	inc.Cause = incidents.Cause(cause)
+	inc.Confidence = incidents.Confidence(confidence)
+	var parseErr error
+	if inc.StartedAt, parseErr = time.Parse(tsFormat, startedAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident started_at: %w", parseErr)
+	}
+	if inc.UpdatedAt, parseErr = time.Parse(tsFormat, updatedAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident updated_at: %w", parseErr)
+	}
+	if inc.LastSeenAt, parseErr = time.Parse(tsFormat, lastSeenAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident last_seen_at: %w", parseErr)
+	}
+	if recoveringAt != "" {
+		t, err := time.Parse(tsFormat, recoveringAt)
+		if err != nil {
+			return incidents.Incident{}, fmt.Errorf("coldstore incident recovering_at: %w", err)
+		}
+		inc.RecoveringAt = &t
+	}
+	if resolvedAt != "" {
+		t, err := time.Parse(tsFormat, resolvedAt)
+		if err != nil {
+			return incidents.Incident{}, fmt.Errorf("coldstore incident resolved_at: %w", err)
+		}
+		inc.ResolvedAt = &t
+	}
+	if affectedUsers.Valid {
+		v := int(affectedUsers.Int64)
+		inc.AffectedUsers = &v
+	}
+	if err := parseJSONText(topServices, &inc.TopServices); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident top services: %w", err)
+	}
+	if err := parseJSONText(samples, &inc.SampleTraces); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident samples: %w", err)
+	}
+	if err := parseJSONText(evidence, &inc.Evidence); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident evidence: %w", err)
+	}
+	if err := parseJSONText(nextChecks, &inc.NextChecks); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident next checks: %w", err)
+	}
+	if err := parseJSONText(warnings, &inc.InstrumentationWarnings); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident warnings: %w", err)
+	}
+	return inc, nil
+}
+
+func jsonText(v any) (sql.NullString, error) {
+	raw, err := json.Marshal(v)
+	if err != nil {
+		return sql.NullString{}, err
+	}
+	if string(raw) == "null" {
+		return sql.NullString{}, nil
+	}
+	return sql.NullString{String: string(raw), Valid: true}, nil
+}
+
+func parseJSONText(raw string, out any) error {
+	if raw == "" {
+		return nil
+	}
+	return json.Unmarshal([]byte(raw), out)
+}
+
+func formatTime(t time.Time) string {
+	return t.UTC().Format(tsFormat)
+}
+
+func nullableTime(t *time.Time) sql.NullString {
+	if t == nil {
+		return sql.NullString{}
+	}
+	return sql.NullString{String: formatTime(*t), Valid: true}
+}
+
+func nullableInt(v *int) sql.NullInt64 {
+	if v == nil {
+		return sql.NullInt64{}
+	}
+	return sql.NullInt64{Int64: int64(*v), Valid: true}
+}
+
+var _ incidents.Store = (*IncidentStore)(nil)
diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go
new file mode 100644
index 0000000..10cdfe8
--- /dev/null
+++ b/internal/coldstore/incident_store_test.go
@@ -0,0 +1,85 @@
+package coldstore
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+func TestIncidentStoreRoundtripAndPrune(t *testing.T) {
+	managed, err := Open(":memory:")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer managed.Close()
+	store := NewIncidentStore(managed.(*SQLiteStore))
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	users := 3
+	inc := incidents.Incident{
+		IncidentID:       incidents.StableID("prod", apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}, now),
+		Env:              "prod",
+		Service:          "checkout",
+		ErrorFamily:      apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		Status:           incidents.StatusActive,
+		Cause:            incidents.CauseDependency,
+		Confidence:       incidents.ConfidenceHigh,
+		Severity:         8,
+		StartedAt:        now,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: 9,
+		AffectedUsers:    &users,
+		AffectedServices: 2,
+		TopServices:      []string{"checkout", "payment"},
+		SampleTraces:     []string{"trace-a", "trace-b"},
+		Evidence:         []incidents.Evidence{{Kind: incidents.EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: now}},
+		NextChecks:       []string{"check downstream"},
+		Lift:             9,
+		CurrentCount:     9,
+	}
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	got, err := store.Get(context.Background(), inc.IncidentID)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got.IncidentID != inc.IncidentID || got.AffectedUsers == nil || *got.AffectedUsers != users || len(got.SampleTraces) != 2 {
+		t.Fatalf("roundtrip=%+v", got)
+	}
+	active, err := store.ListActive(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 1 {
+		t.Fatalf("active=%+v", active)
+	}
+	resolvedAt := now.Add(time.Minute)
+	inc.Status = incidents.StatusResolved
+	inc.ResolvedAt = &resolvedAt
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	active, err = store.ListActive(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 0 {
+		t.Fatalf("active after resolve=%+v", active)
+	}
+	deleted, err := store.PruneResolvedOlderThan(context.Background(), resolvedAt.Add(time.Second))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if deleted != 1 {
+		t.Fatalf("deleted=%d", deleted)
+	}
+	_, err = store.Get(context.Background(), inc.IncidentID)
+	if !errors.Is(err, incidents.ErrNotFound) {
+		t.Fatalf("expected not found, got %v", err)
+	}
+}
diff --git a/internal/coldstore/migrations/004_incidents.sql b/internal/coldstore/migrations/004_incidents.sql
new file mode 100644
index 0000000..076a66d
--- /dev/null
+++ b/internal/coldstore/migrations/004_incidents.sql
@@ -0,0 +1,34 @@
+-- 004_incidents.sql: v2.1 incident engine persistence.
+
+CREATE TABLE IF NOT EXISTS incidents (
+    incident_id              TEXT PRIMARY KEY,
+    env                      TEXT NOT NULL,
+    service                  TEXT NOT NULL,
+    error_service            TEXT NOT NULL,
+    error_step               TEXT NOT NULL,
+    error_code               TEXT NOT NULL,
+    status                   TEXT NOT NULL,
+    cause                    TEXT NOT NULL,
+    confidence               TEXT NOT NULL,
+    severity                 INTEGER NOT NULL,
+    started_at               TEXT NOT NULL,
+    updated_at               TEXT NOT NULL,
+    last_seen_at             TEXT NOT NULL,
+    recovering_at            TEXT,
+    resolved_at              TEXT,
+    affected_requests        INTEGER NOT NULL,
+    affected_users           INTEGER,
+    affected_services        INTEGER NOT NULL,
+    top_services             TEXT,
+    sample_traces            TEXT,
+    evidence                 TEXT,
+    next_checks              TEXT,
+    instrumentation_warnings TEXT,
+    lift                     REAL NOT NULL DEFAULT 0,
+    baseline_count           INTEGER NOT NULL DEFAULT 0,
+    current_count            INTEGER NOT NULL DEFAULT 0
+);
+
+CREATE INDEX IF NOT EXISTS idx_incidents_status_started ON incidents (status, started_at DESC);
+CREATE INDEX IF NOT EXISTS idx_incidents_family_started ON incidents (env, service, error_step, error_code, started_at DESC);
+CREATE INDEX IF NOT EXISTS idx_incidents_resolved_at ON incidents (resolved_at);
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
new file mode 100644
index 0000000..8cde790
--- /dev/null
+++ b/internal/incidents/classifier.go
@@ -0,0 +1,269 @@
+package incidents
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+type ClassificationInput struct {
+	Incident    Incident
+	Events      []*eventv2.Event
+	Signals     []signals.Signal
+	Deployments []Deployment
+	Now         time.Time
+}
+
+type Classification struct {
+	Cause                   Cause
+	Confidence              Confidence
+	Evidence                []Evidence
+	NextChecks              []string
+	InstrumentationWarnings []string
+}
+
+func Classify(input ClassificationInput) Classification {
+	evidence := collectTraceEvidence(input.Events)
+	warnings := instrumentationWarnings(input.Events, input.Signals)
+
+	if dep := matchingDependencySignal(input); dep != nil {
+		evidence = append(evidence, signalEvidence(*dep, "Dependency signal overlaps first failing downstream"))
+		return classification(CauseDependency, ConfidenceHigh, evidence, warnings)
+	}
+	if downstream := firstFailingDownstream(input.Events); downstream != "" {
+		evidence = append(evidence, Evidence{
+			Kind:       EvidenceTrace,
+			Title:      "First failing step calls downstream service",
+			Detail:     downstream,
+			Service:    downstream,
+			OccurredAt: input.Incident.StartedAt,
+		})
+		return classification(CauseDependency, ConfidenceMedium, evidence, warnings)
+	}
+	if dep := matchingDeployment(input); dep != nil {
+		evidence = append(evidence, deploymentEvidence(*dep))
+		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+	}
+	if sig := matchingSignal(input, signals.TypeDeploy); sig != nil {
+		evidence = append(evidence, signalEvidence(*sig, "Deploy signal overlaps incident window"))
+		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+	}
+	if len(input.Events) > 0 && input.Incident.ErrorFamily.Step != "" && firstFailingDownstream(input.Events) == "" {
+		return classification(CauseApp, ConfidenceMedium, evidence, warnings)
+	}
+	return classification(CauseUnknown, ConfidenceLow, evidence, warnings)
+}
+
+func classification(cause Cause, confidence Confidence, evidence []Evidence, warnings []string) Classification {
+	return Classification{
+		Cause:                   cause,
+		Confidence:              confidence,
+		Evidence:                normalizeEvidence(evidence, 8),
+		NextChecks:              NextChecks(cause, confidence),
+		InstrumentationWarnings: uniqueStrings(warnings),
+	}
+}
+
+func matchingDependencySignal(input ClassificationInput) *signals.Signal {
+	downstream := firstFailingDownstream(input.Events)
+	for i := range input.Signals {
+		sig := input.Signals[i]
+		if sig.Type != signals.TypeDependency {
+			continue
+		}
+		if downstream != "" && sig.Service != downstream {
+			continue
+		}
+		return &input.Signals[i]
+	}
+	return nil
+}
+
+func matchingDeployment(input ClassificationInput) *Deployment {
+	version := sampleVersion(input.Events)
+	for i := range input.Deployments {
+		dep := input.Deployments[i]
+		if dep.Env != "" && input.Incident.Env != "" && dep.Env != input.Incident.Env {
+			continue
+		}
+		if dep.Service != input.Incident.Service {
+			continue
+		}
+		if version != "" && dep.Version != "" && dep.Version != version {
+			continue
+		}
+		return &input.Deployments[i]
+	}
+	return nil
+}
+
+func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal {
+	version := sampleVersion(input.Events)
+	for i := range input.Signals {
+		sig := input.Signals[i]
+		if sig.Type != typ || sig.Service != input.Incident.Service {
+			continue
+		}
+		if version == "" {
+			return &input.Signals[i]
+		}
+		if sigVersion := stringField(sig.Metadata, "version"); sigVersion == "" || sigVersion == version {
+			return &input.Signals[i]
+		}
+	}
+	return nil
+}
+
+func collectTraceEvidence(events []*eventv2.Event) []Evidence {
+	out := make([]Evidence, 0, 2)
+	for _, ev := range events {
+		if ev == nil || ev.Anchor == nil {
+			continue
+		}
+		out = append(out, Evidence{
+			Kind:       EvidenceTrace,
+			Title:      "First failing trace sample",
+			Detail:     fmt.Sprintf("%s/%s", ev.Anchor.Step, ev.Anchor.ErrorCode),
+			Service:    ev.Service,
+			TraceID:    ev.TraceID,
+			OccurredAt: ev.TsStart,
+		})
+		break
+	}
+	return out
+}
+
+func deploymentEvidence(dep Deployment) Evidence {
+	return Evidence{
+		Kind:       EvidenceDeployment,
+		Title:      "Deployment overlaps incident window",
+		Detail:     dep.Version,
+		Service:    dep.Service,
+		DeployID:   dep.ID,
+		OccurredAt: dep.FirstSeen,
+	}
+}
+
+func signalEvidence(sig signals.Signal, title string) Evidence {
+	return Evidence{
+		Kind:       EvidenceSignal,
+		Title:      title,
+		Detail:     sig.Reason,
+		Service:    sig.Service,
+		SignalID:   sig.SignalID,
+		OccurredAt: sig.Timestamp,
+		Fields: map[string]any{
+			"type":     string(sig.Type),
+			"severity": string(sig.Severity),
+			"source":   sig.Source,
+		},
+	}
+}
+
+func normalizeEvidence(evidence []Evidence, limit int) []Evidence {
+	sort.SliceStable(evidence, func(i, j int) bool {
+		if !evidence[i].OccurredAt.Equal(evidence[j].OccurredAt) {
+			return evidence[i].OccurredAt.Before(evidence[j].OccurredAt)
+		}
+		if evidence[i].Kind != evidence[j].Kind {
+			return evidence[i].Kind < evidence[j].Kind
+		}
+		return evidence[i].Title < evidence[j].Title
+	})
+	seen := map[string]struct{}{}
+	out := make([]Evidence, 0, len(evidence))
+	for _, ev := range evidence {
+		key := string(ev.Kind) + "|" + ev.Title + "|" + ev.SignalID + "|" + ev.DeployID + "|" + ev.TraceID
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		out = append(out, ev)
+		if limit > 0 && len(out) == limit {
+			break
+		}
+	}
+	return out
+}
+
+func instrumentationWarnings(events []*eventv2.Event, sigs []signals.Signal) []string {
+	var warnings []string
+	if sampleVersion(events) == "" {
+		warnings = append(warnings, "missing_service_version")
+	}
+	if firstFailingDownstream(events) != "" && !hasSignalType(sigs, signals.TypeDependency) {
+		warnings = append(warnings, "missing_dependency_signal")
+	}
+	for _, ev := range events {
+		if ev != nil && ev.Status == eventv2.StatusPartial {
+			warnings = append(warnings, "partial_trace")
+			break
+		}
+	}
+	return warnings
+}
+
+func firstFailingDownstream(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev == nil || ev.Anchor == nil {
+			continue
+		}
+		for _, step := range ev.Steps {
+			if step.Name == ev.Anchor.Step && step.Status == eventv2.StepStatusError && step.Downstream != nil {
+				return step.Downstream.Service
+			}
+		}
+	}
+	return ""
+}
+
+func sampleVersion(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev != nil && ev.Version != "" {
+			return ev.Version
+		}
+	}
+	return ""
+}
+
+func hasSignalType(sigs []signals.Signal, typ signals.Type) bool {
+	for _, sig := range sigs {
+		if sig.Type == typ {
+			return true
+		}
+	}
+	return false
+}
+
+func stringField(m map[string]any, key string) string {
+	if m == nil {
+		return ""
+	}
+	v, _ := m[key].(string)
+	return v
+}
+
+func uniqueStrings(in []string) []string {
+	if len(in) == 0 {
+		return nil
+	}
+	seen := map[string]struct{}{}
+	out := make([]string, 0, len(in))
+	for _, s := range in {
+		s = strings.TrimSpace(s)
+		if s == "" {
+			continue
+		}
+		if _, ok := seen[s]; ok {
+			continue
+		}
+		seen[s] = struct{}{}
+		out = append(out, s)
+	}
+	sort.Strings(out)
+	return out
+}
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
new file mode 100644
index 0000000..348787a
--- /dev/null
+++ b/internal/incidents/classifier_test.go
@@ -0,0 +1,66 @@
+package incidents
+
+import (
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+func TestClassifierRules(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	paymentEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+	t.Run("dependency with signal", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{paymentEvent},
+			Signals: []signals.Signal{{
+				SignalID:  "sig_dep",
+				Type:      signals.TypeDependency,
+				Service:   "payment",
+				Env:       "prod",
+				Reason:    "upstream_5xx",
+				Severity:  signals.SeverityCritical,
+				Timestamp: now.Add(-time.Minute),
+			}},
+		})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("dependency trace only", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base, Events: []*eventv2.Event{paymentEvent}})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceMedium {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("deploy", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{testIncidentEvent("e2", "trace-b", now, "checkout", "cart.validate", "CHK_500", "")},
+			Deployments: []Deployment{{ID: "dep_1", Service: "checkout", Version: "v1", Env: "prod", FirstSeen: now.Add(-time.Minute)}},
+		})
+		if got.Cause != CauseDeploy || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("app", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base, Events: []*eventv2.Event{testIncidentEvent("e3", "trace-c", now, "checkout", "cart.validate", "CHK_500", "")}})
+		if got.Cause != CauseApp || got.Confidence != ConfidenceMedium {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("unknown", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base})
+		if got.Cause != CauseUnknown || got.Confidence != ConfidenceLow {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+}
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
new file mode 100644
index 0000000..6eb6cba
--- /dev/null
+++ b/internal/incidents/engine.go
@@ -0,0 +1,497 @@
+package incidents
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"math"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+type Config struct {
+	TickInterval            time.Duration
+	Window                  time.Duration
+	MinCount                int
+	MinLift                 float64
+	ResolveAfter            time.Duration
+	DeployCorrelationWindow time.Duration
+	SampleLimit             int
+}
+
+func DefaultConfig() Config {
+	return Config{
+		TickInterval:            30 * time.Second,
+		Window:                  10 * time.Minute,
+		MinCount:                5,
+		MinLift:                 3.0,
+		ResolveAfter:            2 * time.Minute,
+		DeployCorrelationWindow: 15 * time.Minute,
+		SampleLimit:             5,
+	}
+}
+
+func (c Config) withDefaults() Config {
+	d := DefaultConfig()
+	if c.TickInterval <= 0 {
+		c.TickInterval = d.TickInterval
+	}
+	if c.Window <= 0 {
+		c.Window = d.Window
+	}
+	if c.MinCount <= 0 {
+		c.MinCount = d.MinCount
+	}
+	if c.MinLift <= 0 {
+		c.MinLift = d.MinLift
+	}
+	if c.ResolveAfter <= 0 {
+		c.ResolveAfter = d.ResolveAfter
+	}
+	if c.DeployCorrelationWindow <= 0 {
+		c.DeployCorrelationWindow = d.DeployCorrelationWindow
+	}
+	if c.SampleLimit <= 0 {
+		c.SampleLimit = d.SampleLimit
+	}
+	return c
+}
+
+type Engine struct {
+	reader  Reader
+	signals SignalStore
+	deploys DeploySource
+	store   Store
+	cfg     Config
+	metrics *metrics.Metrics
+	log     *slog.Logger
+	now     func() time.Time
+
+	mu     sync.RWMutex
+	active map[string]Incident
+}
+
+func NewEngine(reader Reader, signalStore SignalStore, deploys DeploySource, store Store, cfg Config, m *metrics.Metrics, log *slog.Logger) *Engine {
+	if log == nil {
+		log = slog.Default()
+	}
+	return &Engine{
+		reader:  reader,
+		signals: signalStore,
+		deploys: deploys,
+		store:   store,
+		cfg:     cfg.withDefaults(),
+		metrics: m,
+		log:     log,
+		now:     time.Now,
+		active:  map[string]Incident{},
+	}
+}
+
+func (e *Engine) Bootstrap(ctx context.Context) error {
+	rows, err := e.store.ListActive(ctx)
+	if err != nil {
+		return err
+	}
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.active = map[string]Incident{}
+	for _, inc := range rows {
+		e.active[inc.IncidentID] = inc
+	}
+	if e.metrics != nil {
+		e.metrics.IncidentActive.Set(float64(len(rows)))
+	}
+	return nil
+}
+
+func (e *Engine) Run(ctx context.Context) {
+	ticker := time.NewTicker(e.cfg.TickInterval)
+	defer ticker.Stop()
+	e.log.Info("incident engine started", "interval", e.cfg.TickInterval, "window", e.cfg.Window)
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			if err := e.Tick(ctx); err != nil {
+				e.log.Warn("incident tick failed", "err", err)
+			}
+		}
+	}
+}
+
+func (e *Engine) Tick(ctx context.Context) error {
+	start := time.Now()
+	if e.metrics != nil {
+		defer func() { e.metrics.IncidentTickLatency.Observe(time.Since(start).Seconds()) }()
+	}
+	now := e.now().UTC()
+	currentStart := now.Add(-e.cfg.Window)
+	baselineStart := now.Add(-2 * e.cfg.Window)
+	statuses := failedStatuses()
+	current := e.reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
+	baseline := e.reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
+	baselineByFamily := map[string]int{}
+	for _, row := range baseline.Rows {
+		baselineByFamily[familyKey(row.ErrorFamily)] = row.Count
+	}
+
+	seen := map[string]struct{}{}
+	for _, row := range current.Rows {
+		if row.Count < e.cfg.MinCount {
+			continue
+		}
+		baselineCount := baselineByFamily[familyKey(row.ErrorFamily)]
+		lift := computeLift(row.Count, baselineCount)
+		if baselineCount > 0 && lift < e.cfg.MinLift {
+			continue
+		}
+		inc, err := e.buildIncident(ctx, row, baselineCount, lift, currentStart, now)
+		if err != nil {
+			return err
+		}
+		seen[inc.IncidentID] = struct{}{}
+		if err := e.store.Upsert(ctx, inc); err != nil {
+			return err
+		}
+		e.remember(inc)
+	}
+	if err := e.transitionMissing(ctx, seen, now); err != nil {
+		return err
+	}
+	if e.metrics != nil {
+		e.metrics.IncidentActive.Set(float64(e.activeCount()))
+	}
+	return nil
+}
+
+func (e *Engine) Active(ctx context.Context) ([]Incident, error) {
+	rows, err := e.store.ListActive(ctx)
+	if err != nil {
+		return nil, err
+	}
+	sortIncidents(rows)
+	return rows, nil
+}
+
+func (e *Engine) Get(ctx context.Context, id string) (Incident, error) {
+	return e.store.Get(ctx, id)
+}
+
+func (e *Engine) TopActive(ctx context.Context) (*Incident, error) {
+	rows, err := e.Active(ctx)
+	if err != nil {
+		return nil, err
+	}
+	if len(rows) == 0 {
+		return nil, nil
+	}
+	return &rows[0], nil
+}
+
+func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, error) {
+	events := e.sampleEvents(row.ErrorFamily, since, now, 200)
+	startedAt := earliestEventTime(events, now)
+	env := firstEventEnv(events)
+	if existing, ok := e.findByFamily(env, row.ErrorFamily); ok {
+		startedAt = existing.StartedAt
+	}
+	id := StableID(env, row.ErrorFamily, startedAt)
+	existing, hadExisting := e.getCached(id)
+	if !hadExisting {
+		if prior, ok := e.findByFamily(env, row.ErrorFamily); ok {
+			existing = prior
+			id = prior.IncidentID
+			hadExisting = true
+		}
+	}
+	blast := e.reader.BlastRadius(
+		SearchFilter{Since: since, Until: now},
+		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
+	)
+	sigs, err := e.querySignals(ctx, row.ErrorFamily.Service, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
+		return Incident{}, err
+	}
+	deploys, err := e.queryDeploys(ctx, row.ErrorFamily.Service, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil {
+		return Incident{}, err
+	}
+	inc := Incident{
+		IncidentID:       id,
+		Env:              env,
+		Service:          row.ErrorFamily.Service,
+		ErrorFamily:      row.ErrorFamily,
+		Status:           StatusActive,
+		Severity:         severity(row.Count, blast.AffectedServices, lift),
+		StartedAt:        startedAt,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: blast.AffectedRequests,
+		AffectedUsers:    cloneInt(row.AffectedUsers),
+		AffectedServices: blast.AffectedServices,
+		TopServices:      append([]string(nil), blast.TopServices...),
+		SampleTraces:     stableSamples(existing.SampleTraces, events, e.cfg.SampleLimit),
+		Lift:             lift,
+		BaselineCount:    baselineCount,
+		CurrentCount:     row.Count,
+	}
+	if hadExisting {
+		inc.StartedAt = existing.StartedAt
+		inc.RecoveringAt = nil
+	}
+	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
+	inc.Cause = class.Cause
+	inc.Confidence = class.Confidence
+	inc.Evidence = class.Evidence
+	inc.NextChecks = class.NextChecks
+	inc.InstrumentationWarnings = class.InstrumentationWarnings
+	e.observeClassification(inc.Cause, inc.Confidence)
+	if e.metrics != nil {
+		if hadExisting {
+			e.metrics.IncidentUpdated.Inc()
+		} else {
+			e.metrics.IncidentOpened.Inc()
+		}
+	}
+	return inc, nil
+}
+
+func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}, now time.Time) error {
+	e.mu.RLock()
+	rows := make([]Incident, 0, len(e.active))
+	for _, inc := range e.active {
+		rows = append(rows, cloneIncident(inc))
+	}
+	e.mu.RUnlock()
+	for _, inc := range rows {
+		if _, ok := seen[inc.IncidentID]; ok {
+			continue
+		}
+		switch inc.Status {
+		case StatusActive:
+			inc.Status = StatusRecovering
+			t := now
+			inc.RecoveringAt = &t
+			inc.UpdatedAt = now
+			if err := e.store.Upsert(ctx, inc); err != nil {
+				return err
+			}
+			e.remember(inc)
+			if e.metrics != nil {
+				e.metrics.IncidentRecovered.Inc()
+			}
+		case StatusRecovering:
+			if now.Sub(inc.LastSeenAt) >= e.cfg.ResolveAfter {
+				inc.Status = StatusResolved
+				t := now
+				inc.ResolvedAt = &t
+				inc.UpdatedAt = now
+				if err := e.store.Upsert(ctx, inc); err != nil {
+					return err
+				}
+				e.forget(inc.IncidentID)
+				if e.metrics != nil {
+					e.metrics.IncidentResolved.Inc()
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
+	events := e.reader.SearchEvents(SearchFilter{
+		Service:   f.Service,
+		ErrorCode: f.ErrorCode,
+		Since:     since,
+		Until:     until,
+		Statuses:  failedStatuses(),
+	}, limit)
+	out := make([]*eventv2.Event, 0, len(events))
+	for _, ev := range events {
+		if ev != nil && ev.Anchor != nil && ev.Anchor.Step == f.Step {
+			out = append(out, ev)
+		}
+	}
+	return out
+}
+
+func (e *Engine) querySignals(ctx context.Context, service, env string, since, until time.Time) ([]signals.Signal, error) {
+	if e.signals == nil {
+		return nil, nil
+	}
+	return e.signals.Query(ctx, signals.Filter{Service: service, Env: env, Since: since, Until: until, Limit: 200})
+}
+
+func (e *Engine) queryDeploys(ctx context.Context, service string, since, until time.Time) ([]Deployment, error) {
+	if e.deploys == nil {
+		return nil, nil
+	}
+	return e.deploys.DeploymentsInWindow(ctx, since, until, service)
+}
+
+func (e *Engine) remember(inc Incident) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.active[inc.IncidentID] = cloneIncident(inc)
+}
+
+func (e *Engine) forget(id string) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	delete(e.active, id)
+}
+
+func (e *Engine) getCached(id string) (Incident, bool) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	inc, ok := e.active[id]
+	return cloneIncident(inc), ok
+}
+
+func (e *Engine) findByFamily(env string, family apiv2.ErrorFamily) (Incident, bool) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	for _, inc := range e.active {
+		if inc.Env == env && inc.ErrorFamily == family && inc.Status != StatusResolved {
+			return cloneIncident(inc), true
+		}
+	}
+	return Incident{}, false
+}
+
+func (e *Engine) activeCount() int {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return len(e.active)
+}
+
+func (e *Engine) observeClassification(cause Cause, confidence Confidence) {
+	if e.metrics == nil {
+		return
+	}
+	e.metrics.IncidentClassifications.With(prometheus.Labels{
+		"cause":      string(cause),
+		"confidence": string(confidence),
+	}).Inc()
+}
+
+func failedStatuses() map[eventv2.Status]struct{} {
+	return map[eventv2.Status]struct{}{
+		eventv2.StatusError:   {},
+		eventv2.StatusTimeout: {},
+		eventv2.StatusPartial: {},
+		eventv2.StatusAborted: {},
+	}
+}
+
+func computeLift(current, baseline int) float64 {
+	if baseline <= 0 {
+		return float64(current)
+	}
+	return float64(current) / float64(baseline)
+}
+
+func severity(count, services int, lift float64) int {
+	score := 1 + count/5 + services
+	if lift >= 10 {
+		score += 3
+	} else if lift >= 3 {
+		score += 2
+	}
+	return int(math.Min(10, float64(score)))
+}
+
+func familyKey(f apiv2.ErrorFamily) string {
+	return f.Service + "\x00" + f.Step + "\x00" + f.ErrorCode
+}
+
+func earliestEventTime(events []*eventv2.Event, fallback time.Time) time.Time {
+	out := fallback
+	for _, ev := range events {
+		if ev == nil {
+			continue
+		}
+		if out.IsZero() || ev.TsStart.Before(out) {
+			out = ev.TsStart
+		}
+	}
+	return out.UTC()
+}
+
+func firstEventEnv(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev != nil && ev.Env != "" {
+			return ev.Env
+		}
+	}
+	return "unknown"
+}
+
+func stableSamples(existing []string, events []*eventv2.Event, limit int) []string {
+	if limit <= 0 {
+		return nil
+	}
+	out := append([]string(nil), existing...)
+	seen := map[string]struct{}{}
+	for _, traceID := range out {
+		seen[traceID] = struct{}{}
+	}
+	if len(out) == 0 {
+		ascending := append([]*eventv2.Event(nil), events...)
+		sort.SliceStable(ascending, func(i, j int) bool {
+			if !ascending[i].TsStart.Equal(ascending[j].TsStart) {
+				return ascending[i].TsStart.Before(ascending[j].TsStart)
+			}
+			return ascending[i].TraceID < ascending[j].TraceID
+		})
+		for _, ev := range ascending {
+			if ev != nil && ev.TraceID != "" {
+				out = append(out, ev.TraceID)
+				seen[ev.TraceID] = struct{}{}
+				break
+			}
+		}
+	}
+	recent := append([]*eventv2.Event(nil), events...)
+	sort.SliceStable(recent, func(i, j int) bool {
+		if !recent[i].TsStart.Equal(recent[j].TsStart) {
+			return recent[i].TsStart.After(recent[j].TsStart)
+		}
+		return recent[i].TraceID < recent[j].TraceID
+	})
+	for _, ev := range recent {
+		if ev == nil || ev.TraceID == "" {
+			continue
+		}
+		if _, ok := seen[ev.TraceID]; ok {
+			continue
+		}
+		out = append(out, ev.TraceID)
+		seen[ev.TraceID] = struct{}{}
+		if len(out) == limit {
+			break
+		}
+	}
+	if len(out) > limit {
+		out = out[:limit]
+	}
+	return out
+}
+
+func cloneInt(in *int) *int {
+	if in == nil {
+		return nil
+	}
+	v := *in
+	return &v
+}
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
new file mode 100644
index 0000000..0b76de0
--- /dev/null
+++ b/internal/incidents/engine_test.go
@@ -0,0 +1,105 @@
+package incidents
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+func TestEngineLifecycleAndSampleStability(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily:    testFamily(),
+			Count:          6,
+			AffectedTraces: 6,
+			SampleTraces:   []string{"trace-new"},
+		}}},
+		blast: apiv2.BlastRadiusResponse{
+			AffectedRequests: 6,
+			AffectedServices: 2,
+			TopServices:      []string{"checkout", "payment"},
+			SampleTraces:     []string{"trace-new"},
+		},
+		events: []*eventv2.Event{
+			testIncidentEvent("old", "trace-old", now.Add(-2*time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+			testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+		},
+	}
+	store := NewMemoryStore()
+	engine := NewEngine(reader, nil, nil, store, Config{MinCount: 5, ResolveAfter: time.Minute, SampleLimit: 2}, nil, nil)
+	engine.now = func() time.Time { return now }
+	if err := engine.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	if err := engine.Tick(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, err := engine.Active(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 || rows[0].Status != StatusActive {
+		t.Fatalf("rows=%+v", rows)
+	}
+	if got := rows[0].SampleTraces; len(got) != 2 || got[0] != "trace-old" || got[1] != "trace-new" {
+		t.Fatalf("samples=%+v", got)
+	}
+
+	reader.current.Rows = nil
+	now = now.Add(30 * time.Second)
+	if err := engine.Tick(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, _ = engine.Active(context.Background())
+	if len(rows) != 1 || rows[0].Status != StatusRecovering {
+		t.Fatalf("recovering rows=%+v", rows)
+	}
+
+	now = now.Add(2 * time.Minute)
+	if err := engine.Tick(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, _ = engine.Active(context.Background())
+	if len(rows) != 0 {
+		t.Fatalf("expected resolved incident removed from active cache, rows=%+v", rows)
+	}
+
+	rehydrated := NewEngine(reader, nil, nil, store, Config{}, nil, nil)
+	if err := rehydrated.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, _ = rehydrated.Active(context.Background())
+	if len(rows) != 0 {
+		t.Fatalf("bootstrap should ignore resolved incidents, rows=%+v", rows)
+	}
+}
+
+type fakeReader struct {
+	current ErrorsResult
+	base    ErrorsResult
+	blast   apiv2.BlastRadiusResponse
+	events  []*eventv2.Event
+	calls   int
+}
+
+func (r *fakeReader) Errors(_ SearchFilter, _ int) ErrorsResult {
+	r.calls++
+	if r.calls%2 == 1 {
+		return r.current
+	}
+	return r.base
+}
+
+func (r *fakeReader) BlastRadius(_ SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse {
+	out := r.blast
+	out.Key = key
+	return out
+}
+
+func (r *fakeReader) SearchEvents(_ SearchFilter, _ int) []*eventv2.Event {
+	return r.events
+}
diff --git a/internal/incidents/handler.go b/internal/incidents/handler.go
new file mode 100644
index 0000000..08d57ee
--- /dev/null
+++ b/internal/incidents/handler.go
@@ -0,0 +1,92 @@
+package incidents
+
+import (
+	"encoding/json"
+	"errors"
+	"net/http"
+	"strings"
+)
+
+type Handler struct {
+	engine *Engine
+}
+
+func NewHandler(engine *Engine) *Handler {
+	return &Handler{engine: engine}
+}
+
+func (h *Handler) Active(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	rows, err := h.engine.Active(r.Context())
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "internal_error", "query incidents failed", err.Error())
+		return
+	}
+	writeJSON(w, http.StatusOK, ActiveResponse{Incidents: rows})
+}
+
+func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	path := strings.TrimPrefix(r.URL.Path, "/v1/incidents/")
+	if path == "" || path == r.URL.Path {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	if strings.HasSuffix(path, "/snapshot") {
+		id := strings.TrimSuffix(path, "/snapshot")
+		h.snapshot(w, r, id)
+		return
+	}
+	inc, err := h.engine.Get(r.Context(), path)
+	if errors.Is(err, ErrNotFound) {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
+		return
+	}
+	writeJSON(w, http.StatusOK, DetailResponse{Incident: inc})
+}
+
+func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
+	inc, err := h.engine.Get(r.Context(), id)
+	if errors.Is(err, ErrNotFound) {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
+		return
+	}
+	snapshot := RenderSnapshot(inc)
+	if strings.Contains(r.Header.Get("Accept"), "application/json") {
+		writeJSON(w, http.StatusOK, SnapshotResponse{Snapshot: snapshot, Incident: inc})
+		return
+	}
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write([]byte(snapshot))
+}
+
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_ = json.NewEncoder(w).Encode(v)
+}
+
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+	writeJSON(w, status, map[string]any{
+		"error": map[string]any{
+			"code":    code,
+			"message": message,
+			"detail":  detail,
+		},
+	})
+}
diff --git a/internal/incidents/handler_test.go b/internal/incidents/handler_test.go
new file mode 100644
index 0000000..2be9a80
--- /dev/null
+++ b/internal/incidents/handler_test.go
@@ -0,0 +1,61 @@
+package incidents
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	store := NewMemoryStore()
+	inc := testIncident(now)
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	engine := NewEngine(&fakeReader{}, nil, nil, store, Config{}, nil, nil)
+	if err := engine.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	h := NewHandler(engine)
+
+	rec := httptest.NewRecorder()
+	h.Active(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/active", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("active status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var active ActiveResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &active); err != nil {
+		t.Fatal(err)
+	}
+	if len(active.Incidents) != 1 || active.Incidents[0].IncidentID != inc.IncidentID {
+		t.Fatalf("active=%+v", active)
+	}
+
+	rec = httptest.NewRecorder()
+	h.Incident(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID, nil))
+	if rec.Code != http.StatusOK || !strings.Contains(rec.Body.String(), inc.IncidentID) {
+		t.Fatalf("detail status=%d body=%s", rec.Code, rec.Body.String())
+	}
+
+	rec = httptest.NewRecorder()
+	h.Incident(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID+"/snapshot", nil))
+	if rec.Code != http.StatusOK || !strings.Contains(rec.Header().Get("Content-Type"), "text/plain") {
+		t.Fatalf("snapshot status=%d content-type=%s", rec.Code, rec.Header().Get("Content-Type"))
+	}
+	if !strings.Contains(rec.Body.String(), "Incident "+inc.IncidentID) {
+		t.Fatalf("snapshot=%s", rec.Body.String())
+	}
+
+	rec = httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID+"/snapshot", nil)
+	req.Header.Set("Accept", "application/json")
+	h.Incident(rec, req)
+	if rec.Code != http.StatusOK || !strings.Contains(rec.Body.String(), `"snapshot"`) {
+		t.Fatalf("json snapshot status=%d body=%s", rec.Code, rec.Body.String())
+	}
+}
diff --git a/internal/incidents/id.go b/internal/incidents/id.go
new file mode 100644
index 0000000..8a038ce
--- /dev/null
+++ b/internal/incidents/id.go
@@ -0,0 +1,19 @@
+package incidents
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"strings"
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+const idBucket = 5 * time.Minute
+
+func StableID(env string, family apiv2.ErrorFamily, startedAt time.Time) string {
+	bucket := startedAt.UTC().Truncate(idBucket).Format(time.RFC3339)
+	parts := []string{env, family.Service, apiv2.FormatErrorFamily(family), bucket}
+	sum := sha256.Sum256([]byte(strings.Join(parts, "|")))
+	return "inc_" + hex.EncodeToString(sum[:])[:16]
+}
diff --git a/internal/incidents/id_test.go b/internal/incidents/id_test.go
new file mode 100644
index 0000000..bd96ede
--- /dev/null
+++ b/internal/incidents/id_test.go
@@ -0,0 +1,25 @@
+package incidents
+
+import (
+	"testing"
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+func TestStableIDUsesFiveMinuteBucket(t *testing.T) {
+	family := apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}
+	base := time.Date(2026, 5, 4, 12, 3, 0, 0, time.UTC)
+	a := StableID("prod", family, base)
+	b := StableID("prod", family, base.Add(90*time.Second))
+	c := StableID("prod", family, base.Add(3*time.Minute))
+	if a != b {
+		t.Fatalf("same bucket ids differ: %s %s", a, b)
+	}
+	if a == c {
+		t.Fatalf("different bucket id did not change: %s", a)
+	}
+	if len(a) != len("inc_")+16 {
+		t.Fatalf("id length=%d id=%s", len(a), a)
+	}
+}
diff --git a/internal/incidents/interfaces.go b/internal/incidents/interfaces.go
new file mode 100644
index 0000000..d2ce85e
--- /dev/null
+++ b/internal/incidents/interfaces.go
@@ -0,0 +1,36 @@
+package incidents
+
+import (
+	"context"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+type Reader interface {
+	Errors(f SearchFilter, limit int) ErrorsResult
+	BlastRadius(f SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse
+	SearchEvents(f SearchFilter, limit int) []*eventv2.Event
+}
+
+type SearchFilter struct {
+	Service   string
+	Statuses  map[eventv2.Status]struct{}
+	ErrorCode string
+	Since     time.Time
+	Until     time.Time
+}
+
+type ErrorsResult struct {
+	Rows []apiv2.ErrorRow
+}
+
+type SignalStore interface {
+	Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error)
+}
+
+type DeploySource interface {
+	DeploymentsInWindow(ctx context.Context, start, end time.Time, serviceFilter string) ([]Deployment, error)
+}
diff --git a/internal/incidents/nextchecks.go b/internal/incidents/nextchecks.go
new file mode 100644
index 0000000..b3a6559
--- /dev/null
+++ b/internal/incidents/nextchecks.go
@@ -0,0 +1,30 @@
+package incidents
+
+func NextChecks(cause Cause, confidence Confidence) []string {
+	switch cause {
+	case CauseDeploy:
+		return []string{
+			"Compare error onset with the deployment timestamp.",
+			"Check whether the deployed service version appears on failing traces.",
+			"Roll back or canary-disable the deployment if the affected family is still rising.",
+		}
+	case CauseDependency:
+		return []string{
+			"Check the downstream service health and recent deploys.",
+			"Inspect retries, timeouts, and circuit-breaker state for the failing step.",
+			"Notify the downstream owner with sample traces and affected service list.",
+		}
+	case CauseApp:
+		return []string{
+			"Inspect the first failing step and recent application logs.",
+			"Compare failing request fields against recent successful requests.",
+			"Add instrumentation if the step lacks enough context to isolate the bad branch.",
+		}
+	default:
+		return []string{
+			"Inspect sample traces for missing downstream or deploy evidence.",
+			"Check whether production signals are being posted to /v1/signals.",
+			"Add service version and dependency health signals to improve classification.",
+		}
+	}
+}
diff --git a/internal/incidents/render.go b/internal/incidents/render.go
new file mode 100644
index 0000000..75c2024
--- /dev/null
+++ b/internal/incidents/render.go
@@ -0,0 +1,48 @@
+package incidents
+
+import (
+	"fmt"
+	"strings"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+func RenderSnapshot(inc Incident) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "Incident %s\n", inc.IncidentID)
+	fmt.Fprintf(&b, "Status: %s\n", inc.Status)
+	fmt.Fprintf(&b, "Family: %s\n", apiv2.FormatErrorFamily(inc.ErrorFamily))
+	fmt.Fprintf(&b, "Cause: %s (%s confidence)\n", inc.Cause, inc.Confidence)
+	fmt.Fprintf(&b, "Started: %s\n", inc.StartedAt.Format("2006-01-02T15:04:05Z07:00"))
+	fmt.Fprintf(&b, "Affected: %d requests, %d services\n", inc.AffectedRequests, inc.AffectedServices)
+	fmt.Fprintf(&b, "Lift: %.2fx over baseline %d\n", inc.Lift, inc.BaselineCount)
+	if len(inc.TopServices) > 0 {
+		fmt.Fprintf(&b, "Top services: %s\n", strings.Join(inc.TopServices, ", "))
+	}
+	if len(inc.SampleTraces) > 0 {
+		fmt.Fprintf(&b, "Sample traces: %s\n", strings.Join(inc.SampleTraces, ", "))
+	}
+	if len(inc.Evidence) > 0 {
+		b.WriteString("\nEvidence:\n")
+		for _, ev := range inc.Evidence {
+			fmt.Fprintf(&b, "- %s: %s", ev.Kind, ev.Title)
+			if ev.Detail != "" {
+				fmt.Fprintf(&b, " (%s)", ev.Detail)
+			}
+			b.WriteByte('\n')
+		}
+	}
+	if len(inc.NextChecks) > 0 {
+		b.WriteString("\nNext checks:\n")
+		for _, check := range inc.NextChecks {
+			fmt.Fprintf(&b, "- %s\n", check)
+		}
+	}
+	if len(inc.InstrumentationWarnings) > 0 {
+		b.WriteString("\nInstrumentation warnings:\n")
+		for _, warning := range inc.InstrumentationWarnings {
+			fmt.Fprintf(&b, "- %s\n", warning)
+		}
+	}
+	return b.String()
+}
diff --git a/internal/incidents/store.go b/internal/incidents/store.go
new file mode 100644
index 0000000..90b7323
--- /dev/null
+++ b/internal/incidents/store.go
@@ -0,0 +1,104 @@
+package incidents
+
+import (
+	"context"
+	"errors"
+	"sort"
+	"sync"
+	"time"
+)
+
+var ErrNotFound = errors.New("incidents: not found")
+
+type Store interface {
+	Upsert(ctx context.Context, inc Incident) error
+	Get(ctx context.Context, id string) (Incident, error)
+	ListActive(ctx context.Context) ([]Incident, error)
+	PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error)
+}
+
+type MemoryStore struct {
+	mu   sync.Mutex
+	rows map[string]Incident
+}
+
+func NewMemoryStore() *MemoryStore {
+	return &MemoryStore{rows: map[string]Incident{}}
+}
+
+func (s *MemoryStore) Upsert(_ context.Context, inc Incident) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.rows[inc.IncidentID] = cloneIncident(inc)
+	return nil
+}
+
+func (s *MemoryStore) Get(_ context.Context, id string) (Incident, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	inc, ok := s.rows[id]
+	if !ok {
+		return Incident{}, ErrNotFound
+	}
+	return cloneIncident(inc), nil
+}
+
+func (s *MemoryStore) ListActive(_ context.Context) ([]Incident, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]Incident, 0, len(s.rows))
+	for _, inc := range s.rows {
+		if inc.Status != StatusResolved {
+			out = append(out, cloneIncident(inc))
+		}
+	}
+	sortIncidents(out)
+	return out, nil
+}
+
+func (s *MemoryStore) PruneResolvedOlderThan(_ context.Context, cutoff time.Time) (int, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	deleted := 0
+	for id, inc := range s.rows {
+		if inc.Status == StatusResolved && inc.ResolvedAt != nil && inc.ResolvedAt.Before(cutoff) {
+			delete(s.rows, id)
+			deleted++
+		}
+	}
+	return deleted, nil
+}
+
+func sortIncidents(rows []Incident) {
+	sort.SliceStable(rows, func(i, j int) bool {
+		if rows[i].Severity != rows[j].Severity {
+			return rows[i].Severity > rows[j].Severity
+		}
+		if !rows[i].StartedAt.Equal(rows[j].StartedAt) {
+			return rows[i].StartedAt.After(rows[j].StartedAt)
+		}
+		return rows[i].IncidentID < rows[j].IncidentID
+	})
+}
+
+func cloneIncident(in Incident) Incident {
+	out := in
+	out.TopServices = append([]string(nil), in.TopServices...)
+	out.SampleTraces = append([]string(nil), in.SampleTraces...)
+	out.Evidence = append([]Evidence(nil), in.Evidence...)
+	out.NextChecks = append([]string(nil), in.NextChecks...)
+	out.InstrumentationWarnings = append([]string(nil), in.InstrumentationWarnings...)
+	if in.AffectedUsers != nil {
+		v := *in.AffectedUsers
+		out.AffectedUsers = &v
+	}
+	if in.RecoveringAt != nil {
+		v := *in.RecoveringAt
+		out.RecoveringAt = &v
+	}
+	if in.ResolvedAt != nil {
+		v := *in.ResolvedAt
+		out.ResolvedAt = &v
+	}
+	return out
+}
diff --git a/internal/incidents/test_helpers_test.go b/internal/incidents/test_helpers_test.go
new file mode 100644
index 0000000..8ff2e4a
--- /dev/null
+++ b/internal/incidents/test_helpers_test.go
@@ -0,0 +1,60 @@
+package incidents
+
+import (
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+func testFamily() apiv2.ErrorFamily {
+	return apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}
+}
+
+func testIncidentEvent(id, traceID string, ts time.Time, service, step, code, downstream string) *eventv2.Event {
+	ev := &eventv2.Event{
+		SchemaVersion: eventv2.SchemaVersion2,
+		EventID:       id,
+		TsStart:       ts,
+		TsEnd:         ts.Add(10 * time.Millisecond),
+		DurationMS:    10,
+		Kind:          "http",
+		Service:       service,
+		Env:           "prod",
+		Version:       "v1",
+		TraceID:       traceID,
+		SpanID:        id + "-span",
+		Status:        eventv2.StatusError,
+		Anchor:        &eventv2.Anchor{Step: step, ErrorCode: code},
+	}
+	stepObj := eventv2.Step{Name: step, StartMS: 0, DurationMS: 10, Status: eventv2.StepStatusError, Error: &eventv2.StepError{Code: code, Reason: "failed"}}
+	if downstream != "" {
+		stepObj.Downstream = &eventv2.Downstream{Service: downstream, Endpoint: "/charge", Kind: "http"}
+	}
+	ev.Steps = []eventv2.Step{stepObj}
+	return ev
+}
+
+func testIncident(now time.Time) Incident {
+	return Incident{
+		IncidentID:       StableID("prod", testFamily(), now),
+		Env:              "prod",
+		Service:          "checkout",
+		ErrorFamily:      testFamily(),
+		Status:           StatusActive,
+		Cause:            CauseDependency,
+		Confidence:       ConfidenceMedium,
+		Severity:         7,
+		StartedAt:        now,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: 6,
+		AffectedServices: 2,
+		TopServices:      []string{"checkout", "payment"},
+		SampleTraces:     []string{"trace-a"},
+		Evidence:         []Evidence{{Kind: EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: now}},
+		NextChecks:       []string{"check payment"},
+		Lift:             6,
+		CurrentCount:     6,
+	}
+}
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
new file mode 100644
index 0000000..f1d8bdb
--- /dev/null
+++ b/internal/incidents/types.go
@@ -0,0 +1,103 @@
+package incidents
+
+import (
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+type Status string
+
+const (
+	StatusActive     Status = "active"
+	StatusRecovering Status = "recovering"
+	StatusResolved   Status = "resolved"
+)
+
+type Cause string
+
+const (
+	CauseDeploy     Cause = "deploy"
+	CauseApp        Cause = "app"
+	CauseDependency Cause = "dependency"
+	CauseUnknown    Cause = "unknown"
+)
+
+type Confidence string
+
+const (
+	ConfidenceHigh   Confidence = "high"
+	ConfidenceMedium Confidence = "medium"
+	ConfidenceLow    Confidence = "low"
+)
+
+type EvidenceKind string
+
+const (
+	EvidenceSignal     EvidenceKind = "signal"
+	EvidenceDeployment EvidenceKind = "deployment"
+	EvidenceTrace      EvidenceKind = "trace"
+	EvidenceMetric     EvidenceKind = "metric"
+)
+
+type Evidence struct {
+	Kind       EvidenceKind   `json:"kind"`
+	Title      string         `json:"title"`
+	Detail     string         `json:"detail,omitempty"`
+	Service    string         `json:"service,omitempty"`
+	SignalID   string         `json:"signal_id,omitempty"`
+	DeployID   string         `json:"deployment_id,omitempty"`
+	TraceID    string         `json:"trace_id,omitempty"`
+	OccurredAt time.Time      `json:"occurred_at"`
+	Fields     map[string]any `json:"fields,omitempty"`
+}
+
+type Incident struct {
+	IncidentID              string            `json:"incident_id"`
+	Env                     string            `json:"env"`
+	Service                 string            `json:"service"`
+	ErrorFamily             apiv2.ErrorFamily `json:"error_family"`
+	Status                  Status            `json:"status"`
+	Cause                   Cause             `json:"cause"`
+	Confidence              Confidence        `json:"confidence"`
+	Severity                int               `json:"severity"`
+	StartedAt               time.Time         `json:"started_at"`
+	UpdatedAt               time.Time         `json:"updated_at"`
+	LastSeenAt              time.Time         `json:"last_seen_at"`
+	RecoveringAt            *time.Time        `json:"recovering_at,omitempty"`
+	ResolvedAt              *time.Time        `json:"resolved_at,omitempty"`
+	AffectedRequests        int               `json:"affected_requests"`
+	AffectedUsers           *int              `json:"affected_users,omitempty"`
+	AffectedServices        int               `json:"affected_services"`
+	TopServices             []string          `json:"top_services"`
+	SampleTraces            []string          `json:"sample_traces"`
+	Evidence                []Evidence        `json:"evidence"`
+	NextChecks              []string          `json:"next_checks"`
+	InstrumentationWarnings []string          `json:"instrumentation_warnings,omitempty"`
+	Lift                    float64           `json:"lift"`
+	BaselineCount           int               `json:"baseline_count"`
+	CurrentCount            int               `json:"current_count"`
+}
+
+type ActiveResponse struct {
+	Incidents []Incident `json:"incidents"`
+}
+
+type DetailResponse struct {
+	Incident Incident `json:"incident"`
+}
+
+type SnapshotResponse struct {
+	Snapshot string   `json:"snapshot"`
+	Incident Incident `json:"incident"`
+}
+
+type Deployment struct {
+	ID        string
+	Service   string
+	Version   string
+	Env       string
+	FirstSeen time.Time
+	LastSeen  time.Time
+	Metadata  map[string]string
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index a654cf8..6412a70 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -71,6 +71,14 @@ type Metrics struct {
 	SignalsRejected       *prometheus.CounterVec
 	SignalRetentionPruned prometheus.Counter
 
+	IncidentOpened          prometheus.Counter
+	IncidentUpdated         prometheus.Counter
+	IncidentRecovered       prometheus.Counter
+	IncidentResolved        prometheus.Counter
+	IncidentTickLatency     prometheus.Histogram
+	IncidentActive          prometheus.Gauge
+	IncidentClassifications *prometheus.CounterVec
+
 	CausalRunsTotal   prometheus.Counter
 	CausalRunDuration prometheus.Histogram
 	CausalRunFailures prometheus.Counter
@@ -340,6 +348,41 @@ func New(reg *prometheus.Registry) *Metrics {
 		Help: "Production-context signals pruned by retention.",
 	})
 
+	m.IncidentOpened = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incidents_opened_total",
+		Help: "Incidents opened by the v2.1 incident engine.",
+	})
+	m.IncidentUpdated = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incidents_updated_total",
+		Help: "Incidents updated by the v2.1 incident engine.",
+	})
+	m.IncidentRecovered = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incidents_recovered_total",
+		Help: "Incidents moved to recovering by the v2.1 incident engine.",
+	})
+	m.IncidentResolved = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incidents_resolved_total",
+		Help: "Incidents resolved by the v2.1 incident engine.",
+	})
+	m.IncidentTickLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Name:    "waylog_incident_tick_latency_seconds",
+		Help:    "Incident engine tick duration.",
+		Buckets: defaultBuckets,
+	})
+	m.IncidentActive = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "waylog_incidents_active",
+		Help: "Active or recovering incidents currently tracked by the v2.1 incident engine.",
+	})
+	m.IncidentClassifications = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "waylog_incident_classifications_total",
+		Help: "Incident classifications by cause and confidence.",
+	}, []string{"cause", "confidence"})
+	for _, cause := range []string{"deploy", "app", "dependency", "unknown"} {
+		for _, confidence := range []string{"high", "medium", "low"} {
+			m.IncidentClassifications.WithLabelValues(cause, confidence).Add(0)
+		}
+	}
+
 	m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_causal_runs_total",
 		Help: "Total causal inference runs.",
@@ -417,6 +460,8 @@ func New(reg *prometheus.Registry) *Metrics {
 		m.ColdEventsWritten, m.ColdEventsDropped, m.ColdBatchLatency,
 		m.DeployUpsertsTotal, m.DeployUpsertErrors,
 		m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
+		m.IncidentOpened, m.IncidentUpdated, m.IncidentRecovered, m.IncidentResolved,
+		m.IncidentTickLatency, m.IncidentActive, m.IncidentClassifications,
 		m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
 		m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
 		m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,

From 3e1be99e1f9a74bf835038eefa2dec719457de30 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Tue, 5 May 2026 03:12:27 -0400
Subject: [PATCH 03/14] feat: added incident CLI and dashboard surfaces

Expose the incident engine through the operator CLI and embedded
dashboard.

Promote incident HTTP response DTOs into pkg/api/v2 so server handlers,
CLI clients, OpenAPI, and dashboard consumers share one public contract.
Update the internal incidents handler to convert internal engine incidents
into the shared API DTOs.

Added CLI commands:
- waylog incidents [--json]
- waylog incident <incident_id> [--json] [--snapshot]

The new commands reuse the existing v2-read capability gate, read auth,
path escaping, JSON rendering, and error handling. Snapshot mode supports
plain text by default and JSON when --json is supplied.

Added incident client methods and human renderers for active incident tables,
incident detail, evidence, next checks, instrumentation warnings, and sample
traces.

Add an active-incident strip to the dashboard and a #/incident/<id> detail
screen. The dashboard fetches /v1/incidents/active during the normal polling
cycle, renders incident cards above the existing errors panel, and links sample
traces into the existing explain screen. If the incident API is unavailable
(404/503), the dashboard shows an empty incident strip instead of failing the
main v2 triage UI.

Tests cover CLI routing/rendering, snapshot text and JSON behavior, dashboard
static references, and handler DTO shape.

Verification:
- go test ./pkg/api/v2 ./internal/incidents ./internal/cli/v2 ./internal/dashboard
- go test ./...
- go vet ./...
- bash scripts/check-doc-links.sh
- git diff --check
---
 internal/cli/v2/client.go            |  35 +++++++
 internal/cli/v2/cmd.go               |  66 ++++++++++++++
 internal/cli/v2/cmd_test.go          | 102 +++++++++++++++++++++
 internal/cli/v2/render.go            |  96 ++++++++++++++++++++
 internal/cli/v2/render_test.go       |  48 ++++++++++
 internal/cli/v2/types.go             |   5 +
 internal/dashboard/static/index.html | 131 ++++++++++++++++++++++++---
 internal/dashboard/static_test.go    |   7 ++
 internal/incidents/handler.go        |  63 ++++++++++++-
 internal/incidents/handler_test.go   |   4 +-
 internal/incidents/types.go          |  13 ---
 pkg/api/v2/types.go                  |  52 +++++++++++
 12 files changed, 591 insertions(+), 31 deletions(-)

diff --git a/internal/cli/v2/client.go b/internal/cli/v2/client.go
index 0ce7100..5430945 100644
--- a/internal/cli/v2/client.go
+++ b/internal/cli/v2/client.go
@@ -145,6 +145,30 @@ func (c *Client) Blast(ctx context.Context, p BlastParams) (BlastRadiusResponse,
 	return out, err
 }
 
+func (c *Client) Incidents(ctx context.Context) (IncidentListResponse, error) {
+	var out IncidentListResponse
+	err := c.do(ctx, "/v1/incidents/active", nil, &out)
+	return out, err
+}
+
+func (c *Client) Incident(ctx context.Context, incidentID string) (IncidentDetailResponse, error) {
+	var out IncidentDetailResponse
+	err := c.do(ctx, "/v1/incidents/"+url.PathEscape(incidentID), nil, &out)
+	return out, err
+}
+
+func (c *Client) IncidentSnapshotText(ctx context.Context, incidentID string) (string, error) {
+	var out string
+	err := c.doRaw(ctx, "/v1/incidents/"+url.PathEscape(incidentID)+"/snapshot", nil, "text/plain", &out)
+	return out, err
+}
+
+func (c *Client) IncidentSnapshotJSON(ctx context.Context, incidentID string) (IncidentSnapshotResponse, error) {
+	var out IncidentSnapshotResponse
+	err := c.doRaw(ctx, "/v1/incidents/"+url.PathEscape(incidentID)+"/snapshot", nil, "application/json", &out)
+	return out, err
+}
+
 func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchResponse, error) {
 	q := url.Values{}
 	addQuery(q, "error_code", p.ErrorCode)
@@ -160,6 +184,10 @@ func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchRespons
 }
 
 func (c *Client) do(ctx context.Context, path string, q url.Values, out any) error {
+	return c.doRaw(ctx, path, q, "application/json", out)
+}
+
+func (c *Client) doRaw(ctx context.Context, path string, q url.Values, accept string, out any) error {
 	u, err := url.Parse(c.base + path)
 	if err != nil {
 		return &TransportError{Err: err}
@@ -174,6 +202,9 @@ func (c *Client) do(ctx context.Context, path string, q url.Values, out any) err
 	if c.apiKey != "" {
 		req.Header.Set("Authorization", "Bearer "+c.apiKey)
 	}
+	if accept != "" {
+		req.Header.Set("Accept", accept)
+	}
 	resp, err := c.http.Do(req)
 	if err != nil {
 		return &TransportError{Err: err}
@@ -189,6 +220,10 @@ func (c *Client) do(ctx context.Context, path string, q url.Values, out any) err
 	if out == nil || len(strings.TrimSpace(string(body))) == 0 {
 		return nil
 	}
+	if text, ok := out.(*string); ok {
+		*text = string(body)
+		return nil
+	}
 	if err := json.Unmarshal(body, out); err != nil {
 		return &TransportError{Err: fmt.Errorf("decode response: %w", err)}
 	}
diff --git a/internal/cli/v2/cmd.go b/internal/cli/v2/cmd.go
index f6617f1..87c3fae 100644
--- a/internal/cli/v2/cmd.go
+++ b/internal/cli/v2/cmd.go
@@ -49,6 +49,10 @@ func RunCLI(args []string, _ io.Reader, stdout, stderr io.Writer) int {
 		return runCapabilities(ctx, client, cfg, rest[1:], stdout, stderr)
 	case "recent":
 		return runRecent(ctx, client, cfg, rest[1:], stdout, stderr)
+	case "incidents":
+		return runIncidents(ctx, client, cfg, rest[1:], stdout, stderr)
+	case "incident":
+		return runIncident(ctx, client, cfg, rest[1:], stdout, stderr)
 	case "errors":
 		return runErrors(ctx, client, cfg, rest[1:], stdout, stderr)
 	case "event":
@@ -182,6 +186,64 @@ func runRecent(ctx context.Context, client *Client, cfg cliConfig, args []string
 	return renderOrError(stdout, stderr, cfg.json, resp, err, RenderRecent)
 }
 
+func runIncidents(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+	if len(args) != 0 {
+		return usage(stderr, "usage: waylog incidents [--json]")
+	}
+	if gate := requireV2Reads(ctx, client, stderr); gate != 0 {
+		return gate
+	}
+	resp, err := client.Incidents(ctx)
+	return renderOrError(stdout, stderr, cfg.json, resp, err, RenderIncidents)
+}
+
+func runIncident(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+	incidentID, snapshot, err := parseIncidentArgs(args)
+	if err != nil {
+		return usage(stderr, err.Error())
+	}
+	if gate := requireV2Reads(ctx, client, stderr); gate != 0 {
+		return gate
+	}
+	if snapshot {
+		if cfg.json {
+			resp, err := client.IncidentSnapshotJSON(ctx, incidentID)
+			return renderOrError(stdout, stderr, true, resp, err, RenderIncidentSnapshot)
+		}
+		text, err := client.IncidentSnapshotText(ctx, incidentID)
+		if err != nil {
+			fmt.Fprintln(stderr, err)
+			return exitCodeForError(err)
+		}
+		fmt.Fprint(stdout, text)
+		return 0
+	}
+	resp, err := client.Incident(ctx, incidentID)
+	return renderOrError(stdout, stderr, cfg.json, resp, err, RenderIncident)
+}
+
+func parseIncidentArgs(args []string) (string, bool, error) {
+	incidentID := ""
+	snapshot := false
+	for _, arg := range args {
+		switch {
+		case arg == "--snapshot":
+			snapshot = true
+		case strings.HasPrefix(arg, "-"):
+			return "", false, fmt.Errorf("unknown flag: %s", arg)
+		default:
+			if incidentID != "" {
+				return "", false, errors.New("usage: waylog incident <incident_id> [--snapshot] [--json]")
+			}
+			incidentID = arg
+		}
+	}
+	if incidentID == "" {
+		return "", false, errors.New("usage: waylog incident <incident_id> [--snapshot] [--json]")
+	}
+	return incidentID, snapshot, nil
+}
+
 func runEvent(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
 	if len(args) != 1 {
 		return usage(stderr, "usage: waylog event <event_id> [--json]")
@@ -423,6 +485,8 @@ func printUsage(w io.Writer) {
 	fmt.Fprintln(w, `Usage:
   waylog capabilities [--json]
   waylog recent [--window <dur>] [--service <svc>] [--status <csv>] [--limit <n>] [--cursor <c>] [--include-suppressed] [--json]
+  waylog incidents [--json]
+  waylog incident <incident_id> [--snapshot] [--json]
   waylog errors [--window <dur>] [--service <svc>] [--limit <n>] [--cursor <c>] [--json]
   waylog event <event_id> [--json]
   waylog trace <trace_id> [--json]
@@ -431,6 +495,8 @@ func printUsage(w io.Writer) {
   waylog search <query> [--service <svc>] [--status <csv>] [--window <dur>] [--limit <n>] [--cursor <c>] [--json]
 
 Recommended loop:
+  waylog incidents
+  waylog incident <incident_id>
   waylog recent
   waylog errors --window 15m
   waylog blast checkout:payment.charge:PMT_502 --window 15m
diff --git a/internal/cli/v2/cmd_test.go b/internal/cli/v2/cmd_test.go
index 3f038f8..0cf256f 100644
--- a/internal/cli/v2/cmd_test.go
+++ b/internal/cli/v2/cmd_test.go
@@ -76,6 +76,108 @@ func TestRunCLIRecentSerializesFilters(t *testing.T) {
 	}
 }
 
+func TestRunCLIIncidentsListsActive(t *testing.T) {
+	var gotPath string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/v1/capabilities" {
+			_, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+			return
+		}
+		gotPath = r.URL.Path
+		_, _ = w.Write([]byte(`{"incidents":[{"incident_id":"inc_1234567890abcdef","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}]}`))
+	}))
+	defer srv.Close()
+
+	var stdout, stderr bytes.Buffer
+	code := RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+	if code != 0 {
+		t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+	}
+	if gotPath != "/v1/incidents/active" {
+		t.Fatalf("path=%q", gotPath)
+	}
+	for _, want := range []string{"INCIDENT", "dependency", "checkout:payment.charge:PMT_502"} {
+		if !strings.Contains(stdout.String(), want) {
+			t.Fatalf("stdout missing %q:\n%s", want, stdout.String())
+		}
+	}
+}
+
+func TestRunCLIIncidentsEmptyAndRequiresV2Reads(t *testing.T) {
+	calls := 0
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		calls++
+		_, _ = w.Write([]byte(`{"v2_reads":{"enabled":false}}`))
+	}))
+	defer srv.Close()
+
+	var stdout, stderr bytes.Buffer
+	code := RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+	if code != 3 || calls != 1 || !strings.Contains(stderr.String(), "WAYLOG_V2_READS=true") {
+		t.Fatalf("code=%d calls=%d stdout=%q stderr=%q", code, calls, stdout.String(), stderr.String())
+	}
+
+	srv.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/v1/capabilities" {
+			_, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+			return
+		}
+		_, _ = w.Write([]byte(`{"incidents":[]}`))
+	})
+	stdout.Reset()
+	stderr.Reset()
+	code = RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+	if code != 0 || !strings.Contains(stdout.String(), "No active incidents.") {
+		t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+	}
+}
+
+func TestRunCLIIncidentDetailAndSnapshot(t *testing.T) {
+	calls := []string{}
+	accepts := []string{}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/v1/capabilities" {
+			_, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+			return
+		}
+		calls = append(calls, r.URL.String())
+		accepts = append(accepts, r.Header.Get("Accept"))
+		switch {
+		case strings.HasSuffix(r.URL.Path, "/snapshot") && r.Header.Get("Accept") == "application/json":
+			_, _ = w.Write([]byte(`{"snapshot":"Incident inc/1\n","incident":{"incident_id":"inc/1","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}}`))
+		case strings.HasSuffix(r.URL.Path, "/snapshot"):
+			w.Header().Set("Content-Type", "text/plain")
+			_, _ = w.Write([]byte("Incident inc/1\n"))
+		default:
+			_, _ = w.Write([]byte(`{"incident":{"incident_id":"inc/1","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[{"kind":"trace","title":"sample","trace_id":"trace-a","occurred_at":"2026-05-04T12:00:00Z"}],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}}`))
+		}
+	}))
+	defer srv.Close()
+
+	var stdout, stderr bytes.Buffer
+	code := RunCLI([]string{"--addr", srv.URL, "incident", "inc/1"}, nil, &stdout, &stderr)
+	if code != 0 {
+		t.Fatalf("detail code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+	}
+	if calls[0] != "/v1/incidents/inc%2F1" || !strings.Contains(stdout.String(), "incident_id: inc/1") {
+		t.Fatalf("calls=%v stdout=%q", calls, stdout.String())
+	}
+
+	stdout.Reset()
+	stderr.Reset()
+	code = RunCLI([]string{"--addr", srv.URL, "incident", "inc/1", "--snapshot"}, nil, &stdout, &stderr)
+	if code != 0 || stdout.String() != "Incident inc/1\n" {
+		t.Fatalf("snapshot code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+	}
+
+	stdout.Reset()
+	stderr.Reset()
+	code = RunCLI([]string{"--addr", srv.URL, "--json", "incident", "inc/1", "--snapshot"}, nil, &stdout, &stderr)
+	if code != 0 || !strings.Contains(stdout.String(), `"snapshot"`) || accepts[len(accepts)-1] != "application/json" {
+		t.Fatalf("json snapshot code=%d accepts=%v stdout=%q stderr=%q", code, accepts, stdout.String(), stderr.String())
+	}
+}
+
 func TestRunCLIEventEscapesIDAndRequiresV2Reads(t *testing.T) {
 	calls := []string{}
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index e4051cc..bf9b574 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -68,6 +68,102 @@ func RenderRecent(w io.Writer, resp RecentTracesResponse) {
 	renderNextCursor(w, resp.NextCursor)
 }
 
+func RenderIncidents(w io.Writer, resp IncidentListResponse) {
+	if len(resp.Incidents) == 0 {
+		fmt.Fprintln(w, "No active incidents.")
+		return
+	}
+	tw := tabwriter.NewWriter(w, 0, 4, 2, ' ', 0)
+	fmt.Fprintln(tw, "INCIDENT\tSTATUS\tCAUSE\tCONF\tSEVERITY\tFAMILY\tAFFECTED\tSTARTED")
+	for _, inc := range resp.Incidents {
+		fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%d\t%s\t%d req / %d svc\t%s\n",
+			truncateID(inc.IncidentID),
+			inc.Status,
+			inc.Cause,
+			inc.Confidence,
+			inc.Severity,
+			apiv2.FormatErrorFamily(inc.ErrorFamily),
+			inc.AffectedRequests,
+			inc.AffectedServices,
+			formatTime(inc.StartedAt),
+		)
+	}
+	_ = tw.Flush()
+}
+
+func RenderIncident(w io.Writer, resp IncidentDetailResponse) {
+	renderIncidentBody(w, resp.Incident)
+}
+
+func RenderIncidentSnapshot(w io.Writer, resp IncidentSnapshotResponse) {
+	if resp.Snapshot != "" {
+		fmt.Fprintln(w, resp.Snapshot)
+	}
+}
+
+func renderIncidentBody(w io.Writer, inc Incident) {
+	fmt.Fprintf(w, "incident_id: %s\n", inc.IncidentID)
+	fmt.Fprintf(w, "status: %s\n", inc.Status)
+	fmt.Fprintf(w, "family: %s\n", apiv2.FormatErrorFamily(inc.ErrorFamily))
+	fmt.Fprintf(w, "cause: %s (%s confidence)\n", inc.Cause, inc.Confidence)
+	fmt.Fprintf(w, "severity: %d\n", inc.Severity)
+	fmt.Fprintf(w, "started_at: %s\n", formatTime(inc.StartedAt))
+	fmt.Fprintf(w, "updated_at: %s\n", formatTime(inc.UpdatedAt))
+	if inc.ResolvedAt != nil {
+		fmt.Fprintf(w, "resolved_at: %s\n", formatTime(*inc.ResolvedAt))
+	}
+	fmt.Fprintf(w, "affected_requests: %d\n", inc.AffectedRequests)
+	if inc.AffectedUsers == nil {
+		fmt.Fprintln(w, "affected_users: null")
+	} else {
+		fmt.Fprintf(w, "affected_users: %d\n", *inc.AffectedUsers)
+	}
+	fmt.Fprintf(w, "affected_services: %d\n", inc.AffectedServices)
+	fmt.Fprintf(w, "top_services: %s\n", strings.Join(inc.TopServices, ","))
+	fmt.Fprintf(w, "lift: %.2f\n", inc.Lift)
+	fmt.Fprintf(w, "baseline_count: %d\n", inc.BaselineCount)
+	fmt.Fprintf(w, "current_count: %d\n", inc.CurrentCount)
+
+	fmt.Fprintln(w, "\nevidence:")
+	if len(inc.Evidence) == 0 {
+		fmt.Fprintln(w, "  none")
+	} else {
+		for _, ev := range inc.Evidence {
+			detail := ev.Detail
+			if detail == "" {
+				detail = ev.Service
+			}
+			fmt.Fprintf(w, "  - %s: %s", ev.Kind, ev.Title)
+			if detail != "" {
+				fmt.Fprintf(w, " (%s)", detail)
+			}
+			if ev.TraceID != "" {
+				fmt.Fprintf(w, " trace=%s", truncateID(ev.TraceID))
+			}
+			fmt.Fprintln(w)
+		}
+	}
+
+	fmt.Fprintln(w, "\nnext_checks:")
+	if len(inc.NextChecks) == 0 {
+		fmt.Fprintln(w, "  none")
+	} else {
+		for _, check := range inc.NextChecks {
+			fmt.Fprintf(w, "  - %s\n", check)
+		}
+	}
+
+	if len(inc.InstrumentationWarnings) > 0 {
+		fmt.Fprintln(w, "\ninstrumentation_warnings:")
+		for _, warning := range inc.InstrumentationWarnings {
+			fmt.Fprintf(w, "  - %s\n", warning)
+		}
+	}
+	if len(inc.SampleTraces) > 0 {
+		fmt.Fprintf(w, "\nsample_traces: %s\n", truncateList(inc.SampleTraces))
+	}
+}
+
 func RenderEvent(w io.Writer, ev *Event) {
 	if ev == nil {
 		fmt.Fprintln(w, "No event found.")
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 598c54c..0d4a755 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -84,6 +84,54 @@ func TestRenderEventPrintsSummaryCounts(t *testing.T) {
 	}
 }
 
+func TestRenderIncidentsAndDetail(t *testing.T) {
+	start := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	inc := Incident{
+		IncidentID:       "inc_1234567890abcdef",
+		Env:              "prod",
+		Service:          "checkout",
+		ErrorFamily:      ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		Status:           "active",
+		Cause:            "dependency",
+		Confidence:       "medium",
+		Severity:         8,
+		StartedAt:        start,
+		UpdatedAt:        start.Add(time.Minute),
+		LastSeenAt:       start.Add(time.Minute),
+		AffectedRequests: 12,
+		AffectedServices: 3,
+		TopServices:      []string{"checkout", "payment"},
+		SampleTraces:     []string{"trace-1234567890"},
+		Evidence:         []IncidentEvidence{{Kind: "trace", Title: "First failing trace sample", Detail: "payment.charge/PMT_502", TraceID: "trace-1234567890", OccurredAt: start}},
+		NextChecks:       []string{"Check payment health."},
+		Lift:             6,
+		BaselineCount:    2,
+		CurrentCount:     12,
+	}
+
+	var out bytes.Buffer
+	RenderIncidents(&out, IncidentListResponse{Incidents: []Incident{inc}})
+	for _, want := range []string{"INCIDENT", "dependency", "medium", "checkout:payment.charge:PMT_502", "12 req / 3 svc"} {
+		if !strings.Contains(out.String(), want) {
+			t.Fatalf("list output missing %q:\n%s", want, out.String())
+		}
+	}
+
+	out.Reset()
+	RenderIncident(&out, IncidentDetailResponse{Incident: inc})
+	for _, want := range []string{"incident_id: inc_1234567890abcdef", "cause: dependency (medium confidence)", "evidence:", "next_checks:", "sample_traces:"} {
+		if !strings.Contains(out.String(), want) {
+			t.Fatalf("detail output missing %q:\n%s", want, out.String())
+		}
+	}
+
+	out.Reset()
+	RenderIncidents(&out, IncidentListResponse{})
+	if !strings.Contains(out.String(), "No active incidents.") {
+		t.Fatalf("empty output=%q", out.String())
+	}
+}
+
 func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
 	var out bytes.Buffer
 	resp := CapabilitiesResponse{}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 6672d99..1a940de 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -31,6 +31,11 @@ type ErrorRow = apiv2.ErrorRow
 type ErrorsResponse = apiv2.ErrorsResponse
 type BlastKey = apiv2.BlastKey
 type BlastRadiusResponse = apiv2.BlastRadiusResponse
+type Incident = apiv2.Incident
+type IncidentEvidence = apiv2.IncidentEvidence
+type IncidentListResponse = apiv2.IncidentListResponse
+type IncidentDetailResponse = apiv2.IncidentDetailResponse
+type IncidentSnapshotResponse = apiv2.IncidentSnapshotResponse
 
 type eventGetResponse struct {
 	Event *Event `json:"event"`
diff --git a/internal/dashboard/static/index.html b/internal/dashboard/static/index.html
index 6ee9ac6..db935b6 100644
--- a/internal/dashboard/static/index.html
+++ b/internal/dashboard/static/index.html
@@ -580,6 +580,36 @@
       }
       .empty strong { color: var(--text); display: block; margin-bottom: 6px; }
       .error-box { color: var(--danger); border-color: var(--danger-soft); background: var(--danger-soft); }
+      .incident-strip {
+        display: grid;
+        gap: 10px;
+        margin-bottom: 14px;
+      }
+      .incident-grid {
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
+        gap: 8px;
+      }
+      .incident-card {
+        display: grid;
+        gap: 8px;
+        min-height: 44px;
+        border: 1px solid var(--line);
+        border-radius: var(--radius-md);
+        background: linear-gradient(135deg, var(--item-bg), transparent);
+        padding: 12px;
+        transition: border-color 0.15s ease, background 0.15s ease;
+      }
+      .incident-card:hover { border-color: var(--line-strong); background: var(--row-hover); }
+      .incident-title { display: flex; justify-content: space-between; gap: 8px; align-items: center; }
+      .incident-detail { display: grid; gap: 12px; }
+      .evidence-list, .check-list { display: grid; gap: 8px; margin: 0; padding: 0; list-style: none; }
+      .evidence-list li, .check-list li {
+        border: 1px solid var(--line);
+        border-radius: var(--radius-md);
+        background: var(--item-bg);
+        padding: 10px 12px;
+      }
       .disabled {
         display: grid;
         place-items: center;
@@ -619,6 +649,7 @@
         capabilities: null,
         errors: null,
         recent: null,
+        incidents: null,
         errorTrend: [],
         latencyTrend: [],
         timers: [],
@@ -736,13 +767,23 @@
       async function loadCapabilities() {
         state.capabilities = await fetchJSON("/v1/capabilities");
       }
+      async function loadIncidents() {
+        try {
+          return await fetchJSON("/v1/incidents/active");
+        } catch (err) {
+          if (err.status === 404 || err.status === 503) return { incidents: [] };
+          throw err;
+        }
+      }
       async function loadDashboardData() {
-        const [errors, recent] = await Promise.all([
+        const [errors, recent, incidents] = await Promise.all([
           fetchJSON("/v1/errors?" + params({ window: state.window, limit: 50 })),
           fetchJSON("/v1/traces/recent?" + params({ window: state.window, limit: 50, include_suppressed: true })),
+          loadIncidents(),
         ]);
         state.errors = errors;
         state.recent = recent;
+        state.incidents = incidents;
         const errorEvents = (errors.rows || []).reduce((sum, row) => sum + Number(row.count || 0), 0);
         const durations = (recent.traces || []).map(t => Number(t.duration_ms || 0));
         pushTrend(state.errorTrend, errorEvents);
@@ -791,7 +832,10 @@
             ${tab("#/blast/" + encodeURIComponent(firstFamily()), "Blast", route.screen === "blast", !firstFamily())}
           </nav>
           <main id="main" class="shell">
-            <section class="card panel bracketed" aria-live="polite">${content}</section>
+            <section>
+              ${renderIncidentStrip()}
+              <div class="card panel bracketed" aria-live="polite">${content}</div>
+            </section>
             <aside>${renderRecent()}</aside>
           </main>
         `;
@@ -811,6 +855,27 @@
         return state.recent?.traces?.[0]?.trace_id || state.errors?.rows?.[0]?.sample_traces?.[0] || "";
       }
 
+      function renderIncidentStrip() {
+        const incidents = state.incidents?.incidents || [];
+        if (!incidents.length) {
+          return `<section class="card panel bracketed incident-strip" aria-labelledby="incidents-title">
+            <div class="panel-head"><div><h2 id="incidents-title" class="panel-title">Active incidents</h2><div class="panel-sub">No active incidents.</div></div></div>
+          </section>`;
+        }
+        return `<section class="card panel bracketed incident-strip" aria-labelledby="incidents-title">
+          <div class="panel-head"><div><h2 id="incidents-title" class="panel-title">Active incidents</h2><div class="panel-sub">${nf.format(incidents.length)} incident${incidents.length === 1 ? "" : "s"} detected</div></div></div>
+          <div class="incident-grid">${incidents.map(incident => `
+            <a class="incident-card" href="#/incident/${encodeURIComponent(incident.incident_id)}">
+              <div class="incident-title">
+                <strong>${esc(formatFamily(incident.error_family))}</strong>
+                <span class="status ${statusClass(incident.status)}">${esc(incident.status)}</span>
+              </div>
+              <div class="item-meta">${esc(incident.cause || "unknown")} · ${esc(incident.confidence || "low")} confidence · severity ${nf.format(incident.severity || 0)}</div>
+              <div class="item-meta">${nf.format(incident.affected_requests || 0)} requests · ${nf.format(incident.affected_services || 0)} services · started ${esc(ago(incident.started_at))}</div>
+            </a>`).join("")}</div>
+        </section>`;
+      }
+
       function renderRecent() {
         return `<section class="card panel bracketed" aria-labelledby="recent-title">
           <div class="panel-head"><div><h2 id="recent-title" class="panel-title">Recent requests</h2><div class="panel-sub">Polls every 5s</div></div></div>
@@ -927,6 +992,51 @@ <h2>First failing step</h2>
             ${storyCard("Sample traces", `<div class="list">${(blast.sample_traces || []).map(id => `<a class="item" href="#/explain/${encodeURIComponent(id)}"><span class="item-title mono">${esc(shortID(id))}</span></a>`).join("") || "<div class='empty'>No sample traces.</div>"}</div>`)}
           </div>`);
       }
+
+      async function renderIncident(id) {
+        if (!id) return shell(`<div class="empty">Choose an incident to inspect.</div>`);
+        const resp = await fetchJSON("/v1/incidents/" + encodeURIComponent(id));
+        const incident = resp.incident || {};
+        return shell(`<div class="incident-detail">
+          <div class="hero">
+            <div class="panel-sub">incident summary</div>
+            <h2>${esc(formatFamily(incident.error_family))}</h2>
+            <div class="hero-anchor">${esc(incident.cause || "unknown")} · ${esc(incident.confidence || "low")} confidence</div>
+            <div class="muted">${esc(incident.status || "unknown")} · severity ${nf.format(incident.severity || 0)} · started ${esc(ago(incident.started_at))}</div>
+          </div>
+          <section class="impact">
+            ${impact("Affected requests", incident.affected_requests || 0)}
+            ${impact("Affected users", incident.affected_users == null ? "unknown" : incident.affected_users)}
+            ${impact("Affected services", incident.affected_services || 0)}
+            ${impact("Lift", Number(incident.lift || 0).toFixed(2) + "x")}
+          </section>
+          <div class="section-grid">
+            ${storyCard("Evidence", renderEvidence(incident.evidence || []))}
+            ${storyCard("Next checks", renderChecks(incident.next_checks || []))}
+            ${storyCard("Sample traces", renderIncidentSamples(incident.sample_traces || []))}
+            ${storyCard("Instrumentation warnings", renderWarnings(incident.instrumentation_warnings || []))}
+          </div>
+        </div>`);
+      }
+      function renderEvidence(items) {
+        if (!items.length) return `<div class="empty">No incident evidence attached.</div>`;
+        return `<ul class="evidence-list">${items.map(item => `<li>
+          <div class="item-title">${esc(item.title || item.kind || "evidence")}</div>
+          <div class="item-meta">${esc(item.kind || "unknown")}${item.detail ? " · " + esc(item.detail) : ""}${item.service ? " · " + esc(item.service) : ""}</div>
+        </li>`).join("")}</ul>`;
+      }
+      function renderChecks(items) {
+        if (!items.length) return `<div class="empty">No next checks generated.</div>`;
+        return `<ul class="check-list">${items.map(check => `<li>${esc(check)}</li>`).join("")}</ul>`;
+      }
+      function renderIncidentSamples(ids) {
+        if (!ids.length) return `<div class="empty">No sample traces attached.</div>`;
+        return `<div class="list">${ids.map(id => `<a class="item" href="#/explain/${encodeURIComponent(id)}"><span class="item-title mono">${esc(shortID(id))}</span><span class="item-meta">open explain</span></a>`).join("")}</div>`;
+      }
+      function renderWarnings(items) {
+        if (!items.length) return `<div class="empty">No instrumentation warnings.</div>`;
+        return `<ul class="check-list">${items.map(warning => `<li>${esc(warning)}</li>`).join("")}</ul>`;
+      }
       function impact(label, value) {
         return `<article class="subcard"><div class="panel-sub">${esc(label)}</div><div class="impact-value">${esc(typeof value === "number" ? nf.format(value) : value)}</div></article>`;
       }
@@ -943,6 +1053,8 @@ <h2>First failing step</h2>
             document.getElementById("app").innerHTML = await renderExplain(route.id);
           } else if (route.screen === "blast") {
             document.getElementById("app").innerHTML = await renderBlast(route.id);
+          } else if (route.screen === "incident") {
+            document.getElementById("app").innerHTML = await renderIncident(route.id);
           } else {
             document.getElementById("app").innerHTML = shell(renderErrors());
           }
@@ -985,18 +1097,9 @@ <h1>This dashboard requires WAYLOG_V2_READS=true.</h1>
         const recent = document.getElementById("recent-list");
         const scrollTop = recent ? recent.scrollTop : 0;
         await loadDashboardData();
-        if (parseHash().screen === "errors") {
-          await renderCurrentScreen();
-          const nextRecent = document.getElementById("recent-list");
-          if (nextRecent) nextRecent.scrollTop = scrollTop;
-        } else {
-          // /explain and /blast own their own DOM; just refresh the recent panel in place.
-          const list = document.getElementById("recent-list");
-          if (list) {
-            list.innerHTML = recentItemsHTML();
-            list.scrollTop = scrollTop;
-          }
-        }
+        await renderCurrentScreen();
+        const nextRecent = document.getElementById("recent-list");
+        if (nextRecent) nextRecent.scrollTop = scrollTop;
       }
       function reportPollError(err) {
         const live = document.getElementById("live-region");
diff --git a/internal/dashboard/static_test.go b/internal/dashboard/static_test.go
index d287abc..ce2587f 100644
--- a/internal/dashboard/static_test.go
+++ b/internal/dashboard/static_test.go
@@ -43,6 +43,13 @@ func TestStaticDashboardHTML(t *testing.T) {
 		"#/errors",
 		"#/explain",
 		"#/blast",
+		"#/incident",
+		"/v1/incidents/active",
+		"Active incidents",
+		"No active incidents.",
+		"Next checks",
+		"Instrumentation warnings",
+		"sample_traces",
 		"renderSparkline",
 		"This dashboard requires WAYLOG_V2_READS=true",
 		"first observable failing step",
diff --git a/internal/incidents/handler.go b/internal/incidents/handler.go
index 08d57ee..d6058c2 100644
--- a/internal/incidents/handler.go
+++ b/internal/incidents/handler.go
@@ -5,6 +5,8 @@ import (
 	"errors"
 	"net/http"
 	"strings"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 )
 
 type Handler struct {
@@ -25,7 +27,7 @@ func (h *Handler) Active(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusInternalServerError, "internal_error", "query incidents failed", err.Error())
 		return
 	}
-	writeJSON(w, http.StatusOK, ActiveResponse{Incidents: rows})
+	writeJSON(w, http.StatusOK, apiv2.IncidentListResponse{Incidents: toAPIIncidents(rows)})
 }
 
 func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
@@ -52,7 +54,7 @@ func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
 		return
 	}
-	writeJSON(w, http.StatusOK, DetailResponse{Incident: inc})
+	writeJSON(w, http.StatusOK, apiv2.IncidentDetailResponse{Incident: toAPIIncident(inc)})
 }
 
 func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
@@ -67,7 +69,7 @@ func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
 	}
 	snapshot := RenderSnapshot(inc)
 	if strings.Contains(r.Header.Get("Accept"), "application/json") {
-		writeJSON(w, http.StatusOK, SnapshotResponse{Snapshot: snapshot, Incident: inc})
+		writeJSON(w, http.StatusOK, apiv2.IncidentSnapshotResponse{Snapshot: snapshot, Incident: toAPIIncident(inc)})
 		return
 	}
 	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
@@ -75,6 +77,61 @@ func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
 	_, _ = w.Write([]byte(snapshot))
 }
 
+func toAPIIncidents(rows []Incident) []apiv2.Incident {
+	out := make([]apiv2.Incident, 0, len(rows))
+	for _, inc := range rows {
+		out = append(out, toAPIIncident(inc))
+	}
+	return out
+}
+
+func toAPIIncident(inc Incident) apiv2.Incident {
+	return apiv2.Incident{
+		IncidentID:              inc.IncidentID,
+		Env:                     inc.Env,
+		Service:                 inc.Service,
+		ErrorFamily:             inc.ErrorFamily,
+		Status:                  string(inc.Status),
+		Cause:                   string(inc.Cause),
+		Confidence:              string(inc.Confidence),
+		Severity:                inc.Severity,
+		StartedAt:               inc.StartedAt,
+		UpdatedAt:               inc.UpdatedAt,
+		LastSeenAt:              inc.LastSeenAt,
+		RecoveringAt:            inc.RecoveringAt,
+		ResolvedAt:              inc.ResolvedAt,
+		AffectedRequests:        inc.AffectedRequests,
+		AffectedUsers:           inc.AffectedUsers,
+		AffectedServices:        inc.AffectedServices,
+		TopServices:             inc.TopServices,
+		SampleTraces:            inc.SampleTraces,
+		Evidence:                toAPIEvidence(inc.Evidence),
+		NextChecks:              inc.NextChecks,
+		InstrumentationWarnings: inc.InstrumentationWarnings,
+		Lift:                    inc.Lift,
+		BaselineCount:           inc.BaselineCount,
+		CurrentCount:            inc.CurrentCount,
+	}
+}
+
+func toAPIEvidence(rows []Evidence) []apiv2.IncidentEvidence {
+	out := make([]apiv2.IncidentEvidence, 0, len(rows))
+	for _, ev := range rows {
+		out = append(out, apiv2.IncidentEvidence{
+			Kind:       string(ev.Kind),
+			Title:      ev.Title,
+			Detail:     ev.Detail,
+			Service:    ev.Service,
+			SignalID:   ev.SignalID,
+			DeployID:   ev.DeployID,
+			TraceID:    ev.TraceID,
+			OccurredAt: ev.OccurredAt,
+			Fields:     ev.Fields,
+		})
+	}
+	return out
+}
+
 func writeJSON(w http.ResponseWriter, status int, v any) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(status)
diff --git a/internal/incidents/handler_test.go b/internal/incidents/handler_test.go
index 2be9a80..670c19c 100644
--- a/internal/incidents/handler_test.go
+++ b/internal/incidents/handler_test.go
@@ -8,6 +8,8 @@ import (
 	"strings"
 	"testing"
 	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 )
 
 func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
@@ -28,7 +30,7 @@ func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
 	if rec.Code != http.StatusOK {
 		t.Fatalf("active status=%d body=%s", rec.Code, rec.Body.String())
 	}
-	var active ActiveResponse
+	var active apiv2.IncidentListResponse
 	if err := json.Unmarshal(rec.Body.Bytes(), &active); err != nil {
 		t.Fatal(err)
 	}
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
index f1d8bdb..cf3b59a 100644
--- a/internal/incidents/types.go
+++ b/internal/incidents/types.go
@@ -79,19 +79,6 @@ type Incident struct {
 	CurrentCount            int               `json:"current_count"`
 }
 
-type ActiveResponse struct {
-	Incidents []Incident `json:"incidents"`
-}
-
-type DetailResponse struct {
-	Incident Incident `json:"incident"`
-}
-
-type SnapshotResponse struct {
-	Snapshot string   `json:"snapshot"`
-	Incident Incident `json:"incident"`
-}
-
 type Deployment struct {
 	ID        string
 	Service   string
diff --git a/pkg/api/v2/types.go b/pkg/api/v2/types.go
index 9fd4a79..e14af86 100644
--- a/pkg/api/v2/types.go
+++ b/pkg/api/v2/types.go
@@ -119,6 +119,58 @@ type BlastRadiusResponse struct {
 	SampleTraces     []string `json:"sample_traces"`
 }
 
+type IncidentEvidence struct {
+	Kind       string         `json:"kind"`
+	Title      string         `json:"title"`
+	Detail     string         `json:"detail,omitempty"`
+	Service    string         `json:"service,omitempty"`
+	SignalID   string         `json:"signal_id,omitempty"`
+	DeployID   string         `json:"deployment_id,omitempty"`
+	TraceID    string         `json:"trace_id,omitempty"`
+	OccurredAt time.Time      `json:"occurred_at"`
+	Fields     map[string]any `json:"fields,omitempty"`
+}
+
+type Incident struct {
+	IncidentID              string             `json:"incident_id"`
+	Env                     string             `json:"env"`
+	Service                 string             `json:"service"`
+	ErrorFamily             ErrorFamily        `json:"error_family"`
+	Status                  string             `json:"status"`
+	Cause                   string             `json:"cause"`
+	Confidence              string             `json:"confidence"`
+	Severity                int                `json:"severity"`
+	StartedAt               time.Time          `json:"started_at"`
+	UpdatedAt               time.Time          `json:"updated_at"`
+	LastSeenAt              time.Time          `json:"last_seen_at"`
+	RecoveringAt            *time.Time         `json:"recovering_at,omitempty"`
+	ResolvedAt              *time.Time         `json:"resolved_at,omitempty"`
+	AffectedRequests        int                `json:"affected_requests"`
+	AffectedUsers           *int               `json:"affected_users,omitempty"`
+	AffectedServices        int                `json:"affected_services"`
+	TopServices             []string           `json:"top_services"`
+	SampleTraces            []string           `json:"sample_traces"`
+	Evidence                []IncidentEvidence `json:"evidence"`
+	NextChecks              []string           `json:"next_checks"`
+	InstrumentationWarnings []string           `json:"instrumentation_warnings,omitempty"`
+	Lift                    float64            `json:"lift"`
+	BaselineCount           int                `json:"baseline_count"`
+	CurrentCount            int                `json:"current_count"`
+}
+
+type IncidentListResponse struct {
+	Incidents []Incident `json:"incidents"`
+}
+
+type IncidentDetailResponse struct {
+	Incident Incident `json:"incident"`
+}
+
+type IncidentSnapshotResponse struct {
+	Snapshot string   `json:"snapshot"`
+	Incident Incident `json:"incident"`
+}
+
 func FormatErrorFamily(f ErrorFamily) string {
 	return escapeErrorFamilyPart(f.Service) + ":" + escapeErrorFamilyPart(f.Step) + ":" + escapeErrorFamilyPart(f.ErrorCode)
 }

From b8e9636d9e004a9ea60e76e6d7de1fe26ee095d2 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Tue, 5 May 2026 03:35:19 -0400
Subject: [PATCH 04/14] feat: auto-post demo signals for incident acceptance

Made the demo produce the full production-triage path from a single
Run traffic burst action.

Added a demo signal poster to the api-gateway burst path. The burst now posts
a checkout deploy signal and a payment dependency signal to /v1/signals using
INGEST_URL and WAYLOG_WRITE_KEY, then runs traffic as before. Signal failures
are reported in the burst summary but do not block traffic, so no-SQLite and
micro-demo style setups remain usable.

Seed each burst with up to six payment_502 requests before falling back to the
existing weighted traffic mix. This keeps the burst bounded and user-triggered
while making incident creation deterministic enough for demo acceptance.

Fix incident signal enrichment by querying signals across the incident env and
time window instead of filtering only to the primary service. This lets a
downstream payment dependency signal enrich checkout:payment.charge:PMT_502
incidents to high-confidence dependency classification.

Update the demo UI, README, and demo script copy to point evaluators at the
active incident flow. Extend demo acceptance to verify accepted signals,
active dependency incidents, incident detail, and text snapshots.

Tests cover signal posting, signal failure reporting, deterministic burst
seeding, downstream signal classification, UI copy, and acceptance JSON helpers.

Verification:
- go test ./examples/microdemo
- go test ./internal/incidents
- go test ./scripts/demo-acceptance-json
- go test ./examples/microdemo ./internal/incidents ./internal/cli/v2 ./internal/dashboard
- go test ./...
- go vet ./...
- bash -n scripts/demo.sh scripts/demo-acceptance.sh
- bash scripts/check-doc-links.sh
- git diff --check
---
 README.md                                 |   8 +-
 examples/cmd/api-gateway/main.go          |   4 +
 examples/microdemo/burst.go               |  22 +++-
 examples/microdemo/burst_test.go          |  63 ++++++++-
 examples/microdemo/gateway.go             |  11 ++
 examples/microdemo/signals.go             | 153 ++++++++++++++++++++++
 examples/microdemo/signals_test.go        |  84 ++++++++++++
 examples/microdemo/ui.html                |  13 +-
 examples/microdemo/ui_test.go             |   2 +
 internal/incidents/engine.go              |   6 +-
 internal/incidents/engine_test.go         |  57 ++++++++
 scripts/demo-acceptance-json/main.go      |  85 +++++++++++-
 scripts/demo-acceptance-json/main_test.go |  23 ++++
 scripts/demo-acceptance.sh                |  36 +++++
 scripts/demo.sh                           |   5 +-
 15 files changed, 553 insertions(+), 19 deletions(-)
 create mode 100644 examples/microdemo/signals.go
 create mode 100644 examples/microdemo/signals_test.go
 create mode 100644 scripts/demo-acceptance-json/main_test.go

diff --git a/README.md b/README.md
index 439170b..db85a6f 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,12 @@ Run `make demo` and see it yourself.
 make demo
 ```
 
-This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, and does not require Docker, Kafka, or the bridge process.
+This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, stores demo signals/incidents in local SQLite, and does not require Docker, Kafka, or the bridge process.
 
 Once the stack is up:
 
 1. Open demo controls at <http://localhost:9081/demo>, or open the dashboard at <http://localhost:8080/ui/>. The local demo disables dashboard login.
-2. Click **Run traffic burst** to fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
+2. Click **Run traffic burst** to post demo deploy/dependency signals and fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
    ```bash
    curl -s -X POST http://localhost:9081/purchase \
      -H 'Content-Type: application/json' \
@@ -54,6 +54,8 @@ Once the stack is up:
    ```
 3. Investigate with the v2 CLI:
    ```bash
+   ./waylog incidents
+   ./waylog incident <incident_id> --snapshot
    ./waylog errors --window 15m
    ./waylog explain <trace_id>
    ./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
@@ -315,4 +317,4 @@ Public alpha. APIs may break before 1.0.
 - No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
 - No multi-tenancy. One instance = one trust boundary.
 
-**Fastest walkthrough:** `make demo`, open <http://localhost:9081/demo>, click **Run traffic burst**, then use the dashboard or `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
+**Fastest walkthrough:** `make demo`, open <http://localhost:9081/demo>, click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
diff --git a/examples/cmd/api-gateway/main.go b/examples/cmd/api-gateway/main.go
index 12ca98e..24aa63d 100644
--- a/examples/cmd/api-gateway/main.go
+++ b/examples/cmd/api-gateway/main.go
@@ -17,6 +17,10 @@ func main() {
 
 	checkoutURL := config.Getenv("CHECKOUT_URL", "http://localhost:9082")
 	gateway := microdemo.NewGatewayHandler(checkoutURL)
+	gateway.SetSignalPoster(microdemo.NewDemoSignalPoster(
+		config.Getenv("INGEST_URL", "http://localhost:8080"),
+		config.Getenv("WAYLOG_WRITE_KEY", ""),
+	))
 
 	mux := http.NewServeMux()
 	mux.Handle("/purchase", gateway.PurchaseHandler())
diff --git a/examples/microdemo/burst.go b/examples/microdemo/burst.go
index 3c76258..cf7ccc4 100644
--- a/examples/microdemo/burst.go
+++ b/examples/microdemo/burst.go
@@ -14,6 +14,7 @@ import (
 const (
 	defaultBurstRequests    = 50
 	defaultBurstConcurrency = 10
+	incidentSeedPayments    = 6
 	maxBurstRequests        = 250
 	maxBurstConcurrency     = 50
 	maxBurstSamples         = 5
@@ -27,6 +28,7 @@ type BurstRequest struct {
 type BurstSummary struct {
 	Requested      BurstRequest   `json:"requested"`
 	Accepted       BurstRequest   `json:"accepted"`
+	Signals        []SignalResult `json:"signals,omitempty"`
 	DurationMs     int64          `json:"duration_ms"`
 	ByScenario     map[string]int `json:"by_scenario"`
 	OK             int            `json:"ok"`
@@ -87,6 +89,20 @@ func normalizeBurstRequest(raw BurstRequest) (requested, accepted BurstRequest)
 	return requested, accepted
 }
 
+func pickBurstScenarioForIndex(i, requests int) string {
+	if i < incidentSeedPaymentCount(requests) {
+		return ScenarioPayment502
+	}
+	return pickBurstScenario()
+}
+
+func incidentSeedPaymentCount(requests int) int {
+	if requests < incidentSeedPayments {
+		return requests
+	}
+	return incidentSeedPayments
+}
+
 func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) BurstSummary {
 	requested, accepted := normalizeBurstRequest(raw)
 	summary := BurstSummary{
@@ -112,11 +128,11 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs
 		// concurrency instead of stacking up `requests` blocked goroutines.
 		sem <- struct{}{}
 		wg.Add(1)
-		go func() {
+		scenario := pickBurstScenarioForIndex(i, accepted.Requests)
+		go func(scenario string) {
 			defer wg.Done()
 			defer func() { <-sem }()
 
-			scenario := pickBurstScenario()
 			payload, _ := json.Marshal(PurchaseRequest{
 				SKU:      "X1",
 				Scenario: scenario,
@@ -155,7 +171,7 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs
 					summary.SampleTraceIDs = append(summary.SampleTraceIDs, resp.TraceID)
 				}
 			}
-		}()
+		}(scenario)
 	}
 	wg.Wait()
 	summary.DurationMs = time.Since(start).Milliseconds()
diff --git a/examples/microdemo/burst_test.go b/examples/microdemo/burst_test.go
index eaa914b..8c49b87 100644
--- a/examples/microdemo/burst_test.go
+++ b/examples/microdemo/burst_test.go
@@ -1,6 +1,7 @@
 package microdemo
 
 import (
+	"context"
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
@@ -61,8 +62,16 @@ func TestRunBurstDispatchesEveryRequestThroughHandler(t *testing.T) {
 		if got := r.Header.Get("Content-Type"); got != "application/json" {
 			t.Fatalf("content-type = %q, want application/json", got)
 		}
+		var purchase PurchaseRequest
+		if err := json.NewDecoder(r.Body).Decode(&purchase); err != nil {
+			t.Fatalf("decode purchase: %v", err)
+		}
 		w.Header().Set("Content-Type", "application/json")
-		_, _ = w.Write([]byte(`{"success":true,"trace_id":"t","scenario":"happy"}`))
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"success":  purchase.Scenario == ScenarioHappy,
+			"trace_id": "trace-" + purchase.Scenario,
+			"scenario": purchase.Scenario,
+		})
 	})
 
 	summary := runBurst(t.Context(), dispatch, BurstRequest{Requests: 20, Concurrency: 4})
@@ -72,11 +81,25 @@ func TestRunBurstDispatchesEveryRequestThroughHandler(t *testing.T) {
 	if summary.Accepted.Requests != 20 || summary.Accepted.Concurrency != 4 {
 		t.Fatalf("accepted = %#v, want 20/4", summary.Accepted)
 	}
-	if summary.OK != 20 || summary.Errors != 0 || summary.Suppressed != 0 {
-		t.Fatalf("summary counts = ok:%d errors:%d suppressed:%d", summary.OK, summary.Errors, summary.Suppressed)
+	if summary.Errors < incidentSeedPayments {
+		t.Fatalf("errors = %d, want at least seeded payment failures %d", summary.Errors, incidentSeedPayments)
+	}
+	if summary.ByScenario[ScenarioPayment502] < incidentSeedPayments {
+		t.Fatalf("payment_502 count = %d, want at least %d", summary.ByScenario[ScenarioPayment502], incidentSeedPayments)
+	}
+	if summary.OK+summary.Errors+summary.Suppressed != 20 {
+		t.Fatalf("summary total = %d, want 20", summary.OK+summary.Errors+summary.Suppressed)
+	}
+}
+
+func TestPickBurstScenarioForIndexSeedsPaymentFailures(t *testing.T) {
+	for i := 0; i < incidentSeedPayments; i++ {
+		if got := pickBurstScenarioForIndex(i, 20); got != ScenarioPayment502 {
+			t.Fatalf("seed scenario[%d] = %q, want payment_502", i, got)
+		}
 	}
-	if summary.ByScenario[ScenarioHappy] != 20 {
-		t.Fatalf("happy count = %d, want 20", summary.ByScenario[ScenarioHappy])
+	if got := incidentSeedPaymentCount(3); got != 3 {
+		t.Fatalf("seed count = %d, want capped to request count 3", got)
 	}
 }
 
@@ -139,6 +162,28 @@ func TestServeBurstAppliesDefaultsWhenZero(t *testing.T) {
 	}
 }
 
+func TestServeBurstPostsDemoSignals(t *testing.T) {
+	gateway := NewGatewayHandler("http://checkout.example")
+	gateway.SetPurchaseHandler(okBurstDispatch())
+	gateway.SetSignalPoster(staticSignalPoster{results: []SignalResult{{
+		Type: "dependency", Service: "payment", Reason: "payment_gateway_5xx", Accepted: true, Status: http.StatusCreated, SignalID: "sig_demo",
+	}}})
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/demo/burst", strings.NewReader(`{"requests":1,"concurrency":1}`))
+	req.Header.Set("Content-Type", "application/json")
+	gateway.ServeBurst(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200: %s", rec.Code, rec.Body.String())
+	}
+	var summary BurstSummary
+	if err := json.Unmarshal(rec.Body.Bytes(), &summary); err != nil {
+		t.Fatalf("unmarshal summary: %v", err)
+	}
+	if len(summary.Signals) != 1 || !summary.Signals[0].Accepted || summary.Signals[0].SignalID != "sig_demo" {
+		t.Fatalf("signals = %+v", summary.Signals)
+	}
+}
+
 func serveBurstForTest(t *testing.T, body string) *httptest.ResponseRecorder {
 	t.Helper()
 	gateway := NewGatewayHandler("http://checkout.example")
@@ -150,6 +195,14 @@ func serveBurstForTest(t *testing.T, body string) *httptest.ResponseRecorder {
 	return rec
 }
 
+type staticSignalPoster struct {
+	results []SignalResult
+}
+
+func (p staticSignalPoster) PostDemoSignals(context.Context) []SignalResult {
+	return p.results
+}
+
 func okBurstDispatch() http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		var req PurchaseRequest
diff --git a/examples/microdemo/gateway.go b/examples/microdemo/gateway.go
index a6a7fac..f065440 100644
--- a/examples/microdemo/gateway.go
+++ b/examples/microdemo/gateway.go
@@ -30,6 +30,7 @@ type GatewayHandler struct {
 	checkoutURL string
 	client      *http.Client
 	purchase    http.Handler
+	signals     SignalPoster
 }
 
 type PurchaseRequest struct {
@@ -61,6 +62,11 @@ func (h *GatewayHandler) SetPurchaseHandler(handler http.Handler) {
 	h.purchase = handler
 }
 
+// SetSignalPoster overrides the signal poster used by /demo/burst. Test seam.
+func (h *GatewayHandler) SetSignalPoster(poster SignalPoster) {
+	h.signals = poster
+}
+
 func (h *GatewayHandler) ServeDemo(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "text/html")
 	_, _ = w.Write(uiHTML)
@@ -136,7 +142,12 @@ func (h *GatewayHandler) ServeBurst(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 
+	var signalResults []SignalResult
+	if h.signals != nil {
+		signalResults = h.signals.PostDemoSignals(r.Context())
+	}
 	summary := runBurst(r.Context(), h.purchase, req)
+	summary.Signals = signalResults
 	w.Header().Set("Content-Type", "application/json")
 	_ = json.NewEncoder(w).Encode(summary)
 }
diff --git a/examples/microdemo/signals.go b/examples/microdemo/signals.go
new file mode 100644
index 0000000..b7b6611
--- /dev/null
+++ b/examples/microdemo/signals.go
@@ -0,0 +1,153 @@
+package microdemo
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+const demoSignalTimeout = 2 * time.Second
+
+type SignalResult struct {
+	Type     string `json:"type"`
+	Service  string `json:"service"`
+	Reason   string `json:"reason"`
+	Accepted bool   `json:"accepted"`
+	Status   int    `json:"status,omitempty"`
+	SignalID string `json:"signal_id,omitempty"`
+	Error    string `json:"error,omitempty"`
+}
+
+type SignalPoster interface {
+	PostDemoSignals(ctx context.Context) []SignalResult
+}
+
+type DemoSignalPoster struct {
+	ingestURL string
+	apiKey    string
+	client    *http.Client
+	now       func() time.Time
+}
+
+func NewDemoSignalPoster(ingestURL, apiKey string) *DemoSignalPoster {
+	return &DemoSignalPoster{
+		ingestURL: strings.TrimRight(strings.TrimSpace(ingestURL), "/"),
+		apiKey:    strings.TrimSpace(apiKey),
+		client:    &http.Client{Timeout: demoSignalTimeout},
+		now:       func() time.Time { return time.Now().UTC() },
+	}
+}
+
+func (p *DemoSignalPoster) PostDemoSignals(ctx context.Context) []SignalResult {
+	specs := []demoSignalSpec{
+		{
+			Type:     "deploy",
+			Service:  "checkout",
+			Severity: "info",
+			Reason:   "demo_checkout_rollout",
+			Message:  "Demo checkout rollout before the payment dependency incident.",
+			Resource: map[string]any{"service": "checkout"},
+			Metadata: map[string]any{"version": "demo-v2.1", "demo": "traffic_burst"},
+		},
+		{
+			Type:     "dependency",
+			Service:  "payment",
+			Severity: "critical",
+			Reason:   "payment_gateway_5xx",
+			Message:  "Demo payment provider is returning intermittent 5xx responses.",
+			Resource: map[string]any{"service": "payment", "endpoint": "POST /charge"},
+			Metadata: map[string]any{"error_code": "PMT_502", "downstream": "payment", "demo": "traffic_burst"},
+		},
+	}
+
+	results := make([]SignalResult, 0, len(specs))
+	for _, spec := range specs {
+		results = append(results, p.postSignal(ctx, spec))
+	}
+	return results
+}
+
+func (p *DemoSignalPoster) postSignal(ctx context.Context, spec demoSignalSpec) SignalResult {
+	result := SignalResult{Type: spec.Type, Service: spec.Service, Reason: spec.Reason}
+	if p == nil || p.ingestURL == "" {
+		result.Error = "INGEST_URL is not configured"
+		return result
+	}
+
+	body, err := json.Marshal(spec.body(p.now()))
+	if err != nil {
+		result.Error = err.Error()
+		return result
+	}
+	reqCtx, cancel := context.WithTimeout(ctx, demoSignalTimeout)
+	defer cancel()
+	req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, p.ingestURL+"/v1/signals", bytes.NewReader(body))
+	if err != nil {
+		result.Error = err.Error()
+		return result
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if p.apiKey != "" {
+		req.Header.Set("X-API-Key", p.apiKey)
+	}
+
+	client := p.client
+	if client == nil {
+		client = &http.Client{Timeout: demoSignalTimeout}
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		result.Error = err.Error()
+		return result
+	}
+	defer resp.Body.Close()
+	result.Status = resp.StatusCode
+
+	raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
+	if resp.StatusCode != http.StatusCreated {
+		result.Error = fmt.Sprintf("signal POST returned HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
+		return result
+	}
+	var accepted struct {
+		Signal struct {
+			SignalID string `json:"signal_id"`
+		} `json:"signal"`
+	}
+	if err := json.Unmarshal(raw, &accepted); err != nil {
+		result.Error = "accepted signal response was not valid JSON: " + err.Error()
+		return result
+	}
+	result.Accepted = true
+	result.SignalID = accepted.Signal.SignalID
+	return result
+}
+
+type demoSignalSpec struct {
+	Type     string
+	Service  string
+	Severity string
+	Reason   string
+	Message  string
+	Resource map[string]any
+	Metadata map[string]any
+}
+
+func (s demoSignalSpec) body(ts time.Time) map[string]any {
+	return map[string]any{
+		"type":      s.Type,
+		"source":    "waylog-demo",
+		"service":   s.Service,
+		"env":       "demo",
+		"severity":  s.Severity,
+		"reason":    s.Reason,
+		"message":   s.Message,
+		"resource":  s.Resource,
+		"metadata":  s.Metadata,
+		"timestamp": ts.UTC(),
+	}
+}
diff --git a/examples/microdemo/signals_test.go b/examples/microdemo/signals_test.go
new file mode 100644
index 0000000..d1d530b
--- /dev/null
+++ b/examples/microdemo/signals_test.go
@@ -0,0 +1,84 @@
+package microdemo
+
+import (
+	"bytes"
+	"encoding/json"
+	"io"
+	"net/http"
+	"testing"
+	"time"
+)
+
+func TestDemoSignalPosterPostsDeployAndDependencySignals(t *testing.T) {
+	var posted []map[string]any
+	poster := NewDemoSignalPoster("http://ingest.example", "demo-write")
+	poster.client = &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
+		if r.URL.Path != "/v1/signals" {
+			t.Fatalf("path = %s, want /v1/signals", r.URL.Path)
+		}
+		if got := r.Header.Get("X-API-Key"); got != "demo-write" {
+			t.Fatalf("api key = %q, want demo-write", got)
+		}
+		var body map[string]any
+		if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+			t.Fatalf("decode signal: %v", err)
+		}
+		posted = append(posted, body)
+		raw, _ := json.Marshal(map[string]any{
+			"signal": map[string]any{"signal_id": "sig_" + body["type"].(string)},
+		})
+		return &http.Response{
+			StatusCode: http.StatusCreated,
+			Header:     http.Header{"Content-Type": []string{"application/json"}},
+			Body:       io.NopCloser(bytes.NewReader(raw)),
+		}, nil
+	})}
+	poster.now = func() time.Time { return time.Date(2026, 5, 5, 12, 0, 0, 0, time.UTC) }
+	results := poster.PostDemoSignals(t.Context())
+	if len(results) != 2 {
+		t.Fatalf("results len = %d, want 2", len(results))
+	}
+	for _, result := range results {
+		if !result.Accepted || result.SignalID == "" || result.Status != http.StatusCreated {
+			t.Fatalf("result = %+v", result)
+		}
+	}
+	if len(posted) != 2 {
+		t.Fatalf("posted len = %d, want 2", len(posted))
+	}
+	if posted[0]["type"] != "deploy" || posted[0]["service"] != "checkout" || posted[0]["env"] != "demo" {
+		t.Fatalf("deploy signal = %+v", posted[0])
+	}
+	if posted[1]["type"] != "dependency" || posted[1]["service"] != "payment" || posted[1]["reason"] != "payment_gateway_5xx" {
+		t.Fatalf("dependency signal = %+v", posted[1])
+	}
+	metadata, ok := posted[1]["metadata"].(map[string]any)
+	if !ok || metadata["error_code"] != "PMT_502" {
+		t.Fatalf("dependency metadata = %+v", posted[1]["metadata"])
+	}
+}
+
+func TestDemoSignalPosterReportsNonCreatedResponse(t *testing.T) {
+	poster := NewDemoSignalPoster("http://ingest.example", "")
+	poster.client = &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
+		return &http.Response{
+			StatusCode: http.StatusServiceUnavailable,
+			Body:       io.NopCloser(bytes.NewBufferString("set SQLITE_PATH to enable signals")),
+		}, nil
+	})}
+	results := poster.PostDemoSignals(t.Context())
+	if len(results) != 2 {
+		t.Fatalf("results len = %d, want 2", len(results))
+	}
+	for _, result := range results {
+		if result.Accepted || result.Status != http.StatusServiceUnavailable || result.Error == "" {
+			t.Fatalf("result = %+v", result)
+		}
+	}
+}
+
+type roundTripFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
+	return f(r)
+}
diff --git a/examples/microdemo/ui.html b/examples/microdemo/ui.html
index 30ede80..b5ac343 100644
--- a/examples/microdemo/ui.html
+++ b/examples/microdemo/ui.html
@@ -542,7 +542,7 @@ <h2>Scenarios</h2>
     <section class="section burst-row" aria-labelledby="burst-title">
       <div class="burst-meta">
         <h2 id="burst-title">Production-like traffic mix</h2>
-        <p class="muted small">Fires concurrent purchases through the same chain, with realistic checkout steps and failure cutoffs.</p>
+        <p class="muted small">Fires concurrent purchases, posts demo deploy/dependency signals, and creates incident context for the dashboard.</p>
       </div>
       <form id="burst-form" class="burst-form">
         <label><span>Requests</span><input type="number" name="requests" value="50" min="1" max="250" required></label>
@@ -671,7 +671,7 @@ <h2 id="result-title">Result</h2>
       };
       const result = document.getElementById("result");
       result.className = "result-empty";
-      result.textContent = "Running production-like traffic mix through the demo chain…";
+      result.textContent = "Posting demo signals and running production-like traffic through the checkout chain…";
       try {
         const resp = await fetch("/demo/burst", {
           method: "POST",
@@ -770,6 +770,11 @@ <h2 id="result-title">Result</h2>
       const samples = (summary.sample_trace_ids || []).slice(0, 3)
         .map(id => `<a class="button" href="${dashboardURL}#/explain/${encodeURIComponent(id)}">Explain ${esc(id.slice(0, 12))}…</a>`)
         .join("");
+      const signals = (summary.signals || []).map(signal => {
+        const cls = signal.accepted ? "" : "warn";
+        const label = signal.accepted ? "accepted" : (signal.error ? "unavailable" : "skipped");
+        return `<span class="burst-count"><span class="burst-dot ${cls}" aria-hidden="true"></span>${esc(signal.type)}:${esc(signal.service)} <strong>${esc(label)}</strong></span>`;
+      }).join("");
 
       const result = document.getElementById("result");
       result.className = "bracketed";
@@ -777,10 +782,12 @@ <h2 id="result-title">Result</h2>
         <div>
           <div class="eyebrow">Burst captured</div>
           <p class="muted mono">${reqLabel} · ${summary.accepted.concurrency}× · ${summary.duration_ms}ms</p>
+          <p class="muted small">Dashboard should now show an active incident with dependency evidence after the next incident tick.</p>
         </div>
         <div class="burst-counts">${counts}</div>
+        ${signals ? `<div class="burst-counts" aria-label="Signals">${signals}</div>` : ""}
         <div class="links">
-          <a class="button primary" href="${dashboardURL}#/errors">Open dashboard</a>
+          <a class="button primary" href="${dashboardURL}">Open dashboard</a>
           ${blastLinks}
           ${samples}
         </div>
diff --git a/examples/microdemo/ui_test.go b/examples/microdemo/ui_test.go
index 2607b0e..0b541d8 100644
--- a/examples/microdemo/ui_test.go
+++ b/examples/microdemo/ui_test.go
@@ -27,6 +27,8 @@ func TestDemoUIProductShowcaseCopy(t *testing.T) {
 		"Run checkout 500",
 		"Run traffic burst",
 		"Production-like traffic mix",
+		"posts demo deploy/dependency signals",
+		"active incident",
 		"Burst captured",
 		"Open dashboard",
 		"Explain this trace",
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
index 6eb6cba..11162c9 100644
--- a/internal/incidents/engine.go
+++ b/internal/incidents/engine.go
@@ -217,7 +217,7 @@ func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baseline
 		SearchFilter{Since: since, Until: now},
 		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
 	)
-	sigs, err := e.querySignals(ctx, row.ErrorFamily.Service, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	sigs, err := e.querySignals(ctx, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
 	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
 		return Incident{}, err
 	}
@@ -325,11 +325,11 @@ func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit
 	return out
 }
 
-func (e *Engine) querySignals(ctx context.Context, service, env string, since, until time.Time) ([]signals.Signal, error) {
+func (e *Engine) querySignals(ctx context.Context, env string, since, until time.Time) ([]signals.Signal, error) {
 	if e.signals == nil {
 		return nil, nil
 	}
-	return e.signals.Query(ctx, signals.Filter{Service: service, Env: env, Since: since, Until: until, Limit: 200})
+	return e.signals.Query(ctx, signals.Filter{Env: env, Since: since, Until: until, Limit: 200})
 }
 
 func (e *Engine) queryDeploys(ctx context.Context, service string, since, until time.Time) ([]Deployment, error) {
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
index 0b76de0..41403bc 100644
--- a/internal/incidents/engine_test.go
+++ b/internal/incidents/engine_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/sssmaran/WaylogCLI/internal/signals"
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
 )
@@ -78,6 +79,52 @@ func TestEngineLifecycleAndSampleStability(t *testing.T) {
 	}
 }
 
+func TestEngineUsesDownstreamDependencySignal(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily:    testFamily(),
+			Count:          6,
+			AffectedTraces: 6,
+			SampleTraces:   []string{"trace-a"},
+		}}},
+		blast: apiv2.BlastRadiusResponse{AffectedRequests: 6, AffectedServices: 2, TopServices: []string{"checkout", "payment"}},
+		events: []*eventv2.Event{
+			testIncidentEvent("e1", "trace-a", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+		},
+	}
+	signalStore := &fakeSignalStore{rows: []signals.Signal{{
+		SignalID:  "sig_payment",
+		Type:      signals.TypeDependency,
+		Service:   "payment",
+		Env:       "prod",
+		Severity:  signals.SeverityCritical,
+		Reason:    "payment_gateway_5xx",
+		Timestamp: now.Add(-2 * time.Minute),
+	}}}
+	engine := NewEngine(reader, signalStore, nil, NewMemoryStore(), Config{MinCount: 5, SampleLimit: 2}, nil, nil)
+	engine.now = func() time.Time { return now }
+	if err := engine.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	if err := engine.Tick(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, err := engine.Active(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("active incidents = %d, want 1", len(rows))
+	}
+	if rows[0].Cause != CauseDependency || rows[0].Confidence != ConfidenceHigh {
+		t.Fatalf("classification = %s/%s, want dependency/high", rows[0].Cause, rows[0].Confidence)
+	}
+	if len(signalStore.filters) != 1 || signalStore.filters[0].Service != "" || signalStore.filters[0].Env != "prod" {
+		t.Fatalf("signal filters = %+v", signalStore.filters)
+	}
+}
+
 type fakeReader struct {
 	current ErrorsResult
 	base    ErrorsResult
@@ -103,3 +150,13 @@ func (r *fakeReader) BlastRadius(_ SearchFilter, key apiv2.BlastKey) apiv2.Blast
 func (r *fakeReader) SearchEvents(_ SearchFilter, _ int) []*eventv2.Event {
 	return r.events
 }
+
+type fakeSignalStore struct {
+	rows    []signals.Signal
+	filters []signals.Filter
+}
+
+func (s *fakeSignalStore) Query(_ context.Context, f signals.Filter) ([]signals.Signal, error) {
+	s.filters = append(s.filters, f)
+	return s.rows, nil
+}
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index 4996ede..c8dd8f2 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -31,9 +31,32 @@ type searchEvent struct {
 	EventID string `json:"event_id"`
 }
 
+type burstSummary struct {
+	Signals []signalResult `json:"signals"`
+}
+
+type signalResult struct {
+	Type     string `json:"type"`
+	Service  string `json:"service"`
+	Reason   string `json:"reason"`
+	Accepted bool   `json:"accepted"`
+}
+
+type incidentsResponse struct {
+	Incidents []incident `json:"incidents"`
+}
+
+type incident struct {
+	IncidentID  string      `json:"incident_id"`
+	ErrorFamily errorFamily `json:"error_family"`
+	Cause       string      `json:"cause"`
+	Confidence  string      `json:"confidence"`
+	Status      string      `json:"status"`
+}
+
 func main() {
 	if len(os.Args) != 2 {
-		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|first-payment-trace|first-event-id>")
+		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id>")
 		os.Exit(2)
 	}
 
@@ -52,6 +75,16 @@ func main() {
 		fmt.Println(firstPaymentTrace(body))
 	case "first-event-id":
 		fmt.Println(firstEventID(body))
+	case "burst-signals-accepted":
+		if !burstSignalsAccepted(body) {
+			os.Exit(1)
+		}
+	case "has-dependency-incident":
+		if !hasDependencyIncident(body) {
+			os.Exit(1)
+		}
+	case "first-incident-id":
+		fmt.Println(firstIncidentID(body))
 	default:
 		fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
 		os.Exit(2)
@@ -100,3 +133,53 @@ func isPayment502(row errorRow) bool {
 		row.ErrorFamily.Step == "payment.charge" &&
 		row.ErrorFamily.ErrorCode == "PMT_502"
 }
+
+func burstSignalsAccepted(body []byte) bool {
+	var summary burstSummary
+	if err := json.Unmarshal(body, &summary); err != nil {
+		return false
+	}
+	seen := map[string]bool{}
+	for _, signal := range summary.Signals {
+		if signal.Accepted {
+			seen[signal.Type+":"+signal.Service+":"+signal.Reason] = true
+		}
+	}
+	return seen["deploy:checkout:demo_checkout_rollout"] &&
+		seen["dependency:payment:payment_gateway_5xx"]
+}
+
+func hasDependencyIncident(body []byte) bool {
+	var resp incidentsResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return false
+	}
+	for _, inc := range resp.Incidents {
+		if isPaymentFamily(inc.ErrorFamily) &&
+			inc.Cause == "dependency" &&
+			(inc.Confidence == "high" || inc.Confidence == "medium") &&
+			inc.Status == "active" {
+			return true
+		}
+	}
+	return false
+}
+
+func firstIncidentID(body []byte) string {
+	var resp incidentsResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return ""
+	}
+	for _, inc := range resp.Incidents {
+		if isPaymentFamily(inc.ErrorFamily) {
+			return inc.IncidentID
+		}
+	}
+	return ""
+}
+
+func isPaymentFamily(f errorFamily) bool {
+	return f.Service == "checkout" &&
+		f.Step == "payment.charge" &&
+		f.ErrorCode == "PMT_502"
+}
diff --git a/scripts/demo-acceptance-json/main_test.go b/scripts/demo-acceptance-json/main_test.go
new file mode 100644
index 0000000..7b3dfd6
--- /dev/null
+++ b/scripts/demo-acceptance-json/main_test.go
@@ -0,0 +1,23 @@
+package main
+
+import "testing"
+
+func TestBurstSignalsAccepted(t *testing.T) {
+	body := []byte(`{"signals":[
+		{"type":"deploy","service":"checkout","reason":"demo_checkout_rollout","accepted":true},
+		{"type":"dependency","service":"payment","reason":"payment_gateway_5xx","accepted":true}
+	]}`)
+	if !burstSignalsAccepted(body) {
+		t.Fatal("expected accepted deploy and dependency signals")
+	}
+}
+
+func TestDependencyIncidentHelpers(t *testing.T) {
+	body := []byte(`{"incidents":[{"incident_id":"inc_123","status":"active","cause":"dependency","confidence":"high","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"}}]}`)
+	if !hasDependencyIncident(body) {
+		t.Fatal("expected dependency incident")
+	}
+	if got := firstIncidentID(body); got != "inc_123" {
+		t.Fatalf("firstIncidentID = %q, want inc_123", got)
+	}
+}
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index d9fa462..590b315 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -31,6 +31,18 @@ json_first_event_id() {
   "$JSON_BIN" first-event-id
 }
 
+json_burst_signals_accepted() {
+  "$JSON_BIN" burst-signals-accepted
+}
+
+json_has_dependency_incident() {
+  "$JSON_BIN" has-dependency-incident
+}
+
+json_first_incident_id() {
+  "$JSON_BIN" first-incident-id
+}
+
 if [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
   fail "demo stack is not running. Start it with: make demo"
 fi
@@ -65,6 +77,9 @@ burst_status="$(curl -s -o /tmp/waylog-demo-burst.json -w "%{http_code}" \
 [[ "$burst_status" == "200" ]] || fail "traffic burst failed: HTTP $burst_status"
 echo "PASS: traffic burst captured (${REQUESTS} requests / ${CONCURRENCY} concurrency)"
 
+json_burst_signals_accepted </tmp/waylog-demo-burst.json || fail "demo burst did not accept deploy and dependency signals"
+echo "PASS: demo signals accepted"
+
 errors_json=""
 for _ in $(seq 1 12); do
   errors_json="$("${CLI[@]}" --json errors --window 15m --limit 10)" || fail "waylog errors failed"
@@ -95,4 +110,25 @@ event_id="$(json_first_event_id <<<"$search_json")"
 "${CLI[@]}" --json event "$event_id" >/dev/null || fail "waylog event failed for event $event_id"
 echo "PASS: waylog event"
 
+incidents_json=""
+for _ in $(seq 1 20); do
+  incidents_json="$("${CLI[@]}" --json incidents)" || fail "waylog incidents failed"
+  if json_has_dependency_incident <<<"$incidents_json"; then
+    break
+  fi
+  sleep 1
+done
+json_has_dependency_incident <<<"$incidents_json" || fail "dependency incident did not appear in /v1/incidents/active"
+echo "PASS: waylog incidents contains active dependency incident"
+
+incident_id="$(json_first_incident_id <<<"$incidents_json")"
+[[ -n "$incident_id" ]] || fail "no incident_id found for payment dependency incident"
+
+"${CLI[@]}" --json incident "$incident_id" >/dev/null || fail "waylog incident failed for incident $incident_id"
+echo "PASS: waylog incident"
+
+snapshot="$("${CLI[@]}" incident "$incident_id" --snapshot)" || fail "waylog incident snapshot failed for incident $incident_id"
+[[ "$snapshot" == *"payment.charge"* ]] || fail "incident snapshot did not mention payment.charge"
+echo "PASS: waylog incident snapshot"
+
 echo "Demo acceptance passed."
diff --git a/scripts/demo.sh b/scripts/demo.sh
index aab621d..d738520 100755
--- a/scripts/demo.sh
+++ b/scripts/demo.sh
@@ -31,6 +31,7 @@ else
   export WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
 fi
 export WAYLOG_V2_READS="${WAYLOG_V2_READS:-true}"
+export WAYLOG_INCIDENT_TICK_INTERVAL="${WAYLOG_INCIDENT_TICK_INTERVAL:-5s}"
 export EVENT_LOG_DIR="${EVENT_LOG_DIR:-${STATE_DIR}/eventlog}"
 export EVENT_LOG_V2_DIR="${EVENT_LOG_V2_DIR:-${STATE_DIR}/eventlog-v2}"
 export SNAPSHOT_PATH="${SNAPSHOT_PATH:-${STATE_DIR}/graph_snapshot.json}"
@@ -142,14 +143,16 @@ Open:
 
 How to demo it:
   1. Open Demo controls and click "Run traffic burst".
-  2. Open Dashboard and inspect errors, impact, and trace explanation.
+  2. Open Dashboard and inspect the active incident, errors, impact, and trace explanation.
   3. Or run: make demo-acceptance
 
 Useful CLI checks:
   ./waylog capabilities
   ./waylog recent --limit 5
+  ./waylog incidents
   ./waylog errors --window 15m
   ./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
+  ./waylog incident <incident_id> --snapshot
 
 Logs:
   ${LOG_DIR}

From 2f2a0068c7ea77888408470ac96f1357c7f77fd0 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Thu, 7 May 2026 16:28:47 -0400
Subject: [PATCH 05/14] feat: added deterministic incident triage artifact
 Added TriageReport v1 with CLI, read endpoint, and agent tool surfaces.

- added pkg/triage schema, validation, and canonical hash
- added internal triage engine and production adapters
- register triage_incident tool
- added GET /v1/triage/{incident_id}
- add waylog triage command and renderer
- updated OpenAPI, README, and demo acceptance coverage
---
 README.md                                 |  34 +-
 cmd/ingest/main.go                        |  33 +-
 docs/openapi.yaml                         | 152 +++++++-
 internal/cli/v2/client.go                 |  13 +
 internal/cli/v2/client_test.go            |  29 ++
 internal/cli/v2/cmd.go                    |  55 ++-
 internal/cli/v2/cmd_test.go               |  31 ++
 internal/cli/v2/render.go                 |  38 ++
 internal/cli/v2/render_test.go            |  28 ++
 internal/cli/v2/types.go                  |   8 +
 internal/ingest/triage_route_test.go      |  92 +++++
 internal/tools/triage.go                  |  57 +++
 internal/tools/triage_test.go             |  92 +++++
 internal/triage/adapter.go                | 272 +++++++++++++
 internal/triage/adapter_test.go           | 454 ++++++++++++++++++++++
 internal/triage/engine.go                 | 139 +++++++
 internal/triage/engine_test.go            | 317 +++++++++++++++
 internal/triage/idempotency_test.go       |  62 +++
 internal/triage/options.go                |  28 ++
 internal/triage/options_test.go           |  50 +++
 internal/triagehttp/handler.go            |  66 ++++
 internal/triagehttp/handler_test.go       | 141 +++++++
 pkg/triage/report.go                      | 102 +++++
 pkg/triage/report_test.go                 | 137 +++++++
 scripts/demo-acceptance-json/main.go      |  16 +-
 scripts/demo-acceptance-json/main_test.go |  10 +
 scripts/demo-acceptance.sh                |  15 +
 27 files changed, 2453 insertions(+), 18 deletions(-)
 create mode 100644 internal/ingest/triage_route_test.go
 create mode 100644 internal/tools/triage.go
 create mode 100644 internal/tools/triage_test.go
 create mode 100644 internal/triage/adapter.go
 create mode 100644 internal/triage/adapter_test.go
 create mode 100644 internal/triage/engine.go
 create mode 100644 internal/triage/engine_test.go
 create mode 100644 internal/triage/idempotency_test.go
 create mode 100644 internal/triage/options.go
 create mode 100644 internal/triage/options_test.go
 create mode 100644 internal/triagehttp/handler.go
 create mode 100644 internal/triagehttp/handler_test.go
 create mode 100644 pkg/triage/report.go
 create mode 100644 pkg/triage/report_test.go

diff --git a/README.md b/README.md
index db85a6f..e5b7712 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 </pre></div>
 
 <p align="center">
-  <strong>Structured logging that explains failure propagation.</strong><br>
+  <strong>Structured logging that explains failed requests and active incidents.</strong><br>
   Drop-in SDKs (Go, TypeScript) or OTLP/HTTP. Agent-native by design.
 </p>
 
@@ -11,14 +11,14 @@
 </p>
 
 <p align="center">
-  <em>Public alpha — an impact-analysis engine for backend systems built on WideEvents.</em>
+  <em>Public alpha — request triage plus signal-driven incident triage for backend systems.</em>
 </p>
 
 ---
 
 ## What Waylog does
 
-A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened:
+A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened in the request, then groups repeated failures into an incident with signal-backed cause evidence:
 
 ```text
   trace 7f3a2b9c…   flow=purchase   user=standard   region=us-east-1
@@ -31,7 +31,7 @@ A request hits your API gateway, fans out to three services, and one of them fai
   blast radius:  12 requests · 8 users · 4 services
 ```
 
-This is not log search. Waylog builds a live in-memory graph from every request flowing through your services. When you ask a question — "why did this trace fail?", "who is affected by `PMT_502`?", "what changed in the last 10 minutes?" — it walks the graph and returns a precomputed, structured answer. Root-cause rollups count the originating failure once, not once per propagated hop.
+This is not log search, metrics storage, or incident management. Waylog builds request-triage views from WideEvents, accepts production-context signals such as deploys and dependency health, and returns deterministic answers for "why did this trace fail?", "what incident is active?", and "who is affected by `PMT_502`?". Root-cause rollups count the originating failure once, not once per propagated hop.
 
 Run `make demo` and see it yourself.
 
@@ -60,9 +60,10 @@ Once the stack is up:
    ./waylog explain <trace_id>
    ./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
    ./waylog blast --code PMT_502 --window 15m
+   ./waylog triage <incident_id>
    ```
 
-The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
+The traffic burst posts fresh demo deploy/dependency signals on each run so the incident panel has evidence to attach. The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
 
 Stop with `make demo-stop`.
 
@@ -72,8 +73,9 @@ Prefer Docker? Use `make docker-dev` / `make docker-down`. Prefer foreground ser
 ## How it works
 
 1. **Capture** — services emit [WideEvents](docs/waylog-sdk-contract.md) via the Go or TypeScript SDK, or push OpenTelemetry spans to `/v1/otlp/v1/traces`. Every event is durably logged (WAL + fsync) before it enters the derived read models.
-2. **Analyze** — the ingest server projects completed execution segments into request, service, error, user, and trace views. Deterministic tools answer specific questions: propagation chain, blast radius, what-changed, deploy correlation.
-3. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views through the same tool registry. Every answer is also callable by agents as a structured tool with idempotency keys.
+2. **Signal** — deploy systems, dependency monitors, or operators post small production-context facts to `/v1/signals`.
+3. **Triage** — the ingest server projects request views (`recent`, `errors`, `explain`, `blast`) and opens incidents when error families spike against overlapping signals.
+4. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views. Primary incident surfaces are `waylog incidents`, `waylog incident <id>`, `/v1/incidents/*`, and the dashboard incident cards.
 
 ## Get traces in
 
@@ -206,7 +208,7 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
 
 ### Analysis tools
 
-All ten tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
 
 | Tool               | Answers                                                       |
 | ------------------ | ------------------------------------------------------------- |
@@ -220,6 +222,7 @@ All ten tools are deterministic, idempotent, and available via CLI, REST `/v1/to
 | `graph_query`      | DSL query over the graph (`expr` + `window`)                  |
 | `compare_windows`  | Diff error rates between two windows                          |
 | `graph_insights`   | Windowed rollup of top errors and patterns                    |
+| `triage_incident`  | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
 
 Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
 
@@ -231,6 +234,7 @@ The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs a
 - `#/errors` — top error families over `/v1/errors`
 - `#/explain/<id>` — first observable failing step over `/v1/traces/story`
 - `#/blast/<key>` — impact panel over `/v1/blast_radius`
+- `#/incident/<id>` — incident evidence and next checks over `/v1/incidents/{id}`
 - recent-request stream from `/v1/traces/recent`, polled every 5s
 - no Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts
 
@@ -243,8 +247,8 @@ Go / TS services (SDK) · OTLP/HTTP collectors
         ▼
   ingest server
     ├─ event log (append-only WAL, source of truth)
-    ├─ derived read models (errors · explain · blast · recent traces)
-    ├─ SQLite cold store (events · deployments · causal claims)
+    ├─ derived read models (errors · explain · blast · recent traces · incidents)
+    ├─ SQLite cold store (events · deployments · signals · incidents · causal claims)
     ├─ tool registry · Ask · plan execution
     └─ v2 dashboard · health · metrics · OpenAPI
         │
@@ -275,7 +279,7 @@ Waylog uses three scoped keys. They are independent — the dashboard never hold
 
 | Key                | Protects                                              |
 | ------------------ | ----------------------------------------------------- |
-| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces` (SDKs, collectors) |
+| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces`, `/v1/signals` (SDKs, collectors, production signals) |
 | `WAYLOG_READ_KEY`  | Read APIs, dashboard session                          |
 | `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*`               |
 
@@ -292,12 +296,13 @@ Public alpha. APIs may break before 1.0.
 - durable ingest with WAL + replay
 - hot graph with flattened 3-node model + dedicated trace store
 - schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
-- SQLite cold store (events, deployments, causal claims)
+- SQLite cold store (events, deployments, signals, incidents, causal claims)
+- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, and dashboard incident cards
 - 10 deterministic analysis tools, rollup-correct root-cause attribution
 - agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
 - `/v1/traces/story` and indented failure-path rendering in the dashboard
 - dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
+- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
 - live TUI (`waylog-live --dev` streams via SSE), MCP stdio
 - scoped auth (write/read/agent) with startup validation
 
@@ -314,7 +319,10 @@ Public alpha. APIs may break before 1.0.
 - OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
 - Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
 - SQLite cold store fits demos and small deployments; not sized for production-scale retention.
+- Signal and incident records are SQLite-backed; they do not use the event WAL/replay path.
+- Incident cause classification is deterministic and heuristic. `runtime` signals are accepted but do not produce a `runtime` cause label yet.
 - No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
 - No multi-tenancy. One instance = one trust boundary.
+- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
 
 **Fastest walkthrough:** `make demo`, open <http://localhost:9081/demo>, click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 650f58d..7d76092 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -36,6 +36,8 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/signals"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
 	"github.com/sssmaran/WaylogCLI/internal/tracestore"
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	"github.com/sssmaran/WaylogCLI/internal/triagehttp"
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
 )
@@ -412,8 +414,9 @@ func main() {
 		if incidentsEnabled {
 			if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok {
 				incidentStore := coldstore.NewIncidentStore(sqlite)
+				incReader := incidentReaderAdapter{reader: v2Reader}
 				incidentEngine = incidents.NewEngine(
-					incidentReaderAdapter{reader: v2Reader},
+					incReader,
 					signalStore,
 					coldDeployAdapter{store: sqlite},
 					incidentStore,
@@ -429,6 +432,34 @@ func main() {
 				mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
 				mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
 				ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine})
+
+				// Triage engine: deterministic TriageReport build for a given
+				// incident. Reuses the same v2Reader-backed adapter for blast
+				// queries, the live graph + trace store for first-failure
+				// stories, and the configured signal store. Read-scope auth.
+				triageEng, err := triage.NewEngine(triage.Deps{
+					Incidents: triage.NewIncidentLookupAdapter(incidentEngine),
+					Blast:     triage.NewBlastQueryAdapter(incReader),
+					Story: triage.NewStoryBuilderAdapter(
+						incidentEngine,
+						func(traceID string) (apiv2.StoryResponse, bool) {
+							return v2Reader.TraceStoryByTraceID(traceID)
+						},
+					),
+					Signals:    triage.NewSignalQueryAdapter(signalStore),
+					NextChecks: triage.NewNextChecksAdapter(),
+				})
+				if err != nil {
+					slog.Error("triage engine init failed", "err", err)
+					os.Exit(1)
+				}
+				if err := tools.RegisterTriageTool(reg, triageEng); err != nil {
+					slog.Error("triage tool register failed", "err", err)
+					os.Exit(1)
+				}
+				triageHandler := triagehttp.NewHandler(triageEng)
+				mux.Handle("/v1/triage/", readCORS(triageHandler.Triage))
+
 				incidentRunning = true
 				slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
 			} else {
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index ce503dc..d60eb4d 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -7,7 +7,9 @@ info:
 
     The primary product path is schema-2.0 ingest plus the v2 read APIs used by
     the operator CLI and embedded dashboard: recent traces, error families,
-    trace story, blast radius, event search, and direct trace/event lookup.
+    trace story, blast radius, incidents, event search, and direct trace/event
+    lookup. `/v1/insight` is a compatibility endpoint; new clients should use
+    `/v1/incidents/*`.
 
     Write endpoints require write-scope auth when auth is configured. Read
     endpoints require read-scope auth when read keys are configured.
@@ -485,7 +487,9 @@ paths:
       tags: [Triage]
       operationId: getIncidentSnapshot
       summary: Render an incident snapshot
-      description: Defaults to text/plain. Send Accept: application/json to receive the snapshot text plus the incident object.
+      description: |
+        Defaults to text/plain. Send Accept: application/json to receive the
+        snapshot text plus the incident object.
       security:
         - ApiKeyHeader: []
         - BearerAuth: []
@@ -508,6 +512,81 @@ paths:
         '405':
           description: Method Not Allowed
 
+  /v1/triage/{incident_id}:
+    get:
+      tags: [Triage]
+      operationId: getTriageReport
+      summary: Build a deterministic TriageReport for an open incident
+      description: |
+        Returns a structured TriageReport v1 (incident_ref, blast_snapshot,
+        first_failure, sample_traces, signals, next_checks, confidence,
+        report_hash). Same builder backs `POST /v1/tools/triage_incident`;
+        both surfaces produce identical `report_hash` for the same input.
+        Read-scope auth.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      parameters:
+        - name: incident_id
+          in: path
+          required: true
+          schema: {type: string}
+        - name: window
+          in: query
+          required: false
+          schema: {type: string, default: "15m"}
+          description: Go duration string (e.g. 15m, 1h). Default 15m.
+        - name: snapshot
+          in: query
+          required: false
+          schema: {type: boolean, default: false}
+          description: |
+            When true, freeze evaluation bounds to the incident's started_at
+            and updated_at instead of using wall-clock now.
+      responses:
+        '200':
+          description: TriageReport v1
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/TriageReport'
+        '400':
+          description: Missing or invalid parameters
+        '401':
+          description: Unauthorized
+        '500':
+          description: Triage build failed
+
+  /v1/insight:
+    get:
+      tags: [Operational]
+      operationId: getCompatibilityInsight
+      summary: Compatibility anomaly insight
+      deprecated: true
+      description: |
+        Compatibility endpoint for older dashboard/tool consumers. When the
+        v2.1 incident engine is running, this projects the top active incident
+        into the legacy insight shape. Otherwise it falls back to the legacy
+        detector. New clients should use `/v1/incidents/active` and
+        `/v1/incidents/{id}`.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      responses:
+        '200':
+          description: Compatibility insight object
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties: true
+        '204':
+          description: No active insight
+        '401':
+          description: Unauthorized
+        '405':
+          description: Method Not Allowed
+
   /v1/capabilities:
     get:
       tags: [Capabilities]
@@ -1689,6 +1768,75 @@ components:
         incident:
           $ref: '#/components/schemas/Incident'
 
+    TriageReport:
+      type: object
+      required: [schema_version, incident_ref, blast_snapshot, confidence, generated_at, report_hash]
+      description: |
+        Deterministic triage artifact for an open incident. Versioned via
+        schema_version. report_hash is sha256 over the canonical JSON
+        excluding generated_at, plan_run_id, and report_hash itself.
+      properties:
+        schema_version:
+          type: string
+          enum: ["triage.v1"]
+        incident_ref:
+          type: object
+          required: [id]
+          properties:
+            id: {type: string}
+            window: {type: string, description: "Go duration string e.g. 15m0s"}
+        blast_snapshot:
+          type: object
+          properties:
+            requests: {type: integer}
+            users: {type: integer}
+            services: {type: integer}
+            top_error_families:
+              type: array
+              items:
+                type: object
+                properties:
+                  service: {type: string}
+                  step: {type: string}
+                  error_code: {type: string}
+                  count: {type: integer}
+        first_failure:
+          type: object
+          additionalProperties: true
+          description: Full /v1/traces/story payload for the first observed failing step.
+        sample_traces:
+          type: array
+          items:
+            type: object
+            properties:
+              trace_id: {type: string}
+              summary: {type: string}
+        signals:
+          type: array
+          items:
+            type: object
+            properties:
+              id: {type: string}
+              type: {type: string}
+              evidence_ids: {type: array, items: {type: string}}
+        next_checks:
+          type: array
+          items:
+            type: object
+            properties:
+              id: {type: string}
+              prompt: {type: string}
+        confidence:
+          type: string
+          enum: [low, medium, high]
+        generated_at: {type: string}
+        plan_run_id:
+          type: string
+          description: Set only when produced via /v1/plans/execute.
+        report_hash:
+          type: string
+          description: "sha256:<hex>"
+
     CapabilitiesResponse:
       type: object
       example:
diff --git a/internal/cli/v2/client.go b/internal/cli/v2/client.go
index 5430945..d44f3dd 100644
--- a/internal/cli/v2/client.go
+++ b/internal/cli/v2/client.go
@@ -169,6 +169,19 @@ func (c *Client) IncidentSnapshotJSON(ctx context.Context, incidentID string) (I
 	return out, err
 }
 
+func (c *Client) Triage(ctx context.Context, id string, p TriageParams) (*TriageReport, error) {
+	q := url.Values{}
+	addQuery(q, "window", p.Window)
+	if p.Snapshot {
+		q.Set("snapshot", "true")
+	}
+	var rep TriageReport
+	if err := c.do(ctx, "/v1/triage/"+url.PathEscape(id), q, &rep); err != nil {
+		return nil, err
+	}
+	return &rep, nil
+}
+
 func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchResponse, error) {
 	q := url.Values{}
 	addQuery(q, "error_code", p.ErrorCode)
diff --git a/internal/cli/v2/client_test.go b/internal/cli/v2/client_test.go
index 8d7d621..bbe73ad 100644
--- a/internal/cli/v2/client_test.go
+++ b/internal/cli/v2/client_test.go
@@ -71,3 +71,32 @@ func containsQuery(raw, want string) bool {
 	}
 	return false
 }
+
+func TestClientTriageBuildsExpectedURL(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/v1/triage/inc_abc" {
+			t.Fatalf("path = %q", r.URL.Path)
+		}
+		if r.URL.Query().Get("snapshot") != "true" {
+			t.Fatalf("snapshot query missing")
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{
+			"schema_version":"triage.v1",
+			"incident_ref":{"id":"inc_abc","window":"15m"},
+			"confidence":"medium",
+			"generated_at":"t",
+			"report_hash":"sha256:x"
+		}`))
+	}))
+	defer srv.Close()
+
+	c := NewClient(ClientConfig{BaseURL: srv.URL, APIKey: "test-key", Timeout: 5 * time.Second})
+	rep, err := c.Triage(context.Background(), "inc_abc", TriageParams{Snapshot: true})
+	if err != nil {
+		t.Fatalf("triage: %v", err)
+	}
+	if rep.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("got id %q", rep.IncidentRef.ID)
+	}
+}
diff --git a/internal/cli/v2/cmd.go b/internal/cli/v2/cmd.go
index 87c3fae..9c2ba9e 100644
--- a/internal/cli/v2/cmd.go
+++ b/internal/cli/v2/cmd.go
@@ -12,7 +12,7 @@ import (
 	"time"
 )
 
-const version = "v2-phase-2"
+const version = "v2.1-triage"
 
 type cliConfig struct {
 	addr    string
@@ -65,6 +65,8 @@ func RunCLI(args []string, _ io.Reader, stdout, stderr io.Writer) int {
 		return runBlast(ctx, client, cfg, rest[1:], stdout, stderr)
 	case "search":
 		return runSearch(ctx, client, cfg, rest[1:], stdout, stderr)
+	case "triage":
+		return runTriage(ctx, client, cfg, rest[1:], stdout, stderr)
 	default:
 		fmt.Fprintf(stderr, "unknown command: %s\n", rest[0])
 		printUsage(stderr)
@@ -244,6 +246,57 @@ func parseIncidentArgs(args []string) (string, bool, error) {
 	return incidentID, snapshot, nil
 }
 
+func runTriage(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+	id, window, snapshot, err := parseTriageArgs(args)
+	if err != nil {
+		return usage(stderr, err.Error())
+	}
+	if gate := requireV2Reads(ctx, client, stderr); gate != 0 {
+		return gate
+	}
+	rep, err := client.Triage(ctx, id, TriageParams{Window: window, Snapshot: snapshot})
+	if err != nil {
+		fmt.Fprintln(stderr, err)
+		return exitCodeForError(err)
+	}
+	if cfg.json {
+		if err := renderJSON(stdout, rep); err != nil {
+			fmt.Fprintln(stderr, err)
+			return 2
+		}
+		return 0
+	}
+	return RenderTriage(stdout, rep)
+}
+
+func parseTriageArgs(args []string) (id, window string, snapshot bool, err error) {
+	for i := 0; i < len(args); i++ {
+		arg := args[i]
+		switch {
+		case arg == "--snapshot":
+			snapshot = true
+		case arg == "--window":
+			if i+1 >= len(args) {
+				return "", "", false, fmt.Errorf("--window requires a value")
+			}
+			window = args[i+1]
+			i++
+		case strings.HasPrefix(arg, "--window="):
+			window = strings.TrimPrefix(arg, "--window=")
+		case strings.HasPrefix(arg, "-"):
+			return "", "", false, fmt.Errorf("unknown flag: %s", arg)
+		case id == "":
+			id = arg
+		default:
+			return "", "", false, fmt.Errorf("unexpected argument: %s", arg)
+		}
+	}
+	if id == "" {
+		return "", "", false, fmt.Errorf("usage: waylog triage <incident_id> [--window 15m] [--snapshot]")
+	}
+	return id, window, snapshot, nil
+}
+
 func runEvent(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
 	if len(args) != 1 {
 		return usage(stderr, "usage: waylog event <event_id> [--json]")
diff --git a/internal/cli/v2/cmd_test.go b/internal/cli/v2/cmd_test.go
index 0cf256f..51c39fb 100644
--- a/internal/cli/v2/cmd_test.go
+++ b/internal/cli/v2/cmd_test.go
@@ -309,3 +309,34 @@ func TestRunCLIUsage(t *testing.T) {
 		t.Fatalf("code=%d stderr=%q", code, stderr.String())
 	}
 }
+
+func TestParseTriageArgs(t *testing.T) {
+	cases := []struct {
+		name     string
+		in       []string
+		wantID   string
+		wantSnap bool
+		wantWin  string
+		wantErr  bool
+	}{
+		{"id only", []string{"inc_abc"}, "inc_abc", false, "", false},
+		{"id + snapshot", []string{"inc_abc", "--snapshot"}, "inc_abc", true, "", false},
+		{"id + window", []string{"inc_abc", "--window", "30m"}, "inc_abc", false, "30m", false},
+		{"id + window=30m", []string{"inc_abc", "--window=30m"}, "inc_abc", false, "30m", false},
+		{"missing id", []string{}, "", false, "", true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			id, win, snap, err := parseTriageArgs(tc.in)
+			if (err != nil) != tc.wantErr {
+				t.Fatalf("err=%v wantErr=%v", err, tc.wantErr)
+			}
+			if err != nil {
+				return
+			}
+			if id != tc.wantID || snap != tc.wantSnap || win != tc.wantWin {
+				t.Fatalf("got id=%q win=%q snap=%v want %q %q %v", id, win, snap, tc.wantID, tc.wantWin, tc.wantSnap)
+			}
+		})
+	}
+}
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index bf9b574..e0bed00 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -357,3 +357,41 @@ func formatTime(t time.Time) string {
 	}
 	return t.Format(time.RFC3339)
 }
+
+func RenderTriage(w io.Writer, rep *TriageReport) int {
+	fmt.Fprintf(w, "Triage report  incident=%s  window=%s  confidence=%s\n",
+		rep.IncidentRef.ID, rep.IncidentRef.Window, rep.Confidence)
+	fmt.Fprintf(w, "  hash: %s\n\n", rep.ReportHash)
+
+	fmt.Fprintln(w, "Blast")
+	fmt.Fprintf(w, "  requests=%d  users=%d  services=%d\n",
+		rep.BlastSnapshot.Requests, rep.BlastSnapshot.Users, rep.BlastSnapshot.Services)
+	for _, f := range rep.BlastSnapshot.TopErrorFamilies {
+		fmt.Fprintf(w, "  %s/%s/%s  count=%d\n", f.Service, f.Step, f.ErrorCode, f.Count)
+	}
+	fmt.Fprintln(w)
+
+	if len(rep.SampleTraces) > 0 {
+		fmt.Fprintln(w, "Sample traces")
+		for _, s := range rep.SampleTraces {
+			fmt.Fprintf(w, "  %s  %s\n", s.TraceID, s.Summary)
+		}
+		fmt.Fprintln(w)
+	}
+
+	if len(rep.Signals) > 0 {
+		fmt.Fprintln(w, "Signals")
+		for _, s := range rep.Signals {
+			fmt.Fprintf(w, "  %s  type=%s  evidence=%v\n", s.ID, s.Type, s.EvidenceIDs)
+		}
+		fmt.Fprintln(w)
+	}
+
+	if len(rep.NextChecks) > 0 {
+		fmt.Fprintln(w, "Next checks")
+		for _, c := range rep.NextChecks {
+			fmt.Fprintf(w, "  - %s\n", c.Prompt)
+		}
+	}
+	return 0
+}
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 0d4a755..a60e4cc 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -9,6 +9,7 @@ import (
 
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+	triage "github.com/sssmaran/WaylogCLI/pkg/triage"
 )
 
 func TestRenderStoryPinsObservableLanguage(t *testing.T) {
@@ -160,3 +161,30 @@ func TestRenderNextCursor(t *testing.T) {
 		t.Fatalf("output=%s", out.String())
 	}
 }
+
+func TestRenderTriageHeaderAndSections(t *testing.T) {
+	rep := &TriageReport{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_abc", Window: "15m"},
+		BlastSnapshot: triage.BlastSnapshot{
+			Requests: 12, Users: 8, Services: 4,
+			TopErrorFamilies: []triage.ErrorFamily{
+				{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+			},
+		},
+		Signals:    []triage.SignalRef{{ID: "sig_1", Type: "deploy"}},
+		NextChecks: []triage.NextCheck{{ID: "check_payment_health", Prompt: "Verify payment-service health"}},
+		Confidence: triage.ConfidenceMedium,
+		ReportHash: "sha256:abc",
+	}
+	var buf bytes.Buffer
+	if rc := RenderTriage(&buf, rep); rc != 0 {
+		t.Fatalf("render returned %d", rc)
+	}
+	out := buf.String()
+	for _, want := range []string{"inc_abc", "PMT_502", "deploy", "Verify payment-service health", "sha256:abc"} {
+		if !strings.Contains(out, want) {
+			t.Fatalf("output missing %q\noutput:\n%s", want, out)
+		}
+	}
+}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 1a940de..58fcde9 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -5,6 +5,7 @@ import (
 
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
 )
 
 type CapabilitiesResponse struct {
@@ -85,3 +86,10 @@ type ClientConfig struct {
 	APIKey  string
 	Timeout time.Duration
 }
+
+type TriageParams struct {
+	Window   string
+	Snapshot bool
+}
+
+type TriageReport = pkgtriage.Report
diff --git a/internal/ingest/triage_route_test.go b/internal/ingest/triage_route_test.go
new file mode 100644
index 0000000..de1f4a7
--- /dev/null
+++ b/internal/ingest/triage_route_test.go
@@ -0,0 +1,92 @@
+package ingest_test
+
+// Integration test for Task 11: verifies that the /v1/triage/{id} route is
+// dispatched to the triage handler when wired into a ServeMux the same way
+// cmd/ingest/main.go wires it. The Server type does not own this route
+// (cmd/ingest/main.go composes the mux directly), so this test reproduces
+// the exact mount pattern with stubbed triage dependencies.
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	"github.com/sssmaran/WaylogCLI/internal/triagehttp"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestTriageRouteDispatchesToHandler(t *testing.T) {
+	eng, err := triage.NewEngine(triage.Deps{
+		Incidents:  stubTriageIncidents{},
+		Blast:      stubTriageBlast{},
+		Story:      stubTriageStory{},
+		Signals:    stubTriageSignals{},
+		NextChecks: stubTriageNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	})
+	if err != nil {
+		t.Fatalf("NewEngine: %v", err)
+	}
+	h := triagehttp.NewHandler(eng)
+
+	// Mirror cmd/ingest/main.go: mux.Handle("/v1/triage/", readCORS(h.Triage)).
+	// We omit auth here because the auth wrapper is exercised elsewhere; this
+	// test verifies the dispatch wiring (path → handler).
+	mux := http.NewServeMux()
+	mux.Handle("/v1/triage/", http.HandlerFunc(h.Triage))
+
+	srv := httptest.NewServer(mux)
+	t.Cleanup(srv.Close)
+
+	resp, err := http.Get(srv.URL + "/v1/triage/inc_abc")
+	if err != nil {
+		t.Fatalf("GET: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusNotFound {
+		t.Fatalf("route not registered (404)")
+	}
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "json") {
+		t.Fatalf("Content-Type = %q, want json", ct)
+	}
+}
+
+// --- stub dependencies ---
+
+type stubTriageIncidents struct{}
+
+func (stubTriageIncidents) GetIncident(_ context.Context, id string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+type stubTriageBlast struct{}
+
+func (stubTriageBlast) BlastSnapshot(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+	return triage.BlastSnapshotResult{}, nil
+}
+
+type stubTriageStory struct{}
+
+func (stubTriageStory) FirstFailureStory(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.FirstFailureResult, error) {
+	return triage.FirstFailureResult{}, nil
+}
+
+type stubTriageSignals struct{}
+
+func (stubTriageSignals) SignalsFor(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) ([]triage.SignalEvidence, error) {
+	return nil, nil
+}
+
+type stubTriageNextChecks struct{}
+
+func (stubTriageNextChecks) NextChecks(_ context.Context, _ triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+	return nil, nil
+}
diff --git a/internal/tools/triage.go b/internal/tools/triage.go
new file mode 100644
index 0000000..7ad969d
--- /dev/null
+++ b/internal/tools/triage.go
@@ -0,0 +1,57 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+const triageInputSchema = `{
+  "type": "object",
+  "required": ["incident_id"],
+  "properties": {
+    "incident_id": {"type": "string"},
+    "window":      {"type": "string", "description": "Go duration string, default 15m"},
+    "snapshot":    {"type": "boolean", "description": "Freeze evaluation bounds to incident.started_at..updated_at"}
+  }
+}`
+
+const triageOutputSchema = `{
+  "type": "object",
+  "description": "TriageReport v1; see pkg/triage.Report for the full Go struct."
+}`
+
+func RegisterTriageTool(reg *Registry, engine *triage.Engine) error {
+	return reg.Register(Tool{
+		Name:         "triage_incident",
+		Description:  "Build a deterministic TriageReport for an open incident.",
+		Version:      "triage.v1",
+		InputSchema:  json.RawMessage(triageInputSchema),
+		OutputSchema: json.RawMessage(triageOutputSchema),
+		Examples: []string{
+			`{"incident_id":"inc_01HX...","window":"15m"}`,
+			`{"incident_id":"inc_01HX...","snapshot":true}`,
+		},
+		Handler: func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+			var p struct {
+				IncidentID string `json:"incident_id"`
+				Window     string `json:"window"`
+				Snapshot   bool   `json:"snapshot"`
+			}
+			if err := json.Unmarshal(params, &p); err != nil {
+				return nil, fmt.Errorf("triage_incident: bad params: %w", err)
+			}
+			if p.IncidentID == "" {
+				return nil, fmt.Errorf("triage_incident: incident_id required")
+			}
+			opts, err := triage.ParseBuildOptions(p.Window, p.Snapshot, time.Now())
+			if err != nil {
+				return nil, err
+			}
+			return engine.Build(ctx, p.IncidentID, opts)
+		},
+	})
+}
diff --git a/internal/tools/triage_test.go b/internal/tools/triage_test.go
new file mode 100644
index 0000000..560e802
--- /dev/null
+++ b/internal/tools/triage_test.go
@@ -0,0 +1,92 @@
+package tools_test
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/tools"
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestRegisterTriageToolListsTool(t *testing.T) {
+	reg := tools.NewRegistry()
+	eng := newStubEngine(t)
+	if err := tools.RegisterTriageTool(reg, eng); err != nil {
+		t.Fatalf("register: %v", err)
+	}
+	if _, ok := reg.Tool("triage_incident"); !ok {
+		t.Fatalf("triage_incident not registered")
+	}
+}
+
+func TestTriageToolHandlerReturnsReport(t *testing.T) {
+	reg := tools.NewRegistry()
+	eng := newStubEngine(t)
+	if err := tools.RegisterTriageTool(reg, eng); err != nil {
+		t.Fatalf("register: %v", err)
+	}
+	params := json.RawMessage(`{"incident_id":"inc_abc","window":"15m","snapshot":false}`)
+	out, err := reg.Call(context.Background(), nil /* graph store unused by triage */, "triage_incident", params)
+	if err != nil {
+		t.Fatalf("call: %v", err)
+	}
+	rep, ok := out.(*pkgtriage.Report)
+	if !ok {
+		t.Fatalf("expected *pkgtriage.Report, got %T", out)
+	}
+	if rep.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("wrong incident id: %q", rep.IncidentRef.ID)
+	}
+}
+
+// newStubEngine wires a triage.Engine with stub deps that always succeed.
+// We duplicate the stubs inline to avoid creating a separate `triagetest` helper package for M1.
+func newStubEngine(t *testing.T) *triage.Engine {
+	t.Helper()
+	deps := triage.Deps{
+		Incidents:  triageStubIncidents{},
+		Blast:      triageStubBlast{},
+		Story:      triageStubStory{},
+		Signals:    triageStubSignals{},
+		NextChecks: triageStubNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := triage.NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	return eng
+}
+
+type triageStubIncidents struct{}
+
+func (triageStubIncidents) GetIncident(ctx context.Context, id string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+type triageStubBlast struct{}
+
+func (triageStubBlast) BlastSnapshot(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+	return triage.BlastSnapshotResult{}, nil
+}
+
+type triageStubStory struct{}
+
+func (triageStubStory) FirstFailureStory(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.FirstFailureResult, error) {
+	return triage.FirstFailureResult{}, nil
+}
+
+type triageStubSignals struct{}
+
+func (triageStubSignals) SignalsFor(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) ([]triage.SignalEvidence, error) {
+	return nil, nil
+}
+
+type triageStubNextChecks struct{}
+
+func (triageStubNextChecks) NextChecks(ctx context.Context, inc triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+	return nil, nil
+}
diff --git a/internal/triage/adapter.go b/internal/triage/adapter.go
new file mode 100644
index 0000000..8188bd9
--- /dev/null
+++ b/internal/triage/adapter.go
@@ -0,0 +1,272 @@
+package triage
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strconv"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// Upstream collaborator interfaces. Defined narrowly so adapters are testable
+// without instantiating real engines/stores. Production wiring (Task 11)
+// satisfies these with *incidents.Engine (Get / BlastRadius+Errors), the
+// signal store, and a closure over (*core.Graph, *tracestore.Store).
+
+// IncidentReader returns a single incident by ID. *incidents.Engine satisfies
+// this via its Get method.
+type IncidentReader interface {
+	Get(ctx context.Context, id string) (incidents.Incident, error)
+}
+
+// BlastReader exposes the read-side queries the blast adapter needs. The
+// production reader passed to *incidents.Engine (incidents.Reader) satisfies
+// this directly because the method signatures are identical.
+type BlastReader interface {
+	BlastRadius(f incidents.SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse
+	Errors(f incidents.SearchFilter, limit int) incidents.ErrorsResult
+}
+
+// SignalStore is the read surface of internal/signals.Store the adapter calls.
+type SignalStore interface {
+	Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error)
+}
+
+// StoryBuildFunc renders the public-shape trace story for a given trace ID.
+// Production wiring closes over *ingestv2.Reader and calls
+// Reader.TraceStoryByTraceID. Tests inject a stub directly. The bool return
+// is the "found" indicator: when false, the adapter returns an empty result
+// without erroring.
+type StoryBuildFunc func(traceID string) (apiv2.StoryResponse, bool)
+
+// ----- adapter implementations -----
+
+const defaultWindowLabel = "15m"
+
+type incidentLookupAdapter struct{ r IncidentReader }
+
+func NewIncidentLookupAdapter(r IncidentReader) IncidentLookup {
+	return incidentLookupAdapter{r: r}
+}
+
+func (a incidentLookupAdapter) GetIncident(ctx context.Context, id string) (IncidentSummary, error) {
+	inc, err := a.r.Get(ctx, id)
+	if err != nil {
+		if errors.Is(err, incidents.ErrNotFound) {
+			return IncidentSummary{}, fmt.Errorf("%w: %s", ErrUnknownIncident, id)
+		}
+		return IncidentSummary{}, err
+	}
+	return IncidentSummary{
+		ID:         inc.IncidentID,
+		Window:     defaultWindowLabel,
+		Env:        inc.Env,
+		StartedAt:  inc.StartedAt,
+		UpdatedAt:  inc.UpdatedAt,
+		Service:    inc.ErrorFamily.Service,
+		Step:       inc.ErrorFamily.Step,
+		ErrorCode:  inc.ErrorFamily.ErrorCode,
+		Confidence: mapConfidence(inc.Confidence),
+		NextChecks: append([]string(nil), inc.NextChecks...),
+	}, nil
+}
+
+// mapConfidence converts an incidents.Confidence string to its pkg/triage
+// counterpart. Unknown values default to medium so the produced report
+// always passes Validate.
+func mapConfidence(c incidents.Confidence) pkgtriage.Confidence {
+	switch c {
+	case incidents.ConfidenceHigh:
+		return pkgtriage.ConfidenceHigh
+	case incidents.ConfidenceLow:
+		return pkgtriage.ConfidenceLow
+	case incidents.ConfidenceMedium:
+		return pkgtriage.ConfidenceMedium
+	default:
+		return pkgtriage.ConfidenceMedium
+	}
+}
+
+type blastQueryAdapter struct{ r BlastReader }
+
+func NewBlastQueryAdapter(r BlastReader) BlastQuery {
+	return blastQueryAdapter{r: r}
+}
+
+func (a blastQueryAdapter) BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
+	end := opts.Now
+	if end.IsZero() {
+		end = inc.UpdatedAt
+	}
+	window := opts.Window
+	if window <= 0 {
+		window = defaultWindow
+	}
+	filter := incidents.SearchFilter{
+		Service:   inc.Service,
+		ErrorCode: inc.ErrorCode,
+		Since:     end.Add(-window),
+		Until:     end,
+	}
+	br := a.r.BlastRadius(filter, apiv2.BlastKey{
+		Service:   inc.Service,
+		Step:      inc.Step,
+		ErrorCode: inc.ErrorCode,
+	})
+	users := 0
+	if br.AffectedUsers != nil {
+		users = *br.AffectedUsers
+	}
+	rows := a.r.Errors(filter, 5).Rows
+	families := make([]pkgtriage.ErrorFamily, 0, len(rows))
+	for _, row := range rows {
+		families = append(families, pkgtriage.ErrorFamily{
+			Service:   row.ErrorFamily.Service,
+			Step:      row.ErrorFamily.Step,
+			ErrorCode: row.ErrorFamily.ErrorCode,
+			Count:     row.Count,
+		})
+	}
+	return BlastSnapshotResult{
+		Requests:         br.AffectedRequests,
+		Users:            users,
+		Services:         br.AffectedServices,
+		TopErrorFamilies: families,
+	}, nil
+}
+
+type storyBuilderAdapter struct {
+	r     IncidentReader
+	build StoryBuildFunc
+}
+
+// NewStoryBuilderAdapter wraps an upstream incident reader (to discover the
+// first-failure trace ID) and a story-build function (production: closure
+// over tracestory.BuildWithTraceStore). The trace selected is the first
+// SampleTraces entry on the underlying incident; if none exists, returns an
+// empty result rather than erroring (M1).
+func NewStoryBuilderAdapter(r IncidentReader, build StoryBuildFunc) StoryBuilder {
+	return storyBuilderAdapter{r: r, build: build}
+}
+
+func (a storyBuilderAdapter) FirstFailureStory(ctx context.Context, inc IncidentSummary, _ BuildOptions) (FirstFailureResult, error) {
+	upstream, err := a.r.Get(ctx, inc.ID)
+	if err != nil {
+		if errors.Is(err, incidents.ErrNotFound) {
+			return FirstFailureResult{}, nil
+		}
+		return FirstFailureResult{}, err
+	}
+	if len(upstream.SampleTraces) == 0 {
+		return FirstFailureResult{}, nil
+	}
+	traceID := upstream.SampleTraces[0]
+	resp, ok := a.build(traceID)
+	if !ok {
+		return FirstFailureResult{}, nil
+	}
+	payload, err := json.Marshal(resp)
+	if err != nil {
+		return FirstFailureResult{}, fmt.Errorf("triage: marshal story: %w", err)
+	}
+	summary := storySummary(resp, inc)
+	return FirstFailureResult{
+		Payload:      payload,
+		SampleTraces: []pkgtriage.TraceSample{{TraceID: resp.TraceID, Summary: summary}},
+	}, nil
+}
+
+func storySummary(s apiv2.StoryResponse, inc IncidentSummary) string {
+	svc := s.Service
+	step := ""
+	code := ""
+	if s.Anchor != nil {
+		step = s.Anchor.Step
+		code = s.Anchor.ErrorCode
+	}
+	switch {
+	case svc != "" && step != "" && code != "":
+		return svc + "/" + step + "/" + code
+	case svc != "" && code != "":
+		return svc + " " + code
+	case svc != "":
+		return svc + " failure"
+	case code != "":
+		return code
+	}
+	if inc.Service != "" && inc.Step != "" && inc.ErrorCode != "" {
+		return inc.Service + "/" + inc.Step + "/" + inc.ErrorCode
+	}
+	if inc.Service != "" && inc.ErrorCode != "" {
+		return inc.Service + " " + inc.ErrorCode
+	}
+	return "first failure"
+}
+
+type signalQueryAdapter struct{ s SignalStore }
+
+func NewSignalQueryAdapter(s SignalStore) SignalQuery {
+	return signalQueryAdapter{s: s}
+}
+
+func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
+	end := opts.Now
+	if end.IsZero() {
+		end = inc.UpdatedAt
+	}
+	window := opts.Window
+	if window <= 0 {
+		window = defaultWindow
+	}
+	// Mirror incidents.Engine.querySignals: filter by env+window only. A
+	// service filter would drop cross-service evidence (e.g. a payment
+	// dependency signal on a checkout incident).
+	rows, err := a.s.Query(ctx, signals.Filter{
+		Env:   inc.Env,
+		Since: end.Add(-window),
+		Until: end,
+		Limit: 200,
+	})
+	if err != nil {
+		if errors.Is(err, signals.ErrUnavailable) {
+			return nil, nil
+		}
+		return nil, err
+	}
+	out := make([]SignalEvidence, 0, len(rows))
+	for _, sig := range rows {
+		out = append(out, SignalEvidence{
+			ID:          sig.SignalID,
+			Type:        string(sig.Type),
+			EvidenceIDs: []string{sig.SignalID},
+		})
+	}
+	return out, nil
+}
+
+type nextChecksAdapter struct{}
+
+// NewNextChecksAdapter returns a passthrough that converts the incident's
+// own NextChecks list (already populated by the incidents engine via
+// internal/incidents.NextChecks(cause, confidence)) into the typed
+// NextCheckSpec entries the report consumes. Stable IDs (check_<index>)
+// keep the report deterministic across runs.
+func NewNextChecksAdapter() NextChecksProvider {
+	return nextChecksAdapter{}
+}
+
+func (nextChecksAdapter) NextChecks(_ context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	if len(inc.NextChecks) == 0 {
+		return nil, nil
+	}
+	out := make([]NextCheckSpec, 0, len(inc.NextChecks))
+	for i, prompt := range inc.NextChecks {
+		out = append(out, NextCheckSpec{ID: "check_" + strconv.Itoa(i), Prompt: prompt})
+	}
+	return out, nil
+}
diff --git a/internal/triage/adapter_test.go b/internal/triage/adapter_test.go
new file mode 100644
index 0000000..2e678d6
--- /dev/null
+++ b/internal/triage/adapter_test.go
@@ -0,0 +1,454 @@
+package triage_test
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// ----- IncidentLookupAdapter -----
+
+type fakeIncidentReader struct {
+	inc incidents.Incident
+	err error
+}
+
+func (f fakeIncidentReader) Get(_ context.Context, _ string) (incidents.Incident, error) {
+	if f.err != nil {
+		return incidents.Incident{}, f.err
+	}
+	return f.inc, nil
+}
+
+func TestIncidentLookupAdapter_MapsFamilyFields(t *testing.T) {
+	started := time.Date(2026, 5, 6, 11, 0, 0, 0, time.UTC)
+	updated := time.Date(2026, 5, 6, 11, 5, 0, 0, time.UTC)
+	reader := fakeIncidentReader{inc: incidents.Incident{
+		IncidentID: "inc_abc",
+		Env:        "demo",
+		StartedAt:  started,
+		UpdatedAt:  updated,
+		Service:    "payment",
+		ErrorFamily: apiv2.ErrorFamily{
+			Service:   "payment",
+			Step:      "payment.charge",
+			ErrorCode: "PMT_502",
+		},
+		Confidence: incidents.ConfidenceHigh,
+		NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+	}}
+	a := triage.NewIncidentLookupAdapter(reader)
+	got, err := a.GetIncident(context.Background(), "inc_abc")
+	if err != nil {
+		t.Fatalf("GetIncident: %v", err)
+	}
+	if got.ID != "inc_abc" {
+		t.Fatalf("ID = %q, want inc_abc", got.ID)
+	}
+	if got.Env != "demo" {
+		t.Fatalf("Env = %q, want demo", got.Env)
+	}
+	if !got.StartedAt.Equal(started) {
+		t.Fatalf("StartedAt = %v, want %v", got.StartedAt, started)
+	}
+	if !got.UpdatedAt.Equal(updated) {
+		t.Fatalf("UpdatedAt = %v, want %v", got.UpdatedAt, updated)
+	}
+	if got.Service != "payment" || got.Step != "payment.charge" || got.ErrorCode != "PMT_502" {
+		t.Fatalf("family fields = %+v", got)
+	}
+	if got.Window != "15m" {
+		t.Fatalf("Window default = %q, want 15m", got.Window)
+	}
+	if got.Confidence != pkgtriage.ConfidenceHigh {
+		t.Fatalf("Confidence = %q, want high", got.Confidence)
+	}
+	wantChecks := []string{"Verify payment-service health", "Check recent deploys"}
+	if len(got.NextChecks) != len(wantChecks) {
+		t.Fatalf("NextChecks len = %d, want %d (%+v)", len(got.NextChecks), len(wantChecks), got.NextChecks)
+	}
+	for i := range wantChecks {
+		if got.NextChecks[i] != wantChecks[i] {
+			t.Fatalf("NextChecks[%d] = %q, want %q", i, got.NextChecks[i], wantChecks[i])
+		}
+	}
+}
+
+func TestIncidentLookupAdapter_ConfidenceMapping(t *testing.T) {
+	cases := []struct {
+		in   incidents.Confidence
+		want pkgtriage.Confidence
+	}{
+		{incidents.ConfidenceHigh, pkgtriage.ConfidenceHigh},
+		{incidents.ConfidenceMedium, pkgtriage.ConfidenceMedium},
+		{incidents.ConfidenceLow, pkgtriage.ConfidenceLow},
+		{incidents.Confidence("nonsense"), pkgtriage.ConfidenceMedium},
+	}
+	for _, tc := range cases {
+		reader := fakeIncidentReader{inc: incidents.Incident{
+			IncidentID: "inc_abc",
+			Confidence: tc.in,
+		}}
+		a := triage.NewIncidentLookupAdapter(reader)
+		got, err := a.GetIncident(context.Background(), "inc_abc")
+		if err != nil {
+			t.Fatalf("GetIncident(%q): %v", tc.in, err)
+		}
+		if got.Confidence != tc.want {
+			t.Fatalf("Confidence(%q) = %q, want %q", tc.in, got.Confidence, tc.want)
+		}
+	}
+}
+
+func TestIncidentLookupAdapter_NextChecksDefensiveCopy(t *testing.T) {
+	original := []string{"a", "b"}
+	reader := fakeIncidentReader{inc: incidents.Incident{
+		IncidentID: "inc_abc",
+		NextChecks: original,
+	}}
+	a := triage.NewIncidentLookupAdapter(reader)
+	got, err := a.GetIncident(context.Background(), "inc_abc")
+	if err != nil {
+		t.Fatalf("GetIncident: %v", err)
+	}
+	// Mutating the original slice must not affect the summary's copy.
+	original[0] = "MUTATED"
+	if got.NextChecks[0] != "a" {
+		t.Fatalf("NextChecks copy must be defensive, got %q after mutation", got.NextChecks[0])
+	}
+}
+
+func TestIncidentLookupAdapter_NotFoundIsErrUnknown(t *testing.T) {
+	a := triage.NewIncidentLookupAdapter(fakeIncidentReader{err: incidents.ErrNotFound})
+	if _, err := a.GetIncident(context.Background(), "missing"); !errors.Is(err, triage.ErrUnknownIncident) {
+		t.Fatalf("err = %v, want ErrUnknownIncident", err)
+	}
+}
+
+// ----- BlastQueryAdapter -----
+
+type fakeBlastReader struct {
+	br   apiv2.BlastRadiusResponse
+	rows []apiv2.ErrorRow
+}
+
+func (f fakeBlastReader) BlastRadius(_ incidents.SearchFilter, _ apiv2.BlastKey) apiv2.BlastRadiusResponse {
+	return f.br
+}
+
+func (f fakeBlastReader) Errors(_ incidents.SearchFilter, _ int) incidents.ErrorsResult {
+	return incidents.ErrorsResult{Rows: f.rows}
+}
+
+func TestBlastQueryAdapter_MapsCountsAndTopFamilies(t *testing.T) {
+	users := 8
+	reader := fakeBlastReader{
+		br: apiv2.BlastRadiusResponse{
+			AffectedRequests: 12,
+			AffectedUsers:    &users,
+			AffectedServices: 4,
+		},
+		rows: []apiv2.ErrorRow{
+			{ErrorFamily: apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502"}, Count: 11},
+			{ErrorFamily: apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_503"}, Count: 3},
+		},
+	}
+	a := triage.NewBlastQueryAdapter(reader)
+	inc := triage.IncidentSummary{
+		ID: "inc_abc", Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502",
+		UpdatedAt: time.Date(2026, 5, 6, 11, 5, 0, 0, time.UTC),
+	}
+	opts, _ := triage.ParseBuildOptions("15m", true, time.Now())
+	opts.Now = inc.UpdatedAt
+
+	got, err := a.BlastSnapshot(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("BlastSnapshot: %v", err)
+	}
+	if got.Requests != 12 || got.Users != 8 || got.Services != 4 {
+		t.Fatalf("counts = %+v", got)
+	}
+	if len(got.TopErrorFamilies) != 2 {
+		t.Fatalf("top families = %d, want 2", len(got.TopErrorFamilies))
+	}
+	if got.TopErrorFamilies[0].ErrorCode != "PMT_502" || got.TopErrorFamilies[0].Count != 11 {
+		t.Fatalf("first family = %+v", got.TopErrorFamilies[0])
+	}
+}
+
+func TestBlastQueryAdapter_NilUsersBecomesZero(t *testing.T) {
+	reader := fakeBlastReader{br: apiv2.BlastRadiusResponse{AffectedRequests: 1, AffectedUsers: nil}}
+	a := triage.NewBlastQueryAdapter(reader)
+	inc := triage.IncidentSummary{Service: "x", UpdatedAt: time.Now()}
+	opts, _ := triage.ParseBuildOptions("15m", false, time.Now())
+	got, err := a.BlastSnapshot(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("BlastSnapshot: %v", err)
+	}
+	if got.Users != 0 {
+		t.Fatalf("Users = %d, want 0 when AffectedUsers is nil", got.Users)
+	}
+}
+
+// ----- StoryBuilderAdapter -----
+
+type fakeIncForStory struct{ inc incidents.Incident }
+
+func (f fakeIncForStory) Get(_ context.Context, _ string) (incidents.Incident, error) {
+	return f.inc, nil
+}
+
+func TestStoryBuilderAdapter_UsesFirstSampleTrace(t *testing.T) {
+	traceID := "abc123"
+	wantStory := apiv2.StoryResponse{
+		TraceID: traceID,
+		Service: "payment",
+		Anchor:  &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"},
+		Linkage: "trace_id",
+	}
+
+	called := false
+	build := func(tid string) (apiv2.StoryResponse, bool) {
+		called = true
+		if tid != traceID {
+			t.Fatalf("build called with %q, want %q", tid, traceID)
+		}
+		return wantStory, true
+	}
+
+	incReader := fakeIncForStory{inc: incidents.Incident{
+		IncidentID:   "inc_abc",
+		SampleTraces: []string{traceID, "other"},
+		Service:      "payment",
+		ErrorFamily:  apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502"},
+	}}
+	a := triage.NewStoryBuilderAdapter(incReader, build)
+	inc := triage.IncidentSummary{ID: "inc_abc"}
+	opts, _ := triage.ParseBuildOptions("15m", false, time.Now())
+
+	got, err := a.FirstFailureStory(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if !called {
+		t.Fatalf("build func was not called")
+	}
+	if len(got.SampleTraces) != 1 || got.SampleTraces[0].TraceID != traceID {
+		t.Fatalf("sample traces = %+v", got.SampleTraces)
+	}
+	// Payload should be a non-empty JSON object that decodes to the public
+	// StoryResponse shape.
+	if len(got.Payload) == 0 || got.Payload[0] != '{' {
+		t.Fatalf("payload not JSON object: %s", string(got.Payload))
+	}
+	var decoded map[string]any
+	if err := json.Unmarshal(got.Payload, &decoded); err != nil {
+		t.Fatalf("payload unmarshal: %v", err)
+	}
+	if decoded["trace_id"] != traceID {
+		t.Fatalf("payload.trace_id = %v, want %q", decoded["trace_id"], traceID)
+	}
+}
+
+func TestStoryBuilderAdapter_NoSampleTraceReturnsEmptyResult(t *testing.T) {
+	build := func(string) (apiv2.StoryResponse, bool) {
+		t.Fatalf("build should not be called when no sample trace")
+		return apiv2.StoryResponse{}, false
+	}
+	incReader := fakeIncForStory{inc: incidents.Incident{IncidentID: "inc_abc"}}
+	a := triage.NewStoryBuilderAdapter(incReader, build)
+	got, err := a.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if len(got.SampleTraces) != 0 {
+		t.Fatalf("expected no sample traces, got %+v", got.SampleTraces)
+	}
+}
+
+func TestStoryBuilderAdapter_StoryNotFoundReturnsEmpty(t *testing.T) {
+	// When TraceStoryByTraceID returns ok=false (no matching trace), the
+	// adapter must produce an empty result without erroring.
+	build := func(string) (apiv2.StoryResponse, bool) {
+		return apiv2.StoryResponse{}, false
+	}
+	incReader := fakeIncForStory{inc: incidents.Incident{
+		IncidentID:   "inc_abc",
+		SampleTraces: []string{"missing"},
+	}}
+	a := triage.NewStoryBuilderAdapter(incReader, build)
+	got, err := a.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if len(got.Payload) != 0 || len(got.SampleTraces) != 0 {
+		t.Fatalf("expected empty result for not-found story, got %+v", got)
+	}
+}
+
+// TestStoryBuilderAdapterPayloadHasReadAPIFields verifies the FirstFailure
+// payload uses the public StoryResponse shape — keys consumers see at
+// /v1/traces/story.
+func TestStoryBuilderAdapterPayloadHasReadAPIFields(t *testing.T) {
+	traceID := "trace_demo"
+	resp := apiv2.StoryResponse{
+		TraceID: traceID,
+		Anchor:  &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"},
+		Path:    []apiv2.StoryStep{{Name: "payment.charge", StartMS: 0, DurationMS: 12}},
+		Logs:    []apiv2.StoryLog{{TsOffsetMS: 5, Msg: "boom"}},
+		Downstream: []apiv2.StoryDownstream{
+			{Step: "payment.charge", Service: "payment", Endpoint: "/charge"},
+		},
+		Linkage: "trace_id",
+	}
+	build := func(string) (apiv2.StoryResponse, bool) { return resp, true }
+	incReader := fakeIncForStory{inc: incidents.Incident{
+		IncidentID:   "inc_abc",
+		SampleTraces: []string{traceID},
+	}}
+	a := triage.NewStoryBuilderAdapter(incReader, build)
+
+	got, err := a.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	var decoded map[string]any
+	if err := json.Unmarshal(got.Payload, &decoded); err != nil {
+		t.Fatalf("payload unmarshal: %v", err)
+	}
+	for _, key := range []string{"trace_id", "anchor", "path", "logs", "downstream", "linkage"} {
+		if _, ok := decoded[key]; !ok {
+			t.Fatalf("payload missing read-API key %q: %v", key, decoded)
+		}
+	}
+}
+
+// ----- SignalQueryAdapter -----
+
+type fakeSignalStore struct {
+	out []signals.Signal
+	err error
+	got signals.Filter
+}
+
+func (f *fakeSignalStore) Query(_ context.Context, filter signals.Filter) ([]signals.Signal, error) {
+	f.got = filter
+	if f.err != nil {
+		return nil, f.err
+	}
+	return f.out, nil
+}
+
+func TestSignalQueryAdapter_QueriesBroadByEnvWindowNotService(t *testing.T) {
+	// Adapter must mirror incidents.Engine.querySignals: filter by Env + window
+	// only. Service is intentionally NOT set so cross-service dependency
+	// signals (e.g. a payment-service signal evidencing a checkout incident)
+	// are surfaced.
+	store := &fakeSignalStore{
+		out: []signals.Signal{
+			{SignalID: "sig_1", Type: signals.TypeDeploy, Service: "payment"},
+			{SignalID: "sig_2", Type: signals.TypeDependency, Service: "payment"},
+		},
+	}
+	a := triage.NewSignalQueryAdapter(store)
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	inc := triage.IncidentSummary{
+		Service:   "checkout",
+		Env:       "demo",
+		UpdatedAt: now,
+	}
+	opts, _ := triage.ParseBuildOptions("15m", false, now)
+
+	got, err := a.SignalsFor(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("SignalsFor: %v", err)
+	}
+	if store.got.Service != "" {
+		t.Fatalf("filter.Service = %q, want empty (broad query)", store.got.Service)
+	}
+	if store.got.Env != "demo" {
+		t.Fatalf("filter.Env = %q, want demo", store.got.Env)
+	}
+	wantSince := now.Add(-15 * time.Minute)
+	if !store.got.Since.Equal(wantSince) {
+		t.Fatalf("filter.Since = %v, want %v", store.got.Since, wantSince)
+	}
+	if !store.got.Until.Equal(now) {
+		t.Fatalf("filter.Until = %v, want %v", store.got.Until, now)
+	}
+	if store.got.Limit != 200 {
+		t.Fatalf("filter.Limit = %d, want 200", store.got.Limit)
+	}
+	if len(got) != 2 {
+		t.Fatalf("got %d signals, want 2 (cross-service signals must be returned)", len(got))
+	}
+	if got[0].ID != "sig_1" || got[0].Type != "deploy" {
+		t.Fatalf("first signal = %+v", got[0])
+	}
+	// Critical assertion for Fix 1: the payment-service dependency signal must
+	// be in the result even though inc.Service = checkout.
+	foundPaymentDep := false
+	for _, s := range got {
+		if s.ID == "sig_2" && s.Type == "dependency" {
+			foundPaymentDep = true
+		}
+	}
+	if !foundPaymentDep {
+		t.Fatalf("payment-service dependency signal dropped: got %+v", got)
+	}
+}
+
+func TestSignalQueryAdapter_UnavailableReturnsEmpty(t *testing.T) {
+	a := triage.NewSignalQueryAdapter(&fakeSignalStore{err: signals.ErrUnavailable})
+	got, err := a.SignalsFor(context.Background(), triage.IncidentSummary{UpdatedAt: time.Now()}, triage.BuildOptions{Window: time.Minute})
+	if err != nil {
+		t.Fatalf("SignalsFor: %v", err)
+	}
+	if len(got) != 0 {
+		t.Fatalf("want empty when unavailable, got %+v", got)
+	}
+}
+
+// ----- NextChecksAdapter -----
+
+func TestNextChecksAdapter_ConsumesIncidentNextChecks(t *testing.T) {
+	a := triage.NewNextChecksAdapter()
+	got, err := a.NextChecks(context.Background(), triage.IncidentSummary{
+		Service:    "checkout",
+		ErrorCode:  "PMT_502",
+		NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+	})
+	if err != nil {
+		t.Fatalf("NextChecks: %v", err)
+	}
+	if len(got) != 2 {
+		t.Fatalf("got %d checks, want 2: %+v", len(got), got)
+	}
+	if got[0].ID != "check_0" || got[0].Prompt != "Verify payment-service health" {
+		t.Fatalf("got[0] = %+v, want {check_0, Verify payment-service health}", got[0])
+	}
+	if got[1].ID != "check_1" || got[1].Prompt != "Check recent deploys" {
+		t.Fatalf("got[1] = %+v, want {check_1, Check recent deploys}", got[1])
+	}
+}
+
+func TestNextChecksAdapter_EmptyIncidentReturnsEmpty(t *testing.T) {
+	a := triage.NewNextChecksAdapter()
+	got, err := a.NextChecks(context.Background(), triage.IncidentSummary{
+		Service: "anything", ErrorCode: "XYZ_123",
+	})
+	if err != nil {
+		t.Fatalf("NextChecks: %v", err)
+	}
+	if len(got) != 0 {
+		t.Fatalf("expected no checks for empty NextChecks, got %+v", got)
+	}
+}
diff --git a/internal/triage/engine.go b/internal/triage/engine.go
new file mode 100644
index 0000000..07f5a04
--- /dev/null
+++ b/internal/triage/engine.go
@@ -0,0 +1,139 @@
+package triage
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"time"
+
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+var ErrUnknownIncident = errors.New("triage: unknown incident")
+
+// IncidentSummary is the minimal incident shape this package needs.
+// Adapter types in the wiring layer convert from internal/incidents.Incident.
+type IncidentSummary struct {
+	ID         string
+	Window     string
+	Env        string
+	StartedAt  time.Time
+	UpdatedAt  time.Time
+	Service    string
+	Step       string
+	ErrorCode  string
+	Confidence pkgtriage.Confidence
+	NextChecks []string
+}
+
+type BlastSnapshotResult struct {
+	Requests         int
+	Users            int
+	Services         int
+	TopErrorFamilies []pkgtriage.ErrorFamily
+}
+
+type FirstFailureResult struct {
+	Payload      json.RawMessage
+	SampleTraces []pkgtriage.TraceSample
+}
+
+type SignalEvidence = pkgtriage.SignalRef
+
+type NextCheckSpec = pkgtriage.NextCheck
+
+type IncidentLookup interface {
+	GetIncident(ctx context.Context, id string) (IncidentSummary, error)
+}
+
+type BlastQuery interface {
+	BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error)
+}
+
+type StoryBuilder interface {
+	FirstFailureStory(ctx context.Context, inc IncidentSummary, opts BuildOptions) (FirstFailureResult, error)
+}
+
+type SignalQuery interface {
+	SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error)
+}
+
+type NextChecksProvider interface {
+	NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error)
+}
+
+type Deps struct {
+	Incidents  IncidentLookup
+	Blast      BlastQuery
+	Story      StoryBuilder
+	Signals    SignalQuery
+	NextChecks NextChecksProvider
+	Now        func() time.Time
+}
+
+type Engine struct {
+	deps Deps
+}
+
+func NewEngine(d Deps) (*Engine, error) {
+	if d.Incidents == nil || d.Blast == nil || d.Story == nil || d.Signals == nil || d.NextChecks == nil {
+		return nil, fmt.Errorf("triage: NewEngine requires all dependencies")
+	}
+	if d.Now == nil {
+		d.Now = time.Now
+	}
+	return &Engine{deps: d}, nil
+}
+
+func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions) (*pkgtriage.Report, error) {
+	inc, err := e.deps.Incidents.GetIncident(ctx, incidentID)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Snapshot {
+		opts.Now = inc.UpdatedAt
+	}
+
+	blast, err := e.deps.Blast.BlastSnapshot(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: blast: %w", err)
+	}
+	story, err := e.deps.Story.FirstFailureStory(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: story: %w", err)
+	}
+	sigs, err := e.deps.Signals.SignalsFor(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: signals: %w", err)
+	}
+	checks, err := e.deps.NextChecks.NextChecks(ctx, inc)
+	if err != nil {
+		return nil, fmt.Errorf("triage: next_checks: %w", err)
+	}
+
+	r := &pkgtriage.Report{
+		SchemaVersion: pkgtriage.SchemaVersionV1,
+		IncidentRef:   pkgtriage.IncidentRef{ID: inc.ID, Window: opts.Window.String()},
+		BlastSnapshot: pkgtriage.BlastSnapshot{
+			Requests: blast.Requests, Users: blast.Users, Services: blast.Services,
+			TopErrorFamilies: blast.TopErrorFamilies,
+		},
+		FirstFailure: story.Payload,
+		SampleTraces: story.SampleTraces,
+		Signals:      sigs,
+		NextChecks:   checks,
+		Confidence:   inc.Confidence,
+		GeneratedAt:  e.deps.Now().UTC().Format(time.RFC3339Nano),
+	}
+
+	hash, err := r.CanonicalHash()
+	if err != nil {
+		return nil, fmt.Errorf("triage: hash: %w", err)
+	}
+	r.ReportHash = hash
+	if err := r.Validate(); err != nil {
+		return nil, fmt.Errorf("triage: produced invalid report: %w", err)
+	}
+	return r, nil
+}
diff --git a/internal/triage/engine_test.go b/internal/triage/engine_test.go
new file mode 100644
index 0000000..b316830
--- /dev/null
+++ b/internal/triage/engine_test.go
@@ -0,0 +1,317 @@
+package triage
+
+import (
+	"context"
+	"encoding/json"
+	"strings"
+	"testing"
+	"time"
+
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestNewEngineRequiresAllDeps(t *testing.T) {
+	if _, err := NewEngine(Deps{}); err == nil {
+		t.Fatalf("expected error when deps are zero, got nil")
+	}
+}
+
+func TestEngineBuildReturnsErrorForUnknownIncident(t *testing.T) {
+	deps := stubDeps()
+	deps.Incidents = stubIncidentLookup{err: ErrUnknownIncident}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("", false, time.Now())
+	if _, err := eng.Build(context.Background(), "inc_missing", opts); err == nil {
+		t.Fatalf("expected error for unknown incident")
+	}
+}
+
+// --- test helpers ---
+
+type stubIncidentLookup struct {
+	err error
+}
+
+func (s stubIncidentLookup) GetIncident(ctx context.Context, id string) (IncidentSummary, error) {
+	return IncidentSummary{}, s.err
+}
+
+type stubBlastQuery struct{}
+
+func (stubBlastQuery) BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
+	return BlastSnapshotResult{}, nil
+}
+
+type stubStoryBuilder struct{}
+
+func (stubStoryBuilder) FirstFailureStory(ctx context.Context, inc IncidentSummary, opts BuildOptions) (FirstFailureResult, error) {
+	return FirstFailureResult{}, nil
+}
+
+type stubSignalQuery struct{}
+
+func (stubSignalQuery) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
+	return nil, nil
+}
+
+type stubNextChecks struct{}
+
+func (stubNextChecks) NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	return nil, nil
+}
+
+func stubDeps() Deps {
+	return Deps{
+		Incidents:  stubIncidentLookup{},
+		Blast:      stubBlastQuery{},
+		Story:      stubStoryBuilder{},
+		Signals:    stubSignalQuery{},
+		NextChecks: stubNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC) },
+	}
+}
+
+type richBlast struct{}
+
+func (richBlast) BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
+	return BlastSnapshotResult{
+		Requests: 12, Users: 8, Services: 4,
+		TopErrorFamilies: []pkgtriage.ErrorFamily{
+			{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+		},
+	}, nil
+}
+
+type richStory struct{}
+
+func (richStory) FirstFailureStory(ctx context.Context, inc IncidentSummary, opts BuildOptions) (FirstFailureResult, error) {
+	return FirstFailureResult{
+		Payload:      json.RawMessage(`{"trace_id":"abc","first_failure":"payment.charge"}`),
+		SampleTraces: []pkgtriage.TraceSample{{TraceID: "abc", Summary: "payment 502"}},
+	}, nil
+}
+
+type richSignals struct{}
+
+func (richSignals) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
+	return []SignalEvidence{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}}, nil
+}
+
+type richNextChecks struct{}
+
+func (richNextChecks) NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	return []NextCheckSpec{{ID: "check_payment_health", Prompt: "Verify payment-service health"}}, nil
+}
+
+type richIncidents struct{}
+
+func (richIncidents) GetIncident(ctx context.Context, id string) (IncidentSummary, error) {
+	return IncidentSummary{
+		ID: id, Window: "15m", Env: "demo",
+		StartedAt: time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC),
+		UpdatedAt: time.Date(2026, 5, 6, 0, 5, 0, 0, time.UTC),
+		Service:   "payment", Step: "payment.charge", ErrorCode: "PMT_502",
+		Confidence: pkgtriage.ConfidenceHigh,
+		NextChecks: []string{"Verify payment-service health"},
+	}, nil
+}
+
+func TestEngineBuildAssemblesAllSections(t *testing.T) {
+	deps := Deps{
+		Incidents:  richIncidents{},
+		Blast:      richBlast{},
+		Story:      richStory{},
+		Signals:    richSignals{},
+		NextChecks: richNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+	r, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+	if r.IncidentRef.Window != "15m0s" {
+		t.Fatalf("incident_ref.window should reflect opts.Window, got %q", r.IncidentRef.Window)
+	}
+	if r.BlastSnapshot.Requests != 12 {
+		t.Fatalf("blast.requests = %d, want 12", r.BlastSnapshot.Requests)
+	}
+	if len(r.SampleTraces) != 1 || r.SampleTraces[0].TraceID != "abc" {
+		t.Fatalf("sample_traces wrong: %+v", r.SampleTraces)
+	}
+	if len(r.Signals) != 1 || r.Signals[0].Type != "deploy" {
+		t.Fatalf("signals wrong: %+v", r.Signals)
+	}
+	if len(r.NextChecks) != 1 {
+		t.Fatalf("next_checks missing")
+	}
+	if r.Confidence != pkgtriage.ConfidenceHigh {
+		t.Fatalf("Confidence = %q, want high (must come from incident, not hard-coded medium)", r.Confidence)
+	}
+	if r.ReportHash == "" || !strings.HasPrefix(r.ReportHash, "sha256:") {
+		t.Fatalf("report_hash missing/invalid: %q", r.ReportHash)
+	}
+	if err := r.Validate(); err != nil {
+		t.Fatalf("produced report failed validation: %v", err)
+	}
+}
+
+// TestTriageReportFromDemoShape exercises the engine end-to-end against the
+// demo's actual shape — the cross-service signal (payment dependency on a
+// checkout incident), high confidence, and incident-provided next checks.
+// It is the regression gate for the four M1 fixes.
+func TestTriageReportFromDemoShape(t *testing.T) {
+	demoIncidents := stubGetIncident(IncidentSummary{
+		ID:      "inc_demo",
+		Window:  "15m",
+		Env:     "demo",
+		Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502",
+		StartedAt:  time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC),
+		UpdatedAt:  time.Date(2026, 5, 6, 0, 5, 0, 0, time.UTC),
+		Confidence: pkgtriage.ConfidenceHigh,
+		NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+	})
+	demoBlast := stubBlastSnapshot{
+		out: BlastSnapshotResult{
+			Requests: 7, Users: 3, Services: 2,
+			TopErrorFamilies: []pkgtriage.ErrorFamily{
+				{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502", Count: 6},
+			},
+		},
+	}
+	// Story payload mirrors apiv2.StoryResponse shape; engine treats it as
+	// opaque RawMessage.
+	demoStory := stubStoryResult{
+		out: FirstFailureResult{
+			Payload: json.RawMessage(`{"trace_id":"t_demo","anchor":{"step":"payment.charge","error_code":"PMT_502"},"path":[],"logs":[],"downstream":[],"linkage":"trace_id"}`),
+			SampleTraces: []pkgtriage.TraceSample{
+				{TraceID: "t_demo", Summary: "checkout PMT_502"},
+			},
+		},
+	}
+	// Cross-service signal: incident is on `checkout`, but the dependency
+	// signal is from `payment`. Fix 1 ensures the broad query surfaces it.
+	demoSignals := stubSignalsResult{
+		out: []SignalEvidence{
+			{ID: "sig_payment_dep", Type: "dependency", EvidenceIDs: []string{"sig_payment_dep"}},
+		},
+	}
+	// Fix 3: NextChecks must come from the incident, not a static map keyed
+	// by service+code.
+	demoChecks := stubNextChecksResult{}
+
+	deps := Deps{
+		Incidents:  demoIncidents,
+		Blast:      demoBlast,
+		Story:      demoStory,
+		Signals:    demoSignals,
+		NextChecks: demoChecks,
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+
+	r, err := eng.Build(context.Background(), "inc_demo", opts)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if r.IncidentRef.ID != "inc_demo" {
+		t.Fatalf("IncidentRef.ID = %q, want inc_demo", r.IncidentRef.ID)
+	}
+	foundFamily := false
+	for _, fam := range r.BlastSnapshot.TopErrorFamilies {
+		if fam.ErrorCode == "PMT_502" {
+			foundFamily = true
+		}
+	}
+	if !foundFamily {
+		t.Fatalf("BlastSnapshot.TopErrorFamilies missing PMT_502: %+v", r.BlastSnapshot.TopErrorFamilies)
+	}
+	foundPaymentSig := false
+	for _, sig := range r.Signals {
+		if sig.ID == "sig_payment_dep" {
+			foundPaymentSig = true
+		}
+	}
+	if !foundPaymentSig {
+		t.Fatalf("Signals missing payment dependency signal: %+v", r.Signals)
+	}
+	if r.Confidence != pkgtriage.ConfidenceHigh {
+		t.Fatalf("Confidence = %q, want high", r.Confidence)
+	}
+	if len(r.NextChecks) != 2 {
+		t.Fatalf("NextChecks len = %d, want 2: %+v", len(r.NextChecks), r.NextChecks)
+	}
+	if r.NextChecks[0].ID != "check_0" || r.NextChecks[0].Prompt != "Verify payment-service health" {
+		t.Fatalf("NextChecks[0] = %+v, want {check_0, Verify payment-service health}", r.NextChecks[0])
+	}
+	if r.NextChecks[1].ID != "check_1" || r.NextChecks[1].Prompt != "Check recent deploys" {
+		t.Fatalf("NextChecks[1] = %+v, want {check_1, Check recent deploys}", r.NextChecks[1])
+	}
+	if r.ReportHash == "" {
+		t.Fatalf("ReportHash empty")
+	}
+}
+
+// --- additional stubs used by the demo-shape regression test ---
+
+type stubGetIncident IncidentSummary
+
+func (s stubGetIncident) GetIncident(_ context.Context, _ string) (IncidentSummary, error) {
+	return IncidentSummary(s), nil
+}
+
+type stubBlastSnapshot struct{ out BlastSnapshotResult }
+
+func (s stubBlastSnapshot) BlastSnapshot(_ context.Context, _ IncidentSummary, _ BuildOptions) (BlastSnapshotResult, error) {
+	return s.out, nil
+}
+
+type stubStoryResult struct{ out FirstFailureResult }
+
+func (s stubStoryResult) FirstFailureStory(_ context.Context, _ IncidentSummary, _ BuildOptions) (FirstFailureResult, error) {
+	return s.out, nil
+}
+
+type stubSignalsResult struct{ out []SignalEvidence }
+
+func (s stubSignalsResult) SignalsFor(_ context.Context, _ IncidentSummary, _ BuildOptions) ([]SignalEvidence, error) {
+	return s.out, nil
+}
+
+// stubNextChecksResult mirrors the production adapter: it consumes
+// inc.NextChecks and converts them to NextCheckSpec entries with stable IDs.
+type stubNextChecksResult struct{}
+
+func (stubNextChecksResult) NextChecks(_ context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	out := make([]NextCheckSpec, 0, len(inc.NextChecks))
+	for i, prompt := range inc.NextChecks {
+		out = append(out, NextCheckSpec{ID: nextCheckID(i), Prompt: prompt})
+	}
+	return out, nil
+}
+
+func nextCheckID(i int) string {
+	return "check_" + itoa(i)
+}
+
+func itoa(i int) string {
+	switch i {
+	case 0:
+		return "0"
+	case 1:
+		return "1"
+	}
+	// Tests only exercise small indices.
+	return "n"
+}
diff --git a/internal/triage/idempotency_test.go b/internal/triage/idempotency_test.go
new file mode 100644
index 0000000..9327295
--- /dev/null
+++ b/internal/triage/idempotency_test.go
@@ -0,0 +1,62 @@
+package triage
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+func TestBuildIsIdempotentForSameInput(t *testing.T) {
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		// Two different "now" values to prove generated_at doesn't enter the hash
+		Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+
+	r1, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build 1: %v", err)
+	}
+	r2, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build 2: %v", err)
+	}
+	if r1.ReportHash != r2.ReportHash {
+		t.Fatalf("two builds should have identical report_hash, got %q vs %q", r1.ReportHash, r2.ReportHash)
+	}
+}
+
+func TestSnapshotModeUsesIncidentUpdatedAt(t *testing.T) {
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, _ := NewEngine(deps)
+
+	wallClockOpts, _ := ParseBuildOptions("15m", false, deps.Now())
+	snapshotOpts, _ := ParseBuildOptions("15m", true, deps.Now())
+
+	wall, err := eng.Build(context.Background(), "inc_abc", wallClockOpts)
+	if err != nil {
+		t.Fatalf("wall build: %v", err)
+	}
+	snap, err := eng.Build(context.Background(), "inc_abc", snapshotOpts)
+	if err != nil {
+		t.Fatalf("snap build: %v", err)
+	}
+	// Both reports describe the same incident state; with the same upstream stubs they hash equal.
+	// The point of this test is that snapshot mode does not crash and produces a valid report.
+	if snap.ReportHash == "" || wall.ReportHash == "" {
+		t.Fatalf("hashes must be non-empty (snap=%q wall=%q)", snap.ReportHash, wall.ReportHash)
+	}
+	if snap.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("snap report missing incident ref")
+	}
+}
diff --git a/internal/triage/options.go b/internal/triage/options.go
new file mode 100644
index 0000000..1b30059
--- /dev/null
+++ b/internal/triage/options.go
@@ -0,0 +1,28 @@
+// Package triage builds the TriageReport for an incident.
+// The Report type is the public artifact (pkg/triage); this package is the orchestrator.
+package triage
+
+import (
+	"fmt"
+	"time"
+)
+
+const defaultWindow = 15 * time.Minute
+
+type BuildOptions struct {
+	Window   time.Duration
+	Snapshot bool
+	Now      time.Time
+}
+
+func ParseBuildOptions(window string, snapshot bool, now time.Time) (BuildOptions, error) {
+	w := defaultWindow
+	if window != "" {
+		parsed, err := time.ParseDuration(window)
+		if err != nil {
+			return BuildOptions{}, fmt.Errorf("triage: invalid window %q: %w", window, err)
+		}
+		w = parsed
+	}
+	return BuildOptions{Window: w, Snapshot: snapshot, Now: now}, nil
+}
diff --git a/internal/triage/options_test.go b/internal/triage/options_test.go
new file mode 100644
index 0000000..179dd49
--- /dev/null
+++ b/internal/triage/options_test.go
@@ -0,0 +1,50 @@
+package triage
+
+import (
+	"testing"
+	"time"
+)
+
+func TestBuildOptionsDefaults(t *testing.T) {
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	opts, err := ParseBuildOptions("", false, now)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if opts.Window != 15*time.Minute {
+		t.Fatalf("default window should be 15m, got %s", opts.Window)
+	}
+	if opts.Snapshot {
+		t.Fatalf("default snapshot should be false")
+	}
+	if !opts.Now.Equal(now) {
+		t.Fatalf("Now should be passed through")
+	}
+}
+
+func TestBuildOptionsWindowParse(t *testing.T) {
+	now := time.Now()
+	opts, err := ParseBuildOptions("30m", false, now)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if opts.Window != 30*time.Minute {
+		t.Fatalf("got %s want 30m", opts.Window)
+	}
+}
+
+func TestBuildOptionsBadWindow(t *testing.T) {
+	if _, err := ParseBuildOptions("forever", false, time.Now()); err == nil {
+		t.Fatalf("expected error for bad window")
+	}
+}
+
+func TestBuildOptionsSnapshotFlag(t *testing.T) {
+	opts, err := ParseBuildOptions("15m", true, time.Now())
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if !opts.Snapshot {
+		t.Fatalf("snapshot flag not honored")
+	}
+}
diff --git a/internal/triagehttp/handler.go b/internal/triagehttp/handler.go
new file mode 100644
index 0000000..0adc3c2
--- /dev/null
+++ b/internal/triagehttp/handler.go
@@ -0,0 +1,66 @@
+package triagehttp
+
+import (
+	"encoding/json"
+	"errors"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+type Handler struct {
+	engine *triage.Engine
+}
+
+func NewHandler(engine *triage.Engine) *Handler {
+	return &Handler{engine: engine}
+}
+
+func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	id := strings.TrimPrefix(r.URL.Path, "/v1/triage/")
+	id = strings.Trim(id, "/")
+	if id == "" {
+		writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
+		return
+	}
+	q := r.URL.Query()
+	opts, err := triage.ParseBuildOptions(q.Get("window"), q.Get("snapshot") == "true", time.Now())
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_options", err.Error(), "")
+		return
+	}
+	rep, err := h.engine.Build(r.Context(), id, opts)
+	if errors.Is(err, triage.ErrUnknownIncident) {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "triage_build_failed", err.Error(), "")
+		return
+	}
+	writeJSON(w, http.StatusOK, rep)
+}
+
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_ = json.NewEncoder(w).Encode(v)
+}
+
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_ = json.NewEncoder(w).Encode(map[string]any{
+		"error": map[string]string{
+			"code":    code,
+			"message": message,
+			"detail":  detail,
+		},
+	})
+}
diff --git a/internal/triagehttp/handler_test.go b/internal/triagehttp/handler_test.go
new file mode 100644
index 0000000..fba414e
--- /dev/null
+++ b/internal/triagehttp/handler_test.go
@@ -0,0 +1,141 @@
+package triagehttp_test
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	"github.com/sssmaran/WaylogCLI/internal/triagehttp"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestTriageHandlerReturnsReport(t *testing.T) {
+	eng := newTriageEngineForHandler(t)
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200; body=%s", rr.Code, rr.Body.String())
+	}
+	var rep pkgtriage.Report
+	if err := json.Unmarshal(rr.Body.Bytes(), &rep); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if rep.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("got id %q want inc_abc", rep.IncidentRef.ID)
+	}
+}
+
+func TestTriageHandlerHonorsSnapshotQuery(t *testing.T) {
+	eng := newTriageEngineForHandler(t)
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc?snapshot=true&window=30m", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status = %d body=%s", rr.Code, rr.Body.String())
+	}
+}
+
+func TestTriageHandlerRejectsMissingID(t *testing.T) {
+	eng := newTriageEngineForHandler(t)
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+	if rr.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400 for missing id, got %d", rr.Code)
+	}
+}
+
+func TestTriageHandlerRejectsNonGET(t *testing.T) {
+	eng := newTriageEngineForHandler(t)
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodPost, "/v1/triage/inc_abc", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+	if rr.Code != http.StatusMethodNotAllowed {
+		t.Fatalf("expected 405 for POST, got %d", rr.Code)
+	}
+}
+
+func TestTriageHandlerUnknownIncidentIsNotFound(t *testing.T) {
+	eng := newTriageEngineForHandlerWithIncidents(t, handlerUnknownIncidents{})
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_missing", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+	if rr.Code != http.StatusNotFound {
+		t.Fatalf("expected 404 for unknown incident, got %d; body=%s", rr.Code, rr.Body.String())
+	}
+}
+
+// helper: stub engine
+func newTriageEngineForHandler(t *testing.T) *triage.Engine {
+	return newTriageEngineForHandlerWithIncidents(t, handlerStubIncidents{})
+}
+
+func newTriageEngineForHandlerWithIncidents(t *testing.T, incidents triage.IncidentLookup) *triage.Engine {
+	t.Helper()
+	deps := triage.Deps{
+		Incidents:  incidents,
+		Blast:      handlerStubBlast{},
+		Story:      handlerStubStory{},
+		Signals:    handlerStubSignals{},
+		NextChecks: handlerStubNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := triage.NewEngine(deps)
+	if err != nil {
+		t.Fatalf("engine: %v", err)
+	}
+	return eng
+}
+
+type handlerStubIncidents struct{}
+
+func (handlerStubIncidents) GetIncident(ctx context.Context, id string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+type handlerUnknownIncidents struct{}
+
+func (handlerUnknownIncidents) GetIncident(ctx context.Context, id string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{}, triage.ErrUnknownIncident
+}
+
+type handlerStubBlast struct{}
+
+func (handlerStubBlast) BlastSnapshot(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+	return triage.BlastSnapshotResult{}, nil
+}
+
+type handlerStubStory struct{}
+
+func (handlerStubStory) FirstFailureStory(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.FirstFailureResult, error) {
+	return triage.FirstFailureResult{}, nil
+}
+
+type handlerStubSignals struct{}
+
+func (handlerStubSignals) SignalsFor(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) ([]triage.SignalEvidence, error) {
+	return nil, nil
+}
+
+type handlerStubNextChecks struct{}
+
+func (handlerStubNextChecks) NextChecks(ctx context.Context, inc triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+	return nil, nil
+}
diff --git a/pkg/triage/report.go b/pkg/triage/report.go
new file mode 100644
index 0000000..388933d
--- /dev/null
+++ b/pkg/triage/report.go
@@ -0,0 +1,102 @@
+// Package triage exposes the TriageReport schema. Experimental: report shape may change until triage.v2.
+package triage
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+)
+
+type Confidence string
+
+const (
+	ConfidenceLow    Confidence = "low"
+	ConfidenceMedium Confidence = "medium"
+	ConfidenceHigh   Confidence = "high"
+)
+
+type Report struct {
+	SchemaVersion string          `json:"schema_version"`
+	IncidentRef   IncidentRef     `json:"incident_ref"`
+	BlastSnapshot BlastSnapshot   `json:"blast_snapshot"`
+	FirstFailure  json.RawMessage `json:"first_failure,omitempty"`
+	SampleTraces  []TraceSample   `json:"sample_traces,omitempty"`
+	Signals       []SignalRef     `json:"signals,omitempty"`
+	NextChecks    []NextCheck     `json:"next_checks,omitempty"`
+	Confidence    Confidence      `json:"confidence"`
+	GeneratedAt   string          `json:"generated_at"`
+	PlanRunID     string          `json:"plan_run_id,omitempty"`
+	ReportHash    string          `json:"report_hash"`
+}
+
+type IncidentRef struct {
+	ID     string `json:"id"`
+	Window string `json:"window"`
+}
+
+type BlastSnapshot struct {
+	Requests         int           `json:"requests"`
+	Users            int           `json:"users"`
+	Services         int           `json:"services"`
+	TopErrorFamilies []ErrorFamily `json:"top_error_families"`
+}
+
+type ErrorFamily struct {
+	Service   string `json:"service"`
+	Step      string `json:"step"`
+	ErrorCode string `json:"error_code"`
+	Count     int    `json:"count"`
+}
+
+type TraceSample struct {
+	TraceID string `json:"trace_id"`
+	Summary string `json:"summary"`
+}
+
+type SignalRef struct {
+	ID          string   `json:"id"`
+	Type        string   `json:"type"`
+	EvidenceIDs []string `json:"evidence_ids"`
+}
+
+type NextCheck struct {
+	ID     string `json:"id"`
+	Prompt string `json:"prompt"`
+}
+
+const SchemaVersionV1 = "triage.v1"
+
+func (r *Report) Validate() error {
+	if r.SchemaVersion != SchemaVersionV1 {
+		return fmt.Errorf("triage: schema_version must be %q, got %q", SchemaVersionV1, r.SchemaVersion)
+	}
+	if r.IncidentRef.ID == "" {
+		return fmt.Errorf("triage: incident_ref.id required")
+	}
+	switch r.Confidence {
+	case ConfidenceLow, ConfidenceMedium, ConfidenceHigh:
+	default:
+		return fmt.Errorf("triage: confidence must be low|medium|high, got %q", r.Confidence)
+	}
+	if r.GeneratedAt == "" {
+		return fmt.Errorf("triage: generated_at required")
+	}
+	return nil
+}
+
+// CanonicalHash returns sha256:<hex> over the report's canonical JSON,
+// excluding generated_at, plan_run_id, and report_hash itself.
+// Two reports built from the same upstream state produce the same hash.
+func (r *Report) CanonicalHash() (string, error) {
+	clone := *r
+	clone.GeneratedAt = ""
+	clone.PlanRunID = ""
+	clone.ReportHash = ""
+	raw, err := json.Marshal(&clone)
+	if err != nil {
+		return "", fmt.Errorf("triage: canonical marshal: %w", err)
+	}
+	sum := sha256.Sum256(raw)
+	return "sha256:" + hex.EncodeToString(sum[:]), nil
+}
diff --git a/pkg/triage/report_test.go b/pkg/triage/report_test.go
new file mode 100644
index 0000000..f4f575f
--- /dev/null
+++ b/pkg/triage/report_test.go
@@ -0,0 +1,137 @@
+package triage_test
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+
+	"github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestReportJSONRoundTrip(t *testing.T) {
+	in := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_test", Window: "15m"},
+		BlastSnapshot: triage.BlastSnapshot{
+			Requests: 12, Users: 8, Services: 4,
+			TopErrorFamilies: []triage.ErrorFamily{
+				{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+			},
+		},
+		Signals:     []triage.SignalRef{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}},
+		NextChecks:  []triage.NextCheck{{ID: "check_1", Prompt: "verify x"}},
+		Confidence:  triage.ConfidenceMedium,
+		GeneratedAt: "2026-05-06T00:00:00Z",
+		ReportHash:  "sha256:abc",
+	}
+	raw, err := json.Marshal(&in)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	var out triage.Report
+	if err := json.Unmarshal(raw, &out); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if out.SchemaVersion != in.SchemaVersion {
+		t.Fatalf("schema_version mismatch: got %q want %q", out.SchemaVersion, in.SchemaVersion)
+	}
+	if out.BlastSnapshot.TopErrorFamilies[0].ErrorCode != "PMT_502" {
+		t.Fatalf("top_error_families round-trip lost data: %+v", out.BlastSnapshot.TopErrorFamilies)
+	}
+	if out.Confidence != triage.ConfidenceMedium {
+		t.Fatalf("confidence mismatch: got %q", out.Confidence)
+	}
+}
+
+func TestReportValidate(t *testing.T) {
+	good := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_x"},
+		Confidence:    triage.ConfidenceMedium,
+		GeneratedAt:   "2026-05-06T00:00:00Z",
+		ReportHash:    "sha256:x",
+	}
+	if err := good.Validate(); err != nil {
+		t.Fatalf("good report failed validation: %v", err)
+	}
+
+	cases := map[string]triage.Report{
+		"missing schema_version": {IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+		"wrong schema_version":   {SchemaVersion: "triage.v2", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+		"missing incident id":    {SchemaVersion: "triage.v1", Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+		"bad confidence":         {SchemaVersion: "triage.v1", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: "extreme", GeneratedAt: "t", ReportHash: "h"},
+		"missing generated_at":   {SchemaVersion: "triage.v1", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, ReportHash: "h"},
+	}
+	for name, r := range cases {
+		t.Run(name, func(t *testing.T) {
+			if err := r.Validate(); err == nil {
+				t.Fatalf("%s: expected validation error, got nil", name)
+			}
+		})
+	}
+}
+
+func TestCanonicalHashExcludesGeneratedAtPlanRunIDAndReportHash(t *testing.T) {
+	a := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_1"},
+		Confidence:    triage.ConfidenceMedium,
+		GeneratedAt:   "2026-05-06T00:00:00Z",
+		ReportHash:    "sha256:placeholder",
+	}
+	hashA, err := a.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash a: %v", err)
+	}
+
+	b := a
+	b.GeneratedAt = "2099-01-01T00:00:00Z"
+	b.PlanRunID = "plan_other"
+	b.ReportHash = "sha256:something_else"
+	hashB, err := b.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash b: %v", err)
+	}
+
+	if hashA != hashB {
+		t.Fatalf("CanonicalHash must exclude generated_at, plan_run_id, report_hash. got %q vs %q", hashA, hashB)
+	}
+}
+
+func TestCanonicalHashChangesWhenContentChanges(t *testing.T) {
+	base := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_1"},
+		Confidence:    triage.ConfidenceMedium,
+		GeneratedAt:   "t",
+		ReportHash:    "h",
+	}
+	h1, _ := base.CanonicalHash()
+
+	mutated := base
+	mutated.IncidentRef.ID = "inc_2"
+	h2, _ := mutated.CanonicalHash()
+	if h1 == h2 {
+		t.Fatalf("hash must change when incident_ref.id changes")
+	}
+}
+
+func TestCanonicalHashFormat(t *testing.T) {
+	r := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_1"},
+		Confidence:    triage.ConfidenceLow,
+		GeneratedAt:   "t",
+		ReportHash:    "h",
+	}
+	h, err := r.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash: %v", err)
+	}
+	if !strings.HasPrefix(h, "sha256:") {
+		t.Fatalf("hash should be prefixed with sha256:, got %q", h)
+	}
+	if len(h) != len("sha256:")+64 {
+		t.Fatalf("hash length wrong: got %d (%q)", len(h), h)
+	}
+}
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index c8dd8f2..1629b72 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -54,9 +54,13 @@ type incident struct {
 	Status      string      `json:"status"`
 }
 
+type triageReport struct {
+	ReportHash string `json:"report_hash"`
+}
+
 func main() {
 	if len(os.Args) != 2 {
-		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id>")
+		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id|triage-report-hash>")
 		os.Exit(2)
 	}
 
@@ -85,6 +89,8 @@ func main() {
 		}
 	case "first-incident-id":
 		fmt.Println(firstIncidentID(body))
+	case "triage-report-hash":
+		fmt.Println(triageReportHash(body))
 	default:
 		fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
 		os.Exit(2)
@@ -183,3 +189,11 @@ func isPaymentFamily(f errorFamily) bool {
 		f.Step == "payment.charge" &&
 		f.ErrorCode == "PMT_502"
 }
+
+func triageReportHash(body []byte) string {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return ""
+	}
+	return rep.ReportHash
+}
diff --git a/scripts/demo-acceptance-json/main_test.go b/scripts/demo-acceptance-json/main_test.go
index 7b3dfd6..8619a77 100644
--- a/scripts/demo-acceptance-json/main_test.go
+++ b/scripts/demo-acceptance-json/main_test.go
@@ -21,3 +21,13 @@ func TestDependencyIncidentHelpers(t *testing.T) {
 		t.Fatalf("firstIncidentID = %q, want inc_123", got)
 	}
 }
+
+func TestTriageReportHash(t *testing.T) {
+	body := []byte(`{"schema_version":"triage.v1","incident_ref":{"id":"inc_x"},"confidence":"medium","generated_at":"t","report_hash":"sha256:deadbeef"}`)
+	if got := triageReportHash(body); got != "sha256:deadbeef" {
+		t.Fatalf("triageReportHash = %q, want sha256:deadbeef", got)
+	}
+	if got := triageReportHash([]byte(`{not-json`)); got != "" {
+		t.Fatalf("malformed input should return empty, got %q", got)
+	}
+}
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index 590b315..b40edb1 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -43,6 +43,10 @@ json_first_incident_id() {
   "$JSON_BIN" first-incident-id
 }
 
+json_triage_report_hash() {
+  "$JSON_BIN" triage-report-hash
+}
+
 if [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
   fail "demo stack is not running. Start it with: make demo"
 fi
@@ -131,4 +135,15 @@ snapshot="$("${CLI[@]}" incident "$incident_id" --snapshot)" || fail "waylog inc
 [[ "$snapshot" == *"payment.charge"* ]] || fail "incident snapshot did not mention payment.charge"
 echo "PASS: waylog incident snapshot"
 
+triage_a="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage failed for incident $incident_id"
+hash_a="$(json_triage_report_hash <<<"$triage_a")"
+[[ -n "$hash_a" ]] || fail "triage report_hash A is empty"
+
+triage_b="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage second run failed for incident $incident_id"
+hash_b="$(json_triage_report_hash <<<"$triage_b")"
+[[ -n "$hash_b" ]] || fail "triage report_hash B is empty"
+
+[[ "$hash_a" == "$hash_b" ]] || fail "triage report_hash unstable across runs: A=$hash_a B=$hash_b"
+echo "PASS: waylog triage stable report_hash=$hash_a"
+
 echo "Demo acceptance passed."

From a605ec85d109986d4907b540294cbf33be61cb79 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Thu, 7 May 2026 19:29:06 -0400
Subject: [PATCH 06/14] feat: added incident rebuild and provider credibility
 layer

Shipped the M2 credibility layer for incident triage.

- refactored incident ticking into derive/apply paths
- added startup-only hot-window incident rebuild from schema-2.0 WAL
- added atomic ReplaceNonResolved for incident stores
- preserve live tick per-row behavior while rebuild uses atomic replacement
- added rebuild metrics and max-event safety cap
- added runtime incident cause classification and next checks
- added provider-neutral LLM selection with explicit none mode
- make Ask missing-provider errors provider-agnostic
- expose llm and incidents rebuild state in /v1/capabilities
- updated README, env docs, and OpenAPI for M2 provider/rebuild fields
- add regression coverage for rebuild, runtime cause, provider selection, and capabilities
---
 README.md                                 |   9 +-
 cmd/ingest/main.go                        | 114 ++++++++---
 docs/env.md                               |  16 +-
 docs/openapi.yaml                         |  35 ++++
 internal/cli/root.go                      |  28 +--
 internal/coldstore/incident_store.go      |  41 +++-
 internal/coldstore/incident_store_test.go |  77 ++++++++
 internal/incidents/classifier.go          |  24 +++
 internal/incidents/classifier_test.go     | 138 ++++++++++++++
 internal/incidents/engine.go              | 219 +++++++++++++++++++++-
 internal/incidents/engine_test.go         | 130 +++++++++++++
 internal/incidents/nextchecks.go          |   7 +
 internal/incidents/rebuild.go             |  40 ++++
 internal/incidents/store.go               |  15 ++
 internal/incidents/types.go               |   1 +
 internal/ingest/handler.go                | 198 ++++++++++---------
 internal/ingest/handler_test.go           | 175 +++++++++++++++++
 internal/llm/provider.go                  |  75 ++++++++
 internal/llm/provider_test.go             | 160 ++++++++++++++++
 internal/metrics/metrics.go               |  24 ++-
 20 files changed, 1381 insertions(+), 145 deletions(-)
 create mode 100644 internal/incidents/rebuild.go
 create mode 100644 internal/llm/provider.go
 create mode 100644 internal/llm/provider_test.go

diff --git a/README.md b/README.md
index e5b7712..66893ff 100644
--- a/README.md
+++ b/README.md
@@ -297,8 +297,9 @@ Public alpha. APIs may break before 1.0.
 - hot graph with flattened 3-node model + dedicated trace store
 - schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
 - SQLite cold store (events, deployments, signals, incidents, causal claims)
-- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, and dashboard incident cards
-- 10 deterministic analysis tools, rollup-correct root-cause attribution
+- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
+- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER`; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- 11 deterministic analysis tools, rollup-correct root-cause attribution
 - agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
 - `/v1/traces/story` and indented failure-path rendering in the dashboard
 - dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
@@ -319,8 +320,8 @@ Public alpha. APIs may break before 1.0.
 - OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
 - Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
 - SQLite cold store fits demos and small deployments; not sized for production-scale retention.
-- Signal and incident records are SQLite-backed; they do not use the event WAL/replay path.
-- Incident cause classification is deterministic and heuristic. `runtime` signals are accepted but do not produce a `runtime` cause label yet.
+- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
+- Incident cause classification is deterministic and heuristic.
 - No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
 - No multi-tenancy. One instance = one trust boundary.
 - No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 7d76092..579e3a6 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -29,6 +29,7 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/incidents"
 	"github.com/sssmaran/WaylogCLI/internal/ingest"
 	ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
+	"github.com/sssmaran/WaylogCLI/internal/llm"
 	"github.com/sssmaran/WaylogCLI/internal/mcp/stdio"
 	"github.com/sssmaran/WaylogCLI/internal/metrics"
 	otelhttp "github.com/sssmaran/WaylogCLI/internal/otel"
@@ -150,6 +151,10 @@ func main() {
 	causalInterval := config.GetenvDuration("CAUSAL_INTERVAL", 30*time.Second)
 
 	trustProxy := config.GetenvBool("WAYLOG_TRUST_PROXY", false)
+	if _, err := llm.SelectFromEnv(); err != nil {
+		slog.Error("LLM provider config error", "err", err)
+		os.Exit(1)
+	}
 
 	dedupCache := ingest.NewDedupCache()
 	planStore := ingest.NewPlanStore()
@@ -236,28 +241,31 @@ func main() {
 
 	// Create ingest server with the store
 	ingestServer := ingest.NewServer(ingest.ServerConfig{
-		Store:               graphStore,
-		TraceStore:          traceStore,
-		MaxBodyBytes:        maxBody,
-		EventLogDir:         eventLogDir,
-		Metrics:             m,
-		StartTime:           time.Now(),
-		AskRegistry:         reg,
-		AskMaxStepsDefault:  askMaxStepsDefault,
-		AskMaxStepsMax:      askMaxStepsMax,
-		DashboardRefreshSec: dashboardRefreshSec,
-		PrometheusURL:       prometheusURL,
-		GrafanaURL:          grafanaURL,
-		GraphUI:             graphUI,
-		DedupCache:          dedupCache,
-		AgentKey:            agentKey,
-		TrustProxy:          trustProxy,
-		ColdWriter:          coldWriter,
-		ColdStore:           coldDB,
-		PlanStore:           planStore,
-		GraphHotWindow:      graphHotWindow,
-		OTLPEnabled:         otlpEnabled,
-		V2ReadsEnabled:      v2ReadsEnabled,
+		Store:                    graphStore,
+		TraceStore:               traceStore,
+		MaxBodyBytes:             maxBody,
+		EventLogDir:              eventLogDir,
+		Metrics:                  m,
+		StartTime:                time.Now(),
+		AskRegistry:              reg,
+		AskMaxStepsDefault:       askMaxStepsDefault,
+		AskMaxStepsMax:           askMaxStepsMax,
+		DashboardRefreshSec:      dashboardRefreshSec,
+		PrometheusURL:            prometheusURL,
+		GrafanaURL:               grafanaURL,
+		GraphUI:                  graphUI,
+		DedupCache:               dedupCache,
+		AgentKey:                 agentKey,
+		TrustProxy:               trustProxy,
+		ColdWriter:               coldWriter,
+		ColdStore:                coldDB,
+		PlanStore:                planStore,
+		GraphHotWindow:           graphHotWindow,
+		OTLPEnabled:              otlpEnabled,
+		V2ReadsEnabled:           v2ReadsEnabled,
+		IncidentsEnabled:         v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+		IncidentsPersistent:      v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+		IncidentRebuildSupported: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
 	})
 
 	// SSE hub for real-time dashboard updates
@@ -428,6 +436,66 @@ func main() {
 					slog.Error("incident engine bootstrap failed", "err", err)
 					os.Exit(1)
 				}
+				if config.GetenvBool("WAYLOG_REBUILD_INCIDENTS_ON_START", false) {
+					rebuildMaxEvents := config.GetenvInt("WAYLOG_INCIDENT_REBUILD_MAX_EVENTS", 250000)
+					if rebuildMaxEvents <= 0 {
+						rebuildMaxEvents = 250000
+					}
+					replayWindow := graphHotWindow
+					if minWindow := 2 * incidentCfg.Window; minWindow > replayWindow {
+						replayWindow = minWindow
+					}
+					replaySince := time.Now().UTC().Add(-replayWindow)
+					seed := incidentEngine.SnapshotActive()
+					for _, inc := range seed {
+						if inc.StartedAt.Before(replaySince) {
+							slog.Info("incident continuity broken: started_at older than WAL retention",
+								"incident_id", inc.IncidentID,
+								"started_at", inc.StartedAt,
+								"replay_since", replaySince,
+							)
+							break
+						}
+					}
+					tempIndex := ingestv2.NewRecentIndex(nil)
+					tempDedup := ingestv2.NewDedup(dedupCapacity, nil)
+					tempProjector := ingestv2.NewProjector(tempIndex)
+					replay, err := ingestv2.ReplayWAL(eventLogV2Dir, tempDedup, tempProjector, replaySince, m)
+					if err != nil {
+						m.IncidentRebuildFailures.Inc()
+						slog.Error("incident rebuild WAL replay failed", "err", err)
+						os.Exit(1)
+					}
+					m.IncidentRebuildReplayed.Add(float64(replay.Projected))
+					if replay.Projected > rebuildMaxEvents {
+						m.IncidentRebuildFailures.Inc()
+						slog.Error("incident rebuild replay exceeded max events", "projected", replay.Projected, "max_events", rebuildMaxEvents)
+						os.Exit(1)
+					}
+					if replay.Projected == 0 {
+						if len(seed) > 0 {
+							slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+						}
+					} else {
+						result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{
+							Engine: incidentEngine,
+							Reader: incidentReaderAdapter{reader: ingestv2.NewReader(tempIndex)},
+							Now:    time.Now,
+						})
+						if err != nil {
+							m.IncidentRebuildFailures.Inc()
+							slog.Error("incident rebuild failed", "err", err)
+							os.Exit(1)
+						}
+						m.IncidentRebuildDuration.Observe(result.Duration.Seconds())
+						m.IncidentRebuildRows.Add(float64(result.RowsReplaced))
+						slog.Info("incident rebuild complete",
+							"replayed_events", replay.Projected,
+							"rows_replaced", result.RowsReplaced,
+							"duration", result.Duration,
+						)
+					}
+				}
 				incidentHandler := incidents.NewHandler(incidentEngine)
 				mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
 				mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
@@ -463,7 +531,7 @@ func main() {
 				incidentRunning = true
 				slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
 			} else {
-				slog.Info("incident engine disabled: SQLITE_PATH is not set")
+				slog.Warn("incidents requested but SQLite not configured; running without incidents")
 			}
 		}
 	} else {
diff --git a/docs/env.md b/docs/env.md
index db9afac..58953f6 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -6,7 +6,19 @@ Reference for configuring the Waylog ingest server and SDK. All variables are re
 
 | Variable | Purpose |
 |---|---|
-| `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Required when server-side Ask/tool flows use Gemini |
+| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. For the current Gemini provider, set `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+
+## LLM provider
+
+Deterministic tools, plans, triage, MCP, and read APIs do not require an LLM provider. The provider is only used by natural-language Ask flows. If Ask cannot construct the selected provider, it returns a provider-agnostic "LLM provider not configured" error.
+
+| Variable | Default | Purpose |
+|---|---|---|
+| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values in M2: `none`, `gemini` |
+| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. For Gemini, this takes precedence over `GEMINI_MODEL` |
+| `GEMINI_MODEL` | `gemini-2.5-flash` | Gemini-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `GEMINI_API_BASE` | Gemini API default | Gemini-specific API base URL override |
+| `GEMINI_TOOL_MODE` | `text` | Gemini-specific tool-calling mode |
 
 ## Auth
 
@@ -65,6 +77,8 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
 | `WAYLOG_INCIDENT_RESOLVE_AFTER` | `2m` | Time without renewed matching failures before a recovering incident resolves |
 | `WAYLOG_DEPLOY_CORRELATION_WINDOW` | `15m` | Window used to attach deploy signals and deployment records as incident evidence |
 | `WAYLOG_INCIDENT_SAMPLE_LIMIT` | `5` | Maximum persisted sample traces per incident |
+| `WAYLOG_REBUILD_INCIDENTS_ON_START` | `false` | Rebuild non-resolved incident rows at startup from the schema-2.0 WAL hot window plus signals |
+| `WAYLOG_INCIDENT_REBUILD_MAX_EVENTS` | `250000` | Safety cap for startup incident rebuild replay |
 | `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
 | `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
 | `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index d60eb4d..23f1e8c 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -1849,6 +1849,26 @@ components:
         ask:
           type: object
           additionalProperties: true
+        llm:
+          type: object
+          description: Provider-neutral Ask configuration and runtime state.
+          properties:
+            provider:
+              type: string
+              description: Resolved Ask provider. `custom` is used for injected providers.
+              enum: [none, gemini, custom]
+            model:
+              type: string
+              description: Resolved model for the selected provider, or empty when Ask is disabled.
+            tool_mode:
+              type: string
+              description: Resolved tool-calling mode for the selected provider, or empty when Ask is disabled.
+            configured:
+              type: boolean
+              description: True when a provider was explicitly selected or inferred from credentials.
+            ask_enabled:
+              type: boolean
+              description: True when Ask has a provider implementation available.
         dashboard:
           type: object
           additionalProperties: true
@@ -1867,6 +1887,21 @@ components:
           properties:
             enabled:
               type: boolean
+        incidents:
+          type: object
+          properties:
+            enabled:
+              type: boolean
+            persistent:
+              type: boolean
+            rebuild:
+              type: object
+              properties:
+                supported:
+                  type: boolean
+                scope:
+                  type: string
+                  enum: ["", hot-window]
         architecture:
           type: object
           additionalProperties: true
diff --git a/internal/cli/root.go b/internal/cli/root.go
index 42128b2..069b1b7 100644
--- a/internal/cli/root.go
+++ b/internal/cli/root.go
@@ -130,19 +130,16 @@ func handleAsk(store tools.Store, args []string) {
 		return
 	}
 
-	apiKey := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
-	if apiKey == "" {
-		apiKey = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+	sel, err := llm.SelectFromEnv()
+	if err != nil {
+		fmt.Println(err)
+		return
 	}
-	if apiKey == "" {
-		fmt.Println("GEMINI_API_KEY (or GOOGLE_API_KEY) is required")
+	if !sel.AskEnabled {
+		fmt.Println(llm.ErrProviderNotConfigured)
 		return
 	}
 
-	model := strings.TrimSpace(os.Getenv("GEMINI_MODEL"))
-	baseURL := strings.TrimSpace(os.Getenv("GEMINI_API_BASE"))
-	toolMode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE"))
-
 	reg := tools.NewRegistry()
 	if err := tools.RegisterGraphTools(reg); err != nil {
 		fmt.Println("tool registry error:", err)
@@ -158,18 +155,7 @@ func handleAsk(store tools.Store, args []string) {
 		})
 	}
 
-	client := llm.NewGeminiClient(apiKey)
-	if model != "" {
-		client.Model = model
-	}
-	if baseURL != "" {
-		client.BaseURL = baseURL
-	}
-	if toolMode != "" {
-		client.ToolMode = toolMode
-	}
-
-	answer, _, err := llm.Ask(context.Background(), client, toolDefs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) {
+	answer, _, err := llm.Ask(context.Background(), sel.Impl, toolDefs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) {
 		return reg.Call(ctx, store, name, params)
 	}), prompt, llm.AskOptions{MaxSteps: 5})
 	if err != nil {
diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go
index b6a3160..9a05201 100644
--- a/internal/coldstore/incident_store.go
+++ b/internal/coldstore/incident_store.go
@@ -20,6 +20,43 @@ func NewIncidentStore(db *SQLiteStore) *IncidentStore {
 }
 
 func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) error {
+	if err := upsertIncident(ctx, s.db.writer, inc); err != nil {
+		return fmt.Errorf("coldstore upsert incident: %w", err)
+	}
+	return nil
+}
+
+func (s *IncidentStore) ReplaceNonResolved(ctx context.Context, rows []incidents.Incident) error {
+	tx, err := s.db.writer.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelSerializable})
+	if err != nil {
+		return fmt.Errorf("coldstore replace incidents begin: %w", err)
+	}
+	committed := false
+	defer func() {
+		if !committed {
+			_ = tx.Rollback()
+		}
+	}()
+	if _, err := tx.ExecContext(ctx, `DELETE FROM incidents WHERE status != ?`, string(incidents.StatusResolved)); err != nil {
+		return fmt.Errorf("coldstore replace incidents delete: %w", err)
+	}
+	for _, inc := range rows {
+		if err := upsertIncident(ctx, tx, inc); err != nil {
+			return fmt.Errorf("coldstore replace incident %s: %w", inc.IncidentID, err)
+		}
+	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("coldstore replace incidents commit: %w", err)
+	}
+	committed = true
+	return nil
+}
+
+type incidentExecer interface {
+	ExecContext(ctx context.Context, query string, args ...any) (sql.Result, error)
+}
+
+func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.Incident) error {
 	topServices, err := jsonText(inc.TopServices)
 	if err != nil {
 		return fmt.Errorf("coldstore incident top services: %w", err)
@@ -40,7 +77,7 @@ func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) erro
 	if err != nil {
 		return fmt.Errorf("coldstore incident warnings: %w", err)
 	}
-	_, err = s.db.writer.ExecContext(ctx, `
+	_, err = execer.ExecContext(ctx, `
 		INSERT INTO incidents (
 			incident_id, env, service, error_service, error_step, error_code,
 			status, cause, confidence, severity, started_at, updated_at, last_seen_at,
@@ -76,7 +113,7 @@ func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) erro
 		topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount,
 	)
 	if err != nil {
-		return fmt.Errorf("coldstore upsert incident: %w", err)
+		return err
 	}
 	return nil
 }
diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go
index 10cdfe8..9853f3b 100644
--- a/internal/coldstore/incident_store_test.go
+++ b/internal/coldstore/incident_store_test.go
@@ -83,3 +83,80 @@ func TestIncidentStoreRoundtripAndPrune(t *testing.T) {
 		t.Fatalf("expected not found, got %v", err)
 	}
 }
+
+func TestIncidentStoreReplaceNonResolved(t *testing.T) {
+	ctx := context.Background()
+	managed, err := Open(":memory:")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer managed.Close()
+	store := NewIncidentStore(managed.(*SQLiteStore))
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+
+	oldActive := testColdIncident("inc_old_active", incidents.StatusActive, now.Add(-20*time.Minute))
+	oldRecovering := testColdIncident("inc_old_recovering", incidents.StatusRecovering, now.Add(-15*time.Minute))
+	preservedResolved := testColdIncident("inc_preserved_resolved", incidents.StatusResolved, now.Add(-30*time.Minute))
+	overwrittenResolved := testColdIncident("inc_overwritten_resolved", incidents.StatusResolved, now.Add(-25*time.Minute))
+	for _, inc := range []incidents.Incident{oldActive, oldRecovering, preservedResolved, overwrittenResolved} {
+		if err := store.Upsert(ctx, inc); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	newActive := testColdIncident("inc_new_active", incidents.StatusActive, now)
+	replacement := testColdIncident("inc_overwritten_resolved", incidents.StatusActive, now)
+	if err := store.ReplaceNonResolved(ctx, []incidents.Incident{newActive, replacement}); err != nil {
+		t.Fatal(err)
+	}
+
+	active, err := store.ListActive(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	gotActive := map[string]incidents.Status{}
+	for _, inc := range active {
+		gotActive[inc.IncidentID] = inc.Status
+	}
+	if _, ok := gotActive["inc_old_active"]; ok {
+		t.Fatalf("old active row preserved unexpectedly: %+v", gotActive)
+	}
+	if _, ok := gotActive["inc_old_recovering"]; ok {
+		t.Fatalf("old recovering row preserved unexpectedly: %+v", gotActive)
+	}
+	if gotActive["inc_new_active"] != incidents.StatusActive || gotActive["inc_overwritten_resolved"] != incidents.StatusActive {
+		t.Fatalf("active rows after replace=%+v", gotActive)
+	}
+	if got, err := store.Get(ctx, "inc_preserved_resolved"); err != nil || got.Status != incidents.StatusResolved {
+		t.Fatalf("preserved resolved row got=%+v err=%v", got, err)
+	}
+}
+
+func testColdIncident(id string, status incidents.Status, at time.Time) incidents.Incident {
+	resolvedAt := at.Add(time.Minute)
+	inc := incidents.Incident{
+		IncidentID:       id,
+		Env:              "prod",
+		Service:          "checkout",
+		ErrorFamily:      apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		Status:           status,
+		Cause:            incidents.CauseDependency,
+		Confidence:       incidents.ConfidenceHigh,
+		Severity:         8,
+		StartedAt:        at,
+		UpdatedAt:        at,
+		LastSeenAt:       at,
+		AffectedRequests: 9,
+		AffectedServices: 2,
+		TopServices:      []string{"checkout", "payment"},
+		SampleTraces:     []string{"trace-a"},
+		Evidence:         []incidents.Evidence{{Kind: incidents.EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: at}},
+		NextChecks:       []string{"check downstream"},
+		Lift:             9,
+		CurrentCount:     9,
+	}
+	if status == incidents.StatusResolved {
+		inc.ResolvedAt = &resolvedAt
+	}
+	return inc
+}
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
index 8cde790..5f10c66 100644
--- a/internal/incidents/classifier.go
+++ b/internal/incidents/classifier.go
@@ -52,6 +52,10 @@ func Classify(input ClassificationInput) Classification {
 		evidence = append(evidence, signalEvidence(*sig, "Deploy signal overlaps incident window"))
 		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
 	}
+	if sig := matchingRuntimeSignal(input); sig != nil {
+		evidence = append(evidence, signalEvidence(*sig, "Runtime signal overlaps incident window"))
+		return classification(CauseRuntime, ConfidenceHigh, evidence, warnings)
+	}
 	if len(input.Events) > 0 && input.Incident.ErrorFamily.Step != "" && firstFailingDownstream(input.Events) == "" {
 		return classification(CauseApp, ConfidenceMedium, evidence, warnings)
 	}
@@ -101,6 +105,26 @@ func matchingDeployment(input ClassificationInput) *Deployment {
 	return nil
 }
 
+func matchingRuntimeSignal(input ClassificationInput) *signals.Signal {
+	start := input.Incident.StartedAt
+	lo := start.Add(-5 * time.Minute)
+	hi := start.Add(time.Minute)
+	for i := range input.Signals {
+		sig := input.Signals[i]
+		if sig.Type != signals.TypeRuntime && sig.Type != signals.TypeHealthcheck {
+			continue
+		}
+		if sig.Service != input.Incident.Service {
+			continue
+		}
+		if sig.Timestamp.Before(lo) || sig.Timestamp.After(hi) {
+			continue
+		}
+		return &input.Signals[i]
+	}
+	return nil
+}
+
 func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal {
 	version := sampleVersion(input.Events)
 	for i := range input.Signals {
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
index 348787a..5c5a16d 100644
--- a/internal/incidents/classifier_test.go
+++ b/internal/incidents/classifier_test.go
@@ -64,3 +64,141 @@ func TestClassifierRules(t *testing.T) {
 		}
 	})
 }
+
+func TestClassifierRuntime(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	checkoutEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "cart.validate", "CHK_500", "")
+	paymentEvent := testIncidentEvent("e2", "trace-b", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+	runtimeSig := signals.Signal{
+		SignalID:  "sig_rt",
+		Type:      signals.TypeRuntime,
+		Service:   "checkout",
+		Env:       "prod",
+		Reason:    "container restarted",
+		Severity:  signals.SeverityWarning,
+		Timestamp: now.Add(-time.Minute),
+	}
+
+	t.Run("runtime signal in window", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{runtimeSig},
+		})
+		if got.Cause != CauseRuntime || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+		found := false
+		for _, ev := range got.Evidence {
+			if ev.SignalID == "sig_rt" {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("runtime signal evidence missing: %+v", got.Evidence)
+		}
+	})
+
+	t.Run("healthcheck signal in window", func(t *testing.T) {
+		sig := runtimeSig
+		sig.SignalID = "sig_hc"
+		sig.Type = signals.TypeHealthcheck
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{sig},
+		})
+		if got.Cause != CauseRuntime || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("alert with OOM reason does not classify runtime", func(t *testing.T) {
+		sig := runtimeSig
+		sig.Type = signals.TypeAlert
+		sig.Reason = "OOM kill"
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{sig},
+		})
+		if got.Cause == CauseRuntime {
+			t.Fatalf("alert signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("runtime signal outside window", func(t *testing.T) {
+		sig := runtimeSig
+		sig.Timestamp = now.Add(-6 * time.Minute)
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{sig},
+		})
+		if got.Cause == CauseRuntime {
+			t.Fatalf("out-of-window signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("runtime signal for different service", func(t *testing.T) {
+		sig := runtimeSig
+		sig.Service = "payment"
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{sig},
+		})
+		if got.Cause == CauseRuntime {
+			t.Fatalf("foreign-service signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("deploy beats runtime", func(t *testing.T) {
+		deploySig := signals.Signal{
+			SignalID:  "sig_dep",
+			Type:      signals.TypeDeploy,
+			Service:   "checkout",
+			Env:       "prod",
+			Severity:  signals.SeverityWarning,
+			Timestamp: now.Add(-time.Minute),
+		}
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{checkoutEvent},
+			Signals:  []signals.Signal{runtimeSig, deploySig},
+		})
+		if got.Cause != CauseDeploy {
+			t.Fatalf("expected deploy, got %+v", got)
+		}
+	})
+
+	t.Run("dependency beats runtime", func(t *testing.T) {
+		depSig := signals.Signal{
+			SignalID:  "sig_depy",
+			Type:      signals.TypeDependency,
+			Service:   "payment",
+			Env:       "prod",
+			Reason:    "upstream_5xx",
+			Severity:  signals.SeverityCritical,
+			Timestamp: now.Add(-time.Minute),
+		}
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{paymentEvent},
+			Signals:  []signals.Signal{runtimeSig, depSig},
+		})
+		if got.Cause != CauseDependency {
+			t.Fatalf("expected dependency, got %+v", got)
+		}
+	})
+}
+
+func TestNextChecksRuntime(t *testing.T) {
+	got := NextChecks(CauseRuntime, ConfidenceHigh)
+	if len(got) == 0 {
+		t.Fatalf("expected non-empty next checks for runtime cause")
+	}
+}
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
index 11162c9..4dd9f6c 100644
--- a/internal/incidents/engine.go
+++ b/internal/incidents/engine.go
@@ -134,17 +134,36 @@ func (e *Engine) Tick(ctx context.Context) error {
 		defer func() { e.metrics.IncidentTickLatency.Observe(time.Since(start).Seconds()) }()
 	}
 	now := e.now().UTC()
+	rows, err := e.derive(ctx, now, e.SnapshotActive(), e.reader)
+	if err != nil {
+		return err
+	}
+	return e.ApplyLive(ctx, rows)
+}
+
+// derivedRow carries the derivation output plus whether the row was already
+// in the seed (used by ApplyLive to distinguish Opened vs Updated metrics).
+type derivedRow struct {
+	Incident Incident
+	Existed  bool
+}
+
+// derive computes the full set of incident rows for the cycle from the seed +
+// reader without touching e.active or the store. Used by both live Tick and
+// startup Rebuild.
+func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Incident, reader Reader) ([]derivedRow, error) {
 	currentStart := now.Add(-e.cfg.Window)
 	baselineStart := now.Add(-2 * e.cfg.Window)
 	statuses := failedStatuses()
-	current := e.reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
-	baseline := e.reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
+	current := reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
+	baseline := reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
 	baselineByFamily := map[string]int{}
 	for _, row := range baseline.Rows {
 		baselineByFamily[familyKey(row.ErrorFamily)] = row.Count
 	}
 
 	seen := map[string]struct{}{}
+	out := make([]derivedRow, 0, len(current.Rows))
 	for _, row := range current.Rows {
 		if row.Count < e.cfg.MinCount {
 			continue
@@ -154,25 +173,126 @@ func (e *Engine) Tick(ctx context.Context) error {
 		if baselineCount > 0 && lift < e.cfg.MinLift {
 			continue
 		}
-		inc, err := e.buildIncident(ctx, row, baselineCount, lift, currentStart, now)
+		inc, existed, err := e.buildIncidentFromSeed(ctx, seed, reader, row, baselineCount, lift, currentStart, now)
 		if err != nil {
-			return err
+			return nil, err
 		}
 		seen[inc.IncidentID] = struct{}{}
-		if err := e.store.Upsert(ctx, inc); err != nil {
+		out = append(out, derivedRow{Incident: inc, Existed: existed})
+	}
+	out = append(out, e.deriveMissing(seed, seen, now)...)
+	return out, nil
+}
+
+// deriveMissing emits transitions for seed rows absent from the current cycle:
+// active → recovering, and recovering → resolved once LastSeenAt is older
+// than ResolveAfter. Mirrors the previous transitionMissing semantics.
+func (e *Engine) deriveMissing(seed map[string]Incident, seen map[string]struct{}, now time.Time) []derivedRow {
+	out := make([]derivedRow, 0)
+	for _, inc := range seed {
+		if _, ok := seen[inc.IncidentID]; ok {
+			continue
+		}
+		switch inc.Status {
+		case StatusActive:
+			row := cloneIncident(inc)
+			row.Status = StatusRecovering
+			t := now
+			row.RecoveringAt = &t
+			row.UpdatedAt = now
+			out = append(out, derivedRow{Incident: row, Existed: true})
+		case StatusRecovering:
+			if now.Sub(inc.LastSeenAt) >= e.cfg.ResolveAfter {
+				row := cloneIncident(inc)
+				row.Status = StatusResolved
+				t := now
+				row.ResolvedAt = &t
+				row.UpdatedAt = now
+				out = append(out, derivedRow{Incident: row, Existed: true})
+			}
+		}
+	}
+	return out
+}
+
+// ApplyLive persists derived rows for a live tick: per-row Upsert, in-memory
+// cache update, and per-transition metric increments matching pre-refactor
+// Tick behavior.
+func (e *Engine) ApplyLive(ctx context.Context, rows []derivedRow) error {
+	for _, dr := range rows {
+		if err := e.store.Upsert(ctx, dr.Incident); err != nil {
 			return err
 		}
-		e.remember(inc)
+		switch dr.Incident.Status {
+		case StatusResolved:
+			e.forget(dr.Incident.IncidentID)
+			if e.metrics != nil {
+				e.metrics.IncidentResolved.Inc()
+			}
+		case StatusRecovering:
+			e.remember(dr.Incident)
+			if dr.Existed {
+				if e.metrics != nil {
+					e.metrics.IncidentRecovered.Inc()
+				}
+			}
+		default:
+			e.remember(dr.Incident)
+			if e.metrics != nil {
+				if dr.Existed {
+					e.metrics.IncidentUpdated.Inc()
+				} else {
+					e.metrics.IncidentOpened.Inc()
+				}
+			}
+		}
 	}
-	if err := e.transitionMissing(ctx, seen, now); err != nil {
+	if e.metrics != nil {
+		e.metrics.IncidentActive.Set(float64(e.activeCount()))
+	}
+	return nil
+}
+
+// ApplyRebuild atomically replaces non-resolved store rows with the derived
+// set, then reloads the in-memory cache from the store. ApplyRebuild owns
+// cache reload; do NOT call Bootstrap after it. Per-row Opened/Updated/
+// Recovered/Resolved counters are intentionally not incremented here —
+// rebuild metrics live in main.go.
+func (e *Engine) ApplyRebuild(ctx context.Context, rows []derivedRow) error {
+	incs := make([]Incident, 0, len(rows))
+	for _, dr := range rows {
+		incs = append(incs, dr.Incident)
+	}
+	if err := e.store.ReplaceNonResolved(ctx, incs); err != nil {
+		return err
+	}
+	active, err := e.store.ListActive(ctx)
+	if err != nil {
 		return err
 	}
+	e.mu.Lock()
+	e.active = make(map[string]Incident, len(active))
+	for _, inc := range active {
+		e.active[inc.IncidentID] = cloneIncident(inc)
+	}
+	e.mu.Unlock()
 	if e.metrics != nil {
-		e.metrics.IncidentActive.Set(float64(e.activeCount()))
+		e.metrics.IncidentActive.Set(float64(len(active)))
 	}
 	return nil
 }
 
+// SnapshotActive returns a deep clone of the in-memory active map.
+func (e *Engine) SnapshotActive() map[string]Incident {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	out := make(map[string]Incident, len(e.active))
+	for id, inc := range e.active {
+		out[id] = cloneIncident(inc)
+	}
+	return out
+}
+
 func (e *Engine) Active(ctx context.Context) ([]Incident, error) {
 	rows, err := e.store.ListActive(ctx)
 	if err != nil {
@@ -265,6 +385,69 @@ func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baseline
 	return inc, nil
 }
 
+func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Incident, reader Reader, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, bool, error) {
+	events := sampleEventsFromReader(reader, row.ErrorFamily, since, now, 200)
+	startedAt := earliestEventTime(events, now)
+	env := firstEventEnv(events)
+	if existing, ok := findByFamilyIn(seed, env, row.ErrorFamily); ok {
+		startedAt = existing.StartedAt
+	}
+	id := StableID(env, row.ErrorFamily, startedAt)
+	existing, hadExisting := getCachedIn(seed, id)
+	if !hadExisting {
+		if prior, ok := findByFamilyIn(seed, env, row.ErrorFamily); ok {
+			existing = prior
+			id = prior.IncidentID
+			hadExisting = true
+		}
+	}
+	blast := reader.BlastRadius(
+		SearchFilter{Since: since, Until: now},
+		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
+	)
+	sigs, err := e.querySignals(ctx, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
+		return Incident{}, false, err
+	}
+	deploys, err := e.queryDeploys(ctx, row.ErrorFamily.Service, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil {
+		return Incident{}, false, err
+	}
+	inc := Incident{
+		IncidentID:       id,
+		Env:              env,
+		Service:          row.ErrorFamily.Service,
+		ErrorFamily:      row.ErrorFamily,
+		Status:           StatusActive,
+		Severity:         severity(row.Count, blast.AffectedServices, lift),
+		StartedAt:        startedAt,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: blast.AffectedRequests,
+		AffectedUsers:    cloneInt(row.AffectedUsers),
+		AffectedServices: blast.AffectedServices,
+		TopServices:      append([]string(nil), blast.TopServices...),
+		SampleTraces:     stableSamples(existing.SampleTraces, events, e.cfg.SampleLimit),
+		Lift:             lift,
+		BaselineCount:    baselineCount,
+		CurrentCount:     row.Count,
+	}
+	if hadExisting {
+		inc.StartedAt = existing.StartedAt
+		inc.RecoveringAt = nil
+	}
+	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
+	inc.Cause = class.Cause
+	inc.Confidence = class.Confidence
+	inc.Evidence = class.Evidence
+	inc.NextChecks = class.NextChecks
+	inc.InstrumentationWarnings = class.InstrumentationWarnings
+	if e.metrics != nil {
+		e.observeClassification(inc.Cause, inc.Confidence)
+	}
+	return inc, hadExisting, nil
+}
+
 func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}, now time.Time) error {
 	e.mu.RLock()
 	rows := make([]Incident, 0, len(e.active))
@@ -309,7 +492,11 @@ func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}
 }
 
 func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
-	events := e.reader.SearchEvents(SearchFilter{
+	return sampleEventsFromReader(e.reader, f, since, until, limit)
+}
+
+func sampleEventsFromReader(reader Reader, f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
+	events := reader.SearchEvents(SearchFilter{
 		Service:   f.Service,
 		ErrorCode: f.ErrorCode,
 		Since:     since,
@@ -325,6 +512,20 @@ func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit
 	return out
 }
 
+func getCachedIn(seed map[string]Incident, id string) (Incident, bool) {
+	inc, ok := seed[id]
+	return cloneIncident(inc), ok
+}
+
+func findByFamilyIn(seed map[string]Incident, env string, family apiv2.ErrorFamily) (Incident, bool) {
+	for _, inc := range seed {
+		if inc.Env == env && inc.ErrorFamily == family && inc.Status != StatusResolved {
+			return cloneIncident(inc), true
+		}
+	}
+	return Incident{}, false
+}
+
 func (e *Engine) querySignals(ctx context.Context, env string, since, until time.Time) ([]signals.Signal, error) {
 	if e.signals == nil {
 		return nil, nil
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
index 41403bc..6aea07f 100644
--- a/internal/incidents/engine_test.go
+++ b/internal/incidents/engine_test.go
@@ -125,6 +125,136 @@ func TestEngineUsesDownstreamDependencySignal(t *testing.T) {
 	}
 }
 
+func TestDerivePreservesSeedContinuity(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	started := now.Add(-20 * time.Minute)
+	seeded := testIncident(started)
+	seeded.SampleTraces = []string{"trace-seeded"}
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily: testFamily(),
+			Count:       6,
+		}}},
+		blast: apiv2.BlastRadiusResponse{AffectedRequests: 6, AffectedServices: 2, TopServices: []string{"checkout", "payment"}},
+		events: []*eventv2.Event{
+			testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+		},
+	}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{MinCount: 5, SampleLimit: 2}, nil, nil)
+	rows, err := engine.derive(context.Background(), now, map[string]Incident{seeded.IncidentID: seeded}, reader)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("rows=%+v", rows)
+	}
+	got := rows[0].Incident
+	if !got.StartedAt.Equal(started) {
+		t.Fatalf("started_at=%s want %s", got.StartedAt, started)
+	}
+	if len(got.SampleTraces) != 2 || got.SampleTraces[0] != "trace-seeded" || got.SampleTraces[1] != "trace-new" {
+		t.Fatalf("sample_traces=%+v", got.SampleTraces)
+	}
+	if !rows[0].Existed {
+		t.Fatalf("seeded row should be marked existed")
+	}
+}
+
+func TestDeriveMissingTransitions(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	reader := &fakeReader{}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{ResolveAfter: time.Minute}, nil, nil)
+
+	active := testIncident(now.Add(-5 * time.Minute))
+	rows, err := engine.derive(context.Background(), now, map[string]Incident{active.IncidentID: active}, reader)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 || rows[0].Incident.Status != StatusRecovering {
+		t.Fatalf("active missing rows=%+v", rows)
+	}
+
+	recovering := testIncident(now.Add(-5 * time.Minute))
+	recovering.Status = StatusRecovering
+	recovering.LastSeenAt = now.Add(-2 * time.Minute)
+	rows, err = engine.derive(context.Background(), now, map[string]Incident{recovering.IncidentID: recovering}, reader)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 || rows[0].Incident.Status != StatusResolved {
+		t.Fatalf("recovering missing rows=%+v", rows)
+	}
+}
+
+func TestApplyRebuildReplacesStoreAndReloadsCache(t *testing.T) {
+	ctx := context.Background()
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	store := NewMemoryStore()
+	oldActive := testIncident(now.Add(-10 * time.Minute))
+	resolved := testIncident(now.Add(-20 * time.Minute))
+	resolved.IncidentID = "inc_resolved"
+	resolved.Status = StatusResolved
+	resolvedAt := now.Add(-5 * time.Minute)
+	resolved.ResolvedAt = &resolvedAt
+	if err := store.Upsert(ctx, oldActive); err != nil {
+		t.Fatal(err)
+	}
+	if err := store.Upsert(ctx, resolved); err != nil {
+		t.Fatal(err)
+	}
+	engine := NewEngine(&fakeReader{}, nil, nil, store, Config{}, nil, nil)
+	if err := engine.Bootstrap(ctx); err != nil {
+		t.Fatal(err)
+	}
+	newActive := testIncident(now)
+	newActive.IncidentID = "inc_new"
+	if err := engine.ApplyRebuild(ctx, []derivedRow{{Incident: newActive}}); err != nil {
+		t.Fatal(err)
+	}
+	active, err := engine.Active(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 1 || active[0].IncidentID != "inc_new" {
+		t.Fatalf("active after rebuild=%+v", active)
+	}
+	if _, ok := engine.SnapshotActive()["inc_new"]; !ok {
+		t.Fatalf("cache was not reloaded from rebuilt rows")
+	}
+	if _, err := store.Get(ctx, "inc_resolved"); err != nil {
+		t.Fatalf("resolved row should be preserved: %v", err)
+	}
+	if _, err := store.Get(ctx, oldActive.IncidentID); err == nil {
+		t.Fatalf("old non-resolved row should be replaced")
+	}
+}
+
+func TestRebuildOrchestratorUsesRebuildApply(t *testing.T) {
+	ctx := context.Background()
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily: testFamily(),
+			Count:       6,
+		}}},
+		blast: apiv2.BlastRadiusResponse{AffectedRequests: 6, AffectedServices: 2},
+		events: []*eventv2.Event{
+			testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+		},
+	}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{MinCount: 5, SampleLimit: 2}, nil, nil)
+	result, err := Rebuild(ctx, RebuildDeps{Engine: engine, Reader: reader, Now: func() time.Time { return now }})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if result.RowsReplaced != 1 {
+		t.Fatalf("rows_replaced=%d", result.RowsReplaced)
+	}
+	if len(engine.SnapshotActive()) != 1 {
+		t.Fatalf("cache should reflect rebuilt active rows")
+	}
+}
+
 type fakeReader struct {
 	current ErrorsResult
 	base    ErrorsResult
diff --git a/internal/incidents/nextchecks.go b/internal/incidents/nextchecks.go
index b3a6559..9f0d696 100644
--- a/internal/incidents/nextchecks.go
+++ b/internal/incidents/nextchecks.go
@@ -14,6 +14,13 @@ func NextChecks(cause Cause, confidence Confidence) []string {
 			"Inspect retries, timeouts, and circuit-breaker state for the failing step.",
 			"Notify the downstream owner with sample traces and affected service list.",
 		}
+	case CauseRuntime:
+		return []string{
+			"Check the service for recent restarts or crashloops.",
+			"Inspect memory and CPU usage for OOM kills or resource pressure.",
+			"Review readiness and liveness probe results around the incident start.",
+			"Verify node and task health for the affected service instances.",
+		}
 	case CauseApp:
 		return []string{
 			"Inspect the first failing step and recent application logs.",
diff --git a/internal/incidents/rebuild.go b/internal/incidents/rebuild.go
new file mode 100644
index 0000000..1abd0cc
--- /dev/null
+++ b/internal/incidents/rebuild.go
@@ -0,0 +1,40 @@
+package incidents
+
+import (
+	"context"
+	"fmt"
+	"time"
+)
+
+type RebuildDeps struct {
+	Engine *Engine
+	Reader Reader
+	Now    func() time.Time
+}
+
+type RebuildResult struct {
+	RowsReplaced int
+	Duration     time.Duration
+}
+
+func Rebuild(ctx context.Context, deps RebuildDeps) (RebuildResult, error) {
+	if deps.Engine == nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: engine required")
+	}
+	if deps.Reader == nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: reader required")
+	}
+	nowFn := deps.Now
+	if nowFn == nil {
+		nowFn = time.Now
+	}
+	start := time.Now()
+	rows, err := deps.Engine.derive(ctx, nowFn().UTC(), deps.Engine.SnapshotActive(), deps.Reader)
+	if err != nil {
+		return RebuildResult{}, err
+	}
+	if err := deps.Engine.ApplyRebuild(ctx, rows); err != nil {
+		return RebuildResult{}, err
+	}
+	return RebuildResult{RowsReplaced: len(rows), Duration: time.Since(start)}, nil
+}
diff --git a/internal/incidents/store.go b/internal/incidents/store.go
index 90b7323..bd3fb5b 100644
--- a/internal/incidents/store.go
+++ b/internal/incidents/store.go
@@ -12,6 +12,7 @@ var ErrNotFound = errors.New("incidents: not found")
 
 type Store interface {
 	Upsert(ctx context.Context, inc Incident) error
+	ReplaceNonResolved(ctx context.Context, rows []Incident) error
 	Get(ctx context.Context, id string) (Incident, error)
 	ListActive(ctx context.Context) ([]Incident, error)
 	PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error)
@@ -33,6 +34,20 @@ func (s *MemoryStore) Upsert(_ context.Context, inc Incident) error {
 	return nil
 }
 
+func (s *MemoryStore) ReplaceNonResolved(_ context.Context, rows []Incident) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	for id, inc := range s.rows {
+		if inc.Status != StatusResolved {
+			delete(s.rows, id)
+		}
+	}
+	for _, inc := range rows {
+		s.rows[inc.IncidentID] = cloneIncident(inc)
+	}
+	return nil
+}
+
 func (s *MemoryStore) Get(_ context.Context, id string) (Incident, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
index cf3b59a..fed7350 100644
--- a/internal/incidents/types.go
+++ b/internal/incidents/types.go
@@ -20,6 +20,7 @@ const (
 	CauseDeploy     Cause = "deploy"
 	CauseApp        Cause = "app"
 	CauseDependency Cause = "dependency"
+	CauseRuntime    Cause = "runtime"
 	CauseUnknown    Cause = "unknown"
 )
 
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index bd1a4e8..e6150a1 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -14,7 +14,6 @@ import (
 	"net"
 	"net/http"
 	"net/url"
-	"os"
 	"sort"
 	"strconv"
 	"strings"
@@ -136,8 +135,11 @@ type Server struct {
 
 	// OTLP capability flag — reported by /v1/capabilities. Set via
 	// ServerConfig when the OTLP handler is mounted in main.go.
-	otlpEnabled    bool
-	v2ReadsEnabled bool
+	otlpEnabled               bool
+	v2ReadsEnabled            bool
+	incidentsEnabled          bool
+	incidentsPersistent       bool
+	incidentsRebuildSupported bool
 
 	// SSE
 	sseHub               *SSEHub
@@ -178,31 +180,34 @@ func (s *Server) SetCausalRunResult(err error) {
 
 // ServerConfig holds configuration for creating a new Server.
 type ServerConfig struct {
-	Store               *store.Store
-	TraceStore          *tracestore.Store
-	Sampler             *sampler.Sampler
-	Metrics             *metrics.Metrics
-	MaxBodyBytes        int64
-	EventLogDir         string
-	StartTime           time.Time
-	SampleRatePct       int // 0 means use sampler's default from env
-	AskProvider         llm.Provider
-	AskRegistry         *tools.Registry
-	AskMaxStepsDefault  int
-	AskMaxStepsMax      int
-	DashboardRefreshSec int
-	PrometheusURL       string
-	GrafanaURL          string
-	GraphUI             bool
-	DedupCache          *DedupCache
-	AgentKey            string
-	TrustProxy          bool
-	ColdWriter          *coldstore.BatchWriter
-	ColdStore           coldstore.Store
-	PlanStore           *PlanStore
-	GraphHotWindow      time.Duration
-	OTLPEnabled         bool
-	V2ReadsEnabled      bool
+	Store                    *store.Store
+	TraceStore               *tracestore.Store
+	Sampler                  *sampler.Sampler
+	Metrics                  *metrics.Metrics
+	MaxBodyBytes             int64
+	EventLogDir              string
+	StartTime                time.Time
+	SampleRatePct            int // 0 means use sampler's default from env
+	AskProvider              llm.Provider
+	AskRegistry              *tools.Registry
+	AskMaxStepsDefault       int
+	AskMaxStepsMax           int
+	DashboardRefreshSec      int
+	PrometheusURL            string
+	GrafanaURL               string
+	GraphUI                  bool
+	DedupCache               *DedupCache
+	AgentKey                 string
+	TrustProxy               bool
+	ColdWriter               *coldstore.BatchWriter
+	ColdStore                coldstore.Store
+	PlanStore                *PlanStore
+	GraphHotWindow           time.Duration
+	OTLPEnabled              bool
+	V2ReadsEnabled           bool
+	IncidentsEnabled         bool
+	IncidentsPersistent      bool
+	IncidentRebuildSupported bool
 }
 
 // NewServer creates a new ingest server with the given configuration.
@@ -216,33 +221,36 @@ func NewServer(cfg ServerConfig) *Server {
 		startTime = time.Now()
 	}
 	s := &Server{
-		store:               cfg.Store,
-		traceStore:          cfg.TraceStore,
-		builder:             build.NewBuilder(),
-		sampler:             cfg.Sampler,
-		metrics:             cfg.Metrics,
-		maxBodyBytes:        maxBody,
-		startTime:           startTime,
-		EventLogDir:         cfg.EventLogDir,
-		sampleRatePct:       cfg.SampleRatePct,
-		askProvider:         cfg.AskProvider,
-		askRegistry:         cfg.AskRegistry,
-		askMaxStepsDefault:  cfg.AskMaxStepsDefault,
-		askMaxStepsMax:      cfg.AskMaxStepsMax,
-		dashboardRefreshSec: cfg.DashboardRefreshSec,
-		prometheusURL:       cfg.PrometheusURL,
-		grafanaURL:          cfg.GrafanaURL,
-		graphUI:             cfg.GraphUI,
-		dedupCache:          cfg.DedupCache,
-		agentKey:            cfg.AgentKey,
-		trustProxy:          cfg.TrustProxy,
-		coldWriter:          cfg.ColdWriter,
-		coldStore:           cfg.ColdStore,
-		planStore:           cfg.PlanStore,
-		graphHotWindow:      cfg.GraphHotWindow,
-		otlpEnabled:         cfg.OTLPEnabled,
-		v2ReadsEnabled:      cfg.V2ReadsEnabled,
-		replayStatus:        "none",
+		store:                     cfg.Store,
+		traceStore:                cfg.TraceStore,
+		builder:                   build.NewBuilder(),
+		sampler:                   cfg.Sampler,
+		metrics:                   cfg.Metrics,
+		maxBodyBytes:              maxBody,
+		startTime:                 startTime,
+		EventLogDir:               cfg.EventLogDir,
+		sampleRatePct:             cfg.SampleRatePct,
+		askProvider:               cfg.AskProvider,
+		askRegistry:               cfg.AskRegistry,
+		askMaxStepsDefault:        cfg.AskMaxStepsDefault,
+		askMaxStepsMax:            cfg.AskMaxStepsMax,
+		dashboardRefreshSec:       cfg.DashboardRefreshSec,
+		prometheusURL:             cfg.PrometheusURL,
+		grafanaURL:                cfg.GrafanaURL,
+		graphUI:                   cfg.GraphUI,
+		dedupCache:                cfg.DedupCache,
+		agentKey:                  cfg.AgentKey,
+		trustProxy:                cfg.TrustProxy,
+		coldWriter:                cfg.ColdWriter,
+		coldStore:                 cfg.ColdStore,
+		planStore:                 cfg.PlanStore,
+		graphHotWindow:            cfg.GraphHotWindow,
+		otlpEnabled:               cfg.OTLPEnabled,
+		v2ReadsEnabled:            cfg.V2ReadsEnabled,
+		incidentsEnabled:          cfg.IncidentsEnabled,
+		incidentsPersistent:       cfg.IncidentsPersistent,
+		incidentsRebuildSupported: cfg.IncidentRebuildSupported,
+		replayStatus:              "none",
 	}
 	if s.sampler == nil {
 		s.sampler = sampler.New(sampler.LoadConfigFromEnv())
@@ -559,19 +567,26 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
 		return
 	}
-	askEnabled, model, toolMode := s.askCapabilityState()
+	askState := s.askCapabilityState()
 	hotWindow := s.effectiveGraphHotWindow()
 	_, hotWindowSource := runtimeGraphHotWindow()
 
 	w.Header().Set("Content-Type", "application/json")
 	json.NewEncoder(w).Encode(map[string]any{
 		"ask": map[string]any{
-			"enabled":           askEnabled,
-			"model":             model,
-			"tool_mode":         toolMode,
+			"enabled":           askState.AskEnabled,
+			"model":             askState.Model,
+			"tool_mode":         askState.ToolMode,
 			"max_steps_default": s.askMaxStepsDefault,
 			"max_steps_max":     s.askMaxStepsMax,
 		},
+		"llm": map[string]any{
+			"provider":    askState.Provider,
+			"model":       askState.Model,
+			"tool_mode":   askState.ToolMode,
+			"configured":  askState.Configured,
+			"ask_enabled": askState.AskEnabled,
+		},
 		"dashboard": map[string]any{
 			"refresh_interval_sec": s.dashboardRefreshSec,
 		},
@@ -586,6 +601,19 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 		"v2_reads": map[string]any{
 			"enabled": s.v2ReadsEnabled,
 		},
+		"incidents": map[string]any{
+			"enabled":    s.incidentsEnabled,
+			"persistent": s.incidentsPersistent,
+			"rebuild": map[string]any{
+				"supported": s.incidentsRebuildSupported,
+				"scope": func() string {
+					if s.incidentsRebuildSupported {
+						return "hot-window"
+					}
+					return ""
+				}(),
+			},
+		},
 		"architecture": map[string]any{
 			"flattened": true,
 			"graph": map[string]any{
@@ -1785,40 +1813,42 @@ func normalizeJSONValue(v any) any {
 	return out
 }
 
-func (s *Server) askProviderFromEnv() (llm.Provider, string, string, error) {
-	key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
-	if key == "" {
-		key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
-	}
-	if key == "" {
-		return nil, "", "", errors.New("gemini api key is not configured")
-	}
+type askCapability struct {
+	Provider   string
+	Model      string
+	ToolMode   string
+	Configured bool
+	AskEnabled bool
+}
 
-	client := llm.NewGeminiClient(key)
-	model := strings.TrimSpace(os.Getenv("GEMINI_MODEL"))
-	base := strings.TrimSpace(os.Getenv("GEMINI_API_BASE"))
-	mode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE"))
-	if model != "" {
-		client.Model = model
-	}
-	if base != "" {
-		client.BaseURL = base
+func (s *Server) askProviderFromEnv() (llm.Provider, string, string, error) {
+	sel, err := llm.SelectFromEnv()
+	if err != nil {
+		return nil, "", "", err
 	}
-	if mode != "" {
-		client.ToolMode = mode
+	if !sel.AskEnabled {
+		return nil, "", "", llm.ErrProviderNotConfigured
 	}
-	return client, client.Model, client.ToolMode, nil
+	return sel.Impl, sel.Model, sel.ToolMode, nil
 }
 
-func (s *Server) askCapabilityState() (bool, string, string) {
+// askCapabilityState reports current LLM provider state for /v1/capabilities.
+// When s.askProvider != nil (test injection), provider is reported as "custom".
+func (s *Server) askCapabilityState() askCapability {
 	if s.askProvider != nil {
-		return true, "", ""
+		return askCapability{Provider: "custom", Configured: true, AskEnabled: true}
 	}
-	_, model, toolMode, err := s.askProviderFromEnv()
+	sel, err := llm.SelectFromEnv()
 	if err != nil {
-		return false, "", ""
+		return askCapability{Provider: "none"}
+	}
+	return askCapability{
+		Provider:   sel.Provider,
+		Model:      sel.Model,
+		ToolMode:   sel.ToolMode,
+		Configured: sel.Configured,
+		AskEnabled: sel.AskEnabled,
 	}
-	return true, model, toolMode
 }
 
 func attrToInt64(v any) int64 {
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 0c99c6c..0ed3768 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -349,6 +349,65 @@ func TestCapabilities_V2ReadsEnabled(t *testing.T) {
 	}
 }
 
+func TestCapabilities_IncidentsBlock(t *testing.T) {
+	tests := []struct {
+		name             string
+		cfg              ServerConfig
+		wantEnabled      bool
+		wantPersistent   bool
+		wantRebuild      bool
+		wantRebuildScope string
+	}{
+		{name: "disabled"},
+		{
+			name: "sqlite enabled",
+			cfg: ServerConfig{
+				IncidentsEnabled:         true,
+				IncidentsPersistent:      true,
+				IncidentRebuildSupported: true,
+			},
+			wantEnabled:      true,
+			wantPersistent:   true,
+			wantRebuild:      true,
+			wantRebuildScope: "hot-window",
+		},
+		{name: "requested but sqlite missing"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			srv := NewServer(tc.cfg)
+			req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+			w := httptest.NewRecorder()
+			srv.Capabilities(w, req)
+			var resp struct {
+				Incidents struct {
+					Enabled    bool `json:"enabled"`
+					Persistent bool `json:"persistent"`
+					Rebuild    struct {
+						Supported bool   `json:"supported"`
+						Scope     string `json:"scope"`
+					} `json:"rebuild"`
+				} `json:"incidents"`
+			}
+			if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+				t.Fatalf("invalid json: %v", err)
+			}
+			if resp.Incidents.Enabled != tc.wantEnabled {
+				t.Fatalf("enabled=%v want %v", resp.Incidents.Enabled, tc.wantEnabled)
+			}
+			if resp.Incidents.Persistent != tc.wantPersistent {
+				t.Fatalf("persistent=%v want %v", resp.Incidents.Persistent, tc.wantPersistent)
+			}
+			if resp.Incidents.Rebuild.Supported != tc.wantRebuild {
+				t.Fatalf("rebuild.supported=%v want %v", resp.Incidents.Rebuild.Supported, tc.wantRebuild)
+			}
+			if resp.Incidents.Rebuild.Scope != tc.wantRebuildScope {
+				t.Fatalf("rebuild.scope=%q want %q", resp.Incidents.Rebuild.Scope, tc.wantRebuildScope)
+			}
+		})
+	}
+}
+
 const successTrace = "bbbb0000cccc1111dddd2222eeee3333"
 
 func makeTestServerMixed() *Server {
@@ -1686,6 +1745,37 @@ func TestAsk_DedupSafetyNet_PreservesActualStatus(t *testing.T) {
 	}
 }
 
+func TestAsk_MissingProviderMessageIsProviderAgnostic(t *testing.T) {
+	t.Setenv("WAYLOG_LLM_PROVIDER", "")
+	t.Setenv("GEMINI_API_KEY", "")
+	t.Setenv("GOOGLE_API_KEY", "")
+	srv := &Server{
+		store:        graphstore.NewStore(),
+		maxBodyBytes: 1 << 20,
+		dedupCache:   NewDedupCache(),
+	}
+	r := httptest.NewRequest("POST", "/v1/ask?envelope=v2", strings.NewReader(`{"prompt":"test"}`))
+	w := httptest.NewRecorder()
+	srv.Ask(w, r)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503", w.Code)
+	}
+	var resp APIResponse
+	if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if resp.Error == nil {
+		t.Fatalf("expected error response")
+	}
+	if got := resp.Error.Message; got != llm.ErrProviderNotConfigured.Error() {
+		t.Fatalf("message = %q, want %q", got, llm.ErrProviderNotConfigured.Error())
+	}
+	if strings.Contains(strings.ToLower(resp.Error.Message), "gemini") {
+		t.Fatalf("message should not pin Gemini: %q", resp.Error.Message)
+	}
+}
+
 func TestToolCall_DedupSafetyNet_Exists(t *testing.T) {
 	dc := NewDedupCache()
 	reg := tools.NewRegistry()
@@ -1913,3 +2003,88 @@ func TestOverview_IncludesLatestFailedTraceID(t *testing.T) {
 		t.Fatal("overview response missing latest_failed_trace_id field")
 	}
 }
+
+type stubAskProvider struct{}
+
+func (stubAskProvider) Generate(ctx context.Context, prompt string, tools []llm.ToolDefinition, history []llm.Turn) (llm.Result, error) {
+	return llm.Result{}, nil
+}
+
+func TestCapabilities_LLMBlock(t *testing.T) {
+	tests := []struct {
+		name           string
+		env            map[string]string
+		askProvider    llm.Provider
+		wantProvider   string
+		wantConfigured bool
+		wantAskEnabled bool
+	}{
+		{
+			name:           "no env",
+			env:            map[string]string{},
+			wantProvider:   "none",
+			wantConfigured: false,
+			wantAskEnabled: false,
+		},
+		{
+			name:           "gemini key set",
+			env:            map[string]string{"WAYLOG_LLM_PROVIDER": "gemini", "GEMINI_API_KEY": "test-key"},
+			wantProvider:   "gemini",
+			wantConfigured: true,
+			wantAskEnabled: true,
+		},
+		{
+			name:           "custom injected provider",
+			env:            map[string]string{},
+			askProvider:    stubAskProvider{},
+			wantProvider:   "custom",
+			wantConfigured: true,
+			wantAskEnabled: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Setenv("WAYLOG_LLM_PROVIDER", "")
+			t.Setenv("WAYLOG_LLM_MODEL", "")
+			t.Setenv("GEMINI_API_KEY", "")
+			t.Setenv("GOOGLE_API_KEY", "")
+			t.Setenv("GEMINI_MODEL", "")
+			t.Setenv("GEMINI_API_BASE", "")
+			t.Setenv("GEMINI_TOOL_MODE", "")
+			for k, v := range tc.env {
+				t.Setenv(k, v)
+			}
+
+			srv := NewServer(ServerConfig{AskProvider: tc.askProvider})
+			req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+			w := httptest.NewRecorder()
+			srv.Capabilities(w, req)
+
+			if w.Code != http.StatusOK {
+				t.Fatalf("status = %d, want 200: %s", w.Code, w.Body.String())
+			}
+			var resp struct {
+				LLM struct {
+					Provider   string `json:"provider"`
+					Model      string `json:"model"`
+					ToolMode   string `json:"tool_mode"`
+					Configured bool   `json:"configured"`
+					AskEnabled bool   `json:"ask_enabled"`
+				} `json:"llm"`
+			}
+			if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+				t.Fatalf("invalid json: %v", err)
+			}
+			if resp.LLM.Provider != tc.wantProvider {
+				t.Errorf("provider = %q, want %q", resp.LLM.Provider, tc.wantProvider)
+			}
+			if resp.LLM.Configured != tc.wantConfigured {
+				t.Errorf("configured = %v, want %v", resp.LLM.Configured, tc.wantConfigured)
+			}
+			if resp.LLM.AskEnabled != tc.wantAskEnabled {
+				t.Errorf("ask_enabled = %v, want %v", resp.LLM.AskEnabled, tc.wantAskEnabled)
+			}
+		})
+	}
+}
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
new file mode 100644
index 0000000..c75ea4e
--- /dev/null
+++ b/internal/llm/provider.go
@@ -0,0 +1,75 @@
+package llm
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"strings"
+)
+
+// ErrProviderNotConfigured is returned when Ask cannot construct a provider
+// from the current environment.
+var ErrProviderNotConfigured = errors.New("LLM provider not configured; set WAYLOG_LLM_PROVIDER and provider credentials")
+
+// Selection describes the resolved LLM provider state.
+type Selection struct {
+	Provider   string
+	Model      string
+	ToolMode   string
+	Configured bool
+	AskEnabled bool
+	Impl       Provider
+}
+
+// SelectFromEnv resolves the LLM provider from environment variables.
+//
+// WAYLOG_LLM_PROVIDER may be "none" or "gemini". When unset, a Gemini key
+// (GEMINI_API_KEY or GOOGLE_API_KEY) infers gemini; otherwise none.
+// Model precedence: WAYLOG_LLM_MODEL > GEMINI_MODEL > built-in default.
+func SelectFromEnv() (Selection, error) {
+	raw := strings.ToLower(strings.TrimSpace(os.Getenv("WAYLOG_LLM_PROVIDER")))
+	key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
+	if key == "" {
+		key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+	}
+
+	switch raw {
+	case "":
+		if key == "" {
+			return Selection{Provider: "none"}, nil
+		}
+		return buildGemini(key, true), nil
+	case "none":
+		return Selection{Provider: "none", Configured: true}, nil
+	case "gemini":
+		if key == "" {
+			return Selection{Provider: "gemini", Configured: false}, nil
+		}
+		return buildGemini(key, true), nil
+	default:
+		return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini", raw)
+	}
+}
+
+func buildGemini(key string, configured bool) Selection {
+	client := NewGeminiClient(key)
+	if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
+		client.Model = model
+	} else if model := strings.TrimSpace(os.Getenv("GEMINI_MODEL")); model != "" {
+		client.Model = model
+	}
+	if base := strings.TrimSpace(os.Getenv("GEMINI_API_BASE")); base != "" {
+		client.BaseURL = base
+	}
+	if mode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE")); mode != "" {
+		client.ToolMode = mode
+	}
+	return Selection{
+		Provider:   "gemini",
+		Model:      client.Model,
+		ToolMode:   client.ToolMode,
+		Configured: configured,
+		AskEnabled: true,
+		Impl:       client,
+	}
+}
diff --git a/internal/llm/provider_test.go b/internal/llm/provider_test.go
new file mode 100644
index 0000000..b73e466
--- /dev/null
+++ b/internal/llm/provider_test.go
@@ -0,0 +1,160 @@
+package llm
+
+import (
+	"strings"
+	"testing"
+)
+
+func clearProviderEnv(t *testing.T) {
+	t.Helper()
+	t.Setenv("WAYLOG_LLM_PROVIDER", "")
+	t.Setenv("WAYLOG_LLM_MODEL", "")
+	t.Setenv("GEMINI_API_KEY", "")
+	t.Setenv("GOOGLE_API_KEY", "")
+	t.Setenv("GEMINI_MODEL", "")
+	t.Setenv("GEMINI_API_BASE", "")
+	t.Setenv("GEMINI_TOOL_MODE", "")
+}
+
+func TestSelectFromEnv_NoEnv(t *testing.T) {
+	clearProviderEnv(t)
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "none" {
+		t.Errorf("Provider = %q, want %q", sel.Provider, "none")
+	}
+	if sel.Configured {
+		t.Error("Configured = true, want false")
+	}
+	if sel.AskEnabled {
+		t.Error("AskEnabled = true, want false")
+	}
+	if sel.Impl != nil {
+		t.Error("Impl != nil, want nil")
+	}
+}
+
+func TestSelectFromEnv_NoneExplicit(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "none")
+	t.Setenv("GEMINI_API_KEY", "ignored")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "none" {
+		t.Errorf("Provider = %q, want %q", sel.Provider, "none")
+	}
+	if !sel.Configured {
+		t.Error("Configured = false, want true")
+	}
+	if sel.AskEnabled {
+		t.Error("AskEnabled = true, want false")
+	}
+	if sel.Model != "" || sel.ToolMode != "" {
+		t.Errorf("model/tool mode should be empty for none, got %q/%q", sel.Model, sel.ToolMode)
+	}
+	if sel.Impl != nil {
+		t.Error("Impl != nil, want nil")
+	}
+}
+
+func TestSelectFromEnv_GeminiWithKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+	t.Setenv("GEMINI_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "gemini" {
+		t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+	}
+	if !sel.Configured {
+		t.Error("Configured = false, want true")
+	}
+	if !sel.AskEnabled {
+		t.Error("AskEnabled = false, want true")
+	}
+	if sel.Impl == nil {
+		t.Error("Impl = nil, want non-nil")
+	}
+	if sel.Model == "" {
+		t.Error("Model is empty, want default")
+	}
+}
+
+func TestSelectFromEnv_GeminiInferredFromKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("GOOGLE_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "gemini" {
+		t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+	}
+	if !sel.AskEnabled {
+		t.Error("AskEnabled = false, want true")
+	}
+}
+
+func TestSelectFromEnv_GeminiMissingKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "gemini" {
+		t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+	}
+	if sel.Configured {
+		t.Error("Configured = true, want false")
+	}
+	if sel.AskEnabled {
+		t.Error("AskEnabled = true, want false")
+	}
+	if sel.Impl != nil {
+		t.Error("Impl != nil, want nil")
+	}
+}
+
+func TestSelectFromEnv_UnknownProvider(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+
+	_, err := SelectFromEnv()
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	if !strings.Contains(err.Error(), "anthropic") {
+		t.Errorf("error %q should mention provider name", err.Error())
+	}
+	if !strings.Contains(err.Error(), "none, gemini") {
+		t.Errorf("error %q should list supported providers", err.Error())
+	}
+}
+
+func TestSelectFromEnv_WaylogModelOverridesGeminiModel(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+	t.Setenv("GEMINI_API_KEY", "test-key")
+	t.Setenv("WAYLOG_LLM_MODEL", "foo")
+	t.Setenv("GEMINI_MODEL", "bar")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Model != "foo" {
+		t.Errorf("Model = %q, want %q", sel.Model, "foo")
+	}
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 6412a70..21d7680 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -78,6 +78,10 @@ type Metrics struct {
 	IncidentTickLatency     prometheus.Histogram
 	IncidentActive          prometheus.Gauge
 	IncidentClassifications *prometheus.CounterVec
+	IncidentRebuildDuration prometheus.Histogram
+	IncidentRebuildRows     prometheus.Counter
+	IncidentRebuildFailures prometheus.Counter
+	IncidentRebuildReplayed prometheus.Counter
 
 	CausalRunsTotal   prometheus.Counter
 	CausalRunDuration prometheus.Histogram
@@ -377,11 +381,28 @@ func New(reg *prometheus.Registry) *Metrics {
 		Name: "waylog_incident_classifications_total",
 		Help: "Incident classifications by cause and confidence.",
 	}, []string{"cause", "confidence"})
-	for _, cause := range []string{"deploy", "app", "dependency", "unknown"} {
+	for _, cause := range []string{"deploy", "app", "dependency", "runtime", "unknown"} {
 		for _, confidence := range []string{"high", "medium", "low"} {
 			m.IncidentClassifications.WithLabelValues(cause, confidence).Add(0)
 		}
 	}
+	m.IncidentRebuildDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Name:    "waylog_incident_rebuild_duration_seconds",
+		Help:    "Startup hot-window incident rebuild duration.",
+		Buckets: defaultBuckets,
+	})
+	m.IncidentRebuildRows = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incident_rebuild_rows_replaced",
+		Help: "Incident rows replaced by startup hot-window rebuild.",
+	})
+	m.IncidentRebuildFailures = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incident_rebuild_failures_total",
+		Help: "Failed startup hot-window incident rebuild attempts.",
+	})
+	m.IncidentRebuildReplayed = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incident_rebuild_replayed_events_total",
+		Help: "Schema-2.0 events replayed for startup hot-window incident rebuild.",
+	})
 
 	m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_causal_runs_total",
@@ -462,6 +483,7 @@ func New(reg *prometheus.Registry) *Metrics {
 		m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
 		m.IncidentOpened, m.IncidentUpdated, m.IncidentRecovered, m.IncidentResolved,
 		m.IncidentTickLatency, m.IncidentActive, m.IncidentClassifications,
+		m.IncidentRebuildDuration, m.IncidentRebuildRows, m.IncidentRebuildFailures, m.IncidentRebuildReplayed,
 		m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
 		m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
 		m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,

From 23f92691c3cc51a3465664fc4263091c79c5fb28 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Fri, 8 May 2026 03:24:07 -0400
Subject: [PATCH 07/14] feat: added triage plan template and multi-provider ask

Shipped the M1.5 triage plan shorthand and M2.5 provider layer.

M1.5:
- add `template` + `params` request support to `/v1/plans/execute`
- add built-in `template: "triage"` expansion to a single `triage_incident` step
- preserve existing plan validation, idempotency, `X-Plan-ID`, and SSE progress
- reject unknown templates, missing `params.incident_id`, and mixed `steps`/`template` bodies
- document the triage template response shape at `steps[0].result`

M2.5:
- add Anthropic provider via the Messages API
- add OpenAI provider via the Responses API
- extend `WAYLOG_LLM_PROVIDER` to `none`, `gemini`, `anthropic`, and `openai`
- add Anthropic/OpenAI env selection, model overrides, API base overrides, and missing-key behavior
- preserve OpenAI `call_id` and raw response output items through tool-call follow-up requests
- update OpenAI default model to `gpt-5.4-mini`
- prove triage report hashes do not depend on selected LLM provider

Docs:
- update README with triage plan shorthand and provider list
- update env docs for Anthropic/OpenAI keys and model variables
- update OpenAPI for `/v1/plans/execute`, `PlanResult`, and provider enum

Validation:
- go test ./internal/llm/... ./internal/ingest/... ./internal/triage/...
- go test ./...
- make ci
- make demo + make demo-acceptance
---
 README.md                           |   6 +-
 docs/env.md                         |  12 +-
 docs/openapi.yaml                   | 111 ++++++++++++-
 internal/ingest/handler.go          |  20 ++-
 internal/ingest/handler_test.go     | 118 ++++++++++++++
 internal/ingest/plan.go             |  51 ++++++
 internal/ingest/plan_test.go        |  59 +++++++
 internal/llm/anthropic.go           | 198 +++++++++++++++++++++++
 internal/llm/anthropic_test.go      | 117 ++++++++++++++
 internal/llm/openai.go              | 239 ++++++++++++++++++++++++++++
 internal/llm/openai_test.go         | 203 +++++++++++++++++++++++
 internal/llm/provider.go            |  83 ++++++++--
 internal/llm/provider_test.go       | 138 +++++++++++++++-
 internal/llm/types.go               |   7 +-
 internal/triage/idempotency_test.go |  34 ++++
 15 files changed, 1368 insertions(+), 28 deletions(-)
 create mode 100644 internal/llm/anthropic.go
 create mode 100644 internal/llm/anthropic_test.go
 create mode 100644 internal/llm/openai.go
 create mode 100644 internal/llm/openai_test.go

diff --git a/README.md b/README.md
index 66893ff..6871fc1 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,8 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
 
 All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
 
+Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
+
 | Tool               | Answers                                                       |
 | ------------------ | ------------------------------------------------------------- |
 | `graph_stats`      | Overall shape of the graph right now                          |
@@ -298,12 +300,12 @@ Public alpha. APIs may break before 1.0.
 - schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
 - SQLite cold store (events, deployments, signals, incidents, causal claims)
 - signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
-- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER`; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
 - 11 deterministic analysis tools, rollup-correct root-cause attribution
 - agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
 - `/v1/traces/story` and indented failure-path rendering in the dashboard
 - dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
+- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `triage`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
 - live TUI (`waylog-live --dev` streams via SSE), MCP stdio
 - scoped auth (write/read/agent) with startup validation
 
diff --git a/docs/env.md b/docs/env.md
index 58953f6..65fa480 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -6,7 +6,7 @@ Reference for configuring the Waylog ingest server and SDK. All variables are re
 
 | Variable | Purpose |
 |---|---|
-| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. For the current Gemini provider, set `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. Set the matching key for the selected provider: `GEMINI_API_KEY` or `GOOGLE_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` |
 
 ## LLM provider
 
@@ -14,11 +14,17 @@ Deterministic tools, plans, triage, MCP, and read APIs do not require an LLM pro
 
 | Variable | Default | Purpose |
 |---|---|---|
-| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values in M2: `none`, `gemini` |
-| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. For Gemini, this takes precedence over `GEMINI_MODEL` |
+| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values: `none`, `gemini`, `anthropic`, `openai` |
+| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. Takes precedence over provider-specific model variables |
 | `GEMINI_MODEL` | `gemini-2.5-flash` | Gemini-specific model override when `WAYLOG_LLM_MODEL` is unset |
 | `GEMINI_API_BASE` | Gemini API default | Gemini-specific API base URL override |
 | `GEMINI_TOOL_MODE` | `text` | Gemini-specific tool-calling mode |
+| `ANTHROPIC_API_KEY` | — | Anthropic API key for `WAYLOG_LLM_PROVIDER=anthropic` |
+| `ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `ANTHROPIC_API_BASE` | Anthropic API default | Anthropic-specific API base URL override |
+| `OPENAI_API_KEY` | — | OpenAI API key for `WAYLOG_LLM_PROVIDER=openai` |
+| `OPENAI_MODEL` | `gpt-5.4-mini` | OpenAI-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `OPENAI_API_BASE` | OpenAI API default | OpenAI-specific API base URL override |
 
 ## Auth
 
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 23f1e8c..7162edb 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -741,6 +741,69 @@ paths:
         '503':
           description: LLM provider unavailable
 
+  /v1/plans/execute:
+    post:
+      tags: [Operational]
+      operationId: executePlan
+      summary: Execute a deterministic tool plan
+      description: |
+        Executes explicit plan steps or a built-in template shorthand. The
+        `triage` template expands to one `triage_incident` tool step; the
+        resulting TriageReport is returned at `steps[0].result`.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              oneOf:
+                - type: object
+                  required: [steps]
+                  properties:
+                    steps:
+                      type: array
+                      minItems: 1
+                      maxItems: 10
+                      items:
+                        type: object
+                        required: [id, tool]
+                        properties:
+                          id: {type: string}
+                          tool: {type: string}
+                          params:
+                            type: object
+                            additionalProperties: true
+                - type: object
+                  required: [template, params]
+                  properties:
+                    template:
+                      type: string
+                      enum: [triage]
+                    params:
+                      type: object
+                      required: [incident_id]
+                      properties:
+                        incident_id: {type: string}
+                        window: {type: string, default: 15m}
+                        snapshot: {type: boolean, default: false}
+      responses:
+        '200':
+          description: Plan execution result
+          headers:
+            X-Plan-ID:
+              schema: {type: string}
+              description: Plan ID for `/v1/stream/plans/{id}`.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PlanResult'
+        '400':
+          description: Invalid plan or template parameters
+        '503':
+          description: Tool registry unavailable
+
   /v1/tools/{name}:
     post:
       tags: [Operational]
@@ -1832,11 +1895,55 @@ components:
         generated_at: {type: string}
         plan_run_id:
           type: string
-          description: Set only when produced via /v1/plans/execute.
+          description: Reserved for future plan-produced reports; omitted by the M1.5 triage template.
         report_hash:
           type: string
           description: "sha256:<hex>"
 
+    PlanResult:
+      type: object
+      required: [plan_id, steps, completed, total, status]
+      properties:
+        plan_id: {type: string}
+        steps:
+          type: array
+          items:
+            type: object
+            required: [id, index, tool, duration_ms]
+            properties:
+              id: {type: string}
+              index: {type: integer}
+              tool: {type: string}
+              result:
+                description: Tool output. For the triage template this is a TriageReport.
+                oneOf:
+                  - $ref: '#/components/schemas/TriageReport'
+                  - type: object
+                    additionalProperties: true
+              error:
+                type: object
+                nullable: true
+                properties:
+                  code: {type: string}
+                  message: {type: string}
+                  retryable: {type: boolean}
+              duration_ms: {type: integer}
+        completed: {type: integer}
+        total: {type: integer}
+        status:
+          type: string
+          enum: [complete, partial, failed]
+        halted_at:
+          type: integer
+          nullable: true
+        error:
+          type: object
+          nullable: true
+          properties:
+            code: {type: string}
+            message: {type: string}
+            retryable: {type: boolean}
+
     CapabilitiesResponse:
       type: object
       example:
@@ -1856,7 +1963,7 @@ components:
             provider:
               type: string
               description: Resolved Ask provider. `custom` is used for injected providers.
-              enum: [none, gemini, custom]
+              enum: [none, gemini, anthropic, openai, custom]
             model:
               type: string
               description: Resolved model for the selected provider, or empty when Ask is disabled.
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index e6150a1..6bf181d 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -2285,9 +2285,7 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
 		}()
 	}
 
-	var req struct {
-		Steps []PlanStep `json:"steps"`
-	}
+	var req PlanExecuteRequest
 	if err := json.Unmarshal(body, &req); err != nil {
 		if dedupIsExecutor {
 			dedupCompleted = true
@@ -2298,6 +2296,8 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
+	steps, expandErr := ExpandPlanRequest(req)
+
 	registry := s.askRegistry
 	if registry == nil {
 		if dedupIsExecutor {
@@ -2309,7 +2309,17 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	if errs := ValidatePlan(req.Steps, registry); len(errs) > 0 {
+	if expandErr != nil {
+		if dedupIsExecutor {
+			dedupCompleted = true
+			s.dedupCache.Complete(r.Method, r.URL.Path, s.dedupPrincipal(r), idempKey, body,
+				http.StatusBadRequest, nil, &APIError{Code: "INVALID_PLAN", Message: expandErr.Error()}, time.Since(start).Milliseconds())
+		}
+		respondError(w, r, http.StatusBadRequest, "INVALID_PLAN", expandErr.Error(), false, APIMeta{RequestID: reqID})
+		return
+	}
+
+	if errs := ValidatePlan(steps, registry); len(errs) > 0 {
 		msg := strings.Join(errs, "; ")
 		if dedupIsExecutor {
 			dedupCompleted = true
@@ -2325,7 +2335,7 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
 		planID = s.planStore.Create()
 	}
 
-	result := s.executePlanWithProgress(r.Context(), req.Steps, registry, planID)
+	result := s.executePlanWithProgress(r.Context(), steps, registry, planID)
 
 	if result.PlanID != "" {
 		w.Header().Set("X-Plan-ID", result.PlanID)
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 0ed3768..28099a7 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -1714,6 +1714,124 @@ func TestToolCall_InvalidJSON_EnvelopeError(t *testing.T) {
 	}
 }
 
+func TestPlanExecute_TriageTemplateExecutesAsPlan(t *testing.T) {
+	reg := tools.NewRegistry()
+	if err := reg.Register(tools.Tool{
+		Name:        "triage_incident",
+		Description: "test triage",
+		InputSchema: json.RawMessage(`{
+			"type":"object",
+			"required":["incident_id"],
+			"properties":{
+				"incident_id":{"type":"string"},
+				"window":{"type":"string"},
+				"snapshot":{"type":"boolean"}
+			}
+		}`),
+		Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+			var got struct {
+				IncidentID string `json:"incident_id"`
+				Window     string `json:"window"`
+				Snapshot   bool   `json:"snapshot"`
+			}
+			if err := json.Unmarshal(params, &got); err != nil {
+				return nil, err
+			}
+			return map[string]any{
+				"schema_version": "triage.v1",
+				"incident_ref":   map[string]string{"id": got.IncidentID, "window": got.Window},
+				"report_hash":    "sha256:test",
+				"snapshot":       got.Snapshot,
+			}, nil
+		},
+	}); err != nil {
+		t.Fatalf("register: %v", err)
+	}
+	ps := NewPlanStore()
+	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg, planStore: ps}
+	body := `{"template":"triage","params":{"incident_id":"inc_abc","window":"15m","snapshot":true}}`
+	r := httptest.NewRequest(http.MethodPost, "/v1/plans/execute", strings.NewReader(body))
+	r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
+	w := httptest.NewRecorder()
+	srv.PlanExecute(w, r)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200; body=%s", w.Code, w.Body.String())
+	}
+	if w.Header().Get("X-Plan-ID") == "" {
+		t.Fatalf("missing X-Plan-ID")
+	}
+	var result PlanResult
+	if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if result.Status != "complete" || result.Completed != 1 || result.Total != 1 {
+		t.Fatalf("result status = %+v", result)
+	}
+	if result.Steps[0].ID != "triage" || result.Steps[0].Tool != "triage_incident" {
+		t.Fatalf("step = %+v", result.Steps[0])
+	}
+	raw, err := json.Marshal(result.Steps[0].Result)
+	if err != nil {
+		t.Fatalf("marshal result: %v", err)
+	}
+	var rep struct {
+		ReportHash  string `json:"report_hash"`
+		IncidentRef struct {
+			ID string `json:"id"`
+		} `json:"incident_ref"`
+		Snapshot bool `json:"snapshot"`
+	}
+	if err := json.Unmarshal(raw, &rep); err != nil {
+		t.Fatalf("decode report: %v", err)
+	}
+	if rep.ReportHash != "sha256:test" || rep.IncidentRef.ID != "inc_abc" || !rep.Snapshot {
+		t.Fatalf("report = %+v", rep)
+	}
+	entry, ok := ps.Get(result.PlanID)
+	if !ok || len(entry.Events) < 3 {
+		t.Fatalf("expected SSE event log with start/complete/done, got ok=%v entry=%+v", ok, entry)
+	}
+}
+
+func TestPlanExecute_TemplateValidationErrors(t *testing.T) {
+	reg := tools.NewRegistry()
+	if err := reg.Register(tools.Tool{
+		Name:        "triage_incident",
+		Description: "test triage",
+		InputSchema: json.RawMessage(`{"type":"object","required":["incident_id"],"properties":{"incident_id":{"type":"string"}}}`),
+		Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+			return map[string]string{"ok": "true"}, nil
+		},
+	}); err != nil {
+		t.Fatalf("register: %v", err)
+	}
+	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg}
+	cases := map[string]string{
+		"unknown template":        `{"template":"bogus","params":{"incident_id":"inc_abc"}}`,
+		"missing incident id":     `{"template":"triage","params":{"snapshot":true}}`,
+		"steps and template both": `{"steps":[{"id":"x","tool":"triage_incident","params":{"incident_id":"inc_abc"}}],"template":"triage","params":{"incident_id":"inc_abc"}}`,
+	}
+	for name, body := range cases {
+		t.Run(name, func(t *testing.T) {
+			r := httptest.NewRequest(http.MethodPost, "/v1/plans/execute?envelope=v2", strings.NewReader(body))
+			r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
+			w := httptest.NewRecorder()
+			srv.PlanExecute(w, r)
+			if w.Code != http.StatusBadRequest {
+				t.Fatalf("status = %d, want 400; body=%s", w.Code, w.Body.String())
+			}
+			var resp APIResponse
+			if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+				t.Fatalf("decode: %v", err)
+			}
+			if resp.Error == nil || resp.Error.Code != "INVALID_PLAN" {
+				t.Fatalf("error = %+v, want INVALID_PLAN", resp.Error)
+			}
+		})
+	}
+}
+
 func TestAsk_DedupSafetyNet_PreservesActualStatus(t *testing.T) {
 	dc := NewDedupCache()
 	srv := &Server{
diff --git a/internal/ingest/plan.go b/internal/ingest/plan.go
index 278d372..b3d6237 100644
--- a/internal/ingest/plan.go
+++ b/internal/ingest/plan.go
@@ -15,6 +15,14 @@ type PlanStep struct {
 	Params json.RawMessage `json:"params"`
 }
 
+// PlanExecuteRequest is the accepted body for POST /v1/plans/execute.
+// Callers may provide explicit steps or a built-in template shorthand.
+type PlanExecuteRequest struct {
+	Steps    []PlanStep       `json:"steps"`
+	Template string           `json:"template"`
+	Params   *json.RawMessage `json:"params"`
+}
+
 // PlanStepError is a structured error for a plan step.
 type PlanStepError struct {
 	Code      string `json:"code"`
@@ -43,6 +51,49 @@ type PlanResult struct {
 	Error     *PlanStepError   `json:"error,omitempty"`
 }
 
+// ExpandPlanRequest turns a supported template shorthand into ordinary plan steps.
+func ExpandPlanRequest(req PlanExecuteRequest) ([]PlanStep, error) {
+	if len(req.Steps) > 0 && strings.TrimSpace(req.Template) != "" {
+		return nil, fmt.Errorf("provide either steps or template, not both")
+	}
+	if strings.TrimSpace(req.Template) == "" {
+		return req.Steps, nil
+	}
+
+	switch strings.TrimSpace(req.Template) {
+	case "triage":
+		return expandTriagePlan(req.Params)
+	default:
+		return nil, fmt.Errorf("unknown plan template %q", req.Template)
+	}
+}
+
+func expandTriagePlan(raw *json.RawMessage) ([]PlanStep, error) {
+	if raw == nil || len(*raw) == 0 {
+		return nil, fmt.Errorf("triage template requires params.incident_id")
+	}
+	var params struct {
+		IncidentID string `json:"incident_id"`
+		Window     string `json:"window,omitempty"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(*raw, &params); err != nil {
+		return nil, fmt.Errorf("triage template params: %w", err)
+	}
+	if strings.TrimSpace(params.IncidentID) == "" {
+		return nil, fmt.Errorf("triage template requires params.incident_id")
+	}
+	stepParams, err := json.Marshal(params)
+	if err != nil {
+		return nil, fmt.Errorf("triage template params: %w", err)
+	}
+	return []PlanStep{{
+		ID:     "triage",
+		Tool:   "triage_incident",
+		Params: stepParams,
+	}}, nil
+}
+
 // statusForHalt returns the appropriate status string for a halted plan.
 func statusForHalt(haltedIdx int) string {
 	if haltedIdx == 0 {
diff --git a/internal/ingest/plan_test.go b/internal/ingest/plan_test.go
index 893d046..b1effe9 100644
--- a/internal/ingest/plan_test.go
+++ b/internal/ingest/plan_test.go
@@ -289,3 +289,62 @@ func TestValidatePlan_Valid(t *testing.T) {
 		t.Fatalf("expected valid plan to have no errors, got: %v", errs)
 	}
 }
+
+func TestExpandPlanRequest_TriageTemplate(t *testing.T) {
+	params := json.RawMessage(`{"incident_id":"inc_abc","window":"30m","snapshot":true}`)
+	steps, err := ExpandPlanRequest(PlanExecuteRequest{
+		Template: "triage",
+		Params:   &params,
+	})
+	if err != nil {
+		t.Fatalf("expand: %v", err)
+	}
+	if len(steps) != 1 {
+		t.Fatalf("len(steps) = %d, want 1", len(steps))
+	}
+	step := steps[0]
+	if step.ID != "triage" || step.Tool != "triage_incident" {
+		t.Fatalf("step = %+v, want triage/triage_incident", step)
+	}
+	var got struct {
+		IncidentID string `json:"incident_id"`
+		Window     string `json:"window"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(step.Params, &got); err != nil {
+		t.Fatalf("decode params: %v", err)
+	}
+	if got.IncidentID != "inc_abc" || got.Window != "30m" || !got.Snapshot {
+		t.Fatalf("params = %+v", got)
+	}
+}
+
+func TestExpandPlanRequest_RejectsInvalidTemplateInput(t *testing.T) {
+	params := json.RawMessage(`{"incident_id":"inc_abc"}`)
+	cases := map[string]PlanExecuteRequest{
+		"both steps and template": {
+			Steps:    []PlanStep{{ID: "x", Tool: "graph_stats"}},
+			Template: "triage",
+			Params:   &params,
+		},
+		"unknown template": {
+			Template: "unknown",
+			Params:   &params,
+		},
+		"missing incident id": {
+			Template: "triage",
+			Params:   ptrRawMessage(json.RawMessage(`{"snapshot":true}`)),
+		},
+	}
+	for name, req := range cases {
+		t.Run(name, func(t *testing.T) {
+			if _, err := ExpandPlanRequest(req); err == nil {
+				t.Fatalf("expected error")
+			}
+		})
+	}
+}
+
+func ptrRawMessage(raw json.RawMessage) *json.RawMessage {
+	return &raw
+}
diff --git a/internal/llm/anthropic.go b/internal/llm/anthropic.go
new file mode 100644
index 0000000..94cd300
--- /dev/null
+++ b/internal/llm/anthropic.go
@@ -0,0 +1,198 @@
+package llm
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+const defaultAnthropicModel = "claude-sonnet-4-6"
+const defaultAnthropicBaseURL = "https://api.anthropic.com/v1"
+const anthropicVersion = "2023-06-01"
+
+type AnthropicClient struct {
+	APIKey     string
+	Model      string
+	BaseURL    string
+	HTTPClient *http.Client
+}
+
+func NewAnthropicClient(apiKey string) *AnthropicClient {
+	return &AnthropicClient{
+		APIKey:  apiKey,
+		Model:   defaultAnthropicModel,
+		BaseURL: defaultAnthropicBaseURL,
+	}
+}
+
+func (c *AnthropicClient) Generate(ctx context.Context, prompt string, tools []ToolDefinition, history []Turn) (Result, error) {
+	if c.APIKey == "" {
+		return Result{}, fmt.Errorf("anthropic api key required")
+	}
+	reqBody, err := c.buildRequest(prompt, tools, history)
+	if err != nil {
+		return Result{}, err
+	}
+
+	baseURL := c.BaseURL
+	if baseURL == "" {
+		baseURL = defaultAnthropicBaseURL
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/messages", bytes.NewReader(reqBody))
+	if err != nil {
+		return Result{}, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("x-api-key", c.APIKey)
+	req.Header.Set("anthropic-version", anthropicVersion)
+
+	client := c.HTTPClient
+	if client == nil {
+		client = &http.Client{Timeout: 30 * time.Second}
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "anthropic", Retryable: true, Message: err.Error(), Cause: err}
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "anthropic", Retryable: true, Message: err.Error(), Cause: err}
+	}
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return Result{}, &ProviderError{
+			Provider:   "anthropic",
+			StatusCode: resp.StatusCode,
+			Retryable:  resp.StatusCode == 429 || resp.StatusCode >= 500,
+			Message:    string(body),
+		}
+	}
+	return parseAnthropicResponse(body)
+}
+
+type anthropicRequest struct {
+	Model     string             `json:"model"`
+	MaxTokens int                `json:"max_tokens"`
+	Messages  []anthropicMessage `json:"messages"`
+	Tools     []anthropicTool    `json:"tools,omitempty"`
+}
+
+type anthropicMessage struct {
+	Role    string `json:"role"`
+	Content any    `json:"content"`
+}
+
+type anthropicTool struct {
+	Name        string          `json:"name"`
+	Description string          `json:"description,omitempty"`
+	InputSchema json.RawMessage `json:"input_schema"`
+}
+
+type anthropicContentBlock struct {
+	Type      string          `json:"type"`
+	Text      string          `json:"text,omitempty"`
+	ID        string          `json:"id,omitempty"`
+	Name      string          `json:"name,omitempty"`
+	Input     json.RawMessage `json:"input,omitempty"`
+	ToolUseID string          `json:"tool_use_id,omitempty"`
+	Content   string          `json:"content,omitempty"`
+}
+
+func (c *AnthropicClient) buildRequest(prompt string, tools []ToolDefinition, history []Turn) ([]byte, error) {
+	model := c.Model
+	if model == "" {
+		model = defaultAnthropicModel
+	}
+	req := anthropicRequest{
+		Model:     model,
+		MaxTokens: 1024,
+		Messages:  []anthropicMessage{{Role: "user", Content: prompt}},
+	}
+	for _, t := range tools {
+		schema := t.InputSchema
+		if len(schema) == 0 {
+			schema = json.RawMessage(`{"type":"object"}`)
+		}
+		req.Tools = append(req.Tools, anthropicTool{
+			Name:        t.Name,
+			Description: t.Description,
+			InputSchema: schema,
+		})
+	}
+
+	nextToolID := 1
+	lastToolID := ""
+	for _, turn := range history {
+		switch {
+		case turn.ToolCall != nil:
+			toolID := fmt.Sprintf("toolu_waylog_%d", nextToolID)
+			nextToolID++
+			lastToolID = toolID
+			input := turn.ToolCall.Arguments
+			if len(input) == 0 {
+				input = json.RawMessage(`{}`)
+			}
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role: "assistant",
+				Content: []anthropicContentBlock{{
+					Type:  "tool_use",
+					ID:    toolID,
+					Name:  turn.ToolCall.Name,
+					Input: input,
+				}},
+			})
+		case turn.ToolResult != nil:
+			toolID := lastToolID
+			if toolID == "" {
+				toolID = "toolu_waylog_0"
+			}
+			payload, err := json.Marshal(turn.ToolResult.Result)
+			if err != nil {
+				return nil, fmt.Errorf("anthropic: marshal tool result: %w", err)
+			}
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role: "user",
+				Content: []anthropicContentBlock{{
+					Type:      "tool_result",
+					ToolUseID: toolID,
+					Content:   string(payload),
+				}},
+			})
+		case turn.Text != "":
+			req.Messages = append(req.Messages, anthropicMessage{Role: "assistant", Content: turn.Text})
+		}
+	}
+
+	return json.Marshal(req)
+}
+
+type anthropicResponse struct {
+	Content []anthropicContentBlock `json:"content"`
+}
+
+func parseAnthropicResponse(body []byte) (Result, error) {
+	var resp anthropicResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return Result{}, err
+	}
+	var out Result
+	for _, block := range resp.Content {
+		switch block.Type {
+		case "text":
+			out.Text += block.Text
+		case "tool_use":
+			args := block.Input
+			if len(args) == 0 {
+				args = json.RawMessage(`{}`)
+			}
+			out.ToolCalls = append(out.ToolCalls, ToolCall{Name: block.Name, Arguments: args})
+		}
+	}
+	return out, nil
+}
diff --git a/internal/llm/anthropic_test.go b/internal/llm/anthropic_test.go
new file mode 100644
index 0000000..b535744
--- /dev/null
+++ b/internal/llm/anthropic_test.go
@@ -0,0 +1,117 @@
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestAnthropicGenerateSendsMessagesAndTools(t *testing.T) {
+	var captured struct {
+		Model    string `json:"model"`
+		Messages []struct {
+			Role    string          `json:"role"`
+			Content json.RawMessage `json:"content"`
+		} `json:"messages"`
+		Tools []struct {
+			Name        string          `json:"name"`
+			InputSchema json.RawMessage `json:"input_schema"`
+		} `json:"tools"`
+	}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/messages" {
+			t.Fatalf("path = %q, want /messages", r.URL.Path)
+		}
+		if got := r.Header.Get("x-api-key"); got != "test-key" {
+			t.Fatalf("x-api-key = %q", got)
+		}
+		if got := r.Header.Get("anthropic-version"); got != anthropicVersion {
+			t.Fatalf("anthropic-version = %q", got)
+		}
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Fatalf("decode request: %v", err)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"content":[{"type":"text","text":"done"}]}`))
+	}))
+	defer srv.Close()
+
+	client := NewAnthropicClient("test-key")
+	client.BaseURL = srv.URL
+	client.Model = "claude-test"
+	res, err := client.Generate(context.Background(), "hello", []ToolDefinition{{
+		Name:        "triage_incident",
+		Description: "triage",
+		InputSchema: json.RawMessage(`{"type":"object","properties":{"incident_id":{"type":"string"}}}`),
+	}}, nil)
+	if err != nil {
+		t.Fatalf("generate: %v", err)
+	}
+	if res.Text != "done" {
+		t.Fatalf("Text = %q, want done", res.Text)
+	}
+	if captured.Model != "claude-test" {
+		t.Fatalf("model = %q", captured.Model)
+	}
+	if len(captured.Messages) != 1 || captured.Messages[0].Role != "user" {
+		t.Fatalf("messages = %+v", captured.Messages)
+	}
+	if len(captured.Tools) != 1 || captured.Tools[0].Name != "triage_incident" {
+		t.Fatalf("tools = %+v", captured.Tools)
+	}
+}
+
+func TestParseAnthropicResponseToolUse(t *testing.T) {
+	body := []byte(`{
+		"content": [
+			{"type":"text","text":"checking"},
+			{"type":"tool_use","id":"toolu_1","name":"triage_incident","input":{"incident_id":"inc_abc","snapshot":true}}
+		]
+	}`)
+	res, err := parseAnthropicResponse(body)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if res.Text != "checking" {
+		t.Fatalf("Text = %q", res.Text)
+	}
+	if len(res.ToolCalls) != 1 {
+		t.Fatalf("len(ToolCalls) = %d", len(res.ToolCalls))
+	}
+	if res.ToolCalls[0].Name != "triage_incident" {
+		t.Fatalf("tool name = %q", res.ToolCalls[0].Name)
+	}
+	var args struct {
+		IncidentID string `json:"incident_id"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(res.ToolCalls[0].Arguments, &args); err != nil {
+		t.Fatalf("args: %v", err)
+	}
+	if args.IncidentID != "inc_abc" || !args.Snapshot {
+		t.Fatalf("args = %+v", args)
+	}
+}
+
+func TestAnthropicGenerateAPIError(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, "rate limited", http.StatusTooManyRequests)
+	}))
+	defer srv.Close()
+
+	client := NewAnthropicClient("test-key")
+	client.BaseURL = srv.URL
+	_, err := client.Generate(context.Background(), "hello", nil, nil)
+	if err == nil {
+		t.Fatalf("expected error")
+	}
+	pe, ok := err.(*ProviderError)
+	if !ok {
+		t.Fatalf("err = %T, want *ProviderError", err)
+	}
+	if pe.Provider != "anthropic" || !pe.Retryable || pe.StatusCode != http.StatusTooManyRequests {
+		t.Fatalf("provider error = %+v", pe)
+	}
+}
diff --git a/internal/llm/openai.go b/internal/llm/openai.go
new file mode 100644
index 0000000..3dc9019
--- /dev/null
+++ b/internal/llm/openai.go
@@ -0,0 +1,239 @@
+package llm
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+const defaultOpenAIModel = "gpt-5.4-mini"
+const defaultOpenAIBaseURL = "https://api.openai.com/v1"
+
+type OpenAIClient struct {
+	APIKey     string
+	Model      string
+	BaseURL    string
+	HTTPClient *http.Client
+}
+
+func NewOpenAIClient(apiKey string) *OpenAIClient {
+	return &OpenAIClient{
+		APIKey:  apiKey,
+		Model:   defaultOpenAIModel,
+		BaseURL: defaultOpenAIBaseURL,
+	}
+}
+
+func (c *OpenAIClient) Generate(ctx context.Context, prompt string, tools []ToolDefinition, history []Turn) (Result, error) {
+	if c.APIKey == "" {
+		return Result{}, fmt.Errorf("openai api key required")
+	}
+	reqBody, err := c.buildRequest(prompt, tools, history)
+	if err != nil {
+		return Result{}, err
+	}
+
+	baseURL := c.BaseURL
+	if baseURL == "" {
+		baseURL = defaultOpenAIBaseURL
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/responses", bytes.NewReader(reqBody))
+	if err != nil {
+		return Result{}, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+c.APIKey)
+
+	client := c.HTTPClient
+	if client == nil {
+		client = &http.Client{Timeout: 30 * time.Second}
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "openai", Retryable: true, Message: err.Error(), Cause: err}
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "openai", Retryable: true, Message: err.Error(), Cause: err}
+	}
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return Result{}, &ProviderError{
+			Provider:   "openai",
+			StatusCode: resp.StatusCode,
+			Retryable:  resp.StatusCode == 429 || resp.StatusCode >= 500,
+			Message:    string(body),
+		}
+	}
+	return parseOpenAIResponse(body)
+}
+
+type openAIRequest struct {
+	Model string            `json:"model"`
+	Input []json.RawMessage `json:"input"`
+	Tools []openAITool      `json:"tools,omitempty"`
+}
+
+type openAIMessage struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+type openAIFunctionCallInput struct {
+	Type      string `json:"type"`
+	CallID    string `json:"call_id"`
+	Name      string `json:"name"`
+	Arguments string `json:"arguments"`
+}
+
+type openAIFunctionOutputInput struct {
+	Type   string `json:"type"`
+	CallID string `json:"call_id"`
+	Output string `json:"output"`
+}
+
+type openAITool struct {
+	Type        string          `json:"type"`
+	Name        string          `json:"name"`
+	Description string          `json:"description,omitempty"`
+	Parameters  json.RawMessage `json:"parameters"`
+}
+
+func (c *OpenAIClient) buildRequest(prompt string, tools []ToolDefinition, history []Turn) ([]byte, error) {
+	model := c.Model
+	if model == "" {
+		model = defaultOpenAIModel
+	}
+	req := openAIRequest{
+		Model: model,
+		Input: []json.RawMessage{mustMarshalOpenAI(openAIMessage{Role: "user", Content: prompt})},
+	}
+	for _, t := range tools {
+		schema := t.InputSchema
+		if len(schema) == 0 {
+			schema = json.RawMessage(`{"type":"object"}`)
+		}
+		req.Tools = append(req.Tools, openAITool{
+			Type:        "function",
+			Name:        t.Name,
+			Description: t.Description,
+			Parameters:  schema,
+		})
+	}
+
+	nextCallID := 1
+	lastCallID := ""
+	for _, turn := range history {
+		switch {
+		case turn.ToolCall != nil:
+			callID := fmt.Sprintf("call_waylog_%d", nextCallID)
+			nextCallID++
+			if turn.ToolCall.ProviderID != "" {
+				callID = turn.ToolCall.ProviderID
+			}
+			lastCallID = callID
+			if len(turn.ToolCall.ProviderRawItems) > 0 {
+				req.Input = append(req.Input, turn.ToolCall.ProviderRawItems...)
+				continue
+			}
+			if !turn.ToolCall.ProviderRawIncluded {
+				args := string(turn.ToolCall.Arguments)
+				if args == "" {
+					args = "{}"
+				}
+				req.Input = append(req.Input, mustMarshalOpenAI(openAIFunctionCallInput{
+					Type:      "function_call",
+					CallID:    callID,
+					Name:      turn.ToolCall.Name,
+					Arguments: args,
+				}))
+			}
+		case turn.ToolResult != nil:
+			callID := lastCallID
+			if callID == "" {
+				callID = "call_waylog_0"
+			}
+			payload, err := json.Marshal(turn.ToolResult.Result)
+			if err != nil {
+				return nil, fmt.Errorf("openai: marshal tool result: %w", err)
+			}
+			req.Input = append(req.Input, mustMarshalOpenAI(openAIFunctionOutputInput{
+				Type:   "function_call_output",
+				CallID: callID,
+				Output: string(payload),
+			}))
+		case turn.Text != "":
+			req.Input = append(req.Input, mustMarshalOpenAI(openAIMessage{Role: "assistant", Content: turn.Text}))
+		}
+	}
+
+	return json.Marshal(req)
+}
+
+func mustMarshalOpenAI(v any) json.RawMessage {
+	raw, _ := json.Marshal(v)
+	return raw
+}
+
+type openAIResponse struct {
+	OutputText string            `json:"output_text"`
+	Output     []json.RawMessage `json:"output"`
+}
+
+type openAIOutputItem struct {
+	Type      string `json:"type"`
+	CallID    string `json:"call_id,omitempty"`
+	Name      string `json:"name,omitempty"`
+	Arguments string `json:"arguments,omitempty"`
+	Content   []struct {
+		Type string `json:"type"`
+		Text string `json:"text"`
+	} `json:"content,omitempty"`
+}
+
+func parseOpenAIResponse(body []byte) (Result, error) {
+	var resp openAIResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return Result{}, err
+	}
+	out := Result{Text: resp.OutputText}
+	rawItems := append([]json.RawMessage(nil), resp.Output...)
+	rawAttached := false
+	for _, raw := range resp.Output {
+		var item openAIOutputItem
+		if err := json.Unmarshal(raw, &item); err != nil {
+			return Result{}, err
+		}
+		switch item.Type {
+		case "function_call":
+			args := json.RawMessage(item.Arguments)
+			if len(args) == 0 {
+				args = json.RawMessage(`{}`)
+			}
+			call := ToolCall{Name: item.Name, Arguments: args, ProviderID: item.CallID}
+			if !rawAttached {
+				call.ProviderRawItems = rawItems
+				rawAttached = true
+			} else {
+				call.ProviderRawIncluded = true
+			}
+			out.ToolCalls = append(out.ToolCalls, call)
+		case "message":
+			if out.Text != "" {
+				continue
+			}
+			for _, part := range item.Content {
+				if part.Type == "output_text" || part.Type == "text" {
+					out.Text += part.Text
+				}
+			}
+		}
+	}
+	return out, nil
+}
diff --git a/internal/llm/openai_test.go b/internal/llm/openai_test.go
new file mode 100644
index 0000000..fa363a0
--- /dev/null
+++ b/internal/llm/openai_test.go
@@ -0,0 +1,203 @@
+package llm
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestOpenAIGenerateSendsResponsesRequest(t *testing.T) {
+	var captured struct {
+		Model string `json:"model"`
+		Input []struct {
+			Role    string `json:"role,omitempty"`
+			Content string `json:"content,omitempty"`
+		} `json:"input"`
+		Tools []struct {
+			Type       string          `json:"type"`
+			Name       string          `json:"name"`
+			Parameters json.RawMessage `json:"parameters"`
+		} `json:"tools"`
+	}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/responses" {
+			t.Fatalf("path = %q, want /responses", r.URL.Path)
+		}
+		if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
+			t.Fatalf("Authorization = %q", got)
+		}
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Fatalf("decode request: %v", err)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"output_text":"done","output":[]}`))
+	}))
+	defer srv.Close()
+
+	client := NewOpenAIClient("test-key")
+	client.BaseURL = srv.URL
+	client.Model = "gpt-test"
+	res, err := client.Generate(context.Background(), "hello", []ToolDefinition{{
+		Name:        "triage_incident",
+		Description: "triage",
+		InputSchema: json.RawMessage(`{"type":"object","properties":{"incident_id":{"type":"string"}}}`),
+	}}, nil)
+	if err != nil {
+		t.Fatalf("generate: %v", err)
+	}
+	if res.Text != "done" {
+		t.Fatalf("Text = %q, want done", res.Text)
+	}
+	if captured.Model != "gpt-test" {
+		t.Fatalf("model = %q", captured.Model)
+	}
+	if len(captured.Input) != 1 || captured.Input[0].Role != "user" || captured.Input[0].Content != "hello" {
+		t.Fatalf("input = %+v", captured.Input)
+	}
+	if len(captured.Tools) != 1 || captured.Tools[0].Type != "function" || captured.Tools[0].Name != "triage_incident" {
+		t.Fatalf("tools = %+v", captured.Tools)
+	}
+}
+
+func TestParseOpenAIResponseFunctionCall(t *testing.T) {
+	body := []byte(`{
+		"output": [
+			{"type":"function_call","call_id":"call_1","name":"triage_incident","arguments":"{\"incident_id\":\"inc_abc\",\"snapshot\":true}"}
+		]
+	}`)
+	res, err := parseOpenAIResponse(body)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(res.ToolCalls) != 1 {
+		t.Fatalf("len(ToolCalls) = %d", len(res.ToolCalls))
+	}
+	if res.ToolCalls[0].Name != "triage_incident" {
+		t.Fatalf("tool name = %q", res.ToolCalls[0].Name)
+	}
+	if res.ToolCalls[0].ProviderID != "call_1" {
+		t.Fatalf("ProviderID = %q, want call_1", res.ToolCalls[0].ProviderID)
+	}
+	if len(res.ToolCalls[0].ProviderRawItems) != 1 {
+		t.Fatalf("ProviderRawItems len = %d, want 1", len(res.ToolCalls[0].ProviderRawItems))
+	}
+	var args struct {
+		IncidentID string `json:"incident_id"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(res.ToolCalls[0].Arguments, &args); err != nil {
+		t.Fatalf("args: %v", err)
+	}
+	if args.IncidentID != "inc_abc" || !args.Snapshot {
+		t.Fatalf("args = %+v", args)
+	}
+}
+
+func TestOpenAIRequestPreservesResponseOutputAndCallIDs(t *testing.T) {
+	body := []byte(`{
+		"output": [
+			{"type":"reasoning","id":"rs_1","summary":[]},
+			{"type":"function_call","call_id":"call_1","name":"triage_incident","arguments":"{\"incident_id\":\"inc_abc\"}"},
+			{"type":"function_call","call_id":"call_2","name":"blast_radius","arguments":"{\"code\":\"PMT_502\"}"}
+		]
+	}`)
+	res, err := parseOpenAIResponse(body)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if len(res.ToolCalls) != 2 {
+		t.Fatalf("len(ToolCalls) = %d, want 2", len(res.ToolCalls))
+	}
+
+	client := NewOpenAIClient("test-key")
+	raw, err := client.buildRequest("triage", nil, []Turn{
+		{ToolCall: &res.ToolCalls[0]},
+		{ToolResult: &ToolResult{Name: "triage_incident", Result: map[string]string{"report_hash": "sha256:x"}}},
+		{ToolCall: &res.ToolCalls[1]},
+		{ToolResult: &ToolResult{Name: "blast_radius", Result: map[string]int{"requests": 12}}},
+	})
+	if err != nil {
+		t.Fatalf("buildRequest: %v", err)
+	}
+
+	var req struct {
+		Input []json.RawMessage `json:"input"`
+	}
+	if err := json.Unmarshal(raw, &req); err != nil {
+		t.Fatalf("decode request: %v", err)
+	}
+	var types []string
+	var callIDs []string
+	for _, item := range req.Input {
+		var got struct {
+			Type   string `json:"type"`
+			CallID string `json:"call_id"`
+		}
+		if err := json.Unmarshal(item, &got); err != nil {
+			t.Fatalf("decode input item: %v", err)
+		}
+		if got.Type == "" {
+			continue
+		}
+		types = append(types, got.Type)
+		if got.CallID != "" {
+			callIDs = append(callIDs, got.CallID)
+		}
+	}
+	wantTypes := []string{"reasoning", "function_call", "function_call", "function_call_output", "function_call_output"}
+	if len(types) != len(wantTypes) {
+		t.Fatalf("types = %v, want %v", types, wantTypes)
+	}
+	for i := range wantTypes {
+		if types[i] != wantTypes[i] {
+			t.Fatalf("types = %v, want %v", types, wantTypes)
+		}
+	}
+	wantCallIDs := []string{"call_1", "call_2", "call_1", "call_2"}
+	if len(callIDs) != len(wantCallIDs) {
+		t.Fatalf("callIDs = %v, want %v", callIDs, wantCallIDs)
+	}
+	for i := range wantCallIDs {
+		if callIDs[i] != wantCallIDs[i] {
+			t.Fatalf("callIDs = %v, want %v", callIDs, wantCallIDs)
+		}
+	}
+}
+
+func TestParseOpenAIResponseMessageText(t *testing.T) {
+	body := []byte(`{
+		"output": [
+			{"type":"message","content":[{"type":"output_text","text":"hello"}]}
+		]
+	}`)
+	res, err := parseOpenAIResponse(body)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if res.Text != "hello" {
+		t.Fatalf("Text = %q, want hello", res.Text)
+	}
+}
+
+func TestOpenAIGenerateAPIError(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, "unavailable", http.StatusServiceUnavailable)
+	}))
+	defer srv.Close()
+
+	client := NewOpenAIClient("test-key")
+	client.BaseURL = srv.URL
+	_, err := client.Generate(context.Background(), "hello", nil, nil)
+	if err == nil {
+		t.Fatalf("expected error")
+	}
+	pe, ok := err.(*ProviderError)
+	if !ok {
+		t.Fatalf("err = %T, want *ProviderError", err)
+	}
+	if pe.Provider != "openai" || !pe.Retryable || pe.StatusCode != http.StatusServiceUnavailable {
+		t.Fatalf("provider error = %+v", pe)
+	}
+}
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
index c75ea4e..777b178 100644
--- a/internal/llm/provider.go
+++ b/internal/llm/provider.go
@@ -23,34 +23,57 @@ type Selection struct {
 
 // SelectFromEnv resolves the LLM provider from environment variables.
 //
-// WAYLOG_LLM_PROVIDER may be "none" or "gemini". When unset, a Gemini key
-// (GEMINI_API_KEY or GOOGLE_API_KEY) infers gemini; otherwise none.
-// Model precedence: WAYLOG_LLM_MODEL > GEMINI_MODEL > built-in default.
+// WAYLOG_LLM_PROVIDER may be "none", "gemini", "anthropic", or "openai".
+// When unset, a supported provider key infers the provider; otherwise none.
+// Model precedence: WAYLOG_LLM_MODEL > provider-specific model env > built-in default.
 func SelectFromEnv() (Selection, error) {
 	raw := strings.ToLower(strings.TrimSpace(os.Getenv("WAYLOG_LLM_PROVIDER")))
-	key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
-	if key == "" {
-		key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
-	}
 
 	switch raw {
 	case "":
-		if key == "" {
-			return Selection{Provider: "none"}, nil
+		if key := geminiKeyFromEnv(); key != "" {
+			return buildGemini(key, true), nil
 		}
-		return buildGemini(key, true), nil
+		if key := strings.TrimSpace(os.Getenv("ANTHROPIC_API_KEY")); key != "" {
+			return buildAnthropic(key, true), nil
+		}
+		if key := strings.TrimSpace(os.Getenv("OPENAI_API_KEY")); key != "" {
+			return buildOpenAI(key, true), nil
+		}
+		return Selection{Provider: "none"}, nil
 	case "none":
 		return Selection{Provider: "none", Configured: true}, nil
 	case "gemini":
+		key := geminiKeyFromEnv()
 		if key == "" {
 			return Selection{Provider: "gemini", Configured: false}, nil
 		}
 		return buildGemini(key, true), nil
+	case "anthropic":
+		key := strings.TrimSpace(os.Getenv("ANTHROPIC_API_KEY"))
+		if key == "" {
+			return Selection{Provider: "anthropic", Configured: false}, nil
+		}
+		return buildAnthropic(key, true), nil
+	case "openai":
+		key := strings.TrimSpace(os.Getenv("OPENAI_API_KEY"))
+		if key == "" {
+			return Selection{Provider: "openai", Configured: false}, nil
+		}
+		return buildOpenAI(key, true), nil
 	default:
-		return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini", raw)
+		return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini, anthropic, openai", raw)
 	}
 }
 
+func geminiKeyFromEnv() string {
+	key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
+	if key == "" {
+		key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+	}
+	return key
+}
+
 func buildGemini(key string, configured bool) Selection {
 	client := NewGeminiClient(key)
 	if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
@@ -73,3 +96,41 @@ func buildGemini(key string, configured bool) Selection {
 		Impl:       client,
 	}
 }
+
+func buildAnthropic(key string, configured bool) Selection {
+	client := NewAnthropicClient(key)
+	if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
+		client.Model = model
+	} else if model := strings.TrimSpace(os.Getenv("ANTHROPIC_MODEL")); model != "" {
+		client.Model = model
+	}
+	if base := strings.TrimSpace(os.Getenv("ANTHROPIC_API_BASE")); base != "" {
+		client.BaseURL = base
+	}
+	return Selection{
+		Provider:   "anthropic",
+		Model:      client.Model,
+		Configured: configured,
+		AskEnabled: true,
+		Impl:       client,
+	}
+}
+
+func buildOpenAI(key string, configured bool) Selection {
+	client := NewOpenAIClient(key)
+	if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
+		client.Model = model
+	} else if model := strings.TrimSpace(os.Getenv("OPENAI_MODEL")); model != "" {
+		client.Model = model
+	}
+	if base := strings.TrimSpace(os.Getenv("OPENAI_API_BASE")); base != "" {
+		client.BaseURL = base
+	}
+	return Selection{
+		Provider:   "openai",
+		Model:      client.Model,
+		Configured: configured,
+		AskEnabled: true,
+		Impl:       client,
+	}
+}
diff --git a/internal/llm/provider_test.go b/internal/llm/provider_test.go
index b73e466..a19b248 100644
--- a/internal/llm/provider_test.go
+++ b/internal/llm/provider_test.go
@@ -14,6 +14,12 @@ func clearProviderEnv(t *testing.T) {
 	t.Setenv("GEMINI_MODEL", "")
 	t.Setenv("GEMINI_API_BASE", "")
 	t.Setenv("GEMINI_TOOL_MODE", "")
+	t.Setenv("ANTHROPIC_API_KEY", "")
+	t.Setenv("ANTHROPIC_MODEL", "")
+	t.Setenv("ANTHROPIC_API_BASE", "")
+	t.Setenv("OPENAI_API_KEY", "")
+	t.Setenv("OPENAI_MODEL", "")
+	t.Setenv("OPENAI_API_BASE", "")
 }
 
 func TestSelectFromEnv_NoEnv(t *testing.T) {
@@ -129,16 +135,16 @@ func TestSelectFromEnv_GeminiMissingKey(t *testing.T) {
 
 func TestSelectFromEnv_UnknownProvider(t *testing.T) {
 	clearProviderEnv(t)
-	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+	t.Setenv("WAYLOG_LLM_PROVIDER", "bogus")
 
 	_, err := SelectFromEnv()
 	if err == nil {
 		t.Fatal("expected error, got nil")
 	}
-	if !strings.Contains(err.Error(), "anthropic") {
+	if !strings.Contains(err.Error(), "bogus") {
 		t.Errorf("error %q should mention provider name", err.Error())
 	}
-	if !strings.Contains(err.Error(), "none, gemini") {
+	if !strings.Contains(err.Error(), "none, gemini, anthropic, openai") {
 		t.Errorf("error %q should list supported providers", err.Error())
 	}
 }
@@ -158,3 +164,129 @@ func TestSelectFromEnv_WaylogModelOverridesGeminiModel(t *testing.T) {
 		t.Errorf("Model = %q, want %q", sel.Model, "foo")
 	}
 }
+
+func TestSelectFromEnv_AnthropicWithKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+	t.Setenv("ANTHROPIC_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "anthropic" {
+		t.Errorf("Provider = %q, want anthropic", sel.Provider)
+	}
+	if !sel.Configured || !sel.AskEnabled || sel.Impl == nil {
+		t.Fatalf("selection = %+v, want configured enabled provider", sel)
+	}
+	if sel.Model == "" {
+		t.Error("Model is empty, want default")
+	}
+}
+
+func TestSelectFromEnv_AnthropicMissingKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "anthropic" {
+		t.Errorf("Provider = %q, want anthropic", sel.Provider)
+	}
+	if sel.Configured || sel.AskEnabled || sel.Impl != nil {
+		t.Fatalf("selection = %+v, want unavailable provider without startup error", sel)
+	}
+}
+
+func TestSelectFromEnv_OpenAIWithKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "openai")
+	t.Setenv("OPENAI_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "openai" {
+		t.Errorf("Provider = %q, want openai", sel.Provider)
+	}
+	if !sel.Configured || !sel.AskEnabled || sel.Impl == nil {
+		t.Fatalf("selection = %+v, want configured enabled provider", sel)
+	}
+	if sel.Model == "" {
+		t.Error("Model is empty, want default")
+	}
+}
+
+func TestSelectFromEnv_OpenAIMissingKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "openai")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if sel.Provider != "openai" {
+		t.Errorf("Provider = %q, want openai", sel.Provider)
+	}
+	if sel.Configured || sel.AskEnabled || sel.Impl != nil {
+		t.Fatalf("selection = %+v, want unavailable provider without startup error", sel)
+	}
+}
+
+func TestSelectFromEnv_InferredProviderKeys(t *testing.T) {
+	tests := []struct {
+		name     string
+		keyEnv   string
+		wantName string
+	}{
+		{name: "anthropic", keyEnv: "ANTHROPIC_API_KEY", wantName: "anthropic"},
+		{name: "openai", keyEnv: "OPENAI_API_KEY", wantName: "openai"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			clearProviderEnv(t)
+			t.Setenv(tc.keyEnv, "test-key")
+
+			sel, err := SelectFromEnv()
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if sel.Provider != tc.wantName || !sel.AskEnabled {
+				t.Fatalf("selection = %+v, want %s enabled", sel, tc.wantName)
+			}
+		})
+	}
+}
+
+func TestSelectFromEnv_WaylogModelOverridesProviderModel(t *testing.T) {
+	tests := []struct {
+		name     string
+		provider string
+		keyEnv   string
+		modelEnv string
+	}{
+		{name: "anthropic", provider: "anthropic", keyEnv: "ANTHROPIC_API_KEY", modelEnv: "ANTHROPIC_MODEL"},
+		{name: "openai", provider: "openai", keyEnv: "OPENAI_API_KEY", modelEnv: "OPENAI_MODEL"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			clearProviderEnv(t)
+			t.Setenv("WAYLOG_LLM_PROVIDER", tc.provider)
+			t.Setenv(tc.keyEnv, "test-key")
+			t.Setenv("WAYLOG_LLM_MODEL", "waylog-model")
+			t.Setenv(tc.modelEnv, "provider-model")
+
+			sel, err := SelectFromEnv()
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if sel.Model != "waylog-model" {
+				t.Fatalf("Model = %q, want waylog-model", sel.Model)
+			}
+		})
+	}
+}
diff --git a/internal/llm/types.go b/internal/llm/types.go
index 0c14aee..eaed6d9 100644
--- a/internal/llm/types.go
+++ b/internal/llm/types.go
@@ -12,8 +12,11 @@ type ToolDefinition struct {
 }
 
 type ToolCall struct {
-	Name      string
-	Arguments json.RawMessage
+	Name                string
+	Arguments           json.RawMessage
+	ProviderID          string            `json:"-"`
+	ProviderRawItems    []json.RawMessage `json:"-"`
+	ProviderRawIncluded bool              `json:"-"`
 }
 
 type ToolResult struct {
diff --git a/internal/triage/idempotency_test.go b/internal/triage/idempotency_test.go
index 9327295..51dd6cd 100644
--- a/internal/triage/idempotency_test.go
+++ b/internal/triage/idempotency_test.go
@@ -60,3 +60,37 @@ func TestSnapshotModeUsesIncidentUpdatedAt(t *testing.T) {
 		t.Fatalf("snap report missing incident ref")
 	}
 }
+
+func TestBuildHashDoesNotDependOnLLMProviderEnv(t *testing.T) {
+	providers := []string{"none", "gemini", "anthropic", "openai"}
+	var first string
+	for _, provider := range providers {
+		t.Run(provider, func(t *testing.T) {
+			t.Setenv("WAYLOG_LLM_PROVIDER", provider)
+			t.Setenv("GEMINI_API_KEY", "test-gemini-key")
+			t.Setenv("ANTHROPIC_API_KEY", "test-anthropic-key")
+			t.Setenv("OPENAI_API_KEY", "test-openai-key")
+
+			deps := Deps{
+				Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+				Signals: richSignals{}, NextChecks: richNextChecks{},
+				Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+			}
+			eng, err := NewEngine(deps)
+			if err != nil {
+				t.Fatalf("new engine: %v", err)
+			}
+			opts, _ := ParseBuildOptions("15m", true, deps.Now())
+			r, err := eng.Build(context.Background(), "inc_abc", opts)
+			if err != nil {
+				t.Fatalf("build: %v", err)
+			}
+			if first == "" {
+				first = r.ReportHash
+			}
+			if r.ReportHash != first {
+				t.Fatalf("provider %s hash = %q, want %q", provider, r.ReportHash, first)
+			}
+		})
+	}
+}

From cd615df7e63fd27c71120e2ec44f2ce8c2502908 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Fri, 8 May 2026 11:57:27 -0400
Subject: [PATCH 08/14] test: added rollup-correct attribution proof

- Add a runnable rollup-comparison demo proof that contrasts Waylog's
root-cause-counted PMT_502 rollup with a naive propagated service-hop count.

- Adds a graph invariant test to keep the root-cause rollup behavior pinned,
extends the demo JSON helper with small extractors used by the proof script,
and documents the new make rollup-comparison target in README.
---
 Makefile                                      |  6 +-
 README.md                                     |  3 +
 .../graph/analysis/rollup_invariant_test.go   | 33 +++++++
 scripts/demo-acceptance-json/main.go          | 47 +++++++++-
 scripts/rollup-comparison.sh                  | 86 +++++++++++++++++++
 5 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 internal/graph/analysis/rollup_invariant_test.go
 create mode 100755 scripts/rollup-comparison.sh

diff --git a/Makefile b/Makefile
index e47aeb7..cb1937a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 SHELL := /bin/sh
 
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
 
 help:
 	@echo "Targets:"
@@ -19,6 +19,7 @@ help:
 	@echo "  demo     - start dashboard demo locally (detached, no Docker)"
 	@echo "  demo-stop - stop demo processes"
 	@echo "  demo-acceptance - verify a running local demo end-to-end"
+	@echo "  rollup-comparison - run demo proof for root-cause vs naive rollup counts"
 	@echo "  demo-up  - start v2 demo stack in Docker (detached)"
 	@echo "  demo-down - stop Docker demo stack"
 	@echo "  micro-demo - start 4-service micro-demo in foreground for debugging"
@@ -123,6 +124,9 @@ demo-stop:
 demo-acceptance:
 	./scripts/demo-acceptance.sh
 
+rollup-comparison:
+	./scripts/rollup-comparison.sh
+
 demo-up: docker-dev
 
 demo-down: docker-down
diff --git a/README.md b/README.md
index 6871fc1..22f3471 100644
--- a/README.md
+++ b/README.md
@@ -273,8 +273,11 @@ make test-race      # race detector
 make ts-test        # TypeScript SDK vitest suite
 make ci             # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
 make demo-acceptance # with make demo running, verify demo + CLI triage loop
+make rollup-comparison # demo proof: root-cause counts vs naive propagated counts
 ```
 
+`make rollup-comparison` runs the checkout demo burst and prints the PMT_502 root-cause count next to a naive propagated count across touched services. It is the quickest local proof that Waylog's default rollups count the originating failure once per failed request instead of inflating it by every downstream hop.
+
 ## Auth
 
 Waylog uses three scoped keys. They are independent — the dashboard never holds the agent key.
diff --git a/internal/graph/analysis/rollup_invariant_test.go b/internal/graph/analysis/rollup_invariant_test.go
new file mode 100644
index 0000000..0607852
--- /dev/null
+++ b/internal/graph/analysis/rollup_invariant_test.go
@@ -0,0 +1,33 @@
+package analysis
+
+import (
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/graph/build"
+	"github.com/sssmaran/WaylogCLI/internal/graph/store"
+	"github.com/sssmaran/WaylogCLI/internal/tracestore"
+)
+
+func TestRollupInvariantRootCauseStaysBelowNaivePropagation(t *testing.T) {
+	s := store.NewStore()
+	ts := tracestore.NewStore()
+	b := build.NewBuilder()
+	now := time.Now().UTC()
+
+	const failedRequests = 3
+	for i := range failedRequests {
+		ingestCascade(t, s, ts, b, i, now.Add(-20*time.Second))
+	}
+
+	summary := RollupWindow(graphOf(s), s, ts, now.Add(-time.Minute), now.Add(time.Minute))
+	rootCauseCount := summary.PrimaryErrorCount["PMT_502"]
+	naivePropagatedCount := failedRequests * 3
+
+	if rootCauseCount != failedRequests {
+		t.Fatalf("PMT_502 root-cause count = %d, want %d", rootCauseCount, failedRequests)
+	}
+	if rootCauseCount >= naivePropagatedCount {
+		t.Fatalf("root-cause count should stay below naive propagated count: root=%d naive=%d", rootCauseCount, naivePropagatedCount)
+	}
+}
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index 1629b72..e4d5b67 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -13,6 +13,7 @@ type errorsResponse struct {
 
 type errorRow struct {
 	ErrorFamily    errorFamily `json:"error_family"`
+	Count          int         `json:"count"`
 	AffectedTraces int         `json:"affected_traces"`
 	SampleTraces   []string    `json:"sample_traces"`
 }
@@ -58,9 +59,13 @@ type triageReport struct {
 	ReportHash string `json:"report_hash"`
 }
 
+type blastResponse struct {
+	AffectedServices int `json:"affected_services"`
+}
+
 func main() {
 	if len(os.Args) != 2 {
-		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id|triage-report-hash>")
+		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|payment-error-count|payment-affected-traces|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id|triage-report-hash|blast-affected-services>")
 		os.Exit(2)
 	}
 
@@ -75,6 +80,10 @@ func main() {
 		if !hasPaymentError(body) {
 			os.Exit(1)
 		}
+	case "payment-error-count":
+		fmt.Println(paymentErrorCount(body))
+	case "payment-affected-traces":
+		fmt.Println(paymentAffectedTraces(body))
 	case "first-payment-trace":
 		fmt.Println(firstPaymentTrace(body))
 	case "first-event-id":
@@ -91,6 +100,8 @@ func main() {
 		fmt.Println(firstIncidentID(body))
 	case "triage-report-hash":
 		fmt.Println(triageReportHash(body))
+	case "blast-affected-services":
+		fmt.Println(blastAffectedServices(body))
 	default:
 		fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
 		os.Exit(2)
@@ -110,6 +121,32 @@ func hasPaymentError(body []byte) bool {
 	return false
 }
 
+func paymentErrorCount(body []byte) int {
+	var resp errorsResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return 0
+	}
+	for _, row := range resp.Rows {
+		if isPayment502(row) {
+			return row.Count
+		}
+	}
+	return 0
+}
+
+func paymentAffectedTraces(body []byte) int {
+	var resp errorsResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return 0
+	}
+	for _, row := range resp.Rows {
+		if isPayment502(row) {
+			return row.AffectedTraces
+		}
+	}
+	return 0
+}
+
 func firstPaymentTrace(body []byte) string {
 	var resp errorsResponse
 	if err := json.Unmarshal(body, &resp); err != nil {
@@ -197,3 +234,11 @@ func triageReportHash(body []byte) string {
 	}
 	return rep.ReportHash
 }
+
+func blastAffectedServices(body []byte) int {
+	var resp blastResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return 0
+	}
+	return resp.AffectedServices
+}
diff --git a/scripts/rollup-comparison.sh b/scripts/rollup-comparison.sh
new file mode 100755
index 0000000..252e7c1
--- /dev/null
+++ b/scripts/rollup-comparison.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
+INGEST_URL="${INGEST_URL:-http://localhost:8080}"
+WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+REQUESTS="${REQUESTS:-20}"
+CONCURRENCY="${CONCURRENCY:-5}"
+TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
+USE_RUNNING="${WAYLOG_ROLLUP_USE_RUNNING_DEMO:-0}"
+
+CLI_BIN="${WAYLOG_CLI_BIN:-./data/demo-state/bin/waylog}"
+JSON_BIN="${WAYLOG_JSON_HELPER_BIN:-./data/demo-state/bin/demo-acceptance-json}"
+
+fail() {
+  echo "FAIL: $*" >&2
+  exit 1
+}
+
+http_code() {
+  curl -s -o /dev/null -w "%{http_code}" "$1" || echo "000"
+}
+
+cleanup() {
+  if [[ "$USE_RUNNING" != "1" ]]; then
+    make demo-stop >/dev/null 2>&1 || true
+  fi
+}
+trap cleanup EXIT
+
+if [[ "$USE_RUNNING" != "1" ]]; then
+  make demo
+elif [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
+  fail "running demo is not reachable. Start it with make demo or unset WAYLOG_ROLLUP_USE_RUNNING_DEMO"
+fi
+
+mkdir -p ./data/demo-state/bin
+go build -o "$CLI_BIN" ./cmd/waylog
+go build -o "$JSON_BIN" ./scripts/demo-acceptance-json
+
+CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TIMEOUT")
+
+burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
+burst_status="$(curl -s -o /tmp/waylog-rollup-burst.json -w "%{http_code}" \
+  -X POST "${GATEWAY_URL}/demo/burst" \
+  -H 'Content-Type: application/json' \
+  --data "$burst_body" || echo "000")"
+[[ "$burst_status" == "200" ]] || fail "traffic burst failed: HTTP $burst_status"
+
+errors_json=""
+for _ in $(seq 1 15); do
+  errors_json="$("${CLI[@]}" --json errors --window 15m --limit 10)" || fail "waylog errors failed"
+  if "$JSON_BIN" has-payment-error <<<"$errors_json"; then
+    break
+  fi
+  sleep 1
+done
+"$JSON_BIN" has-payment-error <<<"$errors_json" || fail "payment_502 error family did not appear in /v1/errors"
+
+blast_json="$("${CLI[@]}" --json blast checkout:payment.charge:PMT_502 --window 15m)" || fail "waylog blast failed"
+
+root_count="$("$JSON_BIN" payment-error-count <<<"$errors_json")"
+affected_traces="$("$JSON_BIN" payment-affected-traces <<<"$errors_json")"
+affected_services="$("$JSON_BIN" blast-affected-services <<<"$blast_json")"
+
+[[ "$root_count" =~ ^[0-9]+$ ]] || fail "root-cause count is not numeric: $root_count"
+[[ "$affected_services" =~ ^[0-9]+$ ]] || fail "affected services is not numeric: $affected_services"
+(( root_count > 0 )) || fail "root-cause count is empty"
+(( affected_services > 1 )) || fail "blast radius did not show cross-service spread"
+
+naive_count=$((root_count * affected_services))
+(( naive_count > root_count )) || fail "naive propagated count did not exceed root-cause count"
+
+cat <<EOF
+Rollup comparison
+  workload: ${REQUESTS} demo requests, concurrency ${CONCURRENCY}
+  root-cause counted PMT_502: ${root_count}
+  affected traces: ${affected_traces}
+  affected services: ${affected_services}
+  naive propagated count: ${root_count} * ${affected_services} = ${naive_count}
+
+PASS: Waylog counts the root cause once per failed request instead of once per propagated service hop.
+EOF

From a5f1f9364789e2163eceb75d39a705f4648c26b1 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Fri, 8 May 2026 16:01:50 -0400
Subject: [PATCH 09/14] fix: preserve capabilities fields in CLI output

- Keep provider and incident metadata when decoding capabilities responses in the CLI.

- Render provider, Ask, incident persistence, and rebuild state in human-readable output, and add regression coverage for preserving those fields in JSON output.
---
 internal/cli/v2/render.go      | 16 ++++++++++++++
 internal/cli/v2/render_test.go | 39 +++++++++++++++++++++++++++++++++-
 internal/cli/v2/types.go       | 15 +++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index e0bed00..3092b0a 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -286,6 +286,22 @@ func RenderSearch(w io.Writer, resp EventSearchResponse) {
 func RenderCapabilities(w io.Writer, resp CapabilitiesResponse) {
 	fmt.Fprintf(w, "v2_reads: %s\n", enabledLabel(resp.V2Reads.Enabled))
 	fmt.Fprintf(w, "otlp_http_traces: %s\n", enabledLabel(resp.OTLP.HTTPTraces))
+	if resp.LLM.Provider != "" {
+		fmt.Fprintf(w, "llm: provider=%s configured=%t ask_enabled=%t", resp.LLM.Provider, resp.LLM.Configured, resp.LLM.AskEnabled)
+		if resp.LLM.Model != "" {
+			fmt.Fprintf(w, " model=%s", resp.LLM.Model)
+		}
+		if resp.LLM.ToolMode != "" {
+			fmt.Fprintf(w, " tool_mode=%s", resp.LLM.ToolMode)
+		}
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintf(w, "incidents: enabled=%t persistent=%t rebuild_supported=%t",
+		resp.Incidents.Enabled, resp.Incidents.Persistent, resp.Incidents.Rebuild.Supported)
+	if resp.Incidents.Rebuild.Scope != "" {
+		fmt.Fprintf(w, " rebuild_scope=%s", resp.Incidents.Rebuild.Scope)
+	}
+	fmt.Fprintln(w)
 }
 
 func eventRoute(ev *Event) string {
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index a60e4cc..5748f05 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -137,8 +137,45 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
 	var out bytes.Buffer
 	resp := CapabilitiesResponse{}
 	resp.OTLP.HTTPTraces = true
+	resp.LLM.Provider = "none"
+	resp.Incidents.Enabled = true
+	resp.Incidents.Persistent = true
+	resp.Incidents.Rebuild.Supported = true
+	resp.Incidents.Rebuild.Scope = "hot-window"
 	RenderCapabilities(&out, resp)
-	if !strings.Contains(out.String(), "v2_reads: disabled") || !strings.Contains(out.String(), "otlp_http_traces: enabled") {
+	for _, want := range []string{
+		"v2_reads: disabled",
+		"otlp_http_traces: enabled",
+		"llm: provider=none configured=false ask_enabled=false",
+		"incidents: enabled=true persistent=true rebuild_supported=true rebuild_scope=hot-window",
+	} {
+		if !strings.Contains(out.String(), want) {
+			t.Fatalf("output missing %q:\n%s", want, out.String())
+		}
+	}
+}
+
+func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
+	raw := []byte(`{
+		"v2_reads":{"enabled":true},
+		"otlp":{"http_traces":true},
+		"llm":{"provider":"none","model":"","tool_mode":"","configured":false,"ask_enabled":false},
+		"incidents":{"enabled":true,"persistent":true,"rebuild":{"supported":true,"scope":"hot-window"}}
+	}`)
+	var resp CapabilitiesResponse
+	if err := json.Unmarshal(raw, &resp); err != nil {
+		t.Fatal(err)
+	}
+	var out bytes.Buffer
+	if err := renderJSON(&out, resp); err != nil {
+		t.Fatal(err)
+	}
+	for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`} {
+		if !strings.Contains(out.String(), want) {
+			t.Fatalf("json missing %q:\n%s", want, out.String())
+		}
+	}
+	if !resp.Incidents.Rebuild.Supported {
 		t.Fatalf("output=%s", out.String())
 	}
 }
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 58fcde9..dc707b5 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -15,6 +15,21 @@ type CapabilitiesResponse struct {
 	OTLP struct {
 		HTTPTraces bool `json:"http_traces"`
 	} `json:"otlp"`
+	LLM struct {
+		Provider   string `json:"provider"`
+		Model      string `json:"model"`
+		ToolMode   string `json:"tool_mode"`
+		Configured bool   `json:"configured"`
+		AskEnabled bool   `json:"ask_enabled"`
+	} `json:"llm"`
+	Incidents struct {
+		Enabled    bool `json:"enabled"`
+		Persistent bool `json:"persistent"`
+		Rebuild    struct {
+			Supported bool   `json:"supported"`
+			Scope     string `json:"scope"`
+		} `json:"rebuild"`
+	} `json:"incidents"`
 }
 
 type EventSearchResponse = apiv2.EventSearchResponse

From 03c7d899ebe024f54d103eb23afcc23b5e1c1c7c Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Sun, 10 May 2026 15:42:02 -0400
Subject: [PATCH 10/14] feat: added OTLP gRPC trace ingest

- add OTLP TraceService gRPC receiver using the existing trace conversion path
- require write-scope bearer auth on gRPC metadata
- share OTLP export logic between HTTP and gRPC transports
- report OTLP gRPC state in capabilities and CLI output
- add deterministic OTLP conformance target
- add OpenTelemetry Collector example
- document OTLP gRPC env, capabilities, and status
---
 Makefile                            |   8 +-
 README.md                           |  16 ++--
 cmd/ingest/main.go                  |  34 ++++++++
 docs/env.md                         |   4 +-
 docs/openapi.yaml                   |  12 ++-
 examples/otel-collector/config.yaml |  21 +++++
 go.mod                              |   3 +
 go.sum                              |   4 +
 go.work.sum                         |   6 +-
 internal/cli/v2/render.go           |   5 ++
 internal/cli/v2/render_test.go      |   7 +-
 internal/cli/v2/types.go            |   4 +-
 internal/ingest/handler.go          |  14 ++++
 internal/ingest/handler_test.go     |  32 ++++++++
 internal/otel/grpc.go               | 118 ++++++++++++++++++++++++++++
 internal/otel/grpc_test.go          |  95 ++++++++++++++++++++++
 internal/otel/handler.go            |  74 ++++++++++++-----
 scripts/otlp-conformance.sh         |  10 +++
 18 files changed, 428 insertions(+), 39 deletions(-)
 create mode 100644 examples/otel-collector/config.yaml
 create mode 100644 internal/otel/grpc.go
 create mode 100644 internal/otel/grpc_test.go
 create mode 100755 scripts/otlp-conformance.sh

diff --git a/Makefile b/Makefile
index cb1937a..e9f21ee 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 SHELL := /bin/sh
 
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
 
 help:
 	@echo "Targets:"
@@ -20,6 +20,7 @@ help:
 	@echo "  demo-stop - stop demo processes"
 	@echo "  demo-acceptance - verify a running local demo end-to-end"
 	@echo "  rollup-comparison - run demo proof for root-cause vs naive rollup counts"
+	@echo "  otlp-conformance - run deterministic OTLP HTTP/gRPC fixture checks"
 	@echo "  demo-up  - start v2 demo stack in Docker (detached)"
 	@echo "  demo-down - stop Docker demo stack"
 	@echo "  micro-demo - start 4-service micro-demo in foreground for debugging"
@@ -83,7 +84,7 @@ vet-sdk: ## Vet SDK modules
 	cd pkg && go vet ./...
 	cd pkg/transport/kafka && go vet ./...
 
-ci: fmt vet vet-sdk test-race test-sdk ts-test check-doc-links check-rollup-contract
+ci: fmt vet vet-sdk test-race test-sdk ts-test check-doc-links check-rollup-contract otlp-conformance
 	@echo "CI checks passed"
 
 ts-install: ## Install TS SDK deps (skipped if node_modules is already present)
@@ -127,6 +128,9 @@ demo-acceptance:
 rollup-comparison:
 	./scripts/rollup-comparison.sh
 
+otlp-conformance:
+	./scripts/otlp-conformance.sh
+
 demo-up: docker-dev
 
 demo-down: docker-down
diff --git a/README.md b/README.md
index 22f3471..a0246aa 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 <p align="center">
   <strong>Structured logging that explains failed requests and active incidents.</strong><br>
-  Drop-in SDKs (Go, TypeScript) or OTLP/HTTP. Agent-native by design.
+  Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC traces. Agent-native by design.
 </p>
 
 <p align="center">
@@ -133,9 +133,9 @@ func main() {
 
 The recommended SDK path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers. Low-level request APIs such as `Begin`, `Finalize`, and `setField` are for adapter authors, tests, and unusual custom integrations. Full copy-paste examples for `net/http`, chi, gin, echo, standalone TypeScript, Express, Hono, Next.js, and NestJS are in [`docs/sdk-examples.md`](docs/sdk-examples.md).
 
-### OTLP/HTTP traces
+### OTLP traces
 
-Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces`. Protobuf bodies are accepted (gzip optional) and HTTP spans convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. **Phase A covers traces over HTTP.** gRPC, logs, and metrics are not yet shipping.
+Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces` for OTLP/HTTP or `localhost:4317` for OTLP/gRPC. Protobuf trace exports convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. A collector config lives in [`examples/otel-collector/`](examples/otel-collector/). **Only traces are supported.** OTLP logs and metrics are not shipping yet.
 
 ### Alternative: local ingest server (no Docker)
 
@@ -244,8 +244,8 @@ The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs a
 ## Architecture
 
 ```text
-Go / TS services (SDK) · OTLP/HTTP collectors
-        │  schema-2.0 WideEvents · OTLP/HTTP traces
+Go / TS services (SDK) · OTLP collectors
+        │  schema-2.0 WideEvents · OTLP HTTP/gRPC traces
         ▼
   ingest server
     ├─ event log (append-only WAL, source of truth)
@@ -297,7 +297,7 @@ Public alpha. APIs may break before 1.0.
 **Shipped:**
 
 - Go SDK v2 (`net/http`, chi, gin, echo) and TypeScript SDK v2 (`@waylog/sdk`, ESM, Node 18+, standalone core, Express, Hono, Next.js, NestJS)
-- OTLP/HTTP traces at `/v1/otlp/v1/traces` (Phase A — traces only)
+- OTLP traces over HTTP at `/v1/otlp/v1/traces` and gRPC at `:4317` (traces only)
 - durable ingest with WAL + replay
 - hot graph with flattened 3-node model + dedicated trace store
 - schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
@@ -314,7 +314,7 @@ Public alpha. APIs may break before 1.0.
 
 **Planned:**
 
-- OTLP gRPC, logs, and metrics (Phase B)
+- OTLP logs and metrics
 - Python SDK
 - Mintlify docs site
 
@@ -322,7 +322,7 @@ Public alpha. APIs may break before 1.0.
 
 - Single-node only. No HA, no clustering.
 - Alpha quality. APIs may break before 1.0.
-- OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
+- OTLP supports traces only. Logs and metrics are not shipping yet.
 - Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
 - SQLite cold store fits demos and small deployments; not sized for production-scale retention.
 - Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 579e3a6..b71153f 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"net"
 	"net/http"
 	"os"
 	"os/signal"
@@ -41,6 +42,8 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/triagehttp"
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+	coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+	"google.golang.org/grpc"
 )
 
 var graphStore *graphstore.Store
@@ -130,6 +133,7 @@ func main() {
 	grafanaURL := config.Getenv("GRAFANA_URL", "")
 	graphUI := config.GetenvBool("GRAPH_UI", false)
 	otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
+	otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
 	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
 	signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
 	incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
@@ -262,6 +266,8 @@ func main() {
 		PlanStore:                planStore,
 		GraphHotWindow:           graphHotWindow,
 		OTLPEnabled:              otlpEnabled,
+		OTLPGRPCEnabled:          otlpEnabled && otlpGRPCAddr != "",
+		OTLPGRPCAddr:             otlpGRPCAddr,
 		V2ReadsEnabled:           v2ReadsEnabled,
 		IncidentsEnabled:         v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
 		IncidentsPersistent:      v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
@@ -390,10 +396,19 @@ func main() {
 	mux.Handle("/v1/signals", writeAuth(http.HandlerFunc(signalHandler.Signals)))
 
 	// OTLP/HTTP traces reuse the same schema-2.0 WAL and projector as the SDK path.
+	var otlpGRPCServer *grpc.Server
 	if otlpEnabled {
 		otlpHandler := otelhttp.NewHandler(eventsV2, m, maxBody)
 		mux.Handle("/v1/otlp/v1/traces", writeAuth(http.HandlerFunc(otlpHandler.ServeHTTP)))
 		slog.Info("otlp enabled", "endpoint", "/v1/otlp/v1/traces")
+		if otlpGRPCAddr != "" {
+			otlpGRPCServer = grpc.NewServer(
+				grpc.UnaryInterceptor(otelhttp.AuthUnaryInterceptor(authCfg.WriteKeys)),
+				grpc.MaxRecvMsgSize(int(maxBody)),
+			)
+			coltracepb.RegisterTraceServiceServer(otlpGRPCServer, otelhttp.NewTraceServiceServer(eventsV2, m, maxBody))
+			ingestServer.SetOTLPGRPC(true, otlpGRPCAddr)
+		}
 	}
 
 	// Read endpoints — CORS outermost so OPTIONS preflight passes without auth.
@@ -619,6 +634,21 @@ func main() {
 		}
 	}()
 
+	if otlpGRPCServer != nil {
+		lis, err := net.Listen("tcp", otlpGRPCAddr)
+		if err != nil {
+			slog.Error("otlp grpc listen failed", "addr", otlpGRPCAddr, "err", err)
+			os.Exit(1)
+		}
+		go func() {
+			slog.Info("otlp grpc enabled", "addr", otlpGRPCAddr)
+			if err := otlpGRPCServer.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
+				slog.Error("otlp grpc server error", "err", err)
+				os.Exit(1)
+			}
+		}()
+	}
+
 	// ---------------- Embedded CLI ----------------
 
 	if mcpStdio {
@@ -867,6 +897,10 @@ func main() {
 	} else {
 		slog.Info("ingest shutdown complete")
 	}
+	if otlpGRPCServer != nil {
+		otlpGRPCServer.GracefulStop()
+		slog.Info("otlp grpc shutdown complete")
+	}
 
 	planStore.Close()
 
diff --git a/docs/env.md b/docs/env.md
index 65fa480..de42e43 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -46,7 +46,9 @@ Scoped keys. See the Auth section of the [README](../README.md).
 | Variable | Default | Purpose |
 |---|---|---|
 | `INGEST_ADDR` | `:8080` | Listen address |
-| `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events` |
+| `OTLP_ENABLED` | `true` | Enable OTLP trace ingest over HTTP and gRPC |
+| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver |
+| `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events`, `/v1/otlp/v1/traces`, and OTLP/gRPC receive messages |
 | `READ_HEADER_TIMEOUT` | `5s` | HTTP read header timeout |
 | `READ_TIMEOUT` | `10s` | HTTP read timeout |
 | `WRITE_TIMEOUT` | `10s` | HTTP write timeout |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 7162edb..4d480f1 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -20,7 +20,7 @@ tags:
   - name: Ingest
     description: Schema-2.0 event ingest and validation.
   - name: OTLP
-    description: OTLP/HTTP trace ingest converted into schema-2.0 events.
+    description: OTLP trace ingest converted into schema-2.0 events. HTTP is documented here; gRPC uses OTLP TraceService on the configured OTLP_GRPC_ADDR.
   - name: Signals
     description: Production-context facts used by incident triage.
   - name: Events
@@ -207,7 +207,9 @@ paths:
         Accepts OTLP ExportTraceServiceRequest protobuf bodies. HTTP spans are
         converted into schema-2.0 WideEvents and ingested through the same WAL
         and projector as SDK events. Unsupported spans and validation rejects
-        are reported via OTLP partial_success.
+        are reported via OTLP partial_success. The same trace conversion path
+        is available over OTLP/gRPC TraceService when the gRPC receiver is
+        enabled.
       security:
         - ApiKeyHeader: []
         - BearerAuth: []
@@ -1949,6 +1951,8 @@ components:
       example:
         otlp:
           http_traces: true
+          grpc_traces: true
+          grpc_addr: ":4317"
         v2_reads:
           enabled: true
         graph: false
@@ -1989,6 +1993,10 @@ components:
           properties:
             http_traces:
               type: boolean
+            grpc_traces:
+              type: boolean
+            grpc_addr:
+              type: string
         v2_reads:
           type: object
           properties:
diff --git a/examples/otel-collector/config.yaml b/examples/otel-collector/config.yaml
new file mode 100644
index 0000000..d499b4f
--- /dev/null
+++ b/examples/otel-collector/config.yaml
@@ -0,0 +1,21 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+exporters:
+  otlp/waylog:
+    endpoint: ${env:WAYLOG_OTLP_GRPC_ENDPOINT}
+    tls:
+      insecure: true
+    headers:
+      authorization: Bearer ${env:WAYLOG_WRITE_KEY}
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      exporters: [otlp/waylog]
diff --git a/go.mod b/go.mod
index 0f46631..ad8dfec 100644
--- a/go.mod
+++ b/go.mod
@@ -54,8 +54,11 @@ require (
 	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
+	golang.org/x/net v0.50.0 // indirect
 	golang.org/x/sys v0.41.0 // indirect
 	golang.org/x/text v0.34.0 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect
+	google.golang.org/grpc v1.79.2 // indirect
 	google.golang.org/protobuf v1.36.11 // indirect
 	modernc.org/libc v1.67.6 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
diff --git a/go.sum b/go.sum
index 9147d0e..89a1ad1 100644
--- a/go.sum
+++ b/go.sum
@@ -117,6 +117,10 @@ golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
 golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
 golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
 golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
+google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
+google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
 google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
 google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/go.work.sum b/go.work.sum
index b5458b3..29d3784 100644
--- a/go.work.sum
+++ b/go.work.sum
@@ -57,6 +57,8 @@ github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtX
 golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
 golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8=
 golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw=
+golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
+golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
 golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
 golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
 golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY=
@@ -64,7 +66,3 @@ golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGN
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
 google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
-google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
-google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index 3092b0a..2e61981 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -286,6 +286,11 @@ func RenderSearch(w io.Writer, resp EventSearchResponse) {
 func RenderCapabilities(w io.Writer, resp CapabilitiesResponse) {
 	fmt.Fprintf(w, "v2_reads: %s\n", enabledLabel(resp.V2Reads.Enabled))
 	fmt.Fprintf(w, "otlp_http_traces: %s\n", enabledLabel(resp.OTLP.HTTPTraces))
+	fmt.Fprintf(w, "otlp_grpc_traces: %s", enabledLabel(resp.OTLP.GRPCTraces))
+	if resp.OTLP.GRPCAddr != "" {
+		fmt.Fprintf(w, " addr=%s", resp.OTLP.GRPCAddr)
+	}
+	fmt.Fprintln(w)
 	if resp.LLM.Provider != "" {
 		fmt.Fprintf(w, "llm: provider=%s configured=%t ask_enabled=%t", resp.LLM.Provider, resp.LLM.Configured, resp.LLM.AskEnabled)
 		if resp.LLM.Model != "" {
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 5748f05..4fa4f34 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -137,6 +137,8 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
 	var out bytes.Buffer
 	resp := CapabilitiesResponse{}
 	resp.OTLP.HTTPTraces = true
+	resp.OTLP.GRPCTraces = true
+	resp.OTLP.GRPCAddr = ":4317"
 	resp.LLM.Provider = "none"
 	resp.Incidents.Enabled = true
 	resp.Incidents.Persistent = true
@@ -146,6 +148,7 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
 	for _, want := range []string{
 		"v2_reads: disabled",
 		"otlp_http_traces: enabled",
+		"otlp_grpc_traces: enabled addr=:4317",
 		"llm: provider=none configured=false ask_enabled=false",
 		"incidents: enabled=true persistent=true rebuild_supported=true rebuild_scope=hot-window",
 	} {
@@ -158,7 +161,7 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
 func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
 	raw := []byte(`{
 		"v2_reads":{"enabled":true},
-		"otlp":{"http_traces":true},
+		"otlp":{"http_traces":true,"grpc_traces":true,"grpc_addr":":4317"},
 		"llm":{"provider":"none","model":"","tool_mode":"","configured":false,"ask_enabled":false},
 		"incidents":{"enabled":true,"persistent":true,"rebuild":{"supported":true,"scope":"hot-window"}}
 	}`)
@@ -170,7 +173,7 @@ func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
 	if err := renderJSON(&out, resp); err != nil {
 		t.Fatal(err)
 	}
-	for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`} {
+	for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`, `"grpc_traces": true`} {
 		if !strings.Contains(out.String(), want) {
 			t.Fatalf("json missing %q:\n%s", want, out.String())
 		}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index dc707b5..c281fef 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -13,7 +13,9 @@ type CapabilitiesResponse struct {
 		Enabled bool `json:"enabled"`
 	} `json:"v2_reads"`
 	OTLP struct {
-		HTTPTraces bool `json:"http_traces"`
+		HTTPTraces bool   `json:"http_traces"`
+		GRPCTraces bool   `json:"grpc_traces"`
+		GRPCAddr   string `json:"grpc_addr"`
 	} `json:"otlp"`
 	LLM struct {
 		Provider   string `json:"provider"`
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index 6bf181d..452f86f 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -136,6 +136,8 @@ type Server struct {
 	// OTLP capability flag — reported by /v1/capabilities. Set via
 	// ServerConfig when the OTLP handler is mounted in main.go.
 	otlpEnabled               bool
+	otlpGRPCEnabled           bool
+	otlpGRPCAddr              string
 	v2ReadsEnabled            bool
 	incidentsEnabled          bool
 	incidentsPersistent       bool
@@ -204,6 +206,8 @@ type ServerConfig struct {
 	PlanStore                *PlanStore
 	GraphHotWindow           time.Duration
 	OTLPEnabled              bool
+	OTLPGRPCEnabled          bool
+	OTLPGRPCAddr             string
 	V2ReadsEnabled           bool
 	IncidentsEnabled         bool
 	IncidentsPersistent      bool
@@ -246,6 +250,8 @@ func NewServer(cfg ServerConfig) *Server {
 		planStore:                 cfg.PlanStore,
 		graphHotWindow:            cfg.GraphHotWindow,
 		otlpEnabled:               cfg.OTLPEnabled,
+		otlpGRPCEnabled:           cfg.OTLPGRPCEnabled,
+		otlpGRPCAddr:              cfg.OTLPGRPCAddr,
 		v2ReadsEnabled:            cfg.V2ReadsEnabled,
 		incidentsEnabled:          cfg.IncidentsEnabled,
 		incidentsPersistent:       cfg.IncidentsPersistent,
@@ -405,6 +411,12 @@ func (s *Server) AcceptedPtr() *atomic.Uint64 { return &s.accepted }
 // Called once at startup after the OTLP route has been registered.
 func (s *Server) SetOTLPEnabled(enabled bool) { s.otlpEnabled = enabled }
 
+// SetOTLPGRPC marks the OTLP/gRPC trace receiver as mounted.
+func (s *Server) SetOTLPGRPC(enabled bool, addr string) {
+	s.otlpGRPCEnabled = enabled
+	s.otlpGRPCAddr = addr
+}
+
 // EventSearch handles GET /v1/events/search.
 // Both cold-store and JSONL paths return the same []coldstore.SearchResult shape.
 func (s *Server) EventSearch(w http.ResponseWriter, r *http.Request) {
@@ -597,6 +609,8 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 		"graph": s.graphUI,
 		"otlp": map[string]any{
 			"http_traces": s.otlpEnabled,
+			"grpc_traces": s.otlpGRPCEnabled,
+			"grpc_addr":   s.otlpGRPCAddr,
 		},
 		"v2_reads": map[string]any{
 			"enabled": s.v2ReadsEnabled,
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 28099a7..c8775b4 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -349,6 +349,38 @@ func TestCapabilities_V2ReadsEnabled(t *testing.T) {
 	}
 }
 
+func TestCapabilities_OTLPGRPCBlock(t *testing.T) {
+	srv := NewServer(ServerConfig{
+		OTLPEnabled:     true,
+		OTLPGRPCEnabled: true,
+		OTLPGRPCAddr:    ":4317",
+	})
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+	w := httptest.NewRecorder()
+	srv.Capabilities(w, req)
+
+	var resp struct {
+		OTLP struct {
+			HTTPTraces bool   `json:"http_traces"`
+			GRPCTraces bool   `json:"grpc_traces"`
+			GRPCAddr   string `json:"grpc_addr"`
+		} `json:"otlp"`
+	}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("invalid json: %v", err)
+	}
+	if !resp.OTLP.HTTPTraces {
+		t.Fatal("otlp.http_traces = false, want true")
+	}
+	if !resp.OTLP.GRPCTraces {
+		t.Fatal("otlp.grpc_traces = false, want true")
+	}
+	if resp.OTLP.GRPCAddr != ":4317" {
+		t.Fatalf("otlp.grpc_addr = %q, want :4317", resp.OTLP.GRPCAddr)
+	}
+}
+
 func TestCapabilities_IncidentsBlock(t *testing.T) {
 	tests := []struct {
 		name             string
diff --git a/internal/otel/grpc.go b/internal/otel/grpc.go
new file mode 100644
index 0000000..886be3b
--- /dev/null
+++ b/internal/otel/grpc.go
@@ -0,0 +1,118 @@
+package otel
+
+import (
+	"context"
+	"crypto/subtle"
+	"net/http"
+	"strings"
+	"time"
+
+	ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+	coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/metadata"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/proto"
+)
+
+// TraceServiceServer implements OTLP/gRPC TraceService over the same export
+// processor used by the OTLP/HTTP endpoint.
+type TraceServiceServer struct {
+	coltracepb.UnimplementedTraceServiceServer
+	handler *Handler
+	metrics *metrics.Metrics
+}
+
+func NewTraceServiceServer(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *TraceServiceServer {
+	return &TraceServiceServer{
+		handler: NewHandler(v2Ingest, m, maxBodyBytes),
+		metrics: m,
+	}
+}
+
+func (s *TraceServiceServer) Export(ctx context.Context, req *coltracepb.ExportTraceServiceRequest) (*coltracepb.ExportTraceServiceResponse, error) {
+	start := time.Now()
+	if s.metrics != nil {
+		defer func() {
+			s.metrics.OTLPRequestDuration.Observe(time.Since(start).Seconds())
+		}()
+		if req != nil {
+			s.metrics.OTLPRequestSizeBytes.Observe(float64(proto.Size(req)))
+		}
+	}
+
+	resp, err := s.handler.Export(ctx, req)
+	if err != nil {
+		code := codes.Internal
+		msg := "infrastructure error"
+		if exportErr, ok := err.(*ExportError); ok {
+			if s.metrics != nil {
+				s.metrics.OTLPRequestsTotal.WithLabelValues(exportErr.Bucket).Inc()
+			}
+			code = grpcCode(exportErr.StatusCode)
+			msg = exportErr.Message
+		}
+		return nil, status.Error(code, msg)
+	}
+	return resp, nil
+}
+
+func AuthUnaryInterceptor(writeKeys []string) grpc.UnaryServerInterceptor {
+	keyBytes := make([][]byte, 0, len(writeKeys))
+	for _, key := range writeKeys {
+		key = strings.TrimSpace(key)
+		if key != "" {
+			keyBytes = append(keyBytes, []byte(key))
+		}
+	}
+	return func(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) {
+		if len(keyBytes) == 0 {
+			return handler(ctx, req)
+		}
+		if token := bearerToken(ctx); token != "" && matchesAnyToken([]byte(token), keyBytes) {
+			return handler(ctx, req)
+		}
+		return nil, status.Error(codes.Unauthenticated, "unauthorized")
+	}
+}
+
+func bearerToken(ctx context.Context) string {
+	md, ok := metadata.FromIncomingContext(ctx)
+	if !ok {
+		return ""
+	}
+	for _, auth := range md.Get("authorization") {
+		if idx := strings.IndexByte(auth, ' '); idx > 0 && strings.EqualFold(auth[:idx], "bearer") {
+			return strings.TrimSpace(auth[idx+1:])
+		}
+	}
+	return ""
+}
+
+func matchesAnyToken(token []byte, keys [][]byte) bool {
+	match := 0
+	for _, key := range keys {
+		match |= subtle.ConstantTimeCompare(token, key)
+	}
+	return match == 1
+}
+
+func grpcCode(statusCode int) codes.Code {
+	switch statusCode {
+	case http.StatusBadRequest, http.StatusRequestEntityTooLarge, http.StatusUnsupportedMediaType:
+		return codes.InvalidArgument
+	case http.StatusUnauthorized:
+		return codes.Unauthenticated
+	case http.StatusForbidden:
+		return codes.PermissionDenied
+	case http.StatusServiceUnavailable:
+		return codes.Unavailable
+	default:
+		if statusCode >= 500 {
+			return codes.Internal
+		}
+		return codes.Unknown
+	}
+}
diff --git a/internal/otel/grpc_test.go b/internal/otel/grpc_test.go
new file mode 100644
index 0000000..7e9beb1
--- /dev/null
+++ b/internal/otel/grpc_test.go
@@ -0,0 +1,95 @@
+package otel
+
+import (
+	"context"
+	"net"
+	"testing"
+
+	coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/metadata"
+	"google.golang.org/grpc/status"
+	"google.golang.org/grpc/test/bufconn"
+)
+
+const bufSize = 1024 * 1024
+
+func newBufconnClient(t *testing.T, keys []string) (coltracepb.TraceServiceClient, func()) {
+	t.Helper()
+	lis := bufconn.Listen(bufSize)
+	srv := grpc.NewServer(grpc.UnaryInterceptor(AuthUnaryInterceptor(keys)))
+	coltracepb.RegisterTraceServiceServer(srv, NewTraceServiceServer(testV2Ingest(t), nil, 1<<20))
+	go func() {
+		_ = srv.Serve(lis)
+	}()
+
+	ctx := context.Background()
+	conn, err := grpc.DialContext(ctx, "bufnet",
+		grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
+			return lis.Dial()
+		}),
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+	)
+	if err != nil {
+		t.Fatalf("dial bufconn: %v", err)
+	}
+	cleanup := func() {
+		_ = conn.Close()
+		srv.Stop()
+		_ = lis.Close()
+	}
+	return coltracepb.NewTraceServiceClient(conn), cleanup
+}
+
+func TestGRPCExportHappyPath(t *testing.T) {
+	client, cleanup := newBufconnClient(t, []string{"write-key"})
+	defer cleanup()
+
+	ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Bearer write-key")
+	resp, err := client.Export(ctx, validOTLPRequest())
+	if err != nil {
+		t.Fatalf("export: %v", err)
+	}
+	if resp.PartialSuccess != nil && resp.PartialSuccess.RejectedSpans != 0 {
+		t.Fatalf("partial_success = %+v, want none", resp.PartialSuccess)
+	}
+}
+
+func TestGRPCExportAuth(t *testing.T) {
+	tests := []struct {
+		name string
+		auth string
+		code codes.Code
+	}{
+		{name: "missing", code: codes.Unauthenticated},
+		{name: "bad key", auth: "Bearer wrong", code: codes.Unauthenticated},
+		{name: "read key", auth: "Bearer read-key", code: codes.Unauthenticated},
+		{name: "agent key", auth: "Bearer agent-key", code: codes.Unauthenticated},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			client, cleanup := newBufconnClient(t, []string{"write-key"})
+			defer cleanup()
+
+			ctx := context.Background()
+			if tc.auth != "" {
+				ctx = metadata.AppendToOutgoingContext(ctx, "authorization", tc.auth)
+			}
+			_, err := client.Export(ctx, validOTLPRequest())
+			if status.Code(err) != tc.code {
+				t.Fatalf("code = %v, want %v (err=%v)", status.Code(err), tc.code, err)
+			}
+		})
+	}
+}
+
+func TestGRPCExportDevModeNoKeys(t *testing.T) {
+	client, cleanup := newBufconnClient(t, nil)
+	defer cleanup()
+
+	if _, err := client.Export(context.Background(), validOTLPRequest()); err != nil {
+		t.Fatalf("export without auth in dev mode: %v", err)
+	}
+}
diff --git a/internal/otel/handler.go b/internal/otel/handler.go
index 321c344..1bcac11 100644
--- a/internal/otel/handler.go
+++ b/internal/otel/handler.go
@@ -8,6 +8,7 @@ import (
 	"compress/gzip"
 	"context"
 	"encoding/json"
+	"errors"
 	"io"
 	"log/slog"
 	"mime"
@@ -34,6 +35,29 @@ type Handler struct {
 	maxBodyBytes int64
 }
 
+// ExportError is returned when decoded OTLP spans cannot be processed after
+// transport-level parsing has succeeded.
+type ExportError struct {
+	StatusCode int
+	Bucket     string
+	Message    string
+	Cause      error
+}
+
+func (e *ExportError) Error() string {
+	if e == nil {
+		return ""
+	}
+	return e.Message
+}
+
+func (e *ExportError) Unwrap() error {
+	if e == nil {
+		return nil
+	}
+	return e.Cause
+}
+
 // NewHandler constructs an OTLP traces handler.
 func NewHandler(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *Handler {
 	return &Handler{
@@ -104,7 +128,31 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	convResult := convert.SpansToEvents(&req)
+	resp, err := h.Export(r.Context(), &req)
+	if err != nil {
+		var exportErr *ExportError
+		if errors.As(err, &exportErr) {
+			h.respondStatus(w, exportErr.Bucket, exportErr.StatusCode, exportErr.Message)
+			return
+		}
+		h.respondStatus(w, "5xx", http.StatusInternalServerError, "infrastructure error")
+		return
+	}
+	respBytes, err := proto.Marshal(resp)
+	if err != nil {
+		h.respondStatus(w, "5xx", http.StatusInternalServerError, "failed to encode response")
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/x-protobuf")
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write(respBytes)
+}
+
+// Export converts an OTLP trace export request into schema-2.0 events and
+// writes them through the same ingest path used by SDK events.
+func (h *Handler) Export(ctx context.Context, req *coltracepb.ExportTraceServiceRequest) (*coltracepb.ExportTraceServiceResponse, error) {
+	convResult := convert.SpansToEvents(req)
 
 	// Mirror the future-timestamp guard from Server.Events: drop any span
 	// dated more than 5 minutes ahead of wall-clock so a skewed collector
@@ -141,8 +189,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 			if h.metrics != nil {
 				h.metrics.OTLPInfraFailures.Inc()
 			}
-			h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
-			return
+			return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error"}
 		}
 		bodies, err := marshalEvents(convResult.Events)
 		if err != nil {
@@ -150,21 +197,18 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 			if h.metrics != nil {
 				h.metrics.OTLPInfraFailures.Inc()
 			}
-			h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
-			return
+			return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error", Cause: err}
 		}
-		env, err = h.v2Ingest.IngestRaw(r.Context(), bodies, true)
+		env, err = h.v2Ingest.IngestRaw(ctx, bodies, true)
 		if err != nil {
 			if err == context.Canceled || err == context.DeadlineExceeded {
-				h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "request canceled")
-				return
+				return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "request canceled", Cause: err}
 			}
 			slog.Error("otlp: v2 ingest infrastructure failure", "err", err)
 			if h.metrics != nil {
 				h.metrics.OTLPInfraFailures.Inc()
 			}
-			h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
-			return
+			return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error", Cause: err}
 		}
 		if h.metrics != nil && len(env.Rejected) > 0 {
 			h.metrics.OTLPValidationRejects.Add(float64(len(env.Rejected)))
@@ -180,19 +224,11 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 
-	respBytes, err := proto.Marshal(resp)
-	if err != nil {
-		h.respondStatus(w, "5xx", http.StatusInternalServerError, "failed to encode response")
-		return
-	}
-
 	if h.metrics != nil {
 		h.metrics.OTLPRequestsTotal.WithLabelValues("2xx").Inc()
 	}
 
-	w.Header().Set("Content-Type", "application/x-protobuf")
-	w.WriteHeader(http.StatusOK)
-	_, _ = w.Write(respBytes)
+	return resp, nil
 }
 
 func (h *Handler) respondStatus(w http.ResponseWriter, bucket string, code int, msg string) {
diff --git a/scripts/otlp-conformance.sh b/scripts/otlp-conformance.sh
new file mode 100755
index 0000000..2078f59
--- /dev/null
+++ b/scripts/otlp-conformance.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Deterministic OTLP fixture checks for Waylog's HTTP and gRPC trace paths.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+echo "[otlp-conformance] running OTLP conversion and receiver tests"
+go test ./internal/otel/...
+echo "OK: OTLP HTTP/gRPC fixture checks passed"

From 5106be19f205a4800b9bc5beda82070c8d1b74b7 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Sun, 10 May 2026 18:26:50 -0400
Subject: [PATCH 11/14] feat: add alert-linked operator reports

- add /v1/alerts intake for Waylog, Alertmanager, Grafana, and PagerDuty payloads
- store accepted alerts as alert signals and match them to incidents when possible
- include alert evidence in incident classification without changing cause priority
- add alert references to TriageReport and canonical report hashing
- add Markdown, Slack Block Kit, and PagerDuty note renderers
- expose /v1/triage/{incident_id}/report and render_triage_report
- extend demo acceptance to prove alert intake, stable triage hash, and cited reports
- document alert intake, report rendering, and ALERT_MATCH_WINDOW
---
 README.md                             |  10 +-
 cmd/ingest/main.go                    |  15 +
 docs/env.md                           |   1 +
 docs/openapi.yaml                     | 121 +++++++-
 internal/alerts/alerts.go             | 414 ++++++++++++++++++++++++++
 internal/alerts/alerts_test.go        | 234 +++++++++++++++
 internal/incidents/classifier.go      |  44 ++-
 internal/incidents/classifier_test.go |  48 +++
 internal/reports/reports.go           | 152 ++++++++++
 internal/reports/reports_test.go      |  68 +++++
 internal/tools/report.go              |  69 +++++
 internal/tools/report_test.go         |  29 ++
 internal/triage/adapter.go            |  70 ++++-
 internal/triage/adapter_test.go       |  37 +++
 internal/triage/engine.go             |  13 +
 internal/triagehttp/handler.go        |  57 +++-
 internal/triagehttp/handler_test.go   |  17 ++
 pkg/triage/report.go                  |  11 +
 pkg/triage/report_test.go             |  22 ++
 scripts/demo-acceptance.sh            |  22 ++
 20 files changed, 1442 insertions(+), 12 deletions(-)
 create mode 100644 internal/alerts/alerts.go
 create mode 100644 internal/alerts/alerts_test.go
 create mode 100644 internal/reports/reports.go
 create mode 100644 internal/reports/reports_test.go
 create mode 100644 internal/tools/report.go
 create mode 100644 internal/tools/report_test.go

diff --git a/README.md b/README.md
index a0246aa..943ae6b 100644
--- a/README.md
+++ b/README.md
@@ -208,10 +208,12 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
 
 ### Analysis tools
 
-All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+All twelve tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
 
 Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
 
+External alerts can be posted to `POST /v1/alerts` as Waylog-normalized JSON or Alertmanager, Grafana, or PagerDuty webhooks. Waylog stores them as alert signals, links them to active incidents when possible, and can render cited Markdown, Slack Block Kit, or PagerDuty-note reports from the same deterministic triage artifact.
+
 | Tool               | Answers                                                       |
 | ------------------ | ------------------------------------------------------------- |
 | `graph_stats`      | Overall shape of the graph right now                          |
@@ -225,6 +227,7 @@ Agents can call the built-in triage plan template with `POST /v1/plans/execute`
 | `compare_windows`  | Diff error rates between two windows                          |
 | `graph_insights`   | Windowed rollup of top errors and patterns                    |
 | `triage_incident`  | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
+| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note text rendered from a TriageReport |
 
 Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
 
@@ -303,8 +306,9 @@ Public alpha. APIs may break before 1.0.
 - schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
 - SQLite cold store (events, deployments, signals, incidents, causal claims)
 - signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
+- alert intake for Waylog, Alertmanager, Grafana, and PagerDuty webhooks, stored as signals and linked to incidents when possible
 - provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
-- 11 deterministic analysis tools, rollup-correct root-cause attribution
+- 12 deterministic analysis tools, rollup-correct root-cause attribution
 - agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
 - `/v1/traces/story` and indented failure-path rendering in the dashboard
 - dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
@@ -327,7 +331,7 @@ Public alpha. APIs may break before 1.0.
 - SQLite cold store fits demos and small deployments; not sized for production-scale retention.
 - Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
 - Incident cause classification is deterministic and heuristic.
-- No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
+- No outbound alerting or paging delivery. Waylog accepts external alerts and renders operator reports, but it doesn't wake you up.
 - No multi-tenancy. One instance = one trust boundary.
 - No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
 
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index b71153f..db2f2c7 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -16,6 +16,7 @@ import (
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/sssmaran/WaylogCLI/internal/alerts"
 	"github.com/sssmaran/WaylogCLI/internal/auth"
 	"github.com/sssmaran/WaylogCLI/internal/cli"
 	"github.com/sssmaran/WaylogCLI/internal/coldstore"
@@ -136,6 +137,13 @@ func main() {
 	otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
 	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
 	signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+	alertMatchWindow := config.GetenvDuration("ALERT_MATCH_WINDOW", 15*time.Minute)
+	if alertMatchWindow <= 0 {
+		alertMatchWindow = 15 * time.Minute
+	}
+	if alertMatchWindow > 24*time.Hour {
+		alertMatchWindow = 24 * time.Hour
+	}
 	incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
 	incidentCfg := incidents.Config{
 		TickInterval:            config.GetenvDuration("WAYLOG_INCIDENT_TICK_INTERVAL", 30*time.Second),
@@ -530,6 +538,7 @@ func main() {
 						},
 					),
 					Signals:    triage.NewSignalQueryAdapter(signalStore),
+					Alerts:     triage.NewAlertQueryAdapter(signalStore, alertMatchWindow),
 					NextChecks: triage.NewNextChecksAdapter(),
 				})
 				if err != nil {
@@ -540,6 +549,10 @@ func main() {
 					slog.Error("triage tool register failed", "err", err)
 					os.Exit(1)
 				}
+				if err := tools.RegisterTriageReportTool(reg, triageEng); err != nil {
+					slog.Error("triage report tool register failed", "err", err)
+					os.Exit(1)
+				}
 				triageHandler := triagehttp.NewHandler(triageEng)
 				mux.Handle("/v1/triage/", readCORS(triageHandler.Triage))
 
@@ -561,6 +574,8 @@ func main() {
 	mux.Handle("/v1/topology", readCORS(ingestServer.Topology))
 	mux.Handle("/v1/stream/dashboard", readCORS(ingestServer.SSEStream))
 	mux.Handle("/v1/insight", readCORS(ingestServer.Insight))
+	alertHandler := alerts.NewHandler(signalStore, incidentEngine, v2Reader, alertMatchWindow)
+	mux.Handle("/v1/alerts", writeAuth(http.HandlerFunc(alertHandler.Alerts)))
 
 	// Deployments — dual method: GET=read, POST=write.
 	mux.Handle("/v1/deployments", http.HandlerFunc(
diff --git a/docs/env.md b/docs/env.md
index de42e43..3de9e62 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -54,6 +54,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
 | `WRITE_TIMEOUT` | `10s` | HTTP write timeout |
 | `IDLE_TIMEOUT` | `120s` | HTTP idle timeout |
 | `CORS_ORIGIN` | `*` | Allowed CORS origin for read APIs |
+| `ALERT_MATCH_WINDOW` | `15m` | Window for matching `/v1/alerts` to active incidents by `env + service + error_code`; capped at `24h` |
 
 ## CLI
 
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 4d480f1..4c31f77 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -198,6 +198,54 @@ paths:
               schema:
                 $ref: '#/components/schemas/ReadError'
 
+  /v1/alerts:
+    post:
+      tags: [Signals]
+      operationId: ingestAlert
+      summary: Ingest an external alert and match it to an active incident
+      description: |
+        Accepts Waylog-normalized alerts plus Alertmanager, Grafana, and
+        PagerDuty webhook payloads. Accepted alerts are stored as `type=alert`
+        signals. Matching is best-effort and does not create incidents directly.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              additionalProperties: true
+      responses:
+        '201':
+          description: Alert accepted and stored as a signal
+          content:
+            application/json:
+              schema:
+                type: object
+                required: [signal, match]
+                properties:
+                  signal:
+                    $ref: '#/components/schemas/Signal'
+                  match:
+                    type: object
+                    required: [matched, strategy]
+                    properties:
+                      matched: {type: boolean}
+                      incident_id: {type: string}
+                      strategy:
+                        type: string
+                        enum: [incident_id, trace_id, family, none]
+        '400':
+          description: Invalid JSON, unsupported alert shape, or missing required fields
+        '401':
+          description: Unauthorized
+        '405':
+          description: Method Not Allowed
+        '503':
+          description: Signal store unavailable
+
   /v1/otlp/v1/traces:
     post:
       tags: [OTLP]
@@ -559,6 +607,59 @@ paths:
         '500':
           description: Triage build failed
 
+  /v1/triage/{incident_id}/report:
+    get:
+      tags: [Triage]
+      operationId: renderTriageReport
+      summary: Render a cited operator report from a TriageReport
+      description: |
+        Builds the same deterministic TriageReport as `/v1/triage/{incident_id}`
+        and renders it as Markdown, Slack Block Kit JSON, or PagerDuty note text.
+        This endpoint does not deliver messages to Slack or PagerDuty.
+      security:
+        - ApiKeyHeader: []
+        - BearerAuth: []
+      parameters:
+        - name: incident_id
+          in: path
+          required: true
+          schema: {type: string}
+        - name: format
+          in: query
+          required: false
+          schema:
+            type: string
+            enum: [markdown, slack, pagerduty]
+            default: markdown
+        - name: window
+          in: query
+          required: false
+          schema: {type: string, default: "15m"}
+        - name: snapshot
+          in: query
+          required: false
+          schema: {type: boolean, default: false}
+      responses:
+        '200':
+          description: Rendered operator report
+          content:
+            text/markdown:
+              schema: {type: string}
+            text/plain:
+              schema: {type: string}
+            application/json:
+              schema:
+                type: object
+                additionalProperties: true
+        '400':
+          description: Missing or invalid parameters
+        '401':
+          description: Unauthorized
+        '404':
+          $ref: '#/components/responses/ReadNotFound'
+        '500':
+          description: Triage build or render failed
+
   /v1/insight:
     get:
       tags: [Operational]
@@ -811,7 +912,10 @@ paths:
       tags: [Operational]
       operationId: executeTool
       summary: Direct tool call
-      description: Executes a registered structured tool by name.
+      description: |
+        Executes a registered structured tool by name. `triage_incident`
+        returns a TriageReport; `render_triage_report` renders that report as
+        Markdown, Slack Block Kit JSON, or PagerDuty note text without delivery.
       security:
         - ApiKeyHeader: []
         - BearerAuth: []
@@ -1884,6 +1988,21 @@ components:
               id: {type: string}
               type: {type: string}
               evidence_ids: {type: array, items: {type: string}}
+        alerts:
+          type: array
+          description: Alert signals linked to this incident and cited by operator reports.
+          items:
+            type: object
+            properties:
+              signal_id: {type: string}
+              alert_id: {type: string}
+              source: {type: string, enum: [waylog, alertmanager, grafana, pagerduty]}
+              severity: {type: string}
+              reason: {type: string}
+              provider_url: {type: string}
+              evidence_ids:
+                type: array
+                items: {type: string}
         next_checks:
           type: array
           items:
diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go
new file mode 100644
index 0000000..edde578
--- /dev/null
+++ b/internal/alerts/alerts.go
@@ -0,0 +1,414 @@
+package alerts
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+const (
+	CodeInvalidJSON       = "INVALID_JSON"
+	CodeMissingFields     = "MISSING_FIELDS"
+	CodeUnsupportedAlert  = "UNSUPPORTED_ALERT"
+	CodeSignalUnavailable = "SIGNALS_UNAVAILABLE"
+	CodeInternal          = "INTERNAL"
+)
+
+type IncidentSource interface {
+	Active(ctx context.Context) ([]incidents.Incident, error)
+	Get(ctx context.Context, id string) (incidents.Incident, error)
+}
+
+type TraceResolver interface {
+	TraceStoryByTraceID(traceID string) (apiv2.StoryResponse, bool)
+}
+
+type Handler struct {
+	store       signals.Store
+	incidents   IncidentSource
+	traces      TraceResolver
+	now         func() time.Time
+	matchWindow time.Duration
+	maxBody     int64
+}
+
+type MatchResult struct {
+	Matched    bool   `json:"matched"`
+	IncidentID string `json:"incident_id,omitempty"`
+	Strategy   string `json:"strategy"`
+}
+
+func NewHandler(store signals.Store, incidents IncidentSource, traces TraceResolver, matchWindow time.Duration) *Handler {
+	if store == nil {
+		store = signals.UnavailableStore{}
+	}
+	if matchWindow <= 0 {
+		matchWindow = 15 * time.Minute
+	}
+	if matchWindow > 24*time.Hour {
+		matchWindow = 24 * time.Hour
+	}
+	return &Handler{
+		store:       store,
+		incidents:   incidents,
+		traces:      traces,
+		now:         time.Now,
+		matchWindow: matchWindow,
+		maxBody:     1 << 20,
+	}
+}
+
+func (h *Handler) Alerts(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		writeError(w, http.StatusMethodNotAllowed, "METHOD_NOT_ALLOWED", "method not allowed", "")
+		return
+	}
+	r.Body = http.MaxBytesReader(w, r.Body, h.maxBody)
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		writeError(w, http.StatusBadRequest, CodeInvalidJSON, "invalid body", err.Error())
+		return
+	}
+	sig, err := Normalize(body, h.now().UTC())
+	if err != nil {
+		var normErr *NormalizeError
+		if errors.As(err, &normErr) {
+			writeError(w, normErr.Status, normErr.Code, normErr.Message, normErr.Detail)
+			return
+		}
+		writeError(w, http.StatusBadRequest, CodeUnsupportedAlert, "unsupported alert", err.Error())
+		return
+	}
+	match := h.Match(r.Context(), sig)
+	sig.SignalID = signals.NewSignalID()
+	sig.ReceivedAt = h.now().UTC()
+	if err := signals.Validate(sig, h.now().UTC(), 5*time.Minute); err != nil {
+		writeError(w, http.StatusBadRequest, CodeMissingFields, "invalid alert", err.Error())
+		return
+	}
+	if err := h.store.Insert(r.Context(), sig); err != nil {
+		if errors.Is(err, signals.ErrUnavailable) {
+			writeError(w, http.StatusServiceUnavailable, CodeSignalUnavailable, "signals unavailable", "set SQLITE_PATH to enable alerts")
+			return
+		}
+		writeError(w, http.StatusInternalServerError, CodeInternal, "internal error", "")
+		return
+	}
+	writeJSON(w, http.StatusCreated, map[string]any{"signal": sig, "match": match})
+}
+
+func (h *Handler) Match(ctx context.Context, sig *signals.Signal) MatchResult {
+	if h.incidents == nil || sig == nil {
+		return MatchResult{Strategy: "none"}
+	}
+	if id := metaString(sig.Metadata, "incident_id"); id != "" {
+		if inc, err := h.incidents.Get(ctx, id); err == nil {
+			return MatchResult{Matched: true, IncidentID: inc.IncidentID, Strategy: "incident_id"}
+		}
+	}
+	if traceID := metaString(sig.Metadata, "trace_id"); traceID != "" && h.traces != nil {
+		if story, ok := h.traces.TraceStoryByTraceID(traceID); ok && story.Anchor != nil {
+			if inc, ok := h.findActive(ctx, sig.Env, story.Service, story.Anchor.ErrorCode, sig.Timestamp); ok {
+				return MatchResult{Matched: true, IncidentID: inc.IncidentID, Strategy: "trace_id"}
+			}
+		}
+	}
+	if code := metaString(sig.Metadata, "error_code"); code != "" {
+		if inc, ok := h.findActive(ctx, sig.Env, sig.Service, code, sig.Timestamp); ok {
+			return MatchResult{Matched: true, IncidentID: inc.IncidentID, Strategy: "family"}
+		}
+	}
+	return MatchResult{Strategy: "none"}
+}
+
+func (h *Handler) findActive(ctx context.Context, env, service, errorCode string, ts time.Time) (incidents.Incident, bool) {
+	rows, err := h.incidents.Active(ctx)
+	if err != nil {
+		return incidents.Incident{}, false
+	}
+	for _, inc := range rows {
+		if inc.Status != incidents.StatusActive {
+			continue
+		}
+		if env != "" && inc.Env != "" && inc.Env != env {
+			continue
+		}
+		if inc.ErrorFamily.Service != service || inc.ErrorFamily.ErrorCode != errorCode {
+			continue
+		}
+		if !ts.IsZero() {
+			lo := inc.StartedAt.Add(-h.matchWindow)
+			hi := inc.UpdatedAt.Add(h.matchWindow)
+			if ts.Before(lo) || ts.After(hi) {
+				continue
+			}
+		}
+		return inc, true
+	}
+	return incidents.Incident{}, false
+}
+
+type NormalizeError struct {
+	Status  int
+	Code    string
+	Message string
+	Detail  string
+}
+
+func (e *NormalizeError) Error() string {
+	return e.Message
+}
+
+func Normalize(body []byte, now time.Time) (*signals.Signal, error) {
+	var root map[string]json.RawMessage
+	if err := json.Unmarshal(body, &root); err != nil {
+		return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeInvalidJSON, Message: "invalid json", Detail: err.Error()}
+	}
+	if source := getString(root, "source"); source != "" && source != "waylog" {
+		return normalizeWaylog(root, now)
+	}
+	switch {
+	case has(root, "ruleId") || has(root, "ruleUID") || has(root, "evalMatches") || isGrafanaAlertmanagerPayload(root):
+		return normalizeGrafana(root, now)
+	case has(root, "source") || has(root, "alert_id"):
+		return normalizeWaylog(root, now)
+	case has(root, "alerts") && (has(root, "receiver") || has(root, "commonLabels")):
+		return normalizeAlertmanager(root, now)
+	case has(root, "messages") || has(root, "event"):
+		return normalizePagerDuty(root, now)
+	default:
+		return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeUnsupportedAlert, Message: "unsupported alert", Detail: "expected waylog, alertmanager, grafana, or pagerduty payload"}
+	}
+}
+
+func isGrafanaAlertmanagerPayload(root map[string]json.RawMessage) bool {
+	if has(root, "orgId") || has(root, "orgID") {
+		return true
+	}
+	alert := firstObject(root, "alerts")
+	if has(alert, "dashboardURL") || has(alert, "panelURL") || has(alert, "ruleURL") {
+		return true
+	}
+	labels := objectField(alert, "labels")
+	annotations := objectField(alert, "annotations")
+	return stringMapField(labels, "grafana_folder") != "" ||
+		stringMapField(annotations, "__dashboardUid__") != "" ||
+		stringMapField(annotations, "__panelId__") != ""
+}
+
+func normalizeWaylog(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	src := getString(root, "source")
+	if src == "" {
+		src = "waylog"
+	}
+	ts := getTime(root, "timestamp", now)
+	meta := baseMeta(root, src)
+	return finalize(src, getString(root, "service"), getString(root, "env"), getSeverity(root, "severity"), getString(root, "reason"), getString(root, "message"), ts, meta)
+}
+
+func normalizeAlertmanager(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	alert := firstObject(root, "alerts")
+	labels := objectField(alert, "labels")
+	annotations := objectField(alert, "annotations")
+	meta := map[string]any{"raw_source": "alertmanager"}
+	put(meta, "alert_id", firstString(alert, labels, "fingerprint", "alertname"))
+	put(meta, "fingerprint", firstString(alert, labels, "fingerprint"))
+	put(meta, "provider_url", firstString(alert, annotations, "generatorURL", "runbook_url"))
+	put(meta, "error_code", stringMapField(labels, "error_code"))
+	ts := timeField(alert, "startsAt", now)
+	reason := firstNonEmpty(stringMapField(annotations, "summary"), stringMapField(annotations, "description"), stringMapField(labels, "alertname"))
+	return finalize("alertmanager", stringMapField(labels, "service"), stringMapField(labels, "env"), severityFromString(stringMapField(labels, "severity")), reason, stringMapField(annotations, "description"), ts, meta)
+}
+
+func normalizeGrafana(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	alert := firstObject(root, "alerts")
+	labels := objectField(alert, "labels")
+	annotations := objectField(alert, "annotations")
+	if len(alert) == 0 {
+		alert = root
+	}
+	meta := map[string]any{"raw_source": "grafana"}
+	put(meta, "alert_id", firstNonEmpty(getString(root, "ruleUID"), getString(root, "ruleId"), stringMapField(labels, "alertname")))
+	put(meta, "fingerprint", stringMapField(alert, "fingerprint"))
+	put(meta, "provider_url", firstString(alert, root, "dashboardURL", "panelURL", "generatorURL", "ruleUrl"))
+	put(meta, "error_code", firstNonEmpty(stringMapField(labels, "error_code"), getString(root, "error_code")))
+	ts := timeField(alert, "startsAt", now)
+	reason := firstNonEmpty(stringMapField(annotations, "summary"), getString(root, "title"), getString(root, "ruleName"), stringMapField(labels, "alertname"))
+	return finalize("grafana", firstNonEmpty(stringMapField(labels, "service"), getString(root, "service")), firstNonEmpty(stringMapField(labels, "env"), getString(root, "env")), severityFromString(firstNonEmpty(stringMapField(labels, "severity"), getString(root, "state"))), reason, stringMapField(annotations, "description"), ts, meta)
+}
+
+func normalizePagerDuty(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	msg := firstObject(root, "messages")
+	event := objectField(msg, "event")
+	data := objectField(event, "data")
+	if len(data) == 0 {
+		data = objectField(root, "incident")
+	}
+	serviceObj := objectField(data, "service")
+	meta := map[string]any{"raw_source": "pagerduty"}
+	put(meta, "alert_id", firstNonEmpty(stringMapField(data, "id"), stringMapField(event, "id")))
+	put(meta, "provider_url", stringMapField(data, "html_url"))
+	put(meta, "error_code", stringMapField(data, "error_code"))
+	put(meta, "incident_id", stringMapField(data, "incident_id"))
+	ts := timeField(data, "created_at", now)
+	reason := firstNonEmpty(stringMapField(data, "title"), stringMapField(data, "summary"), stringMapField(event, "event_type"))
+	return finalize("pagerduty", firstNonEmpty(stringMapField(data, "service"), stringMapField(serviceObj, "summary")), stringMapField(data, "env"), severityFromString(firstNonEmpty(stringMapField(data, "urgency"), stringMapField(data, "severity"))), reason, stringMapField(data, "description"), ts, meta)
+}
+
+func finalize(source, service, env string, severity signals.Severity, reason, message string, ts time.Time, meta map[string]any) (*signals.Signal, error) {
+	if strings.TrimSpace(service) == "" || strings.TrimSpace(env) == "" || strings.TrimSpace(reason) == "" {
+		return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeMissingFields, Message: "missing required fields", Detail: "service, env, and reason are required"}
+	}
+	if severity == "" {
+		severity = signals.SeverityWarning
+	}
+	return &signals.Signal{
+		Type:      signals.TypeAlert,
+		Source:    source,
+		Service:   service,
+		Env:       env,
+		Severity:  severity,
+		Reason:    reason,
+		Message:   message,
+		Metadata:  meta,
+		Timestamp: ts.UTC(),
+	}, nil
+}
+
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	_ = json.NewEncoder(w).Encode(v)
+}
+
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+	writeJSON(w, status, map[string]any{"error": map[string]string{"code": code, "message": message, "detail": detail}})
+}
+
+func has(root map[string]json.RawMessage, key string) bool {
+	_, ok := root[key]
+	return ok
+}
+
+func baseMeta(root map[string]json.RawMessage, source string) map[string]any {
+	meta := map[string]any{"raw_source": source}
+	for _, key := range []string{"alert_id", "error_code", "trace_id", "incident_id", "provider_url", "fingerprint"} {
+		put(meta, key, getString(root, key))
+	}
+	return meta
+}
+
+func put(m map[string]any, key, value string) {
+	if value != "" {
+		m[key] = value
+	}
+}
+
+func metaString(m map[string]any, key string) string {
+	if s, ok := m[key].(string); ok {
+		return s
+	}
+	return ""
+}
+
+func getString(root map[string]json.RawMessage, key string) string {
+	var s string
+	_ = json.Unmarshal(root[key], &s)
+	return strings.TrimSpace(s)
+}
+
+func getSeverity(root map[string]json.RawMessage, key string) signals.Severity {
+	return severityFromString(getString(root, key))
+}
+
+func severityFromString(s string) signals.Severity {
+	switch strings.ToLower(strings.TrimSpace(s)) {
+	case "critical", "error", "high", "triggered":
+		return signals.SeverityCritical
+	case "info", "resolved", "ok":
+		return signals.SeverityInfo
+	case "warning", "warn", "":
+		return signals.SeverityWarning
+	default:
+		return signals.SeverityWarning
+	}
+}
+
+func getTime(root map[string]json.RawMessage, key string, fallback time.Time) time.Time {
+	if t := timeField(root, key, time.Time{}); !t.IsZero() {
+		return t
+	}
+	return fallback
+}
+
+func timeField(root map[string]json.RawMessage, key string, fallback time.Time) time.Time {
+	raw, ok := root[key]
+	if !ok {
+		return fallback
+	}
+	var s string
+	if err := json.Unmarshal(raw, &s); err != nil || s == "" {
+		return fallback
+	}
+	if t, err := time.Parse(time.RFC3339Nano, s); err == nil {
+		return t
+	}
+	return fallback
+}
+
+func firstObject(root map[string]json.RawMessage, key string) map[string]json.RawMessage {
+	var arr []map[string]json.RawMessage
+	if err := json.Unmarshal(root[key], &arr); err == nil && len(arr) > 0 {
+		return arr[0]
+	}
+	return nil
+}
+
+func objectField(root map[string]json.RawMessage, key string) map[string]json.RawMessage {
+	if root == nil {
+		return nil
+	}
+	var obj map[string]json.RawMessage
+	if err := json.Unmarshal(root[key], &obj); err != nil {
+		return nil
+	}
+	return obj
+}
+
+func stringMapField(root map[string]json.RawMessage, key string) string {
+	if root == nil {
+		return ""
+	}
+	var s string
+	_ = json.Unmarshal(root[key], &s)
+	return strings.TrimSpace(s)
+}
+
+func firstString(primary, secondary map[string]json.RawMessage, keys ...string) string {
+	for _, key := range keys {
+		if s := stringMapField(primary, key); s != "" {
+			return s
+		}
+		if s := stringMapField(secondary, key); s != "" {
+			return s
+		}
+	}
+	return ""
+}
+
+func firstNonEmpty(values ...string) string {
+	for _, v := range values {
+		if strings.TrimSpace(v) != "" {
+			return strings.TrimSpace(v)
+		}
+	}
+	return ""
+}
diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go
new file mode 100644
index 0000000..6b0b7da
--- /dev/null
+++ b/internal/alerts/alerts_test.go
@@ -0,0 +1,234 @@
+package alerts
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+func TestNormalizeWaylogAlert(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	raw := []byte(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","severity":"critical","reason":"PMT_502 spike","error_code":"PMT_502","provider_url":"https://alerts/1","timestamp":"2026-05-10T12:00:00Z"}`)
+	sig, err := Normalize(raw, now)
+	if err != nil {
+		t.Fatalf("normalize: %v", err)
+	}
+	if sig.Type != signals.TypeAlert || sig.Source != "waylog" || sig.Service != "checkout" {
+		t.Fatalf("unexpected signal: %+v", sig)
+	}
+	if sig.Metadata["alert_id"] != "alert_1" || sig.Metadata["error_code"] != "PMT_502" {
+		t.Fatalf("metadata not preserved: %+v", sig.Metadata)
+	}
+}
+
+func TestNormalizeProviderPayloads(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	cases := []struct {
+		name   string
+		raw    string
+		source string
+	}{
+		{
+			name:   "alertmanager",
+			raw:    `{"receiver":"team","alerts":[{"fingerprint":"fp1","startsAt":"2026-05-10T12:00:00Z","generatorURL":"https://am/1","labels":{"alertname":"Payment502","service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike"}}]}`,
+			source: "alertmanager",
+		},
+		{
+			name:   "grafana",
+			raw:    `{"ruleUID":"rule1","title":"Payment failures","state":"alerting","alerts":[{"startsAt":"2026-05-10T12:00:00Z","dashboardURL":"https://grafana/d","labels":{"service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike"}}]}`,
+			source: "grafana",
+		},
+		{
+			name:   "pagerduty",
+			raw:    `{"messages":[{"event":{"event_type":"incident.trigger","data":{"id":"pd1","html_url":"https://pd/1","title":"PMT_502 spike","service":{"summary":"checkout"},"env":"prod","urgency":"high","error_code":"PMT_502"}}}]}`,
+			source: "pagerduty",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			sig, err := Normalize([]byte(tc.raw), now)
+			if err != nil {
+				t.Fatalf("normalize: %v", err)
+			}
+			if sig.Source != tc.source || sig.Type != signals.TypeAlert {
+				t.Fatalf("unexpected signal: %+v", sig)
+			}
+			if sig.Service != "checkout" || sig.Env != "prod" {
+				t.Fatalf("service/env missing: %+v", sig)
+			}
+		})
+	}
+}
+
+func TestMatcherOrderAndUnmatched(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	inc := incidents.Incident{
+		IncidentID: "inc_1",
+		Env:        "prod",
+		Status:     incidents.StatusActive,
+		StartedAt:  now.Add(-time.Minute),
+		UpdatedAt:  now,
+		ErrorFamily: apiv2.ErrorFamily{
+			Service:   "checkout",
+			Step:      "payment.charge",
+			ErrorCode: "PMT_502",
+		},
+	}
+	h := NewHandler(&memSignalStore{}, incidentSource{rows: []incidents.Incident{inc}}, traceResolver{}, 15*time.Minute)
+
+	sig := &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now, Metadata: map[string]any{"incident_id": "inc_1", "error_code": "OTHER"}}
+	got := h.Match(context.Background(), sig)
+	if !got.Matched || got.Strategy != "incident_id" {
+		t.Fatalf("incident id should win, got %+v", got)
+	}
+
+	sig = &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now, Metadata: map[string]any{"error_code": "PMT_502"}}
+	got = h.Match(context.Background(), sig)
+	if !got.Matched || got.Strategy != "family" {
+		t.Fatalf("family match failed: %+v", got)
+	}
+
+	sig = &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now.Add(2 * time.Hour), Metadata: map[string]any{"error_code": "PMT_502"}}
+	got = h.Match(context.Background(), sig)
+	if got.Matched || got.Strategy != "none" {
+		t.Fatalf("outside window should be unmatched: %+v", got)
+	}
+}
+
+func TestMatcherExplicitIncidentIDCanMatchResolvedIncident(t *testing.T) {
+	inc := incidents.Incident{IncidentID: "inc_resolved", Status: incidents.StatusResolved}
+	h := NewHandler(&memSignalStore{}, incidentSource{rows: []incidents.Incident{inc}}, traceResolver{}, 15*time.Minute)
+
+	got := h.Match(context.Background(), &signals.Signal{Metadata: map[string]any{"incident_id": "inc_resolved"}})
+	if !got.Matched || got.Strategy != "incident_id" || got.IncidentID != "inc_resolved" {
+		t.Fatalf("explicit incident_id should be authoritative, got %+v", got)
+	}
+}
+
+func TestNormalizeGrafanaAlertmanagerPayload(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	raw := []byte(`{"receiver":"grafana","status":"firing","externalURL":"https://grafana.example","alerts":[{"startsAt":"2026-05-10T12:00:00Z","dashboardURL":"https://grafana.example/d/abc","labels":{"service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike","__dashboardUid__":"abc"}}]}`)
+	sig, err := Normalize(raw, now)
+	if err != nil {
+		t.Fatalf("normalize: %v", err)
+	}
+	if sig.Source != "grafana" {
+		t.Fatalf("source=%q want grafana", sig.Source)
+	}
+	if sig.Metadata["provider_url"] != "https://grafana.example/d/abc" {
+		t.Fatalf("provider_url not preserved: %+v", sig.Metadata)
+	}
+}
+
+func TestNormalizeRejectsUnsupported(t *testing.T) {
+	_, err := Normalize([]byte(`{"hello":"world"}`), time.Now())
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	var normErr *NormalizeError
+	if !errors.As(err, &normErr) || normErr.Code != CodeUnsupportedAlert {
+		t.Fatalf("unexpected error: %v", err)
+	}
+}
+
+func TestHandlerStoresWaylogAlert(t *testing.T) {
+	store := &recordingSignalStore{}
+	h := NewHandler(store, incidentSource{}, traceResolver{}, 15*time.Minute)
+	h.now = func() time.Time { return time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) }
+
+	req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","severity":"critical","reason":"PMT_502 spike"}`))
+	rr := httptest.NewRecorder()
+	h.Alerts(rr, req)
+
+	if rr.Code != http.StatusCreated {
+		t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+	}
+	if store.inserted == nil || store.inserted.Type != signals.TypeAlert {
+		t.Fatalf("alert signal was not inserted: %+v", store.inserted)
+	}
+	var out struct {
+		Match MatchResult `json:"match"`
+	}
+	if err := json.Unmarshal(rr.Body.Bytes(), &out); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if out.Match.Matched || out.Match.Strategy != "none" {
+		t.Fatalf("unexpected match for no active incidents: %+v", out.Match)
+	}
+}
+
+func TestHandlerRejectsInvalidJSON(t *testing.T) {
+	h := NewHandler(&recordingSignalStore{}, incidentSource{}, traceResolver{}, 15*time.Minute)
+	req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{`))
+	rr := httptest.NewRecorder()
+	h.Alerts(rr, req)
+	if rr.Code != http.StatusBadRequest {
+		t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+	}
+}
+
+func TestHandlerUnavailableSignalStore(t *testing.T) {
+	h := NewHandler(signals.UnavailableStore{}, incidentSource{}, traceResolver{}, 15*time.Minute)
+	req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","reason":"PMT_502 spike"}`))
+	rr := httptest.NewRecorder()
+	h.Alerts(rr, req)
+	if rr.Code != http.StatusServiceUnavailable {
+		t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+	}
+}
+
+type incidentSource struct {
+	rows []incidents.Incident
+}
+
+func (s incidentSource) Active(context.Context) ([]incidents.Incident, error) {
+	return s.rows, nil
+}
+
+func (s incidentSource) Get(_ context.Context, id string) (incidents.Incident, error) {
+	for _, inc := range s.rows {
+		if inc.IncidentID == id {
+			return inc, nil
+		}
+	}
+	return incidents.Incident{}, incidents.ErrNotFound
+}
+
+type traceResolver struct{}
+
+func (traceResolver) TraceStoryByTraceID(string) (apiv2.StoryResponse, bool) {
+	return apiv2.StoryResponse{Service: "checkout", Anchor: &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"}}, true
+}
+
+type memSignalStore struct{}
+
+func (*memSignalStore) Insert(context.Context, *signals.Signal) error { return nil }
+func (*memSignalStore) Query(context.Context, signals.Filter) ([]signals.Signal, error) {
+	return nil, nil
+}
+func (*memSignalStore) PruneOlderThan(context.Context, time.Time) (int, error) { return 0, nil }
+
+type recordingSignalStore struct {
+	inserted *signals.Signal
+}
+
+func (s *recordingSignalStore) Insert(_ context.Context, sig *signals.Signal) error {
+	copy := *sig
+	s.inserted = &copy
+	return nil
+}
+func (*recordingSignalStore) Query(context.Context, signals.Filter) ([]signals.Signal, error) {
+	return nil, nil
+}
+func (*recordingSignalStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	return 0, nil
+}
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
index 5f10c66..6d57e82 100644
--- a/internal/incidents/classifier.go
+++ b/internal/incidents/classifier.go
@@ -28,6 +28,7 @@ type Classification struct {
 
 func Classify(input ClassificationInput) Classification {
 	evidence := collectTraceEvidence(input.Events)
+	evidence = append(evidence, matchingAlertEvidence(input)...)
 	warnings := instrumentationWarnings(input.Events, input.Signals)
 
 	if dep := matchingDependencySignal(input); dep != nil {
@@ -142,6 +143,32 @@ func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal
 	return nil
 }
 
+func matchingAlertEvidence(input ClassificationInput) []Evidence {
+	start := input.Incident.StartedAt
+	lo := start.Add(-15 * time.Minute)
+	hi := input.Now
+	if hi.IsZero() {
+		hi = input.Incident.UpdatedAt
+	}
+	out := []Evidence{}
+	for _, sig := range input.Signals {
+		if sig.Type != signals.TypeAlert {
+			continue
+		}
+		if input.Incident.Env != "" && sig.Env != input.Incident.Env {
+			continue
+		}
+		if sig.Service != input.Incident.Service {
+			continue
+		}
+		if sig.Timestamp.Before(lo) || sig.Timestamp.After(hi) {
+			continue
+		}
+		out = append(out, signalEvidence(sig, "External alert overlaps incident window"))
+	}
+	return out
+}
+
 func collectTraceEvidence(events []*eventv2.Event) []Evidence {
 	out := make([]Evidence, 0, 2)
 	for _, ev := range events {
@@ -173,6 +200,17 @@ func deploymentEvidence(dep Deployment) Evidence {
 }
 
 func signalEvidence(sig signals.Signal, title string) Evidence {
+	fields := map[string]any{
+		"type":     string(sig.Type),
+		"severity": string(sig.Severity),
+		"source":   sig.Source,
+	}
+	if alertID := stringField(sig.Metadata, "alert_id"); alertID != "" {
+		fields["alert_id"] = alertID
+	}
+	if providerURL := stringField(sig.Metadata, "provider_url"); providerURL != "" {
+		fields["provider_url"] = providerURL
+	}
 	return Evidence{
 		Kind:       EvidenceSignal,
 		Title:      title,
@@ -180,11 +218,7 @@ func signalEvidence(sig signals.Signal, title string) Evidence {
 		Service:    sig.Service,
 		SignalID:   sig.SignalID,
 		OccurredAt: sig.Timestamp,
-		Fields: map[string]any{
-			"type":     string(sig.Type),
-			"severity": string(sig.Severity),
-			"source":   sig.Source,
-		},
+		Fields:     fields,
 	}
 }
 
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
index 5c5a16d..10ae22c 100644
--- a/internal/incidents/classifier_test.go
+++ b/internal/incidents/classifier_test.go
@@ -202,3 +202,51 @@ func TestNextChecksRuntime(t *testing.T) {
 		t.Fatalf("expected non-empty next checks for runtime cause")
 	}
 }
+
+func TestClassifyIncludesAlertEvidenceWithoutChangingCause(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	paymentEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+	got := Classify(ClassificationInput{
+		Incident: base,
+		Events:   []*eventv2.Event{paymentEvent},
+		Signals: []signals.Signal{{
+			SignalID:  "sig_alert",
+			Type:      signals.TypeAlert,
+			Source:    "grafana",
+			Service:   "checkout",
+			Env:       "prod",
+			Severity:  signals.SeverityCritical,
+			Reason:    "PMT_502 spike",
+			Timestamp: now,
+			Metadata:  map[string]any{"alert_id": "alert_1", "provider_url": "https://grafana/alert"},
+		}, {
+			SignalID:  "sig_other_env",
+			Type:      signals.TypeAlert,
+			Source:    "grafana",
+			Service:   "checkout",
+			Env:       "staging",
+			Severity:  signals.SeverityCritical,
+			Reason:    "staging alert",
+			Timestamp: now,
+			Metadata:  map[string]any{"alert_id": "alert_staging"},
+		}},
+		Now: now,
+	})
+	if got.Cause != CauseDependency {
+		t.Fatalf("alert should not override dependency cause: %+v", got)
+	}
+	for _, ev := range got.Evidence {
+		if ev.SignalID == "sig_other_env" {
+			t.Fatalf("alert evidence from another env should not be included: %+v", ev)
+		}
+		if ev.SignalID == "sig_alert" && ev.Title == "External alert overlaps incident window" {
+			if ev.Fields["alert_id"] != "alert_1" {
+				t.Fatalf("alert metadata missing: %+v", ev.Fields)
+			}
+			return
+		}
+	}
+	t.Fatalf("alert evidence missing: %+v", got.Evidence)
+}
diff --git a/internal/reports/reports.go b/internal/reports/reports.go
new file mode 100644
index 0000000..9ad67d7
--- /dev/null
+++ b/internal/reports/reports.go
@@ -0,0 +1,152 @@
+package reports
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+const (
+	FormatMarkdown  = "markdown"
+	FormatSlack     = "slack"
+	FormatPagerDuty = "pagerduty"
+)
+
+type Rendered struct {
+	Format      string `json:"format"`
+	ContentType string `json:"content_type"`
+	Body        any    `json:"body"`
+}
+
+func Render(rep *pkgtriage.Report, format string) (Rendered, error) {
+	if rep == nil {
+		return Rendered{}, fmt.Errorf("report required")
+	}
+	if format == "" {
+		format = FormatMarkdown
+	}
+	switch format {
+	case FormatMarkdown:
+		return Rendered{Format: format, ContentType: "text/markdown", Body: Markdown(rep)}, nil
+	case FormatSlack:
+		return Rendered{Format: format, ContentType: "application/json", Body: Slack(rep)}, nil
+	case FormatPagerDuty:
+		return Rendered{Format: format, ContentType: "text/plain", Body: PagerDuty(rep)}, nil
+	default:
+		return Rendered{}, fmt.Errorf("unsupported report format %q", format)
+	}
+}
+
+func Markdown(rep *pkgtriage.Report) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "# Waylog Triage Report\n\n")
+	fmt.Fprintf(&b, "- Incident: `%s`\n", nz(rep.IncidentRef.ID))
+	fmt.Fprintf(&b, "- Window: `%s`\n", nz(rep.IncidentRef.Window))
+	fmt.Fprintf(&b, "- Confidence: `%s`\n", nz(string(rep.Confidence)))
+	fmt.Fprintf(&b, "- Report hash: `%s`\n\n", nz(rep.ReportHash))
+
+	fmt.Fprintf(&b, "## Blast Snapshot\n\n")
+	fmt.Fprintf(&b, "- Requests: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Requests, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Users: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Users, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Services: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Services, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	for _, f := range rep.BlastSnapshot.TopErrorFamilies {
+		fmt.Fprintf(&b, "- Error family: `%s/%s/%s` count=%d (incident `%s`, report `%s`)\n", nz(f.Service), nz(f.Step), nz(f.ErrorCode), f.Count, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	}
+	if len(rep.BlastSnapshot.TopErrorFamilies) == 0 {
+		fmt.Fprintf(&b, "- Error family: not available (incident `%s`)\n", nz(rep.IncidentRef.ID))
+	}
+
+	fmt.Fprintf(&b, "\n## Alert Evidence\n\n")
+	if len(rep.Alerts) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
+	} else {
+		for _, a := range rep.Alerts {
+			fmt.Fprintf(&b, "- `%s` from `%s`: %s (signal `%s`, alert `%s`, report `%s`)\n", nz(a.Severity), nz(a.Source), nz(a.Reason), nz(a.SignalID), nz(a.AlertID), nz(rep.ReportHash))
+		}
+	}
+
+	fmt.Fprintf(&b, "\n## Signals\n\n")
+	if len(rep.Signals) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
+	} else {
+		for _, s := range rep.Signals {
+			fmt.Fprintf(&b, "- `%s` signal `%s` evidence=%s (report `%s`)\n", nz(s.Type), nz(s.ID), strings.Join(s.EvidenceIDs, ","), nz(rep.ReportHash))
+		}
+	}
+
+	fmt.Fprintf(&b, "\n## Sample Traces\n\n")
+	if len(rep.SampleTraces) == 0 {
+		fmt.Fprintf(&b, "- not available (incident `%s`)\n", nz(rep.IncidentRef.ID))
+	} else {
+		for _, t := range rep.SampleTraces {
+			fmt.Fprintf(&b, "- trace `%s`: %s (incident `%s`)\n", nz(t.TraceID), nz(t.Summary), nz(rep.IncidentRef.ID))
+		}
+	}
+
+	fmt.Fprintf(&b, "\n## Next Checks\n\n")
+	if len(rep.NextChecks) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
+	} else {
+		for _, c := range rep.NextChecks {
+			fmt.Fprintf(&b, "- %s (check `%s`, report `%s`)\n", nz(c.Prompt), nz(c.ID), nz(rep.ReportHash))
+		}
+	}
+	return b.String()
+}
+
+func Slack(rep *pkgtriage.Report) map[string]any {
+	fields := []map[string]string{
+		{"type": "mrkdwn", "text": "*Incident*\n`" + nz(rep.IncidentRef.ID) + "`"},
+		{"type": "mrkdwn", "text": "*Confidence*\n`" + nz(string(rep.Confidence)) + "`"},
+		{"type": "mrkdwn", "text": "*Report hash*\n`" + nz(rep.ReportHash) + "`"},
+	}
+	alertText := "not available"
+	if len(rep.Alerts) > 0 {
+		a := rep.Alerts[0]
+		alertText = fmt.Sprintf("`%s` %s (signal `%s`, alert `%s`)", nz(a.Source), nz(a.Reason), nz(a.SignalID), nz(a.AlertID))
+	}
+	return map[string]any{
+		"blocks": []map[string]any{
+			{"type": "header", "text": map[string]string{"type": "plain_text", "text": "Waylog triage report"}},
+			{"type": "section", "fields": fields},
+			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Alert evidence*\n" + alertText}},
+			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Next check*\n" + firstCheck(rep)}},
+		},
+	}
+}
+
+func PagerDuty(rep *pkgtriage.Report) string {
+	alert := "not available"
+	if len(rep.Alerts) > 0 {
+		a := rep.Alerts[0]
+		alert = fmt.Sprintf("%s alert %s via signal %s", nz(a.Source), nz(a.AlertID), nz(a.SignalID))
+	}
+	return fmt.Sprintf("Waylog triage: incident %s confidence=%s report_hash=%s alert=%s next_check=%s",
+		nz(rep.IncidentRef.ID), nz(string(rep.Confidence)), nz(rep.ReportHash), alert, firstCheck(rep))
+}
+
+func EncodeBody(r Rendered) ([]byte, error) {
+	if r.Format == FormatSlack {
+		return json.MarshalIndent(r.Body, "", "  ")
+	}
+	if s, ok := r.Body.(string); ok {
+		return []byte(s), nil
+	}
+	return json.MarshalIndent(r.Body, "", "  ")
+}
+
+func firstCheck(rep *pkgtriage.Report) string {
+	if len(rep.NextChecks) == 0 {
+		return "not available (report `" + nz(rep.ReportHash) + "`)"
+	}
+	return nz(rep.NextChecks[0].Prompt) + " (check `" + nz(rep.NextChecks[0].ID) + "`, report `" + nz(rep.ReportHash) + "`)"
+}
+
+func nz(s string) string {
+	if strings.TrimSpace(s) == "" {
+		return "not available"
+	}
+	return s
+}
diff --git a/internal/reports/reports_test.go b/internal/reports/reports_test.go
new file mode 100644
index 0000000..00befb8
--- /dev/null
+++ b/internal/reports/reports_test.go
@@ -0,0 +1,68 @@
+package reports
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestMarkdownReportCitesEvidence(t *testing.T) {
+	out := Markdown(testReport())
+	for _, want := range []string{"Requests: 12 (incident `inc_abc`, report `sha256:test`)", "trace_1", "sig_alert", "alert_1", "check_0"} {
+		if !strings.Contains(out, want) {
+			t.Fatalf("markdown missing %q:\n%s", want, out)
+		}
+	}
+}
+
+func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
+	rendered, err := Render(testReport(), FormatSlack)
+	if err != nil {
+		t.Fatal(err)
+	}
+	raw, err := EncodeBody(rendered)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !json.Valid(raw) {
+		t.Fatalf("invalid json: %s", raw)
+	}
+	for _, want := range []string{"sig_alert", "alert_1", "sha256:test"} {
+		if !strings.Contains(string(raw), want) {
+			t.Fatalf("slack payload missing %q:\n%s", want, raw)
+		}
+	}
+}
+
+func TestPagerDutyReportCitesEvidence(t *testing.T) {
+	out := PagerDuty(testReport())
+	for _, want := range []string{"inc_abc", "sig_alert", "alert_1", "sha256:test"} {
+		if !strings.Contains(out, want) {
+			t.Fatalf("pagerduty missing %q:\n%s", want, out)
+		}
+	}
+}
+
+func testReport() *pkgtriage.Report {
+	return &pkgtriage.Report{
+		SchemaVersion: pkgtriage.SchemaVersionV1,
+		IncidentRef:   pkgtriage.IncidentRef{ID: "inc_abc", Window: "15m"},
+		BlastSnapshot: pkgtriage.BlastSnapshot{
+			Requests: 12,
+			Users:    2,
+			Services: 3,
+			TopErrorFamilies: []pkgtriage.ErrorFamily{
+				{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502", Count: 12},
+			},
+		},
+		SampleTraces: []pkgtriage.TraceSample{{TraceID: "trace_1", Summary: "checkout payment failure"}},
+		Signals:      []pkgtriage.SignalRef{{ID: "sig_alert", Type: "alert", EvidenceIDs: []string{"sig_alert"}}},
+		Alerts:       []pkgtriage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_alert"}}},
+		NextChecks:   []pkgtriage.NextCheck{{ID: "check_0", Prompt: "Check payment health"}},
+		Confidence:   pkgtriage.ConfidenceHigh,
+		GeneratedAt:  "2026-05-10T12:00:00Z",
+		ReportHash:   "sha256:test",
+	}
+}
diff --git a/internal/tools/report.go b/internal/tools/report.go
new file mode 100644
index 0000000..57206c7
--- /dev/null
+++ b/internal/tools/report.go
@@ -0,0 +1,69 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/reports"
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+const renderTriageReportInputSchema = `{
+  "type": "object",
+  "required": ["incident_id"],
+  "properties": {
+    "incident_id": {"type": "string"},
+    "format":      {"type": "string", "enum": ["markdown", "slack", "pagerduty"], "default": "markdown"},
+    "window":      {"type": "string", "description": "Go duration string, default 15m"},
+    "snapshot":    {"type": "boolean"}
+  }
+}`
+
+const renderTriageReportOutputSchema = `{
+  "type": "object",
+  "required": ["format", "content_type", "body"],
+  "properties": {
+    "format": {"type": "string"},
+    "content_type": {"type": "string"},
+    "body": {}
+  }
+}`
+
+func RegisterTriageReportTool(reg *Registry, engine *triage.Engine) error {
+	return reg.Register(Tool{
+		Name:         "render_triage_report",
+		Description:  "Render a deterministic operator report from a TriageReport.",
+		Version:      "triage-report.v1",
+		InputSchema:  json.RawMessage(renderTriageReportInputSchema),
+		OutputSchema: json.RawMessage(renderTriageReportOutputSchema),
+		Examples: []string{
+			`{"incident_id":"inc_01HX...","format":"markdown","snapshot":true}`,
+			`{"incident_id":"inc_01HX...","format":"slack"}`,
+		},
+		Handler: func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+			var p struct {
+				IncidentID string `json:"incident_id"`
+				Format     string `json:"format"`
+				Window     string `json:"window"`
+				Snapshot   bool   `json:"snapshot"`
+			}
+			if err := json.Unmarshal(params, &p); err != nil {
+				return nil, fmt.Errorf("render_triage_report: bad params: %w", err)
+			}
+			if p.IncidentID == "" {
+				return nil, fmt.Errorf("render_triage_report: incident_id required")
+			}
+			opts, err := triage.ParseBuildOptions(p.Window, p.Snapshot, time.Now())
+			if err != nil {
+				return nil, err
+			}
+			rep, err := engine.Build(ctx, p.IncidentID, opts)
+			if err != nil {
+				return nil, err
+			}
+			return reports.Render(rep, p.Format)
+		},
+	})
+}
diff --git a/internal/tools/report_test.go b/internal/tools/report_test.go
new file mode 100644
index 0000000..29df20f
--- /dev/null
+++ b/internal/tools/report_test.go
@@ -0,0 +1,29 @@
+package tools_test
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+
+	"github.com/sssmaran/WaylogCLI/internal/reports"
+	"github.com/sssmaran/WaylogCLI/internal/tools"
+)
+
+func TestRenderTriageReportToolReturnsRenderedReport(t *testing.T) {
+	reg := tools.NewRegistry()
+	eng := newStubEngine(t)
+	if err := tools.RegisterTriageReportTool(reg, eng); err != nil {
+		t.Fatalf("register: %v", err)
+	}
+	out, err := reg.Call(context.Background(), nil, "render_triage_report", json.RawMessage(`{"incident_id":"inc_abc","format":"markdown"}`))
+	if err != nil {
+		t.Fatalf("call: %v", err)
+	}
+	rendered, ok := out.(reports.Rendered)
+	if !ok {
+		t.Fatalf("got %T, want reports.Rendered", out)
+	}
+	if rendered.Format != reports.FormatMarkdown || rendered.ContentType != "text/markdown" {
+		t.Fatalf("unexpected rendered report: %+v", rendered)
+	}
+}
diff --git a/internal/triage/adapter.go b/internal/triage/adapter.go
index 8188bd9..1723880 100644
--- a/internal/triage/adapter.go
+++ b/internal/triage/adapter.go
@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"strconv"
+	"time"
 
 	"github.com/sssmaran/WaylogCLI/internal/incidents"
 	"github.com/sssmaran/WaylogCLI/internal/signals"
@@ -208,12 +209,26 @@ func storySummary(s apiv2.StoryResponse, inc IncidentSummary) string {
 	return "first failure"
 }
 
-type signalQueryAdapter struct{ s SignalStore }
+type signalQueryAdapter struct {
+	s                SignalStore
+	alertMatchWindow time.Duration
+}
 
 func NewSignalQueryAdapter(s SignalStore) SignalQuery {
 	return signalQueryAdapter{s: s}
 }
 
+func NewAlertQueryAdapter(s SignalStore, matchWindow ...time.Duration) AlertQuery {
+	window := 15 * time.Minute
+	if len(matchWindow) > 0 && matchWindow[0] > 0 {
+		window = matchWindow[0]
+	}
+	if window > 24*time.Hour {
+		window = 24 * time.Hour
+	}
+	return signalQueryAdapter{s: s, alertMatchWindow: window}
+}
+
 func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
 	end := opts.Now
 	if end.IsZero() {
@@ -249,6 +264,59 @@ func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary,
 	return out, nil
 }
 
+func (a signalQueryAdapter) AlertsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]pkgtriage.AlertRef, error) {
+	end := opts.Now
+	if end.IsZero() {
+		end = inc.UpdatedAt
+	}
+	since := inc.StartedAt.Add(-a.alertMatchWindow)
+	if inc.StartedAt.IsZero() {
+		window := opts.Window
+		if window <= 0 {
+			window = defaultWindow
+		}
+		since = end.Add(-window)
+	}
+	until := end.Add(a.alertMatchWindow)
+	rows, err := a.s.Query(ctx, signals.Filter{
+		Env:     inc.Env,
+		Service: inc.Service,
+		Types:   []signals.Type{signals.TypeAlert},
+		Since:   since,
+		Until:   until,
+		Limit:   200,
+	})
+	if err != nil {
+		if errors.Is(err, signals.ErrUnavailable) {
+			return nil, nil
+		}
+		return nil, err
+	}
+	out := make([]pkgtriage.AlertRef, 0, len(rows))
+	for _, sig := range rows {
+		out = append(out, pkgtriage.AlertRef{
+			SignalID:    sig.SignalID,
+			AlertID:     stringField(sig.Metadata, "alert_id"),
+			Source:      sig.Source,
+			Severity:    string(sig.Severity),
+			Reason:      sig.Reason,
+			ProviderURL: stringField(sig.Metadata, "provider_url"),
+			EvidenceIDs: []string{sig.SignalID},
+		})
+	}
+	return out, nil
+}
+
+func stringField(m map[string]any, key string) string {
+	if len(m) == 0 {
+		return ""
+	}
+	if s, ok := m[key].(string); ok {
+		return s
+	}
+	return ""
+}
+
 type nextChecksAdapter struct{}
 
 // NewNextChecksAdapter returns a passthrough that converts the incident's
diff --git a/internal/triage/adapter_test.go b/internal/triage/adapter_test.go
index 2e678d6..8eeaf9d 100644
--- a/internal/triage/adapter_test.go
+++ b/internal/triage/adapter_test.go
@@ -417,6 +417,43 @@ func TestSignalQueryAdapter_UnavailableReturnsEmpty(t *testing.T) {
 	}
 }
 
+func TestAlertQueryAdapter_UsesIncidentWindowPlusMatchWindow(t *testing.T) {
+	now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+	started := now.Add(-2 * time.Hour)
+	store := &fakeSignalStore{out: []signals.Signal{{
+		SignalID:  "sig_alert",
+		Type:      signals.TypeAlert,
+		Source:    "grafana",
+		Service:   "checkout",
+		Env:       "demo",
+		Severity:  signals.SeverityCritical,
+		Reason:    "PMT_502 spike",
+		Timestamp: started.Add(-20 * time.Minute),
+		Metadata:  map[string]any{"alert_id": "alert_1"},
+	}}}
+	a := triage.NewAlertQueryAdapter(store, 30*time.Minute)
+	got, err := a.AlertsFor(context.Background(), triage.IncidentSummary{
+		Service:   "checkout",
+		Env:       "demo",
+		StartedAt: started,
+		UpdatedAt: now,
+	}, triage.BuildOptions{Window: 15 * time.Minute, Now: now})
+	if err != nil {
+		t.Fatalf("AlertsFor: %v", err)
+	}
+	wantSince := started.Add(-30 * time.Minute)
+	if !store.got.Since.Equal(wantSince) {
+		t.Fatalf("filter.Since = %v, want %v", store.got.Since, wantSince)
+	}
+	wantUntil := now.Add(30 * time.Minute)
+	if !store.got.Until.Equal(wantUntil) {
+		t.Fatalf("filter.Until = %v, want %v", store.got.Until, wantUntil)
+	}
+	if len(got) != 1 || got[0].AlertID != "alert_1" {
+		t.Fatalf("alert refs wrong: %+v", got)
+	}
+}
+
 // ----- NextChecksAdapter -----
 
 func TestNextChecksAdapter_ConsumesIncidentNextChecks(t *testing.T) {
diff --git a/internal/triage/engine.go b/internal/triage/engine.go
index 07f5a04..3df3ad2 100644
--- a/internal/triage/engine.go
+++ b/internal/triage/engine.go
@@ -59,6 +59,10 @@ type SignalQuery interface {
 	SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error)
 }
 
+type AlertQuery interface {
+	AlertsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]pkgtriage.AlertRef, error)
+}
+
 type NextChecksProvider interface {
 	NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error)
 }
@@ -68,6 +72,7 @@ type Deps struct {
 	Blast      BlastQuery
 	Story      StoryBuilder
 	Signals    SignalQuery
+	Alerts     AlertQuery
 	NextChecks NextChecksProvider
 	Now        func() time.Time
 }
@@ -107,6 +112,13 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
 	if err != nil {
 		return nil, fmt.Errorf("triage: signals: %w", err)
 	}
+	var alerts []pkgtriage.AlertRef
+	if e.deps.Alerts != nil {
+		alerts, err = e.deps.Alerts.AlertsFor(ctx, inc, opts)
+		if err != nil {
+			return nil, fmt.Errorf("triage: alerts: %w", err)
+		}
+	}
 	checks, err := e.deps.NextChecks.NextChecks(ctx, inc)
 	if err != nil {
 		return nil, fmt.Errorf("triage: next_checks: %w", err)
@@ -122,6 +134,7 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
 		FirstFailure: story.Payload,
 		SampleTraces: story.SampleTraces,
 		Signals:      sigs,
+		Alerts:       alerts,
 		NextChecks:   checks,
 		Confidence:   inc.Confidence,
 		GeneratedAt:  e.deps.Now().UTC().Format(time.RFC3339Nano),
diff --git a/internal/triagehttp/handler.go b/internal/triagehttp/handler.go
index 0adc3c2..fe1484f 100644
--- a/internal/triagehttp/handler.go
+++ b/internal/triagehttp/handler.go
@@ -7,6 +7,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/sssmaran/WaylogCLI/internal/reports"
 	"github.com/sssmaran/WaylogCLI/internal/triage"
 )
 
@@ -19,12 +20,15 @@ func NewHandler(engine *triage.Engine) *Handler {
 }
 
 func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
+	if strings.HasSuffix(strings.TrimRight(r.URL.Path, "/"), "/report") {
+		h.Report(w, r)
+		return
+	}
 	if r.Method != http.MethodGet {
 		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
 		return
 	}
-	id := strings.TrimPrefix(r.URL.Path, "/v1/triage/")
-	id = strings.Trim(id, "/")
+	id := incidentIDFromPath(r.URL.Path)
 	if id == "" {
 		writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
 		return
@@ -47,6 +51,55 @@ func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, http.StatusOK, rep)
 }
 
+func (h *Handler) Report(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	id := incidentIDFromPath(strings.TrimSuffix(strings.TrimRight(r.URL.Path, "/"), "/report"))
+	if id == "" {
+		writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
+		return
+	}
+	q := r.URL.Query()
+	opts, err := triage.ParseBuildOptions(q.Get("window"), q.Get("snapshot") == "true", time.Now())
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_options", err.Error(), "")
+		return
+	}
+	rep, err := h.engine.Build(r.Context(), id, opts)
+	if errors.Is(err, triage.ErrUnknownIncident) {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "triage_build_failed", err.Error(), "")
+		return
+	}
+	rendered, err := reports.Render(rep, q.Get("format"))
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_format", err.Error(), "")
+		return
+	}
+	body, err := reports.EncodeBody(rendered)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "render_failed", err.Error(), "")
+		return
+	}
+	w.Header().Set("Content-Type", rendered.ContentType)
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write(body)
+}
+
+func incidentIDFromPath(path string) string {
+	id := strings.TrimPrefix(path, "/v1/triage/")
+	id = strings.Trim(id, "/")
+	if strings.Contains(id, "/") {
+		return ""
+	}
+	return id
+}
+
 func writeJSON(w http.ResponseWriter, status int, v any) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(status)
diff --git a/internal/triagehttp/handler_test.go b/internal/triagehttp/handler_test.go
index fba414e..a274813 100644
--- a/internal/triagehttp/handler_test.go
+++ b/internal/triagehttp/handler_test.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 	"time"
 
@@ -82,6 +83,22 @@ func TestTriageHandlerUnknownIncidentIsNotFound(t *testing.T) {
 	}
 }
 
+func TestTriageReportHandlerRendersMarkdown(t *testing.T) {
+	eng := newTriageEngineForHandler(t)
+	h := triagehttp.NewHandler(eng)
+
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc/report?format=markdown", nil)
+	rr := httptest.NewRecorder()
+	h.Triage(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status = %d body=%s", rr.Code, rr.Body.String())
+	}
+	if !strings.Contains(rr.Body.String(), "Waylog Triage Report") || !strings.Contains(rr.Body.String(), "inc_abc") {
+		t.Fatalf("unexpected report:\n%s", rr.Body.String())
+	}
+}
+
 // helper: stub engine
 func newTriageEngineForHandler(t *testing.T) *triage.Engine {
 	return newTriageEngineForHandlerWithIncidents(t, handlerStubIncidents{})
diff --git a/pkg/triage/report.go b/pkg/triage/report.go
index 388933d..c1920b6 100644
--- a/pkg/triage/report.go
+++ b/pkg/triage/report.go
@@ -23,6 +23,7 @@ type Report struct {
 	FirstFailure  json.RawMessage `json:"first_failure,omitempty"`
 	SampleTraces  []TraceSample   `json:"sample_traces,omitempty"`
 	Signals       []SignalRef     `json:"signals,omitempty"`
+	Alerts        []AlertRef      `json:"alerts,omitempty"`
 	NextChecks    []NextCheck     `json:"next_checks,omitempty"`
 	Confidence    Confidence      `json:"confidence"`
 	GeneratedAt   string          `json:"generated_at"`
@@ -60,6 +61,16 @@ type SignalRef struct {
 	EvidenceIDs []string `json:"evidence_ids"`
 }
 
+type AlertRef struct {
+	SignalID    string   `json:"signal_id"`
+	AlertID     string   `json:"alert_id,omitempty"`
+	Source      string   `json:"source"`
+	Severity    string   `json:"severity"`
+	Reason      string   `json:"reason"`
+	ProviderURL string   `json:"provider_url,omitempty"`
+	EvidenceIDs []string `json:"evidence_ids"`
+}
+
 type NextCheck struct {
 	ID     string `json:"id"`
 	Prompt string `json:"prompt"`
diff --git a/pkg/triage/report_test.go b/pkg/triage/report_test.go
index f4f575f..db8bd4d 100644
--- a/pkg/triage/report_test.go
+++ b/pkg/triage/report_test.go
@@ -18,6 +18,7 @@ func TestReportJSONRoundTrip(t *testing.T) {
 				{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
 			},
 		},
+		Alerts:      []triage.AlertRef{{SignalID: "sig_2", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_2"}}},
 		Signals:     []triage.SignalRef{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}},
 		NextChecks:  []triage.NextCheck{{ID: "check_1", Prompt: "verify x"}},
 		Confidence:  triage.ConfidenceMedium,
@@ -41,6 +42,9 @@ func TestReportJSONRoundTrip(t *testing.T) {
 	if out.Confidence != triage.ConfidenceMedium {
 		t.Fatalf("confidence mismatch: got %q", out.Confidence)
 	}
+	if len(out.Alerts) != 1 || out.Alerts[0].AlertID != "alert_1" {
+		t.Fatalf("alerts round-trip lost data: %+v", out.Alerts)
+	}
 }
 
 func TestReportValidate(t *testing.T) {
@@ -116,6 +120,24 @@ func TestCanonicalHashChangesWhenContentChanges(t *testing.T) {
 	}
 }
 
+func TestCanonicalHashChangesWhenAlertEvidenceChanges(t *testing.T) {
+	base := triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_1"},
+		Confidence:    triage.ConfidenceMedium,
+		GeneratedAt:   "t",
+		ReportHash:    "h",
+	}
+	h1, _ := base.CanonicalHash()
+
+	withAlert := base
+	withAlert.Alerts = []triage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_alert"}}}
+	h2, _ := withAlert.CanonicalHash()
+	if h1 == h2 {
+		t.Fatalf("hash must change when alert evidence changes")
+	}
+}
+
 func TestCanonicalHashFormat(t *testing.T) {
 	r := triage.Report{
 		SchemaVersion: "triage.v1",
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index b40edb1..9e78dcc 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -4,6 +4,7 @@ set -euo pipefail
 GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
 INGEST_URL="${INGEST_URL:-http://localhost:8080}"
 WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
 REQUESTS="${REQUESTS:-20}"
 CONCURRENCY="${CONCURRENCY:-5}"
 TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
@@ -73,6 +74,19 @@ CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TI
 "${CLI[@]}" --json capabilities >/dev/null || fail "waylog capabilities failed"
 echo "PASS: waylog capabilities"
 
+alert_id="alert_demo_pmt_502"
+alert_timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+alert_body="{\"source\":\"waylog\",\"alert_id\":\"${alert_id}\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"reason\":\"PMT_502 spike\",\"message\":\"demo alert for checkout payment failures\",\"error_code\":\"PMT_502\",\"timestamp\":\"${alert_timestamp}\"}"
+alert_status="$(curl -s -o /tmp/waylog-demo-alert.json -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/alerts" \
+  -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "$alert_body" || echo "000")"
+[[ "$alert_status" == "201" ]] || fail "alert webhook failed: HTTP $alert_status"
+grep -q '"signal_id"' /tmp/waylog-demo-alert.json || fail "alert webhook response did not include a signal"
+grep -q '"matched"' /tmp/waylog-demo-alert.json || fail "alert webhook response did not include match state"
+echo "PASS: alert webhook accepted"
+
 burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
 burst_status="$(curl -s -o /tmp/waylog-demo-burst.json -w "%{http_code}" \
   -X POST "${GATEWAY_URL}/demo/burst" \
@@ -146,4 +160,12 @@ hash_b="$(json_triage_report_hash <<<"$triage_b")"
 [[ "$hash_a" == "$hash_b" ]] || fail "triage report_hash unstable across runs: A=$hash_a B=$hash_b"
 echo "PASS: waylog triage stable report_hash=$hash_a"
 
+report_status="$(curl -s -o /tmp/waylog-demo-triage-report.md -w "%{http_code}" \
+  -H "Authorization: Bearer ${WAYLOG_READ_KEY}" \
+  "${INGEST_URL}/v1/triage/${incident_id}/report?format=markdown&snapshot=true" || echo "000")"
+[[ "$report_status" == "200" ]] || fail "triage markdown report failed: HTTP $report_status"
+grep -q "$hash_a" /tmp/waylog-demo-triage-report.md || fail "triage markdown report did not cite report_hash"
+grep -q "$alert_id" /tmp/waylog-demo-triage-report.md || fail "triage markdown report did not cite alert evidence"
+echo "PASS: triage markdown report cites alert evidence"
+
 echo "Demo acceptance passed."

From ef1c0fb7bb69a3a1b4ccf3262299b4581f5c025e Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Tue, 12 May 2026 19:01:03 -0400
Subject: [PATCH 12/14] feat: added proof-loop and RCA scorecard acceptance
 gates   Reproducible end-to-end harnesses (alert -> burst -> errors ->
 incidents   -> triage) for v2.1 incident triage, plus the demo-acceptance
 JSON   helpers, microdemo burst hooks, incident-store tests, auth config  
 tightening, and README/docs reframing they depend on.

---
 .gitignore                                |   1 +
 Makefile                                  |  10 +-
 README.md                                 | 482 ++++++++++++++--------
 cmd/ingest/main.go                        |  49 ++-
 docs/env.md                               |  13 +-
 docs/openapi.yaml                         |   5 +-
 examples/cmd/api-gateway/main.go          |  10 +-
 examples/microdemo/gateway.go             |  14 +
 examples/microdemo/proof.go               | 431 +++++++++++++++++++
 examples/microdemo/proof_test.go          |  88 ++++
 examples/microdemo/ui.html                | 278 ++++++++++++-
 examples/microdemo/ui_test.go             |  26 ++
 internal/auth/config.go                   |  40 +-
 internal/auth/config_test.go              |  56 +++
 internal/dashboard/static/index.html      |  26 +-
 internal/dashboard/static_test.go         |   6 +
 internal/incidents/store_test.go          |  84 ++++
 internal/ingest/handler.go                |   4 +
 internal/llm/openai.go                    |   2 +-
 internal/llm/openai_test.go               |  10 +
 internal/reports/reports.go               |  69 ++--
 internal/reports/reports_test.go          |  15 +-
 internal/triagehttp/handler_test.go       |   2 +-
 scripts/demo-acceptance-json/main.go      | 185 ++++++++-
 scripts/demo-acceptance-json/main_test.go |  45 ++
 scripts/demo.sh                           |   8 +-
 scripts/proof-loop.sh                     | 162 ++++++++
 scripts/rca-scorecard.sh                  | 173 ++++++++
 28 files changed, 2046 insertions(+), 248 deletions(-)
 create mode 100644 examples/microdemo/proof.go
 create mode 100644 examples/microdemo/proof_test.go
 create mode 100644 internal/incidents/store_test.go
 create mode 100644 scripts/proof-loop.sh
 create mode 100644 scripts/rca-scorecard.sh

diff --git a/.gitignore b/.gitignore
index 0dfc197..a8cb890 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ profile.cov
 
 /*.log
 /data/
+data/demo-state/
 *.md
 
 # Local runtime/build artifacts
diff --git a/Makefile b/Makefile
index e9f21ee..46f0d90 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 SHELL := /bin/sh
 
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance proof-loop rca-scorecard rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
 
 help:
 	@echo "Targets:"
@@ -19,6 +19,8 @@ help:
 	@echo "  demo     - start dashboard demo locally (detached, no Docker)"
 	@echo "  demo-stop - stop demo processes"
 	@echo "  demo-acceptance - verify a running local demo end-to-end"
+	@echo "  proof-loop - run alert -> incident -> triage -> report -> rollup proof"
+	@echo "  rca-scorecard - run deterministic RCA scorecard over the demo scenario"
 	@echo "  rollup-comparison - run demo proof for root-cause vs naive rollup counts"
 	@echo "  otlp-conformance - run deterministic OTLP HTTP/gRPC fixture checks"
 	@echo "  demo-up  - start v2 demo stack in Docker (detached)"
@@ -125,6 +127,12 @@ demo-stop:
 demo-acceptance:
 	./scripts/demo-acceptance.sh
 
+proof-loop:
+	bash ./scripts/proof-loop.sh
+
+rca-scorecard:
+	bash ./scripts/rca-scorecard.sh
+
 rollup-comparison:
 	./scripts/rollup-comparison.sh
 
diff --git a/README.md b/README.md
index 943ae6b..7f6d92f 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,47 @@
-<div align="center"><pre><img width="461" height="129" alt="Screenshot 2026-03-17 at 2 36 01 PM" src="https://github.com/user-attachments/assets/fdec0a1b-055a-47e3-b52c-cf1cf80fe373" />
-</pre></div>
+<div align="center">
+  <img width="461" height="129" alt="Waylog" src="https://github.com/user-attachments/assets/fdec0a1b-055a-47e3-b52c-cf1cf80fe373" />
 
-<p align="center">
-  <strong>Structured logging that explains failed requests and active incidents.</strong><br>
-  Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC traces. Agent-native by design.
-</p>
+  <p>
+    <strong>Production incident triage you can hash, cite, and hand to an agent.</strong><br>
+    Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC. Deterministic. Single Go binary.
+  </p>
 
-<p align="center">
-  <code>polyglot SDKs</code> · <code>agent-native API</code> · <code>failure tree</code> · <code>rollup-correct root cause</code>
-</p>
+  <p>
+    <code>schema 2.0 WideEvents</code> · <code>signal-correlated incidents</code> · <code>cited operator reports</code> · <code>rollup-correct root cause</code> · <code>agent-native</code>
+  </p>
 
-<p align="center">
-  <em>Public alpha — request triage plus signal-driven incident triage for backend systems.</em>
-</p>
+  <p>
+    <code>alpha</code> · <code>go 1.24+</code> · <code>single-node</code> · <code>OTLP-compatible</code> · <code>MCP-native</code> · <code>LLM-optional</code>
+  </p>
+</div>
+
+> **Public alpha for single-node production-style incident triage.** APIs may break before 1.0.
+
+---
+
+## Try it in 60 seconds
+
+```bash
+git clone https://github.com/sssmaran/WaylogCLI
+cd WaylogCLI
+make demo
+```
+
+Then open <http://localhost:9081/demo> and click **Run proof loop**.
+
+You'll see the full v2.1 flow run end-to-end: an external alert is accepted, a payment-failure burst spikes an error family, the spike detector opens an incident, the cause is classified as `dependency`, a triage report is built across four surfaces (CLI, read endpoint, direct tool, plan template) and verified byte-stable, and Markdown / Slack / PagerDuty operator reports are rendered with citations to every alert, signal, and trace.
+
+Stop with `make demo-stop`. No Docker. No Kafka. No bridge process. SQLite + a single Go binary.
 
 ---
 
-## What Waylog does
+## What Waylog is
 
-A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened in the request, then groups repeated failures into an incident with signal-backed cause evidence:
+Waylog turns failed requests and external alerts into **incidents with deterministic, cited triage reports** that humans and agents can both consume. Three things make it different:
+
+- **Deterministic.** Every triage report has a `report_hash` that is byte-stable across CLI, REST read, direct tool call, and plan template — within a single engine tick. No LLM required to get an answer; LLMs are an optional UX layer over the same bytes.
+- **Agent-native.** The same tool registry powers the CLI, MCP stdio, REST `/v1/tools/*`, and `/v1/plans/execute`. Agents read the exact bytes the human read. Built-in triage plan template, idempotency keys, structured envelopes.
+- **Drop-in.** Go SDK (`net/http`, chi, gin, echo), TypeScript SDK (Express, Hono, Next.js, NestJS), or OTLP HTTP / gRPC. Use what you already have. Single binary + embedded SQLite. No Docker required.
 
 ```text
   trace 7f3a2b9c…   flow=purchase   user=standard   region=us-east-1
@@ -28,56 +51,44 @@ A request hits your API gateway, fans out to three services, and one of them fai
           └─ db      200   —                      3 ms
           └─ payment 502   PMT_502                5 ms   ← first failure
 
-  blast radius:  12 requests · 8 users · 4 services
+  blast radius:    12 requests · 8 users · 4 services
+  incident:        inc_a43c189fc63eff31   (cause=dependency · confidence=high)
+  report_hash:     sha256:1ed7c21b…       (stable across cli / read / tool / plan)
 ```
 
-This is not log search, metrics storage, or incident management. Waylog builds request-triage views from WideEvents, accepts production-context signals such as deploys and dependency health, and returns deterministic answers for "why did this trace fail?", "what incident is active?", and "who is affected by `PMT_502`?". Root-cause rollups count the originating failure once, not once per propagated hop.
-
-Run `make demo` and see it yourself.
-
-## Quick start
+Root-cause rollups count the originating failure **once**, not once per propagated hop. The same hash answers "what failed" identically whether the question came from a terminal, a webhook, or an LLM tool call.
 
-```bash
-make demo
-```
-
-This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, stores demo signals/incidents in local SQLite, and does not require Docker, Kafka, or the bridge process.
+---
 
-Once the stack is up:
+## The incident loop
 
-1. Open demo controls at <http://localhost:9081/demo>, or open the dashboard at <http://localhost:8080/ui/>. The local demo disables dashboard login.
-2. Click **Run traffic burst** to post demo deploy/dependency signals and fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
-   ```bash
-   curl -s -X POST http://localhost:9081/purchase \
-     -H 'Content-Type: application/json' \
-     --data '{"sku":"X1","scenario":"payment_502"}'
-   ```
-3. Investigate with the v2 CLI:
-   ```bash
-   ./waylog incidents
-   ./waylog incident <incident_id> --snapshot
-   ./waylog errors --window 15m
-   ./waylog explain <trace_id>
-   ./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
-   ./waylog blast --code PMT_502 --window 15m
-   ./waylog triage <incident_id>
-   ```
+The v2.1 product in one paragraph: services emit WideEvents (or push OTel spans). External systems post production-context signals (deploys, dependency health, runtime events) and alerts (Alertmanager, Grafana, PagerDuty webhooks, or Waylog-native JSON). When an error family spikes, Waylog opens an **incident**, correlates the signals and alerts that overlap its window, classifies the cause deterministically (`deploy | app | dependency | runtime | unknown`), and exposes a cited **TriageReport** to humans (CLI, dashboard) and agents (REST, plan template, MCP) — same bytes, same hash.
 
-The traffic burst posts fresh demo deploy/dependency signals on each run so the incident panel has evidence to attach. The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
+```bash
+# 1. Drive a real failure through the demo stack
+curl -X POST http://localhost:9081/purchase \
+  -H 'Content-Type: application/json' \
+  -d '{"sku":"X1","scenario":"payment_502"}'
 
-Stop with `make demo-stop`.
+# 2. List active incidents
+waylog incidents
 
-Prefer Docker? Use `make docker-dev` / `make docker-down`. Prefer foreground service logs while hacking on Go code? Use `make micro-demo` and stop with `make micro-demo-stop`.
+# 3. Get the cited triage report — and verify hash agreement across surfaces
+waylog triage inc_a43c189fc63eff31 --snapshot
+curl -H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
+  -d '{"incident_id":"inc_a43c189fc63eff31","snapshot":true}' \
+  http://localhost:8080/v1/tools/triage_incident
 
+# 4. Render an operator report (Markdown / Slack Block Kit / PagerDuty note)
+curl -H "Authorization: Bearer $WAYLOG_READ_KEY" \
+  "http://localhost:8080/v1/triage/inc_a43c189fc63eff31/report?format=slack"
+```
 
-## How it works
+> **Alerts correlate; they do not create incidents.** Incidents are opened by the spike detector. Alerts and signals attach as evidence and shape the deterministic cause classification.
 
-1. **Capture** — services emit [WideEvents](docs/waylog-sdk-contract.md) via the Go or TypeScript SDK, or push OpenTelemetry spans to `/v1/otlp/v1/traces`. Every event is durably logged (WAL + fsync) before it enters the derived read models.
-2. **Signal** — deploy systems, dependency monitors, or operators post small production-context facts to `/v1/signals`.
-3. **Triage** — the ingest server projects request views (`recent`, `errors`, `explain`, `blast`) and opens incidents when error families spike against overlapping signals.
-4. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views. Primary incident surfaces are `waylog incidents`, `waylog incident <id>`, `/v1/incidents/*`, and the dashboard incident cards.
+---
 
-## Get traces in
+## Capture: send Waylog your traces
 
 All three paths feed the same schema-2.0 ingest and read APIs. Pick whichever matches your stack.
 
@@ -105,7 +116,7 @@ app.post("/buy", (req, res) => {
 });
 ```
 
-`@waylog/sdk` is ESM-only, Node 18+, and ships standalone core APIs plus Express, Hono, Next.js, and NestJS entrypoints (`@waylog/sdk/express`, `@waylog/sdk/hono`, `@waylog/sdk/next`, `@waylog/sdk/nest`).
+ESM-only, Node 18+. Standalone core plus framework entrypoints: `@waylog/sdk/express`, `@waylog/sdk/hono`, `@waylog/sdk/next`, `@waylog/sdk/nest`. Full examples in [`docs/sdk-examples.md`](docs/sdk-examples.md).
 
 ### Go SDK
 
@@ -131,208 +142,313 @@ func main() {
 }
 ```
 
-The recommended SDK path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers. Low-level request APIs such as `Begin`, `Finalize`, and `setField` are for adapter authors, tests, and unusual custom integrations. Full copy-paste examples for `net/http`, chi, gin, echo, standalone TypeScript, Express, Hono, Next.js, and NestJS are in [`docs/sdk-examples.md`](docs/sdk-examples.md).
+Middleware adapters for `net/http`, chi, gin, and echo are in [`docs/sdk-examples.md`](docs/sdk-examples.md). The recommended path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers — low-level `Begin` / `Finalize` / `setField` APIs are for adapter authors.
+
+### OTLP / OpenTelemetry
+
+Point your existing OTel collector at Waylog. Both protocols, same conversion path, same downstream views.
 
-### OTLP traces
+```yaml
+exporters:
+  otlphttp/waylog:
+    endpoint: http://localhost:8080/v1/otlp/v1/traces
+    headers:
+      authorization: "Bearer ${WAYLOG_WRITE_KEY}"
+  otlp/waylog:
+    endpoint: localhost:4317
+    headers:
+      authorization: "Bearer ${WAYLOG_WRITE_KEY}"
+```
+
+Sample collector config: [`examples/otel-collector/`](examples/otel-collector/). Only traces are accepted; OTLP logs and metrics are not shipping. Bind `OTLP_GRPC_ADDR=127.0.0.1:4317` for single-host installs that don't need cross-host collectors.
 
-Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces` for OTLP/HTTP or `localhost:4317` for OTLP/gRPC. Protobuf trace exports convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. A collector config lives in [`examples/otel-collector/`](examples/otel-collector/). **Only traces are supported.** OTLP logs and metrics are not shipping yet.
+**Auth.** Both endpoints require `WAYLOG_WRITE_KEY` when `WAYLOG_PROFILE=prod`; the server refuses to boot with unauthenticated OTLP in prod. `make demo` runs unauthenticated by design.
 
-### Alternative: local ingest server (no Docker)
+### Local ingest only (no demo services)
 
 ```bash
 make ingest
 ```
 
-Runs only the ingest server. Point your own services at it via an SDK or OTLP. Full env-var reference: [`docs/env.md`](docs/env.md).
+Runs only the ingest server. Point your own services at it via SDK or OTLP. Full env reference: [`docs/env.md`](docs/env.md).
 
-## What you can ask
+---
+
+## Operate: CLI, dashboard, agents
 
 ### CLI
 
 ```bash
-WAYLOG_V2_READS=true ./ingest
-
-waylog capabilities
-waylog recent --limit 5
-waylog errors --window 15m
-waylog blast checkout:payment.charge:PMT_502 --window 15m
-waylog explain trace_01HX...
-waylog trace trace_01HX...
-waylog event event_01HX...
-waylog search PMT_502 --window 1h
+./ingest                                 # v2 reads are on by default
+
+waylog capabilities                      # diagnose server config / profile / feature flags
+waylog incidents                         # active incidents, deterministic order
+waylog incident <incident_id> --snapshot # full detail with frozen sample traces
+waylog triage   <incident_id>            # cited triage report
+waylog errors   --window 15m             # top error families
+waylog explain  <trace_id>               # first observable failing step
+waylog blast    checkout:payment.charge:PMT_502 --window 15m
+waylog recent   --limit 5
+waylog event    <event_id>
+waylog trace    <trace_id>
+waylog search   PMT_502 --window 1h
 ```
 
-The `waylog` binary is now the v2 operator CLI over the running ingest server's read APIs. Most verbs require the server to advertise `v2_reads.enabled=true` from `/v1/capabilities`; `waylog capabilities` is intentionally ungated so it can diagnose server setup. The CLI uses `INGEST_ADDR`, `WAYLOG_READ_KEY`, and `WAYLOG_CLI_TIMEOUT` by default. Add `--json` to any verb for machine-readable output.
+`waylog capabilities` is intentionally ungated so it can diagnose server setup; other verbs require `v2_reads.enabled=true` (the default). Defaults: `INGEST_ADDR`, `WAYLOG_READ_KEY`, `WAYLOG_CLI_TIMEOUT`. Add `--json` to any verb for machine-readable output.
+
+### Dashboard
 
-### REST (direct tool call)
+Embedded Geist UI at <http://localhost:8080/ui/>. Uses the dashboard session cookie for read-scope auth and runs against the default `WAYLOG_V2_READS=true` reader.
+
+- `#/errors` — top error families over `/v1/errors`
+- `#/incident/<id>` — incident evidence and next checks over `/v1/incidents/{id}`
+- `#/explain/<id>` — first observable failing step over `/v1/traces/story`
+- `#/blast/<key>` — impact panel over `/v1/blast_radius`
+- recent-request stream from `/v1/traces/recent`, polled every 5 s
+
+No Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts. Just the triage loop.
+
+### Agent surface
+
+Twelve deterministic tools, exposed identically through CLI, REST `/v1/tools/{name}`, MCP stdio, and plan execution. Same idempotency keys, same structured envelopes, same bytes.
+
+| Tool                   | Answers                                                                                      |
+| ---------------------- | -------------------------------------------------------------------------------------------- |
+| `triage_incident`      | Structured TriageReport for an open incident (blast + first failure + signals + next checks) |
+| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note from a TriageReport                        |
+| `explain_request`      | Why did this specific trace fail?                                                            |
+| `trace_summary`        | Span tree and timing for a trace                                                             |
+| `graph_failures`       | Which requests are currently failing?                                                        |
+| `failure_patterns`     | What error codes dominate this window?                                                       |
+| `blast_radius`         | How many requests, users, and services does this error touch?                                |
+| `failure_chain`        | How did this failure propagate through services?                                             |
+| `graph_query`          | DSL query over the graph (`expr` + `window`)                                                 |
+| `compare_windows`      | Diff error rates between two windows                                                         |
+| `graph_insights`       | Windowed rollup of top errors and patterns                                                   |
+| `graph_stats`          | Overall shape of the graph right now                                                         |
 
 ```bash
+# Direct tool call
 curl -X POST http://localhost:8080/v1/tools/blast_radius \
-  -H 'Content-Type: application/json' \
   -H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
   -d '{"error_code":"PMT_502","window":"10m","include_services":true}'
-```
 
-### REST (multi-step plan)
-
-```bash
+# Built-in triage plan template — same hash as the CLI/read/tool surfaces
 curl -X POST http://localhost:8080/v1/plans/execute \
-  -H 'Content-Type: application/json' \
   -H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
-  -d '{
-    "steps": [
-      {"id":"patterns", "tool":"failure_patterns", "params":{"window":"10m"}},
-      {"id":"blast",    "tool":"blast_radius",     "params":{"error_code":"PMT_502","window":"10m"}}
-    ]
-  }'
+  -d '{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}'
 ```
 
-Plans execute deterministically server-side with SSE progress on `/v1/stream/plans/{id}`.
+Plans execute deterministically server-side with SSE progress on `/v1/stream/plans/{id}`. Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
 
-### Trace story
+### MCP
 
 ```bash
-curl "http://localhost:8080/v1/traces/story?trace_id=$TRACE" \
-  -H "Authorization: Bearer $WAYLOG_READ_KEY"
+make ingest-mcp     # MCP_STDIO=1
 ```
 
-Returns the first failing step, contributing path, logs, downstream calls, and linkage mode used by the dashboard and `waylog explain`.
+Same registry, same idempotency keys. Plugs into Claude, Cursor, and other MCP clients.
 
-### MCP (agent surface)
+### External alerts
 
 ```bash
-make ingest-mcp    # MCP_STDIO=1
+curl -X POST http://localhost:8080/v1/alerts \
+  -H "Authorization: Bearer $WAYLOG_WRITE_KEY" \
+  -d @grafana-webhook.json
 ```
 
-Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP clients. Same semantics as the REST API.
+`POST /v1/alerts` accepts Waylog-normalized JSON plus Alertmanager, Grafana, and PagerDuty webhook payloads. Accepted alerts are stored as `type=alert` signals and **correlated** with active incidents when possible — alerts do not create incidents. The matching alert evidence then appears in cited Markdown / Slack / PagerDuty triage reports.
 
-### Analysis tools
+---
 
-All twelve tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+## Architecture
 
-Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
+```text
+   Go / TS services (SDK)               OTel collectors
+              │                                │
+   schema-2.0 WideEvents                OTLP HTTP / gRPC
+              ╰──────────────┬─────────────────╯
+                             ▼
+                      ingest server
+              ┌──────────────┴──────────────┐
+              │                              │
+   event log (append-only WAL,       SQLite cold store
+       source of truth)              (events · deployments ·
+              │                       signals · incidents ·
+              ▼                       causal claims)
+   derived read models
+   (errors · explain · blast ·
+    recent · incidents · triage)
+              │
+              ├──▶ /ui dashboard           (Geist, no vendored chart/topology)
+              ├──▶ /v1/tools/*             (deterministic agent surface)
+              ├──▶ /v1/plans/execute       (server-side plan execution + SSE)
+              └──▶ waylog CLI · TUI · MCP
+```
 
-External alerts can be posted to `POST /v1/alerts` as Waylog-normalized JSON or Alertmanager, Grafana, or PagerDuty webhooks. Waylog stores them as alert signals, links them to active incidents when possible, and can render cited Markdown, Slack Block Kit, or PagerDuty-note reports from the same deterministic triage artifact.
+- **Single binary** plus embedded SQLite. No Docker, no Kafka, no bridge.
+- **WAL is source of truth.** Crash → replay on next boot rebuilds the derived read models.
+- **Hot graph + dedicated trace store.** Pruned per snapshot tick to bound memory.
+- **`report_hash` excludes `generated_at`, `plan_run_id`, and itself.** Same upstream state → same bytes across every surface.
+- **OTLP path reuses the same WAL and projector** as the SDK path. No separate ingestion plane.
 
-| Tool               | Answers                                                       |
-| ------------------ | ------------------------------------------------------------- |
-| `graph_stats`      | Overall shape of the graph right now                          |
-| `explain_request`  | Why did this specific trace fail?                             |
-| `trace_summary`    | Span tree and timing for a trace                              |
-| `graph_failures`   | Which requests are currently failing?                         |
-| `failure_patterns` | What error codes dominate this window?                        |
-| `blast_radius`     | How many requests, users, and services does this error touch? |
-| `failure_chain`    | How did this failure propagate through services?              |
-| `graph_query`      | DSL query over the graph (`expr` + `window`)                  |
-| `compare_windows`  | Diff error rates between two windows                          |
-| `graph_insights`   | Windowed rollup of top errors and patterns                    |
-| `triage_incident`  | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
-| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note text rendered from a TriageReport |
+Durability model, retention, merge semantics, readiness policy, and counter buffer details: [`docs/internals.md`](docs/internals.md). Full HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml).
 
-Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
+---
 
-## Dashboard
+## Auth & profiles
 
-The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs as the CLI. It requires `WAYLOG_V2_READS=true` and uses the dashboard session cookie for read-scope auth.
+Three independent scoped keys. The dashboard never holds the agent key.
 
-- dark, minimal Geist UI with aligned KPI modules and inline SVG mini-graphs
-- `#/errors` — top error families over `/v1/errors`
-- `#/explain/<id>` — first observable failing step over `/v1/traces/story`
-- `#/blast/<key>` — impact panel over `/v1/blast_radius`
-- `#/incident/<id>` — incident evidence and next checks over `/v1/incidents/{id}`
-- recent-request stream from `/v1/traces/recent`, polled every 5s
-- no Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts
+| Key                | Protects                                                    |
+| ------------------ | ----------------------------------------------------------- |
+| `WAYLOG_WRITE_KEY` | `/v1/events`, OTLP HTTP + gRPC, `/v1/signals`, `/v1/alerts` |
+| `WAYLOG_READ_KEY`  | Read APIs, dashboard session                                |
+| `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*`                     |
+
+`WAYLOG_API_KEY` is a legacy alias for the write scope. `ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
 
+`WAYLOG_PROFILE` controls auth strictness. The current profile is reported on `/v1/capabilities`.
 
-## Architecture
+| Profile | Use case                    | Defaults                                                                                              |
+| ------- | --------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `demo`  | `make demo` showcase        | All endpoints open. **Not safe to expose to a network.**                                              |
+| `dev`   | Local development (default) | Open OTLP, optional read auth                                                                         |
+| `prod`  | Real deployments            | Refuses to boot without all three scoped keys **and** without `WAYLOG_WRITE_KEY` when OTLP is enabled |
 
-```text
-Go / TS services (SDK) · OTLP collectors
-        │  schema-2.0 WideEvents · OTLP HTTP/gRPC traces
-        ▼
-  ingest server
-    ├─ event log (append-only WAL, source of truth)
-    ├─ derived read models (errors · explain · blast · recent traces · incidents)
-    ├─ SQLite cold store (events · deployments · signals · incidents · causal claims)
-    ├─ tool registry · Ask · plan execution
-    └─ v2 dashboard · health · metrics · OpenAPI
-        │
-        ├──▶ /ui dashboard (Geist, no vendored chart/topology libs)
-        ├──▶ /v1/tools/* · /v1/plans/execute (agent-native)
-        └──▶ CLI · TUI · MCP · agents
+Set `WAYLOG_PROFILE=prod` for any deployment that crosses a trust boundary.
+
+---
+
+## Try every claim locally
+
+```bash
+make demo                # one-shot local stack (no Docker)
+make demo-acceptance     # 15-check gate over CLI + browser proof
+make proof-loop          # full alert → incident → triage → cited reports loop
+make rca-scorecard       # cold-start latency + measured report_hash_stable
+make rollup-comparison   # rollup-correct counts vs naive propagated counts
+make demo-stop
 ```
 
-Events are durably logged before projection — if the process crashes, replay rebuilds the read models from the WAL on next boot.
+`make proof-loop` writes shareable artifacts to `./data/demo-state/proof/`:
+
+- `triage.json` — the TriageReport from the read endpoint
+- `report.md`, `slack.json`, `pagerduty.txt` — the same triage rendered for three operator surfaces
+- `rollup-comparison.txt` — root-cause counts next to naive propagated counts
+- `scorecard.json` — measured `report_hash_stable`, `triage_latency_ms`, `scenario` (`cold-demo` for `rca-scorecard`, `warm-demo` chained from `proof-loop`), inflation-avoided count, and the `report_hash` itself
 
-Durability model, retention, merge semantics, readiness policy, and counter buffer: [`docs/internals.md`](docs/internals.md). Full v2 HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml).
+The browser has the same flow at <http://localhost:9081/demo> → **Run proof loop**.
+
+> Artifacts are local-only. Alert payloads include the demo write key (`Bearer demo`). `data/demo-state/` is gitignored.
+
+---
 
 ## Development
 
 ```bash
-make build          # core binaries
-make build-examples # demo services
-make fmt vet test   # checks
-make test-race      # race detector
-make ts-test        # TypeScript SDK vitest suite
-make ci             # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
-make demo-acceptance # with make demo running, verify demo + CLI triage loop
-make rollup-comparison # demo proof: root-cause counts vs naive propagated counts
+make build              # core binaries
+make build-examples     # demo services
+make fmt vet test       # checks
+make test-race          # race detector
+make ts-test            # TypeScript SDK vitest suite
+make ci                 # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
 ```
 
-`make rollup-comparison` runs the checkout demo burst and prints the PMT_502 root-cause count next to a naive propagated count across touched services. It is the quickest local proof that Waylog's default rollups count the originating failure once per failed request instead of inflating it by every downstream hop.
+Full env-var reference: [`docs/env.md`](docs/env.md). Reproducible demo gate: `make demo-acceptance`.
 
-## Auth
+---
 
-Waylog uses three scoped keys. They are independent — the dashboard never holds the agent key.
+## What's new in v2.1
 
-| Key                | Protects                                              |
-| ------------------ | ----------------------------------------------------- |
-| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces`, `/v1/signals` (SDKs, collectors, production signals) |
-| `WAYLOG_READ_KEY`  | Read APIs, dashboard session                          |
-| `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*`               |
+- **Signal-correlated incident engine** at `/v1/incidents/*` with deterministic cause classification (`deploy | app | dependency | runtime | unknown`).
+- **Deterministic TriageReport** (`triage.v1`) with stable per-tick `report_hash` across CLI, read endpoint, direct tool, and plan template.
+- **Alert intake** at `POST /v1/alerts` for Waylog-native, Alertmanager, Grafana, and PagerDuty webhooks. Alerts correlate with active incidents; they do not create incidents.
+- **Cited operator reports** rendered as Markdown, Slack Block Kit, or PagerDuty notes via `GET /v1/triage/{id}/report`.
+- **OTLP/gRPC trace receiver** on `OTLP_GRPC_ADDR` (default `:4317`).
+- **Provider-neutral Ask** configuration: `gemini`, `anthropic`, `openai`, or `none`. All deterministic surfaces work with no LLM configured.
+- **`WAYLOG_PROFILE=demo|dev|prod`** gates auth defaults; `prod` hard-fails on unsafe configs.
+- **`WAYLOG_V2_READS` defaults to `true`.** Set `false` only for legacy v1-only stacks.
+- **`/v1/insight`** retained as a compat shim returning the top active incident. New clients should use `/v1/incidents/*`.
 
-`WAYLOG_API_KEY` is a legacy alias for the write scope. `ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
+---
 
 ## Status
 
-Public alpha. APIs may break before 1.0.
+Public alpha for single-node production-style incident triage. APIs may break before 1.0.
 
-**Shipped:**
+**Shipped**
 
 - Go SDK v2 (`net/http`, chi, gin, echo) and TypeScript SDK v2 (`@waylog/sdk`, ESM, Node 18+, standalone core, Express, Hono, Next.js, NestJS)
-- OTLP traces over HTTP at `/v1/otlp/v1/traces` and gRPC at `:4317` (traces only)
-- durable ingest with WAL + replay
-- hot graph with flattened 3-node model + dedicated trace store
-- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
+- OTLP HTTP at `/v1/otlp/v1/traces` and OTLP/gRPC at `OTLP_GRPC_ADDR` (traces only)
+- Durable ingest with WAL + replay
+- Hot graph with flattened 3-node model + dedicated trace store
+- Schema-2.0 recent-index read APIs (default)
 - SQLite cold store (events, deployments, signals, incidents, causal claims)
-- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
-- alert intake for Waylog, Alertmanager, Grafana, and PagerDuty webhooks, stored as signals and linked to incidents when possible
-- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
-- 12 deterministic analysis tools, rollup-correct root-cause attribution
-- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
-- `/v1/traces/story` and indented failure-path rendering in the dashboard
-- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `triage`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
-- live TUI (`waylog-live --dev` streams via SSE), MCP stdio
-- scoped auth (write/read/agent) with startup validation
-
-**Planned:**
+- Signal-correlated incident engine with stable IDs, deterministic classification, and startup hot-window rebuild from the schema-2.0 WAL
+- Alert intake from four webhook formats, stored as signals and correlated with active incidents
+- Deterministic triage report with stable hash across CLI / read endpoint / direct tool / plan template within a single engine tick
+- Provider-neutral Ask configuration; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- Twelve deterministic analysis tools, rollup-correct root-cause attribution
+- Agent-native REST with idempotency and structured envelopes
+- MCP stdio, live TUI (`waylog-live --dev` streams via SSE), embedded Geist dashboard
+- Scoped auth (write / read / agent) with startup validation and `WAYLOG_PROFILE=prod` hard-fail
+
+**Planned**
 
 - OTLP logs and metrics
 - Python SDK
+- Resolved-incident retention janitor
 - Mintlify docs site
 
+---
+
 ## Known limitations
 
-- Single-node only. No HA, no clustering.
-- Alpha quality. APIs may break before 1.0.
-- OTLP supports traces only. Logs and metrics are not shipping yet.
-- Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
-- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
-- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
-- Incident cause classification is deterministic and heuristic.
-- No outbound alerting or paging delivery. Waylog accepts external alerts and renders operator reports, but it doesn't wake you up.
-- No multi-tenancy. One instance = one trust boundary.
-- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
-
-**Fastest walkthrough:** `make demo`, open <http://localhost:9081/demo>, click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
+- **Public alpha.** APIs may break before 1.0. Not production-ready. Not HA.
+- **Triage report hash is stable per tick, not forever.** Hash changes when the underlying recent-index window changes (≈30 s default). Use as a short-window dedup key, not a long-term incident fingerprint.
+- **Alerts correlate; they do not create incidents.** Incidents are opened by the spike detector. The alert path is for routing context, not paging primitives.
+- **Resolved incidents are not pruned automatically.** Per the v2.1 plan, the retention janitor is deferred. Manual cleanup:
+  ```sql
+  DELETE FROM incidents WHERE status = 'resolved' AND resolved_at < datetime('now', '-7 days');
+  ```
+- **Stale `active` rows after long downtime.** If the WAL has rolled past an incident's `started_at` and `WAYLOG_REBUILD_INCIDENTS_ON_START=true`, the engine transitions only the stale rows to `recovering` on next start; they resolve after `WAYLOG_INCIDENT_RESOLVE_AFTER` without new evidence.
+- **Single-node only.** No HA, no clustering, no multi-tenant.
+- **SQLite cold store** fits demos and small deployments. Postgres is not shipping.
+- **OTLP supports traces only.** Logs and metrics are not shipping yet.
+- **Only Go and TypeScript SDKs today.** Python / Java / Ruby are not available.
+- **No outbound paging.** Waylog accepts external alerts and renders operator reports; it does not page.
+- **No multi-tenancy.** One instance = one trust boundary.
+- **No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.**
+- **Incident cause classification is heuristic and deterministic.** Intentionally explainable, not ML-based.
+
+---
+
+## Documentation
+
+| File                                                         | What's in it                                                                             |
+| ------------------------------------------------------------ | ---------------------------------------------------------------------------------------- |
+| [`docs/env.md`](docs/env.md)                                 | Full env-var reference (auth, profiles, retention, OTLP, incident engine, LLM providers) |
+| [`docs/sdk-examples.md`](docs/sdk-examples.md)               | Copy-paste SDK examples for every supported framework                                    |
+| [`docs/waylog-sdk-contract.md`](docs/waylog-sdk-contract.md) | WideEvent schema and validation rules                                                    |
+| [`docs/openapi.yaml`](docs/openapi.yaml)                     | Full HTTP contract                                                                       |
+| [`docs/internals.md`](docs/internals.md)                     | Durability model, retention, merge semantics, readiness policy, counter buffer           |
+
+---
+
+## Project layout
+
+```
+cmd/         executable binaries (ingest, waylog, waylog-live, ...)
+pkg/         public SDK importable by external services
+internal/    private implementation (auth, incidents, triage, ingest, ...)
+examples/    demo services + collector config + microdemo
+scripts/     demo + proof + ci helpers
+docs/        reference and contracts
+```
+
+---
+
+## License
+
+Not yet declared. The project is in public alpha; a license will be added before tagging 1.0. Until then, contact the maintainers if you'd like to use Waylog in a context where licensing matters to you.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index db2f2c7..d2d3252 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -115,7 +115,7 @@ func main() {
 	var sm *auth.SessionManager
 	if authCfg.DashboardMode != "off" {
 		sm = auth.NewSessionManager(authCfg.SessionSecret, auth.DefaultSessionMaxAge)
-		sm.Secure = os.Getenv("WAYLOG_PROFILE") == "prod"
+		sm.Secure = authCfg.Profile == auth.ProfileProd
 	}
 	sessionCheck := auth.SessionCheckFunc(sm)
 
@@ -135,7 +135,11 @@ func main() {
 	graphUI := config.GetenvBool("GRAPH_UI", false)
 	otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
 	otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
-	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
+	if authCfg.Profile == auth.ProfileProd && otlpEnabled && len(authCfg.WriteKeys) == 0 {
+		slog.Error("WAYLOG_PROFILE=prod with OTLP enabled requires WAYLOG_WRITE_KEY — refusing to boot with unauthenticated OTLP")
+		os.Exit(1)
+	}
+	v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", true)
 	signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
 	alertMatchWindow := config.GetenvDuration("ALERT_MATCH_WINDOW", 15*time.Minute)
 	if alertMatchWindow <= 0 {
@@ -280,6 +284,7 @@ func main() {
 		IncidentsEnabled:         v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
 		IncidentsPersistent:      v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
 		IncidentRebuildSupported: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+		Profile:                  authCfg.Profile,
 	})
 
 	// SSE hub for real-time dashboard updates
@@ -496,8 +501,46 @@ func main() {
 						os.Exit(1)
 					}
 					if replay.Projected == 0 {
+						// Empty WAL replay while rebuild was explicitly requested.
+						// Transition only the seed rows whose StartedAt precedes
+						// replaySince — those are stale beyond the hot window and
+						// their continuing "active" status is no longer evidence-
+						// backed. Non-stale active rows in the same seed are left
+						// untouched and will be re-evaluated by the next live tick.
+						staleTransitioned := 0
 						if len(seed) > 0 {
-							slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+							incidentStoreRef := incidentStore
+							now := time.Now().UTC()
+							for _, inc := range seed {
+								if inc.Status != incidents.StatusActive {
+									continue
+								}
+								if !inc.StartedAt.Before(replaySince) {
+									continue
+								}
+								row := inc
+								row.Status = incidents.StatusRecovering
+								t := now
+								row.RecoveringAt = &t
+								row.UpdatedAt = now
+								if err := incidentStoreRef.Upsert(context.Background(), row); err != nil {
+									slog.Warn("stale-active rebuild transition failed",
+										"incident_id", row.IncidentID, "err", err)
+									continue
+								}
+								staleTransitioned++
+							}
+							if staleTransitioned > 0 {
+								if err := incidentEngine.Bootstrap(context.Background()); err != nil {
+									slog.Error("incident engine re-bootstrap after stale transition failed", "err", err)
+									os.Exit(1)
+								}
+								slog.Info("incidents rebuild: stale active rows transitioned to recovering",
+									"transitioned", staleTransitioned,
+									"replay_since", replaySince)
+							} else {
+								slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+							}
 						}
 					} else {
 						result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{
diff --git a/docs/env.md b/docs/env.md
index 3de9e62..0ef8836 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -32,14 +32,15 @@ Scoped keys. See the Auth section of the [README](../README.md).
 
 | Variable | Scope |
 |---|---|
-| `WAYLOG_WRITE_KEY` | Write auth for `/v1/events` (preferred) |
+| `WAYLOG_PROFILE`   | Auth strictness profile: `demo` (open), `dev` (default — open OTLP, optional auth), `prod` (refuses to boot without write/read/agent keys, and without write auth on OTLP HTTP+gRPC) |
+| `WAYLOG_WRITE_KEY` | Write auth for `/v1/events`, `/v1/otlp/v1/traces`, OTLP/gRPC, `/v1/signals`, `/v1/alerts` (preferred) |
 | `WAYLOG_API_KEY`   | Legacy alias for write scope. Supports `Authorization: Bearer` and `X-API-Key` headers |
 | `WAYLOG_READ_KEY`  | Read auth for read endpoints + dashboard session validation |
 | `WAYLOG_AGENT_KEY` | Agent auth for `/v1/tools/*`, `/v1/ask`, `/v1/plans/*`. No session fallback |
 | `DASHBOARD_AUTH`   | Dashboard auth mode: `off` \| `basic:<user>:<pass>` \| `key:<secret>` |
-| `DASHBOARD_SESSION_SECRET` | Session signing key (derived from `DASHBOARD_AUTH` if unset) |
+| `DASHBOARD_SESSION_SECRET` | Session signing key (derived from `DASHBOARD_AUTH` if unset; required when `WAYLOG_PROFILE=prod`) |
 
-`ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
+`ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination. When `WAYLOG_PROFILE=prod`, all three scoped keys (`WAYLOG_WRITE_KEY`, `WAYLOG_READ_KEY`, `WAYLOG_AGENT_KEY`) are required, and OTLP cannot run unauthenticated.
 
 ## Ingest server
 
@@ -47,7 +48,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
 |---|---|---|
 | `INGEST_ADDR` | `:8080` | Listen address |
 | `OTLP_ENABLED` | `true` | Enable OTLP trace ingest over HTTP and gRPC |
-| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver |
+| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver. For single-host installs, bind `127.0.0.1:4317`. When `WAYLOG_PROFILE=prod`, the server refuses to boot if OTLP is enabled without `WAYLOG_WRITE_KEY` |
 | `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events`, `/v1/otlp/v1/traces`, and OTLP/gRPC receive messages |
 | `READ_HEADER_TIMEOUT` | `5s` | HTTP read header timeout |
 | `READ_TIMEOUT` | `10s` | HTTP read timeout |
@@ -58,7 +59,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
 
 ## CLI
 
-The `waylog` CLI calls the running ingest server's v2 read APIs. The server must run with `WAYLOG_V2_READS=true`.
+The `waylog` CLI calls the running ingest server's v2 read APIs. The server runs with `WAYLOG_V2_READS=true` by default; only set it to `false` for legacy v1-only stacks.
 
 | Variable | Default | Purpose |
 |---|---|---|
@@ -106,7 +107,7 @@ See [Internals](internals.md) for the full durability model.
 | Variable | Default | Purpose |
 |---|---|---|
 | `GRAPH_UI` | `false` | Enable optional graph topology endpoint `/v1/graph/topology` |
-| `WAYLOG_V2_READS` | `false` | Route v2 read endpoints to the schema-2.0 recent index |
+| `WAYLOG_V2_READS` | `true` | Route v2 read endpoints to the schema-2.0 recent index. Set `false` only for legacy v1-only stacks |
 | `CAUSAL_ENABLED` | `false` | Enable shadow-mode causal inference |
 | `CAUSAL_INTERVAL` | `30s` | Causal inference ticker interval |
 | `HAPPY_SAMPLE_RATE_PCT` | `2` | Success-event sampling rate. Set `100` in dev profiles |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 4c31f77..6cc08dd 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -202,11 +202,12 @@ paths:
     post:
       tags: [Signals]
       operationId: ingestAlert
-      summary: Ingest an external alert and match it to an active incident
+      summary: Ingest an external alert and correlate it with an active incident
       description: |
         Accepts Waylog-normalized alerts plus Alertmanager, Grafana, and
         PagerDuty webhook payloads. Accepted alerts are stored as `type=alert`
-        signals. Matching is best-effort and does not create incidents directly.
+        signals and correlated with active incidents when possible. Alerts do
+        not create incidents — the spike detector owns incident lifecycle.
       security:
         - ApiKeyHeader: []
         - BearerAuth: []
diff --git a/examples/cmd/api-gateway/main.go b/examples/cmd/api-gateway/main.go
index 24aa63d..9662c36 100644
--- a/examples/cmd/api-gateway/main.go
+++ b/examples/cmd/api-gateway/main.go
@@ -16,16 +16,22 @@ func main() {
 	}
 
 	checkoutURL := config.Getenv("CHECKOUT_URL", "http://localhost:9082")
+	ingestURL := config.Getenv("INGEST_URL", "http://localhost:8080")
+	writeKey := config.Getenv("WAYLOG_WRITE_KEY", "")
+	readKey := config.Getenv("WAYLOG_READ_KEY", writeKey)
+	agentKey := config.Getenv("WAYLOG_AGENT_KEY", readKey)
 	gateway := microdemo.NewGatewayHandler(checkoutURL)
 	gateway.SetSignalPoster(microdemo.NewDemoSignalPoster(
-		config.Getenv("INGEST_URL", "http://localhost:8080"),
-		config.Getenv("WAYLOG_WRITE_KEY", ""),
+		ingestURL,
+		writeKey,
 	))
+	gateway.SetWaylogAPI(ingestURL, readKey, writeKey, agentKey)
 
 	mux := http.NewServeMux()
 	mux.Handle("/purchase", gateway.PurchaseHandler())
 	mux.HandleFunc("/demo", gateway.ServeDemo)
 	mux.HandleFunc("/demo/burst", gateway.ServeBurst)
+	mux.HandleFunc("/demo/proof", gateway.ServeProof)
 
 	microdemo.RunService("api-gateway", ":9081", mux)
 }
diff --git a/examples/microdemo/gateway.go b/examples/microdemo/gateway.go
index f065440..e4a0a52 100644
--- a/examples/microdemo/gateway.go
+++ b/examples/microdemo/gateway.go
@@ -28,7 +28,12 @@ var uiHTML []byte
 
 type GatewayHandler struct {
 	checkoutURL string
+	ingestURL   string
+	readKey     string
+	writeKey    string
+	agentKey    string
 	client      *http.Client
+	proofClient *http.Client
 	purchase    http.Handler
 	signals     SignalPoster
 }
@@ -44,6 +49,7 @@ func NewGatewayHandler(checkoutURL string) *GatewayHandler {
 		client: &http.Client{
 			Transport: wayloghttp.NewTransport(demoHTTPTransport(), "checkout"),
 		},
+		proofClient: &http.Client{Timeout: 10 * demoSignalTimeout},
 	}
 	// Pre-wrap so the live /purchase route and /demo/burst dispatch share a
 	// single instance — and so callers can't forget to wire it up.
@@ -67,6 +73,14 @@ func (h *GatewayHandler) SetSignalPoster(poster SignalPoster) {
 	h.signals = poster
 }
 
+// SetWaylogAPI configures the demo-only proof loop proxy.
+func (h *GatewayHandler) SetWaylogAPI(ingestURL, readKey, writeKey, agentKey string) {
+	h.ingestURL = strings.TrimRight(strings.TrimSpace(ingestURL), "/")
+	h.readKey = strings.TrimSpace(readKey)
+	h.writeKey = strings.TrimSpace(writeKey)
+	h.agentKey = strings.TrimSpace(agentKey)
+}
+
 func (h *GatewayHandler) ServeDemo(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "text/html")
 	_, _ = w.Write(uiHTML)
diff --git a/examples/microdemo/proof.go b/examples/microdemo/proof.go
new file mode 100644
index 0000000..41236ac
--- /dev/null
+++ b/examples/microdemo/proof.go
@@ -0,0 +1,431 @@
+package microdemo
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+const (
+	proofWindow       = "15m"
+	proofPollDelay    = 750 * time.Millisecond
+	proofPollAttempts = 24
+)
+
+type ProofSummary struct {
+	AlertID    string            `json:"alert_id"`
+	IncidentID string            `json:"incident_id"`
+	ReportHash string            `json:"report_hash"`
+	Hashes     map[string]string `json:"hashes"`
+	Burst      BurstSummary      `json:"burst"`
+	Evidence   ProofEvidence     `json:"evidence"`
+	Reports    ProofReports      `json:"reports"`
+	Scorecard  ProofScorecard    `json:"scorecard"`
+}
+
+type ProofEvidence struct {
+	TraceID          string `json:"trace_id"`
+	AlertLinked      bool   `json:"alert_linked"`
+	DependencySignal bool   `json:"dependency_signal"`
+	NextChecks       bool   `json:"next_checks"`
+}
+
+type ProofReports struct {
+	Markdown  string          `json:"markdown"`
+	Slack     json.RawMessage `json:"slack"`
+	PagerDuty string          `json:"pagerduty"`
+}
+
+type ProofScorecard struct {
+	RootCauseAccuracy               bool   `json:"root_cause_accuracy"`
+	CauseClassificationDependency   bool   `json:"cause_classification_dependency"`
+	ReportHashStable                bool   `json:"report_hash_stable"`
+	PropagatedErrorInflationAvoided int    `json:"propagated_error_inflation_avoided"`
+	TriageLatencyMS                 int64  `json:"triage_latency_ms"`
+	Scenario                        string `json:"scenario"`
+	RootCauseCount                  int    `json:"root_cause_count"`
+	NaivePropagatedCount            int    `json:"naive_propagated_count"`
+}
+
+type planResult struct {
+	Steps []struct {
+		Result json.RawMessage `json:"result"`
+	} `json:"steps"`
+}
+
+func (h *GatewayHandler) ServeProof(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+	if h.ingestURL == "" {
+		http.Error(w, "INGEST_URL is not configured for the demo proof", http.StatusServiceUnavailable)
+		return
+	}
+
+	var req BurstRequest
+	if r.Body != nil {
+		defer r.Body.Close()
+		dec := json.NewDecoder(r.Body)
+		dec.DisallowUnknownFields()
+		if err := dec.Decode(&req); err != nil && err != io.EOF {
+			http.Error(w, "invalid json: "+err.Error(), http.StatusBadRequest)
+			return
+		}
+	}
+
+	result, err := h.runProof(r.Context(), req)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusBadGateway)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	_ = json.NewEncoder(w).Encode(result)
+}
+
+func (h *GatewayHandler) runProof(ctx context.Context, req BurstRequest) (ProofSummary, error) {
+	alertID := fmt.Sprintf("alert_demo_proof_pmt_502_%d", time.Now().Unix())
+	if err := h.postProofAlert(ctx, alertID); err != nil {
+		return ProofSummary{}, err
+	}
+
+	signals := []SignalResult(nil)
+	if h.signals != nil {
+		signals = h.signals.PostDemoSignals(ctx)
+	}
+	burst := runBurst(ctx, h.purchase, req)
+	burst.Signals = signals
+	answerStart := time.Now()
+
+	errorsResp, err := h.pollErrors(ctx)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	inc, incidentsResp, err := h.pollIncident(ctx)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+
+	triageA, err := h.getTriage(ctx, inc.IncidentID)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	triageB, err := h.getTriage(ctx, inc.IncidentID)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	answerEnd := time.Now()
+
+	toolReport, err := h.postToolTriage(ctx, inc.IncidentID)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	planReport, err := h.postPlanTriage(ctx, inc.IncidentID)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	hashes := map[string]string{
+		"read":   triageA.ReportHash,
+		"repeat": triageB.ReportHash,
+		"tool":   toolReport.ReportHash,
+		"plan":   planReport.ReportHash,
+	}
+	hashStable := triageA.ReportHash != "" &&
+		triageA.ReportHash == triageB.ReportHash &&
+		triageA.ReportHash == toolReport.ReportHash &&
+		triageA.ReportHash == planReport.ReportHash
+
+	blast, err := h.getBlast(ctx)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+	reports, err := h.getReports(ctx, inc.IncidentID)
+	if err != nil {
+		return ProofSummary{}, err
+	}
+
+	rootCount := paymentErrorCount(errorsResp)
+	naive := rootCount * blast.AffectedServices
+	return ProofSummary{
+		AlertID:    alertID,
+		IncidentID: inc.IncidentID,
+		ReportHash: triageA.ReportHash,
+		Hashes:     hashes,
+		Burst:      burst,
+		Evidence: ProofEvidence{
+			TraceID:          firstTraceID(triageA),
+			AlertLinked:      hasAlertID(triageA, alertID),
+			DependencySignal: hasSignalType(triageA, "dependency"),
+			NextChecks:       len(triageA.NextChecks) > 0,
+		},
+		Reports: reports,
+		Scorecard: ProofScorecard{
+			RootCauseAccuracy:               triageRootCauseAccurate(triageA),
+			CauseClassificationDependency:   incidentCauseIsDependency(incidentsResp, inc.IncidentID),
+			ReportHashStable:                hashStable,
+			PropagatedErrorInflationAvoided: naive - rootCount,
+			TriageLatencyMS:                 answerEnd.Sub(answerStart).Milliseconds(),
+			Scenario:                        "warm-demo",
+			RootCauseCount:                  rootCount,
+			NaivePropagatedCount:            naive,
+		},
+	}, nil
+}
+
+func (h *GatewayHandler) postProofAlert(ctx context.Context, alertID string) error {
+	body := map[string]any{
+		"source":     "waylog",
+		"alert_id":   alertID,
+		"service":    "checkout",
+		"env":        "demo",
+		"severity":   "critical",
+		"reason":     "PMT_502 spike",
+		"message":    "browser demo alert for checkout payment failures",
+		"error_code": "PMT_502",
+		"timestamp":  time.Now().UTC().Format(time.RFC3339),
+	}
+	status, _, err := h.doJSON(ctx, http.MethodPost, "/v1/alerts", h.writeKey, body, nil)
+	if err != nil {
+		return err
+	}
+	if status != http.StatusCreated {
+		return fmt.Errorf("alert webhook failed: HTTP %d", status)
+	}
+	return nil
+}
+
+func (h *GatewayHandler) pollErrors(ctx context.Context) (apiv2.ErrorsResponse, error) {
+	var last apiv2.ErrorsResponse
+	for i := 0; i < proofPollAttempts; i++ {
+		q := url.Values{"window": {proofWindow}, "limit": {"10"}}
+		status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/errors?"+q.Encode(), h.readKey, nil, &last)
+		if err == nil && status == http.StatusOK && paymentErrorCount(last) > 0 {
+			return last, nil
+		}
+		if err != nil {
+			return apiv2.ErrorsResponse{}, err
+		}
+		sleepOrDone(ctx, proofPollDelay)
+	}
+	return apiv2.ErrorsResponse{}, fmt.Errorf("payment_502 error family did not appear")
+}
+
+func (h *GatewayHandler) pollIncident(ctx context.Context) (apiv2.Incident, apiv2.IncidentListResponse, error) {
+	var last apiv2.IncidentListResponse
+	for i := 0; i < proofPollAttempts; i++ {
+		status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/incidents/active", h.readKey, nil, &last)
+		if err != nil {
+			return apiv2.Incident{}, apiv2.IncidentListResponse{}, err
+		}
+		if status == http.StatusOK {
+			for _, inc := range last.Incidents {
+				if isPaymentFamily(inc.ErrorFamily) && inc.Cause == "dependency" && inc.Status == "active" {
+					return inc, last, nil
+				}
+			}
+		}
+		sleepOrDone(ctx, proofPollDelay)
+	}
+	return apiv2.Incident{}, apiv2.IncidentListResponse{}, fmt.Errorf("dependency incident did not appear")
+}
+
+func (h *GatewayHandler) getTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+	var rep pkgtriage.Report
+	status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/triage/"+url.PathEscape(incidentID)+"?snapshot=true", h.readKey, nil, &rep)
+	if err != nil {
+		return nil, err
+	}
+	if status != http.StatusOK {
+		return nil, fmt.Errorf("triage read failed: HTTP %d", status)
+	}
+	return &rep, nil
+}
+
+func (h *GatewayHandler) postToolTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+	var rep pkgtriage.Report
+	status, _, err := h.doJSON(ctx, http.MethodPost, "/v1/tools/triage_incident", h.agentKey, map[string]any{"incident_id": incidentID, "snapshot": true}, &rep)
+	if err != nil {
+		return nil, err
+	}
+	if status != http.StatusOK {
+		return nil, fmt.Errorf("triage tool failed: HTTP %d", status)
+	}
+	return &rep, nil
+}
+
+func (h *GatewayHandler) postPlanTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+	var plan planResult
+	status, _, err := h.doJSON(ctx, http.MethodPost, "/v1/plans/execute", h.agentKey, map[string]any{
+		"template": "triage",
+		"params":   map[string]any{"incident_id": incidentID, "snapshot": true},
+	}, &plan)
+	if err != nil {
+		return nil, err
+	}
+	if status != http.StatusOK || len(plan.Steps) == 0 {
+		return nil, fmt.Errorf("triage plan failed: HTTP %d", status)
+	}
+	var rep pkgtriage.Report
+	if err := json.Unmarshal(plan.Steps[0].Result, &rep); err != nil {
+		return nil, fmt.Errorf("triage plan result decode: %w", err)
+	}
+	return &rep, nil
+}
+
+func (h *GatewayHandler) getBlast(ctx context.Context) (apiv2.BlastRadiusResponse, error) {
+	q := url.Values{"window": {proofWindow}, "error_family": {"checkout:payment.charge:PMT_502"}}
+	var blast apiv2.BlastRadiusResponse
+	status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/blast_radius?"+q.Encode(), h.readKey, nil, &blast)
+	if err != nil {
+		return apiv2.BlastRadiusResponse{}, err
+	}
+	if status != http.StatusOK {
+		return apiv2.BlastRadiusResponse{}, fmt.Errorf("blast failed: HTTP %d", status)
+	}
+	return blast, nil
+}
+
+func (h *GatewayHandler) getReports(ctx context.Context, incidentID string) (ProofReports, error) {
+	var out ProofReports
+	for _, format := range []string{"markdown", "slack", "pagerduty"} {
+		path := "/v1/triage/" + url.PathEscape(incidentID) + "/report?format=" + format + "&snapshot=true"
+		status, raw, err := h.doJSON(ctx, http.MethodGet, path, h.readKey, nil, nil)
+		if err != nil {
+			return ProofReports{}, err
+		}
+		if status != http.StatusOK {
+			return ProofReports{}, fmt.Errorf("%s report failed: HTTP %d", format, status)
+		}
+		switch format {
+		case "markdown":
+			out.Markdown = string(raw)
+		case "slack":
+			out.Slack = append(json.RawMessage(nil), raw...)
+		case "pagerduty":
+			out.PagerDuty = string(raw)
+		}
+	}
+	return out, nil
+}
+
+func (h *GatewayHandler) doJSON(ctx context.Context, method, path, key string, body any, out any) (int, []byte, error) {
+	var reader io.Reader
+	if body != nil {
+		raw, err := json.Marshal(body)
+		if err != nil {
+			return 0, nil, err
+		}
+		reader = bytes.NewReader(raw)
+	}
+	req, err := http.NewRequestWithContext(ctx, method, h.ingestURL+path, reader)
+	if err != nil {
+		return 0, nil, err
+	}
+	if body != nil {
+		req.Header.Set("Content-Type", "application/json")
+	}
+	if key != "" {
+		req.Header.Set("Authorization", "Bearer "+key)
+	}
+	client := h.proofClient
+	if client == nil {
+		client = http.DefaultClient
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer resp.Body.Close()
+	raw, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
+	if err != nil {
+		return resp.StatusCode, nil, err
+	}
+	if out != nil && resp.StatusCode >= 200 && resp.StatusCode < 300 {
+		if err := json.Unmarshal(raw, out); err != nil {
+			return resp.StatusCode, raw, err
+		}
+	}
+	return resp.StatusCode, raw, nil
+}
+
+func sleepOrDone(ctx context.Context, d time.Duration) {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+	select {
+	case <-ctx.Done():
+	case <-timer.C:
+	}
+}
+
+func paymentErrorCount(resp apiv2.ErrorsResponse) int {
+	for _, row := range resp.Rows {
+		if isPaymentFamily(row.ErrorFamily) {
+			return row.Count
+		}
+	}
+	return 0
+}
+
+func isPaymentFamily(f apiv2.ErrorFamily) bool {
+	return f.Service == "checkout" && f.Step == "payment.charge" && f.ErrorCode == "PMT_502"
+}
+
+func firstTraceID(rep *pkgtriage.Report) string {
+	if rep == nil || len(rep.SampleTraces) == 0 {
+		return ""
+	}
+	return rep.SampleTraces[0].TraceID
+}
+
+func hasAlertID(rep *pkgtriage.Report, alertID string) bool {
+	if rep == nil {
+		return false
+	}
+	for _, alert := range rep.Alerts {
+		if alert.AlertID == alertID && alert.SignalID != "" {
+			return true
+		}
+	}
+	return false
+}
+
+func hasSignalType(rep *pkgtriage.Report, typ string) bool {
+	if rep == nil {
+		return false
+	}
+	for _, sig := range rep.Signals {
+		if sig.ID != "" && sig.Type == typ {
+			return true
+		}
+	}
+	return false
+}
+
+func triageRootCauseAccurate(rep *pkgtriage.Report) bool {
+	if rep == nil {
+		return false
+	}
+	for _, family := range rep.BlastSnapshot.TopErrorFamilies {
+		if family.Service == "checkout" && family.Step == "payment.charge" && family.ErrorCode == "PMT_502" {
+			return true
+		}
+	}
+	return false
+}
+
+func incidentCauseIsDependency(resp apiv2.IncidentListResponse, incidentID string) bool {
+	for _, inc := range resp.Incidents {
+		if inc.IncidentID == incidentID {
+			return inc.Cause == "dependency"
+		}
+	}
+	return false
+}
diff --git a/examples/microdemo/proof_test.go b/examples/microdemo/proof_test.go
new file mode 100644
index 0000000..12904ab
--- /dev/null
+++ b/examples/microdemo/proof_test.go
@@ -0,0 +1,88 @@
+package microdemo
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+func TestServeProofRejectsNonPOST(t *testing.T) {
+	gateway := NewGatewayHandler("http://checkout.example")
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/demo/proof", nil)
+	gateway.ServeProof(rec, req)
+	if rec.Code != http.StatusMethodNotAllowed {
+		t.Fatalf("status = %d, want 405", rec.Code)
+	}
+}
+
+func TestServeProofRequiresIngestURL(t *testing.T) {
+	gateway := NewGatewayHandler("http://checkout.example")
+	gateway.SetPurchaseHandler(okBurstDispatch())
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/demo/proof", strings.NewReader(`{}`))
+	gateway.ServeProof(rec, req)
+	if rec.Code != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503", rec.Code)
+	}
+}
+
+func TestServeProofRejectsUnknownFields(t *testing.T) {
+	gateway := NewGatewayHandler("http://checkout.example")
+	gateway.SetWaylogAPI("http://ingest.example", "read", "write", "agent")
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/demo/proof", strings.NewReader(`{"foo":1}`))
+	gateway.ServeProof(rec, req)
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", rec.Code)
+	}
+}
+
+func TestProofSummaryJSONShape(t *testing.T) {
+	out := ProofSummary{
+		AlertID:    "alert_1",
+		IncidentID: "inc_1",
+		ReportHash: "sha256:x",
+		Hashes:     map[string]string{"read": "sha256:x"},
+		Evidence:   ProofEvidence{TraceID: "trace_1", AlertLinked: true, DependencySignal: true, NextChecks: true},
+		Scorecard: ProofScorecard{
+			RootCauseAccuracy:             true,
+			CauseClassificationDependency: true,
+			ReportHashStable:              true,
+			TriageLatencyMS:               42,
+			Scenario:                      "warm-demo",
+		},
+	}
+	raw, err := json.Marshal(out)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	for _, want := range []string{"alert_1", "inc_1", "trace_1", "cause_classification_dependency", `"triage_latency_ms":42`, `"scenario":"warm-demo"`} {
+		if !strings.Contains(string(raw), want) {
+			t.Fatalf("json missing %q: %s", want, raw)
+		}
+	}
+}
+
+func TestProofSummaryReportHashStableFalse(t *testing.T) {
+	out := ProofSummary{
+		AlertID:    "alert_1",
+		IncidentID: "inc_1",
+		ReportHash: "sha256:x",
+		Hashes:     map[string]string{"read": "sha256:x", "repeat": "sha256:y"},
+		Scorecard: ProofScorecard{
+			ReportHashStable: false,
+			TriageLatencyMS:  7,
+			Scenario:         "warm-demo",
+		},
+	}
+	raw, err := json.Marshal(out)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	if !strings.Contains(string(raw), `"report_hash_stable":false`) {
+		t.Fatalf("json missing report_hash_stable:false: %s", raw)
+	}
+}
diff --git a/examples/microdemo/ui.html b/examples/microdemo/ui.html
index b5ac343..b0d3cf0 100644
--- a/examples/microdemo/ui.html
+++ b/examples/microdemo/ui.html
@@ -346,6 +346,150 @@
       display: grid;
       gap: 16px;
     }
+    .proof-grid {
+      display: grid;
+      grid-template-columns: minmax(0, 1fr);
+      gap: 20px;
+      align-items: start;
+    }
+    .proof-hero {
+      display: grid;
+      gap: 16px;
+      padding-bottom: 4px;
+    }
+    .proof-headline {
+      margin: 8px 0 0;
+      font-size: clamp(1.4rem, 3vw, 2rem);
+      line-height: 1.1;
+      letter-spacing: -0.035em;
+      max-width: 22ch;
+    }
+    .proof-summary {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 10px;
+    }
+    .proof-panel {
+      display: grid;
+      gap: 12px;
+    }
+    .proof-columns {
+      display: grid;
+      grid-template-columns: minmax(0, 0.95fr) minmax(360px, 1.05fr);
+      gap: 18px;
+      align-items: start;
+    }
+    .proof-checklist {
+      display: grid;
+      gap: 8px;
+      margin: 0;
+      padding: 0;
+      list-style: none;
+    }
+    .proof-checklist li {
+      display: grid;
+      grid-template-columns: 18px minmax(0, 1fr);
+      gap: 9px;
+      align-items: start;
+      padding: 9px 10px;
+      background: var(--surface-2);
+      border: 1px solid var(--line);
+      border-radius: var(--radius);
+      color: var(--muted);
+      font-size: 0.86rem;
+    }
+    .proof-checklist .ok { color: var(--accent); font-family: var(--font-mono); font-weight: 600; }
+    .proof-checklist .bad { color: var(--danger); font-family: var(--font-mono); font-weight: 600; }
+    .proof-table {
+      width: 100%;
+      border-collapse: collapse;
+      background: var(--surface-2);
+      border: 1px solid var(--line);
+      border-radius: var(--radius);
+      overflow: hidden;
+      font-family: var(--font-mono);
+      font-size: 0.74rem;
+    }
+    .proof-table th,
+    .proof-table td {
+      padding: 8px 9px;
+      border-bottom: 1px solid var(--line);
+      text-align: left;
+      vertical-align: top;
+    }
+    .proof-table tr:last-child td { border-bottom: 0; }
+    .proof-table td { overflow-wrap: anywhere; }
+    .proof-table .hash-short { color: var(--ink); font-weight: 600; }
+    .proof-table .hash-full { color: var(--faint); display: block; margin-top: 3px; }
+    .proof-status {
+      color: var(--accent);
+      font-weight: 600;
+      white-space: nowrap;
+    }
+    .proof-status.fail { color: var(--danger); }
+    .proof-metric {
+      display: grid;
+      gap: 4px;
+      padding: 11px 12px;
+      background: var(--surface-2);
+      border: 1px solid var(--line);
+      border-radius: var(--radius);
+      min-width: 0;
+    }
+    .proof-metric span {
+      color: var(--muted);
+      font-family: var(--font-mono);
+      font-size: 0.68rem;
+      letter-spacing: 0.06em;
+      text-transform: uppercase;
+    }
+    .proof-metric strong {
+      font-family: var(--font-mono);
+      font-size: 0.86rem;
+      font-weight: 500;
+      overflow-wrap: anywhere;
+    }
+    .proof-report {
+      max-height: 360px;
+      overflow: auto;
+      white-space: pre-wrap;
+      overflow-wrap: anywhere;
+      background: var(--surface-2);
+      border: 1px solid var(--line);
+      border-radius: var(--radius);
+      padding: 14px;
+      margin: 0;
+      font-family: var(--font-mono);
+      font-size: 0.76rem;
+      line-height: 1.5;
+    }
+    .proof-report-head {
+      display: flex;
+      flex-wrap: wrap;
+      justify-content: space-between;
+      gap: 10px;
+      align-items: end;
+    }
+    .proof-tabs {
+      display: inline-flex;
+      flex-wrap: wrap;
+      gap: 6px;
+    }
+    .proof-tab {
+      border: 1px solid var(--line);
+      background: var(--surface-2);
+      border-radius: var(--radius-sm);
+      color: var(--muted);
+      font-family: var(--font-mono);
+      font-size: 0.68rem;
+      padding: 5px 7px;
+      text-transform: uppercase;
+      letter-spacing: 0.04em;
+    }
+    .proof-tab.active {
+      color: var(--ink);
+      border-color: var(--line-strong);
+    }
     .burst-counts {
       display: flex;
       flex-wrap: wrap;
@@ -479,6 +623,7 @@
     @media (max-width: 760px) {
       .expectations { grid-template-columns: 1fr; }
       .result-grid { grid-template-columns: 1fr; }
+      .proof-summary, .proof-columns { grid-template-columns: 1fr; }
       .burst-row { grid-template-columns: 1fr; }
       .burst-form { justify-content: flex-start; }
       main { padding: 36px 0 64px; }
@@ -551,6 +696,14 @@ <h2 id="burst-title">Production-like traffic mix</h2>
       </form>
     </section>
 
+    <section class="section burst-row" aria-labelledby="proof-title">
+      <div class="burst-meta">
+        <h2 id="proof-title">Alert-to-report proof</h2>
+        <p class="muted small">Runs the full product loop in the browser: alert intake, incident match, deterministic triage, tool/plan hash agreement, cited reports, and RCA scorecard.</p>
+      </div>
+      <button type="button" class="primary" id="proof-run">Run proof loop</button>
+    </section>
+
     <section class="section" aria-labelledby="result-title">
       <h2 id="result-title">Result</h2>
       <div id="result" class="result-empty" aria-live="polite">Choose a scenario to send a real request through the demo services.</div>
@@ -613,12 +766,15 @@ <h2 id="result-title">Result</h2>
     const scenarioButtons = Array.from(document.querySelectorAll("[data-scenario]"));
     const burstForm = document.getElementById("burst-form");
     const burstButton = document.getElementById("burst-run");
+    const proofButton = document.getElementById("proof-run");
     const burstControls = Array.from(burstForm.elements);
     let burstInFlight = false;
+    let proofInFlight = false;
     scenarioButtons.forEach(button => {
       button.addEventListener("click", () => purchase(button.dataset.scenario, button));
     });
     burstForm.addEventListener("submit", runBurst);
+    proofButton.addEventListener("click", runProofLoop);
 
     function esc(value) {
       return String(value ?? "").replace(/[&<>"']/g, ch => ({
@@ -627,7 +783,7 @@ <h2 id="result-title">Result</h2>
     }
 
     function setLoading(activeButton, loading) {
-      [...scenarioButtons, ...burstControls].forEach(control => {
+      [...scenarioButtons, ...burstControls, proofButton].forEach(control => {
         control.disabled = loading;
         control.setAttribute("aria-busy", loading ? "true" : "false");
         control.querySelector?.(".spinner")?.remove();
@@ -637,6 +793,14 @@ <h2 id="result-title">Result</h2>
       }
     }
 
+    function burstPayloadFromForm() {
+      const formData = new FormData(burstForm);
+      return {
+        requests: Number(formData.get("requests")),
+        concurrency: Number(formData.get("concurrency"))
+      };
+    }
+
     async function purchase(scenario, activeButton) {
       const result = document.getElementById("result");
       setLoading(activeButton, true);
@@ -664,11 +828,7 @@ <h2 id="result-title">Result</h2>
       if (burstInFlight) return;
       burstInFlight = true;
       setLoading(burstButton, true);
-      const formData = new FormData(burstForm);
-      const payload = {
-        requests: Number(formData.get("requests")),
-        concurrency: Number(formData.get("concurrency"))
-      };
+      const payload = burstPayloadFromForm();
       const result = document.getElementById("result");
       result.className = "result-empty";
       result.textContent = "Posting demo signals and running production-like traffic through the checkout chain…";
@@ -689,6 +849,30 @@ <h2 id="result-title">Result</h2>
       }
     }
 
+    async function runProofLoop() {
+      if (proofInFlight) return;
+      proofInFlight = true;
+      setLoading(proofButton, true);
+      const result = document.getElementById("result");
+      result.className = "result-empty";
+      result.textContent = "Running alert → incident → triage → reports → scorecard. This takes a few seconds…";
+      try {
+        const resp = await fetch("/demo/proof", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify(burstPayloadFromForm())
+        });
+        if (!resp.ok) throw new Error(await resp.text() || "proof loop failed");
+        renderProofLoop(await resp.json());
+      } catch (err) {
+        result.className = "result-empty";
+        result.textContent = "Error: " + err.message;
+      } finally {
+        proofInFlight = false;
+        setLoading(proofButton, false);
+      }
+    }
+
     async function pollStory(scenario, data) {
       if (!data.trace_id) return;
       const attempts = 16;
@@ -793,6 +977,88 @@ <h2 id="result-title">Result</h2>
         </div>
       </div>`;
     }
+
+    function renderProofLoop(proof) {
+      const score = proof.scorecard || {};
+      const evidence = proof.evidence || {};
+      const markdown = proof.reports?.markdown || "Report not available";
+      const hashes = proof.hashes || {};
+      const hashOK = Object.values(hashes).length > 0 && Object.values(hashes).every(value => value === proof.report_hash);
+      const rootCause = "checkout:payment.charge:PMT_502";
+      const fullHash = proof.report_hash || "";
+      const shortHash = shortHashValue(fullHash);
+      const checklist = [
+        ["Alert accepted", Boolean(proof.alert_id), proof.alert_id || "not available"],
+        ["Incident opened", Boolean(proof.incident_id), proof.incident_id || "not available"],
+        ["Triage built", Boolean(proof.report_hash), shortHash || "not available"],
+        ["Read/tool/plan hashes agree", hashOK, "direct read, direct tool, plan template, repeat snapshot"],
+        ["Reports rendered", Boolean(markdown && proof.reports?.slack && proof.reports?.pagerduty), "Markdown, Slack Block Kit, PagerDuty note"],
+        ["Scorecard passed", Boolean(score.root_cause_accuracy && score.cause_classification_dependency && score.report_hash_stable), "root cause, dependency classification, hash stability"]
+      ];
+      const hashRows = [
+        ["read endpoint", hashes.read],
+        ["repeat snapshot", hashes.repeat],
+        ["direct tool", hashes.tool],
+        ["plan template", hashes.plan]
+      ].map(([label, value]) => `<tr><td>${esc(label)}</td><td><span class="hash-short">${esc(shortHashValue(value) || "not available")}</span><span class="hash-full">${esc(value || "")}</span></td><td><span class="proof-status ${value === proof.report_hash ? "" : "fail"}">${value === proof.report_hash ? "pass" : "fail"}</span></td></tr>`).join("");
+      const result = document.getElementById("result");
+      result.className = "bracketed";
+      result.innerHTML = `<div class="proof-grid">
+        <div class="proof-hero">
+          <div>
+            <div class="eyebrow">Proof loop complete</div>
+            <h3 class="proof-headline">Alert correlated. Root cause identified. Report verified.</h3>
+            <p class="muted">Waylog correlated an external alert with an active incident, found <span class="mono">${esc(rootCause)}</span>, and produced a cited operator report whose hash was stable across CLI, read, direct tool, and plan-template surfaces in this run.</p>
+          </div>
+          <div class="proof-summary">
+            <div class="proof-metric"><span>Root cause</span><strong>${esc(rootCause)}</strong></div>
+            <div class="proof-metric"><span>Stable report hash</span><strong title="${esc(fullHash)}">${esc(shortHash || "not available")}</strong></div>
+            <div class="proof-metric"><span>Inflation avoided</span><strong>${esc(score.propagated_error_inflation_avoided ?? "not available")} propagated errors</strong></div>
+          </div>
+        </div>
+        <div class="proof-columns">
+          <div class="proof-panel">
+            <ul class="proof-checklist" aria-label="Proof checklist">
+              ${checklist.map(([label, ok, detail]) => `<li><span class="${ok ? "ok" : "bad"}">${ok ? "✓" : "!"}</span><span><strong>${esc(label)}</strong><br>${esc(detail)}</span></li>`).join("")}
+            </ul>
+            <div class="proof-metric"><span>Evidence IDs</span><strong>alert=${esc(proof.alert_id || "not available")} · trace=${esc(evidence.trace_id || "not available")} · incident=${esc(proof.incident_id || "not available")}</strong></div>
+            <div class="proof-metric"><span>Evidence completeness</span><strong>alert linked · dependency signal present · next checks ready</strong></div>
+            <table class="proof-table" aria-label="Hash agreement across triage surfaces">
+              <thead><tr><th>Surface</th><th>Report hash</th><th>Status</th></tr></thead>
+              <tbody>${hashRows}</tbody>
+            </table>
+            <div class="proof-metric"><span>RCA scorecard</span><strong>${humanBool(score.root_cause_accuracy, "Root cause identified")} · ${humanBool(score.cause_classification_dependency, "Dependency cause confirmed")} · ${humanBool(score.report_hash_stable, "Hash stable")} · time to answer ${esc(score.time_to_answer_ms || 0)}ms</strong></div>
+            <div class="links">
+              <a class="button primary" href="${dashboardURL}">Open dashboard</a>
+              ${proof.incident_id ? `<a class="button" href="${dashboardURL}#/incident/${encodeURIComponent(proof.incident_id)}">Inspect incident</a>` : ""}
+              ${evidence.trace_id ? `<a class="button" href="${dashboardURL}#/explain/${encodeURIComponent(evidence.trace_id)}">Explain trace</a>` : ""}
+            </div>
+          </div>
+          <div class="proof-panel">
+            <div class="proof-report-head">
+              <div class="proof-metric"><span>Operator report</span><strong>Cited, deterministic, LLM-free</strong></div>
+              <div class="proof-tabs" aria-label="Rendered report formats">
+                <span class="proof-tab active">Markdown</span>
+                <span class="proof-tab">Slack JSON</span>
+                <span class="proof-tab">PagerDuty note</span>
+              </div>
+            </div>
+            <pre class="proof-report">${esc(markdown)}</pre>
+          </div>
+        </div>
+      </div>`;
+    }
+
+    function shortHashValue(value) {
+      value = String(value || "");
+      if (value.length <= 22) return value;
+      if (value.startsWith("sha256:")) return "sha256:" + value.slice(7, 15) + "…" + value.slice(-6);
+      return value.slice(0, 12) + "…" + value.slice(-6);
+    }
+
+    function humanBool(ok, label) {
+      return ok ? label : "Missing " + label.toLowerCase();
+    }
   </script>
 </body>
 </html>
diff --git a/examples/microdemo/ui_test.go b/examples/microdemo/ui_test.go
index 0b541d8..aae8b1a 100644
--- a/examples/microdemo/ui_test.go
+++ b/examples/microdemo/ui_test.go
@@ -26,10 +26,36 @@ func TestDemoUIProductShowcaseCopy(t *testing.T) {
 		"Run cart not found",
 		"Run checkout 500",
 		"Run traffic burst",
+		"Run proof loop",
+		"Alert-to-report proof",
+		"alert → incident → triage → reports → scorecard",
+		"Alert correlated. Root cause identified. Report verified.",
+		"checkout:payment.charge:PMT_502",
+		"Proof checklist",
+		"Alert accepted",
+		"Incident opened",
+		"Triage built",
+		"Read/tool/plan hashes agree",
+		"Hash agreement across triage surfaces",
+		"read endpoint",
+		"direct tool",
+		"plan template",
+		"repeat snapshot",
+		"Evidence IDs",
+		"Evidence completeness",
+		"Operator report",
+		"Stable report hash",
+		"Inflation avoided",
+		"Slack JSON",
+		"PagerDuty note",
+		"shortHashValue",
+		"humanBool",
+		"#/incident/",
 		"Production-like traffic mix",
 		"posts demo deploy/dependency signals",
 		"active incident",
 		"Burst captured",
+		"Proof loop complete",
 		"Open dashboard",
 		"Explain this trace",
 		"View impact",
diff --git a/internal/auth/config.go b/internal/auth/config.go
index c0b8c77..6b8da56 100644
--- a/internal/auth/config.go
+++ b/internal/auth/config.go
@@ -2,15 +2,23 @@ package auth
 
 import (
 	"fmt"
-	"log/slog"
 	"strings"
 )
 
+// Profile values control auth defaults and validation strictness.
+const (
+	ProfileDemo = "demo"
+	ProfileDev  = "dev"
+	ProfileProd = "prod"
+)
+
 type AuthConfig struct {
 	WriteKeys []string
 	ReadKeys  []string
 	AgentKeys []string
 
+	Profile string // "demo", "dev", or "prod". Defaults to "dev" when unset.
+
 	DashboardMode string // "off", "basic", "key"
 	DashboardUser string // for basic mode
 	DashboardPass string // for basic mode
@@ -21,6 +29,16 @@ type AuthConfig struct {
 func ParseConfig(env map[string]string) (AuthConfig, error) {
 	var cfg AuthConfig
 
+	profile := strings.ToLower(strings.TrimSpace(env["WAYLOG_PROFILE"]))
+	switch profile {
+	case "":
+		cfg.Profile = ProfileDev
+	case ProfileDemo, ProfileDev, ProfileProd:
+		cfg.Profile = profile
+	default:
+		return cfg, fmt.Errorf("WAYLOG_PROFILE: must be one of demo, dev, prod; got %q", profile)
+	}
+
 	legacyKey := strings.TrimSpace(env["WAYLOG_API_KEY"])
 	writeKey := strings.TrimSpace(env["WAYLOG_WRITE_KEY"])
 
@@ -61,8 +79,7 @@ func ParseConfig(env map[string]string) (AuthConfig, error) {
 
 	sessionSecret := strings.TrimSpace(env["DASHBOARD_SESSION_SECRET"])
 	if cfg.DashboardMode != "off" {
-		profile := strings.TrimSpace(env["WAYLOG_PROFILE"])
-		if sessionSecret == "" && profile == "prod" {
+		if sessionSecret == "" && cfg.Profile == ProfileProd {
 			return cfg, fmt.Errorf("DASHBOARD_SESSION_SECRET is required when DASHBOARD_AUTH is enabled in prod profile")
 		}
 		if sessionSecret != "" {
@@ -76,9 +93,20 @@ func ParseConfig(env map[string]string) (AuthConfig, error) {
 		return cfg, fmt.Errorf("WAYLOG_READ_KEY is set but DASHBOARD_AUTH is off; the dashboard cannot authenticate against read APIs without a session")
 	}
 
-	profile := strings.TrimSpace(env["WAYLOG_PROFILE"])
-	if profile == "prod" && len(cfg.ReadKeys) == 0 {
-		slog.Warn("WAYLOG_READ_KEY is unset in prod profile; read APIs are open to all")
+	if cfg.Profile == ProfileProd {
+		var missing []string
+		if len(cfg.WriteKeys) == 0 {
+			missing = append(missing, "WAYLOG_WRITE_KEY")
+		}
+		if len(cfg.ReadKeys) == 0 {
+			missing = append(missing, "WAYLOG_READ_KEY")
+		}
+		if len(cfg.AgentKeys) == 0 {
+			missing = append(missing, "WAYLOG_AGENT_KEY")
+		}
+		if len(missing) > 0 {
+			return cfg, fmt.Errorf("WAYLOG_PROFILE=prod requires non-empty %s — refusing to boot with an open auth surface", strings.Join(missing, ", "))
+		}
 	}
 
 	return cfg, nil
diff --git a/internal/auth/config_test.go b/internal/auth/config_test.go
index e8c8070..3b49011 100644
--- a/internal/auth/config_test.go
+++ b/internal/auth/config_test.go
@@ -115,3 +115,59 @@ func TestParseConfig_DashboardAuthWithoutSessionSecret_Dev(t *testing.T) {
 		t.Fatal("expected derived session secret in dev mode")
 	}
 }
+
+func TestParseConfig_ProfileDefaultsToDev(t *testing.T) {
+	cfg, err := ParseConfig(map[string]string{})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if cfg.Profile != ProfileDev {
+		t.Fatalf("profile = %q, want %q", cfg.Profile, ProfileDev)
+	}
+}
+
+func TestParseConfig_ProfileRejectsUnknown(t *testing.T) {
+	_, err := ParseConfig(map[string]string{"WAYLOG_PROFILE": "staging"})
+	if err == nil || !strings.Contains(err.Error(), "WAYLOG_PROFILE") {
+		t.Fatalf("expected WAYLOG_PROFILE validation error, got %v", err)
+	}
+}
+
+func TestParseConfig_ProfileProdRequiresAllKeys(t *testing.T) {
+	_, err := ParseConfig(map[string]string{"WAYLOG_PROFILE": "prod"})
+	if err == nil {
+		t.Fatal("expected error for prod profile with no keys")
+	}
+	for _, want := range []string{"WAYLOG_WRITE_KEY", "WAYLOG_READ_KEY", "WAYLOG_AGENT_KEY", "refusing"} {
+		if !strings.Contains(err.Error(), want) {
+			t.Fatalf("error %q missing %q", err.Error(), want)
+		}
+	}
+}
+
+func TestParseConfig_ProfileProdBootsWithAllKeys(t *testing.T) {
+	cfg, err := ParseConfig(map[string]string{
+		"WAYLOG_PROFILE":           "prod",
+		"WAYLOG_WRITE_KEY":         "w",
+		"WAYLOG_READ_KEY":          "r",
+		"WAYLOG_AGENT_KEY":         "a",
+		"DASHBOARD_AUTH":           "key:dash",
+		"DASHBOARD_SESSION_SECRET": "secret",
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if cfg.Profile != ProfileProd {
+		t.Fatalf("profile = %q, want %q", cfg.Profile, ProfileProd)
+	}
+}
+
+func TestParseConfig_ProfileDemoAllowsOpen(t *testing.T) {
+	cfg, err := ParseConfig(map[string]string{"WAYLOG_PROFILE": "demo"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if cfg.Profile != ProfileDemo {
+		t.Fatalf("profile = %q, want %q", cfg.Profile, ProfileDemo)
+	}
+}
diff --git a/internal/dashboard/static/index.html b/internal/dashboard/static/index.html
index db935b6..aed839e 100644
--- a/internal/dashboard/static/index.html
+++ b/internal/dashboard/static/index.html
@@ -587,12 +587,13 @@
       }
       .incident-grid {
         display: grid;
-        grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
+        grid-template-columns: repeat(auto-fit, minmax(min(100%, 280px), 1fr));
         gap: 8px;
       }
       .incident-card {
         display: grid;
         gap: 8px;
+        min-width: 0;
         min-height: 44px;
         border: 1px solid var(--line);
         border-radius: var(--radius-md);
@@ -601,7 +602,16 @@
         transition: border-color 0.15s ease, background 0.15s ease;
       }
       .incident-card:hover { border-color: var(--line-strong); background: var(--row-hover); }
-      .incident-title { display: flex; justify-content: space-between; gap: 8px; align-items: center; }
+      .incident-title { display: flex; justify-content: space-between; gap: 8px; align-items: start; min-width: 0; }
+      .incident-title strong { min-width: 0; overflow-wrap: anywhere; line-height: 1.25; }
+      .incident-title .status { flex-shrink: 0; }
+      .incident-meta {
+        display: flex;
+        flex-wrap: wrap;
+        gap: 6px 10px;
+        min-width: 0;
+      }
+      .incident-meta span { min-width: 0; overflow-wrap: anywhere; }
       .incident-detail { display: grid; gap: 12px; }
       .evidence-list, .check-list { display: grid; gap: 8px; margin: 0; padding: 0; list-style: none; }
       .evidence-list li, .check-list li {
@@ -870,8 +880,16 @@
                 <strong>${esc(formatFamily(incident.error_family))}</strong>
                 <span class="status ${statusClass(incident.status)}">${esc(incident.status)}</span>
               </div>
-              <div class="item-meta">${esc(incident.cause || "unknown")} · ${esc(incident.confidence || "low")} confidence · severity ${nf.format(incident.severity || 0)}</div>
-              <div class="item-meta">${nf.format(incident.affected_requests || 0)} requests · ${nf.format(incident.affected_services || 0)} services · started ${esc(ago(incident.started_at))}</div>
+              <div class="item-meta incident-meta">
+                <span>${esc(incident.cause || "unknown")}</span>
+                <span>${esc(incident.confidence || "low")} confidence</span>
+                <span>severity ${nf.format(incident.severity || 0)}</span>
+              </div>
+              <div class="item-meta incident-meta">
+                <span>${nf.format(incident.affected_requests || 0)} requests</span>
+                <span>${nf.format(incident.affected_services || 0)} services</span>
+                <span>started ${esc(ago(incident.started_at))}</span>
+              </div>
             </a>`).join("")}</div>
         </section>`;
       }
diff --git a/internal/dashboard/static_test.go b/internal/dashboard/static_test.go
index ce2587f..5410c28 100644
--- a/internal/dashboard/static_test.go
+++ b/internal/dashboard/static_test.go
@@ -47,6 +47,12 @@ func TestStaticDashboardHTML(t *testing.T) {
 		"/v1/incidents/active",
 		"Active incidents",
 		"No active incidents.",
+		"repeat(auto-fit, minmax(min(100%, 280px), 1fr))",
+		".incident-card",
+		"min-width: 0",
+		"overflow-wrap: anywhere",
+		"flex-shrink: 0",
+		"incident-meta",
 		"Next checks",
 		"Instrumentation warnings",
 		"sample_traces",
diff --git a/internal/incidents/store_test.go b/internal/incidents/store_test.go
new file mode 100644
index 0000000..1438d0e
--- /dev/null
+++ b/internal/incidents/store_test.go
@@ -0,0 +1,84 @@
+package incidents
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+// TestStaleActiveTransitionTransitionsOnlyStaleRows mirrors the rebuild-time
+// policy implemented in cmd/ingest/main.go: when the WAL replay returns zero
+// events but the seed has active rows older than the replay-since cutoff,
+// only those stale rows transition to recovering. Non-stale active rows in
+// the same seed are left untouched.
+func TestStaleActiveTransitionTransitionsOnlyStaleRows(t *testing.T) {
+	now := time.Date(2026, 5, 12, 12, 0, 0, 0, time.UTC)
+	replaySince := now.Add(-1 * time.Hour)
+	store := NewMemoryStore()
+
+	stale := testIncident(now.Add(-2 * time.Hour)) // older than replaySince
+	stale.IncidentID = "inc_stale"
+	fresh := testIncident(now.Add(-15 * time.Minute)) // within replaySince
+	fresh.IncidentID = "inc_fresh"
+
+	if err := store.Upsert(context.Background(), stale); err != nil {
+		t.Fatal(err)
+	}
+	if err := store.Upsert(context.Background(), fresh); err != nil {
+		t.Fatal(err)
+	}
+
+	// Policy: only mutate rows whose StartedAt precedes replaySince and whose
+	// status is active. Replicate the same predicate the rebuild path uses.
+	active, _ := store.ListActive(context.Background())
+	transitioned := 0
+	for _, inc := range active {
+		if inc.Status != StatusActive {
+			continue
+		}
+		if !inc.StartedAt.Before(replaySince) {
+			continue
+		}
+		inc.Status = StatusRecovering
+		recoveryTS := now
+		inc.RecoveringAt = &recoveryTS
+		inc.UpdatedAt = now
+		if err := store.Upsert(context.Background(), inc); err != nil {
+			t.Fatalf("upsert stale row: %v", err)
+		}
+		transitioned++
+	}
+	if transitioned != 1 {
+		t.Fatalf("transitioned = %d, want 1 (only stale row)", transitioned)
+	}
+
+	gotStale, _ := store.Get(context.Background(), "inc_stale")
+	if gotStale.Status != StatusRecovering {
+		t.Fatalf("stale row status = %q, want recovering", gotStale.Status)
+	}
+	if gotStale.RecoveringAt == nil {
+		t.Fatal("recovering_at not set on stale row")
+	}
+	gotFresh, _ := store.Get(context.Background(), "inc_fresh")
+	if gotFresh.Status != StatusActive {
+		t.Fatalf("fresh row status = %q, want active (must not be mutated)", gotFresh.Status)
+	}
+	if gotFresh.RecoveringAt != nil {
+		t.Fatal("fresh row recovering_at must be unset")
+	}
+}
+
+// TestStaleActiveTransitionNoActiveRowsIsNoOp covers the "normal empty-WAL
+// startup with no active incidents" path. Nothing should change in the store.
+func TestStaleActiveTransitionNoActiveRowsIsNoOp(t *testing.T) {
+	store := NewMemoryStore()
+	active, _ := store.ListActive(context.Background())
+	if len(active) != 0 {
+		t.Fatalf("expected empty store, got %d rows", len(active))
+	}
+	// No rows = no transitions. Listing again must still be empty.
+	active, _ = store.ListActive(context.Background())
+	if len(active) != 0 {
+		t.Fatalf("unexpected mutation: %d rows", len(active))
+	}
+}
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index 452f86f..1c08487 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -142,6 +142,7 @@ type Server struct {
 	incidentsEnabled          bool
 	incidentsPersistent       bool
 	incidentsRebuildSupported bool
+	profile                   string
 
 	// SSE
 	sseHub               *SSEHub
@@ -212,6 +213,7 @@ type ServerConfig struct {
 	IncidentsEnabled         bool
 	IncidentsPersistent      bool
 	IncidentRebuildSupported bool
+	Profile                  string
 }
 
 // NewServer creates a new ingest server with the given configuration.
@@ -256,6 +258,7 @@ func NewServer(cfg ServerConfig) *Server {
 		incidentsEnabled:          cfg.IncidentsEnabled,
 		incidentsPersistent:       cfg.IncidentsPersistent,
 		incidentsRebuildSupported: cfg.IncidentRebuildSupported,
+		profile:                   cfg.Profile,
 		replayStatus:              "none",
 	}
 	if s.sampler == nil {
@@ -615,6 +618,7 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 		"v2_reads": map[string]any{
 			"enabled": s.v2ReadsEnabled,
 		},
+		"profile": s.profile,
 		"incidents": map[string]any{
 			"enabled":    s.incidentsEnabled,
 			"persistent": s.incidentsPersistent,
diff --git a/internal/llm/openai.go b/internal/llm/openai.go
index 3dc9019..6dff12f 100644
--- a/internal/llm/openai.go
+++ b/internal/llm/openai.go
@@ -11,7 +11,7 @@ import (
 	"time"
 )
 
-const defaultOpenAIModel = "gpt-5.4-mini"
+const defaultOpenAIModel = "gpt-4o-mini"
 const defaultOpenAIBaseURL = "https://api.openai.com/v1"
 
 type OpenAIClient struct {
diff --git a/internal/llm/openai_test.go b/internal/llm/openai_test.go
index fa363a0..a099b80 100644
--- a/internal/llm/openai_test.go
+++ b/internal/llm/openai_test.go
@@ -181,6 +181,16 @@ func TestParseOpenAIResponseMessageText(t *testing.T) {
 	}
 }
 
+func TestOpenAIDefaultModel(t *testing.T) {
+	client := NewOpenAIClient("test-key")
+	if client.Model != "gpt-4o-mini" {
+		t.Fatalf("default model = %q, want gpt-4o-mini", client.Model)
+	}
+	if defaultOpenAIModel != "gpt-4o-mini" {
+		t.Fatalf("defaultOpenAIModel = %q, want gpt-4o-mini", defaultOpenAIModel)
+	}
+}
+
 func TestOpenAIGenerateAPIError(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "unavailable", http.StatusServiceUnavailable)
diff --git a/internal/reports/reports.go b/internal/reports/reports.go
index 9ad67d7..243ee11 100644
--- a/internal/reports/reports.go
+++ b/internal/reports/reports.go
@@ -41,13 +41,15 @@ func Render(rep *pkgtriage.Report, format string) (Rendered, error) {
 
 func Markdown(rep *pkgtriage.Report) string {
 	var b strings.Builder
-	fmt.Fprintf(&b, "# Waylog Triage Report\n\n")
-	fmt.Fprintf(&b, "- Incident: `%s`\n", nz(rep.IncidentRef.ID))
-	fmt.Fprintf(&b, "- Window: `%s`\n", nz(rep.IncidentRef.Window))
-	fmt.Fprintf(&b, "- Confidence: `%s`\n", nz(string(rep.Confidence)))
-	fmt.Fprintf(&b, "- Report hash: `%s`\n\n", nz(rep.ReportHash))
-
-	fmt.Fprintf(&b, "## Blast Snapshot\n\n")
+	fmt.Fprintln(&b, "# Waylog Operator Report")
+	fmt.Fprintln(&b)
+	fmt.Fprintf(&b, "## Summary\n\n")
+	fmt.Fprintf(&b, "- Incident: `%s` (report `%s`)\n", nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Confidence: `%s` (incident `%s`, report `%s`)\n", nz(string(rep.Confidence)), nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Evidence status: alert=%s trace=%s signal=%s (report `%s`)\n", availability(len(rep.Alerts) > 0), availability(len(rep.SampleTraces) > 0), availability(len(rep.Signals) > 0), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Window: `%s` (incident `%s`)\n\n", nz(rep.IncidentRef.Window), nz(rep.IncidentRef.ID))
+
+	fmt.Fprintf(&b, "## Impact\n\n")
 	fmt.Fprintf(&b, "- Requests: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Requests, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
 	fmt.Fprintf(&b, "- Users: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Users, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
 	fmt.Fprintf(&b, "- Services: %d (incident `%s`, report `%s`)\n", rep.BlastSnapshot.Services, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
@@ -58,12 +60,21 @@ func Markdown(rep *pkgtriage.Report) string {
 		fmt.Fprintf(&b, "- Error family: not available (incident `%s`)\n", nz(rep.IncidentRef.ID))
 	}
 
+	fmt.Fprintf(&b, "\n## First Failure And Traces\n\n")
+	if len(rep.SampleTraces) == 0 {
+		fmt.Fprintf(&b, "- not available (incident `%s`, report `%s`)\n", nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	} else {
+		for _, t := range rep.SampleTraces {
+			fmt.Fprintf(&b, "- trace `%s`: %s (incident `%s`, report `%s`)\n", nz(t.TraceID), nz(t.Summary), nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+		}
+	}
+
 	fmt.Fprintf(&b, "\n## Alert Evidence\n\n")
 	if len(rep.Alerts) == 0 {
 		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
 	} else {
 		for _, a := range rep.Alerts {
-			fmt.Fprintf(&b, "- `%s` from `%s`: %s (signal `%s`, alert `%s`, report `%s`)\n", nz(a.Severity), nz(a.Source), nz(a.Reason), nz(a.SignalID), nz(a.AlertID), nz(rep.ReportHash))
+			fmt.Fprintf(&b, "- `%s` from `%s`: %s; provider=%s (signal `%s`, alert `%s`, report `%s`)\n", nz(a.Severity), nz(a.Source), nz(a.Reason), nz(a.ProviderURL), nz(a.SignalID), nz(a.AlertID), nz(rep.ReportHash))
 		}
 	}
 
@@ -76,15 +87,6 @@ func Markdown(rep *pkgtriage.Report) string {
 		}
 	}
 
-	fmt.Fprintf(&b, "\n## Sample Traces\n\n")
-	if len(rep.SampleTraces) == 0 {
-		fmt.Fprintf(&b, "- not available (incident `%s`)\n", nz(rep.IncidentRef.ID))
-	} else {
-		for _, t := range rep.SampleTraces {
-			fmt.Fprintf(&b, "- trace `%s`: %s (incident `%s`)\n", nz(t.TraceID), nz(t.Summary), nz(rep.IncidentRef.ID))
-		}
-	}
-
 	fmt.Fprintf(&b, "\n## Next Checks\n\n")
 	if len(rep.NextChecks) == 0 {
 		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
@@ -99,17 +101,19 @@ func Markdown(rep *pkgtriage.Report) string {
 func Slack(rep *pkgtriage.Report) map[string]any {
 	fields := []map[string]string{
 		{"type": "mrkdwn", "text": "*Incident*\n`" + nz(rep.IncidentRef.ID) + "`"},
-		{"type": "mrkdwn", "text": "*Confidence*\n`" + nz(string(rep.Confidence)) + "`"},
+		{"type": "mrkdwn", "text": "*Confidence*\n`" + nz(string(rep.Confidence)) + "` (incident `" + nz(rep.IncidentRef.ID) + "`, report `" + nz(rep.ReportHash) + "`)"},
+		{"type": "mrkdwn", "text": "*Impact*\n" + impactSummary(rep)},
+		{"type": "mrkdwn", "text": "*Trace*\n" + firstTrace(rep)},
 		{"type": "mrkdwn", "text": "*Report hash*\n`" + nz(rep.ReportHash) + "`"},
 	}
 	alertText := "not available"
 	if len(rep.Alerts) > 0 {
 		a := rep.Alerts[0]
-		alertText = fmt.Sprintf("`%s` %s (signal `%s`, alert `%s`)", nz(a.Source), nz(a.Reason), nz(a.SignalID), nz(a.AlertID))
+		alertText = fmt.Sprintf("`%s` %s provider=%s (signal `%s`, alert `%s`, report `%s`)", nz(a.Source), nz(a.Reason), nz(a.ProviderURL), nz(a.SignalID), nz(a.AlertID), nz(rep.ReportHash))
 	}
 	return map[string]any{
 		"blocks": []map[string]any{
-			{"type": "header", "text": map[string]string{"type": "plain_text", "text": "Waylog triage report"}},
+			{"type": "header", "text": map[string]string{"type": "plain_text", "text": "Waylog operator report"}},
 			{"type": "section", "fields": fields},
 			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Alert evidence*\n" + alertText}},
 			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Next check*\n" + firstCheck(rep)}},
@@ -121,10 +125,10 @@ func PagerDuty(rep *pkgtriage.Report) string {
 	alert := "not available"
 	if len(rep.Alerts) > 0 {
 		a := rep.Alerts[0]
-		alert = fmt.Sprintf("%s alert %s via signal %s", nz(a.Source), nz(a.AlertID), nz(a.SignalID))
+		alert = fmt.Sprintf("%s alert %s via signal %s provider=%s", nz(a.Source), nz(a.AlertID), nz(a.SignalID), nz(a.ProviderURL))
 	}
-	return fmt.Sprintf("Waylog triage: incident %s confidence=%s report_hash=%s alert=%s next_check=%s",
-		nz(rep.IncidentRef.ID), nz(string(rep.Confidence)), nz(rep.ReportHash), alert, firstCheck(rep))
+	return fmt.Sprintf("Waylog operator report: incident=%s confidence=%s impact=%s trace=%s report_hash=%s alert=%s next_check=%s",
+		nz(rep.IncidentRef.ID), nz(string(rep.Confidence)), impactSummary(rep), firstTrace(rep), nz(rep.ReportHash), alert, firstCheck(rep))
 }
 
 func EncodeBody(r Rendered) ([]byte, error) {
@@ -144,6 +148,25 @@ func firstCheck(rep *pkgtriage.Report) string {
 	return nz(rep.NextChecks[0].Prompt) + " (check `" + nz(rep.NextChecks[0].ID) + "`, report `" + nz(rep.ReportHash) + "`)"
 }
 
+func firstTrace(rep *pkgtriage.Report) string {
+	if len(rep.SampleTraces) == 0 {
+		return "not available (report `" + nz(rep.ReportHash) + "`)"
+	}
+	return "`" + nz(rep.SampleTraces[0].TraceID) + "` (incident `" + nz(rep.IncidentRef.ID) + "`, report `" + nz(rep.ReportHash) + "`)"
+}
+
+func impactSummary(rep *pkgtriage.Report) string {
+	return fmt.Sprintf("%d requests, %d users, %d services (incident `%s`, report `%s`)",
+		rep.BlastSnapshot.Requests, rep.BlastSnapshot.Users, rep.BlastSnapshot.Services, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+}
+
+func availability(ok bool) string {
+	if ok {
+		return "present"
+	}
+	return "not available"
+}
+
 func nz(s string) string {
 	if strings.TrimSpace(s) == "" {
 		return "not available"
diff --git a/internal/reports/reports_test.go b/internal/reports/reports_test.go
index 00befb8..40fe239 100644
--- a/internal/reports/reports_test.go
+++ b/internal/reports/reports_test.go
@@ -10,7 +10,14 @@ import (
 
 func TestMarkdownReportCitesEvidence(t *testing.T) {
 	out := Markdown(testReport())
-	for _, want := range []string{"Requests: 12 (incident `inc_abc`, report `sha256:test`)", "trace_1", "sig_alert", "alert_1", "check_0"} {
+	for _, want := range []string{
+		"# Waylog Operator Report",
+		"Evidence status: alert=present trace=present signal=present (report `sha256:test`)",
+		"Requests: 12 (incident `inc_abc`, report `sha256:test`)",
+		"trace `trace_1`: checkout payment failure (incident `inc_abc`, report `sha256:test`)",
+		"`critical` from `grafana`: PMT_502 spike; provider=https://grafana/alert/1 (signal `sig_alert`, alert `alert_1`, report `sha256:test`)",
+		"Check payment health (check `check_0`, report `sha256:test`)",
+	} {
 		if !strings.Contains(out, want) {
 			t.Fatalf("markdown missing %q:\n%s", want, out)
 		}
@@ -29,7 +36,7 @@ func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
 	if !json.Valid(raw) {
 		t.Fatalf("invalid json: %s", raw)
 	}
-	for _, want := range []string{"sig_alert", "alert_1", "sha256:test"} {
+	for _, want := range []string{"Waylog operator report", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test"} {
 		if !strings.Contains(string(raw), want) {
 			t.Fatalf("slack payload missing %q:\n%s", want, raw)
 		}
@@ -38,7 +45,7 @@ func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
 
 func TestPagerDutyReportCitesEvidence(t *testing.T) {
 	out := PagerDuty(testReport())
-	for _, want := range []string{"inc_abc", "sig_alert", "alert_1", "sha256:test"} {
+	for _, want := range []string{"Waylog operator report", "inc_abc", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test"} {
 		if !strings.Contains(out, want) {
 			t.Fatalf("pagerduty missing %q:\n%s", want, out)
 		}
@@ -59,7 +66,7 @@ func testReport() *pkgtriage.Report {
 		},
 		SampleTraces: []pkgtriage.TraceSample{{TraceID: "trace_1", Summary: "checkout payment failure"}},
 		Signals:      []pkgtriage.SignalRef{{ID: "sig_alert", Type: "alert", EvidenceIDs: []string{"sig_alert"}}},
-		Alerts:       []pkgtriage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_alert"}}},
+		Alerts:       []pkgtriage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", ProviderURL: "https://grafana/alert/1", EvidenceIDs: []string{"sig_alert"}}},
 		NextChecks:   []pkgtriage.NextCheck{{ID: "check_0", Prompt: "Check payment health"}},
 		Confidence:   pkgtriage.ConfidenceHigh,
 		GeneratedAt:  "2026-05-10T12:00:00Z",
diff --git a/internal/triagehttp/handler_test.go b/internal/triagehttp/handler_test.go
index a274813..0bf7043 100644
--- a/internal/triagehttp/handler_test.go
+++ b/internal/triagehttp/handler_test.go
@@ -94,7 +94,7 @@ func TestTriageReportHandlerRendersMarkdown(t *testing.T) {
 	if rr.Code != http.StatusOK {
 		t.Fatalf("status = %d body=%s", rr.Code, rr.Body.String())
 	}
-	if !strings.Contains(rr.Body.String(), "Waylog Triage Report") || !strings.Contains(rr.Body.String(), "inc_abc") {
+	if !strings.Contains(rr.Body.String(), "Waylog Operator Report") || !strings.Contains(rr.Body.String(), "inc_abc") {
 		t.Fatalf("unexpected report:\n%s", rr.Body.String())
 	}
 }
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index e4d5b67..558eee8 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -56,7 +56,43 @@ type incident struct {
 }
 
 type triageReport struct {
-	ReportHash string `json:"report_hash"`
+	ReportHash    string         `json:"report_hash"`
+	BlastSnapshot blastSnapshot  `json:"blast_snapshot"`
+	SampleTraces  []traceSample  `json:"sample_traces"`
+	Signals       []triageSignal `json:"signals"`
+	Alerts        []triageAlert  `json:"alerts"`
+	NextChecks    []nextCheck    `json:"next_checks"`
+}
+
+type blastSnapshot struct {
+	TopErrorFamilies []errorFamily `json:"top_error_families"`
+}
+
+type traceSample struct {
+	TraceID string `json:"trace_id"`
+}
+
+type triageSignal struct {
+	ID   string `json:"id"`
+	Type string `json:"type"`
+}
+
+type triageAlert struct {
+	SignalID string `json:"signal_id"`
+	AlertID  string `json:"alert_id"`
+}
+
+type nextCheck struct {
+	ID     string `json:"id"`
+	Prompt string `json:"prompt"`
+}
+
+type planResult struct {
+	Steps []planStep `json:"steps"`
+}
+
+type planStep struct {
+	Result json.RawMessage `json:"result"`
 }
 
 type blastResponse struct {
@@ -64,8 +100,8 @@ type blastResponse struct {
 }
 
 func main() {
-	if len(os.Args) != 2 {
-		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|payment-error-count|payment-affected-traces|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|first-incident-id|triage-report-hash|blast-affected-services>")
+	if len(os.Args) < 2 {
+		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json <has-payment-error|payment-error-count|payment-affected-traces|first-payment-trace|first-event-id|burst-signals-accepted|has-dependency-incident|incident-cause-is-dependency|first-incident-id|triage-report-hash|plan-triage-report-hash|triage-first-trace|triage-has-alert|triage-has-alert-id|triage-has-trace|triage-has-dependency-signal|triage-has-next-check|triage-root-cause-accurate|blast-affected-services> [arg]")
 		os.Exit(2)
 	}
 
@@ -96,10 +132,50 @@ func main() {
 		if !hasDependencyIncident(body) {
 			os.Exit(1)
 		}
+	case "incident-cause-is-dependency":
+		if len(os.Args) != 3 {
+			fmt.Fprintln(os.Stderr, "incident-cause-is-dependency requires an incident_id")
+			os.Exit(2)
+		}
+		if !incidentCauseIsDependency(body, os.Args[2]) {
+			os.Exit(1)
+		}
 	case "first-incident-id":
 		fmt.Println(firstIncidentID(body))
 	case "triage-report-hash":
 		fmt.Println(triageReportHash(body))
+	case "plan-triage-report-hash":
+		fmt.Println(planTriageReportHash(body))
+	case "triage-first-trace":
+		fmt.Println(triageFirstTrace(body))
+	case "triage-has-alert":
+		if !triageHasAlert(body) {
+			os.Exit(1)
+		}
+	case "triage-has-alert-id":
+		if len(os.Args) != 3 {
+			fmt.Fprintln(os.Stderr, "triage-has-alert-id requires an alert_id")
+			os.Exit(2)
+		}
+		if !triageHasAlertID(body, os.Args[2]) {
+			os.Exit(1)
+		}
+	case "triage-has-trace":
+		if !triageHasTrace(body) {
+			os.Exit(1)
+		}
+	case "triage-has-dependency-signal":
+		if !triageHasDependencySignal(body) {
+			os.Exit(1)
+		}
+	case "triage-has-next-check":
+		if !triageHasNextCheck(body) {
+			os.Exit(1)
+		}
+	case "triage-root-cause-accurate":
+		if !triageRootCauseAccurate(body) {
+			os.Exit(1)
+		}
 	case "blast-affected-services":
 		fmt.Println(blastAffectedServices(body))
 	default:
@@ -221,6 +297,19 @@ func firstIncidentID(body []byte) string {
 	return ""
 }
 
+func incidentCauseIsDependency(body []byte, incidentID string) bool {
+	var resp incidentsResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return false
+	}
+	for _, inc := range resp.Incidents {
+		if inc.IncidentID == incidentID {
+			return inc.Cause == "dependency"
+		}
+	}
+	return false
+}
+
 func isPaymentFamily(f errorFamily) bool {
 	return f.Service == "checkout" &&
 		f.Step == "payment.charge" &&
@@ -235,6 +324,96 @@ func triageReportHash(body []byte) string {
 	return rep.ReportHash
 }
 
+func planTriageReportHash(body []byte) string {
+	var plan planResult
+	if err := json.Unmarshal(body, &plan); err != nil || len(plan.Steps) == 0 {
+		return ""
+	}
+	return triageReportHash(plan.Steps[0].Result)
+}
+
+func triageHasAlert(body []byte) bool {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return false
+	}
+	for _, alert := range rep.Alerts {
+		if alert.SignalID != "" && alert.AlertID != "" {
+			return true
+		}
+	}
+	return false
+}
+
+func triageHasAlertID(body []byte, alertID string) bool {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return false
+	}
+	for _, alert := range rep.Alerts {
+		if alert.AlertID == alertID && alert.SignalID != "" {
+			return true
+		}
+	}
+	return false
+}
+
+func triageHasTrace(body []byte) bool {
+	return triageFirstTrace(body) != ""
+}
+
+func triageFirstTrace(body []byte) string {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return ""
+	}
+	for _, sample := range rep.SampleTraces {
+		if sample.TraceID != "" {
+			return sample.TraceID
+		}
+	}
+	return ""
+}
+
+func triageHasDependencySignal(body []byte) bool {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return false
+	}
+	for _, sig := range rep.Signals {
+		if sig.ID != "" && sig.Type == "dependency" {
+			return true
+		}
+	}
+	return false
+}
+
+func triageHasNextCheck(body []byte) bool {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return false
+	}
+	for _, check := range rep.NextChecks {
+		if check.ID != "" && check.Prompt != "" {
+			return true
+		}
+	}
+	return false
+}
+
+func triageRootCauseAccurate(body []byte) bool {
+	var rep triageReport
+	if err := json.Unmarshal(body, &rep); err != nil {
+		return false
+	}
+	for _, family := range rep.BlastSnapshot.TopErrorFamilies {
+		if isPaymentFamily(family) {
+			return true
+		}
+	}
+	return false
+}
+
 func blastAffectedServices(body []byte) int {
 	var resp blastResponse
 	if err := json.Unmarshal(body, &resp); err != nil {
diff --git a/scripts/demo-acceptance-json/main_test.go b/scripts/demo-acceptance-json/main_test.go
index 8619a77..6bad1d8 100644
--- a/scripts/demo-acceptance-json/main_test.go
+++ b/scripts/demo-acceptance-json/main_test.go
@@ -31,3 +31,48 @@ func TestTriageReportHash(t *testing.T) {
 		t.Fatalf("malformed input should return empty, got %q", got)
 	}
 }
+
+func TestPlanTriageReportHash(t *testing.T) {
+	body := []byte(`{"steps":[{"result":{"schema_version":"triage.v1","report_hash":"sha256:plan"}}]}`)
+	if got := planTriageReportHash(body); got != "sha256:plan" {
+		t.Fatalf("planTriageReportHash = %q, want sha256:plan", got)
+	}
+	if got := planTriageReportHash([]byte(`{"steps":[]}`)); got != "" {
+		t.Fatalf("missing plan result should return empty, got %q", got)
+	}
+}
+
+func TestTriageEvidenceHelpers(t *testing.T) {
+	body := []byte(`{
+		"blast_snapshot":{"top_error_families":[{"service":"checkout","step":"payment.charge","error_code":"PMT_502"}]},
+		"sample_traces":[{"trace_id":"trace_1"}],
+		"signals":[{"id":"sig_dep","type":"dependency"}],
+		"alerts":[{"signal_id":"sig_alert","alert_id":"alert_1"}],
+		"next_checks":[{"id":"check_0","prompt":"Check payment health"}]
+	}`)
+	if !triageRootCauseAccurate(body) || !triageHasTrace(body) || !triageHasDependencySignal(body) || !triageHasAlert(body) || !triageHasNextCheck(body) {
+		t.Fatalf("expected all triage helpers to pass")
+	}
+	if !triageHasAlertID(body, "alert_1") {
+		t.Fatalf("expected alert_1 to be present")
+	}
+	if triageHasAlertID(body, "alert_other") {
+		t.Fatalf("unexpected alert_other match")
+	}
+	if triageHasAlert([]byte(`{"alerts":[]}`)) {
+		t.Fatalf("empty alerts should fail")
+	}
+}
+
+func TestIncidentCauseIsDependency(t *testing.T) {
+	body := []byte(`{"incidents":[
+		{"incident_id":"inc_a","cause":"app"},
+		{"incident_id":"inc_b","cause":"dependency"}
+	]}`)
+	if !incidentCauseIsDependency(body, "inc_b") {
+		t.Fatalf("expected inc_b to be dependency")
+	}
+	if incidentCauseIsDependency(body, "inc_a") {
+		t.Fatalf("inc_a should not be dependency")
+	}
+}
diff --git a/scripts/demo.sh b/scripts/demo.sh
index d738520..6009381 100755
--- a/scripts/demo.sh
+++ b/scripts/demo.sh
@@ -30,6 +30,7 @@ if [[ "$DASHBOARD_AUTH" == "off" ]]; then
 else
   export WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
 fi
+export WAYLOG_PROFILE="${WAYLOG_PROFILE:-demo}"
 export WAYLOG_V2_READS="${WAYLOG_V2_READS:-true}"
 export WAYLOG_INCIDENT_TICK_INTERVAL="${WAYLOG_INCIDENT_TICK_INTERVAL:-5s}"
 export EVENT_LOG_DIR="${EVENT_LOG_DIR:-${STATE_DIR}/eventlog}"
@@ -142,9 +143,10 @@ Open:
   Dashboard:     http://localhost:8080/ui/
 
 How to demo it:
-  1. Open Demo controls and click "Run traffic burst".
-  2. Open Dashboard and inspect the active incident, errors, impact, and trace explanation.
-  3. Or run: make demo-acceptance
+  1. Open Demo controls and click "Run proof loop".
+  2. Inspect the alert, incident, triage hash, operator report, and scorecard.
+  3. Open Dashboard to inspect the active incident, errors, impact, and trace explanation.
+  4. Or run: make demo-acceptance
 
 Useful CLI checks:
   ./waylog capabilities
diff --git a/scripts/proof-loop.sh b/scripts/proof-loop.sh
new file mode 100644
index 0000000..57e4dbc
--- /dev/null
+++ b/scripts/proof-loop.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
+INGEST_URL="${INGEST_URL:-http://localhost:8080}"
+WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
+WAYLOG_AGENT_KEY="${WAYLOG_AGENT_KEY:-demo}"
+REQUESTS="${REQUESTS:-20}"
+CONCURRENCY="${CONCURRENCY:-5}"
+TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
+PROOF_DIR="${WAYLOG_PROOF_DIR:-./data/demo-state/proof}"
+
+CLI_BIN="${WAYLOG_CLI_BIN:-./data/demo-state/bin/waylog}"
+JSON_BIN="${WAYLOG_JSON_HELPER_BIN:-./data/demo-state/bin/demo-acceptance-json}"
+
+fail() {
+  echo "FAIL: $*" >&2
+  exit 1
+}
+
+cleanup() {
+  make demo-stop >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+echo "[proof-loop] starting local demo"
+make demo
+
+mkdir -p ./data/demo-state/bin "$PROOF_DIR"
+go build -o "$CLI_BIN" ./cmd/waylog
+go build -o "$JSON_BIN" ./scripts/demo-acceptance-json
+
+CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TIMEOUT")
+
+alert_id="alert_proof_pmt_502_$(date +%s)"
+alert_timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+alert_body="{\"source\":\"waylog\",\"alert_id\":\"${alert_id}\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"reason\":\"PMT_502 spike\",\"message\":\"proof-loop alert for checkout payment failures\",\"error_code\":\"PMT_502\",\"timestamp\":\"${alert_timestamp}\"}"
+alert_status="$(curl -s -o "${PROOF_DIR}/alert.json" -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/alerts" \
+  -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "$alert_body" || echo "000")"
+[[ "$alert_status" == "201" ]] || fail "alert webhook failed: HTTP $alert_status"
+echo "[proof-loop] alert accepted"
+
+burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
+burst_status="$(curl -s -o "${PROOF_DIR}/burst.json" -w "%{http_code}" \
+  -X POST "${GATEWAY_URL}/demo/burst" \
+  -H 'Content-Type: application/json' \
+  --data "$burst_body" || echo "000")"
+[[ "$burst_status" == "200" ]] || fail "traffic burst failed: HTTP $burst_status"
+echo "[proof-loop] payment failure burst captured"
+
+errors_json=""
+for _ in $(seq 1 15); do
+  errors_json="$("${CLI[@]}" --json errors --window 15m --limit 10)" || fail "waylog errors failed"
+  if "$JSON_BIN" has-payment-error <<<"$errors_json"; then
+    break
+  fi
+  sleep 1
+done
+"$JSON_BIN" has-payment-error <<<"$errors_json" || fail "payment_502 error family did not appear"
+printf "%s\n" "$errors_json" >"${PROOF_DIR}/errors.json"
+
+incidents_json=""
+for _ in $(seq 1 20); do
+  incidents_json="$("${CLI[@]}" --json incidents)" || fail "waylog incidents failed"
+  if "$JSON_BIN" has-dependency-incident <<<"$incidents_json"; then
+    break
+  fi
+  sleep 1
+done
+"$JSON_BIN" has-dependency-incident <<<"$incidents_json" || fail "dependency incident did not appear"
+printf "%s\n" "$incidents_json" >"${PROOF_DIR}/incidents.json"
+
+incident_id="$("$JSON_BIN" first-incident-id <<<"$incidents_json")"
+[[ -n "$incident_id" ]] || fail "no incident_id found"
+echo "[proof-loop] active incident: ${incident_id}"
+
+triage_cli="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage failed"
+printf "%s\n" "$triage_cli" >"${PROOF_DIR}/triage.json"
+hash_cli="$("$JSON_BIN" triage-report-hash <<<"$triage_cli")"
+[[ -n "$hash_cli" ]] || fail "CLI triage report_hash missing"
+
+read_status="$(curl -s -o "${PROOF_DIR}/triage-read.json" -w "%{http_code}" \
+  -H "Authorization: Bearer ${WAYLOG_READ_KEY}" \
+  "${INGEST_URL}/v1/triage/${incident_id}?snapshot=true" || echo "000")"
+[[ "$read_status" == "200" ]] || fail "read triage endpoint failed: HTTP $read_status"
+hash_read="$("$JSON_BIN" triage-report-hash <"${PROOF_DIR}/triage-read.json")"
+
+tool_body="{\"incident_id\":\"${incident_id}\",\"snapshot\":true}"
+tool_status="$(curl -s -o "${PROOF_DIR}/triage-tool.json" -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/tools/triage_incident" \
+  -H "Authorization: Bearer ${WAYLOG_AGENT_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "$tool_body" || echo "000")"
+[[ "$tool_status" == "200" ]] || fail "triage_incident tool failed: HTTP $tool_status"
+hash_tool="$("$JSON_BIN" triage-report-hash <"${PROOF_DIR}/triage-tool.json")"
+
+plan_body="{\"template\":\"triage\",\"params\":{\"incident_id\":\"${incident_id}\",\"snapshot\":true}}"
+plan_status="$(curl -s -o "${PROOF_DIR}/triage-plan.json" -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/plans/execute" \
+  -H "Authorization: Bearer ${WAYLOG_AGENT_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "$plan_body" || echo "000")"
+[[ "$plan_status" == "200" ]] || fail "triage plan template failed: HTTP $plan_status"
+hash_plan="$("$JSON_BIN" plan-triage-report-hash <"${PROOF_DIR}/triage-plan.json")"
+
+[[ "$hash_cli" == "$hash_read" && "$hash_cli" == "$hash_tool" && "$hash_cli" == "$hash_plan" ]] || {
+  echo "hash_cli=$hash_cli hash_read=$hash_read hash_tool=$hash_tool hash_plan=$hash_plan" >&2
+  fail "triage report_hash mismatch across surfaces"
+}
+echo "[proof-loop] report_hash stable across CLI/read/tool/plan: ${hash_cli}"
+
+"$JSON_BIN" triage-has-alert-id "$alert_id" <"${PROOF_DIR}/triage.json" || fail "triage report missing current alert evidence"
+"$JSON_BIN" triage-has-trace <"${PROOF_DIR}/triage.json" || fail "triage report missing trace evidence"
+"$JSON_BIN" triage-has-dependency-signal <"${PROOF_DIR}/triage.json" || fail "triage report missing dependency signal"
+"$JSON_BIN" triage-has-next-check <"${PROOF_DIR}/triage.json" || fail "triage report missing next checks"
+trace_id="$("$JSON_BIN" triage-first-trace <"${PROOF_DIR}/triage.json")"
+[[ -n "$trace_id" ]] || fail "triage report missing sample trace id"
+
+for format in markdown slack pagerduty; do
+  case "$format" in
+    markdown) out="${PROOF_DIR}/report.md" ;;
+    slack) out="${PROOF_DIR}/slack.json" ;;
+    pagerduty) out="${PROOF_DIR}/pagerduty.txt" ;;
+  esac
+  report_status="$(curl -s -o "$out" -w "%{http_code}" \
+    -H "Authorization: Bearer ${WAYLOG_READ_KEY}" \
+    "${INGEST_URL}/v1/triage/${incident_id}/report?format=${format}&snapshot=true" || echo "000")"
+  [[ "$report_status" == "200" ]] || fail "${format} report endpoint failed: HTTP $report_status"
+  grep -q "$incident_id" "$out" || fail "${format} report missing incident_id citation"
+  grep -q "$hash_cli" "$out" || fail "${format} report missing report_hash citation"
+  grep -q "$alert_id" "$out" || fail "${format} report missing alert_id citation"
+done
+grep -q "$trace_id" "${PROOF_DIR}/report.md" || fail "markdown report missing trace citation"
+grep -q 'signal `sig_' "${PROOF_DIR}/report.md" || fail "markdown report missing signal citation"
+
+render_status="$(curl -s -o "${PROOF_DIR}/render-tool.json" -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/tools/render_triage_report" \
+  -H "Authorization: Bearer ${WAYLOG_AGENT_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "{\"incident_id\":\"${incident_id}\",\"format\":\"markdown\",\"snapshot\":true}" || echo "000")"
+[[ "$render_status" == "200" ]] || fail "render_triage_report tool failed: HTTP $render_status"
+grep -q "$hash_cli" "${PROOF_DIR}/render-tool.json" || fail "render_triage_report tool output missing report_hash"
+echo "[proof-loop] operator reports rendered with citations"
+
+WAYLOG_ROLLUP_USE_RUNNING_DEMO=1 WAYLOG_CLI_BIN="$CLI_BIN" WAYLOG_JSON_HELPER_BIN="$JSON_BIN" \
+  ./scripts/rollup-comparison.sh | tee "${PROOF_DIR}/rollup-comparison.txt"
+
+WAYLOG_SCORECARD_USE_RUNNING_DEMO=1 WAYLOG_SCENARIO=warm-demo WAYLOG_CLI_BIN="$CLI_BIN" WAYLOG_JSON_HELPER_BIN="$JSON_BIN" WAYLOG_PROOF_DIR="$PROOF_DIR" \
+  bash ./scripts/rca-scorecard.sh | tee "${PROOF_DIR}/scorecard.txt"
+
+cat <<EOF
+[proof-loop] complete
+  artifacts: ${PROOF_DIR}
+  report_hash: ${hash_cli}
+EOF
diff --git a/scripts/rca-scorecard.sh b/scripts/rca-scorecard.sh
new file mode 100644
index 0000000..a13db24
--- /dev/null
+++ b/scripts/rca-scorecard.sh
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
+INGEST_URL="${INGEST_URL:-http://localhost:8080}"
+WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
+REQUESTS="${REQUESTS:-20}"
+CONCURRENCY="${CONCURRENCY:-5}"
+TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
+USE_RUNNING="${WAYLOG_SCORECARD_USE_RUNNING_DEMO:-0}"
+PROOF_DIR="${WAYLOG_PROOF_DIR:-./data/demo-state/proof}"
+
+CLI_BIN="${WAYLOG_CLI_BIN:-./data/demo-state/bin/waylog}"
+JSON_BIN="${WAYLOG_JSON_HELPER_BIN:-./data/demo-state/bin/demo-acceptance-json}"
+
+fail() {
+  echo "FAIL: $*" >&2
+  exit 1
+}
+
+http_code() {
+  curl -s -o /dev/null -w "%{http_code}" "$1" || echo "000"
+}
+
+bool_word() {
+  if "$@"; then
+    echo true
+  else
+    echo false
+  fi
+}
+
+now_ms() {
+  perl -MTime::HiRes=time -e 'printf "%.0f\n", time() * 1000'
+}
+
+cleanup() {
+  if [[ "$USE_RUNNING" != "1" ]]; then
+    make demo-stop >/dev/null 2>&1 || true
+  fi
+}
+trap cleanup EXIT
+
+if [[ "$USE_RUNNING" != "1" ]]; then
+  make demo
+elif [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
+  fail "running demo is not reachable. Start it with make demo or unset WAYLOG_SCORECARD_USE_RUNNING_DEMO"
+fi
+
+mkdir -p ./data/demo-state/bin "$PROOF_DIR"
+go build -o "$CLI_BIN" ./cmd/waylog
+go build -o "$JSON_BIN" ./scripts/demo-acceptance-json
+
+CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TIMEOUT")
+
+alert_id="alert_scorecard_pmt_502_$(date +%s)"
+alert_timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+alert_body="{\"source\":\"waylog\",\"alert_id\":\"${alert_id}\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"reason\":\"PMT_502 spike\",\"message\":\"scorecard alert for checkout payment failures\",\"error_code\":\"PMT_502\",\"timestamp\":\"${alert_timestamp}\"}"
+alert_status="$(curl -s -o "${PROOF_DIR}/scorecard-alert.json" -w "%{http_code}" \
+  -X POST "${INGEST_URL}/v1/alerts" \
+  -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+  -H 'Content-Type: application/json' \
+  --data "$alert_body" || echo "000")"
+[[ "$alert_status" == "201" ]] || fail "alert webhook failed: HTTP $alert_status"
+
+burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
+burst_status="$(curl -s -o "${PROOF_DIR}/scorecard-burst.json" -w "%{http_code}" \
+  -X POST "${GATEWAY_URL}/demo/burst" \
+  -H 'Content-Type: application/json' \
+  --data "$burst_body" || echo "000")"
+[[ "$burst_status" == "200" ]] || fail "traffic burst failed: HTTP $burst_status"
+answer_start="$(now_ms)"
+
+errors_json=""
+for _ in $(seq 1 15); do
+  errors_json="$("${CLI[@]}" --json errors --window 15m --limit 10)" || fail "waylog errors failed"
+  if "$JSON_BIN" has-payment-error <<<"$errors_json"; then
+    break
+  fi
+  sleep 1
+done
+"$JSON_BIN" has-payment-error <<<"$errors_json" || fail "payment_502 error family did not appear"
+printf "%s\n" "$errors_json" >"${PROOF_DIR}/scorecard-errors.json"
+
+incidents_json=""
+for _ in $(seq 1 20); do
+  incidents_json="$("${CLI[@]}" --json incidents)" || fail "waylog incidents failed"
+  if "$JSON_BIN" has-dependency-incident <<<"$incidents_json"; then
+    break
+  fi
+  sleep 1
+done
+"$JSON_BIN" has-dependency-incident <<<"$incidents_json" || fail "dependency incident did not appear"
+printf "%s\n" "$incidents_json" >"${PROOF_DIR}/scorecard-incidents.json"
+
+incident_id="$("$JSON_BIN" first-incident-id <<<"$incidents_json")"
+[[ -n "$incident_id" ]] || fail "no incident id found"
+
+triage_a="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage failed"
+triage_b="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "second waylog triage failed"
+answer_end="$(now_ms)"
+printf "%s\n" "$triage_a" >"${PROOF_DIR}/scorecard-triage.json"
+
+hash_a="$("$JSON_BIN" triage-report-hash <<<"$triage_a")"
+hash_b="$("$JSON_BIN" triage-report-hash <<<"$triage_b")"
+hash_stable=true
+if [[ -z "$hash_a" || "$hash_a" != "$hash_b" ]]; then
+  hash_stable=false
+fi
+
+scenario="${WAYLOG_SCENARIO:-cold-demo}"
+
+blast_json="$("${CLI[@]}" --json blast checkout:payment.charge:PMT_502 --window 15m)" || fail "waylog blast failed"
+printf "%s\n" "$blast_json" >"${PROOF_DIR}/scorecard-blast.json"
+
+root_count="$("$JSON_BIN" payment-error-count <<<"$errors_json")"
+affected_services="$("$JSON_BIN" blast-affected-services <<<"$blast_json")"
+[[ "$root_count" =~ ^[0-9]+$ && "$affected_services" =~ ^[0-9]+$ ]] || fail "non-numeric scorecard counts"
+(( root_count > 0 )) || fail "root-cause count is empty"
+(( affected_services > 1 )) || fail "blast radius did not show cross-service spread"
+naive_count=$((root_count * affected_services))
+inflation_avoided=$((naive_count - root_count))
+time_to_answer_ms=$((answer_end - answer_start))
+
+root_cause_accuracy="$(bool_word "$JSON_BIN" triage-root-cause-accurate <<<"$triage_a")"
+cause_classification="$(bool_word "$JSON_BIN" incident-cause-is-dependency "$incident_id" <<<"$incidents_json")"
+trace_present="$(bool_word "$JSON_BIN" triage-has-trace <<<"$triage_a")"
+dependency_signal_present="$(bool_word "$JSON_BIN" triage-has-dependency-signal <<<"$triage_a")"
+alert_present="$(bool_word "$JSON_BIN" triage-has-alert <<<"$triage_a")"
+next_checks_present="$(bool_word "$JSON_BIN" triage-has-next-check <<<"$triage_a")"
+
+cat >"${PROOF_DIR}/scorecard.json" <<EOF
+{
+  "root_cause_accuracy": ${root_cause_accuracy},
+  "cause_classification": ${cause_classification},
+  "evidence_completeness": {
+    "trace_present": ${trace_present},
+    "dependency_signal_present": ${dependency_signal_present},
+    "alert_present": ${alert_present},
+    "next_checks_present": ${next_checks_present}
+  },
+  "scenario": "${scenario}",
+  "triage_latency_ms": ${time_to_answer_ms},
+  "report_hash_stable": ${hash_stable},
+  "propagated_error_inflation_avoided": ${inflation_avoided},
+  "root_cause_count": ${root_count},
+  "naive_propagated_count": ${naive_count},
+  "report_hash": "${hash_a}"
+}
+EOF
+
+cat <<EOF
+RCA scorecard
+  root cause accuracy: ${root_cause_accuracy}
+  cause classification dependency: ${cause_classification}
+  evidence completeness: trace=${trace_present} dependency_signal=${dependency_signal_present} alert=${alert_present} next_checks=${next_checks_present}
+  triage latency: ${time_to_answer_ms}ms
+  report hash stable: ${hash_stable} (${hash_a})
+  propagated-error inflation avoided: ${inflation_avoided} (${naive_count} naive - ${root_count} root-cause)
+  scenario: ${scenario}
+
+Artifacts:
+  ${PROOF_DIR}/scorecard.json
+EOF
+
+if [[ "$hash_stable" != "true" ]]; then
+  echo "FAIL: triage report hash unstable: A=$hash_a B=$hash_b" >&2
+  exit 1
+fi

From 2b635a9e40e93187ecec4c5df042d8095fd6775a Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Mon, 18 May 2026 16:41:21 -0400
Subject: [PATCH 13/14] refactor: remove legacy incident engine dead paths

- delete superseded buildIncident and transitionMissing paths
- remove unused incident engine helper methods
- bound OTLP gRPC graceful shutdown with forceful fallback
- trim stale refactor-history comments
- simplify low-value tests
---
 cmd/ingest/main.go               |  14 ++-
 internal/incidents/engine.go     | 149 ++-----------------------------
 internal/incidents/store_test.go |  14 ---
 internal/llm/openai_test.go      |  10 ---
 internal/triage/options_test.go  |  63 ++++++-------
 5 files changed, 45 insertions(+), 205 deletions(-)

diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index d2d3252..e9c9ea1 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -956,8 +956,18 @@ func main() {
 		slog.Info("ingest shutdown complete")
 	}
 	if otlpGRPCServer != nil {
-		otlpGRPCServer.GracefulStop()
-		slog.Info("otlp grpc shutdown complete")
+		done := make(chan struct{})
+		go func() {
+			otlpGRPCServer.GracefulStop()
+			close(done)
+		}()
+		select {
+		case <-done:
+			slog.Info("otlp grpc shutdown complete")
+		case <-time.After(5 * time.Second):
+			otlpGRPCServer.Stop()
+			slog.Warn("otlp grpc graceful shutdown timed out; forced stop")
+		}
 	}
 
 	planStore.Close()
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
index 4dd9f6c..0680c23 100644
--- a/internal/incidents/engine.go
+++ b/internal/incidents/engine.go
@@ -141,16 +141,13 @@ func (e *Engine) Tick(ctx context.Context) error {
 	return e.ApplyLive(ctx, rows)
 }
 
-// derivedRow carries the derivation output plus whether the row was already
-// in the seed (used by ApplyLive to distinguish Opened vs Updated metrics).
 type derivedRow struct {
 	Incident Incident
 	Existed  bool
 }
 
-// derive computes the full set of incident rows for the cycle from the seed +
-// reader without touching e.active or the store. Used by both live Tick and
-// startup Rebuild.
+// derive computes incident rows for the cycle from the seed + reader without
+// touching e.active or the store. Used by both live Tick and startup Rebuild.
 func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Incident, reader Reader) ([]derivedRow, error) {
 	currentStart := now.Add(-e.cfg.Window)
 	baselineStart := now.Add(-2 * e.cfg.Window)
@@ -186,7 +183,7 @@ func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Inci
 
 // deriveMissing emits transitions for seed rows absent from the current cycle:
 // active → recovering, and recovering → resolved once LastSeenAt is older
-// than ResolveAfter. Mirrors the previous transitionMissing semantics.
+// than ResolveAfter.
 func (e *Engine) deriveMissing(seed map[string]Incident, seen map[string]struct{}, now time.Time) []derivedRow {
 	out := make([]derivedRow, 0)
 	for _, inc := range seed {
@@ -216,8 +213,7 @@ func (e *Engine) deriveMissing(seed map[string]Incident, seen map[string]struct{
 }
 
 // ApplyLive persists derived rows for a live tick: per-row Upsert, in-memory
-// cache update, and per-transition metric increments matching pre-refactor
-// Tick behavior.
+// cache update, and per-transition metric increments.
 func (e *Engine) ApplyLive(ctx context.Context, rows []derivedRow) error {
 	for _, dr := range rows {
 		if err := e.store.Upsert(ctx, dr.Incident); err != nil {
@@ -255,9 +251,7 @@ func (e *Engine) ApplyLive(ctx context.Context, rows []derivedRow) error {
 
 // ApplyRebuild atomically replaces non-resolved store rows with the derived
 // set, then reloads the in-memory cache from the store. ApplyRebuild owns
-// cache reload; do NOT call Bootstrap after it. Per-row Opened/Updated/
-// Recovered/Resolved counters are intentionally not incremented here —
-// rebuild metrics live in main.go.
+// cache reload; do NOT call Bootstrap after it.
 func (e *Engine) ApplyRebuild(ctx context.Context, rows []derivedRow) error {
 	incs := make([]Incident, 0, len(rows))
 	for _, dr := range rows {
@@ -317,74 +311,6 @@ func (e *Engine) TopActive(ctx context.Context) (*Incident, error) {
 	return &rows[0], nil
 }
 
-func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, error) {
-	events := e.sampleEvents(row.ErrorFamily, since, now, 200)
-	startedAt := earliestEventTime(events, now)
-	env := firstEventEnv(events)
-	if existing, ok := e.findByFamily(env, row.ErrorFamily); ok {
-		startedAt = existing.StartedAt
-	}
-	id := StableID(env, row.ErrorFamily, startedAt)
-	existing, hadExisting := e.getCached(id)
-	if !hadExisting {
-		if prior, ok := e.findByFamily(env, row.ErrorFamily); ok {
-			existing = prior
-			id = prior.IncidentID
-			hadExisting = true
-		}
-	}
-	blast := e.reader.BlastRadius(
-		SearchFilter{Since: since, Until: now},
-		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
-	)
-	sigs, err := e.querySignals(ctx, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
-	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
-		return Incident{}, err
-	}
-	deploys, err := e.queryDeploys(ctx, row.ErrorFamily.Service, now.Add(-e.cfg.DeployCorrelationWindow), now)
-	if err != nil {
-		return Incident{}, err
-	}
-	inc := Incident{
-		IncidentID:       id,
-		Env:              env,
-		Service:          row.ErrorFamily.Service,
-		ErrorFamily:      row.ErrorFamily,
-		Status:           StatusActive,
-		Severity:         severity(row.Count, blast.AffectedServices, lift),
-		StartedAt:        startedAt,
-		UpdatedAt:        now,
-		LastSeenAt:       now,
-		AffectedRequests: blast.AffectedRequests,
-		AffectedUsers:    cloneInt(row.AffectedUsers),
-		AffectedServices: blast.AffectedServices,
-		TopServices:      append([]string(nil), blast.TopServices...),
-		SampleTraces:     stableSamples(existing.SampleTraces, events, e.cfg.SampleLimit),
-		Lift:             lift,
-		BaselineCount:    baselineCount,
-		CurrentCount:     row.Count,
-	}
-	if hadExisting {
-		inc.StartedAt = existing.StartedAt
-		inc.RecoveringAt = nil
-	}
-	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
-	inc.Cause = class.Cause
-	inc.Confidence = class.Confidence
-	inc.Evidence = class.Evidence
-	inc.NextChecks = class.NextChecks
-	inc.InstrumentationWarnings = class.InstrumentationWarnings
-	e.observeClassification(inc.Cause, inc.Confidence)
-	if e.metrics != nil {
-		if hadExisting {
-			e.metrics.IncidentUpdated.Inc()
-		} else {
-			e.metrics.IncidentOpened.Inc()
-		}
-	}
-	return inc, nil
-}
-
 func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Incident, reader Reader, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, bool, error) {
 	events := sampleEventsFromReader(reader, row.ErrorFamily, since, now, 200)
 	startedAt := earliestEventTime(events, now)
@@ -448,53 +374,6 @@ func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Inci
 	return inc, hadExisting, nil
 }
 
-func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}, now time.Time) error {
-	e.mu.RLock()
-	rows := make([]Incident, 0, len(e.active))
-	for _, inc := range e.active {
-		rows = append(rows, cloneIncident(inc))
-	}
-	e.mu.RUnlock()
-	for _, inc := range rows {
-		if _, ok := seen[inc.IncidentID]; ok {
-			continue
-		}
-		switch inc.Status {
-		case StatusActive:
-			inc.Status = StatusRecovering
-			t := now
-			inc.RecoveringAt = &t
-			inc.UpdatedAt = now
-			if err := e.store.Upsert(ctx, inc); err != nil {
-				return err
-			}
-			e.remember(inc)
-			if e.metrics != nil {
-				e.metrics.IncidentRecovered.Inc()
-			}
-		case StatusRecovering:
-			if now.Sub(inc.LastSeenAt) >= e.cfg.ResolveAfter {
-				inc.Status = StatusResolved
-				t := now
-				inc.ResolvedAt = &t
-				inc.UpdatedAt = now
-				if err := e.store.Upsert(ctx, inc); err != nil {
-					return err
-				}
-				e.forget(inc.IncidentID)
-				if e.metrics != nil {
-					e.metrics.IncidentResolved.Inc()
-				}
-			}
-		}
-	}
-	return nil
-}
-
-func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
-	return sampleEventsFromReader(e.reader, f, since, until, limit)
-}
-
 func sampleEventsFromReader(reader Reader, f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
 	events := reader.SearchEvents(SearchFilter{
 		Service:   f.Service,
@@ -552,24 +431,6 @@ func (e *Engine) forget(id string) {
 	delete(e.active, id)
 }
 
-func (e *Engine) getCached(id string) (Incident, bool) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
-	inc, ok := e.active[id]
-	return cloneIncident(inc), ok
-}
-
-func (e *Engine) findByFamily(env string, family apiv2.ErrorFamily) (Incident, bool) {
-	e.mu.RLock()
-	defer e.mu.RUnlock()
-	for _, inc := range e.active {
-		if inc.Env == env && inc.ErrorFamily == family && inc.Status != StatusResolved {
-			return cloneIncident(inc), true
-		}
-	}
-	return Incident{}, false
-}
-
 func (e *Engine) activeCount() int {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
diff --git a/internal/incidents/store_test.go b/internal/incidents/store_test.go
index 1438d0e..b6a6148 100644
--- a/internal/incidents/store_test.go
+++ b/internal/incidents/store_test.go
@@ -68,17 +68,3 @@ func TestStaleActiveTransitionTransitionsOnlyStaleRows(t *testing.T) {
 	}
 }
 
-// TestStaleActiveTransitionNoActiveRowsIsNoOp covers the "normal empty-WAL
-// startup with no active incidents" path. Nothing should change in the store.
-func TestStaleActiveTransitionNoActiveRowsIsNoOp(t *testing.T) {
-	store := NewMemoryStore()
-	active, _ := store.ListActive(context.Background())
-	if len(active) != 0 {
-		t.Fatalf("expected empty store, got %d rows", len(active))
-	}
-	// No rows = no transitions. Listing again must still be empty.
-	active, _ = store.ListActive(context.Background())
-	if len(active) != 0 {
-		t.Fatalf("unexpected mutation: %d rows", len(active))
-	}
-}
diff --git a/internal/llm/openai_test.go b/internal/llm/openai_test.go
index a099b80..fa363a0 100644
--- a/internal/llm/openai_test.go
+++ b/internal/llm/openai_test.go
@@ -181,16 +181,6 @@ func TestParseOpenAIResponseMessageText(t *testing.T) {
 	}
 }
 
-func TestOpenAIDefaultModel(t *testing.T) {
-	client := NewOpenAIClient("test-key")
-	if client.Model != "gpt-4o-mini" {
-		t.Fatalf("default model = %q, want gpt-4o-mini", client.Model)
-	}
-	if defaultOpenAIModel != "gpt-4o-mini" {
-		t.Fatalf("defaultOpenAIModel = %q, want gpt-4o-mini", defaultOpenAIModel)
-	}
-}
-
 func TestOpenAIGenerateAPIError(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "unavailable", http.StatusServiceUnavailable)
diff --git a/internal/triage/options_test.go b/internal/triage/options_test.go
index 179dd49..fc643b5 100644
--- a/internal/triage/options_test.go
+++ b/internal/triage/options_test.go
@@ -5,46 +5,39 @@ import (
 	"time"
 )
 
-func TestBuildOptionsDefaults(t *testing.T) {
+func TestParseBuildOptions(t *testing.T) {
 	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
-	opts, err := ParseBuildOptions("", false, now)
-	if err != nil {
-		t.Fatalf("parse: %v", err)
-	}
-	if opts.Window != 15*time.Minute {
-		t.Fatalf("default window should be 15m, got %s", opts.Window)
-	}
-	if opts.Snapshot {
-		t.Fatalf("default snapshot should be false")
-	}
-	if !opts.Now.Equal(now) {
-		t.Fatalf("Now should be passed through")
+	cases := []struct {
+		name     string
+		window   string
+		snapshot bool
+		want     time.Duration
+	}{
+		{"default window", "", false, 15 * time.Minute},
+		{"explicit 30m", "30m", false, 30 * time.Minute},
+		{"snapshot flag honored", "15m", true, 15 * time.Minute},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			opts, err := ParseBuildOptions(tc.window, tc.snapshot, now)
+			if err != nil {
+				t.Fatalf("parse: %v", err)
+			}
+			if opts.Window != tc.want {
+				t.Fatalf("window = %s, want %s", opts.Window, tc.want)
+			}
+			if opts.Snapshot != tc.snapshot {
+				t.Fatalf("snapshot = %v, want %v", opts.Snapshot, tc.snapshot)
+			}
+			if !opts.Now.Equal(now) {
+				t.Fatalf("Now should be passed through")
+			}
+		})
 	}
 }
 
-func TestBuildOptionsWindowParse(t *testing.T) {
-	now := time.Now()
-	opts, err := ParseBuildOptions("30m", false, now)
-	if err != nil {
-		t.Fatalf("parse: %v", err)
-	}
-	if opts.Window != 30*time.Minute {
-		t.Fatalf("got %s want 30m", opts.Window)
-	}
-}
-
-func TestBuildOptionsBadWindow(t *testing.T) {
+func TestParseBuildOptionsBadWindow(t *testing.T) {
 	if _, err := ParseBuildOptions("forever", false, time.Now()); err == nil {
 		t.Fatalf("expected error for bad window")
 	}
 }
-
-func TestBuildOptionsSnapshotFlag(t *testing.T) {
-	opts, err := ParseBuildOptions("15m", true, time.Now())
-	if err != nil {
-		t.Fatalf("parse: %v", err)
-	}
-	if !opts.Snapshot {
-		t.Fatalf("snapshot flag not honored")
-	}
-}

From fca3cb22aa717df546cac218f2ab6e770e97edb0 Mon Sep 17 00:00:00 2001
From: skota-hash <santoshsaismaran@gmail.com>
Date: Mon, 18 May 2026 16:58:36 -0400
Subject: [PATCH 14/14] ci fix

---
 internal/incidents/store_test.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/internal/incidents/store_test.go b/internal/incidents/store_test.go
index b6a6148..0f1c568 100644
--- a/internal/incidents/store_test.go
+++ b/internal/incidents/store_test.go
@@ -67,4 +67,3 @@ func TestStaleActiveTransitionTransitionsOnlyStaleRows(t *testing.T) {
 		t.Fatal("fresh row recovering_at must be unset")
 	}
 }
-