diff --git a/.gitignore b/.gitignore index a8cb890..f2d4390 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,14 @@ profile.cov # Editor/IDE # .idea/ .vscode/ +.claude/ + +# Local databases and ad-hoc data dirs +*.db +*.db-shm +*.db-wal +*.sqlite +*.sqlite3 /*.log /data/ @@ -37,6 +45,7 @@ data/demo-state/ /checkout /waylog /bridge +/crux /waylog-live /api-gateway /bin/ @@ -49,3 +58,9 @@ data/demo-state/ !docs/env.md !docs/waylog-sdk-contract.md !docs/sdk-examples.md + +# Governance + community docs must be trackable despite the *.md rule above +!CONTRIBUTING.md +!CODE_OF_CONDUCT.md +!SUPPORT.md +!.github/**/*.md diff --git a/Dockerfile b/Dockerfile index 785bded..8e22061 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,9 +11,6 @@ COPY . . FROM builder AS build-ingest RUN CGO_ENABLED=0 go build -o /bin/ingest ./cmd/ingest -FROM builder AS build-bridge -RUN CGO_ENABLED=0 go build -o /bin/bridge ./cmd/bridge - FROM builder AS build-api-gateway RUN CGO_ENABLED=0 go build -o /bin/api-gateway ./examples/cmd/api-gateway @@ -33,11 +30,6 @@ COPY --from=build-ingest /bin/ingest /bin/ingest EXPOSE 8080 ENTRYPOINT ["/bin/ingest"] -FROM alpine:3.21 AS bridge -RUN apk add --no-cache ca-certificates -COPY --from=build-bridge /bin/bridge /bin/bridge -ENTRYPOINT ["/bin/bridge"] - FROM alpine:3.21 AS api-gateway RUN apk add --no-cache ca-certificates COPY --from=build-api-gateway /bin/api-gateway /bin/api-gateway diff --git a/Makefile b/Makefile index 46f0d90..b356713 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,12 @@ SHELL := /bin/sh -.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance proof-loop rca-scorecard rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate +.PHONY: help build build-crux first-run 
install-local build-examples ingest ingest-mcp waylog checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance proof-loop rca-scorecard rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate help: @echo "Targets:" @echo " build - build core binaries (SDK tooling)" + @echo " build-crux - build Crux interactive shell" + @echo " install-local - install crux and waylog to GOPATH/bin" @echo " build-examples - build example/demo binaries" @echo " ingest - run ingest server" @echo " ingest-mcp - run ingest server with MCP stdio enabled" @@ -27,7 +29,6 @@ help: @echo " demo-down - stop Docker demo stack" @echo " micro-demo - start 4-service micro-demo in foreground for debugging" @echo " micro-demo-stop - stop micro-demo processes" - @echo " waylog-live - run TUI dashboard (connects to ingest server)" @echo " docker-build - build all Docker images" @echo " docker-up - start full stack via docker compose" @echo " docker-down - stop stack (preserve volumes)" @@ -39,8 +40,21 @@ build: go build ./cmd/ingest go build ./cmd/checkout go build ./cmd/waylog - go build ./cmd/bridge - go build ./cmd/waylog-live + +build-crux: + go build -o crux ./cmd/crux + +first-run: build-crux + go build -o ingest ./cmd/ingest + ./crux first-run + +install-local: build build-crux + @mkdir -p "$$(go env GOPATH)/bin" + cp crux waylog "$$(go env GOPATH)/bin/" + @echo "installed: crux waylog -> $$(go env GOPATH)/bin/" + @echo "" + @echo "Add to PATH if needed:" + @echo " export PATH=\"$$(go env GOPATH)/bin:$$PATH\"" build-examples: go build ./examples/cmd/api-gateway @@ -57,9 +71,6 @@ ingest-mcp: waylog: go run ./cmd/waylog -waylog-live: - go run ./cmd/waylog-live - checkout: go run ./cmd/checkout @@ -86,7 +97,7 @@ vet-sdk: ## Vet SDK modules cd pkg && go vet ./... cd pkg/transport/kafka && go vet ./... 
-ci: fmt vet vet-sdk test-race test-sdk ts-test check-doc-links check-rollup-contract otlp-conformance +ci: fmt vet vet-sdk test-race test-sdk ts-test build-crux check-doc-links otlp-conformance @echo "CI checks passed" ts-install: ## Install TS SDK deps (skipped if node_modules is already present) @@ -102,15 +113,11 @@ ts-test: ts-install ## Run TS SDK vitest suite check-doc-links: @bash scripts/check-doc-links.sh -.PHONY: check-rollup-contract -check-rollup-contract: - @bash scripts/check-rollup-contract.sh - bench-gate: ## Enforce v2 SDK §4.4.1 perf budgets (optional; not in `ci` yet) @bash scripts/bench-gate.sh clean: - rm -f ingest checkout waylog bridge api-gateway checkout-demo db-demo payment-demo waylog-live + rm -f ingest checkout waylog crux api-gateway checkout-demo db-demo payment-demo kafka-up: docker compose -f docker-compose.kafka.yml up -d diff --git a/README.md b/README.md index 7f6d92f..c408a6e 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,8 @@ func main() { Middleware adapters for `net/http`, chi, gin, and echo are in [`docs/sdk-examples.md`](docs/sdk-examples.md). The recommended path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers — low-level `Begin` / `Finalize` / `setField` APIs are for adapter authors. +The Go and TypeScript SDKs are kept in parity — wire format, config, signals, and public API. The audited matrix (and the documented idiomatic gaps) is in [`docs/sdk-parity.md`](docs/sdk-parity.md). + ### OTLP / OpenTelemetry Point your existing OTel collector at Waylog. Both protocols, same conversion path, same downstream views. @@ -162,6 +164,8 @@ exporters: Sample collector config: [`examples/otel-collector/`](examples/otel-collector/). Only traces are accepted; OTLP logs and metrics are not shipping. Bind `OTLP_GRPC_ADDR=127.0.0.1:4317` for single-host installs that don't need cross-host collectors. 
+**Deploy correlation works OTel-only.** When spans carry `service.version` and the version changes for a `(service, env)` pair, Waylog auto-registers a deployment — no deploy webhook needed for incidents to classify `cause=deploy`. + **Auth.** Both endpoints require `WAYLOG_WRITE_KEY` when `WAYLOG_PROFILE=prod`; the server refuses to boot with unauthenticated OTLP in prod. `make demo` runs unauthenticated by design. ### Local ingest only (no demo services) @@ -196,9 +200,18 @@ waylog search PMT_502 --window 1h `waylog capabilities` is intentionally ungated so it can diagnose server setup; other verbs require `v2_reads.enabled=true` (the default). Defaults: `INGEST_ADDR`, `WAYLOG_READ_KEY`, `WAYLOG_CLI_TIMEOUT`. Add `--json` to any verb for machine-readable output. +### Interactive shell + +```bash +make build-crux +./crux +``` + +`crux` opens a lightweight incident-triage shell with `help`, `status`, `incidents`, `open <incident_id>`, `triage <incident_id>`, `blast <service>:<step>:<error_code>`, `explain <trace_id>`, and `exit`. With arguments, it delegates to the same command library as `waylog`, so `crux incidents` and `waylog incidents` share behavior. + ### Dashboard -Embedded Geist UI at <http://localhost:8080/ui>. Uses the dashboard session cookie for read-scope auth and runs against the default `WAYLOG_V2_READS=true` reader. +Embedded Geist UI at <http://localhost:8080/ui>. Uses the dashboard session cookie for read-scope auth and runs against the v2 reader. - `#/errors` — top error families over `/v1/errors` - `#/incident/<id>` — incident evidence and next checks over `/v1/incidents/{id}` @@ -210,28 +223,20 @@ No Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dash ### Agent surface -Twelve deterministic tools, exposed identically through CLI, REST `/v1/tools/{name}`, MCP stdio, and plan execution. Same idempotency keys, same structured envelopes, same bytes. +Four deterministic tools, exposed identically through CLI, REST `/v1/tools/{name}`, MCP stdio, and plan execution. Same idempotency keys, same structured envelopes, same bytes. 
| Tool | Answers | | ---------------------- | -------------------------------------------------------------------------------------------- | | `triage_incident` | Structured TriageReport for an open incident (blast + first failure + signals + next checks) | | `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note from a TriageReport | -| `explain_request` | Why did this specific trace fail? | -| `trace_summary` | Span tree and timing for a trace | -| `graph_failures` | Which requests are currently failing? | -| `failure_patterns` | What error codes dominate this window? | -| `blast_radius` | How many requests, users, and services does this error touch? | -| `failure_chain` | How did this failure propagate through services? | -| `graph_query` | DSL query over the graph (`expr` + `window`) | -| `compare_windows` | Diff error rates between two windows | -| `graph_insights` | Windowed rollup of top errors and patterns | -| `graph_stats` | Overall shape of the graph right now | +| `explain_request` | Trace story (per-step path, anchor, downstream) for a given `trace_id` | +| `blast_radius` | How many requests, users, and services does this error family touch in the window? 
| ```bash # Direct tool call curl -X POST http://localhost:8080/v1/tools/blast_radius \ -H "Authorization: Bearer $WAYLOG_AGENT_KEY" \ - -d '{"error_code":"PMT_502","window":"10m","include_services":true}' + -d '{"service":"payment-service","step":"charge","error_code":"PMT_502","window":"10m"}' # Built-in triage plan template — same hash as the CLI/read/tool surfaces curl -X POST http://localhost:8080/v1/plans/execute \ @@ -274,25 +279,26 @@ curl -X POST http://localhost:8080/v1/alerts \ │ │ event log (append-only WAL, SQLite cold store source of truth) (events · deployments · - │ signals · incidents · - ▼ causal claims) - derived read models - (errors · explain · blast · - recent · incidents · triage) + │ signals · incidents) + ▼ + v2 reader (in-memory hot + index over schema-2.0 WAL) │ - ├──▶ /ui dashboard (Geist, no vendored chart/topology) - ├──▶ /v1/tools/* (deterministic agent surface) - ├──▶ /v1/plans/execute (server-side plan execution + SSE) - └──▶ waylog CLI · TUI · MCP + ├──▶ /v1/errors · /v1/blast_radius · /v1/traces/* · /v1/events/* + ├──▶ incidents engine → /v1/incidents/* · /v1/triage/* + ├──▶ /ui dashboard (Geist, no vendored chart/topology) + ├──▶ /v1/tools/* (four v1.0 agent tools) + ├──▶ /v1/plans/execute (server-side plan execution + SSE) + └──▶ waylog CLI · MCP ``` - **Single binary** plus embedded SQLite. No Docker, no Kafka, no bridge. -- **WAL is source of truth.** Crash → replay on next boot rebuilds the derived read models. -- **Hot graph + dedicated trace store.** Pruned per snapshot tick to bound memory. -- **`report_hash` excludes `generated_at`, `plan_run_id`, and itself.** Same upstream state → same bytes across every surface. +- **WAL is source of truth.** Crash → replay on next boot rebuilds the v2 reader's hot index. +- **v2 reader is the only hot path.** Pruned every tick to enforce `GRAPH_HOT_WINDOW` (default 24h). 
+- **`report_hash` excludes `generated_at`, `plan_run_id`, and itself.** Same upstream state → same bytes across every surface. **`evidence_fingerprint`** complements it: stable across ticks until the incident's evidence set changes, so operators and agents can cite a triage answer durably. - **OTLP path reuses the same WAL and projector** as the SDK path. No separate ingestion plane. -Durability model, retention, merge semantics, readiness policy, and counter buffer details: [`docs/internals.md`](docs/internals.md). Full HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml). +Durability model, retention, merge semantics, readiness policy, and counter buffer details: [`docs/internals.md`](docs/internals.md). Scale ceiling and how to tune within it: [`docs/scale-and-limits.md`](docs/scale-and-limits.md). Full HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml). --- @@ -368,7 +374,6 @@ Full env-var reference: [`docs/env.md`](docs/env.md). Reproducible demo gate: `m - **OTLP/gRPC trace receiver** on `OTLP_GRPC_ADDR` (default `:4317`). - **Provider-neutral Ask** configuration: `gemini`, `anthropic`, `openai`, or `none`. All deterministic surfaces work with no LLM configured. - **`WAYLOG_PROFILE=demo|dev|prod`** gates auth defaults; `prod` hard-fails on unsafe configs. -- **`WAYLOG_V2_READS` defaults to `true`.** Set `false` only for legacy v1-only stacks. - **`/v1/insight`** retained as a compat shim returning the top active incident. New clients should use `/v1/incidents/*`. --- @@ -382,23 +387,21 @@ Public alpha for single-node production-style incident triage. 
APIs may break be - Go SDK v2 (`net/http`, chi, gin, echo) and TypeScript SDK v2 (`@waylog/sdk`, ESM, Node 18+, standalone core, Express, Hono, Next.js, NestJS) - OTLP HTTP at `/v1/otlp/v1/traces` and OTLP/gRPC at `OTLP_GRPC_ADDR` (traces only) - Durable ingest with WAL + replay -- Hot graph with flattened 3-node model + dedicated trace store -- Schema-2.0 recent-index read APIs (default) -- SQLite cold store (events, deployments, signals, incidents, causal claims) +- Schema-2.0 v2 reader powering all hot read APIs (`/v1/errors`, `/v1/blast_radius`, `/v1/traces/*`, `/v1/events/*`) +- SQLite cold store (events, deployments, signals, incidents) - Signal-correlated incident engine with stable IDs, deterministic classification, and startup hot-window rebuild from the schema-2.0 WAL - Alert intake from four webhook formats, stored as signals and correlated with active incidents - Deterministic triage report with stable hash across CLI / read endpoint / direct tool / plan template within a single engine tick - Provider-neutral Ask configuration; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured -- Twelve deterministic analysis tools, rollup-correct root-cause attribution +- Four deterministic v1.0 agent tools (`explain_request`, `blast_radius`, `triage_incident`, `render_triage_report`) - Agent-native REST with idempotency and structured envelopes -- MCP stdio, live TUI (`waylog-live --dev` streams via SSE), embedded Geist dashboard +- MCP stdio, embedded Geist dashboard - Scoped auth (write / read / agent) with startup validation and `WAYLOG_PROFILE=prod` hard-fail **Planned** - OTLP logs and metrics - Python SDK -- Resolved-incident retention janitor - Mintlify docs site --- @@ -406,15 +409,11 @@ Public alpha for single-node production-style incident triage. APIs may break be ## Known limitations - **Public alpha.** APIs may break before 1.0. Not production-ready. Not HA. 
-- **Triage report hash is stable per tick, not forever.** Hash changes when the underlying recent-index window changes (≈30 s default). Use as a short-window dedup key, not a long-term incident fingerprint. +- **Triage report hash is stable per tick, not forever.** `report_hash` changes when the underlying recent-index window changes (≈30 s default) — use it as a short-window dedup key proving all four surfaces returned the same bytes. For citations that survive across ticks, use `evidence_fingerprint`: it hashes only the evidence identity set (incident + signal + alert + runtime + trace IDs) and changes exactly when evidence is attached (see `docs/adr/0002-evidence-fingerprint.md`). - **Alerts correlate; they do not create incidents.** Incidents are opened by the spike detector. The alert path is for routing context, not paging primitives. -- **Resolved incidents are not pruned automatically.** Per the v2.1 plan, the retention janitor is deferred. Manual cleanup: - ```sql - DELETE FROM incidents WHERE status = 'resolved' AND resolved_at < datetime('now', '-7 days'); - ``` - **Stale `active` rows after long downtime.** If the WAL has rolled past an incident's `started_at` and `WAYLOG_REBUILD_INCIDENTS_ON_START=true`, the engine transitions only the stale rows to `recovering` on next start; they resolve after `WAYLOG_INCIDENT_RESOLVE_AFTER` without new evidence. - **Single-node only.** No HA, no clustering, no multi-tenant. -- **SQLite cold store** fits demos and small deployments. Postgres is not shipping. +- **SQLite cold store** fits demos and small deployments — see [Scale & limits](docs/scale-and-limits.md) for the ceiling and how to tune within it. Postgres is not shipping. - **OTLP supports traces only.** Logs and metrics are not shipping yet. - **Only Go and TypeScript SDKs today.** Python / Java / Ruby are not available. - **No outbound paging.** Waylog accepts external alerts and renders operator reports; it does not page. 
@@ -439,7 +438,7 @@ Public alpha for single-node production-style incident triage. APIs may break be ## Project layout ``` -cmd/ executable binaries (ingest, waylog, waylog-live, ...) +cmd/ executable binaries (ingest, waylog, ...) pkg/ public SDK importable by external services internal/ private implementation (auth, incidents, triage, ingest, ...) examples/ demo services + collector config + microdemo diff --git a/cmd/bridge/main.go b/cmd/bridge/main.go deleted file mode 100644 index 4b986c2..0000000 --- a/cmd/bridge/main.go +++ /dev/null @@ -1,288 +0,0 @@ -package main - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "log/slog" - "net" - "net/http" - "os" - "os/signal" - "strconv" - "strings" - "sync" - "syscall" - "time" - - "github.com/segmentio/kafka-go" - "github.com/sssmaran/WaylogCLI/internal/config" -) - -func main() { - level := parseSlogLevel(config.Getenv("LOG_LEVEL", "info")) - slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: level}))) - - ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) - defer stop() - - brokers := config.SplitEnvList("KAFKA_BROKERS") - if len(brokers) == 0 { - brokers = []string{"localhost:9092"} - } - topic := config.Getenv("KAFKA_TOPIC", "wide_events") - ingestURL := config.Getenv("INGEST_URL", "http://localhost:8080/v1/events") - groupID := config.Getenv("KAFKA_GROUP_ID", "waylog-demo-bridge") - apiKey := config.Getenv("WAYLOG_WRITE_KEY", "") - if apiKey == "" { - apiKey = config.Getenv("WAYLOG_API_KEY", "") - } - dlqDir := config.Getenv("DLQ_DIR", "./data") - - reader := kafka.NewReader(kafka.ReaderConfig{ - Brokers: brokers, - Topic: topic, - GroupID: groupID, - }) - defer reader.Close() - - dlq := newDeadLetterWriter(dlqDir) - defer dlq.Close() - - client := &http.Client{Timeout: 5 * time.Second} - - if err := ensureTopicReady(ctx, brokers, topic); err != nil { - if ctx.Err() != nil { - slog.Info("bridge shutdown during startup") - 
return - } - slog.Error("failed to ensure topic ready", "err", err) - os.Exit(1) - } - - slog.Info("bridge started", "topic", topic, "brokers", brokers, "ingest_url", ingestURL) - for { - msg, err := reader.FetchMessage(ctx) - if err != nil { - if ctx.Err() != nil { - slog.Info("bridge shutdown signal received") - return - } - slog.Warn("kafka fetch failed, retrying", "err", err) - time.Sleep(2 * time.Second) - continue - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, ingestURL, bytes.NewReader(msg.Value)) - if err != nil { - slog.Error("request build failed", "err", err, "offset", msg.Offset) - continue - } - req.Header.Set("Content-Type", "application/json") - if apiKey != "" { - req.Header.Set("Authorization", "Bearer "+apiKey) - } - - resp, err := client.Do(req) - if err != nil { - if ctx.Err() != nil { - slog.Info("bridge shutdown signal received") - return - } - // Transport error — do not commit, retry on restart - slog.Warn("ingest post failed, will retry on restart", "err", err, "offset", msg.Offset) - continue - } - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - - status := resp.StatusCode - - switch { - // 2xx — success, commit - case status >= 200 && status < 300: - commitWithTimeout(reader, msg) - - // 429 — backpressure, do not commit (retry later) - case status == http.StatusTooManyRequests: - slog.Warn("ingest returned 429, backing off", "offset", msg.Offset) - time.Sleep(2 * time.Second) - - // 5xx — transient server error, do not commit - case status >= 500: - slog.Warn("ingest returned 5xx, skipping commit for retry", "status", status, "offset", msg.Offset) - - // 4xx (except 429) — poison payload, persist to DLQ then commit - default: - slog.Error("ingest rejected message, writing to dead letter", - "status", status, "offset", msg.Offset, "partition", msg.Partition) - if err := dlq.Write(deadLetterEntry{ - Topic: msg.Topic, - Partition: msg.Partition, - Offset: msg.Offset, - Key: string(msg.Key), - StatusCode: 
status, - ResponseBody: string(respBody), - Payload: string(msg.Value), - Timestamp: time.Now().UTC(), - }); err != nil { - // DLQ write failed — do NOT commit to avoid silent message loss - slog.Error("dead letter write failed, skipping commit", "err", err, "offset", msg.Offset) - continue - } - commitWithTimeout(reader, msg) - } - } -} - -// commitWithTimeout commits the message with a dedicated 3s timeout, -// independent of the request context. -func commitWithTimeout(reader *kafka.Reader, msg kafka.Message) { - commitCtx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - if err := reader.CommitMessages(commitCtx, msg); err != nil { - slog.Warn("kafka commit failed, duplicate delivery possible", "err", err, "offset", msg.Offset) - } -} - -// --- Dead letter writer --- - -type deadLetterEntry struct { - Topic string `json:"topic"` - Partition int `json:"partition"` - Offset int64 `json:"offset"` - Key string `json:"key"` - StatusCode int `json:"status_code"` - ResponseBody string `json:"response_body"` - Payload string `json:"payload"` - Timestamp time.Time `json:"timestamp"` -} - -type deadLetterWriter struct { - mu sync.Mutex - file *os.File -} - -func newDeadLetterWriter(dir string) *deadLetterWriter { - os.MkdirAll(dir, 0o755) - name := fmt.Sprintf("%s/deadletter-%s.jsonl", dir, time.Now().UTC().Format("20060102")) - f, err := os.OpenFile(name, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) - if err != nil { - slog.Error("failed to open dead letter file", "err", err, "path", name) - return &deadLetterWriter{} - } - slog.Info("dead letter file opened", "path", name) - return &deadLetterWriter{file: f} -} - -func (d *deadLetterWriter) Write(entry deadLetterEntry) error { - d.mu.Lock() - defer d.mu.Unlock() - if d.file == nil { - return fmt.Errorf("dead letter file not open") - } - data, err := json.Marshal(entry) - if err != nil { - return err - } - data = append(data, '\n') - _, err = d.file.Write(data) - if err != nil { - 
return err - } - return d.file.Sync() -} - -func (d *deadLetterWriter) Close() { - d.mu.Lock() - defer d.mu.Unlock() - if d.file != nil { - d.file.Close() - } -} - -// --- Helpers --- - -func parseSlogLevel(s string) slog.Level { - switch strings.ToLower(s) { - case "debug": - return slog.LevelDebug - case "warn", "warning": - return slog.LevelWarn - case "error": - return slog.LevelError - default: - return slog.LevelInfo - } -} - -func ensureTopicReady(ctx context.Context, brokers []string, topic string) error { - if len(brokers) == 0 || topic == "" { - return nil - } - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - controller, err := controllerAddress(brokers[0]) - if err != nil { - slog.Warn("kafka controller not ready, retrying", "err", err) - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(2 * time.Second): - } - continue - } - - conn, err := kafka.Dial("tcp", controller) - if err != nil { - slog.Warn("kafka controller dial failed, retrying", "err", err) - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(2 * time.Second): - } - continue - } - - err = conn.CreateTopics(kafka.TopicConfig{ - Topic: topic, - NumPartitions: 1, - ReplicationFactor: 1, - }) - _ = conn.Close() - - if err != nil && !strings.Contains(err.Error(), "Topic with this name already exists") { - slog.Warn("kafka create topic failed, retrying", "err", err, "topic", topic) - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(2 * time.Second): - } - continue - } - - return nil - } -} - -func controllerAddress(broker string) (string, error) { - conn, err := kafka.Dial("tcp", broker) - if err != nil { - return "", err - } - defer conn.Close() - - controller, err := conn.Controller() - if err != nil { - return "", err - } - return net.JoinHostPort(controller.Host, strconv.Itoa(controller.Port)), nil -} diff --git a/cmd/checkout/main.go b/cmd/checkout/main.go index 38dec46..7a913ab 100644 --- 
a/cmd/checkout/main.go +++ b/cmd/checkout/main.go @@ -10,10 +10,8 @@ import ( "time" "github.com/sssmaran/WaylogCLI/internal/checkout" - "github.com/sssmaran/WaylogCLI/internal/config" waylog "github.com/sssmaran/WaylogCLI/pkg" wayloghttp "github.com/sssmaran/WaylogCLI/pkg/http" - kafkatransport "github.com/sssmaran/WaylogCLI/pkg/transport/kafka" ) type coded interface { @@ -37,18 +35,6 @@ func main() { }, } - if brokers := config.SplitEnvList("KAFKA_BROKERS"); len(brokers) > 0 { - kt, err := kafkatransport.New(kafkatransport.Config{ - Brokers: brokers, - Topic: config.Getenv("KAFKA_TOPIC", "wide_events"), - }) - if err != nil { - slog.Error("kafka transport init failed", "err", err) - os.Exit(1) - } - cfg.Transport = kt - } - err := waylog.Init(cfg) if err != nil { slog.Error("waylog init failed", "err", err) diff --git a/cmd/crux/dispatch.go b/cmd/crux/dispatch.go new file mode 100644 index 0000000..6b893a4 --- /dev/null +++ b/cmd/crux/dispatch.go @@ -0,0 +1,148 @@ +package main + +import ( + "fmt" + "io" + "net/url" + "strings" + "unicode" + + cliv2 "github.com/sssmaran/WaylogCLI/internal/cli/v2" +) + +type ResultKind int + +const ( + ResultOK ResultKind = iota + ResultNoop + ResultExit + ResultUnknown + ResultUsage + ResultError +) + +type Result struct { + Kind ResultKind + ExitCode int +} + +type runCLIFunc func(args []string, stdin io.Reader, stdout, stderr io.Writer) int + +type Dispatcher struct { + ingestURL string + globalArgs []string + stdin io.Reader + runCLI runCLIFunc + openBrowser func(url string) error +} + +func NewDispatcher(ingestURL string, globalArgs []string, stdin io.Reader) *Dispatcher { + if stdin == nil { + stdin = strings.NewReader("") + } + return &Dispatcher{ + ingestURL: normalizeIngestURL(ingestURL), + globalArgs: append([]string(nil), globalArgs...), + stdin: stdin, + runCLI: cliv2.RunCLI, + openBrowser: openBrowser, + } +} + +func parseLine(line string) []string { + line = strings.TrimSpace(line) + if line == "" { + return nil + } 
+ var out []string + var cur strings.Builder + inQuote := false + for _, r := range line { + switch { + case r == '"': + inQuote = !inQuote + case unicode.IsSpace(r) && !inQuote: + if cur.Len() > 0 { + out = append(out, cur.String()) + cur.Reset() + } + default: + cur.WriteRune(r) + } + } + if cur.Len() > 0 { + out = append(out, cur.String()) + } + return out +} + +func (d *Dispatcher) Dispatch(line string, stdout, stderr io.Writer) Result { + tokens := parseLine(line) + return d.DispatchTokens(tokens, stdout, stderr) +} + +func (d *Dispatcher) DispatchTokens(tokens []string, stdout, stderr io.Writer) Result { + if len(tokens) == 0 { + return Result{Kind: ResultNoop} + } + cmd, rest := tokens[0], tokens[1:] + switch cmd { + case "exit", "quit": + return Result{Kind: ResultExit} + case "help": + printHelp(stdout) + return Result{Kind: ResultOK} + case "open": + return d.openIncident(rest, stdout, stderr) + case "status", "incidents", "incident", "triage", "blast", "explain", "recent", "errors", "event", "trace", "search", "capabilities": + args := d.cliArgs(cmd, rest) + code := d.runCLI(args, d.stdin, stdout, stderr) + if code != 0 { + return Result{Kind: ResultError, ExitCode: code} + } + return Result{Kind: ResultOK} + default: + fmt.Fprintf(stderr, "unknown command: %s (type 'help' for commands)\n", cmd) + return Result{Kind: ResultUnknown} + } +} + +func (d *Dispatcher) openIncident(args []string, stdout, stderr io.Writer) Result { + if len(args) != 1 { + fmt.Fprintln(stderr, "usage: open ") + return Result{Kind: ResultUsage} + } + target := strings.TrimRight(d.ingestURL, "/") + "/ui/#/incident/" + url.PathEscape(args[0]) + if err := d.openBrowser(target); err != nil { + fmt.Fprintf(stderr, "open: %v\n", err) + fmt.Fprintf(stdout, "%s\n", target) + return Result{Kind: ResultError, ExitCode: 1} + } + fmt.Fprintf(stdout, "opened %s\n", target) + return Result{Kind: ResultOK} +} + +func (d *Dispatcher) cliArgs(cmd string, rest []string) []string { + if cmd == 
"status" { + cmd = "capabilities" + } + args := append([]string(nil), d.globalArgs...) + args = append(args, cmd) + args = append(args, rest...) + return args +} + +func printHelp(w io.Writer) { + fmt.Fprintln(w, "Commands:") + fmt.Fprintln(w, " status show runtime capabilities") + fmt.Fprintln(w, " incidents list active incidents") + fmt.Fprintln(w, " incident show incident detail") + fmt.Fprintln(w, " open open incident in the dashboard") + fmt.Fprintln(w, " triage print deterministic triage report") + fmt.Fprintln(w, " blast :: show blast radius") + fmt.Fprintln(w, " explain explain a trace or event") + fmt.Fprintln(w, " recent [flags] show recent traces") + fmt.Fprintln(w, " errors [flags] show error families") + fmt.Fprintln(w, " help show this help") + fmt.Fprintln(w, " exit | quit leave the shell") +} diff --git a/cmd/crux/dispatch_test.go b/cmd/crux/dispatch_test.go new file mode 100644 index 0000000..45ef7bb --- /dev/null +++ b/cmd/crux/dispatch_test.go @@ -0,0 +1,134 @@ +package main + +import ( + "bytes" + "io" + "reflect" + "strings" + "testing" +) + +func TestParseLine(t *testing.T) { + cases := []struct { + in string + want []string + }{ + {"", nil}, + {" ", nil}, + {"incidents", []string{"incidents"}}, + {"triage inc_abc", []string{"triage", "inc_abc"}}, + {"blast checkout:payment.charge:PMT_502", []string{"blast", "checkout:payment.charge:PMT_502"}}, + {`explain "trace 1"`, []string{"explain", "trace 1"}}, + {" triage inc_abc ", []string{"triage", "inc_abc"}}, + } + for _, c := range cases { + if got := parseLine(c.in); !reflect.DeepEqual(got, c.want) { + t.Fatalf("parseLine(%q) = %#v, want %#v", c.in, got, c.want) + } + } +} + +func TestDispatchBuiltins(t *testing.T) { + var out, errOut bytes.Buffer + d := newTestDispatcher() + if res := d.Dispatch("", &out, &errOut); res.Kind != ResultNoop { + t.Fatalf("empty result = %+v, want noop", res) + } + if res := d.Dispatch("help", &out, &errOut); res.Kind != ResultOK { + t.Fatalf("help result = %+v, want 
ok", res) + } + if !strings.Contains(out.String(), "Commands:") || !strings.Contains(out.String(), "incidents") { + t.Fatalf("help output = %q", out.String()) + } + if res := d.Dispatch("exit", &out, &errOut); res.Kind != ResultExit { + t.Fatalf("exit result = %+v, want exit", res) + } + if res := d.Dispatch("quit", &out, &errOut); res.Kind != ResultExit { + t.Fatalf("quit result = %+v, want exit", res) + } +} + +func TestDispatchUnknownCommand(t *testing.T) { + var out, errOut bytes.Buffer + d := newTestDispatcher() + res := d.Dispatch("frobnicate", &out, &errOut) + if res.Kind != ResultUnknown { + t.Fatalf("result = %+v, want unknown", res) + } + if !strings.Contains(errOut.String(), "unknown command") { + t.Fatalf("stderr = %q", errOut.String()) + } +} + +func TestDispatchCLIWrappedPassesArgs(t *testing.T) { + var out, errOut bytes.Buffer + var captured []string + d := newTestDispatcher() + d.globalArgs = []string{"--addr", "http://example.test"} + d.runCLI = func(args []string, _ io.Reader, _, _ io.Writer) int { + captured = append([]string(nil), args...) + return 0 + } + res := d.Dispatch("triage inc_abc", &out, &errOut) + if res.Kind != ResultOK { + t.Fatalf("result = %+v, want ok", res) + } + want := []string{"--addr", "http://example.test", "triage", "inc_abc"} + if !reflect.DeepEqual(captured, want) { + t.Fatalf("args = %#v, want %#v", captured, want) + } +} + +func TestDispatchStatusAliasesCapabilities(t *testing.T) { + var out, errOut bytes.Buffer + var captured []string + d := newTestDispatcher() + d.runCLI = func(args []string, _ io.Reader, _, _ io.Writer) int { + captured = append([]string(nil), args...) 
+ return 0 + } + res := d.Dispatch("status", &out, &errOut) + if res.Kind != ResultOK { + t.Fatalf("result = %+v, want ok", res) + } + if len(captured) == 0 || captured[len(captured)-1] != "capabilities" { + t.Fatalf("status args = %#v, want capabilities", captured) + } +} + +func TestDispatchOpen(t *testing.T) { + var out, errOut bytes.Buffer + var opened string + d := newTestDispatcher() + d.openBrowser = func(target string) error { + opened = target + return nil + } + res := d.Dispatch("open inc_abc", &out, &errOut) + if res.Kind != ResultOK { + t.Fatalf("result = %+v, want ok", res) + } + if opened != "http://localhost:8080/ui/#/incident/inc_abc" { + t.Fatalf("opened = %q", opened) + } +} + +func TestDispatchOpenRequiresID(t *testing.T) { + var out, errOut bytes.Buffer + d := newTestDispatcher() + res := d.Dispatch("open", &out, &errOut) + if res.Kind != ResultUsage { + t.Fatalf("result = %+v, want usage", res) + } + if !strings.Contains(errOut.String(), "usage: open ") { + t.Fatalf("stderr = %q", errOut.String()) + } +} + +func newTestDispatcher() *Dispatcher { + return &Dispatcher{ + ingestURL: "http://localhost:8080", + runCLI: func(_ []string, _ io.Reader, _, _ io.Writer) int { return 0 }, + openBrowser: func(_ string) error { return nil }, + } +} diff --git a/cmd/crux/firstrun.go b/cmd/crux/firstrun.go new file mode 100644 index 0000000..e1de7bf --- /dev/null +++ b/cmd/crux/firstrun.go @@ -0,0 +1,48 @@ +package main + +import ( + "fmt" + "os" + "time" + + "github.com/sssmaran/WaylogCLI/internal/firstrun" +) + +// runFirstRun parses `crux first-run [--requests N] [--timeout DUR]` and runs it. 
+func runFirstRun(args []string) int { + opt := firstrun.Options{Stdout: os.Stdout, Stderr: os.Stderr} + for i := 0; i < len(args); i++ { + switch args[i] { + case "--requests": + if i+1 >= len(args) { + fmt.Fprintln(os.Stderr, "usage: crux first-run [--requests N] [--timeout DUR]") + return 2 + } + if _, err := fmt.Sscanf(args[i+1], "%d", &opt.Requests); err != nil { + fmt.Fprintf(os.Stderr, "invalid --requests: %v\n", err) + return 2 + } + i++ + case "--timeout": + if i+1 >= len(args) { + fmt.Fprintln(os.Stderr, "usage: crux first-run [--requests N] [--timeout DUR]") + return 2 + } + d, err := time.ParseDuration(args[i+1]) + if err != nil { + fmt.Fprintf(os.Stderr, "invalid --timeout: %v\n", err) + return 2 + } + opt.Timeout = d + i++ + default: + fmt.Fprintf(os.Stderr, "unknown flag: %s\n", args[i]) + return 2 + } + } + if err := firstrun.Run(opt); err != nil { + fmt.Fprintf(os.Stderr, "first-run failed: %v\n", err) + return 1 + } + return 0 +} diff --git a/cmd/crux/main.go b/cmd/crux/main.go new file mode 100644 index 0000000..94dfd0d --- /dev/null +++ b/cmd/crux/main.go @@ -0,0 +1,131 @@ +package main + +import ( + "bufio" + "fmt" + "io" + "os" + "strings" + + cliv2 "github.com/sssmaran/WaylogCLI/internal/cli/v2" +) + +func main() { + globalArgs, rest, ingestURL := splitCruxArgs(os.Args[1:]) + if len(rest) > 0 { + if rest[0] == "first-run" { + os.Exit(runFirstRun(rest[1:])) + } + if rest[0] == "open" { + disp := NewDispatcher(ingestURL, globalArgs, os.Stdin) + res := disp.DispatchTokens(rest, os.Stdout, os.Stderr) + if res.Kind == ResultError || res.Kind == ResultUsage || res.Kind == ResultUnknown { + if res.ExitCode != 0 { + os.Exit(res.ExitCode) + } + os.Exit(1) + } + return + } + os.Exit(cliv2.RunCLI(os.Args[1:], os.Stdin, os.Stdout, os.Stderr)) + } + os.Exit(runREPL(os.Stdin, os.Stdout, os.Stderr, ingestURL, globalArgs)) +} + +func runREPL(stdin io.Reader, stdout, stderr io.Writer, ingestURL string, globalArgs []string) int { + disp := 
NewDispatcher(ingestURL, globalArgs, stdin) + reader := bufio.NewReader(stdin) + fmt.Fprintln(stdout, "Crux") + fmt.Fprintf(stdout, "Connected: %s\n", disp.ingestURL) + fmt.Fprintln(stdout, "Type help for commands. Type exit to leave.") + fmt.Fprintln(stdout) + for { + fmt.Fprint(stdout, "crux> ") + line, err := reader.ReadString('\n') + if err != nil { + fmt.Fprintln(stdout) + return 0 + } + res := disp.Dispatch(line, stdout, stderr) + switch res.Kind { + case ResultExit: + fmt.Fprintln(stdout, "bye") + return 0 + case ResultError: + if res.ExitCode != 0 { + fmt.Fprintf(stderr, "(exit %d)\n", res.ExitCode) + } + } + } +} + +func splitCruxArgs(args []string) (globalArgs []string, rest []string, ingestURL string) { + ingestURL = "" + readKey := "" + for i := 0; i < len(args); i++ { + arg := args[i] + switch { + case arg == "--addr" || arg == "--api-key" || arg == "--timeout": + if i+1 >= len(args) { + rest = append(rest, arg) + continue + } + value := args[i+1] + switch arg { + case "--addr": + ingestURL = value + globalArgs = append(globalArgs, "--addr", normalizeIngestURL(value)) + case "--api-key": + readKey = value + globalArgs = append(globalArgs, "--api-key", value) + case "--timeout": + globalArgs = append(globalArgs, "--timeout", value) + } + i++ + case strings.HasPrefix(arg, "--addr="): + value := strings.TrimPrefix(arg, "--addr=") + ingestURL = value + globalArgs = append(globalArgs, "--addr", normalizeIngestURL(value)) + case strings.HasPrefix(arg, "--api-key="): + value := strings.TrimPrefix(arg, "--api-key=") + readKey = value + globalArgs = append(globalArgs, "--api-key", value) + case strings.HasPrefix(arg, "--timeout="): + globalArgs = append(globalArgs, "--timeout", strings.TrimPrefix(arg, "--timeout=")) + default: + rest = append(rest, arg) + } + } + if ingestURL == "" { + ingestURL = resolveIngestURL(nil) + globalArgs = append([]string{"--addr", normalizeIngestURL(ingestURL)}, globalArgs...) 
+ } + if readKey == "" { + if key := os.Getenv("WAYLOG_READ_KEY"); key != "" { + globalArgs = append(globalArgs, "--api-key", key) + } + } + return globalArgs, rest, ingestURL +} + +func resolveIngestURL(args []string) string { + for i := 0; i < len(args); i++ { + switch { + case args[i] == "--addr" && i+1 < len(args): + return args[i+1] + case strings.HasPrefix(args[i], "--addr="): + return strings.TrimPrefix(args[i], "--addr=") + } + } + if v := os.Getenv("INGEST_ADDR"); v != "" { + return v + } + if v := os.Getenv("INGEST_URL"); v != "" { + return v + } + return "http://localhost:8080" +} + +func normalizeIngestURL(raw string) string { + return cliv2.NormalizeBaseURL(raw) +} diff --git a/cmd/crux/main_test.go b/cmd/crux/main_test.go new file mode 100644 index 0000000..3150e33 --- /dev/null +++ b/cmd/crux/main_test.go @@ -0,0 +1,46 @@ +package main + +import ( + "reflect" + "testing" +) + +func TestSplitCruxArgs_REPLWithGlobalFlags(t *testing.T) { + global, rest, ingestURL := splitCruxArgs([]string{"--addr", ":8081", "--api-key", "demo", "--timeout", "30s"}) + if ingestURL != ":8081" { + t.Fatalf("ingestURL = %q, want :8081", ingestURL) + } + if len(rest) != 0 { + t.Fatalf("rest = %#v, want empty", rest) + } + wantGlobal := []string{"--addr", "http://localhost:8081", "--api-key", "demo", "--timeout", "30s"} + if !reflect.DeepEqual(global, wantGlobal) { + t.Fatalf("global = %#v, want %#v", global, wantGlobal) + } +} + +func TestSplitCruxArgs_PreservesTimeoutEqualsForm(t *testing.T) { + global, rest, _ := splitCruxArgs([]string{"--timeout=30s"}) + if len(rest) != 0 { + t.Fatalf("rest = %#v, want empty", rest) + } + found := false + for i := 0; i < len(global)-1; i++ { + if global[i] == "--timeout" && global[i+1] == "30s" { + found = true + } + } + if !found { + t.Fatalf("global = %#v, missing --timeout 30s", global) + } +} + +func TestSplitCruxArgs_CommandModePreservesCommand(t *testing.T) { + global, rest, _ := 
splitCruxArgs([]string{"--addr=http://localhost:8080", "incidents"}) + if len(global) < 2 || global[0] != "--addr" { + t.Fatalf("global = %#v", global) + } + if !reflect.DeepEqual(rest, []string{"incidents"}) { + t.Fatalf("rest = %#v, want incidents", rest) + } +} diff --git a/cmd/crux/open.go b/cmd/crux/open.go new file mode 100644 index 0000000..ac52151 --- /dev/null +++ b/cmd/crux/open.go @@ -0,0 +1,30 @@ +package main + +import ( + "fmt" + "os/exec" + "runtime" +) + +func browserCommandFor(goos string) (string, []string) { + switch goos { + case "darwin": + return "open", nil + case "linux": + return "xdg-open", nil + case "windows": + return "cmd", []string{"/c", "start"} + default: + return "", nil + } +} + +func openBrowser(target string) error { + cmd, prefix := browserCommandFor(runtime.GOOS) + if cmd == "" { + return fmt.Errorf("auto-open unsupported on %s", runtime.GOOS) + } + args := append([]string(nil), prefix...) + args = append(args, target) + return exec.Command(cmd, args...).Start() +} diff --git a/cmd/crux/open_test.go b/cmd/crux/open_test.go new file mode 100644 index 0000000..e047fcd --- /dev/null +++ b/cmd/crux/open_test.go @@ -0,0 +1,25 @@ +package main + +import ( + "reflect" + "testing" +) + +func TestBrowserCommandFor(t *testing.T) { + cases := []struct { + goos string + wantCmd string + wantArgs []string + }{ + {goos: "darwin", wantCmd: "open"}, + {goos: "linux", wantCmd: "xdg-open"}, + {goos: "windows", wantCmd: "cmd", wantArgs: []string{"/c", "start"}}, + {goos: "plan9"}, + } + for _, c := range cases { + gotCmd, gotArgs := browserCommandFor(c.goos) + if gotCmd != c.wantCmd || !reflect.DeepEqual(gotArgs, c.wantArgs) { + t.Fatalf("browserCommandFor(%q) = (%q, %#v), want (%q, %#v)", c.goos, gotCmd, gotArgs, c.wantCmd, c.wantArgs) + } + } +} diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go index e9c9ea1..f07f6bc 100644 --- a/cmd/ingest/main.go +++ b/cmd/ingest/main.go @@ -4,13 +4,11 @@ import ( "bufio" "context" "errors" - "fmt" 
"log/slog" "net" "net/http" "os" "os/signal" - "path/filepath" "strings" "syscall" "time" @@ -25,9 +23,6 @@ import ( "github.com/sssmaran/WaylogCLI/internal/detect" "github.com/sssmaran/WaylogCLI/internal/eventlog" eventlogv2 "github.com/sssmaran/WaylogCLI/internal/eventlog/v2" - "github.com/sssmaran/WaylogCLI/internal/graph/causal" - "github.com/sssmaran/WaylogCLI/internal/graph/core" - graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store" "github.com/sssmaran/WaylogCLI/internal/incidents" "github.com/sssmaran/WaylogCLI/internal/ingest" ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2" @@ -35,10 +30,9 @@ import ( "github.com/sssmaran/WaylogCLI/internal/mcp/stdio" "github.com/sssmaran/WaylogCLI/internal/metrics" otelhttp "github.com/sssmaran/WaylogCLI/internal/otel" - "github.com/sssmaran/WaylogCLI/internal/persist" + "github.com/sssmaran/WaylogCLI/internal/ratelimit" "github.com/sssmaran/WaylogCLI/internal/signals" "github.com/sssmaran/WaylogCLI/internal/tools" - "github.com/sssmaran/WaylogCLI/internal/tracestore" "github.com/sssmaran/WaylogCLI/internal/triage" "github.com/sssmaran/WaylogCLI/internal/triagehttp" apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2" @@ -47,57 +41,22 @@ import ( "google.golang.org/grpc" ) -var graphStore *graphstore.Store - func main() { level := parseSlogLevel(config.Getenv("LOG_LEVEL", "info")) slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: level}))) addr := config.Getenv("INGEST_ADDR", ":8080") - // ---------------- Graph persistence config ---------------- + // ---------------- Hot-window config ---------------- - snapshotPath := config.Getenv("SNAPSHOT_PATH", "./data/graph_snapshot.json") - snapshotEvery := config.GetenvInt("SNAPSHOT_EVERY_SEC", 5) - snapshotLogEvery := config.GetenvInt("SNAPSHOT_LOG_EVERY", 1) - graphHotWindow := config.GetenvDuration("GRAPH_HOT_WINDOW", 0) - if graphHotWindow == 0 { - graphHotWindow = config.GetenvDuration("GRAPH_RETENTION", 24*time.Hour) - 
} + tickEvery := config.GetenvInt("PRUNE_TICK_SEC", 5) + graphHotWindow := config.GetenvDuration("GRAPH_HOT_WINDOW", 24*time.Hour) if graphHotWindow <= 0 { slog.Error("GRAPH_HOT_WINDOW must be positive", "value", graphHotWindow) os.Exit(1) } mcpStdio := config.GetenvBool("MCP_STDIO", false) - graphStore = graphstore.NewStore() - traceStore := tracestore.NewStore() - var snapshotSavedAt time.Time - - // Restore snapshot (non-fatal). On corrupt/missing snapshot the server - // starts with an empty graph and re-establishes persistence once it - // has data. persist.Save backs up the previous file as .bak before - // overwriting, so corrupt snapshots are never lost. - if snap, source, err := persist.LoadWithSource(snapshotPath); err == nil { - graphStore.Restore(snap.Graph) - snapshotSavedAt = snap.SavedAt - slog.Info("snapshot loaded", - "path", snapshotPath, - "nodes", snap.NodeCount, - "edges", snap.EdgeCount, - "saved_at", snap.SavedAt.Format(time.RFC3339), - ) - if source == "backup" { - slog.Info("snapshot loaded from backup", "path", snapshotPath+".bak") - } - } else if errors.Is(err, persist.ErrSnapshotMissing) { - slog.Info("no snapshot found, starting fresh") - } else if errors.Is(err, persist.ErrSnapshotVersionMismatch) { - slog.Warn("snapshot version incompatible, replaying from event log", "path", snapshotPath, "err", err) - } else { - slog.Warn("snapshot load failed, starting with empty graph", "err", err) - } - authCfg, err := auth.ParseConfig(map[string]string{ "WAYLOG_API_KEY": os.Getenv("WAYLOG_API_KEY"), "WAYLOG_WRITE_KEY": os.Getenv("WAYLOG_WRITE_KEY"), @@ -111,6 +70,9 @@ func main() { slog.Error("auth config error", "err", err) os.Exit(1) } + for _, w := range authCfg.WeakKeyWarnings() { + slog.Warn("insecure auth configuration", "detail", w) + } var sm *auth.SessionManager if authCfg.DashboardMode != "off" { @@ -132,14 +94,12 @@ func main() { dashboardRefreshSec := config.GetenvInt("DASHBOARD_REFRESH_SEC", 10) prometheusURL := 
config.Getenv("PROMETHEUS_URL", "") grafanaURL := config.Getenv("GRAFANA_URL", "") - graphUI := config.GetenvBool("GRAPH_UI", false) otlpEnabled := config.GetenvBool("OTLP_ENABLED", true) otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317") if authCfg.Profile == auth.ProfileProd && otlpEnabled && len(authCfg.WriteKeys) == 0 { slog.Error("WAYLOG_PROFILE=prod with OTLP enabled requires WAYLOG_WRITE_KEY — refusing to boot with unauthenticated OTLP") os.Exit(1) } - v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", true) signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour) alertMatchWindow := config.GetenvDuration("ALERT_MATCH_WINDOW", 15*time.Minute) if alertMatchWindow <= 0 { @@ -154,6 +114,7 @@ func main() { Window: config.GetenvDuration("WAYLOG_INCIDENT_WINDOW", 10*time.Minute), MinCount: config.GetenvInt("WAYLOG_INCIDENT_MIN_COUNT", 5), MinLift: config.GetenvFloat("WAYLOG_INCIDENT_MIN_LIFT", 3.0), + MinRate: config.GetenvFloat("WAYLOG_INCIDENT_MIN_RATE", 0), ResolveAfter: config.GetenvDuration("WAYLOG_INCIDENT_RESOLVE_AFTER", 2*time.Minute), DeployCorrelationWindow: config.GetenvDuration("WAYLOG_DEPLOY_CORRELATION_WINDOW", 15*time.Minute), SampleLimit: config.GetenvInt("WAYLOG_INCIDENT_SAMPLE_LIMIT", 5), @@ -163,9 +124,6 @@ func main() { os.Exit(1) } - causalEnabled := config.GetenvBool("CAUSAL_ENABLED", false) - causalInterval := config.GetenvDuration("CAUSAL_INTERVAL", 30*time.Second) - trustProxy := config.GetenvBool("WAYLOG_TRUST_PROXY", false) if _, err := llm.SelectFromEnv(); err != nil { slog.Error("LLM provider config error", "err", err) @@ -176,10 +134,6 @@ func main() { planStore := ingest.NewPlanStore() reg := tools.NewRegistry() - if err := tools.RegisterGraphTools(reg); err != nil { - slog.Error("mcp tools init failed", "err", err) - os.Exit(1) - } // Prometheus metrics promReg := prometheus.NewRegistry() @@ -192,7 +146,7 @@ func main() { slog.Error("EVENT_LOG_RETENTION must be positive", "value", 
eventLogRetention) os.Exit(1) } - eventLogV2Dir := config.Getenv("EVENT_LOG_V2_DIR", defaultEventLogV2Dir(eventLogDir)) + eventLogV2Dir := eventlogv2.ResolveDir(os.Getenv("EVENT_LOG_V2_DIR"), eventLogDir) v2Wal, err := eventlogv2.New(eventLogV2Dir, eventlogv2.WithSync(eventLogSync), eventlogv2.WithMaxBytes(eventLogMaxMB*1024*1024), @@ -257,8 +211,6 @@ func main() { // Create ingest server with the store ingestServer := ingest.NewServer(ingest.ServerConfig{ - Store: graphStore, - TraceStore: traceStore, MaxBodyBytes: maxBody, EventLogDir: eventLogDir, Metrics: m, @@ -269,7 +221,6 @@ func main() { DashboardRefreshSec: dashboardRefreshSec, PrometheusURL: prometheusURL, GrafanaURL: grafanaURL, - GraphUI: graphUI, DedupCache: dedupCache, AgentKey: agentKey, TrustProxy: trustProxy, @@ -280,17 +231,12 @@ func main() { OTLPEnabled: otlpEnabled, OTLPGRPCEnabled: otlpEnabled && otlpGRPCAddr != "", OTLPGRPCAddr: otlpGRPCAddr, - V2ReadsEnabled: v2ReadsEnabled, - IncidentsEnabled: v2ReadsEnabled && incidentsEnabled && sqlitePath != "", - IncidentsPersistent: v2ReadsEnabled && incidentsEnabled && sqlitePath != "", - IncidentRebuildSupported: v2ReadsEnabled && incidentsEnabled && sqlitePath != "", + IncidentsEnabled: incidentsEnabled && sqlitePath != "", + IncidentsPersistent: incidentsEnabled && sqlitePath != "", + IncidentRebuildSupported: incidentsEnabled && sqlitePath != "", Profile: authCfg.Profile, }) - // SSE hub for real-time dashboard updates - sseHub := ingest.NewSSEHub(config.GetenvInt("SSE_MAX_CLIENTS", 100)) - ingestServer.SetSSEHub(sseHub) - // Optional append-only v1 event log var el *eventlog.Writer if eventLogDir != "" { @@ -311,63 +257,8 @@ func main() { "max_file_mb", eventLogMaxMB, "retention", eventLogRetention, ) - - // Replay WAL to rebuild derived views. - // - // The graph snapshot covers nodes/edges, so the graph only needs - // entries newer than snapshotSavedAt. 
The trace store is NOT - // snapshotted, so it must be rebuilt from the full hot window - // to restore drill-down data (trace_summary, story, topology). - m.ReplayInProgress.Set(1) - traceReplayAfter := time.Now().Add(-graphHotWindow) - replayAfter := snapshotSavedAt - if traceReplayAfter.Before(replayAfter) { - replayAfter = traceReplayAfter - } - entries, replayErr := eventlog.ReadDir(eventLogDir, replayAfter) - if replayErr != nil { - slog.Warn("event log replay failed", "err", replayErr) - m.ReplayFailuresTotal.Inc() - } else if len(entries) > 0 { - replayedGraph, replayedTrace := 0, 0 - for i := range entries { - m.ReplayLagSeconds.Set(time.Since(entries[i].LoggedAt).Seconds()) - if !entries[i].SampledInGraph { - continue - } - result := ingestServer.Builder().BuildResult(entries[i].Event) - - // Graph: only merge entries newer than the snapshot. - if entries[i].LoggedAt.After(snapshotSavedAt) { - graphStore.Merge(result.Graph) - replayedGraph++ - } - - // Trace store: merge everything in the hot window. 
- if result.Span != nil { - traceStore.Upsert(entries[i].Event.Request.TraceID, core.ID("request", entries[i].Event.Request.TraceID), result.Span) - replayedTrace++ - } - } - m.TraceStoreRecords.Set(float64(traceStore.Count())) - m.TraceStoreSpans.Set(float64(traceStore.SpanCount())) - m.TraceStoreCohorts.Set(float64(traceStore.CohortCount())) - slog.Info("event log replay complete", - "total", len(entries), - "graph_replayed", replayedGraph, - "trace_replayed", replayedTrace, - ) - } - m.ReplayLagSeconds.Set(0) - m.ReplayInProgress.Set(0) - ingestServer.SetReplayResult(replayErr) - ingestServer.SetReady() - } else { - ingestServer.SetReady() } - - // Set default store for CLI - cli.SetDefaultStore(graphStore) + ingestServer.SetReady() // ---------------- HTTP server ---------------- @@ -378,9 +269,22 @@ func main() { corsOrigin := config.Getenv("CORS_ORIGIN", "*") - writeAuth := auth.Middleware("write", authCfg.WriteKeys, nil) - readAuth := auth.Middleware("read", authCfg.ReadKeys, sessionCheck) - agentAuth := auth.Middleware("agent", authCfg.AgentKeys, nil) + // Per-key rate limits run outermost (before auth) so floods of invalid + // credentials are throttled too. 0 disables; only prod limits by default. 
+ defWriteRPS, defReadRPS, defAgentRPS := 0, 0, 0 + if authCfg.Profile == auth.ProfileProd { + defWriteRPS, defReadRPS, defAgentRPS = 1000, 200, 50 + } + writeLimit := ratelimit.Middleware(ratelimit.New(config.GetenvInt("WAYLOG_RATE_LIMIT_WRITE_RPS", defWriteRPS)), "write", m) + readLimit := ratelimit.Middleware(ratelimit.New(config.GetenvInt("WAYLOG_RATE_LIMIT_READ_RPS", defReadRPS)), "read", m) + agentLimit := ratelimit.Middleware(ratelimit.New(config.GetenvInt("WAYLOG_RATE_LIMIT_AGENT_RPS", defAgentRPS)), "agent", m) + + writeKeyAuth := auth.Middleware("write", authCfg.WriteKeys, nil) + readKeyAuth := auth.Middleware("read", authCfg.ReadKeys, sessionCheck) + agentKeyAuth := auth.Middleware("agent", authCfg.AgentKeys, nil) + writeAuth := func(h http.Handler) http.Handler { return writeLimit(writeKeyAuth(h)) } + readAuth := func(h http.Handler) http.Handler { return readLimit(readKeyAuth(h)) } + agentAuth := func(h http.Handler) http.Handler { return agentLimit(agentKeyAuth(h)) } dashGate := auth.DashboardGate(authCfg, sm) mux := http.NewServeMux() @@ -411,7 +315,13 @@ func main() { // OTLP/HTTP traces reuse the same schema-2.0 WAL and projector as the SDK path. var otlpGRPCServer *grpc.Server if otlpEnabled { - otlpHandler := otelhttp.NewHandler(eventsV2, m, maxBody) + // A service.version change observed on OTLP spans auto-registers a + // deployment so deploy correlation works without the deploy webhook. 
+ var deployTracker *otelhttp.DeployTracker + if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok { + deployTracker = otelhttp.NewDeployTracker(sqlite) + } + otlpHandler := otelhttp.NewHandler(eventsV2, m, maxBody, deployTracker) mux.Handle("/v1/otlp/v1/traces", writeAuth(http.HandlerFunc(otlpHandler.ServeHTTP))) slog.Info("otlp enabled", "endpoint", "/v1/otlp/v1/traces") if otlpGRPCAddr != "" { @@ -419,7 +329,7 @@ func main() { grpc.UnaryInterceptor(otelhttp.AuthUnaryInterceptor(authCfg.WriteKeys)), grpc.MaxRecvMsgSize(int(maxBody)), ) - coltracepb.RegisterTraceServiceServer(otlpGRPCServer, otelhttp.NewTraceServiceServer(eventsV2, m, maxBody)) + coltracepb.RegisterTraceServiceServer(otlpGRPCServer, otelhttp.NewTraceServiceServer(eventsV2, m, maxBody, deployTracker)) ingestServer.SetOTLPGRPC(true, otlpGRPCAddr) } } @@ -430,192 +340,194 @@ func main() { return http.HandlerFunc(ingest.CORSWrap(corsOrigin, "GET, OPTIONS", func(w http.ResponseWriter, r *http.Request) { inner.ServeHTTP(w, r) })) } - mux.Handle("/v1/overview", readCORS(ingestServer.Overview)) - var v2Reader *ingestv2.Reader var incidentEngine *incidents.Engine incidentRunning := false - if v2ReadsEnabled { - v2Reader = ingestv2.NewReader(v2Index) - v2ReadHandler := ingestv2.NewReadHandler(v2Reader, m, graphHotWindow) - mux.Handle("/v1/events/search", readCORS(v2ReadHandler.EventSearch)) - mux.Handle("/v1/errors", readCORS(v2ReadHandler.Errors)) - mux.Handle("/v1/blast_radius", readCORS(v2ReadHandler.BlastRadius)) - mux.Handle("/v1/traces/story", readCORS(v2ReadHandler.TraceStory)) - mux.Handle("/v1/traces/recent", readCORS(v2ReadHandler.RecentTraces)) - // ServeMux chooses the longest matching pattern, so these prefix handlers - // do not capture the concrete routes above or /v1/events/validate. 
- mux.Handle("/v1/events/", readCORS(v2ReadHandler.EventByID)) - mux.Handle("/v1/traces/", readCORS(v2ReadHandler.TraceByID)) - slog.Info("v2 read endpoints enabled") - if incidentsEnabled { - if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok { - incidentStore := coldstore.NewIncidentStore(sqlite) - incReader := incidentReaderAdapter{reader: v2Reader} - incidentEngine = incidents.NewEngine( - incReader, - signalStore, - coldDeployAdapter{store: sqlite}, - incidentStore, - incidentCfg, - m, - slog.Default(), - ) - if err := incidentEngine.Bootstrap(context.Background()); err != nil { - slog.Error("incident engine bootstrap failed", "err", err) - os.Exit(1) + v2Reader := ingestv2.NewReader(v2Index) + v2ReadHandler := ingestv2.NewReadHandler(v2Reader, m, graphHotWindow) + mux.Handle("/v1/events/search", readCORS(v2ReadHandler.EventSearch)) + mux.Handle("/v1/errors", readCORS(v2ReadHandler.Errors)) + mux.Handle("/v1/blast_radius", readCORS(v2ReadHandler.BlastRadius)) + mux.Handle("/v1/traces/story", readCORS(v2ReadHandler.TraceStory)) + mux.Handle("/v1/traces/recent", readCORS(v2ReadHandler.RecentTraces)) + // ServeMux chooses the longest matching pattern, so these prefix handlers + // do not capture the concrete routes above or /v1/events/validate. + mux.Handle("/v1/events/", readCORS(v2ReadHandler.EventByID)) + mux.Handle("/v1/traces/", readCORS(v2ReadHandler.TraceByID)) + // Register the reader-backed explain_request + blast_radius. The + // triage_incident + render_triage_report pair registers later once + // triage.Engine exists. 
+ { + v2ToolReader := incidentReaderAdapter{reader: v2Reader} + if err := tools.RegisterExplainRequestTool(reg, v2ToolReader); err != nil { + slog.Error("register explain_request v2", "err", err) + os.Exit(1) + } + if err := tools.RegisterBlastRadiusTool(reg, v2ToolReader); err != nil { + slog.Error("register blast_radius v2", "err", err) + os.Exit(1) + } + } + if incidentsEnabled { + if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok { + incidentStore := coldstore.NewIncidentStore(sqlite) + incReader := incidentReaderAdapter{reader: v2Reader} + incidentEngine = incidents.NewEngine( + incReader, + signalStore, + coldDeployAdapter{store: sqlite}, + incidentStore, + incidentCfg, + m, + slog.Default(), + ) + if err := incidentEngine.Bootstrap(context.Background()); err != nil { + slog.Error("incident engine bootstrap failed", "err", err) + os.Exit(1) + } + if config.GetenvBool("WAYLOG_REBUILD_INCIDENTS_ON_START", false) { + rebuildMaxEvents := config.GetenvInt("WAYLOG_INCIDENT_REBUILD_MAX_EVENTS", 250000) + if rebuildMaxEvents <= 0 { + rebuildMaxEvents = 250000 } - if config.GetenvBool("WAYLOG_REBUILD_INCIDENTS_ON_START", false) { - rebuildMaxEvents := config.GetenvInt("WAYLOG_INCIDENT_REBUILD_MAX_EVENTS", 250000) - if rebuildMaxEvents <= 0 { - rebuildMaxEvents = 250000 - } - replayWindow := graphHotWindow - if minWindow := 2 * incidentCfg.Window; minWindow > replayWindow { - replayWindow = minWindow - } - replaySince := time.Now().UTC().Add(-replayWindow) - seed := incidentEngine.SnapshotActive() - for _, inc := range seed { - if inc.StartedAt.Before(replaySince) { - slog.Info("incident continuity broken: started_at older than WAL retention", - "incident_id", inc.IncidentID, - "started_at", inc.StartedAt, - "replay_since", replaySince, - ) - break - } - } - tempIndex := ingestv2.NewRecentIndex(nil) - tempDedup := ingestv2.NewDedup(dedupCapacity, nil) - tempProjector := ingestv2.NewProjector(tempIndex) - replay, err := ingestv2.ReplayWAL(eventLogV2Dir, tempDedup, 
tempProjector, replaySince, m) - if err != nil { - m.IncidentRebuildFailures.Inc() - slog.Error("incident rebuild WAL replay failed", "err", err) - os.Exit(1) - } - m.IncidentRebuildReplayed.Add(float64(replay.Projected)) - if replay.Projected > rebuildMaxEvents { - m.IncidentRebuildFailures.Inc() - slog.Error("incident rebuild replay exceeded max events", "projected", replay.Projected, "max_events", rebuildMaxEvents) - os.Exit(1) - } - if replay.Projected == 0 { - // Empty WAL replay while rebuild was explicitly requested. - // Transition only the seed rows whose StartedAt precedes - // replaySince — those are stale beyond the hot window and - // their continuing "active" status is no longer evidence- - // backed. Non-stale active rows in the same seed are left - // untouched and will be re-evaluated by the next live tick. - staleTransitioned := 0 - if len(seed) > 0 { - incidentStoreRef := incidentStore - now := time.Now().UTC() - for _, inc := range seed { - if inc.Status != incidents.StatusActive { - continue - } - if !inc.StartedAt.Before(replaySince) { - continue - } - row := inc - row.Status = incidents.StatusRecovering - t := now - row.RecoveringAt = &t - row.UpdatedAt = now - if err := incidentStoreRef.Upsert(context.Background(), row); err != nil { - slog.Warn("stale-active rebuild transition failed", - "incident_id", row.IncidentID, "err", err) - continue - } - staleTransitioned++ - } - if staleTransitioned > 0 { - if err := incidentEngine.Bootstrap(context.Background()); err != nil { - slog.Error("incident engine re-bootstrap after stale transition failed", "err", err) - os.Exit(1) - } - slog.Info("incidents rebuild: stale active rows transitioned to recovering", - "transitioned", staleTransitioned, - "replay_since", replaySince) - } else { - slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is") - } - } - } else { - result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{ - Engine: 
incidentEngine, - Reader: incidentReaderAdapter{reader: ingestv2.NewReader(tempIndex)}, - Now: time.Now, - }) - if err != nil { - m.IncidentRebuildFailures.Inc() - slog.Error("incident rebuild failed", "err", err) - os.Exit(1) - } - m.IncidentRebuildDuration.Observe(result.Duration.Seconds()) - m.IncidentRebuildRows.Add(float64(result.RowsReplaced)) - slog.Info("incident rebuild complete", - "replayed_events", replay.Projected, - "rows_replaced", result.RowsReplaced, - "duration", result.Duration, + replayWindow := graphHotWindow + // 4× window: the spike baseline is the median of the 3 prior + // windows, so rebuild needs current + 3 baselines of history. + if minWindow := 4 * incidentCfg.Window; minWindow > replayWindow { + replayWindow = minWindow + } + replaySince := time.Now().UTC().Add(-replayWindow) + seed := incidentEngine.SnapshotActive() + for _, inc := range seed { + if inc.StartedAt.Before(replaySince) { + slog.Info("incident continuity broken: started_at older than WAL retention", + "incident_id", inc.IncidentID, + "started_at", inc.StartedAt, + "replay_since", replaySince, ) + break } } - incidentHandler := incidents.NewHandler(incidentEngine) - mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active)) - mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident)) - ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine}) - - // Triage engine: deterministic TriageReport build for a given - // incident. Reuses the same v2Reader-backed adapter for blast - // queries, the live graph + trace store for first-failure - // stories, and the configured signal store. Read-scope auth. 
- triageEng, err := triage.NewEngine(triage.Deps{ - Incidents: triage.NewIncidentLookupAdapter(incidentEngine), - Blast: triage.NewBlastQueryAdapter(incReader), - Story: triage.NewStoryBuilderAdapter( - incidentEngine, - func(traceID string) (apiv2.StoryResponse, bool) { - return v2Reader.TraceStoryByTraceID(traceID) - }, - ), - Signals: triage.NewSignalQueryAdapter(signalStore), - Alerts: triage.NewAlertQueryAdapter(signalStore, alertMatchWindow), - NextChecks: triage.NewNextChecksAdapter(), - }) + tempIndex := ingestv2.NewRecentIndex(nil) + tempDedup := ingestv2.NewDedup(dedupCapacity, nil) + tempProjector := ingestv2.NewProjector(tempIndex) + replay, err := ingestv2.ReplayWAL(eventLogV2Dir, tempDedup, tempProjector, replaySince, m) if err != nil { - slog.Error("triage engine init failed", "err", err) + m.IncidentRebuildFailures.Inc() + slog.Error("incident rebuild WAL replay failed", "err", err) os.Exit(1) } - if err := tools.RegisterTriageTool(reg, triageEng); err != nil { - slog.Error("triage tool register failed", "err", err) + m.IncidentRebuildReplayed.Add(float64(replay.Projected)) + if replay.Projected > rebuildMaxEvents { + m.IncidentRebuildFailures.Inc() + slog.Error("incident rebuild replay exceeded max events", "projected", replay.Projected, "max_events", rebuildMaxEvents) os.Exit(1) } - if err := tools.RegisterTriageReportTool(reg, triageEng); err != nil { - slog.Error("triage report tool register failed", "err", err) - os.Exit(1) + if replay.Projected == 0 { + // Empty WAL replay while rebuild was explicitly requested. + // Transition only the seed rows whose StartedAt precedes + // replaySince — those are stale beyond the hot window and + // their continuing "active" status is no longer evidence- + // backed. Non-stale active rows in the same seed are left + // untouched and will be re-evaluated by the next live tick. 
+ staleTransitioned := 0 + if len(seed) > 0 { + incidentStoreRef := incidentStore + now := time.Now().UTC() + for _, inc := range seed { + if inc.Status != incidents.StatusActive { + continue + } + if !inc.StartedAt.Before(replaySince) { + continue + } + row := inc + row.Status = incidents.StatusRecovering + t := now + row.RecoveringAt = &t + row.UpdatedAt = now + if err := incidentStoreRef.Upsert(context.Background(), row); err != nil { + slog.Warn("stale-active rebuild transition failed", + "incident_id", row.IncidentID, "err", err) + continue + } + staleTransitioned++ + } + if staleTransitioned > 0 { + if err := incidentEngine.Bootstrap(context.Background()); err != nil { + slog.Error("incident engine re-bootstrap after stale transition failed", "err", err) + os.Exit(1) + } + slog.Info("incidents rebuild: stale active rows transitioned to recovering", + "transitioned", staleTransitioned, + "replay_since", replaySince) + } else { + slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is") + } + } + } else { + result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{ + Engine: incidentEngine, + Reader: incidentReaderAdapter{reader: ingestv2.NewReader(tempIndex)}, + Now: time.Now, + }) + if err != nil { + m.IncidentRebuildFailures.Inc() + slog.Error("incident rebuild failed", "err", err) + os.Exit(1) + } + m.IncidentRebuildDuration.Observe(result.Duration.Seconds()) + m.IncidentRebuildRows.Add(float64(result.RowsReplaced)) + slog.Info("incident rebuild complete", + "replayed_events", replay.Projected, + "rows_replaced", result.RowsReplaced, + "duration", result.Duration, + ) } - triageHandler := triagehttp.NewHandler(triageEng) - mux.Handle("/v1/triage/", readCORS(triageHandler.Triage)) - - incidentRunning = true - slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window) - } else { - slog.Warn("incidents requested but SQLite not configured; running without 
incidents") } + incidentHandler := incidents.NewHandler(incidentEngine) + mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active)) + mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident)) + ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine}) + + // Triage engine: deterministic TriageReport build for a given + // incident. Wires the incidents engine for lookups, the v2 + // reader for blast queries and TraceStoryByTraceID, and the + // signal store + alert adapter for context. Read-scope auth. + triageEng, err := triage.NewEngine(triage.Deps{ + Incidents: triage.NewIncidentLookupAdapter(incidentEngine), + Blast: triage.NewBlastQueryAdapter(incReader), + Story: triage.NewStoryBuilderAdapter( + incidentEngine, + func(traceID string) (apiv2.StoryResponse, bool) { + return v2Reader.TraceStoryByTraceID(traceID) + }, + ), + Signals: triage.NewSignalQueryAdapter(signalStore), + Alerts: triage.NewAlertQueryAdapter(signalStore, alertMatchWindow), + NextChecks: triage.NewNextChecksAdapter(), + }) + if err != nil { + slog.Error("triage engine init failed", "err", err) + os.Exit(1) + } + if err := tools.RegisterTriageTool(reg, triageEng); err != nil { + slog.Error("triage tool register failed", "err", err) + os.Exit(1) + } + if err := tools.RegisterTriageReportTool(reg, triageEng); err != nil { + slog.Error("triage report tool register failed", "err", err) + os.Exit(1) + } + triageHandler := triagehttp.NewHandler(triageEng) + mux.Handle("/v1/triage/", readCORS(triageHandler.Triage)) + + incidentRunning = true + slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window) + } else { + slog.Warn("incidents requested but SQLite not configured; running without incidents") } - } else { - mux.Handle("/v1/traces/story", readCORS(ingestServer.TraceStory)) - mux.Handle("/v1/blast_radius", readCORS(ingestServer.BlastRadius)) - mux.Handle("/v1/traces/recent", readCORS(ingestServer.RecentTraces)) - 
mux.Handle("/v1/events/search", readCORS(ingestServer.EventSearch)) } - mux.Handle("/v1/overview/timeseries", readCORS(ingestServer.OverviewTimeseries)) - mux.Handle("/v1/routes", readCORS(ingestServer.Routes)) mux.Handle("/v1/capabilities", readCORS(ingestServer.Capabilities)) - mux.Handle("/v1/topology", readCORS(ingestServer.Topology)) - mux.Handle("/v1/stream/dashboard", readCORS(ingestServer.SSEStream)) mux.Handle("/v1/insight", readCORS(ingestServer.Insight)) alertHandler := alerts.NewHandler(signalStore, incidentEngine, v2Reader, alertMatchWindow) mux.Handle("/v1/alerts", writeAuth(http.HandlerFunc(alertHandler.Alerts))) @@ -648,11 +560,6 @@ func main() { mux.Handle("/v1/plans/execute", agentCORS("POST, OPTIONS", ingestServer.PlanExecute)) mux.Handle("/v1/stream/plans/", agentCORS("GET, OPTIONS", ingestServer.PlanStream)) - // Graph topology (feature-gated). - if graphUI { - mux.Handle("/v1/graph/topology", readCORS(ingestServer.GraphTopology)) - } - // Dashboard. mux.Handle("/ui/", dashGate(http.StripPrefix("/ui/", dashboard.Handler()))) mux.HandleFunc("/ui", func(w http.ResponseWriter, r *http.Request) { @@ -682,6 +589,10 @@ func main() { } if incidentRunning { go incidentEngine.Run(ctx) + if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok { + incidentRetention := config.GetenvDuration("WAYLOG_INCIDENT_RETENTION", 168*time.Hour) + go incidents.RunRetention(ctx, coldstore.NewIncidentStore(sqlite), incidentRetention, 5*time.Minute, m, slog.Default()) + } } go func() { @@ -712,7 +623,7 @@ func main() { if mcpStdio { go func() { slog.Info("MCP stdio ready", "protocol", "2024-11-05") - if err := stdio.Serve(ctx, os.Stdin, os.Stdout, reg, graphStore, stdio.ServerInfo{ + if err := stdio.Serve(ctx, os.Stdin, os.Stdout, reg, stdio.ServerInfo{ Name: "waylog", Version: "0.1.0", }); err != nil && err != context.Canceled { @@ -723,85 +634,21 @@ func main() { go replLoop() } - // ---------------- Periodic snapshotter ---------------- - - ticker := 
time.NewTicker(time.Duration(snapshotEvery) * time.Second) - defer ticker.Stop() - snapshotCount := 0 + // ---------------- Periodic v2-index pruning ---------------- + pruneTicker := time.NewTicker(time.Duration(tickEvery) * time.Second) + defer pruneTicker.Stop() go func() { for { select { case <-ctx.Done(): return - case <-ticker.C: - snapshotCount++ - - // Enforce retention: prune nodes older than the retention window. + case <-pruneTicker.C: cutoff := time.Now().Add(-graphHotWindow) - graphStore.PruneOlderThan(cutoff) v2Pruned := v2Index.PruneOlderThan(cutoff) if v2Pruned.Events > 0 { m.V2IndexPruned.Add(float64(v2Pruned.Events)) } - deletedTraces, _ := traceStore.PruneOlderThan(cutoff) - m.GraphPrunedTotal.Inc() - if deletedTraces > 0 { - m.TraceStorePruned.Add(float64(deletedTraces)) - } - - g := graphStore.Snapshot() - - m.GraphNodes.Set(float64(len(g.Nodes))) - m.GraphEdges.Set(float64(len(g.Edges))) - m.TraceStoreRecords.Set(float64(traceStore.Count())) - m.TraceStoreSpans.Set(float64(traceStore.SpanCount())) - m.TraceStoreCohorts.Set(float64(traceStore.CohortCount())) - - if len(g.Nodes) == 0 { - if snapshotLogEvery > 0 && snapshotCount%snapshotLogEvery == 0 { - slog.Debug("snapshot skipped, graph empty") - } - continue - } - - if err := persist.Save(snapshotPath, g); err != nil { - slog.Error("snapshot save failed", "err", err, "path", snapshotPath) - m.SnapshotLastError.Set(float64(time.Now().Unix())) - } else { - m.SnapshotLastSuccess.Set(float64(time.Now().Unix())) - if snapshotLogEvery > 0 && snapshotCount%snapshotLogEvery == 0 { - slog.Info("snapshot saved", - "nodes", len(g.Nodes), - "edges", len(g.Edges), - "path", snapshotPath, - ) - } - } - } - } - }() - - // ---------------- SSE recompute ticker ---------------- - - go func() { - sseTicker := time.NewTicker(1 * time.Second) - defer sseTicker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-sseTicker.C: - dirty := sseHub.DrainDirty() - if len(dirty) == 0 { - continue - } - for 
_, topic := range dirty { - data := ingestServer.ComputeSSETopic(topic) - if data != nil { - sseHub.Publish(topic, data) - } - } } } }() @@ -840,108 +687,6 @@ func main() { // ---------------- Anomaly detection ticker ---------------- - detectCfg := detect.ParseConfig() - if detectCfg.Enabled && !incidentRunning { - var deploySrc detect.DeploySource - if coldDB != nil { - deploySrc = coldDB - } - detector := detect.NewDetector(detectCfg, graphStore, traceStore, deploySrc) - ingestServer.SetDetector(detector) - go detector.Run(ctx) - } else if incidentRunning { - slog.Info("legacy anomaly detector disabled because v2.1 incident engine is running") - } - - // ---------------- Causal inference ticker ---------------- - - if causalEnabled && coldDB != nil { - ingestServer.SetCausalEnabled() - go func() { - causalTicker := time.NewTicker(causalInterval) - defer causalTicker.Stop() - slog.Info("causal engine started", "interval", causalInterval) - for { - select { - case <-ctx.Done(): - return - case <-causalTicker.C: - func() { - defer func() { - if r := recover(); r != nil { - slog.Error("causal inference panicked", "recover", r) - m.CausalRunFailures.Inc() - ingestServer.SetCausalRunResult(fmt.Errorf("panic: %v", r)) - } - }() - - tickCtx, cancel := context.WithTimeout(ctx, 5*time.Second) - defer cancel() - - now := time.Now().UTC() - m.CausalRunsTotal.Inc() - - // Query deployments first — cheap; skip snapshot if none. 
- window := 1 * time.Hour - deps, err := coldDB.DeploymentsInWindow(tickCtx, now.Add(-window), now, "") - if err != nil { - slog.Warn("causal: deployment query failed", "err", err) - m.CausalRunFailures.Inc() - ingestServer.SetCausalRunResult(err) - return - } - if len(deps) == 0 { - ingestServer.SetCausalRunResult(nil) - return - } - - snap := graphStore.Snapshot() - if len(snap.Nodes) == 0 { - ingestServer.SetCausalRunResult(nil) - return - } - - // Convert coldstore.Deployment → causal.DeploymentInfo - infos := make([]causal.DeploymentInfo, len(deps)) - for i, d := range deps { - infos[i] = causal.DeploymentInfo{ID: d.ID, Service: d.Service, FirstSeen: d.FirstSeen} - } - - claims := causal.InferIntroducedBy(snap, infos, now.Add(-window), now) - - if len(claims) > 0 { - if err := coldDB.SaveClaims(tickCtx, claims); err != nil { - slog.Warn("causal: save claims failed", "err", err) - m.CausalRunFailures.Inc() - ingestServer.SetCausalRunResult(err) - return - } - for _, c := range claims { - slog.Info("causal claim (shadow)", - "type", c.ClaimType, - "subject", c.Subject, - "target", c.Target, - "service", c.Service, - "confidence", c.Confidence, - "tier", c.Tier, - ) - m.CausalClaimsTotal.With(prometheus.Labels{ - "type": string(c.ClaimType), - "tier": string(c.Tier), - }).Inc() - } - } - - m.CausalRunDuration.Observe(time.Since(now).Seconds()) - ingestServer.SetCausalRunResult(nil) - }() - } - } - }() - } else if causalEnabled && coldDB == nil { - slog.Warn("CAUSAL_ENABLED=true but SQLITE_PATH not set — causal engine disabled") - } - // ---------------- Shutdown ---------------- <-ctx.Done() @@ -976,19 +721,6 @@ func main() { coldWriter.Stop() slog.Info("coldstore writer drained") } - - // Final snapshot on shutdown - g := graphStore.Snapshot() - if len(g.Nodes) == 0 { - slog.Info("final snapshot skipped, graph empty") - } else if err := persist.Save(snapshotPath, g); err != nil { - slog.Error("final snapshot save failed", "err", err) - } else { - 
slog.Info("final snapshot saved", - "nodes", len(g.Nodes), - "edges", len(g.Edges), - ) - } } func replLoop() { @@ -1023,13 +755,6 @@ func replLoop() { } } -func defaultEventLogV2Dir(eventLogDir string) string { - if eventLogDir != "" { - return filepath.Join(eventLogDir, "v2") - } - return "./data/eventlog-v2" -} - func printHelp() { os.Stdout.WriteString("\033[1m\033[36mcommands:\033[0m\n") os.Stdout.WriteString(" waylog \"\"\n") @@ -1084,6 +809,18 @@ func (a incidentReaderAdapter) SearchEvents(f incidents.SearchFilter, limit int) return res.Events } +func (a incidentReaderAdapter) TraceStoryByTraceID(traceID string) (apiv2.StoryResponse, bool) { + return a.reader.TraceStoryByTraceID(traceID) +} + +func (a incidentReaderAdapter) TraceEvents(traceID string) ([]*eventv2.Event, bool) { + result, ok := a.reader.GetTrace(traceID) + if !ok { + return nil, false + } + return result.Events, true +} + func toV2SearchFilter(f incidents.SearchFilter) ingestv2.SearchFilter { return ingestv2.SearchFilter{ Service: f.Service, diff --git a/cmd/waylog-live/main.go b/cmd/waylog-live/main.go deleted file mode 100644 index cfecea7..0000000 --- a/cmd/waylog-live/main.go +++ /dev/null @@ -1,42 +0,0 @@ -package main - -import ( - "context" - "flag" - "fmt" - "os" - "time" - - tea "github.com/charmbracelet/bubbletea" - "github.com/sssmaran/WaylogCLI/internal/tui" -) - -func main() { - addr := flag.String("addr", "http://localhost:8080", "ingest server URL") - interval := flag.Duration("interval", 2*time.Second, "poll interval") - dev := flag.Bool("dev", false, "use SSE stream for live updates instead of polling") - flag.Parse() - if *interval <= 0 { - *interval = 2 * time.Second - } - - client := tui.NewAPIClient(*addr) - model := tui.NewModel(client, *interval) - - if *dev { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ch, err := client.StartDashboardStream(ctx) - if err != nil { - fmt.Fprintf(os.Stderr, "dev stream unavailable (%v) — falling back to 
polling\n", err) - } else { - model = model.WithStream(ch) - } - } - - p := tea.NewProgram(model, tea.WithAltScreen()) - if _, err := p.Run(); err != nil { - fmt.Fprintf(os.Stderr, "Error: %v\n", err) - os.Exit(1) - } -} diff --git a/docker-compose.yml b/docker-compose.yml index c332ad2..c35473a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,31 +73,6 @@ services: timeout: 3s retries: 10 - bridge: - profiles: ["legacy-kafka"] - build: - context: . - target: bridge - depends_on: - kafka: - condition: service_healthy - ingest: - condition: service_healthy - environment: - KAFKA_BROKERS: kafka:29092 - KAFKA_TOPIC: wide_events - INGEST_URL: http://ingest:8080/v1/events - WAYLOG_API_KEY: ${WAYLOG_API_KEY:-} - WAYLOG_WRITE_KEY: ${WAYLOG_WRITE_KEY:-} - WAYLOG_READ_KEY: ${WAYLOG_READ_KEY:-} - WAYLOG_AGENT_KEY: ${WAYLOG_AGENT_KEY:-} - DASHBOARD_AUTH: ${DASHBOARD_AUTH:-off} - DASHBOARD_SESSION_SECRET: ${DASHBOARD_SESSION_SECRET:-} - DLQ_DIR: /data/dlq - LOG_LEVEL: info - volumes: - - bridge-data:/data - api-gateway: build: context: . @@ -183,6 +158,5 @@ services: volumes: waylog-data: - bridge-data: prometheus-data: grafana-data: diff --git a/docs/env.md b/docs/env.md index 0ef8836..e029ef0 100644 --- a/docs/env.md +++ b/docs/env.md @@ -56,10 +56,13 @@ Scoped keys. See the Auth section of the [README](../README.md). | `IDLE_TIMEOUT` | `120s` | HTTP idle timeout | | `CORS_ORIGIN` | `*` | Allowed CORS origin for read APIs | | `ALERT_MATCH_WINDOW` | `15m` | Window for matching `/v1/alerts` to active incidents by `env + service + error_code`; capped at `24h` | +| `WAYLOG_RATE_LIMIT_WRITE_RPS` | `0` (`1000` in prod) | Per-key requests/second for write endpoints (`/v1/events`, `/v1/signals`, `/v1/alerts`, OTLP HTTP). `0` disables. 
Keyed by presented credential, falling back to client IP; throttled requests get `429` + `Retry-After: 1` | +| `WAYLOG_RATE_LIMIT_READ_RPS` | `0` (`200` in prod) | Per-key requests/second for read endpoints | +| `WAYLOG_RATE_LIMIT_AGENT_RPS` | `0` (`50` in prod) | Per-key requests/second for agent endpoints (`/v1/tools/*`, `/v1/ask`, `/v1/plans/*`) | ## CLI -The `waylog` CLI calls the running ingest server's v2 read APIs. The server runs with `WAYLOG_V2_READS=true` by default; only set it to `false` for legacy v1-only stacks. +The `waylog` CLI calls the running ingest server's v2 read APIs. | Variable | Default | Purpose | |---|---|---| @@ -71,7 +74,6 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server runs | Variable | Default | Purpose | |---|---|---| -| `SNAPSHOT_PATH` | `./data/graph_snapshot.json` | Graph snapshot location | | `SQLITE_PATH` | — | SQLite cold store path (optional; disabled if empty) | | `EVENT_LOG_DIR` | — | Append-only event log directory (disabled if empty) | | `EVENT_LOG_V2_DIR` | `${EVENT_LOG_DIR}/v2` or `./data/eventlog-v2` | Raw schema-2.0 WAL directory for `/v1/events` | @@ -79,19 +81,20 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server runs | `EVENT_LOG_MAX_FILE_MB` | `50` | Rotation size. `0` disables rotation | | `EVENT_LOG_RETENTION` | `72h` | Event log retention. Must be positive | | `WAYLOG_SIGNAL_RETENTION` | `72h` | Production-context signal retention. Must be positive. 
`/v1/signals` requires `SQLITE_PATH` | -| `WAYLOG_INCIDENTS_ENABLED` | `true` | Enable the v2.1 incident engine when `SQLITE_PATH` is set and `WAYLOG_V2_READS=true` | +| `WAYLOG_INCIDENTS_ENABLED` | `true` | Enable the v2.1 incident engine when `SQLITE_PATH` is set | | `WAYLOG_INCIDENT_TICK_INTERVAL` | `30s` | Incident engine evaluation interval | | `WAYLOG_INCIDENT_WINDOW` | `10m` | Current error-family spike window | | `WAYLOG_INCIDENT_MIN_COUNT` | `5` | Minimum current-window failures needed to open an incident | -| `WAYLOG_INCIDENT_MIN_LIFT` | `3.0` | Minimum current-vs-baseline lift when the family already exists in the baseline window | +| `WAYLOG_INCIDENT_MIN_LIFT` | `3.0` | Minimum current-vs-baseline lift when the family already exists in the baseline. Baseline is the per-family median of the 3 prior windows (see `docs/internals.md`) | +| `WAYLOG_INCIDENT_MIN_RATE` | `0` | Low-traffic guard in errors/minute: a family must sustain `rate × window-minutes` failures in the current window to open an incident. `0` disables | | `WAYLOG_INCIDENT_RESOLVE_AFTER` | `2m` | Time without renewed matching failures before a recovering incident resolves | | `WAYLOG_DEPLOY_CORRELATION_WINDOW` | `15m` | Window used to attach deploy signals and deployment records as incident evidence | | `WAYLOG_INCIDENT_SAMPLE_LIMIT` | `5` | Maximum persisted sample traces per incident | +| `WAYLOG_INCIDENT_RETENTION` | `168h` | How long resolved incidents are kept before the retention janitor deletes them (checked every 5m). Active/recovering incidents are never pruned. 
`0` disables | | `WAYLOG_REBUILD_INCIDENTS_ON_START` | `false` | Rebuild non-resolved incident rows at startup from the schema-2.0 WAL hot window plus signals | | `WAYLOG_INCIDENT_REBUILD_MAX_EVENTS` | `250000` | Safety cap for startup incident rebuild replay | | `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity | -| `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window | -| `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick | +| `GRAPH_HOT_WINDOW` | `24h` | Hot window the v2 reader keeps in memory; also caps the maximum window accepted by read endpoints. Older entries are pruned every tick | See [Internals](internals.md) for the full durability model. @@ -106,10 +109,6 @@ See [Internals](internals.md) for the full durability model. | Variable | Default | Purpose | |---|---|---| -| `GRAPH_UI` | `false` | Enable optional graph topology endpoint `/v1/graph/topology` | -| `WAYLOG_V2_READS` | `true` | Route v2 read endpoints to the schema-2.0 recent index. Set `false` only for legacy v1-only stacks | -| `CAUSAL_ENABLED` | `false` | Enable shadow-mode causal inference | -| `CAUSAL_INTERVAL` | `30s` | Causal inference ticker interval | | `HAPPY_SAMPLE_RATE_PCT` | `2` | Success-event sampling rate. Set `100` in dev profiles | | `MCP_STDIO` | — | Set to `1` to run MCP stdio server instead of REPL | @@ -124,9 +123,8 @@ See [Internals](internals.md) for the full durability model. 
| `WAYLOG_WRITE_KEY` | `demo` | Write-scope key used by the demo SDK emitters | | `WAYLOG_READ_KEY` | `demo` | Read-scope key used by the printed CLI commands | | `DASHBOARD_AUTH` | `off` in `make demo`; `key:demo` in `make micro-demo` | Dashboard auth mode for the local demo surface | -| `WAYLOG_V2_READS` | `true` in demo scripts | Enables v2 read APIs required by `waylog errors/explain/blast` | -The embedded `/ui` dashboard is a schema-2.0 triage surface and renders a setup message unless `WAYLOG_V2_READS=true`. +The embedded `/ui` dashboard is a schema-2.0 triage surface served from the v2 reader. ## Dashboard links diff --git a/docs/internals.md b/docs/internals.md index b8c318d..eb8862e 100644 --- a/docs/internals.md +++ b/docs/internals.md @@ -5,65 +5,70 @@ Mechanics behind the ingest server. If you're adopting Waylog for a service and ## Data flow 1. **SDK / collector** emits a schema-2.0 WideEvent over HTTP, or sends OTLP/HTTP traces that are converted to schema-2.0 events. -2. **Ingest** validates the event, writes it to the schema-2.0 WAL, and — only if the WAL write succeeds — projects it into recent read models. -3. **Derived read models** index events by event, trace, service, error family, and downstream call for `recent`, `errors`, `explain`, and `blast`. -4. **Cold store** (SQLite, optional) persists events, deployments, and causal claims for historical queries. -5. **Snapshot** writes the graph to disk every tick (default 5s) so restarts replay only the tail. -6. **Read path** serves the dashboard, CLI, MCP, and agent APIs from the same derived data. +2. **Ingest** validates the event, writes it to the schema-2.0 WAL, and — only if the WAL write succeeds — projects it into the v2 reader's recent index and forwards it to the cold-store batch writer. +3. **v2 reader** (`internal/ingest/v2`) indexes events by event, trace, service, error family, and downstream call for `errors`, `recent`, `explain`, `blast`, and `events/search` endpoints. +4. 
**Cold store** (SQLite, optional) persists events, deployments, signals, and incident rows for historical queries and incident-engine rebuild. +5. **Incident engine** (`internal/incidents`) runs every `WAYLOG_INCIDENT_TICK_INTERVAL` against the v2 reader + signal store + deployment store, opens/updates/recovers/resolves incidents, and attaches propagation + blast evidence snapshots. +6. **Read path** serves the dashboard, CLI, MCP, and agent APIs from the v2 reader and the incident store. ## Durability model -The event log is the **source of truth**. The in-memory graph and schema-2.0 read indexes are derived, queryable views that can be rebuilt from the log. +The event log is the **source of truth**. The v2 reader's in-memory index is a derived, queryable view that can be rebuilt from the log. ### Write path -Every event must be durably logged before it enters the graph. If the event log write fails, the handler returns 503 and the event is not merged. The client is expected to retry. +Every event must be durably logged before it enters the recent index. If the event log write fails, the handler returns 503 and the event is not projected. The client is expected to retry. ### Durability modes - **Sync (default, `EVENT_LOG_SYNC=true`)** — each write `fsync`s to disk. Survives process crash, host crash, and power loss. ~200–1000 events/sec depending on the disk. - **Buffered (`EVENT_LOG_SYNC=false`)** — writes go to the OS page cache without per-write fsync. Survives process crash only. Higher throughput, suitable for dev or load testing. -### Snapshot persistence - -The graph is snapshotted to disk every 5s by default. The persist layer uses **atomic write**: write a temp file, `fsync`, move the previous good snapshot to `.bak`, then rename the temp file into place. A crash during save never corrupts the current snapshot; the `.bak` is the previous good state. 
- ### Startup replay -On boot, the server loads the latest snapshot and then replays event log entries newer than the snapshot to rebuild the graph. If replay fails, the server starts with whatever data the snapshot had and becomes ready in **degraded mode**. New events ingest correctly; historical reads (story, overview, recent) may return partial results until traffic rebuilds the graph. `/healthz` reports `replay.status: "failed"` so operators can see it. +On boot, the server replays event-log entries newer than `time.Now() - GRAPH_HOT_WINDOW` (default 24h) into the v2 reader. If replay fails, the server still becomes ready in **degraded mode**. New events ingest correctly; historical reads may return partial results until traffic rebuilds the index. `/healthz` reports `replay.status: "failed"` so operators can see it. ### Readiness policy `/readyz` gates on ingest availability, not replay completeness. Fail-open: the server becomes ready as soon as it can accept events. Inspect `/healthz` for degraded state. -## Graph retention +## Hot-window retention -The graph is pruned every snapshot tick to enforce `GRAPH_RETENTION` (default 24h). Nodes whose `LastSeen` is older than the retention window are dropped along with their edges. `PruneOlderThan` rebuilds all derived indexes (edge set, trace maps, request facts, counters) and then snapshots the pruned graph. +The v2 reader's in-memory index is pruned every tick to enforce `GRAPH_HOT_WINDOW` (default 24h). Entries older than the window are dropped from the recent index. Cold storage (SQLite) retains the full history bounded by `EVENT_LOG_RETENTION`. -Retention bounds memory growth. Production deployments should tune this to match their incident-response window — you rarely need more than 24 hours of hot graph data. +Retention bounds memory growth. Production deployments should tune this to match their incident-response window — you rarely need more than 24 hours of hot data in memory. 
-## Graph merge semantics +For the single-node throughput, memory, and storage ceiling as a whole — and how to tune within it or scale past it — see [`scale-and-limits.md`](scale-and-limits.md). -Events arrive out-of-order and across services. The merge rules for a request node: +## Spike detection baseline -- **`success`** is AND-reduced across hops. A request is successful only if every span is. -- **Root span** overwrites the request summary (flow, user, latency) and sets `root_service`. -- **`error_codes`** accumulate and are deduped. -- **`RequestFacts`** (topology view) is always updated on merge; counter recompute is gated by `factsEqual` on `Services`, `Errors`, and `Flags`. +The incident engine opens an incident when an error family's current-window +count clears `WAYLOG_INCIDENT_MIN_COUNT` and its lift over baseline clears +`WAYLOG_INCIDENT_MIN_LIFT`. Two design choices keep the detector deterministic +and explainable (no learned models): -Edge rules: - -- Edges are created only when both endpoints exist. Never create an edge pointing at a non-existent node. -- Error nodes are created only when `ev.Error != nil`. No empty error nodes from successful events. +- **Baseline = per-family median of the 3 prior windows** (`[now-2W, now-W]`, + `[now-3W, now-2W]`, `[now-4W, now-3W]`, where `W` is + `WAYLOG_INCIDENT_WINDOW`). A family absent from a window counts as 0. The + median means one anomalous prior window can neither suppress a real spike + (a prior burst inflating the baseline) nor fabricate lift (a single quiet + window deflating it). A family that is new or mostly-quiet has median 0 and + is treated as a fresh spike (lift = current count). All four windows are + served by the v2 reader and must fit inside `GRAPH_HOT_WINDOW` — the + startup rebuild replay window is sized to `4 × WAYLOG_INCIDENT_WINDOW` for + the same reason. +- **Low-traffic guard** (`WAYLOG_INCIDENT_MIN_RATE`, errors/minute, default + `0` = disabled). 
On low-traffic services a handful of failures can clear + `MIN_COUNT` while representing trivially small absolute volume; when set, + a family must also sustain `MIN_RATE × window-minutes` failures in the + current window to open an incident. ## Service attribution -Request nodes carry two kinds of service information: - -- **`root_service`** (canonical owner) — set by `mergeRequestAttrs` when the root span merges. Used by `/v1/routes` for ownership metrics. One canonical service per request, no fan-out inflation. -- **`Services []string`** in `RequestFacts` — populated from `handled_by` edges (every service that touched the request). Used by topology analysis tools (`blast_radius`, `failure_patterns`, etc.) where fan-out semantics are correct. +The v2 reader carries per-request service info inferred from span fan-out: -If the root span hasn't arrived yet, `/v1/routes` falls back to deriving service from the `event_name` prefix (`"api-gateway.request"` → `"api-gateway"`). Once the root merges, `root_service` takes precedence. +- **`root_service`** (canonical owner) — the originating service for the trace, used for ownership metrics. One canonical service per request, no fan-out inflation. +- **`services`** (set) — every service that touched the request, used by topology-aware tools (`blast_radius`) where fan-out semantics are correct. ## Sampling @@ -75,7 +80,7 @@ Sampling is hash-based on `trace_id` (FNV), so a given trace is either fully sam ## Counter buffer -A 120-minute ring buffer keeps per-minute counts for fast windowed queries (`graph_insights`, `/v1/overview`). For windows larger than 120 minutes, `Sum()` returns 0 and callers fall back to the hot graph itself. This bounds memory while keeping short-window reads O(1). +A 120-minute ring buffer keeps per-minute counts for fast windowed error-rate queries. For windows larger than 120 minutes, `Sum()` returns 0 and callers fall back to the v2 reader's index. 
This bounds memory while keeping short-window reads O(1). ## Event log rotation @@ -83,7 +88,7 @@ Size-based rotation on `EVENT_LOG_MAX_FILE_MB`. When two rotations happen in the ## Metrics -Custom `prometheus.Registry` per server — no global. All metric calls are guarded by `if s.metrics != nil` so tests can run without wiring a registry. 22+ collectors under the `waylog_*` prefix. Scraped at `/metrics`. +Custom `prometheus.Registry` per server — no global. All metric calls are guarded by `if s.metrics != nil` so tests can run without wiring a registry. Scraped at `/metrics` under the `waylog_*` prefix. ## SDK contract diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 6cc08dd..e76c5b0 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -710,100 +710,6 @@ paths: '405': description: Method Not Allowed - /v1/overview: - get: - tags: [Operational] - operationId: getOperationalOverview - summary: Operational overview - description: Secondary graph-derived overview endpoint retained for operational tooling. - security: - - ApiKeyHeader: [] - - BearerAuth: [] - parameters: - - $ref: '#/components/parameters/Window' - - $ref: '#/components/parameters/Limit' - responses: - '200': - description: Overview object - content: - application/json: - schema: - type: object - additionalProperties: true - '503': - description: Graph store unavailable - - /v1/overview/timeseries: - get: - tags: [Operational] - operationId: getOperationalTimeseries - summary: Operational overview timeseries - description: Secondary graph-derived timeseries endpoint retained for operational tooling. 
- security: - - ApiKeyHeader: [] - - BearerAuth: [] - parameters: - - $ref: '#/components/parameters/Window' - - in: query - name: step - schema: - type: string - default: 5m - responses: - '200': - description: Timeseries buckets - content: - application/json: - schema: - type: object - additionalProperties: true - '400': - $ref: '#/components/responses/ReadBadRequest' - - /v1/routes: - get: - tags: [Operational] - operationId: listOperationalRoutes - summary: Operational route statistics - description: Secondary graph-derived route statistics endpoint retained for operational tooling. - security: - - ApiKeyHeader: [] - - BearerAuth: [] - parameters: - - $ref: '#/components/parameters/Window' - - $ref: '#/components/parameters/Limit' - responses: - '200': - description: Route statistics - content: - application/json: - schema: - type: object - additionalProperties: true - - /v1/graph/topology: - get: - tags: [Operational] - operationId: getServiceTopology - summary: Optional service topology graph - description: Returns Cytoscape.js-formatted service topology when GRAPH_UI=1. 
- security: - - ApiKeyHeader: [] - - BearerAuth: [] - parameters: - - $ref: '#/components/parameters/Window' - responses: - '200': - description: Cytoscape topology graph - content: - application/json: - schema: - type: object - additionalProperties: true - '405': - description: Method Not Allowed - '503': - description: Graph store unavailable or GRAPH_UI disabled /v1/ask: post: diff --git a/examples/microdemo/burst.go b/examples/microdemo/burst.go index cf7ccc4..e24449c 100644 --- a/examples/microdemo/burst.go +++ b/examples/microdemo/burst.go @@ -37,19 +37,31 @@ type BurstSummary struct { SampleTraceIDs []string `json:"sample_trace_ids"` } -var scenarioWeights = []struct { +type scenarioWeight struct { Cutoff float64 Scenario string -}{ - {0.70, ScenarioHappy}, - {0.85, ScenarioPayment502}, - {0.93, ScenarioDBMiss}, - {0.98, ScenarioCheckoutError}, +} + +// scenarioWeights is the base cumulative-cutoff table for NON-seeded burst +// traffic. Each burst jitters a copy of it (see jitteredScenarioWeights) so +// repeated `make demo` runs look different; the deterministic incident seeds in +// pickBurstScenarioForIndex never consult this table. +var scenarioWeights = []scenarioWeight{ + {0.68, ScenarioHappy}, + {0.83, ScenarioPayment502}, + {0.90, ScenarioDBMiss}, + {0.95, ScenarioCheckoutError}, + {0.98, ScenarioInventory503}, + {0.995, ScenarioCheckoutPanic}, {1.00, ScenarioSuppressedPayment502}, } -func pickBurstScenarioFloat(x float64) string { - for _, weight := range scenarioWeights { +// burstWeightJitterPct bounds how far each scenario band's width may drift from +// the base table per burst. Kept small so every scenario stays well-represented. 
+const burstWeightJitterPct = 0.05 + +func pickBurstScenarioFloatFrom(x float64, weights []scenarioWeight) string { + for _, weight := range weights { if x < weight.Cutoff { return weight.Scenario } @@ -57,8 +69,32 @@ func pickBurstScenarioFloat(x float64) string { return ScenarioSuppressedPayment502 } -func pickBurstScenario() string { - return pickBurstScenarioFloat(rand.Float64()) +func pickBurstScenarioFloat(x float64) string { + return pickBurstScenarioFloatFrom(x, scenarioWeights) +} + +// jitteredScenarioWeights returns a per-burst copy of scenarioWeights with each +// band's width perturbed by up to ±burstWeightJitterPct, then renormalized so +// cutoffs stay strictly increasing and end exactly at 1.0. Scenario order is +// preserved, so every scenario remains reachable — only the proportions of +// non-seeded traffic shift between bursts. +func jitteredScenarioWeights() []scenarioWeight { + out := make([]scenarioWeight, len(scenarioWeights)) + widths := make([]float64, len(scenarioWeights)) + prev, total := 0.0, 0.0 + for i, w := range scenarioWeights { + jitter := 1 + (rand.Float64()*2-1)*burstWeightJitterPct + widths[i] = (w.Cutoff - prev) * jitter + prev = w.Cutoff + total += widths[i] + } + cum := 0.0 + for i := range scenarioWeights { + cum += widths[i] / total + out[i] = scenarioWeight{Cutoff: cum, Scenario: scenarioWeights[i].Scenario} + } + out[len(out)-1].Cutoff = 1.0 // pin the final cutoff against float drift + return out } func normalizeBurstRequest(raw BurstRequest) (requested, accepted BurstRequest) { @@ -89,11 +125,20 @@ func normalizeBurstRequest(raw BurstRequest) (requested, accepted BurstRequest) return requested, accepted } -func pickBurstScenarioForIndex(i, requests int) string { - if i < incidentSeedPaymentCount(requests) { +func pickBurstScenarioForIndex(i, requests int, weights []scenarioWeight) string { + seeds := incidentSeedPaymentCount(requests) + if i < seeds { return ScenarioPayment502 } - return pickBurstScenario() + // 
Deterministically seed exactly one checkout panic right after the payment + // seeds (within the PMT_502 timing window) so the acceptance gate always has + // app-runtime evidence; a weighted-only panic can be missed at low request + // counts. The seed branches above never consult weights, so per-burst jitter + // can never weaken the deterministic incident the acceptance gate depends on. + if i == seeds && requests > seeds { + return ScenarioCheckoutPanic + } + return pickBurstScenarioFloatFrom(rand.Float64(), weights) } func incidentSeedPaymentCount(requests int) int { @@ -119,6 +164,9 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs var wg sync.WaitGroup var mu sync.Mutex sampledScenarios := map[string]struct{}{} + // One jittered weight table per burst so each run's non-seeded traffic mix + // differs while the deterministic seeds stay fixed. + weights := jitteredScenarioWeights() for i := 0; i < accepted.Requests; i++ { if ctx.Err() != nil { @@ -128,7 +176,7 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs // concurrency instead of stacking up `requests` blocked goroutines. 
sem <- struct{}{} wg.Add(1) - scenario := pickBurstScenarioForIndex(i, accepted.Requests) + scenario := pickBurstScenarioForIndex(i, accepted.Requests, weights) go func(scenario string) { defer wg.Done() defer func() { <-sem }() diff --git a/examples/microdemo/burst_test.go b/examples/microdemo/burst_test.go index 8c49b87..8f2d951 100644 --- a/examples/microdemo/burst_test.go +++ b/examples/microdemo/burst_test.go @@ -16,14 +16,18 @@ func TestPickBurstScenarioFloatBoundaries(t *testing.T) { want string }{ {0.00, ScenarioHappy}, - {0.699, ScenarioHappy}, - {0.70, ScenarioPayment502}, - {0.849, ScenarioPayment502}, - {0.85, ScenarioDBMiss}, - {0.929, ScenarioDBMiss}, - {0.93, ScenarioCheckoutError}, - {0.979, ScenarioCheckoutError}, - {0.98, ScenarioSuppressedPayment502}, + {0.679, ScenarioHappy}, + {0.68, ScenarioPayment502}, + {0.829, ScenarioPayment502}, + {0.83, ScenarioDBMiss}, + {0.899, ScenarioDBMiss}, + {0.90, ScenarioCheckoutError}, + {0.949, ScenarioCheckoutError}, + {0.95, ScenarioInventory503}, + {0.979, ScenarioInventory503}, + {0.98, ScenarioCheckoutPanic}, + {0.994, ScenarioCheckoutPanic}, + {0.995, ScenarioSuppressedPayment502}, {0.999, ScenarioSuppressedPayment502}, {1.0, ScenarioSuppressedPayment502}, } @@ -36,7 +40,7 @@ func TestPickBurstScenarioFloatBoundaries(t *testing.T) { func TestPickBurstScenarioFloatAllScenariosReachable(t *testing.T) { seen := map[string]bool{} - for _, x := range []float64{0.1, 0.75, 0.88, 0.95, 0.99} { + for _, x := range []float64{0.1, 0.75, 0.86, 0.92, 0.96, 0.985, 0.997} { seen[pickBurstScenarioFloat(x)] = true } for _, scenario := range []string{ @@ -44,6 +48,8 @@ func TestPickBurstScenarioFloatAllScenariosReachable(t *testing.T) { ScenarioPayment502, ScenarioDBMiss, ScenarioCheckoutError, + ScenarioInventory503, + ScenarioCheckoutPanic, ScenarioSuppressedPayment502, } { if !seen[scenario] { @@ -93,9 +99,16 @@ func TestRunBurstDispatchesEveryRequestThroughHandler(t *testing.T) { } func 
TestPickBurstScenarioForIndexSeedsPaymentFailures(t *testing.T) { - for i := 0; i < incidentSeedPayments; i++ { - if got := pickBurstScenarioForIndex(i, 20); got != ScenarioPayment502 { - t.Fatalf("seed scenario[%d] = %q, want payment_502", i, got) + // Seeds must be deterministic regardless of the (jittered) weight table, so + // the acceptance gate always finds a complete PMT_502 incident + one panic. + for _, weights := range [][]scenarioWeight{scenarioWeights, jitteredScenarioWeights()} { + for i := 0; i < incidentSeedPayments; i++ { + if got := pickBurstScenarioForIndex(i, 20, weights); got != ScenarioPayment502 { + t.Fatalf("seed scenario[%d] = %q, want payment_502", i, got) + } + } + if got := pickBurstScenarioForIndex(incidentSeedPayments, 20, weights); got != ScenarioCheckoutPanic { + t.Fatalf("post-seed scenario = %q, want checkout_panic", got) } } if got := incidentSeedPaymentCount(3); got != 3 { @@ -103,6 +116,28 @@ func TestPickBurstScenarioForIndexSeedsPaymentFailures(t *testing.T) { } } +func TestJitteredScenarioWeightsStaysValid(t *testing.T) { + for run := 0; run < 200; run++ { + w := jitteredScenarioWeights() + if len(w) != len(scenarioWeights) { + t.Fatalf("len = %d, want %d", len(w), len(scenarioWeights)) + } + prev := 0.0 + for i, weight := range w { + if weight.Scenario != scenarioWeights[i].Scenario { + t.Fatalf("scenario[%d] = %q, want %q (order must be preserved)", i, weight.Scenario, scenarioWeights[i].Scenario) + } + if weight.Cutoff <= prev { + t.Fatalf("cutoff[%d] = %v not strictly greater than previous %v", i, weight.Cutoff, prev) + } + prev = weight.Cutoff + } + if last := w[len(w)-1].Cutoff; last != 1.0 { + t.Fatalf("final cutoff = %v, want exactly 1.0", last) + } + } +} + func TestServeBurstRejectsNonPOST(t *testing.T) { gateway := NewGatewayHandler("http://checkout.example") gateway.SetPurchaseHandler(okBurstDispatch()) diff --git a/examples/microdemo/checkout.go b/examples/microdemo/checkout.go index 91d5393..2cd9eca 100644 --- 
a/examples/microdemo/checkout.go +++ b/examples/microdemo/checkout.go @@ -40,6 +40,13 @@ func (h *CheckoutHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } setDemoFields(ctx, "checkout", reqBody) + if reqBody.Scenario == ScenarioCheckoutPanic { + // Real recoverable panic inside the instrumented request. The Waylog + // HTTP middleware recovers it -> emits a failed checkout WideEvent and, + // with runtime hooks on, posts a go-sdk "runtime" panic signal. + panic("checkout: simulated panic charging payment (demo)") + } + if reqBody.Scenario == ScenarioSuppressedPayment502 { h.serveSuppressedPayment(w, reqBody, ctx) return @@ -124,6 +131,9 @@ func (h *CheckoutHandler) loadCart(ctx context.Context, reqBody PurchaseRequest) func (h *CheckoutHandler) reserveInventory(ctx context.Context, reqBody PurchaseRequest) error { return waylogv2.StepVoid(ctx, "inventory.reserve", func(ctx context.Context) error { + if reqBody.Scenario == ScenarioInventory503 { + return waylogv2.NewError("INV_503", waylogv2.WithReason("inventory service unavailable")) + } waylogv2.From(ctx).Info("inventory reserved", waylogv2.F{ "sku": reqBody.SKU, "reservation_id": "res-" + reqBody.SKU, diff --git a/examples/microdemo/gateway.go b/examples/microdemo/gateway.go index e4a0a52..79b5899 100644 --- a/examples/microdemo/gateway.go +++ b/examples/microdemo/gateway.go @@ -19,6 +19,15 @@ const ( ScenarioSuppressedPayment502 = "suppressed_payment_502" ScenarioDBMiss = "db_miss" ScenarioCheckoutError = "checkout_error" + // ScenarioInventory503 fails the inventory.reserve step with INV_503 so the + // demo's failing dependency rotates beyond payment/db — an alternate + // dependency failure, never a replacement for the seeded PMT_502 path. + ScenarioInventory503 = "inventory_503" + // ScenarioCheckoutPanic triggers a real recoverable panic inside the + // checkout handler. 
The Waylog HTTP middleware recovers it (emitting a failed + // WideEvent) and, with runtime hooks enabled, posts a go-sdk "runtime" panic + // signal that correlates onto the checkout incident as app-runtime evidence. + ScenarioCheckoutPanic = "checkout_panic" demoUserID = "demo-user" ) diff --git a/examples/microdemo/microdemo_test.go b/examples/microdemo/microdemo_test.go index 71aa33f..183c4ee 100644 --- a/examples/microdemo/microdemo_test.go +++ b/examples/microdemo/microdemo_test.go @@ -114,6 +114,33 @@ func TestCheckoutInternalErrorEmitsCHK500WithoutDownstream(t *testing.T) { requireNoStep(t, ev, "order.commit") } +func TestCheckoutInventory503EmitsINV503AfterDBLoad(t *testing.T) { + out := initSDK(t, "checkout") + db := httptest.NewServer(okJSON()) + defer db.Close() + payment := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Fatalf("payment should not be called for inventory_503 scenario") + })) + defer payment.Close() + + resp := postPurchase(t, wayloghttp.HTTP(microdemo.NewCheckoutHandler(payment.URL, db.URL)), "/checkout", microdemo.ScenarioInventory503) + if resp.Code != http.StatusBadGateway { + t.Fatalf("status = %d, want %d", resp.Code, http.StatusBadGateway) + } + ev := oneEvent(t, out) + if ev.Status != eventv2.StatusError { + t.Fatalf("status = %s, want error", ev.Status) + } + if ev.Anchor == nil || ev.Anchor.Step != "inventory.reserve" || ev.Anchor.ErrorCode != "INV_503" { + t.Fatalf("anchor = %#v, want inventory.reserve/INV_503", ev.Anchor) + } + requireStep(t, ev, "cart.validate", eventv2.StepStatusOK, "") + requireStep(t, ev, "db.load_cart", eventv2.StepStatusOK, "db") + requireStep(t, ev, "inventory.reserve", eventv2.StepStatusError, "") + requireNoStep(t, ev, "payment.charge") + requireNoStep(t, ev, "order.commit") +} + func TestCheckoutHappyEmitsOKWithoutAnchor(t *testing.T) { out := initSDK(t, "checkout") db := httptest.NewServer(okJSON()) diff --git a/examples/microdemo/scenario.go 
b/examples/microdemo/scenario.go index b83c699..69d7378 100644 --- a/examples/microdemo/scenario.go +++ b/examples/microdemo/scenario.go @@ -21,6 +21,10 @@ func normalizeScenario(s string) string { return ScenarioDBMiss case ScenarioCheckoutError: return ScenarioCheckoutError + case ScenarioInventory503: + return ScenarioInventory503 + case ScenarioCheckoutPanic: + return ScenarioCheckoutPanic default: return "" } diff --git a/examples/microdemo/service.go b/examples/microdemo/service.go index b18efe2..2c15b61 100644 --- a/examples/microdemo/service.go +++ b/examples/microdemo/service.go @@ -15,12 +15,13 @@ import ( func InitService(service string) error { return waylogv2.Init(waylogv2.Config{ - Service: service, - Env: "demo", - Version: "0.1.0", - IngestURL: config.Getenv("INGEST_URL", "http://localhost:8080"), - APIKey: config.Getenv("WAYLOG_WRITE_KEY", ""), - DevMode: config.GetenvBool("WAYLOG_DEV", false), + Service: service, + Env: "demo", + Version: "0.1.0", + IngestURL: config.Getenv("INGEST_URL", "http://localhost:8080"), + APIKey: config.Getenv("WAYLOG_WRITE_KEY", ""), + DevMode: config.GetenvBool("WAYLOG_DEV", false), + EnableRuntimeHooks: config.GetenvBool("WAYLOG_ENABLE_RUNTIME_HOOKS", true), }) } diff --git a/examples/microdemo/signals.go b/examples/microdemo/signals.go index b7b6611..547d180 100644 --- a/examples/microdemo/signals.go +++ b/examples/microdemo/signals.go @@ -63,6 +63,22 @@ func (p *DemoSignalPoster) PostDemoSignals(ctx context.Context) []SignalResult { Resource: map[string]any{"service": "payment", "endpoint": "POST /charge"}, Metadata: map[string]any{"error_code": "PMT_502", "downstream": "payment", "demo": "traffic_burst"}, }, + { + // Infra runtime evidence. 
Targets checkout — the service the burst + // incident opens on (checkout:payment.charge:PMT_502) — so it + // correlates onto the same incident that already carries the alert, + // dependency, propagation and blast evidence (Critical Design + // Decision 3: runtime signals match by inc.Service). Source k8s-demo + // marks it as infrastructure-runtime in the dashboard. + Type: "runtime", + Service: "checkout", + Severity: "critical", + Reason: "OOMKilled", + Message: "Container checkout killed by OOM (limit: 256Mi, usage: 312Mi).", + Source: "k8s-demo", + Resource: map[string]any{"service": "checkout", "container": "checkout"}, + Metadata: map[string]any{"subtype": "oom_killed", "pod": "checkout-7f8b9c-x2k", "container": "checkout", "demo": "traffic_burst"}, + }, } results := make([]SignalResult, 0, len(specs)) @@ -133,14 +149,19 @@ type demoSignalSpec struct { Severity string Reason string Message string + Source string // signal source; defaults to "waylog-demo" when empty Resource map[string]any Metadata map[string]any } func (s demoSignalSpec) body(ts time.Time) map[string]any { + source := s.Source + if source == "" { + source = "waylog-demo" + } return map[string]any{ "type": s.Type, - "source": "waylog-demo", + "source": source, "service": s.Service, "env": "demo", "severity": s.Severity, diff --git a/examples/microdemo/signals_test.go b/examples/microdemo/signals_test.go index d1d530b..8c09566 100644 --- a/examples/microdemo/signals_test.go +++ b/examples/microdemo/signals_test.go @@ -35,16 +35,16 @@ func TestDemoSignalPosterPostsDeployAndDependencySignals(t *testing.T) { })} poster.now = func() time.Time { return time.Date(2026, 5, 5, 12, 0, 0, 0, time.UTC) } results := poster.PostDemoSignals(t.Context()) - if len(results) != 2 { - t.Fatalf("results len = %d, want 2", len(results)) + if len(results) != 3 { + t.Fatalf("results len = %d, want 3", len(results)) } for _, result := range results { if !result.Accepted || result.SignalID == "" || result.Status != 
http.StatusCreated { t.Fatalf("result = %+v", result) } } - if len(posted) != 2 { - t.Fatalf("posted len = %d, want 2", len(posted)) + if len(posted) != 3 { + t.Fatalf("posted len = %d, want 3", len(posted)) } if posted[0]["type"] != "deploy" || posted[0]["service"] != "checkout" || posted[0]["env"] != "demo" { t.Fatalf("deploy signal = %+v", posted[0]) @@ -56,6 +56,18 @@ func TestDemoSignalPosterPostsDeployAndDependencySignals(t *testing.T) { if !ok || metadata["error_code"] != "PMT_502" { t.Fatalf("dependency metadata = %+v", posted[1]["metadata"]) } + // Infra runtime (OOMKill) targets checkout with env=demo so it correlates + // onto the same incident as the alert/dependency evidence. + if posted[2]["type"] != "runtime" || posted[2]["service"] != "checkout" || posted[2]["env"] != "demo" { + t.Fatalf("runtime signal = %+v", posted[2]) + } + if posted[2]["source"] != "k8s-demo" || posted[2]["reason"] != "OOMKilled" { + t.Fatalf("runtime signal source/reason = %+v", posted[2]) + } + rtMeta, ok := posted[2]["metadata"].(map[string]any) + if !ok || rtMeta["subtype"] != "oom_killed" { + t.Fatalf("runtime metadata = %+v", posted[2]["metadata"]) + } } func TestDemoSignalPosterReportsNonCreatedResponse(t *testing.T) { @@ -67,8 +79,8 @@ func TestDemoSignalPosterReportsNonCreatedResponse(t *testing.T) { }, nil })} results := poster.PostDemoSignals(t.Context()) - if len(results) != 2 { - t.Fatalf("results len = %d, want 2", len(results)) + if len(results) != 3 { + t.Fatalf("results len = %d, want 3", len(results)) } for _, result := range results { if result.Accepted || result.Status != http.StatusServiceUnavailable || result.Error == "" { diff --git a/examples/microdemo/ui.html b/examples/microdemo/ui.html index b0d3cf0..028a62e 100644 --- a/examples/microdemo/ui.html +++ b/examples/microdemo/ui.html @@ -4,7 +4,7 @@ - Waylog — Live demo + Crux — Live demo @@ -19,12 +19,13 @@ --line-strong: #a8a18d; --surface: #f4efe1; --surface-2: #ede7d6; - --accent: #15803d; - 
--accent-bright: #22c55e; - --accent-soft: rgba(34, 197, 94, 0.18); + --accent: #FF7300; + --accent-bright: #FF7300; + --accent-soft: rgba(255, 115, 0, 0.18); --danger: #b42318; --danger-soft: rgba(180, 35, 24, 0.16); --warn: #b45309; + --ok: #15803d; --focus: #2563eb; --font-sans: "Geist", ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; --font-mono: "Geist Mono", ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; @@ -43,12 +44,13 @@ --line-strong: #2a2a2a; --surface: #0c0c0c; --surface-2: #121212; - --accent: #5eff8b; - --accent-bright: #5eff8b; - --accent-soft: rgba(94, 255, 139, 0.18); + --accent: #FF7300; + --accent-bright: #FF7300; + --accent-soft: rgba(255, 115, 0, 0.18); --danger: #f87171; --danger-soft: rgba(248, 113, 113, 0.18); --warn: #fbbf24; + --ok: #5eff8b; } } @@ -398,7 +400,7 @@ color: var(--muted); font-size: 0.86rem; } - .proof-checklist .ok { color: var(--accent); font-family: var(--font-mono); font-weight: 600; } + .proof-checklist .ok { color: var(--ok); font-family: var(--font-mono); font-weight: 600; } .proof-checklist .bad { color: var(--danger); font-family: var(--font-mono); font-weight: 600; } .proof-table { width: 100%; @@ -638,7 +640,7 @@
- Waylog + Crux / live demo
@@ -647,12 +649,13 @@
-

Simulate a checkout outage. Watch Waylog explain the cascade.

-

Trigger a real request through api-gatewaycheckoutdb/payment, then jump into Waylog to see the failure path, blast radius, and trace narrative.

+

Simulate a checkout outage. Watch Crux explain the cascade.

+

Trigger a real request through api-gatewaycheckoutdb/payment, then jump into Crux to see the failure cascade, blast radius, alert evidence, and triage report.

+
@@ -661,23 +664,27 @@

Simulate a checkout outage. Watch Waylog explain the cascade.

Scenarios

-
+
Happy checkout Captured as a successful recent request; no error family appears.
-
+
Payment outage Checkout records payment.charge with PMT_502; dashboard shows the failure path and blast radius.
-
+
Cart not found DB returns 404; checkout records db.load_cart with CART_NOT_FOUND — a logical (not infra) failure.
-
+
Checkout 500 Checkout fails before any downstream call: cart.validate raises CHK_500 with no payment or db hop.
-
+
+ Inventory outage + DB load succeeds, then inventory.reserve fails with INV_503 — an alternate dependency failure before any payment hop. +
+
Suppressed known issue Visible in recent/direct trace views, excluded from error rollups and blast radius.
@@ -726,13 +733,15 @@

Result

const burstImpactFamilies = { payment_502: "checkout:payment.charge:PMT_502", db_miss: "checkout:db.load_cart:CART_NOT_FOUND", - checkout_error: "checkout:cart.validate:CHK_500" + checkout_error: "checkout:cart.validate:CHK_500", + inventory_503: "checkout:inventory.reserve:INV_503" }; const scenarioLabels = { happy: "happy", payment_502: "payment_502", db_miss: "db_miss", checkout_error: "checkout_error", + inventory_503: "inventory_503", suppressed_payment_502: "suppressed" }; const scenarioCopy = { @@ -743,19 +752,24 @@

Result

}, payment_502: { title: "Payment outage captured", - detail: "Waylog should show checkout:payment.charge:PMT_502 in Failures and Impact.", + detail: "Crux should show checkout:payment.charge:PMT_502 in Failures and Impact.", impactFamily: "checkout:payment.charge:PMT_502" }, db_miss: { title: "Cart not found captured", - detail: "Waylog should show checkout:db.load_cart:CART_NOT_FOUND in Failures and Impact.", + detail: "Crux should show checkout:db.load_cart:CART_NOT_FOUND in Failures and Impact.", impactFamily: "checkout:db.load_cart:CART_NOT_FOUND" }, checkout_error: { title: "Checkout 500 captured", - detail: "Waylog should show checkout:cart.validate:CHK_500 in Failures and Impact — no payment or db hop should appear.", + detail: "Crux should show checkout:cart.validate:CHK_500 in Failures and Impact — no payment or db hop should appear.", impactFamily: "checkout:cart.validate:CHK_500" }, + inventory_503: { + title: "Inventory outage captured", + detail: "Crux should show checkout:inventory.reserve:INV_503 in Failures and Impact — the db hop succeeds, payment is never reached.", + impactFamily: "checkout:inventory.reserve:INV_503" + }, suppressed_payment_502: { title: "Suppressed issue captured", detail: "This known issue should show in Recent requests while staying out of Failures and Impact.", @@ -904,7 +918,7 @@

Result

data.trace_id ? `Explain this trace` : "", copy.impactFamily ? `View impact` : "" ].filter(Boolean).join(""); - const timeline = story ? renderTimeline(story) : `
${timedOut ? "Trace captured, but the story is still catching up. Use the dashboard link to retry." : "Still propagating through Waylog…"}
`; + const timeline = story ? renderTimeline(story) : `
${timedOut ? "Trace captured, but the story is still catching up. Use the dashboard link to retry." : "Still propagating through Crux…"}
`; const result = document.getElementById("result"); result.className = "bracketed"; result.innerHTML = ` @@ -1008,7 +1022,8 @@

Result

Proof loop complete

Alert correlated. Root cause identified. Report verified.

-

Waylog correlated an external alert with an active incident, found ${esc(rootCause)}, and produced a cited operator report whose hash was stable across CLI, read, direct tool, and plan-template surfaces in this run.

+

Crux correlated an external alert with an active incident, found ${esc(rootCause)}, and produced a cited operator report whose hash was stable across CLI, read, direct tool, and plan-template surfaces in this run.

+ ${isLocalDemo() ? `

In this local demo, provider links open demo controls. In a real install, they point to your alert provider.

` : ""}
Root cause${esc(rootCause)}
diff --git a/examples/microdemo/ui_test.go b/examples/microdemo/ui_test.go index aae8b1a..8e1c54b 100644 --- a/examples/microdemo/ui_test.go +++ b/examples/microdemo/ui_test.go @@ -19,12 +19,22 @@ func TestDemoUIProductShowcaseCopy(t *testing.T) { } html := rec.Body.String() required := []string{ - "Simulate a checkout outage. Watch Waylog explain the cascade.", + "Crux — Live demo", + "Crux", + "#FF7300", + "--ok: #15803d", + ".proof-checklist .ok { color: var(--ok);", + "failure cascade", + "blast radius", + "alert evidence", + "triage report", + "Simulate a checkout outage. Watch Crux explain the cascade.", "Run payment outage", "Run happy checkout", "Run suppressed known issue", "Run cart not found", "Run checkout 500", + "Run inventory outage", "Run traffic burst", "Run proof loop", "Alert-to-report proof", @@ -59,7 +69,8 @@ func TestDemoUIProductShowcaseCopy(t *testing.T) { "Open dashboard", "Explain this trace", "View impact", - "Still propagating through Waylog…", + "Still propagating through Crux…", + "In this local demo, provider links open demo controls.", "Happy checkout captured", "Payment outage captured", "Cart not found captured", @@ -76,6 +87,12 @@ func TestDemoUIProductShowcaseCopy(t *testing.T) { "waylog explain <trace_id>", "waylog errors", "waylog blast", + "Watch Waylog explain", + "Still propagating through Waylog", + "Waylog — Live demo", + "#22c55e", + "rgba(34, 197, 94", + "rgba(94, 255, 139", } for _, needle := range forbidden { if strings.Contains(html, needle) { diff --git a/go.mod b/go.mod index ad8dfec..fd9c2e4 100644 --- a/go.mod +++ b/go.mod @@ -5,61 +5,39 @@ go 1.24.2 toolchain go1.24.12 require ( - github.com/charmbracelet/bubbles v1.0.0 - github.com/charmbracelet/bubbletea v1.3.10 - github.com/charmbracelet/lipgloss v1.1.0 + github.com/google/uuid v1.6.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/segmentio/kafka-go v0.4.50 + github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 
github.com/sssmaran/WaylogCLI/pkg v0.0.0 - github.com/sssmaran/WaylogCLI/pkg/transport/kafka v0.0.0 go.opentelemetry.io/proto/otlp v1.10.0 + golang.org/x/time v0.12.0 + google.golang.org/grpc v1.79.2 + google.golang.org/protobuf v1.36.11 modernc.org/sqlite v1.46.1 ) replace github.com/sssmaran/WaylogCLI/pkg => ./pkg -replace github.com/sssmaran/WaylogCLI/pkg/transport/kafka => ./pkg/transport/kafka - require ( - github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/charmbracelet/colorprofile v0.4.1 // indirect - github.com/charmbracelet/x/ansi v0.11.6 // indirect - github.com/charmbracelet/x/cellbuf v0.0.15 // indirect - github.com/charmbracelet/x/term v0.2.2 // indirect - github.com/clipperhouse/displaywidth v0.9.0 // indirect - github.com/clipperhouse/stringish v0.1.1 // indirect - github.com/clipperhouse/uax29/v2 v2.5.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect - github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/kr/text v0.2.0 // indirect - github.com/lucasb-eyer/go-colorful v1.3.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-localereader v0.0.1 // indirect - github.com/mattn/go-runewidth v0.0.19 // indirect - github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect - github.com/muesli/cancelreader v0.2.2 // indirect - github.com/muesli/termenv v0.16.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect - github.com/pierrec/lz4/v4 v4.1.15 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec 
// indirect - github.com/rivo/uniseg v0.4.7 // indirect - github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect golang.org/x/net v0.50.0 // indirect golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect - google.golang.org/grpc v1.79.2 // indirect - google.golang.org/protobuf v1.36.11 // indirect modernc.org/libc v1.67.6 // indirect modernc.org/mathutil v1.7.1 // indirect modernc.org/memory v1.11.0 // indirect diff --git a/go.sum b/go.sum index 89a1ad1..f4df390 100644 --- a/go.sum +++ b/go.sum @@ -1,42 +1,26 @@ -github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= -github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc= -github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E= -github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw= -github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4= -github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk= -github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk= -github.com/charmbracelet/lipgloss v1.1.0 
h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY= -github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30= -github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8= -github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ= -github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI= -github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q= -github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= -github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= -github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA= -github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA= -github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs= -github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA= -github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U= -github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= -github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= -github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= +github.com/go-logr/logr v1.4.3 
h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -47,26 +31,12 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= -github.com/lucasb-eyer/go-colorful v1.3.0/go.mod 
h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= -github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= -github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= -github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= -github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= -github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= -github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= -github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= -github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= -github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= -github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0= -github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 
h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= @@ -79,22 +49,24 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= -github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= -github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= -github.com/segmentio/kafka-go v0.4.50 h1:mcyC3tT5WeyWzrFbd6O374t+hmcu1NKt2Pu1L3QaXmc= -github.com/segmentio/kafka-go v0.4.50/go.mod h1:Y1gn60kzLEEaW28YshXyk2+VCUKbJ3Qr6DrnT3i4+9E= +github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= +github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= -github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= -github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= -github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= -github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= -github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= -github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e 
h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= -github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= +go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= +go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= +go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= +go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= +go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= +go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= +go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= +go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= +go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -109,14 +81,19 @@ golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= -golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= +golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0= +google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY= google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac= google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU= diff --git a/go.work b/go.work index fb5f00b..53cee76 100644 --- a/go.work +++ b/go.work @@ -1,4 +1,4 @@ -go 1.24.2 +go 1.25.0 use ( . diff --git a/internal/auth/config.go b/internal/auth/config.go index 6b8da56..66ab621 100644 --- a/internal/auth/config.go +++ b/internal/auth/config.go @@ -17,6 +17,11 @@ type AuthConfig struct { ReadKeys []string AgentKeys []string + // WriteKeyEnv records which env var populated WriteKeys ("WAYLOG_WRITE_KEY" + // or the legacy "WAYLOG_API_KEY") so a weak-key warning names the variable the + // operator actually set. 
Empty when no write key is configured. + WriteKeyEnv string + Profile string // "demo", "dev", or "prod". Defaults to "dev" when unset. DashboardMode string // "off", "basic", "key" @@ -48,8 +53,10 @@ func ParseConfig(env map[string]string) (AuthConfig, error) { if writeKey != "" { cfg.WriteKeys = splitKeys(writeKey) + cfg.WriteKeyEnv = "WAYLOG_WRITE_KEY" } else if legacyKey != "" { cfg.WriteKeys = []string{legacyKey} + cfg.WriteKeyEnv = "WAYLOG_API_KEY" } cfg.ReadKeys = splitKeys(env["WAYLOG_READ_KEY"]) @@ -112,6 +119,58 @@ func ParseConfig(env map[string]string) (AuthConfig, error) { return cfg, nil } +// weakKeyValues are common placeholder secrets that must never guard a real +// deployment. Matched case-insensitively; the deploy/prod.env "changeme-*" +// presets are caught by the prefix check in isWeakKey. +var weakKeyValues = map[string]bool{ + "demo": true, "changeme": true, "change-me": true, "password": true, + "secret": true, "test": true, "example": true, "key": true, "token": true, +} + +func isWeakKey(k string) bool { + k = strings.ToLower(strings.TrimSpace(k)) + if k == "" { + return false + } + return weakKeyValues[k] || strings.HasPrefix(k, "changeme") || strings.HasPrefix(k, "change-me") +} + +// weakKeySuffix is appended to every weak-key warning. It is non-fatal and fires +// in all profiles — including demo — because a placeholder key is only ever safe +// on a local, unexposed server. The demo deliberately runs with a "demo" key, so +// `make demo` surfaces this warning by design; it is the same nudge an operator +// needs if they promote that config toward a real deployment. +const weakKeySuffix = " is a weak/placeholder value — fine for a local demo, but never expose this server with it" + +// WeakKeyWarnings returns one human-readable warning per auth scope guarded by a +// placeholder/demo secret. Callers should log these at startup so an operator who +// ships with a default key is told before it becomes an incident. 
+func (c AuthConfig) WeakKeyWarnings() []string { + var warns []string + check := func(envName string, keys []string) { + for _, k := range keys { + if isWeakKey(k) { + warns = append(warns, envName+weakKeySuffix) + return + } + } + } + writeEnv := c.WriteKeyEnv + if writeEnv == "" { + writeEnv = "WAYLOG_WRITE_KEY" + } + check(writeEnv, c.WriteKeys) + check("WAYLOG_READ_KEY", c.ReadKeys) + check("WAYLOG_AGENT_KEY", c.AgentKeys) + if c.DashboardMode == "basic" && isWeakKey(c.DashboardPass) { + warns = append(warns, "DASHBOARD_AUTH basic password"+weakKeySuffix) + } + if c.DashboardMode == "key" && isWeakKey(c.DashboardKey) { + warns = append(warns, "DASHBOARD_AUTH key"+weakKeySuffix) + } + return warns +} + func splitKeys(s string) []string { s = strings.TrimSpace(s) if s == "" { diff --git a/internal/auth/config_test.go b/internal/auth/config_test.go index 3b49011..7c3240a 100644 --- a/internal/auth/config_test.go +++ b/internal/auth/config_test.go @@ -171,3 +171,65 @@ func TestParseConfig_ProfileDemoAllowsOpen(t *testing.T) { t.Fatalf("profile = %q, want %q", cfg.Profile, ProfileDemo) } } + +func TestWeakKeyWarnings(t *testing.T) { + t.Run("flags placeholder keys across scopes in dev", func(t *testing.T) { + cfg := AuthConfig{ + Profile: ProfileDev, + WriteKeys: []string{"changeme-write"}, + ReadKeys: []string{"demo"}, + AgentKeys: []string{"a-real-strong-agent-key"}, + DashboardMode: "basic", + DashboardPass: "changeme", + } + warns := cfg.WeakKeyWarnings() + joined := strings.Join(warns, "\n") + for _, want := range []string{"WAYLOG_WRITE_KEY", "WAYLOG_READ_KEY", "DASHBOARD_AUTH basic"} { + if !strings.Contains(joined, want) { + t.Fatalf("warnings %q missing mention of %q", joined, want) + } + } + if strings.Contains(joined, "WAYLOG_AGENT_KEY") { + t.Fatalf("strong agent key should not be flagged: %q", joined) + } + }) + + t.Run("still warns in demo profile (make demo runs with a demo key)", func(t *testing.T) { + cfg := AuthConfig{Profile: ProfileDemo, 
WriteKeys: []string{"demo"}, ReadKeys: []string{"demo"}} + warns := cfg.WeakKeyWarnings() + if len(warns) == 0 { + t.Fatal("demo profile with demo keys should warn (plan G5a: start with demo key -> warning log)") + } + for _, w := range warns { + if !strings.Contains(w, "local demo") { + t.Fatalf("demo warning should read as non-fatal/expected for local demo, got %q", w) + } + } + }) + + t.Run("silent when keys are strong", func(t *testing.T) { + cfg := AuthConfig{ + Profile: ProfileProd, + WriteKeys: []string{"K7f2-write-9aZ"}, + ReadKeys: []string{"K7f2-read-9aZ"}, + AgentKeys: []string{"K7f2-agent-9aZ"}, + } + if warns := cfg.WeakKeyWarnings(); len(warns) != 0 { + t.Fatalf("strong keys should not warn, got %v", warns) + } + }) + + t.Run("names WAYLOG_API_KEY when the weak write key came from the legacy var", func(t *testing.T) { + cfg, err := ParseConfig(map[string]string{"WAYLOG_API_KEY": "demo"}) + if err != nil { + t.Fatalf("ParseConfig: %v", err) + } + joined := strings.Join(cfg.WeakKeyWarnings(), "\n") + if !strings.Contains(joined, "WAYLOG_API_KEY") { + t.Fatalf("legacy weak write key should name WAYLOG_API_KEY, got %q", joined) + } + if strings.Contains(joined, "WAYLOG_WRITE_KEY") { + t.Fatalf("must not name WAYLOG_WRITE_KEY when the source was the legacy var, got %q", joined) + } + }) +} diff --git a/internal/cli/root.go b/internal/cli/root.go index 069b1b7..a1203e7 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -1,33 +1,16 @@ package cli import ( - "context" "encoding/json" "fmt" "io" "net/http" "os" "strings" - - "github.com/sssmaran/WaylogCLI/internal/llm" - "github.com/sssmaran/WaylogCLI/internal/tools" ) -// defaultStore is set via SetDefaultStore for backward compatibility. -var defaultStore tools.Store - -// SetDefaultStore sets the default store for CLI commands that don't provide one. -func SetDefaultStore(s tools.Store) { - defaultStore = s -} - -// Run runs the CLI with the default store. +// Run runs the CLI. 
func Run(args []string) { - RunWithStore(defaultStore, args) -} - -// RunWithStore runs the CLI with the provided store. -func RunWithStore(store tools.Store, args []string) { if len(args) == 0 { usage() return @@ -43,7 +26,7 @@ func RunWithStore(store tools.Store, args []string) { handleTools() return } - handleAsk(store, args[1:]) + handleAsk(args[1:]) default: usage() } @@ -55,11 +38,9 @@ func usage() { fmt.Println(" waylog tools") fmt.Println("") fmt.Println("examples:") - fmt.Println(" waylog \"show top errors\"") - fmt.Println(" waylog \"trace summary for trace \"") fmt.Println(" waylog \"explain request \"") - fmt.Println(" waylog \"graph_query expr='error_code=PMT_502' window='10m'\"") - fmt.Println(" waylog \"compare_windows current='10m' baseline='10m' offset='1h'\"") + fmt.Println(" waylog \"blast radius for payment-service/charge/PMT_502 in 15m\"") + fmt.Println(" waylog \"triage incident \"") } func handleTools() { @@ -96,20 +77,9 @@ func handleTools() { } } - // Fallback to local registry if server is unreachable. if len(entries) == 0 { - reg := tools.NewRegistry() - if regErr := tools.RegisterGraphTools(reg); regErr != nil { - fmt.Println("tool registry error:", regErr) - return - } - for _, t := range reg.List() { - entries = append(entries, toolEntry{ - Name: t.Name, - Description: t.Description, - Examples: t.Examples, - }) - } + fmt.Println("no tools available (server returned empty list or unreachable)") + return } // Print formatted table. 
@@ -123,184 +93,20 @@ func handleTools() { } } -func handleAsk(store tools.Store, args []string) { - prompt := strings.TrimSpace(strings.Join(args, " ")) - if prompt == "" { +func handleAsk(args []string) { + if len(args) > 0 && strings.TrimSpace(strings.Join(args, " ")) == "" { fmt.Println("usage: waylog \"\"") return } - - sel, err := llm.SelectFromEnv() - if err != nil { - fmt.Println(err) - return - } - if !sel.AskEnabled { - fmt.Println(llm.ErrProviderNotConfigured) - return - } - - reg := tools.NewRegistry() - if err := tools.RegisterGraphTools(reg); err != nil { - fmt.Println("tool registry error:", err) - return - } - - toolDefs := make([]llm.ToolDefinition, 0, len(reg.List())) - for _, t := range reg.List() { - toolDefs = append(toolDefs, llm.ToolDefinition{ - Name: t.Name, - Description: t.Description, - InputSchema: t.InputSchema, - }) - } - - answer, _, err := llm.Ask(context.Background(), sel.Impl, toolDefs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) { - return reg.Call(ctx, store, name, params) - }), prompt, llm.AskOptions{MaxSteps: 5}) - if err != nil { - printAskError(err) - return - } - - fmt.Println(colorizeOutput(answer)) -} - -func printAskError(err error) { - msg := err.Error() - fmt.Printf("%s✗ %s%s\n", ansiRed, msg, ansiReset) - - var tip string - switch { - case strings.Contains(msg, "expr required") || strings.Contains(msg, "window required"): - tip = "graph_query requires both expr and window, for example:\n waylog \"graph_query expr='error_code=PMT_502' window='10m'\"" - case strings.Contains(msg, "query parse error"): - tip = "check your query syntax. 
Example:\n waylog \"graph_query expr='success=false' window='10m'\"" - case strings.Contains(msg, "request_id or trace_id required"): - tip = "provide a trace ID, for example:\n waylog \"explain request \"" - case strings.Contains(msg, "current, baseline, and offset required"): - tip = "compare_windows needs current, baseline, and offset, for example:\n waylog \"compare_windows current='10m' baseline='10m' offset='1h'\"" - case strings.Contains(msg, "map that to a tool") || strings.Contains(msg, "couldn't"): - tip = "Try: \"show top errors\", \"summarize trace \", or \"explain request \"" - } - if tip != "" { - fmt.Printf("%s💡 %s%s\n", ansiYellow, tip, ansiReset) - } + fmt.Println("local ask is no longer wired; use the ingest server's /v1/ask endpoint") + fmt.Println(" curl -H \"Authorization: Bearer $WAYLOG_AGENT_KEY\" \\") + fmt.Println(" -d '{\"prompt\":\"...\"}' http://localhost:8080/v1/ask") } const ( - ansiReset = "\033[0m" - ansiBold = "\033[1m" - ansiDim = "\033[2m" - ansiRed = "\033[31m" - ansiGreen = "\033[32m" - ansiYellow = "\033[33m" - ansiBlue = "\033[34m" - ansiMagenta = "\033[35m" - ansiCyan = "\033[36m" - ansiWhite = "\033[37m" + ansiReset = "\033[0m" + ansiBold = "\033[1m" + ansiDim = "\033[2m" + ansiYellow = "\033[33m" + ansiCyan = "\033[36m" ) - -func colorizeOutput(s string) string { - lines := strings.Split(s, "\n") - var out strings.Builder - for i, line := range lines { - if i > 0 { - out.WriteByte('\n') - } - out.WriteString(colorizeLine(line)) - } - return out.String() -} - -func colorizeLine(line string) string { - trimmed := strings.TrimSpace(line) - - // Empty lines - if trimmed == "" { - return "" - } - - // Section headers: **Title** or **Title** - if strings.HasPrefix(trimmed, "**") && strings.HasSuffix(trimmed, "**") { - title := strings.Trim(trimmed, "* ") - return fmt.Sprintf("\n%s%s%s%s", ansiBold, ansiCyan, title, ansiReset) - } - - // Bullet lines: - key: value - if strings.HasPrefix(trimmed, "- ") { - return 
colorizeBullet(trimmed) - } - - // Title line (first non-empty, non-bullet, non-header line) — bold white - if !strings.HasPrefix(trimmed, "-") && !strings.HasPrefix(trimmed, "*") { - return fmt.Sprintf("%s%s%s", ansiWhite, trimmed, ansiReset) - } - - return line -} - -func colorizeBullet(line string) string { - // Split "- key: value" into key and value - content := strings.TrimPrefix(line, "- ") - parts := strings.SplitN(content, ": ", 2) - - if len(parts) != 2 { - return fmt.Sprintf(" %s•%s %s", ansiDim, ansiReset, content) - } - - key := parts[0] - value := parts[1] - - // Color the value based on the key type - coloredValue := colorizeValue(key, value) - return fmt.Sprintf(" %s•%s %s%s:%s %s", ansiDim, ansiReset, ansiDim, key, ansiReset, coloredValue) -} - -func colorizeValue(key, value string) string { - lowerKey := strings.ToLower(key) - lowerVal := strings.ToLower(value) - - // Error codes and error-related values - if strings.Contains(lowerKey, "error") || strings.Contains(lowerKey, "failure") { - return fmt.Sprintf("%s%s%s%s", ansiBold, ansiRed, value, ansiReset) - } - - // Trace/request/span IDs — cyan for easy copy - if strings.Contains(lowerKey, "trace_id") || strings.Contains(lowerKey, "request_id") || - strings.Contains(lowerKey, "span") { - return fmt.Sprintf("%s%s%s", ansiCyan, value, ansiReset) - } - - // Counts and numeric values - if strings.Contains(lowerKey, "count") || strings.Contains(lowerKey, "total") || - strings.Contains(lowerKey, "latency") { - return fmt.Sprintf("%s%s%s%s", ansiBold, ansiYellow, value, ansiReset) - } - - // Service names - if strings.Contains(lowerKey, "service") { - return fmt.Sprintf("%s%s%s", ansiMagenta, value, ansiReset) - } - - // Event names - if strings.Contains(lowerKey, "event") || strings.Contains(lowerKey, "flow") { - return fmt.Sprintf("%s%s%s", ansiBlue, value, ansiReset) - } - - // Service paths (arrows) - if strings.Contains(value, "->") { - return fmt.Sprintf("%s%s%s", ansiMagenta, value, ansiReset) - 
} - - // Success/failure status values - if lowerVal == "true" || lowerVal == "success" || lowerVal == "ok" { - return fmt.Sprintf("%s%s%s", ansiGreen, value, ansiReset) - } - if lowerVal == "false" || strings.Contains(lowerVal, "fail") || strings.Contains(lowerVal, "error") { - return fmt.Sprintf("%s%s%s", ansiRed, value, ansiReset) - } - - // Default — white - return fmt.Sprintf("%s%s%s", ansiWhite, value, ansiReset) -} diff --git a/internal/cli/v2/cmd.go b/internal/cli/v2/cmd.go index 9c2ba9e..49b6d94 100644 --- a/internal/cli/v2/cmd.go +++ b/internal/cli/v2/cmd.go @@ -67,6 +67,8 @@ func RunCLI(args []string, _ io.Reader, stdout, stderr io.Writer) int { return runSearch(ctx, client, cfg, rest[1:], stdout, stderr) case "triage": return runTriage(ctx, client, cfg, rest[1:], stdout, stderr) + case "doctor": + return runDoctor(cfg, rest[1:], stdout, stderr) default: fmt.Fprintf(stderr, "unknown command: %s\n", rest[0]) printUsage(stderr) @@ -133,19 +135,6 @@ func setGlobalValue(cfg *cliConfig, key, value string) error { return nil } -func requireV2Reads(ctx context.Context, client *Client, stderr io.Writer) int { - caps, err := client.Capabilities(ctx) - if err != nil { - fmt.Fprintf(stderr, "capability check failed: %v\n", err) - return exitCodeForError(err) - } - if !caps.V2Reads.Enabled { - fmt.Fprintln(stderr, "server must run with WAYLOG_V2_READS=true for the v2 CLI") - return 3 - } - return 0 -} - func runCapabilities(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int { if len(args) != 0 { return usage(stderr, "usage: waylog capabilities [--json]") @@ -163,9 +152,6 @@ func runErrors(ctx context.Context, client *Client, cfg cliConfig, args []string if err := fs.Parse(args); err != nil || fs.NArg() != 0 { return usage(stderr, "usage: waylog errors [--window ] [--service ] [--limit ] [--cursor ] [--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Errors(ctx, 
ErrorsParams{Window: *window, Service: *service, Limit: *limit, Cursor: *cursor}) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderErrors) } @@ -181,9 +167,6 @@ func runRecent(ctx context.Context, client *Client, cfg cliConfig, args []string if err := fs.Parse(args); err != nil || fs.NArg() != 0 { return usage(stderr, "usage: waylog recent [--window ] [--service ] [--status ] [--limit ] [--cursor ] [--include-suppressed] [--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Recent(ctx, RecentParams{Window: *window, Service: *service, Status: *status, Limit: *limit, Cursor: *cursor, IncludeSuppressed: *includeSuppressed}) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderRecent) } @@ -192,9 +175,6 @@ func runIncidents(ctx context.Context, client *Client, cfg cliConfig, args []str if len(args) != 0 { return usage(stderr, "usage: waylog incidents [--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Incidents(ctx) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderIncidents) } @@ -204,9 +184,6 @@ func runIncident(ctx context.Context, client *Client, cfg cliConfig, args []stri if err != nil { return usage(stderr, err.Error()) } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } if snapshot { if cfg.json { resp, err := client.IncidentSnapshotJSON(ctx, incidentID) @@ -251,9 +228,6 @@ func runTriage(ctx context.Context, client *Client, cfg cliConfig, args []string if err != nil { return usage(stderr, err.Error()) } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } rep, err := client.Triage(ctx, id, TriageParams{Window: window, Snapshot: snapshot}) if err != nil { fmt.Fprintln(stderr, err) @@ -301,9 +275,6 @@ func runEvent(ctx context.Context, client *Client, cfg cliConfig, args []string, if len(args) != 1 { return usage(stderr, "usage: waylog event 
[--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Event(ctx, args[0]) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderEvent) } @@ -312,9 +283,6 @@ func runTrace(ctx context.Context, client *Client, cfg cliConfig, args []string, if len(args) != 1 { return usage(stderr, "usage: waylog trace [--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Trace(ctx, args[0]) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderTrace) } @@ -323,9 +291,6 @@ func runExplain(ctx context.Context, client *Client, cfg cliConfig, args []strin if len(args) != 1 { return usage(stderr, "usage: waylog explain [--json]") } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } id := args[0] resp, err := client.Story(ctx, StoryQuery{EventID: id}) if isNotFound(err) { @@ -339,9 +304,6 @@ func runBlast(ctx context.Context, client *Client, cfg cliConfig, args []string, if err != nil { return usage(stderr, err.Error()) } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Blast(ctx, p) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderBlast) } @@ -422,9 +384,6 @@ func runSearch(ctx context.Context, client *Client, cfg cliConfig, args []string } p.ErrorCode = query } - if gate := requireV2Reads(ctx, client, stderr); gate != 0 { - return gate - } resp, err := client.Search(ctx, p) return renderOrError(stdout, stderr, cfg.json, resp, err, RenderSearch) } @@ -546,6 +505,7 @@ func printUsage(w io.Writer) { waylog explain [--json] waylog blast (--service --step --code | --code | ) [--window ] [--json] waylog search [--service ] [--status ] [--window ] [--limit ] [--cursor ] [--json] + waylog doctor [--server] [--json] Recommended loop: waylog incidents diff --git a/internal/cli/v2/cmd_test.go b/internal/cli/v2/cmd_test.go index 51c39fb..92bfabf 100644 --- 
a/internal/cli/v2/cmd_test.go +++ b/internal/cli/v2/cmd_test.go @@ -2,26 +2,12 @@ package cliv2 import ( "bytes" - "encoding/json" "net/http" "net/http/httptest" "strings" "testing" ) -func TestRunCLIRequiresV2Reads(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - _ = json.NewEncoder(w).Encode(CapabilitiesResponse{}) - })) - defer srv.Close() - - var stdout, stderr bytes.Buffer - code := RunCLI([]string{"--addr", srv.URL, "errors"}, nil, &stdout, &stderr) - if code != 3 || !strings.Contains(stderr.String(), "WAYLOG_V2_READS=true") { - t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String()) - } -} - func TestRunCLIErrorsHappyPath(t *testing.T) { var gotPath, gotQuery string srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -103,35 +89,6 @@ func TestRunCLIIncidentsListsActive(t *testing.T) { } } -func TestRunCLIIncidentsEmptyAndRequiresV2Reads(t *testing.T) { - calls := 0 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - calls++ - _, _ = w.Write([]byte(`{"v2_reads":{"enabled":false}}`)) - })) - defer srv.Close() - - var stdout, stderr bytes.Buffer - code := RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr) - if code != 3 || calls != 1 || !strings.Contains(stderr.String(), "WAYLOG_V2_READS=true") { - t.Fatalf("code=%d calls=%d stdout=%q stderr=%q", code, calls, stdout.String(), stderr.String()) - } - - srv.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path == "/v1/capabilities" { - _, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`)) - return - } - _, _ = w.Write([]byte(`{"incidents":[]}`)) - }) - stdout.Reset() - stderr.Reset() - code = RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr) - if code != 0 || !strings.Contains(stdout.String(), "No active incidents.") { - t.Fatalf("code=%d stdout=%q stderr=%q", 
code, stdout.String(), stderr.String()) - } -} - func TestRunCLIIncidentDetailAndSnapshot(t *testing.T) { calls := []string{} accepts := []string{} @@ -178,52 +135,6 @@ func TestRunCLIIncidentDetailAndSnapshot(t *testing.T) { } } -func TestRunCLIEventEscapesIDAndRequiresV2Reads(t *testing.T) { - calls := []string{} - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - calls = append(calls, r.URL.String()) - if r.URL.Path == "/v1/capabilities" { - _, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`)) - return - } - _, _ = w.Write([]byte(`{"event":{"event_id":"event/1","trace_id":"trace","service":"checkout","status":"ok","duration_ms":3}}`)) - })) - defer srv.Close() - - var stdout, stderr bytes.Buffer - code := RunCLI([]string{"--addr", srv.URL, "event", "event/1"}, nil, &stdout, &stderr) - if code != 0 { - t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String()) - } - if len(calls) != 2 || calls[1] != "/v1/events/event%2F1" { - t.Fatalf("calls=%v", calls) - } - if !strings.Contains(stdout.String(), "event_id: event/1") { - t.Fatalf("stdout=%q", stdout.String()) - } -} - -func TestRunCLICapabilitiesDoesNotRequireV2Reads(t *testing.T) { - var gotPath string - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - gotPath = r.URL.Path - _, _ = w.Write([]byte(`{"v2_reads":{"enabled":false},"otlp":{"http_traces":true}}`)) - })) - defer srv.Close() - - var stdout, stderr bytes.Buffer - code := RunCLI([]string{"--addr", srv.URL, "capabilities"}, nil, &stdout, &stderr) - if code != 0 { - t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String()) - } - if gotPath != "/v1/capabilities" { - t.Fatalf("path=%q", gotPath) - } - if !strings.Contains(stdout.String(), "v2_reads: disabled") || !strings.Contains(stdout.String(), "otlp_http_traces: enabled") { - t.Fatalf("stdout=%q", stdout.String()) - } -} - func TestRunCLIExplainFallsBackToTraceID(t *testing.T) 
{ calls := []string{} srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/cli/v2/doctor.go b/internal/cli/v2/doctor.go new file mode 100644 index 0000000..64ac5f4 --- /dev/null +++ b/internal/cli/v2/doctor.go @@ -0,0 +1,37 @@ +package cliv2 + +import ( + "fmt" + "io" + + "github.com/sssmaran/WaylogCLI/internal/doctor" +) + +// runDoctor runs read-only local checks (always) plus server reachability checks +// when --server is passed. Server checks probe cfg.addr (default localhost:8080). +func runDoctor(cfg cliConfig, args []string, stdout, stderr io.Writer) int { + server := false + for _, a := range args { + switch a { + case "--server": + server = true + default: + return usage(stderr, "usage: waylog doctor [--server] [--json]") + } + } + + res := doctor.Run(doctor.Options{Addr: cfg.addr, ServerChecks: server}) + + if cfg.json { + if err := doctor.RenderJSON(stdout, res); err != nil { + fmt.Fprintln(stderr, err) + return 2 + } + } else { + doctor.Render(stdout, res) + } + if res.OK() { + return 0 + } + return 1 +} diff --git a/internal/cli/v2/doctor_test.go b/internal/cli/v2/doctor_test.go new file mode 100644 index 0000000..81ef3f9 --- /dev/null +++ b/internal/cli/v2/doctor_test.go @@ -0,0 +1,57 @@ +package cliv2 + +import ( + "bytes" + "strings" + "testing" + + "github.com/sssmaran/WaylogCLI/internal/doctor" +) + +// cleanDoctorEnv clears every env var doctor reads (doctor.EnvKeys is the single +// source of truth) so the doctor command's local checks pass deterministically +// regardless of the developer's exported environment. +func cleanDoctorEnv(t *testing.T) { + t.Helper() + for _, k := range doctor.EnvKeys { + t.Setenv(k, "") + } + // Point the WAL dir at a writable temp dir so the wal-dir check passes + // hermetically (it never skips) without probing the test's cwd. 
+ t.Setenv("EVENT_LOG_V2_DIR", t.TempDir()) +} + +func TestDoctorCommandRunsLocalChecks(t *testing.T) { + cleanDoctorEnv(t) + var out, errb bytes.Buffer + code := RunCLI([]string{"doctor"}, nil, &out, &errb) + if code != 0 { + t.Fatalf("doctor exit = %d, want 0; stderr=%s stdout=%s", code, errb.String(), out.String()) + } + for _, want := range []string{"auth/config", "triage-hash", "doctor: ok"} { + if !strings.Contains(out.String(), want) { + t.Fatalf("doctor output missing %q:\n%s", want, out.String()) + } + } +} + +func TestDoctorCommandJSON(t *testing.T) { + cleanDoctorEnv(t) + var out, errb bytes.Buffer + code := RunCLI([]string{"--json", "doctor"}, nil, &out, &errb) + if code != 0 { + t.Fatalf("doctor --json exit = %d, want 0; stderr=%s", code, errb.String()) + } + if !strings.Contains(out.String(), `"checks"`) { + t.Fatalf("expected JSON with checks, got:\n%s", out.String()) + } +} + +func TestDoctorRejectsUnknownFlag(t *testing.T) { + cleanDoctorEnv(t) + var out, errb bytes.Buffer + code := RunCLI([]string{"doctor", "--bogus"}, nil, &out, &errb) + if code == 0 { + t.Fatalf("unknown doctor flag should be non-zero exit") + } +} diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go index 2e61981..c781da0 100644 --- a/internal/cli/v2/render.go +++ b/internal/cli/v2/render.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "io" + "slices" "strings" "text/tabwriter" "time" @@ -124,6 +125,9 @@ func renderIncidentBody(w io.Writer, inc Incident) { fmt.Fprintf(w, "baseline_count: %d\n", inc.BaselineCount) fmt.Fprintf(w, "current_count: %d\n", inc.CurrentCount) + renderPropagationBlock(w, inc.Propagation) + renderBlastBlock(w, inc.Blast) + fmt.Fprintln(w, "\nevidence:") if len(inc.Evidence) == 0 { fmt.Fprintln(w, " none") @@ -284,7 +288,6 @@ func RenderSearch(w io.Writer, resp EventSearchResponse) { } func RenderCapabilities(w io.Writer, resp CapabilitiesResponse) { - fmt.Fprintf(w, "v2_reads: %s\n", enabledLabel(resp.V2Reads.Enabled)) fmt.Fprintf(w, 
"otlp_http_traces: %s\n", enabledLabel(resp.OTLP.HTTPTraces)) fmt.Fprintf(w, "otlp_grpc_traces: %s", enabledLabel(resp.OTLP.GRPCTraces)) if resp.OTLP.GRPCAddr != "" { @@ -379,6 +382,106 @@ func formatTime(t time.Time) string { return t.Format(time.RFC3339) } +func renderPropagationBlock(w io.Writer, p *apiv2.PropagationSnapshot) { + if p == nil || p.Latest == nil { + fmt.Fprintln(w, "\nWhere did it start?") + fmt.Fprintln(w, " Propagation evidence unavailable") + return + } + fmt.Fprintln(w, "\nWhere did it start?") + if p.Latest.CaptureStatus != apiv2.CaptureStatusOK { + fmt.Fprintf(w, " Propagation evidence unavailable (%s) — retrying\n", p.Latest.CaptureStatus) + return + } + fmt.Fprintf(w, " Origin: %s / %s\n", p.Latest.OriginService, p.Latest.OriginStep) + firstFailing, errCode := firstErrorStep(p.Latest) + if firstFailing != "" { + fmt.Fprintf(w, " First failing step: %s %s\n", firstFailing, errCode) + } + if len(p.Latest.Path) > 0 { + names := make([]string, 0, len(p.Latest.Path)) + for _, s := range p.Latest.Path { + names = append(names, s.Step) + } + fmt.Fprintf(w, " %s\n", strings.Join(names, " → ")) + } + fmt.Fprintf(w, " sample trace: %s · captured %s ago\n", + p.Latest.SampleTraceID, time.Since(p.Latest.CapturedAt).Round(time.Second)) +} + +func renderBlastBlock(w io.Writer, b *apiv2.BlastSnapshot) { + if b == nil || b.Latest == nil { + fmt.Fprintln(w, "\nHow bad is it?") + fmt.Fprintln(w, " Blast evidence unavailable") + return + } + fmt.Fprintln(w, "\nHow bad is it?") + if b.Opening != nil && blastDelta(b.Opening, b.Latest) { + fmt.Fprintf(w, " At open: %d req · %d svc · %s users\n", + b.Opening.AffectedRequests, b.Opening.AffectedServices, usersStr(b.Opening.AffectedUsers)) + fmt.Fprintf(w, " Now: %d req · %d svc · %s users\n", + b.Latest.AffectedRequests, b.Latest.AffectedServices, usersStr(b.Latest.AffectedUsers)) + } else { + fmt.Fprintf(w, " Now: %d req · %d svc · %s users\n", + b.Latest.AffectedRequests, b.Latest.AffectedServices, 
usersStr(b.Latest.AffectedUsers)) + } + if len(b.Latest.TopServices) > 0 { + fmt.Fprintf(w, " Top services: %s\n", strings.Join(b.Latest.TopServices, ", ")) + } + fmt.Fprintf(w, " captured %s ago\n", time.Since(b.Latest.CapturedAt).Round(time.Second)) +} + +func firstErrorStep(p *apiv2.PropagationEvidence) (step, code string) { + if p == nil { + return "", "" + } + for _, s := range p.Path { + if s.Status == "error" { + return s.Step, s.ErrorCode + } + } + return "", "" +} + +// blastDelta returns true if Opening and Latest differ on any user-visible +// impact field. CapturedAt and CaptureStatus are excluded — they change every +// tick and would force a permanent delta. +func blastDelta(o, l *apiv2.BlastEvidence) bool { + if o == nil || l == nil { + return false + } + if o.AffectedRequests != l.AffectedRequests { + return true + } + if o.AffectedServices != l.AffectedServices { + return true + } + if usersInt(o.AffectedUsers) != usersInt(l.AffectedUsers) { + return true + } + if !slices.Equal(o.TopServices, l.TopServices) { + return true + } + if len(o.SampledTraces) != len(l.SampledTraces) { + return true + } + return false +} + +func usersInt(u *int) int { + if u == nil { + return 0 + } + return *u +} + +func usersStr(u *int) string { + if u == nil { + return "?" 
+ } + return fmt.Sprintf("%d", *u) +} + func RenderTriage(w io.Writer, rep *TriageReport) int { fmt.Fprintf(w, "Triage report incident=%s window=%s confidence=%s\n", rep.IncidentRef.ID, rep.IncidentRef.Window, rep.Confidence) diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go index 4fa4f34..6fb1a5a 100644 --- a/internal/cli/v2/render_test.go +++ b/internal/cli/v2/render_test.go @@ -146,7 +146,6 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) { resp.Incidents.Rebuild.Scope = "hot-window" RenderCapabilities(&out, resp) for _, want := range []string{ - "v2_reads: disabled", "otlp_http_traces: enabled", "otlp_grpc_traces: enabled addr=:4317", "llm: provider=none configured=false ask_enabled=false", @@ -160,7 +159,6 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) { func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) { raw := []byte(`{ - "v2_reads":{"enabled":true}, "otlp":{"http_traces":true,"grpc_traces":true,"grpc_addr":":4317"}, "llm":{"provider":"none","model":"","tool_mode":"","configured":false,"ask_enabled":false}, "incidents":{"enabled":true,"persistent":true,"rebuild":{"supported":true,"scope":"hot-window"}} @@ -228,3 +226,66 @@ func TestRenderTriageHeaderAndSections(t *testing.T) { } } } + +func TestRenderIncident_WithPropagationAndBlast(t *testing.T) { + ts := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC) + users := 47 + openUsers := 5 + resp := IncidentDetailResponse{ + Incident: Incident{ + IncidentID: "inc_render", + Service: "payment-service", + Status: "active", + Propagation: &apiv2.PropagationSnapshot{ + Latest: &apiv2.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []apiv2.PropagationStep{ + {Service: "payment-service", Step: "validate", Status: "ok"}, + {Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "7a3fb2", + CapturedAt: ts, + CaptureStatus: "ok", + }, + }, + Blast: 
&apiv2.BlastSnapshot{ + Opening: &apiv2.BlastEvidence{AffectedRequests: 3, AffectedServices: 1, AffectedUsers: &openUsers, CapturedAt: ts, CaptureStatus: "ok"}, + Latest: &apiv2.BlastEvidence{AffectedRequests: 184, AffectedServices: 3, AffectedUsers: &users, TopServices: []string{"checkout", "api-gateway"}, CapturedAt: ts, CaptureStatus: "ok"}, + }, + }, + } + var buf bytes.Buffer + RenderIncident(&buf, resp) + out := buf.String() + for _, want := range []string{ + "Where did it start?", + "Origin: payment-service / charge", + "First failing step: charge DB_TIMEOUT", + "validate → charge", + "How bad is it?", + "At open: 3 req", + "Now: 184 req", + "Top services: checkout, api-gateway", + } { + if !strings.Contains(out, want) { + t.Errorf("rendered output missing %q\n\nFull output:\n%s", want, out) + } + } +} + +func TestRenderIncident_PropagationMissing_ShowsRetryLine(t *testing.T) { + resp := IncidentDetailResponse{ + Incident: Incident{ + IncidentID: "inc_missing", + Propagation: &apiv2.PropagationSnapshot{ + Latest: &apiv2.PropagationEvidence{CapturedAt: time.Now(), CaptureStatus: "missing"}, + }, + }, + } + var buf bytes.Buffer + RenderIncident(&buf, resp) + if !strings.Contains(buf.String(), "Propagation evidence unavailable") { + t.Errorf("missing-state render missing the retry line:\n%s", buf.String()) + } +} diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go index c281fef..83a0146 100644 --- a/internal/cli/v2/types.go +++ b/internal/cli/v2/types.go @@ -9,9 +9,6 @@ import ( ) type CapabilitiesResponse struct { - V2Reads struct { - Enabled bool `json:"enabled"` - } `json:"v2_reads"` OTLP struct { HTTPTraces bool `json:"http_traces"` GRPCTraces bool `json:"grpc_traces"` diff --git a/internal/coldstore/causal_store.go b/internal/coldstore/causal_store.go deleted file mode 100644 index 2f7d036..0000000 --- a/internal/coldstore/causal_store.go +++ /dev/null @@ -1,97 +0,0 @@ -package coldstore - -import ( - "context" - "encoding/json" - "fmt" - 
"time" - - "github.com/sssmaran/WaylogCLI/internal/graph/causal" -) - -func (s *SQLiteStore) SaveClaims(ctx context.Context, claims []causal.Claim) error { - if len(claims) == 0 { - return nil - } - - tx, err := s.writer.BeginTx(ctx, nil) - if err != nil { - return fmt.Errorf("causal: begin tx: %w", err) - } - defer tx.Rollback() - - now := time.Now().UTC().Format(tsFormat) - - for _, c := range claims { - _, err := tx.ExecContext(ctx, - `UPDATE causal_claims SET superseded_at = ? WHERE claim_type = ? AND subject = ? AND service = ? AND superseded_at IS NULL`, - now, string(c.ClaimType), c.Subject, c.Service, - ) - if err != nil { - return fmt.Errorf("causal: supersede: %w", err) - } - - evJSON, err := json.Marshal(c.Evidence) - if err != nil { - return fmt.Errorf("causal: marshal evidence: %w", err) - } - - shadowInt := 0 - if c.ShadowMode { - shadowInt = 1 - } - - _, err = tx.ExecContext(ctx, - `INSERT INTO causal_claims (claim_type, subject, target, service, confidence, tier, evidence, window_start, window_end, shadow_mode) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - string(c.ClaimType), c.Subject, c.Target, c.Service, - c.Confidence, string(c.Tier), string(evJSON), - c.WindowStart.UTC().Format(tsFormat), - c.WindowEnd.UTC().Format(tsFormat), - shadowInt, - ) - if err != nil { - return fmt.Errorf("causal: insert claim: %w", err) - } - } - - return tx.Commit() -} - -func (s *SQLiteStore) ActiveClaims(ctx context.Context, q causal.ClaimQuery) ([]causal.Claim, error) { - rows, err := s.reader.QueryContext(ctx, - `SELECT claim_type, subject, target, service, confidence, tier, evidence, window_start, window_end, shadow_mode - FROM causal_claims - WHERE claim_type = ? 
AND superseded_at IS NULL - ORDER BY confidence DESC`, - string(q.ClaimType), - ) - if err != nil { - return nil, fmt.Errorf("causal: query active: %w", err) - } - defer rows.Close() - - var out []causal.Claim - for rows.Next() { - var ( - c causal.Claim - ct, tier string - evJSON string - ws, we string - shadowInt int - ) - if err := rows.Scan(&ct, &c.Subject, &c.Target, &c.Service, &c.Confidence, &tier, &evJSON, &ws, &we, &shadowInt); err != nil { - return nil, fmt.Errorf("causal: scan row: %w", err) - } - c.ClaimType = causal.ClaimType(ct) - c.Tier = causal.ConfidenceTier(tier) - c.ShadowMode = shadowInt == 1 - if err := json.Unmarshal([]byte(evJSON), &c.Evidence); err != nil { - return nil, fmt.Errorf("causal: unmarshal evidence: %w", err) - } - c.WindowStart, _ = time.Parse(tsFormat, ws) - c.WindowEnd, _ = time.Parse(tsFormat, we) - out = append(out, c) - } - return out, rows.Err() -} diff --git a/internal/coldstore/causal_store_test.go b/internal/coldstore/causal_store_test.go deleted file mode 100644 index ae0ed0a..0000000 --- a/internal/coldstore/causal_store_test.go +++ /dev/null @@ -1,127 +0,0 @@ -package coldstore - -import ( - "context" - "testing" - "time" - - "github.com/sssmaran/WaylogCLI/internal/graph/causal" -) - -func TestCausalStore_SaveAndQuery(t *testing.T) { - s, err := Open(":memory:") - if err != nil { - t.Fatal(err) - } - defer s.Close() - - ctx := context.Background() - now := time.Now().UTC() - - claims := []causal.Claim{ - { - ClaimType: causal.ClaimIntroducedBy, - Subject: "PMT_502", - Target: "deploy_abc", - Service: "payment-service", - Confidence: 0.92, - Tier: causal.TierSupported, - Evidence: causal.Evidence{BeforeFailures: 2, AfterFailures: 100, Lift: 50, TimeDeltaMin: 5, WindowMinutes: 30}, - WindowStart: now.Add(-30 * time.Minute), - WindowEnd: now, - ShadowMode: true, - }, - } - - if err := s.SaveClaims(ctx, claims); err != nil { - t.Fatal("SaveClaims:", err) - } - - active, err := s.ActiveClaims(ctx, 
causal.ClaimQuery{ClaimType: causal.ClaimIntroducedBy}) - if err != nil { - t.Fatal("ActiveClaims:", err) - } - if len(active) != 1 { - t.Fatalf("got %d active claims, want 1", len(active)) - } - if active[0].Subject != "PMT_502" { - t.Errorf("subject = %q, want PMT_502", active[0].Subject) - } - if active[0].Confidence != 0.92 { - t.Errorf("confidence = %f, want 0.92", active[0].Confidence) - } -} - -func TestCausalStore_SupersedesOldClaims(t *testing.T) { - s, err := Open(":memory:") - if err != nil { - t.Fatal(err) - } - defer s.Close() - - ctx := context.Background() - now := time.Now().UTC() - - old := []causal.Claim{ - { - ClaimType: causal.ClaimIntroducedBy, - Subject: "PMT_502", - Target: "deploy_old", - Service: "payment-service", - Confidence: 0.80, - Tier: causal.TierProvisional, - Evidence: causal.Evidence{Lift: 5}, - WindowStart: now.Add(-60 * time.Minute), - WindowEnd: now.Add(-30 * time.Minute), - ShadowMode: true, - }, - } - if err := s.SaveClaims(ctx, old); err != nil { - t.Fatal(err) - } - - updated := []causal.Claim{ - { - ClaimType: causal.ClaimIntroducedBy, - Subject: "PMT_502", - Target: "deploy_new", - Service: "payment-service", - Confidence: 0.92, - Tier: causal.TierSupported, - Evidence: causal.Evidence{Lift: 50}, - WindowStart: now.Add(-30 * time.Minute), - WindowEnd: now, - ShadowMode: true, - }, - } - if err := s.SaveClaims(ctx, updated); err != nil { - t.Fatal(err) - } - - active, err := s.ActiveClaims(ctx, causal.ClaimQuery{ClaimType: causal.ClaimIntroducedBy}) - if err != nil { - t.Fatal(err) - } - if len(active) != 1 { - t.Fatalf("got %d active claims, want 1 (old should be superseded)", len(active)) - } - if active[0].Target != "deploy_new" { - t.Errorf("target = %q, want deploy_new", active[0].Target) - } -} - -func TestCausalStore_EmptyResult(t *testing.T) { - s, err := Open(":memory:") - if err != nil { - t.Fatal(err) - } - defer s.Close() - - active, err := s.ActiveClaims(context.Background(), causal.ClaimQuery{ClaimType: 
causal.ClaimIntroducedBy}) - if err != nil { - t.Fatal(err) - } - if len(active) != 0 { - t.Fatalf("got %d claims, want 0", len(active)) - } -} diff --git a/internal/coldstore/coldstore.go b/internal/coldstore/coldstore.go index d97da0b..0c39dd0 100644 --- a/internal/coldstore/coldstore.go +++ b/internal/coldstore/coldstore.go @@ -4,11 +4,12 @@ import ( "context" "database/sql" "embed" + "errors" "fmt" "log/slog" + "sort" "time" - "github.com/sssmaran/WaylogCLI/internal/graph/causal" "github.com/sssmaran/WaylogCLI/pkg/event" _ "modernc.org/sqlite" @@ -26,7 +27,6 @@ var migrationsFS embed.FS type Store interface { EventSearcher DeploymentStore - CausalStore } // ManagedStore adds lifecycle methods to Store. @@ -54,12 +54,6 @@ type DeploymentStore interface { ServiceErrorRateInWindow(ctx context.Context, svc string, from, to time.Time) (ServiceErrorRate, error) } -// CausalStore persists and queries causal inference claims. -type CausalStore interface { - SaveClaims(ctx context.Context, claims []causal.Claim) error - ActiveClaims(ctx context.Context, q causal.ClaimQuery) ([]causal.Claim, error) -} - // SQLiteStore wraps a SQLite database for cold storage of events and deployments. // It maintains separate writer (single-conn) and reader (multi-conn) handles. type SQLiteStore struct { @@ -115,24 +109,87 @@ func Open(path string) (ManagedStore, error) { return s, nil } -// Migrate runs all embedded SQL migration files idempotently. +// MigrationNames returns the embedded migration file names, sorted ascending. +// It reads the same embedded FS that Migrate applies, so callers (e.g. doctor) +// can compare applied-vs-expected without opening the database. 
+func MigrationNames() ([]string, error) { + entries, err := migrationsFS.ReadDir("migrations") + if err != nil { + return nil, fmt.Errorf("coldstore: read migrations dir: %w", err) + } + names := make([]string, 0, len(entries)) + for _, e := range entries { + names = append(names, e.Name()) + } + sort.Strings(names) + return names, nil +} + +// Migrate runs all embedded SQL migration files idempotently. Applied +// migrations are tracked in the schema_migrations table; files already +// recorded there are skipped on subsequent calls. This lets non-idempotent +// statements (e.g. ALTER TABLE ADD COLUMN) live in a migration file and +// still satisfy the Migrate-twice contract. func (s *SQLiteStore) Migrate() error { + if _, err := s.writer.Exec(`CREATE TABLE IF NOT EXISTS schema_migrations ( + name TEXT PRIMARY KEY, + applied_at TEXT NOT NULL + )`); err != nil { + return fmt.Errorf("create schema_migrations: %w", err) + } entries, err := migrationsFS.ReadDir("migrations") if err != nil { return fmt.Errorf("read migrations dir: %w", err) } for _, entry := range entries { + var applied string + err := s.writer.QueryRow(`SELECT applied_at FROM schema_migrations WHERE name = ?`, entry.Name()).Scan(&applied) + if err == nil { + continue + } + if !errors.Is(err, sql.ErrNoRows) { + return fmt.Errorf("check migration %s: %w", entry.Name(), err) + } data, err := migrationsFS.ReadFile("migrations/" + entry.Name()) if err != nil { return fmt.Errorf("read migration %s: %w", entry.Name(), err) } - if _, err := s.writer.Exec(string(data)); err != nil { - return fmt.Errorf("exec migration %s: %w", entry.Name(), err) + if err := s.applyMigration(entry.Name(), data); err != nil { + return err } } return nil } +// applyMigration runs one migration file and records it in schema_migrations +// inside a single transaction. If either statement fails, the entire change +// is rolled back so a non-idempotent migration (e.g. 
ALTER TABLE ADD COLUMN) +// is never left half-applied across a crash or partial failure. +func (s *SQLiteStore) applyMigration(name string, data []byte) (err error) { + tx, err := s.writer.Begin() + if err != nil { + return fmt.Errorf("begin migration %s: %w", name, err) + } + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() + if _, err = tx.Exec(string(data)); err != nil { + return fmt.Errorf("exec migration %s: %w", name, err) + } + if _, err = tx.Exec( + `INSERT INTO schema_migrations (name, applied_at) VALUES (?, ?)`, + name, time.Now().UTC().Format(tsFormat), + ); err != nil { + return fmt.Errorf("record migration %s: %w", name, err) + } + if err = tx.Commit(); err != nil { + return fmt.Errorf("commit migration %s: %w", name, err) + } + return nil +} + // Close closes both writer and reader database handles. func (s *SQLiteStore) Close() error { var firstErr error @@ -154,5 +211,4 @@ var ( _ EventWriter = (*SQLiteStore)(nil) _ EventSearcher = (*SQLiteStore)(nil) _ DeploymentStore = (*SQLiteStore)(nil) - _ CausalStore = (*SQLiteStore)(nil) ) diff --git a/internal/coldstore/coldstore_test.go b/internal/coldstore/coldstore_test.go index 08f7279..19896bd 100644 --- a/internal/coldstore/coldstore_test.go +++ b/internal/coldstore/coldstore_test.go @@ -62,3 +62,21 @@ func TestDeploymentsTableExists(t *testing.T) { t.Fatal("deployments table not created:", err) } } + +func TestMigrationNamesListsEmbeddedMigrations(t *testing.T) { + names, err := MigrationNames() + if err != nil { + t.Fatalf("MigrationNames: %v", err) + } + if len(names) == 0 { + t.Fatal("expected embedded migrations, got none") + } + if names[0] != "001_initial.sql" { + t.Fatalf("first migration = %q, want 001_initial.sql", names[0]) + } + for i := 1; i < len(names); i++ { + if names[i-1] >= names[i] { + t.Fatalf("names not sorted ascending at %d: %q >= %q", i, names[i-1], names[i]) + } + } +} diff --git a/internal/coldstore/incident_backcompat_test.go 
b/internal/coldstore/incident_backcompat_test.go new file mode 100644 index 0000000..1370933 --- /dev/null +++ b/internal/coldstore/incident_backcompat_test.go @@ -0,0 +1,83 @@ +package coldstore + +import ( + "context" + "testing" + "time" + + "github.com/sssmaran/WaylogCLI/internal/incidents" + apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2" +) + +// TestIncidentStoreScansLegacyNullEvidenceColumns is a characterization test for +// data-correctness invariant (f): a row whose evidence columns are NULL — the +// shape of an incident migrated in from before evidence-snapshot capture — must +// scan cleanly, with nil snapshots and no panic. +// +// It does NOT rely on today's writer choosing to write nil snapshots as NULL. +// Instead it writes a fully-populated incident, then drives the four evidence +// columns to NULL with raw SQL (simulating the migrated legacy row), and +// verifies the READ path (scanIncident -> parseJSONText) turns those NULL +// columns back into nil pointers. +func TestIncidentStoreScansLegacyNullEvidenceColumns(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + sqlStore := managed.(*SQLiteStore) + store := NewIncidentStore(sqlStore) + + now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) + inc := incidents.Incident{ + IncidentID: "inc_legacy", + Env: "prod", + Service: "checkout", + ErrorFamily: apiv2.ErrorFamily{Service: "checkout", Step: "charge", ErrorCode: "PMT_502"}, + Status: incidents.StatusActive, + Cause: incidents.CauseUnknown, + Confidence: incidents.ConfidenceLow, + StartedAt: now, UpdatedAt: now, LastSeenAt: now, + // Populate every evidence snapshot, so the subsequent NULL-ing is a real + // state change and the read-path NULL handling is what produces nil. 
+ Propagation: &incidents.PropagationSnapshot{Latest: &incidents.PropagationEvidence{ + SampleTraceID: "trace-a", CaptureStatus: incidents.CaptureOK, CapturedAt: now, + }}, + Blast: &incidents.BlastSnapshot{Latest: &incidents.BlastEvidence{ + AffectedRequests: 1, CaptureStatus: incidents.CaptureOK, CapturedAt: now, + }}, + Alerts: &incidents.AlertSnapshot{Latest: &incidents.AlertEvidence{ + CaptureStatus: incidents.CaptureOK, CapturedAt: now, + }}, + Runtime: &incidents.RuntimeSnapshot{Matches: []incidents.RuntimeEvidence{{ + SignalID: "sig_1", Subtype: "oom_killed", Service: "checkout", + Severity: "critical", Reason: "OOMKilled", OccurredAt: now, CapturedAt: now, + }}}, + } + ctx := context.Background() + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + + // Drive the evidence columns to NULL the way a migration would for a row that + // predates evidence capture. + if _, err := sqlStore.writer.ExecContext(ctx, + `UPDATE incidents SET propagation_json = NULL, blast_json = NULL, alert_json = NULL, runtime_json = NULL WHERE incident_id = ?`, + "inc_legacy", + ); err != nil { + t.Fatalf("null-out evidence columns: %v", err) + } + + got, err := store.Get(ctx, "inc_legacy") + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation != nil || got.Blast != nil || got.Alerts != nil || got.Runtime != nil { + t.Fatalf("NULL evidence columns must scan as nil, got prop=%v blast=%v alerts=%v runtime=%v", + got.Propagation, got.Blast, got.Alerts, got.Runtime) + } + // Sanity: the rest of the row is intact (we only nulled evidence). 
+ if got.IncidentID != "inc_legacy" || got.Service != "checkout" { + t.Fatalf("non-evidence fields corrupted: %+v", got) + } +} diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go index 9a05201..83286fd 100644 --- a/internal/coldstore/incident_store.go +++ b/internal/coldstore/incident_store.go @@ -77,14 +77,30 @@ func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.In if err != nil { return fmt.Errorf("coldstore incident warnings: %w", err) } + propagation, err := jsonText(inc.Propagation) + if err != nil { + return fmt.Errorf("coldstore incident propagation: %w", err) + } + blast, err := jsonText(inc.Blast) + if err != nil { + return fmt.Errorf("coldstore incident blast: %w", err) + } + alerts, err := jsonText(inc.Alerts) + if err != nil { + return fmt.Errorf("coldstore incident alerts: %w", err) + } + runtime, err := jsonText(inc.Runtime) + if err != nil { + return fmt.Errorf("coldstore incident runtime: %w", err) + } _, err = execer.ExecContext(ctx, ` INSERT INTO incidents ( incident_id, env, service, error_service, error_step, error_code, status, cause, confidence, severity, started_at, updated_at, last_seen_at, recovering_at, resolved_at, affected_requests, affected_users, affected_services, top_services, sample_traces, evidence, next_checks, instrumentation_warnings, - lift, baseline_count, current_count - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + lift, baseline_count, current_count, propagation_json, blast_json, alert_json, runtime_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(incident_id) DO UPDATE SET status = excluded.status, cause = excluded.cause, @@ -104,13 +120,18 @@ func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.In instrumentation_warnings = excluded.instrumentation_warnings, lift = excluded.lift, baseline_count = excluded.baseline_count, - current_count = excluded.current_count`, + current_count = excluded.current_count, + propagation_json = excluded.propagation_json, + blast_json = excluded.blast_json, + alert_json = excluded.alert_json, + runtime_json = excluded.runtime_json`, inc.IncidentID, inc.Env, inc.Service, inc.ErrorFamily.Service, inc.ErrorFamily.Step, inc.ErrorFamily.ErrorCode, string(inc.Status), string(inc.Cause), string(inc.Confidence), inc.Severity, formatTime(inc.StartedAt), formatTime(inc.UpdatedAt), formatTime(inc.LastSeenAt), nullableTime(inc.RecoveringAt), nullableTime(inc.ResolvedAt), inc.AffectedRequests, nullableInt(inc.AffectedUsers), inc.AffectedServices, topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount, + propagation, blast, alerts, runtime, ) if err != nil { return err @@ -166,7 +187,8 @@ func incidentSelectSQL() string { affected_requests, affected_users, affected_services, COALESCE(top_services, ''), COALESCE(sample_traces, ''), COALESCE(evidence, ''), COALESCE(next_checks, ''), COALESCE(instrumentation_warnings, ''), - lift, baseline_count, current_count + lift, baseline_count, current_count, + COALESCE(propagation_json, ''), COALESCE(blast_json, ''), COALESCE(alert_json, ''), COALESCE(runtime_json, '') FROM incidents` } @@ -176,11 +198,13 @@ func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, var startedAt, updatedAt, lastSeenAt, recoveringAt, resolvedAt string var affectedUsers sql.NullInt64 var topServices, samples, evidence, nextChecks, warnings string + var propagationJSON, blastJSON, alertJSON, runtimeJSON string err := row.Scan( &inc.IncidentID, &inc.Env, &inc.Service, 
&inc.ErrorFamily.Service, &inc.ErrorFamily.Step, &inc.ErrorFamily.ErrorCode, &status, &cause, &confidence, &inc.Severity, &startedAt, &updatedAt, &lastSeenAt, &recoveringAt, &resolvedAt, &inc.AffectedRequests, &affectedUsers, &inc.AffectedServices, &topServices, &samples, &evidence, &nextChecks, &warnings, &inc.Lift, &inc.BaselineCount, &inc.CurrentCount, + &propagationJSON, &blastJSON, &alertJSON, &runtimeJSON, ) if err != nil { return incidents.Incident{}, err @@ -231,6 +255,18 @@ func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, if err := parseJSONText(warnings, &inc.InstrumentationWarnings); err != nil { return incidents.Incident{}, fmt.Errorf("coldstore incident warnings: %w", err) } + if err := parseJSONText(propagationJSON, &inc.Propagation); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident propagation: %w", err) + } + if err := parseJSONText(blastJSON, &inc.Blast); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident blast: %w", err) + } + if err := parseJSONText(alertJSON, &inc.Alerts); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident alerts: %w", err) + } + if err := parseJSONText(runtimeJSON, &inc.Runtime); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident runtime: %w", err) + } return inc, nil } diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go index 9853f3b..0468415 100644 --- a/internal/coldstore/incident_store_test.go +++ b/internal/coldstore/incident_store_test.go @@ -84,6 +84,44 @@ func TestIncidentStoreRoundtripAndPrune(t *testing.T) { } } +func TestPruneResolvedNeverTouchesActiveOrRecovering(t *testing.T) { + ctx := context.Background() + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + old := time.Date(2026, 5, 1, 12, 0, 0, 0, time.UTC) + + for _, inc := range 
[]incidents.Incident{ + testColdIncident("inc_active", incidents.StatusActive, old), + testColdIncident("inc_recovering", incidents.StatusRecovering, old), + testColdIncident("inc_resolved", incidents.StatusResolved, old), + } { + if err := store.Upsert(ctx, inc); err != nil { + t.Fatal(err) + } + } + + // Cutoff far in the future: everything resolved is eligible. + deleted, err := store.PruneResolvedOlderThan(ctx, old.Add(24*time.Hour)) + if err != nil { + t.Fatal(err) + } + if deleted != 1 { + t.Fatalf("deleted=%d, want only the resolved row", deleted) + } + for _, id := range []string{"inc_active", "inc_recovering"} { + if _, err := store.Get(ctx, id); err != nil { + t.Fatalf("%s must survive pruning: %v", id, err) + } + } + if _, err := store.Get(ctx, "inc_resolved"); !errors.Is(err, incidents.ErrNotFound) { + t.Fatalf("resolved row must be pruned, got %v", err) + } +} + func TestIncidentStoreReplaceNonResolved(t *testing.T) { ctx := context.Background() managed, err := Open(":memory:") @@ -132,6 +170,289 @@ func TestIncidentStoreReplaceNonResolved(t *testing.T) { } } +func TestIncidentStore_PropagationSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + inc := baseEvidenceIncident("inc_test_prop", ts) + inc.Propagation = &incidents.PropagationSnapshot{ + Opening: &incidents.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []incidents.PropagationStep{ + {Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "trace_a", + CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + }, + Latest: &incidents.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []incidents.PropagationStep{ + {Service: "payment-service", Step: 
"charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "trace_b", + CapturedAt: ts.Add(30 * time.Second), + CaptureStatus: incidents.CaptureOK, + }, + } + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation == nil { + t.Fatal("Propagation lost") + } + if got.Propagation.Opening == nil || got.Propagation.Latest == nil { + t.Fatal("Opening/Latest lost") + } + if got.Propagation.Opening.SampleTraceID != "trace_a" { + t.Errorf("Opening.SampleTraceID = %q", got.Propagation.Opening.SampleTraceID) + } + if got.Propagation.Latest.SampleTraceID != "trace_b" { + t.Errorf("Latest.SampleTraceID = %q", got.Propagation.Latest.SampleTraceID) + } + if got.Propagation.Opening.CaptureStatus != incidents.CaptureOK { + t.Errorf("Opening.CaptureStatus = %q", got.Propagation.Opening.CaptureStatus) + } +} + +func TestIncidentStore_BlastSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + openUsers := 12 + latestUsers := 47 + inc := baseEvidenceIncident("inc_test_blast", ts) + inc.Blast = &incidents.BlastSnapshot{ + Opening: &incidents.BlastEvidence{ + AffectedRequests: 5, + AffectedUsers: &openUsers, + AffectedServices: 1, + TopServices: []string{"checkout"}, + SampledTraces: []string{"trace_a"}, + CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + }, + Latest: &incidents.BlastEvidence{ + AffectedRequests: 184, + AffectedUsers: &latestUsers, + AffectedServices: 3, + TopServices: []string{"checkout", "api-gateway"}, + SampledTraces: []string{"trace_b", "trace_c"}, + CapturedAt: ts.Add(time.Minute), + CaptureStatus: incidents.CaptureOK, + }, + } + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: 
%v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Blast == nil || got.Blast.Opening == nil || got.Blast.Latest == nil { + t.Fatalf("Blast snapshot lost: %+v", got.Blast) + } + if got.Blast.Opening.AffectedRequests != 5 || got.Blast.Latest.AffectedRequests != 184 { + t.Errorf("AffectedRequests round-trip wrong: opening=%d latest=%d", + got.Blast.Opening.AffectedRequests, got.Blast.Latest.AffectedRequests) + } + if got.Blast.Opening.AffectedServices != 1 || got.Blast.Latest.AffectedServices != 3 { + t.Errorf("AffectedServices round-trip wrong") + } + if got.Blast.Opening.AffectedUsers == nil || *got.Blast.Opening.AffectedUsers != openUsers { + t.Errorf("Opening.AffectedUsers lost: %+v", got.Blast.Opening.AffectedUsers) + } + if got.Blast.Latest.AffectedUsers == nil || *got.Blast.Latest.AffectedUsers != latestUsers { + t.Errorf("Latest.AffectedUsers lost: %+v", got.Blast.Latest.AffectedUsers) + } +} + +func TestIncidentStore_RuntimeSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + inc := baseEvidenceIncident("inc_test_runtime", ts) + oom := incidents.RuntimeEvidence{ + SignalID: "sig_oom", Subtype: "oom_killed", Service: "checkout", Source: "k8s-demo", + Severity: "critical", Reason: "OOMKilled", OccurredAt: ts.Add(-2 * time.Minute), CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, Metadata: map[string]any{"pod": "checkout-7f8b9c-x2k"}, + } + panicEv := incidents.RuntimeEvidence{ + SignalID: "sig_panic", Subtype: "panic", Service: "checkout", Source: "go-sdk", + Severity: "warning", Reason: "runtime panic", OccurredAt: ts.Add(-time.Minute), CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + } + inc.Runtime = &incidents.RuntimeSnapshot{Matches: []incidents.RuntimeEvidence{oom, 
panicEv}, Opening: &oom, Latest: &panicEv} + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Runtime == nil || len(got.Runtime.Matches) != 2 { + t.Fatalf("Runtime snapshot lost: %+v", got.Runtime) + } + if got.Runtime.Matches[0].Subtype != "oom_killed" || got.Runtime.Matches[1].Subtype != "panic" { + t.Errorf("runtime subtypes round-trip wrong: %+v", got.Runtime.Matches) + } + if got.Runtime.Opening == nil || got.Runtime.Opening.SignalID != "sig_oom" { + t.Errorf("Opening lost: %+v", got.Runtime.Opening) + } + if got.Runtime.Latest == nil || got.Runtime.Latest.SignalID != "sig_panic" { + t.Errorf("Latest lost: %+v", got.Runtime.Latest) + } + if got.Runtime.Matches[0].Metadata["pod"] != "checkout-7f8b9c-x2k" { + t.Errorf("metadata lost: %+v", got.Runtime.Matches[0].Metadata) + } +} + +func TestIncidentStore_NilSnapshotsRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + inc := baseEvidenceIncident("inc_test_nil", ts) + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation != nil { + t.Errorf("Propagation = %+v; want nil", got.Propagation) + } + if got.Blast != nil { + t.Errorf("Blast = %+v; want nil", got.Blast) + } + if got.Alerts != nil { + t.Errorf("Alerts = %+v; want nil", got.Alerts) + } +} + +func TestIncidentStore_AlertSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC) + 
inc := baseEvidenceIncident("inc_test_alerts", ts) + inc.Alerts = &incidents.AlertSnapshot{ + Opening: &incidents.AlertEvidence{ + Matches: []incidents.MatchedAlert{{ + SignalID: "sig_open", + AlertID: "CheckoutPaymentFailure", + Source: "alertmanager", + Severity: "critical", + Reason: "PMT_502 spike", + ProviderURL: "https://alerts.example/open", + EvidenceIDs: []string{"sig_open"}, + MatchedAt: ts, + Strategy: "family", + }}, + CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + }, + Latest: &incidents.AlertEvidence{ + Matches: []incidents.MatchedAlert{{ + SignalID: "sig_latest", + AlertID: "CheckoutPaymentFailure", + Source: "alertmanager", + Severity: "critical", + Reason: "PMT_502 still firing", + EvidenceIDs: []string{"sig_latest"}, + MatchedAt: ts.Add(time.Minute), + Strategy: "family", + }}, + CapturedAt: ts.Add(time.Minute), + CaptureStatus: incidents.CaptureOK, + }, + } + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Alerts == nil || got.Alerts.Opening == nil || got.Alerts.Latest == nil { + t.Fatalf("Alerts snapshot lost: %+v", got.Alerts) + } + if got.Alerts.Opening.Matches[0].SignalID != "sig_open" { + t.Fatalf("opening match = %+v", got.Alerts.Opening.Matches) + } + if got.Alerts.Latest.Matches[0].SignalID != "sig_latest" { + t.Fatalf("latest match = %+v", got.Alerts.Latest.Matches) + } + if got.Alerts.Latest.CaptureStatus != incidents.CaptureOK { + t.Fatalf("latest status = %s", got.Alerts.Latest.CaptureStatus) + } +} + +func TestIncidentStore_DoesNotMergeOpening(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + base := baseEvidenceIncident("inc_test_dumb", ts) + base.Propagation = &incidents.PropagationSnapshot{ + 
Opening: &incidents.PropagationEvidence{SampleTraceID: "trace_a", CapturedAt: ts, CaptureStatus: incidents.CaptureOK}, + Latest: &incidents.PropagationEvidence{SampleTraceID: "trace_a", CapturedAt: ts, CaptureStatus: incidents.CaptureOK}, + } + if err := store.Upsert(ctx, base); err != nil { + t.Fatalf("upsert: %v", err) + } + base.Propagation = &incidents.PropagationSnapshot{ + Latest: &incidents.PropagationEvidence{SampleTraceID: "trace_b", CapturedAt: ts.Add(time.Minute), CaptureStatus: incidents.CaptureOK}, + } + if err := store.Upsert(ctx, base); err != nil { + t.Fatalf("upsert 2: %v", err) + } + got, err := store.Get(ctx, base.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation == nil || got.Propagation.Opening != nil { + t.Errorf("Opening should be nil after explicit overwrite; got %+v", got.Propagation) + } + if got.Propagation == nil || got.Propagation.Latest == nil || got.Propagation.Latest.SampleTraceID != "trace_b" { + t.Errorf("Latest should be trace_b; got %+v", got.Propagation) + } +} + func testColdIncident(id string, status incidents.Status, at time.Time) incidents.Incident { resolvedAt := at.Add(time.Minute) inc := incidents.Incident{ @@ -160,3 +481,22 @@ func testColdIncident(id string, status incidents.Status, at time.Time) incident } return inc } + +// baseEvidenceIncident returns a minimal active incident shared across the +// Propagation/Blast snapshot round-trip tests. Tests set the relevant snapshot +// field after construction. 
+func baseEvidenceIncident(id string, ts time.Time) incidents.Incident { + return incidents.Incident{ + IncidentID: id, + Env: "demo", + Service: "payment-service", + ErrorFamily: apiv2.ErrorFamily{Service: "payment-service", Step: "charge", ErrorCode: "DB_TIMEOUT"}, + Status: incidents.StatusActive, + Cause: incidents.CauseUnknown, + Confidence: incidents.ConfidenceMedium, + Severity: 2, + StartedAt: ts, + UpdatedAt: ts, + LastSeenAt: ts, + } +} diff --git a/internal/coldstore/migrations/001_initial.sql b/internal/coldstore/migrations/001_initial.sql index ece71a4..5ec15ef 100644 --- a/internal/coldstore/migrations/001_initial.sql +++ b/internal/coldstore/migrations/001_initial.sql @@ -1,9 +1,8 @@ -- 001_initial.sql: events + deployments tables for SQLite cold storage. - -PRAGMA journal_mode = WAL; -PRAGMA busy_timeout = 5000; -PRAGMA foreign_keys = ON; -PRAGMA synchronous = NORMAL; +-- Connection-level PRAGMAs (journal_mode, busy_timeout, foreign_keys, +-- synchronous) are configured by Open() via the writer DSN or, for +-- :memory:, via explicit Exec calls — not here, because PRAGMAs that +-- change safety/journal modes cannot run inside a transaction. CREATE TABLE IF NOT EXISTS events ( id INTEGER PRIMARY KEY AUTOINCREMENT, diff --git a/internal/coldstore/migrations/005_incident_evidence.sql b/internal/coldstore/migrations/005_incident_evidence.sql new file mode 100644 index 0000000..da4aead --- /dev/null +++ b/internal/coldstore/migrations/005_incident_evidence.sql @@ -0,0 +1,6 @@ +-- 005_incident_evidence.sql: add v1.0 incident evidence snapshots +-- (PropagationSnapshot + BlastSnapshot). Both columns hold JSON text or NULL. +-- A NULL column round-trips to a nil *PropagationSnapshot / *BlastSnapshot in Go. 
+ +ALTER TABLE incidents ADD COLUMN propagation_json TEXT; +ALTER TABLE incidents ADD COLUMN blast_json TEXT; diff --git a/internal/coldstore/migrations/006_incident_alert_evidence.sql b/internal/coldstore/migrations/006_incident_alert_evidence.sql new file mode 100644 index 0000000..4097143 --- /dev/null +++ b/internal/coldstore/migrations/006_incident_alert_evidence.sql @@ -0,0 +1,3 @@ +-- 006_incident_alert_evidence.sql: persist incident-attached alert evidence. + +ALTER TABLE incidents ADD COLUMN alert_json TEXT; diff --git a/internal/coldstore/migrations/007_incident_runtime_evidence.sql b/internal/coldstore/migrations/007_incident_runtime_evidence.sql new file mode 100644 index 0000000..a2b01b4 --- /dev/null +++ b/internal/coldstore/migrations/007_incident_runtime_evidence.sql @@ -0,0 +1,4 @@ +-- 007_incident_runtime_evidence.sql: persist incident-attached runtime evidence +-- (infra: k8s OOMKill/crashloop; app: panics/unhandled rejections). + +ALTER TABLE incidents ADD COLUMN runtime_json TEXT; diff --git a/internal/dashboard/static/index.html b/internal/dashboard/static/index.html index aed839e..1f533f8 100644 --- a/internal/dashboard/static/index.html +++ b/internal/dashboard/static/index.html @@ -3,21 +3,27 @@ - - Waylog v2 Triage + + Crux Triage @@ -30,7 +36,7 @@
+
-
Loading dashboard…
+
+ + + +
diff --git a/internal/dashboard/static_test.go b/internal/dashboard/static_test.go index 5410c28..22b6b7c 100644 --- a/internal/dashboard/static_test.go +++ b/internal/dashboard/static_test.go @@ -24,41 +24,88 @@ func TestStaticDashboardHTML(t *testing.T) { html := string(body) required := []string{ - "Waylog v2 Triage", + "Crux Triage", "fonts.googleapis.com/css2?family=Geist", "waylog-dashboard-theme", "data-theme", "id=\"theme-toggle\"", "Light theme", "Dark theme", + "#FF7300", + "#231F1C", "Find the failure that started the cascade.", - "First failing step", "failure-path", "No failures in this window.", "http://localhost:9081/demo", "topbar-link", "Demo controls", - "No recent requests yet", - "Run a scenario", + "emptyStateCopy(\"recent requests\")", "#/errors", "#/explain", "#/blast", "#/incident", "/v1/incidents/active", "Active incidents", - "No active incidents.", + "No active incidents right now", + "Crux is connected and watching for incident evidence.", + "In the demo, the auto-fire loop opens one shortly.", + "./scripts/demo-fire-burst.sh", + "Send a wide event and watch the recent requests panel", + "renderTriageKpis", + "Highest cause", + "Affected requests", + "Since last incident", + "All clear — Crux is watching for new error families.", + "sortAlertsBySeverity", + "Demo provider links may point back to demo controls", + "Provider links open the alert source configured for this incident.", + "data-retry-fetch", + "Crux", "repeat(auto-fit, minmax(min(100%, 280px), 1fr))", ".incident-card", + "incident-body-grid", "min-width: 0", "overflow-wrap: anywhere", "flex-shrink: 0", "incident-meta", - "Next checks", + "Suggested checks", "Instrumentation warnings", "sample_traces", - "renderSparkline", - "This dashboard requires WAYLOG_V2_READS=true", "first observable failing step", + "Failure cascade from sampled trace", + "trace-backed cascade", + "renderAlertEvidenceBlock", + "Alertmanager matches", + "data-expanded=\"false\"", + "data-alert-expand", + 
"Show all", + "Show fewer alerts", + "affected_requests", + "sampled_traces", + "/v1/triage/{id}/report?format=markdown&snapshot=true", + "copyReportMarkdown", + "Report preview", + "id=\"copy-toast\"", + "role=\"status\"", + "Markdown copied", + "Could not copy markdown", + "origin_service", + "origin_step", + "provider_url", + "At open", + "Top services", + "capturedFooter(l, source, \"span\", \"Captured\")", + "renderPropagationBlock", + "renderBlastBlock", + "renderRuntimeBlock", + "Runtime evidence", + "Infrastructure & application failures", + "data-runtime-row", + "runtimeSubtypeLabel", + "OOMKilled", + "CrashLoopBackOff", + "No correlated runtime events (pod restarts, OOMKills, panics)", + "incident.runtime", } for _, needle := range required { if !strings.Contains(html, needle) { @@ -66,6 +113,9 @@ func TestStaticDashboardHTML(t *testing.T) { } } + // Required strings are exact source guards. Forbidden product claims are + // case-insensitive so capitalization changes cannot bypass the copy rules. + lowerHTML := strings.ToLower(html) forbidden := []string{ "/ui/ask", "/ui/explain", @@ -75,17 +125,88 @@ func TestStaticDashboardHTML(t *testing.T) { "/v1/routes", "/v1/topology", "/v1/insight", + "/ui/incidents", + "No active incidents.", + "Markdown artifact", "chart.umd.min.js", "chartjs-plugin-annotation.min.js", "cytoscape.min.js", + "causal service graph", + "complete propagation tree", + "all downstream failures", + "full topology", } for _, needle := range forbidden { - if strings.Contains(html, needle) { + if strings.Contains(lowerHTML, strings.ToLower(needle)) { t.Fatalf("dashboard html still references %q", needle) } } } +// TestDashboardXSSDefenses pins the dashboard's XSS invariants so a future edit +// cannot silently drop escaping on attacker-influenceable event/incident fields. +// All dashboard data originates from ingested WideEvents and signals, which an +// untrusted service can populate with markup. 
+func TestDashboardXSSDefenses(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/", nil) + rec := httptest.NewRecorder() + Handler().ServeHTTP(rec, req) + body, err := io.ReadAll(rec.Result().Body) + if err != nil { + t.Fatalf("read response: %v", err) + } + html := string(body) + + // The escaping/URL primitives must exist and safeHTTPURL must enforce scheme. + mustHave := []string{ + "function esc(", + "function safeHTTPURL(", + `u.protocol === "http:"`, + `u.protocol === "https:"`, + // Provider links are gated through safeHTTPURL before reaching an href. + "safeHTTPURL(alert.provider_url)", + "safeHTTPURL(a.provider_url)", + // Attacker-influenceable fields are escaped at the HTML sink. + "esc(alert.reason", + "esc(alert.source", + "esc(m.reason", + "esc(m.subtype", + "esc(m.service", + // The triage report preview is injected as text, never HTML. + "body.textContent = report", + `
= d.cfg.MinCount {
-			// New errors have infinite lift; treat as meeting threshold.
-			candidates = append(candidates, candidate{e.ErrorCode, e.After, 0, float64(e.After)})
-		}
-	}
-	for _, e := range diff.Increased {
-		if e.After >= d.cfg.MinCount && e.Before > 0 {
-			l := float64(e.After) / float64(e.Before)
-			if l >= d.cfg.MinLift {
-				candidates = append(candidates, candidate{e.ErrorCode, e.After, e.Before, l})
-			}
-		}
-	}
-
-	if len(candidates) == 0 {
-		return "", 0, 0, 0
-	}
-
-	if len(candidates) == 1 {
-		top := candidates[0]
-		return top.code, top.after, top.before, top.lift
-	}
-
-	// Multiple candidates — use root cause scoring to pick the leaf error.
-	depths := d.errorDepths(candidates)
-
-	sort.Slice(candidates, func(i, j int) bool {
-		if candidates[i].after != candidates[j].after {
-			return candidates[i].after > candidates[j].after
-		}
-		// Tie on count: prefer deeper service (higher depth = leaf = root cause).
-		return depths[candidates[i].code] > depths[candidates[j].code]
-	})
-	top := candidates[0]
-	return top.code, top.after, top.before, top.lift
-}
-
-// errorDepths computes a depth score for each candidate error code.
-// The deepest error (leaf service in a call chain) is most likely the root cause
-// in cascading failures. Uses the trace store for per-span attribution when
-// available, falling back to graph topology.
-func (d *Detector) errorDepths(candidates []candidate) map[string]int {
-	// Primary strategy: use trace store spans for direct error→service→depth mapping.
-	if d.traces != nil {
-		if depths := d.errorDepthsFromTraces(candidates); len(depths) > 0 {
-			return depths
-		}
-	}
-	// Fallback: use graph topology (requires error node "service" attribute).
-	return d.errorDepthsFromGraph(candidates)
-}
-
-// errorDepthsFromTraces samples a trace that contains the candidate error codes
-// and uses its span parent-child chain to determine depth per error code.
-func (d *Detector) errorDepthsFromTraces(candidates []candidate) map[string]int {
-	codes := map[string]bool{}
-	for _, c := range candidates {
-		codes[c.code] = true
-	}
-
-	// Find a request that has at least two candidate error codes, then
-	// extract its trace_id from the graph node attributes (RequestFacts.TraceID
-	// is not populated by the store).
-	now := time.Now().UTC()
-	start := now.Add(-d.cfg.CurrentWindow)
-	var sampleRequestID string
-	d.store.ForEachRequestFact(start, now, func(f store.RequestFacts) {
-		if sampleRequestID != "" {
-			return
-		}
-		matched := 0
-		for _, e := range f.Errors {
-			if codes[e] {
-				matched++
-			}
-		}
-		if matched >= 2 {
-			sampleRequestID = f.RequestID
-		}
-	})
-	if sampleRequestID == "" {
-		return nil
-	}
-
-	// Extract trace_id from the request node's attributes.
-	snap := d.store.Snapshot()
-	reqNode, ok := snap.Nodes[sampleRequestID]
-	if !ok {
-		return nil
-	}
-	traceID, _ := reqNode.Attr["trace_id"].(string)
-	if traceID == "" {
-		return nil
-	}
-
-	rec, ok := d.traces.Get(traceID)
-	if !ok || len(rec.Spans) == 0 {
-		return nil
-	}
-
-	// Compute span depths via parent chain.
-	spans := map[string]tracestore.SpanRecord{}
-	parentOf := map[string]string{}
-	for _, span := range rec.Spans {
-		if span.SpanID == "" {
-			continue
-		}
-		spans[span.SpanID] = span
-		if span.ParentSpanID != "" {
-			parentOf[span.SpanID] = span.ParentSpanID
-		}
-	}
-
-	depthCache := map[string]int{}
-	visiting := map[string]bool{}
-	var depth func(string) int
-	depth = func(id string) int {
-		if d, ok := depthCache[id]; ok {
-			return d
-		}
-		if visiting[id] {
-			return 0
-		}
-		visiting[id] = true
-		pid, hasParent := parentOf[id]
-		if !hasParent || pid == "" {
-			depthCache[id] = 0
-			delete(visiting, id)
-			return 0
-		}
-		if _, ok := spans[pid]; !ok {
-			depthCache[id] = 0
-			delete(visiting, id)
-			return 0
-		}
-		d := depth(pid) + 1
-		depthCache[id] = d
-		delete(visiting, id)
-		return d
-	}
-	for id := range spans {
-		depth(id)
-	}
-
-	// Map error codes to their span depths.
-	depths := map[string]int{}
-	for _, span := range spans {
-		if span.ErrorCode != "" && codes[span.ErrorCode] {
-			if d := depthCache[span.SpanID]; d > depths[span.ErrorCode] {
-				depths[span.ErrorCode] = d
-			}
-		}
-	}
-	return depths
-}
-
-// errorDepthsFromGraph uses error node "service" attributes and the service
-// call topology (EdgeCalls BFS) to compute depth per error code.
-func (d *Detector) errorDepthsFromGraph(candidates []candidate) map[string]int {
-	snap := d.store.Snapshot()
-	depths := map[string]int{}
-
-	codeToSvcID := map[string]string{}
-	for _, c := range candidates {
-		errID := core.ID("error", c.code)
-		node, ok := snap.Nodes[errID]
-		if !ok {
-			continue
-		}
-		svcName, _ := node.Attr["service"].(string)
-		if svcName == "" {
-			continue
-		}
-		env := ""
-		for _, n := range snap.Nodes {
-			if n.Type == core.NodeService {
-				if name, _ := n.Attr["name"].(string); name == svcName {
-					env, _ = n.Attr["env"].(string)
-					break
-				}
-			}
-		}
-		codeToSvcID[c.code] = core.ID("service", svcName, env)
-	}
-
-	hasIncoming := map[string]bool{}
-	callEdges := map[string][]string{}
-	for _, edges := range snap.OutEdges {
-		for _, e := range edges {
-			if e.Type == core.EdgeCalls {
-				callEdges[e.From] = append(callEdges[e.From], e.To)
-				hasIncoming[e.To] = true
-			}
-		}
-	}
-
-	depthMap := map[string]int{}
-	var queue []string
-	for id, node := range snap.Nodes {
-		if node.Type == core.NodeService && !hasIncoming[id] {
-			queue = append(queue, id)
-			depthMap[id] = 0
-		}
-	}
-	for len(queue) > 0 {
-		curr := queue[0]
-		queue = queue[1:]
-		for _, next := range callEdges[curr] {
-			if _, visited := depthMap[next]; !visited {
-				depthMap[next] = depthMap[curr] + 1
-				queue = append(queue, next)
-			}
-		}
-	}
-
-	for code, svcID := range codeToSvcID {
-		depths[code] = depthMap[svcID]
-	}
-	return depths
-}
-
-// computeBlast computes blast radius for an error code within a time window.
-func (d *Detector) computeBlast(errorCode string, start, end time.Time) (requests, users, vipUsers int, services []string, severity float64) {
-	userSet := map[string]struct{}{}
-	vipSet := map[string]struct{}{}
-	svcSet := map[string]struct{}{}
-	reqCount := 0
-
-	d.store.ForEachRequestFact(start, end, func(f store.RequestFacts) {
-		if !f.HasError(errorCode) {
-			return
-		}
-		reqCount++
-		if f.UserID != "" {
-			userSet[f.UserID] = struct{}{}
-		}
-		if f.UserVIP && f.UserID != "" {
-			vipSet[f.UserID] = struct{}{}
-		}
-		for _, svc := range f.Services {
-			svcSet[svc] = struct{}{}
-		}
-	})
-
-	svcList := make([]string, 0, len(svcSet))
-	for svc := range svcSet {
-		svcList = append(svcList, svc)
-	}
-	sort.Strings(svcList)
-
-	sev := float64(reqCount) + float64(len(vipSet))*10 + float64(len(svcSet))*5
-	return reqCount, len(userSet), len(vipSet), svcList, sev
-}
-
-// correlateDeploy checks if a recent deployment correlates with the spike.
-func (d *Detector) correlateDeploy(ctx context.Context, errorCode string, now time.Time) *DeployCorrelation {
-	tickCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
-	defer cancel()
-
-	window := 1 * time.Hour
-	deps, err := d.deploys.DeploymentsInWindow(tickCtx, now.Add(-window), now, "")
-	if err != nil || len(deps) == 0 {
-		return nil
-	}
-
-	snap := d.store.Snapshot()
-	if len(snap.Nodes) == 0 {
-		return nil
-	}
-
-	infos := make([]causal.DeploymentInfo, len(deps))
-	for i, d := range deps {
-		infos[i] = causal.DeploymentInfo{ID: d.ID, Service: d.Service, FirstSeen: d.FirstSeen}
-	}
-
-	claims := causal.InferIntroducedBy(snap, infos, now.Add(-window), now)
-	for _, c := range claims {
-		if c.Subject == errorCode {
-			return &DeployCorrelation{
-				DeploymentID: c.Target,
-				Service:      c.Service,
-				Confidence:   c.Confidence,
-			}
-		}
-	}
-	return nil
-}
diff --git a/internal/detect/detector_test.go b/internal/detect/detector_test.go
deleted file mode 100644
index a8d5421..0000000
--- a/internal/detect/detector_test.go
+++ /dev/null
@@ -1,324 +0,0 @@
-package detect
-
-import (
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-func TestDetector_NoSpike(t *testing.T) {
-	s := store.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// 2 errors in current window — below MinCount of 3
-	for i := range 2 {
-		ingest(t, s, builder, testutil.MakeEvent(
-			testutil.WithTraceID(fmt.Sprintf("aaaa%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("bbbb%012d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "fail"),
-			testutil.WithTimestamp(now.Add(-30*time.Second)),
-		))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, nil, nil)
-
-	d.tick(nil)
-
-	if d.Current() != nil {
-		t.Fatal("expected no insight when count below threshold")
-	}
-}
-
-func TestDetector_SpikeDetected(t *testing.T) {
-	s := store.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// Baseline: 1 error in the baseline window
-	ingest(t, s, builder, testutil.MakeEvent(
-		testutil.WithTraceID("bbbb0000000000000000000000000000"),
-		testutil.WithSpanID("cccc000000000000"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "fail"),
-		testutil.WithUser("user-1", "standard", "us-east-1"),
-		testutil.WithTimestamp(now.Add(-3*time.Minute)),
-	))
-
-	// Current: 5 errors in the current window — 5x lift, count >= 3
-	for i := range 5 {
-		ingest(t, s, builder, testutil.MakeEvent(
-			testutil.WithTraceID(fmt.Sprintf("aaaa%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("dddd%012d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "fail"),
-			testutil.WithUser(fmt.Sprintf("user-%d", i+10), "standard", "us-east-1"),
-			testutil.WithTimestamp(now.Add(-20*time.Second)),
-		))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, nil, nil)
-
-	d.tick(nil)
-
-	insight := d.Current()
-	if insight == nil {
-		t.Fatal("expected insight to be set")
-	}
-	if insight.TopErrorCode != "PMT_502" {
-		t.Fatalf("TopErrorCode = %q, want PMT_502", insight.TopErrorCode)
-	}
-	if insight.CurrentCount != 5 {
-		t.Fatalf("CurrentCount = %d, want 5", insight.CurrentCount)
-	}
-	if insight.Lift < 3.0 {
-		t.Fatalf("Lift = %f, want >= 3.0", insight.Lift)
-	}
-	if insight.AffectedRequests != 5 {
-		t.Fatalf("AffectedRequests = %d, want 5", insight.AffectedRequests)
-	}
-	if insight.AffectedUsers != 5 {
-		t.Fatalf("AffectedUsers = %d, want 5", insight.AffectedUsers)
-	}
-}
-
-func TestDetector_NewErrorCode(t *testing.T) {
-	s := store.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// No baseline errors at all. 4 new errors in current window.
-	for i := range 4 {
-		ingest(t, s, builder, testutil.MakeEvent(
-			testutil.WithTraceID(fmt.Sprintf("aaaa%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("dddd%012d", i)),
-			testutil.WithService("db"),
-			testutil.WithStatusCode(503),
-			testutil.WithError("DB_503", "unavailable"),
-			testutil.WithTimestamp(now.Add(-15*time.Second)),
-		))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, nil, nil)
-
-	d.tick(nil)
-
-	insight := d.Current()
-	if insight == nil {
-		t.Fatal("expected insight for new error code")
-	}
-	if insight.TopErrorCode != "DB_503" {
-		t.Fatalf("TopErrorCode = %q, want DB_503", insight.TopErrorCode)
-	}
-	if insight.BaselineCount != 0 {
-		t.Fatalf("BaselineCount = %d, want 0", insight.BaselineCount)
-	}
-}
-
-func TestDetector_AutoResolve(t *testing.T) {
-	s := store.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// Spike: 5 errors in current window, 0 in baseline
-	for i := range 5 {
-		ingest(t, s, builder, testutil.MakeEvent(
-			testutil.WithTraceID(fmt.Sprintf("aaaa%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("dddd%012d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "fail"),
-			testutil.WithTimestamp(now.Add(-20*time.Second)),
-		))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, nil, nil)
-
-	// First tick detects the spike.
-	d.tick(nil)
-	if d.Current() == nil {
-		t.Fatal("expected insight on first tick")
-	}
-
-	// Simulate time passing: move all errors into the baseline window
-	// by adding them again with old timestamps.
-	// Instead, just add enough baseline errors to drop the lift below threshold.
-	for i := range 5 {
-		ingest(t, s, builder, testutil.MakeEvent(
-			testutil.WithTraceID(fmt.Sprintf("bbbb%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("eeee%012d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "fail"),
-			testutil.WithTimestamp(now.Add(-3*time.Minute)),
-		))
-	}
-
-	// Second tick: current=5, baseline=5, lift=1.0 — below threshold.
-	d.tick(nil)
-	if d.Current() != nil {
-		t.Fatal("expected insight to auto-resolve when lift drops")
-	}
-}
-
-func TestDetector_VIPTracking(t *testing.T) {
-	s := store.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// 3 errors, 1 from a VIP user
-	for i := range 3 {
-		opts := []testutil.EventOption{
-			testutil.WithTraceID(fmt.Sprintf("aaaa%028d", i)),
-			testutil.WithSpanID(fmt.Sprintf("dddd%012d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "fail"),
-			testutil.WithUser(fmt.Sprintf("user-%d", i), "standard", "us-east-1"),
-			testutil.WithTimestamp(now.Add(-20 * time.Second)),
-		}
-		if i == 0 {
-			opts = append(opts, testutil.WithVIP(true))
-		}
-		ingest(t, s, builder, testutil.MakeEvent(opts...))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, nil, nil)
-
-	d.tick(nil)
-
-	insight := d.Current()
-	if insight == nil {
-		t.Fatal("expected insight")
-	}
-	if insight.VIPUsers != 1 {
-		t.Fatalf("VIPUsers = %d, want 1", insight.VIPUsers)
-	}
-}
-
-func TestDetector_CascadingFailure_PicksRootCause(t *testing.T) {
-	s := store.NewStore()
-	ts := tracestore.NewStore()
-	builder := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// Simulate 3 cascading failures: payment → checkout → api-gateway.
-	// Each request generates 3 error codes with the same count.
-	// The detector should pick PMT_502 (leaf service) as root cause.
-	for i := range 3 {
-		traceID := fmt.Sprintf("cccc%028d", i)
-		reqID := core.ID("request", traceID)
-		// Payment span (leaf — no downstream, called by checkout)
-		ingestWithTraces(t, s, ts, builder, traceID, reqID, testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID(fmt.Sprintf("p%015d", i)),
-			testutil.WithParentSpanID(fmt.Sprintf("c%015d", i)),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "payment failed"),
-			testutil.WithCallerService("checkout"),
-			testutil.WithTimestamp(now.Add(-20*time.Second)),
-		))
-		// Checkout span (middle — called by api-gateway, calls payment)
-		ingestWithTraces(t, s, ts, builder, traceID, reqID, testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID(fmt.Sprintf("c%015d", i)),
-			testutil.WithParentSpanID(fmt.Sprintf("a%015d", i)),
-			testutil.WithService("checkout"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("CHK_DOWNSTREAM", "downstream failed"),
-			testutil.WithCallerService("api-gateway"),
-			testutil.WithTimestamp(now.Add(-20*time.Second)),
-		))
-		// Api-gateway span (root — no caller)
-		ingestWithTraces(t, s, ts, builder, traceID, reqID, testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID(fmt.Sprintf("a%015d", i)),
-			testutil.WithParentSpanID(""),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("GW_DOWNSTREAM", "downstream failed"),
-			testutil.WithTimestamp(now.Add(-20*time.Second)),
-		))
-	}
-
-	d := NewDetector(Config{
-		Enabled:        true,
-		Interval:       10 * time.Second,
-		CurrentWindow:  1 * time.Minute,
-		BaselineWindow: 5 * time.Minute,
-		MinLift:        3.0,
-		MinCount:       3,
-	}, s, ts, nil)
-
-	d.tick(nil)
-
-	insight := d.Current()
-	if insight == nil {
-		t.Fatal("expected insight")
-	}
-	if insight.TopErrorCode != "PMT_502" {
-		t.Fatalf("TopErrorCode = %q, want PMT_502 (root cause, not propagation error)", insight.TopErrorCode)
-	}
-}
-
-func ingest(t *testing.T, s *store.Store, builder *build.Builder, ev event.WideEvent) {
-	t.Helper()
-	result := builder.BuildResult(ev)
-	s.Merge(result.Graph)
-}
-
-func ingestWithTraces(t *testing.T, s *store.Store, ts *tracestore.Store, builder *build.Builder, traceID, reqID string, ev event.WideEvent) {
-	t.Helper()
-	result := builder.BuildResult(ev)
-	s.Merge(result.Graph)
-	if ts != nil && result.Span != nil {
-		ts.Upsert(traceID, reqID, result.Span)
-	}
-}
diff --git a/internal/doctor/checks.go b/internal/doctor/checks.go
new file mode 100644
index 0000000..d1891c9
--- /dev/null
+++ b/internal/doctor/checks.go
@@ -0,0 +1,226 @@
+package doctor
+
+import (
+	"database/sql"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/auth"
+	"github.com/sssmaran/WaylogCLI/internal/coldstore"
+	eventlogv2 "github.com/sssmaran/WaylogCLI/internal/eventlog/v2"
+	_ "modernc.org/sqlite"
+)
+
+// checkAuth validates env's auth settings: a ParseConfig error fails, weak-key warnings only warn.
+func checkAuth(env map[string]string) Check {
+	const name = "auth/config"
+	parsed, parseErr := auth.ParseConfig(env)
+	if parseErr != nil {
+		return Check{Name: name, Status: StatusFail, Detail: parseErr.Error()}
+	}
+	if weak := parsed.WeakKeyWarnings(); len(weak) != 0 {
+		return Check{Name: name, Status: StatusWarn, Detail: strings.Join(weak, "; ")}
+	}
+	return Check{Name: name, Status: StatusOK, Detail: "profile=" + parsed.Profile}
+}
+
+// checkWALDir reports, as the "wal-dir" check, whether the event-log directory
+// can be written. The directory comes from eventlogv2.ResolveDir applied to
+// EVENT_LOG_V2_DIR and EVENT_LOG_DIR. When it does not exist yet, the nearest
+// existing ancestor is probed instead, because (per the original author's
+// note) the server creates the directory with MkdirAll on startup. The probe
+// creates one temp file and removes it; a probe file that cannot be removed
+// downgrades the result to a warning.
+func checkWALDir(env map[string]string) Check {
+	const name = "wal-dir"
+	walDir := eventlogv2.ResolveDir(env["EVENT_LOG_V2_DIR"], env["EVENT_LOG_DIR"])
+	probeDir, stat := nearestExistingDir(walDir)
+	if probeDir == "" {
+		return Check{Name: name, Status: StatusFail, Detail: walDir + ": no existing parent directory"}
+	}
+	if !stat.IsDir() {
+		return Check{Name: name, Status: StatusFail, Detail: probeDir + ": not a directory"}
+	}
+	created, leftover, probeErr := probeWritable(probeDir)
+	switch {
+	case !created:
+		return Check{Name: name, Status: StatusFail, Detail: fmt.Sprintf("%s: not writable: %v", probeDir, probeErr)}
+	case leftover != "":
+		return Check{Name: name, Status: StatusWarn, Detail: fmt.Sprintf("%s writable but probe file %s could not be removed: %v", probeDir, leftover, probeErr)}
+	case probeDir != walDir:
+		return Check{Name: name, Status: StatusOK, Detail: fmt.Sprintf("%s absent; will be created (parent %s writable)", walDir, probeDir)}
+	}
+	return Check{Name: name, Status: StatusOK, Detail: walDir + " (writable)"}
+}
+
+// nearestExistingDir returns the first path, starting at dir and moving up one
+// filepath.Dir step at a time, for which os.Stat succeeds, together with its
+// FileInfo. It returns ("", nil) for an empty dir or when the walk reaches a
+// fixed point without a hit. Parents are derived lexically from the path string.
+func nearestExistingDir(dir string) (string, os.FileInfo) {
+	for cur := dir; cur != ""; {
+		st, statErr := os.Stat(cur)
+		if statErr == nil {
+			return cur, st
+		}
+		up := filepath.Dir(cur)
+		if up == cur {
+			break
+		}
+		cur = up
+	}
+	return "", nil
+}
+
+// probeWritable checks that a file can be created in dir by making a temp file
+// and deleting it. writable reports whether creation succeeded; if the delete
+// fails, leaked holds the leftover file's path and err the removal error.
+func probeWritable(dir string) (writable bool, leaked string, err error) {
+	tmp, createErr := os.CreateTemp(dir, ".waylog-doctor-*")
+	if createErr != nil {
+		return false, "", createErr
+	}
+	leaked = tmp.Name()
+	_ = tmp.Close()
+	if err = os.Remove(leaked); err == nil {
+		leaked = ""
+	}
+	return true, leaked, err
+}
+
+// checkSQLite opens the cold store at SQLITE_PATH read-only and compares
+// schema_migrations with coldstore.MigrationNames; it never creates or migrates.
+func checkSQLite(env map[string]string) Check {
+	path := strings.TrimSpace(env["SQLITE_PATH"])
+	if path == "" {
+		return Check{Name: "sqlite", Status: StatusSkip, Detail: "SQLITE_PATH unset (cold store disabled)"}
+	}
+	if _, err := os.Stat(path); err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("%s: %v (doctor does not create it)", path, err)}
+	}
+	// modernc.org/sqlite only forwards URI parameters such as mode=ro for "file:"
+	// DSNs, and takes the busy timeout as _pragma=busy_timeout(ms), not _busy_timeout.
+	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro&_pragma=busy_timeout(2000)")
+	if err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("open read-only: %v", err)}
+	}
+	defer db.Close()
+	if err := db.Ping(); err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("ping: %v", err)}
+	}
+	applied := map[string]bool{}
+	rows, err := db.Query(`SELECT name FROM schema_migrations`)
+	if err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("read schema_migrations: %v", err)}
+	}
+	defer rows.Close()
+	for rows.Next() {
+		var n string
+		if err := rows.Scan(&n); err != nil {
+			return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("scan migration: %v", err)}
+		}
+		applied[n] = true
+	}
+	if err := rows.Err(); err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("iterate migrations: %v", err)}
+	}
+	expected, err := coldstore.MigrationNames()
+	if err != nil {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: err.Error()}
+	}
+	expectedSet := make(map[string]bool, len(expected))
+	for _, name := range expected {
+		expectedSet[name] = true
+	}
+	// Behind (DB missing migrations this binary expects) is a hard failure: the
+	// binary's queries reference schema that isn't there.
+	var missing []string
+	for _, name := range expected {
+		if !applied[name] {
+			missing = append(missing, name)
+		}
+	}
+	if len(missing) > 0 {
+		return Check{Name: "sqlite", Status: StatusFail, Detail: fmt.Sprintf("%d migration(s) behind — cold store not ready for this binary: %s", len(missing), strings.Join(missing, ", "))}
+	}
+	// Ahead (DB has migrations this binary doesn't know — a newer binary wrote
+	// it) is a caution, not a hard failure: reads of known schema still work.
+	var unknown []string
+	for name := range applied {
+		if !expectedSet[name] {
+			unknown = append(unknown, name)
+		}
+	}
+	if len(unknown) > 0 {
+		sort.Strings(unknown)
+		return Check{Name: "sqlite", Status: StatusWarn, Detail: fmt.Sprintf("DB has %d migration(s) unknown to this binary (newer binary wrote it): %s", len(unknown), strings.Join(unknown, ", "))}
+	}
+	return Check{Name: "sqlite", Status: StatusOK, Detail: fmt.Sprintf("%s (%d migrations applied)", filepath.Base(path), len(expected))}
+}
+
+// checkServer probes addr's /livez and /readyz endpoints, expecting HTTP 200
+// from each, and appends the /healthz result from probeHealth. Request errors
+// and non-200 responses are reported as failures; each request times out after 3s.
+func checkServer(addr string) []Check {
+	root := strings.TrimRight(addr, "/")
+	httpClient := &http.Client{Timeout: 3 * time.Second}
+	expect200 := func(name, path string) Check {
+		target := root + path
+		resp, getErr := httpClient.Get(target)
+		if getErr != nil {
+			return Check{Name: name, Status: StatusFail, Detail: getErr.Error()}
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode == http.StatusOK {
+			return Check{Name: name, Status: StatusOK, Detail: target}
+		}
+		return Check{Name: name, Status: StatusFail, Detail: fmt.Sprintf("%s -> %d", path, resp.StatusCode)}
+	}
+	livez := expect200("server-livez", "/livez")
+	readyz := expect200("server-readyz", "/readyz")
+	return []Check{livez, readyz, probeHealth(httpClient, root)}
+}
+
+// probeHealth fetches base+"/healthz" and decodes its JSON "status" and
+// "replay" fields. Request errors and non-200 responses fail; an undecodable
+// body, a missing status, or any status other than "ok" is a warning, with the
+// replay status (and its error, if any) appended to the detail when present.
+func probeHealth(client *http.Client, base string) Check {
+	const name = "server-health"
+	resp, getErr := client.Get(base + "/healthz")
+	if getErr != nil {
+		return Check{Name: name, Status: StatusFail, Detail: getErr.Error()}
+	}
+	defer resp.Body.Close()
+	if code := resp.StatusCode; code != http.StatusOK {
+		return Check{Name: name, Status: StatusFail, Detail: fmt.Sprintf("/healthz -> %d", code)}
+	}
+	var health struct {
+		Status string `json:"status"`
+		Replay struct {
+			Status string `json:"status"`
+			Error  string `json:"error"`
+		} `json:"replay"`
+	}
+	if decodeErr := json.NewDecoder(resp.Body).Decode(&health); decodeErr != nil {
+		return Check{Name: name, Status: StatusWarn, Detail: fmt.Sprintf("/healthz returned 200 but body was undecodable: %v", decodeErr)}
+	}
+	replay := health.Replay
+	switch {
+	case health.Status == "":
+		return Check{Name: name, Status: StatusWarn, Detail: "/healthz returned 200 with no status field (unexpected response — not a Waylog server?)"}
+	case health.Status == "ok":
+		return Check{Name: name, Status: StatusOK, Detail: base + "/healthz (status=ok)"}
+	case replay.Status == "":
+		return Check{Name: name, Status: StatusWarn, Detail: "status=" + health.Status}
+	case replay.Error == "":
+		return Check{Name: name, Status: StatusWarn, Detail: fmt.Sprintf("status=%s (replay=%s)", health.Status, replay.Status)}
+	}
+	return Check{Name: name, Status: StatusWarn, Detail: fmt.Sprintf("status=%s (replay=%s: %s)", health.Status, replay.Status, replay.Error)}
+}
diff --git a/internal/doctor/checks_test.go b/internal/doctor/checks_test.go
new file mode 100644
index 0000000..81741df
--- /dev/null
+++ b/internal/doctor/checks_test.go
@@ -0,0 +1,265 @@
+package doctor
+
+import (
+	"database/sql"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/sssmaran/WaylogCLI/internal/coldstore"
+)
+
+// checkAuth tests
+
+// TestCheckAuthFailsOnBadProfile checks that an unrecognised WAYLOG_PROFILE
+// value makes the auth check fail.
+func TestCheckAuthFailsOnBadProfile(t *testing.T) {
+	env := map[string]string{"WAYLOG_PROFILE": "banana"}
+	if got := checkAuth(env); got.Status != StatusFail {
+		t.Fatalf("bad profile must fail, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckAuthOKOnDefaultEnv checks that an empty environment (dev profile,
+// no keys, dashboard off) passes the auth check.
+func TestCheckAuthOKOnDefaultEnv(t *testing.T) {
+	env := map[string]string{}
+	if got := checkAuth(env); got.Status != StatusOK {
+		t.Fatalf("default env must be ok, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckAuthWarnsOnWeakKey checks that a guessable write key is flagged as
+// a warning rather than accepted silently.
+func TestCheckAuthWarnsOnWeakKey(t *testing.T) {
+	env := map[string]string{"WAYLOG_WRITE_KEY": "changeme"}
+	if got := checkAuth(env); got.Status != StatusWarn {
+		t.Fatalf("weak key must warn, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// checkWALDir tests
+
+// TestCheckWALDirOKWhenWritable checks that an existing, writable WAL dir is
+// reported ok and that the write probe leaves no files behind.
+func TestCheckWALDirOKWhenWritable(t *testing.T) {
+	dir := t.TempDir()
+	c := checkWALDir(map[string]string{"EVENT_LOG_V2_DIR": dir})
+	if c.Status != StatusOK {
+		t.Fatalf("writable dir must be ok, got %q (%s)", c.Status, c.Detail)
+	}
+	// The probe must clean up after itself. A failed ReadDir is fatal rather
+	// than ignored: a nil listing would otherwise look like "nothing left
+	// behind" and let the assertion pass vacuously.
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("read wal dir after probe: %v", err)
+	}
+	if len(entries) != 0 {
+		t.Fatalf("probe left files behind: %v", entries)
+	}
+}
+
+// TestCheckWALDirOKWhenAbsentButCreatable covers a WAL dir that does not exist
+// yet under a writable parent. The server MkdirAll's it on startup, so doctor
+// reports ok ("will be created") instead of failing.
+func TestCheckWALDirOKWhenAbsentButCreatable(t *testing.T) {
+	missing := filepath.Join(t.TempDir(), "eventlog-v2")
+	got := checkWALDir(map[string]string{"EVENT_LOG_V2_DIR": missing})
+	if got.Status != StatusOK {
+		t.Fatalf("absent-but-creatable dir must be ok, got %q (%s)", got.Status, got.Detail)
+	}
+	if notesCreation := strings.Contains(got.Detail, "will be created"); !notesCreation {
+		t.Fatalf("detail should note pending creation: %q", got.Detail)
+	}
+}
+
+// TestCheckWALDirFailsWhenAncestorNotADir points the WAL dir beneath a regular
+// file, so the directory can never be created. The setup does not depend on
+// directory permissions and therefore behaves the same when run as root.
+func TestCheckWALDirFailsWhenAncestorNotADir(t *testing.T) {
+	blocker := filepath.Join(t.TempDir(), "afile")
+	if err := os.WriteFile(blocker, []byte("x"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	env := map[string]string{"EVENT_LOG_V2_DIR": filepath.Join(blocker, "v2")}
+	if got := checkWALDir(env); got.Status != StatusFail {
+		t.Fatalf("ancestor-is-a-file must fail, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckWALDirFallsBackToEventLogDirV2 sets only EVENT_LOG_DIR and expects
+// doctor to probe the nested v2 directory, like the server, not the bare dir.
+func TestCheckWALDirFallsBackToEventLogDirV2(t *testing.T) {
+	root := t.TempDir()
+	wantDir := filepath.Join(root, "v2")
+	if err := os.Mkdir(wantDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	got := checkWALDir(map[string]string{"EVENT_LOG_DIR": root})
+	probedV2 := got.Status == StatusOK && strings.Contains(got.Detail, wantDir)
+	if !probedV2 {
+		t.Fatalf("fallback must probe /v2, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// checkSQLite tests
+
+// TestCheckSQLiteSkipsWhenUnset checks that the SQLite check is skipped, not
+// failed, when SQLITE_PATH is absent from the environment.
+func TestCheckSQLiteSkipsWhenUnset(t *testing.T) {
+	env := map[string]string{}
+	if got := checkSQLite(env); got.Status != StatusSkip {
+		t.Fatalf("unset must skip, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckSQLiteFailsWhenMissing points SQLITE_PATH at a file that does not
+// exist and expects a failure: doctor must not create the database.
+func TestCheckSQLiteFailsWhenMissing(t *testing.T) {
+	missing := filepath.Join(t.TempDir(), "absent.db")
+	got := checkSQLite(map[string]string{"SQLITE_PATH": missing})
+	if got.Status != StatusFail {
+		t.Fatalf("missing db must fail (must not create), got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckSQLiteOKOnMigratedDB creates and migrates a real database through
+// coldstore, closes it, and expects doctor to report it ok.
+func TestCheckSQLiteOKOnMigratedDB(t *testing.T) {
+	dbPath := filepath.Join(t.TempDir(), "cold.db")
+
+	store, err := coldstore.Open(dbPath)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	_ = store.Close() // best-effort: the check below reopens the file itself
+
+	got := checkSQLite(map[string]string{"SQLITE_PATH": dbPath})
+	if got.Status != StatusOK {
+		t.Fatalf("fully-migrated db must be ok, got %q (%s)", got.Status, got.Detail)
+	}
+}
+
+// TestCheckSQLiteFailsWhenBehind simulates a database that is missing the
+// newest migration known to this binary and expects doctor to fail with a
+// detail mentioning "behind".
+func TestCheckSQLiteFailsWhenBehind(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "cold.db")
+	managed, err := coldstore.Open(path)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	_ = managed.Close()
+
+	names, err := coldstore.MigrationNames()
+	if err != nil {
+		t.Fatalf("migration names: %v", err)
+	}
+	// Guard the index below: with no migrations there is nothing to remove and
+	// names[len(names)-1] would panic instead of producing a readable failure.
+	if len(names) == 0 {
+		t.Fatal("coldstore reports no migrations; cannot simulate a database that is behind")
+	}
+
+	// Test (not doctor) removes one applied migration to simulate "behind".
+	db, err := sql.Open("sqlite", path)
+	if err != nil {
+		t.Fatalf("open rw: %v", err)
+	}
+	if _, err := db.Exec(`DELETE FROM schema_migrations WHERE name = ?`, names[len(names)-1]); err != nil {
+		_ = db.Close()
+		t.Fatalf("delete migration: %v", err)
+	}
+	_ = db.Close()
+
+	c := checkSQLite(map[string]string{"SQLITE_PATH": path})
+	if c.Status != StatusFail {
+		t.Fatalf("a DB behind on migrations must fail (not ready for this binary), got %q (%s)", c.Status, c.Detail)
+	}
+	if !strings.Contains(c.Detail, "behind") {
+		t.Fatalf("fail detail should say 'behind': %q", c.Detail)
+	}
+}
+
+// TestCheckSQLiteWarnsWhenAhead records a migration this binary does not know
+// about, as a newer binary would, and expects a warning that names it.
+func TestCheckSQLiteWarnsWhenAhead(t *testing.T) {
+	dbPath := filepath.Join(t.TempDir(), "cold.db")
+	store, err := coldstore.Open(dbPath)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	_ = store.Close()
+
+	conn, err := sql.Open("sqlite", dbPath)
+	if err != nil {
+		t.Fatalf("open rw: %v", err)
+	}
+	_, err = conn.Exec(
+		`INSERT INTO schema_migrations (name, applied_at) VALUES (?, ?)`,
+		"999_from_the_future.sql", "2099-01-01T00:00:00.000000000Z",
+	)
+	if err != nil {
+		t.Fatalf("insert migration: %v", err)
+	}
+	_ = conn.Close()
+
+	got := checkSQLite(map[string]string{"SQLITE_PATH": dbPath})
+	if got.Status != StatusWarn {
+		t.Fatalf("a DB ahead on migrations must warn, got %q (%s)", got.Status, got.Detail)
+	}
+	if namesIt := strings.Contains(got.Detail, "999_from_the_future.sql"); !namesIt {
+		t.Fatalf("warn detail should name the unknown migration: %q", got.Detail)
+	}
+}
+
+// checkServer tests
+
+// TestCheckServerOKWhenHealthy runs the server checks against a stub that
+// answers 200 everywhere and {"status":"ok"} on /healthz, and expects every
+// returned check to be ok.
+func TestCheckServerOKWhenHealthy(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/healthz" {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = w.Write([]byte(`{"status":"ok"}`))
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	checks := checkServer(srv.URL)
+	// Without this guard an empty result would make the loop below pass
+	// vacuously.
+	if len(checks) == 0 {
+		t.Fatal("expected server checks")
+	}
+	for _, c := range checks {
+		if c.Status != StatusOK {
+			t.Fatalf("%s should be ok, got %q (%s)", c.Name, c.Status, c.Detail)
+		}
+	}
+}
+
+// TestCheckServerWarnsWhenDegraded serves a degraded /healthz with a failed
+// replay. The other server checks must stay ok, while server-health must warn
+// and surface both the status and the replay error.
+func TestCheckServerWarnsWhenDegraded(t *testing.T) {
+	const degraded = `{"status":"degraded","replay":{"status":"failed","error":"wal corrupt"}}`
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/healthz" {
+			w.WriteHeader(http.StatusOK)
+			return
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(degraded))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	results := checkServer(srv.URL)
+	var health *Check
+	for i := range results {
+		current := &results[i]
+		switch {
+		case current.Name == "server-health":
+			health = current
+		case current.Status != StatusOK:
+			t.Fatalf("%s should still be ok, got %q", current.Name, current.Status)
+		}
+	}
+	if health == nil {
+		t.Fatal("expected a server-health check")
+	}
+	if health.Status != StatusWarn {
+		t.Fatalf("degraded health must warn, got %q (%s)", health.Status, health.Detail)
+	}
+	for _, fragment := range []string{"degraded", "wal corrupt"} {
+		if !strings.Contains(health.Detail, fragment) {
+			t.Fatalf("degraded detail should include status + replay error: %q", health.Detail)
+		}
+	}
+}
+
+// TestCheckServerFailsWhenDown expects every server check to fail when nothing
+// is listening. It starts a server only to obtain a routable address and closes
+// it before probing, which is more portable than relying on http://127.0.0.1:0.
+func TestCheckServerFailsWhenDown(t *testing.T) {
+	noop := func(http.ResponseWriter, *http.Request) {}
+	srv := httptest.NewServer(http.HandlerFunc(noop))
+	deadURL := srv.URL
+	srv.Close()
+
+	results := checkServer(deadURL)
+	if len(results) == 0 {
+		t.Fatal("expected server checks")
+	}
+	for i := range results {
+		if results[i].Status == StatusFail {
+			continue
+		}
+		t.Fatalf("%s should fail against a dead addr, got %q", results[i].Name, results[i].Status)
+	}
+}
+
+// TestCheckServerWarnsWhenHealthzHasNoStatus covers a 200 /healthz with no
+// status field (e.g. a proxy returning {}): it must warn, not silently pass
+// and not crash.
+func TestCheckServerWarnsWhenHealthzHasNoStatus(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/healthz" {
+			w.Header().Set("Content-Type", "application/json")
+			_, _ = w.Write([]byte(`{}`))
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	// Track whether the check was seen at all: if checkServer stopped emitting
+	// server-health, the status assertion alone would pass vacuously.
+	sawHealth := false
+	for _, c := range checkServer(srv.URL) {
+		if c.Name != "server-health" {
+			continue
+		}
+		sawHealth = true
+		if c.Status != StatusWarn {
+			t.Fatalf("missing-status /healthz must warn, got %q (%s)", c.Status, c.Detail)
+		}
+	}
+	if !sawHealth {
+		t.Fatal("expected a server-health check")
+	}
+}
diff --git a/internal/doctor/doctor.go b/internal/doctor/doctor.go
new file mode 100644
index 0000000..37ef15a
--- /dev/null
+++ b/internal/doctor/doctor.go
@@ -0,0 +1,85 @@
+// Package doctor runs read-only health checks for a Waylog/Crux deployment and
+// reports them as a green/red checklist. It is read-only except for one
+// transient temp-file probe in the WAL dir (see checkWALDir).
+package doctor
+
+import (
+	"os"
+	"strings"
+)
+
+// Status is the outcome of a single doctor check. Its string value is what
+// appears in the JSON output.
+type Status string
+
+// Only StatusFail makes Result.OK report false; the other three values leave
+// the overall run passing.
+const (
+	StatusOK   Status = "ok"
+	StatusWarn Status = "warn"
+	StatusFail Status = "fail"
+	StatusSkip Status = "skip"
+)
+
+// Check is a single diagnostic result.
+type Check struct {
+	// Name identifies the check, e.g. "wal-dir" or "server-health".
+	Name   string `json:"name"`
+	// Status is the check's outcome.
+	Status Status `json:"status"`
+	// Detail is optional human-readable context; it is omitted from JSON when
+	// empty.
+	Detail string `json:"detail,omitempty"`
+}
+
+// Result is the full set of checks from one doctor run.
+type Result struct {
+	// Checks holds the individual results in the order they were run.
+	Checks []Check `json:"checks"`
+}
+
+// OK reports whether the run passed, i.e. none of its checks has StatusFail.
+// Warnings and skipped checks do not fail the run, and a Result with no checks
+// is ok.
+func (r Result) OK() bool {
+	failed := false
+	for i := range r.Checks {
+		if r.Checks[i].Status == StatusFail {
+			failed = true
+			break
+		}
+	}
+	return !failed
+}
+
+// Options controls which checks run. The local checks always run; the server
+// checks are opt-in.
+type Options struct {
+	Addr         string // base URL for server checks (e.g. http://localhost:8080)
+	ServerChecks bool   // run reachability checks against Addr
+}
+
+// EnvKeys are the environment variables doctor reads for its local checks.
+// processEnv snapshots exactly these keys. The slice is exported so tests can
+// clear them all in one place, keeping the test fixtures from drifting out of
+// sync with processEnv.
+var EnvKeys = []string{
+	"WAYLOG_PROFILE", "WAYLOG_API_KEY", "WAYLOG_WRITE_KEY", "WAYLOG_READ_KEY",
+	"WAYLOG_AGENT_KEY", "DASHBOARD_AUTH", "DASHBOARD_SESSION_SECRET",
+	"EVENT_LOG_V2_DIR", "EVENT_LOG_DIR", "SQLITE_PATH",
+}
+
+// processEnv captures the current value of every variable listed in EnvKeys.
+// Variables that are not set are left out of the map; variables set to the
+// empty string are included with an empty value.
+func processEnv() map[string]string {
+	snapshot := make(map[string]string, len(EnvKeys))
+	for i := 0; i < len(EnvKeys); i++ {
+		key := EnvKeys[i]
+		value, present := os.LookupEnv(key)
+		if !present {
+			continue
+		}
+		snapshot[key] = value
+	}
+	return snapshot
+}
+
+// Run performs one doctor pass. The local checks (auth/config, WAL dir,
+// SQLite, triage hash) always run against a snapshot of the process
+// environment. The server checks are appended only when opts.ServerChecks is
+// set, targeting opts.Addr or http://localhost:8080 when Addr is blank.
+func Run(opts Options) Result {
+	env := processEnv()
+
+	var res Result
+	res.Checks = append(res.Checks,
+		checkAuth(env),
+		checkWALDir(env),
+		checkSQLite(env),
+		checkTriageHash(),
+	)
+	if !opts.ServerChecks {
+		return res
+	}
+
+	target := strings.TrimSpace(opts.Addr)
+	if target == "" {
+		target = "http://localhost:8080"
+	}
+	res.Checks = append(res.Checks, checkServer(target)...)
+	return res
+}
diff --git a/internal/doctor/doctor_test.go b/internal/doctor/doctor_test.go
new file mode 100644
index 0000000..b026aaf
--- /dev/null
+++ b/internal/doctor/doctor_test.go
@@ -0,0 +1,45 @@
+package doctor
+
+import "testing"
+
+// TestRunLocalOnlyHasNoServerChecks runs doctor without ServerChecks and
+// expects all four local checks and none of the server checks.
+func TestRunLocalOnlyHasNoServerChecks(t *testing.T) {
+	// Hermetic: clear the env doctor reads and point the WAL dir at a temp dir
+	// so this test neither depends on the developer's env nor probes the cwd.
+	for _, k := range EnvKeys {
+		t.Setenv(k, "")
+	}
+	t.Setenv("EVENT_LOG_V2_DIR", t.TempDir())
+	r := Run(Options{}) // ServerChecks false
+	if len(r.Checks) == 0 {
+		t.Fatal("expected local checks")
+	}
+	names := map[string]bool{}
+	for _, c := range r.Checks {
+		names[c.Name] = true
+	}
+	for _, want := range []string{"auth/config", "wal-dir", "sqlite", "triage-hash"} {
+		if !names[want] {
+			t.Fatalf("missing local check %q", want)
+		}
+	}
+	// server-health is produced by the server checks as well, so it has to be
+	// absent just like the livez/readyz probes.
+	for _, unwanted := range []string{"server-livez", "server-readyz", "server-health"} {
+		if names[unwanted] {
+			t.Fatalf("server check %q must not run without ServerChecks", unwanted)
+		}
+	}
+}
+
+// TestRunLocalChecksDoNotFailWithoutServer asserts that a local-only run ends
+// in ok/warn/skip and never fails merely because no server is running.
+func TestRunLocalChecksDoNotFailWithoutServer(t *testing.T) {
+	// Blank every key processEnv reads so dev-machine exports (e.g. auth keys)
+	// can't make ParseConfig error and turn this into a false failure. An empty
+	// WAYLOG_PROFILE defaults to dev in ParseConfig.
+	for i := range EnvKeys {
+		t.Setenv(EnvKeys[i], "")
+	}
+	// The wal-dir check is never skipped, so give it a writable temp dir rather
+	// than letting it probe the process's cwd.
+	t.Setenv("EVENT_LOG_V2_DIR", t.TempDir())
+
+	result := Run(Options{})
+	if passed := result.OK(); !passed {
+		t.Fatalf("local-only run must not fail: %+v", result.Checks)
+	}
+}
diff --git a/internal/doctor/render.go b/internal/doctor/render.go
new file mode 100644
index 0000000..45d2a97
--- /dev/null
+++ b/internal/doctor/render.go
@@ -0,0 +1,43 @@
+package doctor
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+)
+
+// symbol returns the fixed-width bracketed tag Render prints for a status.
+// Any value other than ok, warn or fail is rendered as "[skip]".
+func symbol(s Status) string {
+	tag := "[skip]"
+	switch s {
+	case StatusOK:
+		tag = "[ ok ]"
+	case StatusWarn:
+		tag = "[warn]"
+	case StatusFail:
+		tag = "[fail]"
+	}
+	return tag
+}
+
+// Render writes the result to w as a human-readable checklist: one line per
+// check (status tag, name, and the detail when present) followed by a blank
+// line and an overall "doctor: ok" / "doctor: FAILED" verdict. Write errors
+// are not reported.
+func Render(w io.Writer, r Result) {
+	for _, check := range r.Checks {
+		tag := symbol(check.Status)
+		if check.Detail == "" {
+			fmt.Fprintf(w, "%s %s\n", tag, check.Name)
+		} else {
+			// Names are padded so details line up in a column.
+			fmt.Fprintf(w, "%s %-16s %s\n", tag, check.Name, check.Detail)
+		}
+	}
+
+	verdict := "\ndoctor: FAILED"
+	if r.OK() {
+		verdict = "\ndoctor: ok"
+	}
+	fmt.Fprintln(w, verdict)
+}
+
+// RenderJSON writes the result to w as JSON indented by two spaces and returns
+// any encoding or write error.
+func RenderJSON(w io.Writer, r Result) error {
+	encoder := json.NewEncoder(w)
+	encoder.SetIndent("", "  ")
+	if err := encoder.Encode(r); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/internal/doctor/render_test.go b/internal/doctor/render_test.go
new file mode 100644
index 0000000..ed0b297
--- /dev/null
+++ b/internal/doctor/render_test.go
@@ -0,0 +1,36 @@
+package doctor
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestResultOKIgnoresWarnAndSkip checks that warn and skip leave a Result ok
+// and that a single failed check flips it to not-ok.
+func TestResultOKIgnoresWarnAndSkip(t *testing.T) {
+	passing := []Check{
+		{Name: "a", Status: StatusOK},
+		{Name: "b", Status: StatusWarn, Detail: "weak key"},
+		{Name: "c", Status: StatusSkip, Detail: "SQLITE_PATH unset"},
+	}
+	res := Result{Checks: passing}
+	if ok := res.OK(); !ok {
+		t.Fatal("warn/skip must not make Result not-OK")
+	}
+
+	res.Checks = append(res.Checks, Check{Name: "d", Status: StatusFail, Detail: "boom"})
+	if ok := res.OK(); ok {
+		t.Fatal("a failed check must make Result not-OK")
+	}
+}
+
+// TestRenderShowsEveryCheckAndDetail renders one passing and one failing check
+// and expects both names, the failure detail and both status words in the
+// output.
+func TestRenderShowsEveryCheckAndDetail(t *testing.T) {
+	var buf strings.Builder
+	Render(&buf, Result{Checks: []Check{
+		{Name: "auth/config", Status: StatusOK},
+		{Name: "sqlite", Status: StatusFail, Detail: "cannot open"},
+	}})
+
+	rendered := buf.String()
+	expected := []string{"auth/config", "sqlite", "cannot open", "ok", "fail"}
+	for i := range expected {
+		if strings.Contains(rendered, expected[i]) {
+			continue
+		}
+		t.Fatalf("render missing %q in:\n%s", expected[i], rendered)
+	}
+}
diff --git a/internal/doctor/triagecheck.go b/internal/doctor/triagecheck.go
new file mode 100644
index 0000000..882dcfb
--- /dev/null
+++ b/internal/doctor/triagecheck.go
@@ -0,0 +1,82 @@
+package doctor
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// checkTriageHash dogfoods determinism (v0.1.1 item 1): it builds a triage
+// report twice from fixed, in-memory canned dependencies and asserts the
+// report_hash is identical. No server, no live incident, no network.
+//
+// It fails when either build errors, when a build yields no report, when the
+// hash is empty, or when the two hashes differ. Each case carries its own
+// detail so an empty hash is not misreported as an unstable one. On success
+// the detail is the hash itself.
+func checkTriageHash() Check {
+	fail := func(detail string) Check {
+		return Check{Name: "triage-hash", Status: StatusFail, Detail: detail}
+	}
+	build := func() (*pkgtriage.Report, error) {
+		eng, err := triage.NewEngine(triage.Deps{
+			Incidents:  cannedIncidents{},
+			Blast:      cannedBlast{},
+			Story:      cannedStory{},
+			Signals:    cannedSignals{},
+			NextChecks: cannedNextChecks{},
+			// A fixed clock keeps any time-derived report content identical
+			// across the two builds.
+			Now: func() time.Time { return time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) },
+		})
+		if err != nil {
+			return nil, err
+		}
+		opts, err := triage.ParseBuildOptions("15m", false, time.Unix(0, 0).UTC())
+		if err != nil {
+			return nil, err
+		}
+		return eng.Build(context.Background(), "inc_doctor", opts)
+	}
+	a, err := build()
+	if err != nil {
+		return fail(err.Error())
+	}
+	b, err := build()
+	if err != nil {
+		return fail(err.Error())
+	}
+	// Build returns a pointer; guard against a nil report with a nil error so
+	// a misbehaving engine produces a failed check rather than a panic.
+	if a == nil || b == nil {
+		return fail("triage engine returned no report")
+	}
+	if a.ReportHash == "" {
+		return fail("report_hash is empty")
+	}
+	if a.ReportHash != b.ReportHash {
+		return fail(fmt.Sprintf("hash unstable: %q vs %q", a.ReportHash, b.ReportHash))
+	}
+	return Check{Name: "triage-hash", Status: StatusOK, Detail: a.ReportHash}
+}
+
+// --- canned, deterministic triage dependencies (production code, not test) ---
+//
+// These types live in production (non-test) code because checkTriageHash is
+// itself production code that must build a real triage report to dogfood
+// determinism; the canned deps supply that report's fixed inputs.
+
+// cannedIncidents is the fixed incident source used by checkTriageHash.
+type cannedIncidents struct{}
+
+// GetIncident returns a constant summary for whatever id is requested: the id
+// is echoed back with a 15m window and medium confidence. It never fails.
+func (cannedIncidents) GetIncident(_ context.Context, id string) (triage.IncidentSummary, error) {
+	summary := triage.IncidentSummary{
+		ID:         id,
+		Window:     "15m",
+		Confidence: pkgtriage.ConfidenceMedium,
+	}
+	return summary, nil
+}
+
+// cannedBlast is the fixed blast-radius source used by checkTriageHash.
+type cannedBlast struct{}
+
+// BlastSnapshot always returns the zero BlastSnapshotResult and no error.
+func (cannedBlast) BlastSnapshot(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+	var empty triage.BlastSnapshotResult
+	return empty, nil
+}
+
+// cannedStory is the fixed first-failure source used by checkTriageHash.
+type cannedStory struct{}
+
+// FirstFailureStory always returns the zero FirstFailureResult and no error.
+func (cannedStory) FirstFailureStory(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.FirstFailureResult, error) {
+	var empty triage.FirstFailureResult
+	return empty, nil
+}
+
+// cannedSignals is the fixed signal source used by checkTriageHash.
+type cannedSignals struct{}
+
+// SignalsFor always reports no signal evidence (a nil slice) and no error.
+func (cannedSignals) SignalsFor(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) ([]triage.SignalEvidence, error) {
+	var none []triage.SignalEvidence
+	return none, nil
+}
+
+// cannedNextChecks is the fixed next-check source used by checkTriageHash.
+type cannedNextChecks struct{}
+
+// NextChecks always reports no follow-up checks (a nil slice) and no error.
+func (cannedNextChecks) NextChecks(_ context.Context, _ triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+	var none []triage.NextCheckSpec
+	return none, nil
+}
diff --git a/internal/doctor/triagecheck_test.go b/internal/doctor/triagecheck_test.go
new file mode 100644
index 0000000..aa96023
--- /dev/null
+++ b/internal/doctor/triagecheck_test.go
@@ -0,0 +1,13 @@
+package doctor
+
+import "testing"
+
+// TestCheckTriageHashIsOK runs the determinism self-check and expects it to
+// pass with the report hash carried in the detail.
+func TestCheckTriageHashIsOK(t *testing.T) {
+	got := checkTriageHash()
+	if got.Status != StatusOK {
+		t.Fatalf("triage hash sanity must be ok, got %q (%s)", got.Status, got.Detail)
+	}
+	if len(got.Detail) == 0 {
+		t.Fatal("expected the report hash in the detail")
+	}
+}
diff --git a/internal/eventlog/v2/dir.go b/internal/eventlog/v2/dir.go
new file mode 100644
index 0000000..38e474c
--- /dev/null
+++ b/internal/eventlog/v2/dir.go
@@ -0,0 +1,27 @@
+package eventlogv2
+
+import (
+	"path/filepath"
+	"strings"
+)
+
+// ResolveDir picks the v2 WAL directory. A non-blank v2Dir (the
+// EVENT_LOG_V2_DIR value) wins and is returned with surrounding whitespace
+// trimmed; otherwise the directory is derived from baseDir (the EVENT_LOG_DIR
+// value) via DefaultDir. Both cmd/ingest (the server) and `waylog doctor`
+// call this so their resolution can never drift.
+func ResolveDir(v2Dir, baseDir string) string {
+	explicit := strings.TrimSpace(v2Dir)
+	if explicit == "" {
+		return DefaultDir(strings.TrimSpace(baseDir))
+	}
+	return explicit
+}
+
+// DefaultDir returns the default v2 WAL directory for a given EVENT_LOG_DIR
+// value: "v2" nested under eventLogDir, or ./data/eventlog-v2 when eventLogDir
+// is empty. The argument is used as given; callers trim it beforehand.
+func DefaultDir(eventLogDir string) string {
+	if eventLogDir == "" {
+		return "./data/eventlog-v2"
+	}
+	return filepath.Join(eventLogDir, "v2")
+}
diff --git a/internal/firstrun/burst.go b/internal/firstrun/burst.go
new file mode 100644
index 0000000..1148276
--- /dev/null
+++ b/internal/firstrun/burst.go
@@ -0,0 +1,112 @@
+// Package firstrun runs the self-contained Crux first-run demo: launch the
+// ingest server, drive a real failure burst through the SDK, and print the
+// deterministic incident report.
+package firstrun
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"net/http"
+	"time"
+
+	waylog "github.com/sssmaran/WaylogCLI/pkg/waylog/v2"
+)
+
+// burstHTTPClient is the package-level HTTP client shared by postJSON and
+// getWithKey. Its fixed 5 s timeout ensures no request hangs indefinitely
+// (mirrors the SDK signal client's 5 s policy).
+var burstHTTPClient = &http.Client{Timeout: 5 * time.Second}
+
+// BurstConfig controls how many failing checkout events RunBurst emits and
+// where it sends them.
+type BurstConfig struct {
+	// IngestURL is the ingest server's base URL. It is handed to the SDK and
+	// used as the prefix for the /v1/alerts and /v1/signals posts.
+	IngestURL string
+	// WriteKey is passed to the SDK as its API key and sent as the Bearer
+	// token on the alert and signal posts.
+	WriteKey  string
+	Requests  int // failing checkout requests to emit; default 25
+}
+
+// BurstResult reports what RunBurst produced.
+type BurstResult struct {
+	// FailingEvents is the number of failing requests emitted, i.e. the
+	// effective request count after the default has been applied.
+	FailingEvents int
+}
+
+// Fixed identity of the demo failure. RunBurst passes the service and env to
+// the SDK, names the failing step burstStep, and attaches burstCode to every
+// failure.
+const (
+	burstService   = "checkout"
+	burstStep      = "payment.charge"
+	burstCode      = "PMT_502"
+	burstEnv       = "demo"
+	burstTestCount = 10 // number of failing requests used in unit tests
+)
+
+// RunBurst drives the demo failure through the real SDK so the events travel
+// the ingest → v2 reader → incident engine pipeline. It initialises the SDK,
+// emits cfg.Requests failing checkout requests (25 when the count is not
+// positive), shuts the SDK down to flush them, and then posts one alert and one
+// runtime signal as corroborating evidence for the incident engine.
+//
+// The first error encountered is returned and the result is then the zero
+// BurstResult.
+func RunBurst(cfg BurstConfig) (BurstResult, error) {
+	total := cfg.Requests
+	if total <= 0 {
+		total = 25
+	}
+
+	sdkConfig := waylog.Config{
+		Service:   burstService,
+		Env:       burstEnv,
+		IngestURL: cfg.IngestURL,
+		APIKey:    cfg.WriteKey,
+	}
+	if err := waylog.Init(sdkConfig); err != nil {
+		return BurstResult{}, fmt.Errorf("sdk init: %w", err)
+	}
+
+	// Every request runs the same step, which always fails with burstCode.
+	failCharge := func(ctx context.Context) error {
+		return waylog.NewError(burstCode, waylog.WithReason("upstream payment gateway 5xx"))
+	}
+	for n := 0; n < total; n++ {
+		reqCtx := waylog.Begin(context.Background(), waylog.BeginOptions{})
+		// The step's error is the point of the demo, so it is discarded here.
+		_ = waylog.StepVoid(reqCtx, burstStep, failCharge)
+		if _, err := waylog.Finalize(reqCtx); err != nil {
+			return BurstResult{}, fmt.Errorf("finalize event %d: %w", n, err)
+		}
+	}
+
+	// Flush all buffered events to the ingest server before posting signals.
+	flushCtx, cancelFlush := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancelFlush()
+	if err := waylog.Shutdown(flushCtx); err != nil {
+		return BurstResult{}, fmt.Errorf("sdk shutdown: %w", err)
+	}
+
+	// Alert first, then the runtime signal; stop at the first failure.
+	for _, post := range []func(ingestURL, key string) error{postAlert, postRuntimeSignal} {
+		if err := post(cfg.IngestURL, cfg.WriteKey); err != nil {
+			return BurstResult{}, err
+		}
+	}
+	return BurstResult{FailingEvents: total}, nil
+}
+
+// postJSON POSTs body to url as application/json using the shared 5 s-timeout
+// client. A non-empty key is sent as a Bearer token; an empty key sends no
+// Authorization header at all (the same rule getWithKey applies) rather than a
+// malformed "Bearer " header with no token. Transport errors are returned
+// as-is and any response status of 300 or above is reported as an error.
+func postJSON(url, key, body string) error {
+	req, err := http.NewRequest(http.MethodPost, url, bytes.NewBufferString(body))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if key != "" {
+		req.Header.Set("Authorization", "Bearer "+key)
+	}
+	resp, err := burstHTTPClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 300 {
+		return fmt.Errorf("POST %s: HTTP %d", url, resp.StatusCode)
+	}
+	return nil
+}
+
+// postAlert posts the demo's critical alert to ingestURL+"/v1/alerts",
+// timestamped with the current UTC time. The service, env and error code come
+// from burstService, burstEnv and burstCode, so the alert always describes the
+// same failure as the events RunBurst emits, even if those constants change.
+func postAlert(ingestURL, key string) error {
+	ts := time.Now().UTC().Format(time.RFC3339)
+	// %q yields valid JSON strings here because every value is plain ASCII
+	// with no characters that need escaping.
+	body := fmt.Sprintf(`{"source":"crux","alert_id":"alert_firstrun_pmt_502","service":%q,"env":%q,"severity":"critical","reason":%q,"message":"first-run demo alert for checkout payment failures","error_code":%q,"timestamp":%q}`,
+		burstService, burstEnv, burstCode+" spike", burstCode, ts)
+	return postJSON(ingestURL+"/v1/alerts", key, body)
+}
+
+// postRuntimeSignal posts the demo's OOM-kill runtime signal to
+// ingestURL+"/v1/signals", timestamped with the current UTC time. The service
+// and env come from burstService and burstEnv so the signal stays attached to
+// the same service as the emitted events; the pod and container names are
+// fixed demo values.
+func postRuntimeSignal(ingestURL, key string) error {
+	ts := time.Now().UTC().Format(time.RFC3339)
+	// %q yields valid JSON strings here because every value is plain ASCII
+	// with no characters that need escaping.
+	body := fmt.Sprintf(`{"type":"runtime","source":"k8s-demo","service":%q,"env":%q,"severity":"critical","reason":"OOMKilled","message":"Container checkout killed by OOM (limit: 256Mi, usage: 312Mi).","resource":{"service":%q,"container":"checkout"},"metadata":{"subtype":"oom_killed","pod":"checkout-7f8b9c-x2k","container":"checkout"},"timestamp":%q}`,
+		burstService, burstEnv, burstService, ts)
+	return postJSON(ingestURL+"/v1/signals", key, body)
+}
diff --git a/internal/firstrun/burst_test.go b/internal/firstrun/burst_test.go
new file mode 100644
index 0000000..458bdf9
--- /dev/null
+++ b/internal/firstrun/burst_test.go
@@ -0,0 +1,71 @@
+package firstrun
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+
+	waylog "github.com/sssmaran/WaylogCLI/pkg/waylog/v2"
+)
+
+// TestRunBurstEmitsFailingPaymentEvents runs a burst against a stub ingest
+// server and checks that the requested number of events arrived, along with
+// exactly one alert and one runtime signal.
+func TestRunBurstEmitsFailingPaymentEvents(t *testing.T) {
+	var events, alerts, signals int64
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		status := http.StatusNotFound
+		switch r.URL.Path {
+		case "/v1/events":
+			batch, _ := decodeEventBatch(r)
+			atomic.AddInt64(&events, int64(len(batch)))
+			status = http.StatusAccepted
+		case "/v1/alerts":
+			atomic.AddInt64(&alerts, 1)
+			status = http.StatusCreated
+		case "/v1/signals":
+			atomic.AddInt64(&signals, 1)
+			status = http.StatusCreated
+		}
+		w.WriteHeader(status)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+	// Make sure the process-wide SDK is shut down even if RunBurst bails early.
+	t.Cleanup(func() { _ = waylog.Shutdown(context.Background()) })
+
+	cfg := BurstConfig{IngestURL: srv.URL, WriteKey: "demo", Requests: burstTestCount}
+	res, err := RunBurst(cfg)
+	if err != nil {
+		t.Fatalf("RunBurst: %v", err)
+	}
+	if res.FailingEvents != burstTestCount {
+		t.Fatalf("FailingEvents = %d, want %d", res.FailingEvents, burstTestCount)
+	}
+	if got := atomic.LoadInt64(&alerts); got != 1 {
+		t.Fatalf("alerts posted = %d, want 1", got)
+	}
+	if got := atomic.LoadInt64(&signals); got != 1 {
+		t.Fatalf("runtime signals posted = %d, want 1", got)
+	}
+	if got := atomic.LoadInt64(&events); got < burstTestCount {
+		t.Fatalf("events received = %d, want >= %d", got, burstTestCount)
+	}
+}
+
+// decodeEventBatch reads the request body as a stream of JSON values and
+// returns one entry per event. It accepts NDJSON (one JSON object per line,
+// used by the batch transport) as well as a JSON array of objects; array
+// elements and top-level values that are not objects are ignored.
+//
+// On malformed input it returns the events decoded so far together with the
+// decode error, so callers that only count events may ignore the error.
+func decodeEventBatch(r *http.Request) ([]map[string]any, error) {
+	defer r.Body.Close()
+	dec := json.NewDecoder(r.Body)
+	var out []map[string]any
+	for dec.More() {
+		// Decode into any first: decoding an array straight into a map fails,
+		// which would silently drop array-shaped batches.
+		var value any
+		if err := dec.Decode(&value); err != nil {
+			return out, err
+		}
+		switch v := value.(type) {
+		case map[string]any:
+			out = append(out, v)
+		case []any:
+			for _, item := range v {
+				if event, ok := item.(map[string]any); ok {
+					out = append(out, event)
+				}
+			}
+		}
+	}
+	return out, nil
+}
diff --git a/internal/firstrun/firstrun.go b/internal/firstrun/firstrun.go
new file mode 100644
index 0000000..97393cb
--- /dev/null
+++ b/internal/firstrun/firstrun.go
@@ -0,0 +1,152 @@
+package firstrun
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"os"
+	"os/exec"
+	"os/signal"
+	"path/filepath"
+	"syscall"
+	"time"
+)
+
+// Options configures the first-run experience. Run substitutes the documented
+// default for any field left at its zero value.
+type Options struct {
+	Requests int           // failing requests to emit (default 25)
+	Timeout  time.Duration // max wait for an incident (default 90s)
+	// Stdout receives the progress messages and the report; defaults to
+	// os.Stdout.
+	Stdout   io.Writer
+	// Stderr defaults to os.Stderr.
+	Stderr   io.Writer
+	NoWait   bool // if true, tear down after printing the report (tests/CI)
+}
+
+// Run executes the full first-run: launch ingest, drive a burst, wait for the
+// incident, print the deterministic report, then (unless NoWait) keep the
+// server up so the operator can browse the dashboard and run `crux ...`.
+//
+// SIGINT and SIGTERM are trapped for the whole run, not only for the final
+// wait. The server lives in its own process group, so a terminal Ctrl-C never
+// reaches it directly. If this process simply died on the signal, the deferred
+// teardown would not run, the server would be orphaned and the temp data dir
+// would be left behind. With the trap in place an interrupt during any phase
+// makes Run return an error after normal teardown. An interrupt during the
+// final wait returns nil.
+func Run(opt Options) error {
+	if opt.Requests <= 0 {
+		opt.Requests = 25
+	}
+	if opt.Timeout <= 0 {
+		opt.Timeout = 90 * time.Second
+	}
+	if opt.Stdout == nil {
+		opt.Stdout = os.Stdout
+	}
+	if opt.Stderr == nil {
+		opt.Stderr = os.Stderr
+	}
+
+	// Registered first so it is released last, after every other deferred
+	// teardown step has run.
+	sigCtx, stopSignals := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+	defer stopSignals()
+
+	// interruptible runs one blocking phase in the background and returns its
+	// error, or returns early as soon as a signal arrives. An abandoned phase
+	// keeps running until its own timeout or until the server is torn down; it
+	// writes no output, so it cannot interleave with the caller.
+	interruptible := func(phase func() error) error {
+		done := make(chan error, 1)
+		go func() { done <- phase() }()
+		select {
+		case err := <-done:
+			return err
+		case <-sigCtx.Done():
+			return fmt.Errorf("first-run interrupted: %w", sigCtx.Err())
+		}
+	}
+
+	dataDir, err := os.MkdirTemp("", "crux-first-run-")
+	if err != nil {
+		return err
+	}
+	defer os.RemoveAll(dataDir)
+
+	port, err := freePort()
+	if err != nil {
+		return err
+	}
+	addr := fmt.Sprintf("127.0.0.1:%d", port)
+	ingestURL := "http://" + addr
+	const writeKey = "demo"
+
+	// The error is deliberately ignored: an empty path makes execDir "." and
+	// locateServer decides whether a server can be found from there.
+	execPath, _ := os.Executable()
+	execDir := filepath.Dir(execPath)
+	cmdPath, cmdArgs, runDir, err := locateServer(execDir)
+	if err != nil {
+		return err
+	}
+
+	fmt.Fprintf(opt.Stdout, "Starting Crux on %s ...\n", ingestURL)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	srvCmd := exec.CommandContext(ctx, cmdPath, cmdArgs...)
+	srvCmd.Env = serverEnv(dataDir, writeKey, addr)
+	if runDir != "" {
+		srvCmd.Dir = runDir
+	}
+	srvCmd.Stdout = io.Discard
+	srvCmd.Stderr = io.Discard
+	// Run the server in its own process group. With `go run ./cmd/ingest`, the go
+	// process forks a child (the compiled binary); cancelling the context only
+	// kills `go run`, orphaning the child. Killing the whole group on teardown
+	// reaps the actual ingest server too, so it doesn't leak ports/DB handles.
+	srvCmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	if err := srvCmd.Start(); err != nil {
+		return fmt.Errorf("start ingest: %w", err)
+	}
+	defer func() {
+		if srvCmd.Process != nil {
+			_ = syscall.Kill(-srvCmd.Process.Pid, syscall.SIGKILL)
+		}
+		cancel()
+		_ = srvCmd.Wait()
+	}()
+
+	if err := interruptible(func() error {
+		if err := waitReady(ingestURL+"/readyz", 30*time.Second); err != nil {
+			return fmt.Errorf("ingest did not become ready: %w", err)
+		}
+		return nil
+	}); err != nil {
+		return err
+	}
+
+	fmt.Fprintf(opt.Stdout, "Generating a checkout->payment failure burst (%d requests) ...\n", opt.Requests)
+	if err := interruptible(func() error {
+		if _, err := RunBurst(BurstConfig{IngestURL: ingestURL, WriteKey: writeKey, Requests: opt.Requests}); err != nil {
+			return fmt.Errorf("burst: %w", err)
+		}
+		return nil
+	}); err != nil {
+		return err
+	}
+
+	fmt.Fprintln(opt.Stdout, "Waiting for the incident engine to open an incident ...")
+	var rep reportResult
+	if err := interruptible(func() error {
+		var pollErr error
+		rep, pollErr = waitForReport(reportPoll{IngestURL: ingestURL, ReadKey: "", Timeout: opt.Timeout, Interval: time.Second})
+		return pollErr
+	}); err != nil {
+		return err
+	}
+
+	printReport(opt.Stdout, ingestURL, rep)
+
+	if opt.NoWait {
+		return nil
+	}
+	fmt.Fprintf(opt.Stdout, "\nServer running at %s -- open %s/ui/ in a browser.\nPress Ctrl-C to stop.\n", ingestURL, ingestURL)
+	<-sigCtx.Done()
+	return nil
+}
+
+// printReport writes the first-run summary to w: a banner carrying the
+// incident ID and report hash, the report's markdown, and the suggested
+// follow-up commands pointed at ingestURL.
+func printReport(w io.Writer, ingestURL string, rep reportResult) {
+	const rule = "============================================================"
+
+	// Banner.
+	fmt.Fprintf(w, "\n%s\n", rule)
+	fmt.Fprintf(w, "  Incident %s -- deterministic triage report\n", rep.IncidentID)
+	fmt.Fprintf(w, "  report_hash: %s\n", rep.ReportHash)
+	fmt.Fprintf(w, "%s\n\n", rule)
+
+	// Report body.
+	fmt.Fprintln(w, rep.Markdown)
+
+	// Follow-up commands.
+	fmt.Fprint(w, "\nNext (run `crux incidents` against this server):\n")
+	fmt.Fprintf(w, "  crux --addr %s incidents\n", ingestURL)
+	fmt.Fprintf(w, "  crux --addr %s triage %s --snapshot\n", ingestURL, rep.IncidentID)
+	fmt.Fprintf(w, "  open %s/ui/\n", ingestURL)
+}
+
+// freePort asks the kernel for an unused TCP port on 127.0.0.1 by listening on
+// port 0, then closes the listener and returns the port it was given. The port
+// is free only at the moment of return; nothing reserves it for the caller.
+func freePort() (int, error) {
+	listener, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		return 0, err
+	}
+	defer listener.Close()
+
+	tcpAddr := listener.Addr().(*net.TCPAddr)
+	return tcpAddr.Port, nil
+}
+
+// waitReady polls url without credentials every 250ms until it answers 200 or
+// until timeout has elapsed, in which case an error naming the URL is
+// returned. Request errors and non-200 answers are retried.
+func waitReady(url string, timeout time.Duration) error {
+	const pause = 250 * time.Millisecond
+	for deadline := time.Now().Add(timeout); time.Now().Before(deadline); time.Sleep(pause) {
+		_, code, err := getWithKey(url, "")
+		if err == nil && code == 200 {
+			return nil
+		}
+	}
+	return fmt.Errorf("timed out waiting for %s", url)
+}
+
+// waitForInterrupt registers for SIGINT and SIGTERM and returns the channel on
+// which the first such signal is delivered. The channel is buffered by one so
+// the notification is not lost if the caller is not receiving yet.
+func waitForInterrupt() <-chan os.Signal {
+	signals := make(chan os.Signal, 1)
+	signal.Notify(signals, os.Interrupt, syscall.SIGTERM)
+	return signals
+}
diff --git a/internal/firstrun/firstrun_test.go b/internal/firstrun/firstrun_test.go
new file mode 100644
index 0000000..bd81545
--- /dev/null
+++ b/internal/firstrun/firstrun_test.go
@@ -0,0 +1,31 @@
+package firstrun
+
+import (
+	"bytes"
+	"os/exec"
+	"testing"
+	"time"
+)
+
+// TestRunEndToEndOpensRealIncident runs the whole first-run flow in NoWait
+// mode and checks that the printed output contains the report hash line, the
+// failure code and the follow-up command hint. It is skipped when the go
+// toolchain is not on PATH.
+func TestRunEndToEndOpensRealIncident(t *testing.T) {
+	if _, lookErr := exec.LookPath("go"); lookErr != nil {
+		t.Skip("go toolchain required for source-checkout first-run")
+	}
+
+	var out bytes.Buffer
+	opts := Options{
+		Requests: 30,
+		Timeout:  90 * time.Second,
+		Stdout:   &out,
+		Stderr:   &out,
+		NoWait:   true,
+	}
+	if err := Run(opts); err != nil {
+		t.Fatalf("Run: %v\n--- output ---\n%s", err, out.String())
+	}
+
+	got := out.String()
+	for _, want := range []string{"report_hash", "PMT_502", "crux incidents"} {
+		if bytes.Contains([]byte(got), []byte(want)) {
+			continue
+		}
+		t.Fatalf("output missing %q\n%s", want, got)
+	}
+}
diff --git a/internal/firstrun/poll.go b/internal/firstrun/poll.go
new file mode 100644
index 0000000..2b81a29
--- /dev/null
+++ b/internal/firstrun/poll.go
@@ -0,0 +1,87 @@
+package firstrun
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+)
+
+type reportPoll struct { // inputs for waitForReport
+	IngestURL string        // base URL of the ingest server; API paths are appended to it
+	ReadKey   string        // bearer token for reads; "" sends no Authorization header
+	Timeout   time.Duration // how long to wait for the first active incident
+	Interval  time.Duration // pause between polls; <= 0 falls back to one second
+}
+
+type reportResult struct { // what waitForReport hands back
+	IncidentID string // id of the first incident listed by /v1/incidents/active
+	ReportHash string // report_hash field of the triage JSON; "" when the field is absent
+	Markdown   string // body of the markdown triage report
+}
+
+func getWithKey(url, key string) ([]byte, int, error) { // GET url -> (body, status code, error)
+	req, err := http.NewRequest(http.MethodGet, url, nil)
+	if err != nil {
+		return nil, 0, err
+	}
+	if key != "" { // an empty key means the request carries no Authorization header
+		req.Header.Set("Authorization", "Bearer "+key)
+	}
+	resp, err := burstHTTPClient.Do(req) // reuse the 5s-timeout client from burst.go
+	if err != nil {
+		return nil, 0, err
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body) // a failed or truncated read is reported, not dropped
+	return body, resp.StatusCode, err
+}
+
+// waitForReport polls /v1/incidents/active until an incident is listed, then returns
+// its triage hash and markdown; a non-200 or undecodable triage reply is an error.
+func waitForReport(p reportPoll) (reportResult, error) {
+	if p.Interval <= 0 {
+		p.Interval = time.Second
+	}
+	var incidentID string
+	for deadline := time.Now().Add(p.Timeout); time.Now().Before(deadline); time.Sleep(p.Interval) {
+		body, code, err := getWithKey(p.IngestURL+"/v1/incidents/active", p.ReadKey)
+		if err == nil && code == http.StatusOK {
+			var active struct {
+				Incidents []struct {
+					IncidentID string `json:"incident_id"`
+				} `json:"incidents"`
+			}
+			if json.Unmarshal(body, &active) == nil && len(active.Incidents) > 0 {
+				incidentID = active.Incidents[0].IncidentID
+				break
+			}
+		}
+	}
+	if incidentID == "" {
+		return reportResult{}, fmt.Errorf("no incident opened within %s", p.Timeout)
+	}
+	// /v1/triage/{id} returns pkgtriage.Report directly; report_hash is a top-level field.
+	// snapshot=true must match the markdown fetch below so the printed hash and
+	// the printed report describe the same frozen state across an engine tick.
+	jsonBody, code, err := getWithKey(p.IngestURL+"/v1/triage/"+incidentID+"?snapshot=true", p.ReadKey)
+	if err != nil {
+		return reportResult{}, fmt.Errorf("fetch triage json: %w", err)
+	}
+	if code != http.StatusOK { // an error page must not be parsed as a report
+		return reportResult{}, fmt.Errorf("fetch triage json: unexpected status %d", code)
+	}
+	var triage struct{ ReportHash string `json:"report_hash"` }
+	if err := json.Unmarshal(jsonBody, &triage); err != nil {
+		return reportResult{}, fmt.Errorf("decode triage json: %w", err)
+	}
+	mdBody, code, err := getWithKey(p.IngestURL+"/v1/triage/"+incidentID+"/report?format=markdown&snapshot=true", p.ReadKey)
+	if err != nil {
+		return reportResult{}, fmt.Errorf("fetch triage report: %w", err)
+	}
+	if code != http.StatusOK {
+		return reportResult{}, fmt.Errorf("fetch triage report: unexpected status %d", code)
+	}
+	return reportResult{IncidentID: incidentID, ReportHash: triage.ReportHash, Markdown: string(mdBody)}, nil
+}
diff --git a/internal/firstrun/poll_test.go b/internal/firstrun/poll_test.go
new file mode 100644
index 0000000..7160a72
--- /dev/null
+++ b/internal/firstrun/poll_test.go
@@ -0,0 +1,62 @@
+package firstrun
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+// TestWaitForIncidentThenReport serves an empty incident list once, then one incident,
+// and checks that waitForReport returns that incident's id, report hash and markdown.
+func TestWaitForIncidentThenReport(t *testing.T) {
+	activePolls := 0
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/v1/incidents/active":
+			activePolls++
+			if activePolls < 2 { // first poll sees nothing, forcing at least one retry
+				w.Write([]byte(`{"incidents":[]}`))
+				return
+			}
+			w.Write([]byte(`{"incidents":[{"incident_id":"inc_demo_1"}]}`))
+		case "/v1/triage/inc_demo_1":
+			// report_hash is at the top level of the triage report JSON
+			// (pkg/triage.Report has ReportHash string `json:"report_hash"` directly)
+			w.Write([]byte(`{"report_hash":"abc123","schema_version":"triage.v1","incident_ref":{"id":"inc_demo_1","window":"10m"},"confidence":"high","generated_at":"2026-01-01T00:00:00Z"}`))
+		case "/v1/triage/inc_demo_1/report":
+			w.Write([]byte("# Incident inc_demo_1\nreport_hash: abc123\n"))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	// The 10ms interval keeps the single retry fast; the 3s timeout only bounds a broken run.
+	poll := reportPoll{IngestURL: srv.URL, ReadKey: "", Timeout: 3 * time.Second, Interval: 10 * time.Millisecond}
+	got, err := waitForReport(poll)
+	if err != nil {
+		t.Fatalf("waitForReport: %v", err)
+	}
+	if got.IncidentID != "inc_demo_1" {
+		t.Fatalf("IncidentID = %q, want inc_demo_1", got.IncidentID)
+	}
+	if got.ReportHash != "abc123" {
+		t.Fatalf("ReportHash = %q, want abc123", got.ReportHash)
+	}
+	if got.Markdown == "" {
+		t.Fatal("Markdown report must be populated")
+	}
+}
+
+// TestWaitForIncidentTimesOut: a server that never lists an incident must yield an error.
+func TestWaitForIncidentTimesOut(t *testing.T) {
+	emptyList := func(w http.ResponseWriter, _ *http.Request) { w.Write([]byte(`{"incidents":[]}`)) }
+	srv := httptest.NewServer(http.HandlerFunc(emptyList))
+	defer srv.Close()
+	poll := reportPoll{IngestURL: srv.URL, Timeout: 100 * time.Millisecond, Interval: 10 * time.Millisecond}
+	if _, err := waitForReport(poll); err == nil {
+		t.Fatal("expected timeout error when no incident opens")
+	}
+}
diff --git a/internal/firstrun/server.go b/internal/firstrun/server.go
new file mode 100644
index 0000000..1c2b4d9
--- /dev/null
+++ b/internal/firstrun/server.go
@@ -0,0 +1,92 @@
+package firstrun
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+)
+
+// serverEnv builds the environment for the embedded demo ingest server: the
+// inherited environment minus every variable overridden below, followed by the
+// overrides. It mirrors scripts/demo.sh's no-login, fast-tick local profile and
+// points SQLite and event-log state at dataDir. SQLITE_PATH + EVENT_LOG_DIR are
+// mandatory: the incident engine only enables when SQLite is configured (cmd/ingest/main.go).
+func serverEnv(dataDir, writeKey, addr string) []string {
+	overrides := map[string]string{
+		"INGEST_ADDR":                   addr,
+		"WAYLOG_WRITE_KEY":              writeKey,
+		"WAYLOG_READ_KEY":               "",
+		"DASHBOARD_AUTH":                "off",
+		"WAYLOG_PROFILE":                "demo",
+		"WAYLOG_INCIDENT_TICK_INTERVAL": "5s",
+		"WAYLOG_INCIDENTS_ENABLED":      "true",
+		// Disable the OTLP gRPC listener: it binds a fixed port (:4317) that
+		// collides if anything else is already running, crashing the process at
+		// boot. The first-run burst uses the HTTP ingest path, which is unaffected.
+		"OTLP_ENABLED":     "false",
+		"SQLITE_PATH":      filepath.Join(dataDir, "crux.db"),
+		"EVENT_LOG_DIR":    filepath.Join(dataDir, "eventlog"),
+		"EVENT_LOG_V2_DIR": filepath.Join(dataDir, "eventlog-v2"),
+		"CORS_ORIGIN":      "*",
+	}
+	inherited := os.Environ()
+	env := make([]string, 0, len(inherited)+len(overrides))
+	for _, entry := range inherited {
+		eq := 0 // index of the first '=', or len(entry) when there is none
+		for eq < len(entry) && entry[eq] != '=' {
+			eq++
+		}
+		if _, replaced := overrides[entry[:eq]]; replaced && eq < len(entry) {
+			continue
+		}
+		env = append(env, entry)
+	}
+	// Overrides go last; map iteration leaves their relative order unspecified.
+	for name, value := range overrides {
+		env = append(env, name+"="+value)
+	}
+	return env
+}
+
+// locateServer picks the ingest server to launch and returns its command, args and
+// working dir ("" = inherit). An `ingest` file next to crux wins; otherwise, with Go on
+// PATH and a go.mod at or above the CWD whose dir has cmd/ingest, it is `go run ./cmd/ingest`.
+func locateServer(execDir string) (cmd string, args []string, dir string, err error) {
+	adjacent := filepath.Join(execDir, "ingest")
+	if info, statErr := os.Stat(adjacent); statErr == nil && !info.IsDir() {
+		return adjacent, nil, "", nil
+	}
+	notFound := fmt.Errorf("no ingest server found: expected %q next to crux, or run from a source checkout with Go installed", adjacent)
+	goBin, lookErr := exec.LookPath("go")
+	if lookErr != nil {
+		return "", nil, "", notFound
+	}
+	root := moduleRoot()
+	if root == "" {
+		return "", nil, "", notFound
+	}
+	if info, statErr := os.Stat(filepath.Join(root, "cmd", "ingest")); statErr != nil || !info.IsDir() {
+		return "", nil, "", notFound
+	}
+	return goBin, []string{"run", "./cmd/ingest"}, root, nil
+}
+
+// moduleRoot returns the nearest directory at or above the current working
+// directory that contains a go.mod file, or "" if there is none or Getwd fails.
+func moduleRoot() string {
+	start, err := os.Getwd()
+	if err != nil {
+		return ""
+	}
+	// next is always dir's parent. filepath.Dir of a root path is that same
+	// path, so next == dir means there is nothing left to climb to.
+	for dir, next := start, filepath.Dir(start); ; dir, next = next, filepath.Dir(next) {
+		if info, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil && !info.IsDir() {
+			return dir
+		}
+		if next == dir {
+			return ""
+		}
+	}
+}
diff --git a/internal/firstrun/server_test.go b/internal/firstrun/server_test.go
new file mode 100644
index 0000000..1906d3a
--- /dev/null
+++ b/internal/firstrun/server_test.go
@@ -0,0 +1,90 @@
+package firstrun
+
+import (
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// TestServerEnvEnablesIncidentsAndIsLocalOnly pins the demo overrides that serverEnv must emit.
+func TestServerEnvEnablesIncidentsAndIsLocalOnly(t *testing.T) {
+	dir := t.TempDir()
+	env := serverEnv(dir, "demo", "127.0.0.1:8099")
+	lookup := func(key string) string { // first value for key, "" when unset
+		for _, entry := range env {
+			if value, ok := strings.CutPrefix(entry, key+"="); ok {
+				return value
+			}
+		}
+		return ""
+	}
+	if lookup("SQLITE_PATH") == "" {
+		t.Fatal("SQLITE_PATH must be set so the incident engine is enabled")
+	}
+	if lookup("EVENT_LOG_DIR") == "" {
+		t.Fatal("EVENT_LOG_DIR must be set for durable ingest + incidents")
+	}
+	if tick := lookup("WAYLOG_INCIDENT_TICK_INTERVAL"); tick != "5s" {
+		t.Fatalf("tick interval = %q, want 5s for a fast demo", tick)
+	}
+	if auth := lookup("DASHBOARD_AUTH"); auth != "off" {
+		t.Fatalf("DASHBOARD_AUTH = %q, want off for no-login demo", auth)
+	}
+	if addr := lookup("INGEST_ADDR"); !strings.HasPrefix(addr, "127.0.0.1:") {
+		t.Fatalf("INGEST_ADDR = %q, want loopback-bound", addr)
+	}
+	if !strings.HasPrefix(lookup("SQLITE_PATH"), dir) {
+		t.Fatal("SQLITE_PATH must live under the throwaway dir")
+	}
+}
+
+// TestLocateServerPrefersAdjacentIngestBinary: a regular file named ingest in execDir wins.
+func TestLocateServerPrefersAdjacentIngestBinary(t *testing.T) {
+	want := filepath.Join(t.TempDir(), "ingest")
+	if err := os.WriteFile(want, []byte("#!/bin/sh\n"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	cmd, args, runDir, err := locateServer(filepath.Dir(want))
+	if err != nil {
+		t.Fatalf("locateServer: %v", err)
+	}
+	if cmd != want || len(args) != 0 || runDir != "" {
+		t.Fatalf("got cmd=%q args=%v dir=%q, want adjacent ingest binary with empty dir", cmd, args, runDir)
+	}
+}
+
+// TestLocateServerFindsModuleRootFromSubdir builds a fake module and runs from a
+// subdirectory of it: locateServer must fall back to `go run` rooted at the go.mod dir.
+func TestLocateServerFindsModuleRootFromSubdir(t *testing.T) {
+	if _, lookErr := exec.LookPath("go"); lookErr != nil {
+		t.Skip("go toolchain required")
+	}
+	// Fake module layout: /go.mod, /cmd/ingest/, /sub/.
+	root := t.TempDir()
+	sub := filepath.Join(root, "sub")
+	for _, d := range []string{filepath.Join(root, "cmd", "ingest"), sub} {
+		if err := os.MkdirAll(d, 0o755); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := os.WriteFile(filepath.Join(root, "go.mod"), []byte("module fake\n\ngo 1.24\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	t.Chdir(sub)
+	// execDir has no adjacent ingest binary, forcing the go-run fallback.
+	cmd, args, runDir, err := locateServer(t.TempDir())
+	if err != nil {
+		t.Fatalf("locateServer: %v", err)
+	}
+	if len(args) != 2 || args[0] != "run" || args[1] != "./cmd/ingest" {
+		t.Fatalf("got cmd=%q args=%v, want go run ./cmd/ingest", cmd, args)
+	}
+	// t.TempDir on macOS may resolve through /private symlinks; compare by EvalSymlinks.
+	wantRoot, _ := filepath.EvalSymlinks(root)
+	gotRoot, _ := filepath.EvalSymlinks(runDir)
+	if gotRoot != wantRoot {
+		t.Fatalf("runDir = %q, want module root %q", runDir, root)
+	}
+}
diff --git a/internal/graph/analysis/blast.go b/internal/graph/analysis/blast.go
deleted file mode 100644
index 6dfe8c0..0000000
--- a/internal/graph/analysis/blast.go
+++ /dev/null
@@ -1,96 +0,0 @@
-package analysis
-
-import (
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-// ServiceImpact represents how many requests a given service was affected by.
-type ServiceImpact struct {
-	Service string `json:"service"`
-	Count   int    `json:"count"`
-}
-
-// BlastResult holds the blast-radius computation for a specific error code.
-type BlastResult struct {
-	AffectedRequests int             `json:"affected_requests"`
-	AffectedUsers    int             `json:"affected_users"`
-	Services         []ServiceImpact `json:"services"`
-}
-
-// ComputeBlastRadius computes the blast radius of a specific error code within
-// the given time window. It works directly on a graph snapshot.
-func ComputeBlastRadius(g *core.Graph, errorCode string, start, end time.Time) BlastResult {
-	result := BlastResult{
-		Services: []ServiceImpact{},
-	}
-
-	// Single pass over edges: find matched requests and build request→user index.
-	matchedRequests := map[string]bool{}
-	reqUsers := map[string][]string{}
-	for _, e := range g.Edges {
-		switch e.Type {
-		case core.EdgeFailedWith:
-			fromNode, ok := g.Nodes[e.From]
-			if !ok || fromNode.Type != core.NodeRequest {
-				continue
-			}
-			errNode, ok := g.Nodes[e.To]
-			if !ok || errNode.Type != core.NodeError {
-				continue
-			}
-			code, _ := errNode.Attr["code"].(string)
-			if code != errorCode {
-				continue
-			}
-			if fromNode.LastSeen.Before(start) || fromNode.LastSeen.After(end) {
-				continue
-			}
-			matchedRequests[fromNode.ID] = true
-		case core.EdgeRequestBy:
-			reqUsers[e.From] = append(reqUsers[e.From], e.To)
-		}
-	}
-
-	result.AffectedRequests = len(matchedRequests)
-
-	// Count unique users and group by service.
-	users := map[string]bool{}
-	services := map[string]int{}
-
-	for reqID := range matchedRequests {
-		req := g.Nodes[reqID]
-		svc := core.ServiceFromNode(req)
-		if svc == "" && req.Attr != nil {
-			if name, ok := req.Attr["service"].(string); ok {
-				svc = name
-			}
-			if root, ok := req.Attr["root_service"].(string); ok && root != "" {
-				svc = root
-			}
-		}
-		if svc != "" {
-			services[svc]++
-		}
-		if req.Attr != nil {
-			if userID, ok := req.Attr["user_id"].(string); ok && userID != "" {
-				users[userID] = true
-			}
-		}
-		for _, uid := range reqUsers[reqID] {
-			users[uid] = true
-		}
-	}
-
-	result.AffectedUsers = len(users)
-
-	for svc, count := range services {
-		result.Services = append(result.Services, ServiceImpact{
-			Service: svc,
-			Count:   count,
-		})
-	}
-
-	return result
-}
diff --git a/internal/graph/analysis/blast_test.go b/internal/graph/analysis/blast_test.go
deleted file mode 100644
index fb851e0..0000000
--- a/internal/graph/analysis/blast_test.go
+++ /dev/null
@@ -1,102 +0,0 @@
-package analysis
-
-import (
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func TestBlastRadius_BasicImpact(t *testing.T) {
-	st := graphstore.NewStore()
-	b := build.NewBuilder()
-	now := time.Now()
-
-	ev := testutil.MakeEvent(
-		testutil.WithService("payment"),
-		testutil.WithError("PMT_502", "bad gateway"),
-		testutil.WithStatusCode(502),
-		testutil.WithTimestamp(now),
-	)
-	st.Merge(b.Build(ev))
-
-	snap := st.Snapshot()
-	result := ComputeBlastRadius(snap, "PMT_502", now.Add(-1*time.Hour), now.Add(time.Second))
-
-	if result.AffectedRequests < 1 {
-		t.Errorf("expected at least 1 affected request, got %d", result.AffectedRequests)
-	}
-	if result.AffectedUsers < 1 {
-		t.Errorf("expected at least 1 affected user, got %d", result.AffectedUsers)
-	}
-	if result.Services == nil {
-		t.Fatal("Services slice must never be nil")
-	}
-	if len(result.Services) < 1 {
-		t.Errorf("expected at least 1 service, got %d", len(result.Services))
-	}
-
-	// Verify service name is "payment"
-	found := false
-	for _, s := range result.Services {
-		if strings.Contains(s.Service, "payment") {
-			found = true
-			break
-		}
-	}
-	if !found {
-		t.Errorf("expected payment service in results, got %v", result.Services)
-	}
-}
-
-func TestBlastRadius_ZeroResults(t *testing.T) {
-	t.Run("empty graph", func(t *testing.T) {
-		st := graphstore.NewStore()
-		snap := st.Snapshot()
-		now := time.Now()
-		result := ComputeBlastRadius(snap, "NONEXISTENT", now.Add(-1*time.Hour), now)
-
-		if result.AffectedRequests != 0 {
-			t.Errorf("expected 0 affected requests, got %d", result.AffectedRequests)
-		}
-		if result.AffectedUsers != 0 {
-			t.Errorf("expected 0 affected users, got %d", result.AffectedUsers)
-		}
-		if result.Services == nil {
-			t.Fatal("Services slice must never be nil, even when empty")
-		}
-		if len(result.Services) != 0 {
-			t.Errorf("expected empty services, got %v", result.Services)
-		}
-	})
-
-	t.Run("nonexistent error code", func(t *testing.T) {
-		st := graphstore.NewStore()
-		b := build.NewBuilder()
-		now := time.Now()
-
-		ev := testutil.MakeEvent(
-			testutil.WithService("payment"),
-			testutil.WithError("PMT_502", "bad gateway"),
-			testutil.WithStatusCode(502),
-			testutil.WithTimestamp(now),
-		)
-		st.Merge(b.Build(ev))
-
-		snap := st.Snapshot()
-		result := ComputeBlastRadius(snap, "NONEXISTENT", now.Add(-1*time.Hour), now.Add(time.Second))
-
-		if result.AffectedRequests != 0 {
-			t.Errorf("expected 0 affected requests, got %d", result.AffectedRequests)
-		}
-		if result.AffectedUsers != 0 {
-			t.Errorf("expected 0 affected users, got %d", result.AffectedUsers)
-		}
-		if result.Services == nil {
-			t.Fatal("Services slice must never be nil")
-		}
-	})
-}
diff --git a/internal/graph/analysis/diff.go b/internal/graph/analysis/diff.go
deleted file mode 100644
index dba4249..0000000
--- a/internal/graph/analysis/diff.go
+++ /dev/null
@@ -1,148 +0,0 @@
-package analysis
-
-import "github.com/sssmaran/WaylogCLI/internal/graph/store"
-
-// DiffRollups is the canonical diff for default user-facing comparisons.
-// It consumes root-cause-counted [RollupSummary]s so compare_windows /
-// anomaly detection / overview deltas never re-introduce propagation
-// amplification. DiffSummaries remains for detail surfaces that need
-// propagation-counted spread.
-func DiffRollups(before, after RollupSummary) WindowDiff {
-	out := WindowDiff{
-		TotalRequestsBefore: before.TotalRequests,
-		TotalRequestsAfter:  after.TotalRequests,
-		TotalFailuresBefore: before.TotalFailures,
-		TotalFailuresAfter:  after.TotalFailures,
-		LatencyP50Before:    before.LatencyP50,
-		LatencyP50After:     after.LatencyP50,
-		LatencyP95Before:    before.LatencyP95,
-		LatencyP95After:     after.LatencyP95,
-		LatencyP99Before:    before.LatencyP99,
-		LatencyP99After:     after.LatencyP99,
-	}
-	seen := map[string]bool{}
-	for code, afterCount := range after.PrimaryErrorCount {
-		seen[code] = true
-		beforeCount := before.PrimaryErrorCount[code]
-		switch {
-		case beforeCount == 0 && afterCount > 0:
-			out.New = append(out.New, DiffEntry{
-				ErrorCode: code,
-				After:     afterCount,
-				Delta:     afterCount,
-			})
-		case afterCount > beforeCount:
-			out.Increased = append(out.Increased, DiffEntry{
-				ErrorCode: code,
-				Before:    beforeCount,
-				After:     afterCount,
-				Delta:     afterCount - beforeCount,
-			})
-		case afterCount < beforeCount:
-			out.Decreased = append(out.Decreased, DiffEntry{
-				ErrorCode: code,
-				Before:    beforeCount,
-				After:     afterCount,
-				Delta:     afterCount - beforeCount,
-			})
-		}
-	}
-	for code, beforeCount := range before.PrimaryErrorCount {
-		if seen[code] {
-			continue
-		}
-		out.Removed = append(out.Removed, DiffEntry{
-			ErrorCode: code,
-			Before:    beforeCount,
-			Delta:     -beforeCount,
-		})
-	}
-	return out
-}
-
-type DiffEntry struct {
-	ErrorCode string
-	Before    int
-	After     int
-	Delta     int
-}
-
-type WindowDiff struct {
-	New       []DiffEntry
-	Removed   []DiffEntry
-	Increased []DiffEntry
-	Decreased []DiffEntry
-
-	TotalRequestsBefore int
-	TotalRequestsAfter  int
-	TotalFailuresBefore int
-	TotalFailuresAfter  int
-	LatencyP50Before    int64
-	LatencyP50After     int64
-	LatencyP95Before    int64
-	LatencyP95After     int64
-	LatencyP99Before    int64
-	LatencyP99After     int64
-}
-
-func DiffSummaries(before, after store.WindowSummary) WindowDiff {
-	out := WindowDiff{
-		TotalRequestsBefore: before.TotalRequests,
-		TotalRequestsAfter:  after.TotalRequests,
-		TotalFailuresBefore: before.TotalFailures,
-		TotalFailuresAfter:  after.TotalFailures,
-		LatencyP50Before:    before.LatencyP50,
-		LatencyP50After:     after.LatencyP50,
-		LatencyP95Before:    before.LatencyP95,
-		LatencyP95After:     after.LatencyP95,
-		LatencyP99Before:    before.LatencyP99,
-		LatencyP99After:     after.LatencyP99,
-	}
-
-	seen := map[string]bool{}
-
-	// Errors present in "after"
-	for err, afterCount := range after.ErrorCount {
-		seen[err] = true
-		beforeCount := before.ErrorCount[err]
-
-		switch {
-		case beforeCount == 0 && afterCount > 0:
-			out.New = append(out.New, DiffEntry{
-				ErrorCode: err,
-				After:     afterCount,
-				Delta:     afterCount,
-			})
-
-		case afterCount > beforeCount:
-			out.Increased = append(out.Increased, DiffEntry{
-				ErrorCode: err,
-				Before:    beforeCount,
-				After:     afterCount,
-				Delta:     afterCount - beforeCount,
-			})
-
-		case afterCount < beforeCount:
-			out.Decreased = append(out.Decreased, DiffEntry{
-				ErrorCode: err,
-				Before:    beforeCount,
-				After:     afterCount,
-				Delta:     afterCount - beforeCount,
-			})
-		}
-	}
-
-	// Errors that disappeared
-	for err, beforeCount := range before.ErrorCount {
-		if seen[err] {
-			continue
-		}
-		out.Removed = append(out.Removed, DiffEntry{
-			ErrorCode: err,
-			Before:    beforeCount,
-			Delta:     -beforeCount,
-		})
-	}
-
-	return out
-}
diff --git a/internal/graph/analysis/diff_test.go b/internal/graph/analysis/diff_test.go
deleted file mode 100644
index 40059d9..0000000
--- a/internal/graph/analysis/diff_test.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package analysis
-
-import (
-	"testing"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-func TestDiffSummaries_RequestAndLatency(t *testing.T) {
-	before := store.WindowSummary{
-		TotalRequests: 100,
-		TotalFailures: 10,
-		LatencyP50:    50,
-		LatencyP95:    200,
-		LatencyP99:    500,
-		ErrorCount:    map[string]int{"err1": 5},
-	}
-	after := store.WindowSummary{
-		TotalRequests: 150,
-		TotalFailures: 25,
-		LatencyP50:    60,
-		LatencyP95:    250,
-		LatencyP99:    600,
-		ErrorCount:    map[string]int{"err1": 10, "err2": 3},
-	}
-
-	diff := DiffSummaries(before, after)
-
-	if diff.TotalRequestsBefore != 100 || diff.TotalRequestsAfter != 150 {
-		t.Errorf("TotalRequests before/after = %d/%d, want 100/150",
-			diff.TotalRequestsBefore, diff.TotalRequestsAfter)
-	}
-	if diff.TotalFailuresBefore != 10 || diff.TotalFailuresAfter != 25 {
-		t.Errorf("TotalFailures before/after = %d/%d, want 10/25",
-			diff.TotalFailuresBefore, diff.TotalFailuresAfter)
-	}
-	if diff.LatencyP50Before != 50 || diff.LatencyP50After != 60 {
-		t.Errorf("LatencyP50 before/after = %d/%d, want 50/60",
-			diff.LatencyP50Before, diff.LatencyP50After)
-	}
-	if diff.LatencyP95Before != 200 || diff.LatencyP95After != 250 {
-		t.Errorf("LatencyP95 before/after = %d/%d, want 200/250",
-			diff.LatencyP95Before, diff.LatencyP95After)
-	}
-	if diff.LatencyP99Before != 500 || diff.LatencyP99After != 600 {
-		t.Errorf("LatencyP99 before/after = %d/%d, want 500/600",
-			diff.LatencyP99Before, diff.LatencyP99After)
-	}
-
-	// err1 should be in Increased
-	if len(diff.Increased) != 1 || diff.Increased[0].ErrorCode != "err1" {
-		t.Errorf("Increased = %+v, want [{err1 5 10 5}]", diff.Increased)
-	}
-	// err2 should be in New
-	if len(diff.New) != 1 || diff.New[0].ErrorCode != "err2" {
-		t.Errorf("New = %+v, want [{err2 0 3 3}]", diff.New)
-	}
-}
diff --git a/internal/graph/analysis/explain.go b/internal/graph/analysis/explain.go
deleted file mode 100644
index 8f4d0e0..0000000
--- a/internal/graph/analysis/explain.go
+++ /dev/null
@@ -1,419 +0,0 @@
-package analysis
-
-import (
-	"fmt"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-// SpanSummary represents a span in the error propagation chain.
-type SpanSummary struct {
-	SpanID    string `json:"span_id"`
-	Service   string `json:"service"`
-	ErrorCode string `json:"error_code,omitempty"`
-	LatencyMs any    `json:"latency_ms,omitempty"`
-	Depth     int    `json:"depth"`
-}
-
-// Explanation is a deterministic, structured view of why a request failed.
-// It is a projection of graph structure — not inference.
-type Explanation struct {
-	RequestID string
-
-	LatencyMs any
-	Flow      any
-
-	UserID   string
-	UserTier any
-
-	FeatureFlags []string
-
-	//span-aware attributes
-	SpanID      string
-	SpanService any
-	SpanDepth   string // "root" | "child"
-
-	Service any
-
-	ErrorCode any
-	ErrorMsg  any
-
-	SpanChain []SpanSummary
-}
-
-// ExplainRequest reconstructs failure context for a request node using the
-// legacy graph path. Prefer ExplainRequestWithTrace when a trace store is
-// available.
-func ExplainRequest(g *core.Graph, requestID string) (Explanation, error) {
-	return ExplainRequestWithTrace(g, nil, requestID)
-}
-
-// ExplainRequestWithTrace reconstructs failure context from the graph plus an
-// optional trace store. When trace data is available, the span chain and root
-// cause are sourced from the flat trace record rather than graph span nodes.
-func ExplainRequestWithTrace(g *core.Graph, traceStore *tracestore.Store, requestID string) (Explanation, error) {
-	req, ok := g.Nodes[requestID]
-	if !ok {
-		return Explanation{}, fmt.Errorf("request node not found: %s", requestID)
-	}
-
-	ex := Explanation{
-		RequestID: requestID,
-	}
-
-	// ---- request attributes ----
-	if req.Attr != nil {
-		ex.LatencyMs = req.Attr["latency_ms"]
-		ex.Flow = req.Attr["flow"]
-	}
-
-	if traceStore != nil {
-		traceID := ""
-		if req.Attr != nil {
-			traceID, _ = req.Attr["trace_id"].(string)
-		}
-		if rec, ok := traceStore.Get(traceID); ok {
-			if traced, ok := explainFromTraceRecord(g, req, requestID, rec); ok {
-				return traced, nil
-			}
-		}
-	}
-
-	// ---- graph span/error fallback ----
-	if ex, ok := explainFromGraph(g, requestID); ok {
-		return ex, nil
-	}
-
-	populateUserFlagsService(g, requestID, &ex)
-	return ex, nil
-}
-
-func explainFromTraceRecord(g *core.Graph, req core.Node, requestID string, rec *tracestore.TraceRecord) (Explanation, bool) {
-	if rec == nil || len(rec.Spans) == 0 {
-		return Explanation{}, false
-	}
-	ex := Explanation{RequestID: requestID}
-	if req.Attr != nil {
-		ex.LatencyMs = req.Attr["latency_ms"]
-		ex.Flow = req.Attr["flow"]
-	}
-
-	spans := map[string]tracestore.SpanRecord{}
-	parentOf := map[string]string{}
-	for _, span := range rec.Spans {
-		if span.SpanID == "" {
-			continue
-		}
-		spans[span.SpanID] = span
-		if span.ParentSpanID != "" {
-			parentOf[span.SpanID] = span.ParentSpanID
-		}
-	}
-	if len(spans) == 0 {
-		return Explanation{}, false
-	}
-
-	depthCache := computeSpanDepths(spans, parentOf)
-
-	candidates := make([]rootCauseCandidate, 0, len(spans))
-	for id, span := range spans {
-		if span.Success || span.ErrorCode == "" {
-			continue
-		}
-		candidates = append(candidates, rootCauseCandidate{id: id, depth: depthCache[id], ts: span.Timestamp})
-	}
-	rootCauseID := pickRootCauseCandidate(candidates)
-	rootCauseDepth := depthCache[rootCauseID]
-	if rootCauseID == "" {
-		for _, e := range g.OutEdges[requestID] {
-			if e.Type == core.EdgeFailedWith {
-				errNode := g.Nodes[e.To]
-				if errNode.Attr != nil {
-					ex.ErrorCode = errNode.Attr["code"]
-					ex.ErrorMsg = errNode.Attr["message"]
-				}
-				populateUserFlagsService(g, requestID, &ex)
-				return ex, true
-			}
-		}
-		return Explanation{}, false
-	}
-
-	rc := spans[rootCauseID]
-	ex.ErrorCode = rc.ErrorCode
-	ex.ErrorMsg = rc.ErrorMessage
-	ex.SpanID = rootCauseID
-	if rootCauseDepth > 0 {
-		ex.SpanDepth = "child"
-	} else {
-		ex.SpanDepth = "root"
-	}
-	ex.SpanService = rc.Service
-	if req.Attr != nil {
-		ex.Service = stringAttr(req.Attr["service"])
-	}
-
-	cur := rootCauseID
-	visited := map[string]bool{}
-	for cur != "" && !visited[cur] {
-		visited[cur] = true
-		span := spans[cur]
-		ss := SpanSummary{
-			SpanID:    cur,
-			Service:   span.Service,
-			ErrorCode: span.ErrorCode,
-			LatencyMs: span.LatencyMs,
-			Depth:     depthCache[cur],
-		}
-		ex.SpanChain = append(ex.SpanChain, ss)
-		cur = parentOf[cur]
-	}
-	populateUserFlagsService(g, requestID, &ex)
-	return ex, true
-}
-
-func explainFromGraph(g *core.Graph, requestID string) (Explanation, bool) {
-	req, ok := g.Nodes[requestID]
-	if !ok {
-		return Explanation{}, false
-	}
-
-	ex := Explanation{
-		RequestID: requestID,
-	}
-	if req.Attr != nil {
-		ex.LatencyMs = req.Attr["latency_ms"]
-		ex.Flow = req.Attr["flow"]
-	}
-
-	type spanInfo struct {
-		node    core.Node
-		errNode *core.Node
-	}
-	spans := map[string]*spanInfo{}
-
-	for _, e := range g.OutEdges[requestID] {
-		if e.Type != core.EdgeRequestHasSpan {
-			continue
-		}
-		spanNode, ok := g.Nodes[e.To]
-		if !ok || spanNode.Type != core.NodeSpan {
-			continue
-		}
-		si := &spanInfo{node: spanNode}
-		for _, se := range g.OutEdges[spanNode.ID] {
-			if se.Type == core.EdgeFailedWith {
-				en := g.Nodes[se.To]
-				si.errNode = &en
-				break
-			}
-		}
-		spans[spanNode.ID] = si
-	}
-
-	if len(spans) == 0 {
-		for _, e := range g.OutEdges[requestID] {
-			if e.Type == core.EdgeFailedWith {
-				errNode := g.Nodes[e.To]
-				if errNode.Attr != nil {
-					ex.ErrorCode = errNode.Attr["code"]
-					ex.ErrorMsg = errNode.Attr["message"]
-				}
-				populateUserFlagsService(g, requestID, &ex)
-				return ex, true
-			}
-		}
-		return Explanation{}, false
-	}
-
-	parentOf := map[string]string{}
-	for spanID := range spans {
-		for _, e := range g.OutEdges[spanID] {
-			if e.Type == core.EdgeSpanChildOf {
-				if _, ok := spans[e.To]; ok {
-					parentOf[spanID] = e.To
-				}
-				break
-			}
-		}
-		if _, found := parentOf[spanID]; !found {
-			if psid, ok := spans[spanID].node.Attr["parent_span_id"].(string); ok && psid != "" {
-				traceID, _ := spans[spanID].node.Attr["trace_id"].(string)
-				if traceID != "" {
-					parentNodeID := core.ID("span", traceID, psid)
-					if _, ok := spans[parentNodeID]; ok {
-						parentOf[spanID] = parentNodeID
-					}
-				}
-			}
-		}
-	}
-
-	depthCache := computeSpanDepths(spans, parentOf)
-
-	candidates := make([]rootCauseCandidate, 0, len(spans))
-	for id, si := range spans {
-		if si.errNode == nil {
-			continue
-		}
-		candidates = append(candidates, rootCauseCandidate{id: id, depth: depthCache[id], ts: spanTimestamp(si.node)})
-	}
-	rootCauseID := pickRootCauseCandidate(candidates)
-	rootCauseDepth := depthCache[rootCauseID]
-
-	if rootCauseID == "" {
-		for _, e := range g.OutEdges[requestID] {
-			if e.Type == core.EdgeFailedWith {
-				errNode := g.Nodes[e.To]
-				if errNode.Attr != nil {
-					ex.ErrorCode = errNode.Attr["code"]
-					ex.ErrorMsg = errNode.Attr["message"]
-				}
-				populateUserFlagsService(g, requestID, &ex)
-				return ex, true
-			}
-		}
-		return Explanation{}, false
-	}
-	si := spans[rootCauseID]
-	if si.errNode != nil && si.errNode.Attr != nil {
-		ex.ErrorCode = si.errNode.Attr["code"]
-		ex.ErrorMsg = si.errNode.Attr["message"]
-	}
-	ex.SpanID = rootCauseID
-	if rootCauseDepth > 0 {
-		ex.SpanDepth = "child"
-	} else {
-		ex.SpanDepth = "root"
-	}
-	ex.SpanService = spanServiceName(g, rootCauseID)
-
-	var chain []SpanSummary
-	cur := rootCauseID
-	visited := map[string]bool{}
-	for cur != "" && !visited[cur] {
-		visited[cur] = true
-		si := spans[cur]
-		ss := SpanSummary{
-			SpanID:  cur,
-			Service: spanServiceNameStr(g, cur),
-			Depth:   depthCache[cur],
-		}
-		if si.node.Attr != nil {
-			ss.LatencyMs = si.node.Attr["latency_ms"]
-		}
-		if si.errNode != nil && si.errNode.Attr != nil {
-			ss.ErrorCode, _ = si.errNode.Attr["code"].(string)
-		}
-		chain = append(chain, ss)
-		cur = parentOf[cur]
-	}
-	ex.SpanChain = chain
-	populateUserFlagsService(g, requestID, &ex)
-	return ex, true
-}
-
-func stringAttr(v any) string {
-	if s, ok := v.(string); ok {
-		return s
-	}
-	return ""
-}
-
-func populateUserFlagsService(g *core.Graph, requestID string, ex *Explanation) {
-	req, _ := g.Nodes[requestID]
-	if req.Attr != nil {
-		if uid, ok := req.Attr["user_id"].(string); ok && uid != "" {
-			ex.UserID = uid
-		}
-		if tier, ok := req.Attr["user_tier"].(string); ok && tier != "" {
-			ex.UserTier = tier
-		}
-		if flags := requestFeatureFlagsFromNode(req); len(flags) > 0 {
-			ex.FeatureFlags = append(ex.FeatureFlags, flags...)
-		}
-		if svc, ok := req.Attr["root_service"].(string); ok && svc != "" {
-			ex.Service = svc
-		} else if svc, ok := req.Attr["service"].(string); ok && svc != "" {
-			ex.Service = svc
-		}
-	}
-	for _, e := range g.OutEdges[requestID] {
-		switch e.Type {
-		case core.EdgeRequestBy:
-			u := g.Nodes[e.To]
-			ex.UserID = u.ID
-			if u.Attr != nil {
-				ex.UserTier = u.Attr["tier"]
-			}
-		case core.EdgeUsedFlag:
-			flagNode := g.Nodes[e.To]
-			if flagNode.Attr != nil {
-				if name, ok := flagNode.Attr["name"].(string); ok {
-					ex.FeatureFlags = append(ex.FeatureFlags, name)
-				}
-			}
-		case core.EdgeHandledBy:
-			svcNode := g.Nodes[e.To]
-			if svcNode.Attr != nil {
-				ex.Service = svcNode.Attr["name"]
-			}
-		}
-	}
-}
-
-func requestFeatureFlagsFromNode(req core.Node) []string {
-	if req.Attr == nil {
-		return nil
-	}
-	if flags, ok := req.Attr["feature_flags"].([]string); ok {
-		return append([]string(nil), flags...)
-	}
-	if flags, ok := req.Attr["feature_flags"].([]any); ok {
-		out := make([]string, 0, len(flags))
-		for _, item := range flags {
-			if name, ok := item.(string); ok && name != "" {
-				out = append(out, name)
-			}
-		}
-		return out
-	}
-	return nil
-}
-
-func spanServiceName(g *core.Graph, spanID string) any {
-	for _, e := range g.OutEdges[spanID] {
-		if e.Type == core.EdgeSpanOnService {
-			svc := g.Nodes[e.To]
-			if svc.Attr != nil {
-				return svc.Attr["name"]
-			}
-		}
-	}
-	return nil
-}
-
-func spanServiceNameStr(g *core.Graph, spanID string) string {
-	v := spanServiceName(g, spanID)
-	if s, ok := v.(string); ok {
-		return s
-	}
-	return ""
-}
-
-func spanTimestamp(n core.Node) time.Time {
-	if n.Attr == nil {
-		return time.Time{}
-	}
-	switch v := n.Attr["timestamp"].(type) {
-	case time.Time:
-		return v
-	case string:
-		if t, err := time.Parse(time.RFC3339Nano, v); err == nil {
-			return t
-		}
-	}
-	return time.Time{}
-}
diff --git a/internal/graph/analysis/explain_test.go b/internal/graph/analysis/explain_test.go
deleted file mode 100644
index 602895b..0000000
--- a/internal/graph/analysis/explain_test.go
+++ /dev/null
@@ -1,227 +0,0 @@
-package analysis
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func TestExplainRequest_DeepestSpanRootCause(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	reqID := core.ID("request", traceID)
-	rootSpanID := core.ID("span", traceID, "aaaaaaaaaaaaaaaa")
-	childSpanID := core.ID("span", traceID, "bbbbbbbbbbbbbbbb")
-	grandchildSpanID := core.ID("span", traceID, "cccccccccccccccc")
-
-	errShallow := core.ID("error", "SHALLOW_ERR")
-	errDeep := core.ID("error", "DEEP_ERR")
-	svcID := core.ID("service", "payment")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testutil.MakeNode(reqID, core.NodeRequest, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(rootSpanID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(childSpanID, core.NodeSpan, map[string]any{"trace_id": traceID, "parent_span_id": "aaaaaaaaaaaaaaaa"}),
-			testutil.MakeNode(grandchildSpanID, core.NodeSpan, map[string]any{"trace_id": traceID, "parent_span_id": "bbbbbbbbbbbbbbbb"}),
-			testutil.MakeNode(errShallow, core.NodeError, map[string]any{"code": "SHALLOW_ERR", "message": "shallow"}),
-			testutil.MakeNode(errDeep, core.NodeError, map[string]any{"code": "DEEP_ERR", "message": "deep"}),
-			testutil.MakeNode(svcID, core.NodeService, map[string]any{"name": "payment"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: rootSpanID, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: childSpanID, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: grandchildSpanID, Type: core.EdgeRequestHasSpan},
-			// Parent chain via EdgeSpanChildOf
-			{From: childSpanID, To: rootSpanID, Type: core.EdgeSpanChildOf},
-			{From: grandchildSpanID, To: childSpanID, Type: core.EdgeSpanChildOf},
-			// Errors on root and grandchild spans
-			{From: rootSpanID, To: errShallow, Type: core.EdgeFailedWith},
-			{From: grandchildSpanID, To: errDeep, Type: core.EdgeFailedWith},
-			// Service on grandchild
-			{From: grandchildSpanID, To: svcID, Type: core.EdgeSpanOnService},
-		},
-	)
-
-	ex, err := ExplainRequest(g, reqID)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if ex.ErrorCode != "DEEP_ERR" {
-		t.Errorf("ErrorCode = %v, want DEEP_ERR", ex.ErrorCode)
-	}
-	if ex.SpanID != grandchildSpanID {
-		t.Errorf("SpanID = %s, want %s", ex.SpanID, grandchildSpanID)
-	}
-	if ex.SpanDepth != "child" {
-		t.Errorf("SpanDepth = %s, want child", ex.SpanDepth)
-	}
-	if ex.SpanService != "payment" {
-		t.Errorf("SpanService = %v, want payment", ex.SpanService)
-	}
-
-	// SpanChain should go grandchild → child → root
-	if len(ex.SpanChain) != 3 {
-		t.Fatalf("SpanChain len = %d, want 3", len(ex.SpanChain))
-	}
-	if ex.SpanChain[0].SpanID != grandchildSpanID {
-		t.Errorf("SpanChain[0].SpanID = %s, want grandchild", ex.SpanChain[0].SpanID)
-	}
-	if ex.SpanChain[0].Depth != 2 {
-		t.Errorf("SpanChain[0].Depth = %d, want 2", ex.SpanChain[0].Depth)
-	}
-	if ex.SpanChain[1].SpanID != childSpanID {
-		t.Errorf("SpanChain[1].SpanID = %s, want child", ex.SpanChain[1].SpanID)
-	}
-	if ex.SpanChain[2].SpanID != rootSpanID {
-		t.Errorf("SpanChain[2].SpanID = %s, want root", ex.SpanChain[2].SpanID)
-	}
-}
-
-func TestExplainRequest_SameDepthPreferEarlier(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	reqID := core.ID("request", traceID)
-	span1ID := core.ID("span", traceID, "aaaaaaaaaaaaaaaa")
-	span2ID := core.ID("span", traceID, "bbbbbbbbbbbbbbbb")
-	err1ID := core.ID("error", "ERR_LATE")
-	err2ID := core.ID("error", "ERR_EARLY")
-
-	later := time.Date(2026, 1, 1, 12, 0, 1, 0, time.UTC)
-	earlier := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC)
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testutil.MakeNode(reqID, core.NodeRequest, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(span1ID, core.NodeSpan, map[string]any{"trace_id": traceID, "timestamp": later}),
-			testutil.MakeNode(span2ID, core.NodeSpan, map[string]any{"trace_id": traceID, "timestamp": earlier}),
-			testutil.MakeNode(err1ID, core.NodeError, map[string]any{"code": "ERR_LATE"}),
-			testutil.MakeNode(err2ID, core.NodeError, map[string]any{"code": "ERR_EARLY"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: span1ID, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: span2ID, Type: core.EdgeRequestHasSpan},
-			{From: span1ID, To: err1ID, Type: core.EdgeFailedWith},
-			{From: span2ID, To: err2ID, Type: core.EdgeFailedWith},
-		},
-	)
-
-	ex, err := ExplainRequest(g, reqID)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if ex.ErrorCode != "ERR_EARLY" {
-		t.Errorf("ErrorCode = %v, want ERR_EARLY (earlier timestamp wins tiebreak)", ex.ErrorCode)
-	}
-}
-
-func TestExplainRequest_NoSpanErrorFallsBackToRequest(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	reqID := core.ID("request", traceID)
-	spanID := core.ID("span", traceID, "aaaaaaaaaaaaaaaa")
-	errID := core.ID("error", "REQ_ERR")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testutil.MakeNode(reqID, core.NodeRequest, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(spanID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(errID, core.NodeError, map[string]any{"code": "REQ_ERR", "message": "request level error"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: spanID, Type: core.EdgeRequestHasSpan},
-			// Span has no error edge — only request has the error
-			{From: reqID, To: errID, Type: core.EdgeFailedWith},
-		},
-	)
-
-	ex, err := ExplainRequest(g, reqID)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if ex.ErrorCode != "REQ_ERR" {
-		t.Errorf("ErrorCode = %v, want REQ_ERR", ex.ErrorCode)
-	}
-	if ex.SpanID != "" {
-		t.Errorf("SpanID = %s, want empty (fallback to request-level error)", ex.SpanID)
-	}
-	if ex.SpanChain != nil {
-		t.Errorf("SpanChain = %v, want nil", ex.SpanChain)
-	}
-}
-
-func TestExplainRequest_CyclicSpanParentsNoPanic(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	reqID := core.ID("request", traceID)
-	spanAID := core.ID("span", traceID, "aaaaaaaaaaaaaaaa")
-	spanBID := core.ID("span", traceID, "bbbbbbbbbbbbbbbb")
-	errID := core.ID("error", "CYCLE_ERR")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testutil.MakeNode(reqID, core.NodeRequest, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(spanAID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(spanBID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(errID, core.NodeError, map[string]any{"code": "CYCLE_ERR", "message": "err"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: spanAID, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: spanBID, Type: core.EdgeRequestHasSpan},
-			// Cycle: A → B → A
-			{From: spanAID, To: spanBID, Type: core.EdgeSpanChildOf},
-			{From: spanBID, To: spanAID, Type: core.EdgeSpanChildOf},
-			{From: spanAID, To: errID, Type: core.EdgeFailedWith},
-		},
-	)
-
-	// Must not panic or infinite-loop
-	ex, err := ExplainRequest(g, reqID)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if ex.ErrorCode != "CYCLE_ERR" {
-		t.Errorf("ErrorCode = %v, want CYCLE_ERR", ex.ErrorCode)
-	}
-}
-
-func TestExplainRequest_SameDepthNoTimestampDeterministic(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	reqID := core.ID("request", traceID)
-	span1ID := core.ID("span", traceID, "aaaaaaaaaaaaaaaa")
-	span2ID := core.ID("span", traceID, "bbbbbbbbbbbbbbbb")
-	err1ID := core.ID("error", "ERR_1")
-	err2ID := core.ID("error", "ERR_2")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testutil.MakeNode(reqID, core.NodeRequest, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(span1ID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(span2ID, core.NodeSpan, map[string]any{"trace_id": traceID}),
-			testutil.MakeNode(err1ID, core.NodeError, map[string]any{"code": "ERR_1"}),
-			testutil.MakeNode(err2ID, core.NodeError, map[string]any{"code": "ERR_2"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: span1ID, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: span2ID, Type: core.EdgeRequestHasSpan},
-			{From: span1ID, To: err1ID, Type: core.EdgeFailedWith},
-			{From: span2ID, To: err2ID, Type: core.EdgeFailedWith},
-		},
-	)
-
-	// Run 20 times — must always pick the same winner (lexicographically smaller span ID)
-	var first string
-	for i := 0; i < 20; i++ {
-		ex, err := ExplainRequest(g, reqID)
-		if err != nil {
-			t.Fatal(err)
-		}
-		code, _ := ex.ErrorCode.(string)
-		if i == 0 {
-			first = code
-		} else if code != first {
-			t.Fatalf("nondeterministic: iteration %d got %s, want %s", i, code, first)
-		}
-	}
-}
diff --git a/internal/graph/analysis/patterns.go b/internal/graph/analysis/patterns.go
deleted file mode 100644
index dcb60bf..0000000
--- a/internal/graph/analysis/patterns.go
+++ /dev/null
@@ -1,139 +0,0 @@
-package analysis
-
-import (
-	"fmt"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-// FailurePattern represents a recurring failure shape in the system.
-type FailurePattern struct {
-	ErrorCode    string   `json:"error_code"`
-	Flow         string   `json:"flow"`
-	UserTier     string   `json:"user_tier"`
-	FeatureFlags []string `json:"feature_flags"`
-	Count        int      `json:"count"`
-}
-
-// DetectFailurePatterns scans the graph and groups failed requests
-// by shared causal attributes.
-func DetectFailurePatterns(g *core.Graph) []FailurePattern {
-	patterns := map[string]*FailurePattern{}
-
-	for _, e := range g.Edges {
-		if e.Type != core.EdgeFailedWith {
-			continue
-		}
-
-		req, ok := g.Nodes[e.From]
-		if !ok || req.Type != core.NodeRequest {
-			continue
-		}
-
-		errNode, ok := g.Nodes[e.To]
-		if !ok {
-			continue
-		}
-
-		var (
-			errorCode string
-			flow      string
-			userTier  string
-			flags     []string
-		)
-
-		if errNode.Attr != nil {
-			errorCode, _ = errNode.Attr["code"].(string)
-		}
-
-		if req.Attr != nil {
-			flow, _ = req.Attr["flow"].(string)
-		}
-
-		if req.Attr != nil {
-			userTier, _ = req.Attr["user_tier"].(string)
-			if userTier == "" {
-				userTier, _ = req.Attr["tier"].(string)
-			}
-			flags = append(flags, store.AttrToStringSlice(req.Attr["feature_flags"])...)
-		}
-		if len(flags) == 0 {
-			for _, ed := range g.OutEdges[req.ID] {
-				if ed.Type != core.EdgeUsedFlag {
-					continue
-				}
-				flag, ok := g.Nodes[ed.To]
-				if ok && flag.Attr != nil {
-					if name, ok := flag.Attr["name"].(string); ok {
-						flags = append(flags, name)
-					}
-				}
-			}
-		}
-
-		if flags == nil {
-			flags = []string{}
-		}
-		key := fmt.Sprintf("%s|%s|%s|%v", errorCode, flow, userTier, flags)
-
-		if _, ok := patterns[key]; !ok {
-			patterns[key] = &FailurePattern{
-				ErrorCode:    errorCode,
-				Flow:         flow,
-				UserTier:     userTier,
-				FeatureFlags: flags,
-			}
-		}
-
-		patterns[key].Count++
-	}
-
-	var out []FailurePattern
-	for _, p := range patterns {
-		out = append(out, *p)
-	}
-
-	return out
-}
-
-// FailurePatternsFromRollup builds failure patterns from a root-cause-counted
-// [RollupSummary]. This is the canonical path for failure_patterns in the
-// default rollup contract — use it instead of DetectFailurePatternsFromSummary,
-// which retains propagation-counted semantics for detail surfaces.
-func FailurePatternsFromRollup(r RollupSummary) []FailurePattern {
-	out := make([]FailurePattern, 0, len(r.PrimaryErrorCount))
-	for code, count := range r.PrimaryErrorCount {
-		out = append(out, FailurePattern{
-			ErrorCode: code,
-			Count:     count,
-		})
-	}
-	return out
-}
-
-// DetectFailurePatternsFromSummary builds failure patterns
-// using window summaries instead of graph traversal.
-func DetectFailurePatternsFromSummary(sum store.WindowSummary) []FailurePattern {
-	patterns := map[string]*FailurePattern{}
-
-	for errID, count := range sum.ErrorCount {
-		// NOTE:
-		// At this stage we only know error + count.
-		// Flow / tier / flags will be layered later (Module 6.4).
-		key := errID
-
-		if _, ok := patterns[key]; !ok {
-			patterns[key] = &FailurePattern{
-				ErrorCode: errID,
-			}
-		}
-		patterns[key].Count += count
-	}
-
-	var out []FailurePattern
-	for _, p := range patterns {
-		out = append(out, *p)
-	}
-	return out
-}
diff --git a/internal/graph/analysis/patterns_test.go b/internal/graph/analysis/patterns_test.go
deleted file mode 100644
index 484b5b3..0000000
--- a/internal/graph/analysis/patterns_test.go
+++ /dev/null
@@ -1,42 +0,0 @@
-package analysis
-
-import (
-	"testing"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func TestDetectFailurePatternsIgnoresSpanOriginFailedWithEdges(t *testing.T) {
-	s := graphstore.NewStore()
-	b := build.NewBuilder()
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithService("checkout"),
-		testutil.WithFlow("checkout"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "payment failed"),
-		testutil.WithUser("u-1", "standard", "us-east-1"),
-	)
-	s.Merge(b.Build(ev))
-
-	patterns := DetectFailurePatterns(s.Snapshot())
-	if len(patterns) != 1 {
-		t.Fatalf("patterns len = %d, want 1", len(patterns))
-	}
-	if patterns[0].Count != 1 {
-		t.Fatalf("pattern count = %d, want 1", patterns[0].Count)
-	}
-	if patterns[0].ErrorCode != "PMT_502" {
-		t.Fatalf("error_code = %q, want PMT_502", patterns[0].ErrorCode)
-	}
-	if patterns[0].UserTier != "standard" {
-		t.Fatalf("user_tier = %q, want standard", patterns[0].UserTier)
-	}
-	if patterns[0].Flow != "checkout" {
-		t.Fatalf("flow = %q, want checkout", patterns[0].Flow)
-	}
-}
diff --git a/internal/graph/analysis/rollup.go b/internal/graph/analysis/rollup.go
deleted file mode 100644
index 3197086..0000000
--- a/internal/graph/analysis/rollup.go
+++ /dev/null
@@ -1,164 +0,0 @@
-// Package analysis exposes deterministic, structure-only views over the hot
-// graph and trace store. The contents are projections — never inference.
-//
-// # Canonical rollup contract
-//
-// [RollupWindow] is the SINGLE SOURCE OF TRUTH for default user-facing
-// rollups. Any endpoint, tool, or detector that surfaces "top errors",
-// "top services", "failure patterns", spike/anomaly summaries, or overview
-// KPIs MUST consume RollupWindow. These surfaces count one root-cause error
-// per failed request (see [RootCauseSpan] for the tie-break) instead of
-// amplifying by propagated error spread.
-//
-// Detail surfaces that intentionally show spread — trace stories, blast
-// radius, failure chains — keep propagation-counted semantics and consume
-// store.SummarizeWindow directly. New user-facing default rollups must NOT
-// introduce ad-hoc aggregation; add a field to RollupSummary instead.
-package analysis
-
-import (
-	"sort"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-// RollupSummary is the root-cause-counted window summary. Each failed request
-// in [Start, End] contributes exactly one PrimaryErrorCount entry, one
-// ServiceFailureCount entry per distinct service it touched, and one
-// FlagFailureCount entry per distinct feature flag it carried.
-//
-// TotalRequests, ServiceRequestCount, FlagRequestCount, and latency
-// percentiles retain the same "per-request" semantics as
-// store.WindowSummary — the only behavioral difference is the error-side
-// counting.
-type RollupSummary struct {
-	Start time.Time
-	End   time.Time
-
-	TotalRequests int
-	TotalFailures int
-
-	// PrimaryErrorCount counts the canonical root-cause error code for each
-	// failed request exactly once. Replaces store.WindowSummary.ErrorCount
-	// for all default user-facing rollups.
-	PrimaryErrorCount map[string]int
-
-	ServiceRequestCount map[string]int
-	ServiceFailureCount map[string]int
-
-	FlagRequestCount map[string]int
-	FlagFailureCount map[string]int
-
-	LatencyP50 int64
-	LatencyP95 int64
-	LatencyP99 int64
-}
-
-// RollupSource is the minimal request-fact producer RollupWindow needs.
-// Both *store.Store and the tools-layer Store interface satisfy this.
-type RollupSource interface {
-	ForEachRequestFact(start, end time.Time, fn func(store.RequestFacts))
-}
-
-// RollupWindow computes root-cause-counted rollups for all requests seen
-// between [start, end].
-//
-// For each failed request, RootCauseSpan picks a single primary error code
-// (deepest → earliest → lex; trace store preferred, then graph, then
-// request-level fallback). If even the request-level fallback finds nothing
-// for a failed request — for instance during a partial replay — the request
-// still counts toward TotalFailures but contributes nothing to
-// PrimaryErrorCount.
-func RollupWindow(g *core.Graph, s RollupSource, ts *tracestore.Store, start, end time.Time) RollupSummary {
-	out := RollupSummary{
-		Start:               start,
-		End:                 end,
-		PrimaryErrorCount:   map[string]int{},
-		ServiceRequestCount: map[string]int{},
-		ServiceFailureCount: map[string]int{},
-		FlagRequestCount:    map[string]int{},
-		FlagFailureCount:    map[string]int{},
-	}
-
-	if s == nil {
-		return out
-	}
-
-	var latencies []int64
-	s.ForEachRequestFact(start, end, func(f store.RequestFacts) {
-		out.TotalRequests++
-		latencies = append(latencies, f.LatencyMs)
-
-		seenSvc := map[string]bool{}
-		for _, svc := range f.Services {
-			if svc == "" || seenSvc[svc] {
-				continue
-			}
-			seenSvc[svc] = true
-			out.ServiceRequestCount[svc]++
-		}
-		seenFlag := map[string]bool{}
-		for _, flag := range f.FeatureFlags {
-			if flag == "" || seenFlag[flag] {
-				continue
-			}
-			seenFlag[flag] = true
-			out.FlagRequestCount[flag]++
-		}
-
-		if len(f.Errors) == 0 {
-			return
-		}
-		out.TotalFailures++
-
-		for svc := range seenSvc {
-			out.ServiceFailureCount[svc]++
-		}
-		for flag := range seenFlag {
-			out.FlagFailureCount[flag]++
-		}
-
-		if g != nil {
-			if _, code, ok := RootCauseSpan(g, ts, f.RequestID); ok && code != "" {
-				out.PrimaryErrorCount[code]++
-				return
-			}
-		}
-		// RootCauseSpan found nothing — fall back to the first error on the
-		// fact so the PrimaryErrorCount total remains close to TotalFailures
-		// during replay or when graph lookup fails.
-		for _, code := range f.Errors {
-			if code != "" {
-				out.PrimaryErrorCount[code]++
-				return
-			}
-		}
-	})
-
-	sort.Slice(latencies, func(i, j int) bool { return latencies[i] < latencies[j] })
-	out.LatencyP50 = percentile(latencies, 50)
-	out.LatencyP95 = percentile(latencies, 95)
-	out.LatencyP99 = percentile(latencies, 99)
-	return out
-}
-
-// percentile implements nearest-rank percentile on a pre-sorted slice.
-// Mirrors the semantics of store.percentile so rollups and propagation
-// summaries stay comparable.
-func percentile(sorted []int64, pct int) int64 {
-	n := len(sorted)
-	if n == 0 {
-		return 0
-	}
-	idx := (pct*n + 99) / 100
-	if idx < 1 {
-		idx = 1
-	}
-	if idx > n {
-		idx = n
-	}
-	return sorted[idx-1]
-}
diff --git a/internal/graph/analysis/rollup_invariant_test.go b/internal/graph/analysis/rollup_invariant_test.go
deleted file mode 100644
index 0607852..0000000
--- a/internal/graph/analysis/rollup_invariant_test.go
+++ /dev/null
@@ -1,33 +0,0 @@
-package analysis
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-func TestRollupInvariantRootCauseStaysBelowNaivePropagation(t *testing.T) {
-	s := store.NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	now := time.Now().UTC()
-
-	const failedRequests = 3
-	for i := range failedRequests {
-		ingestCascade(t, s, ts, b, i, now.Add(-20*time.Second))
-	}
-
-	summary := RollupWindow(graphOf(s), s, ts, now.Add(-time.Minute), now.Add(time.Minute))
-	rootCauseCount := summary.PrimaryErrorCount["PMT_502"]
-	naivePropagatedCount := failedRequests * 3
-
-	if rootCauseCount != failedRequests {
-		t.Fatalf("PMT_502 root-cause count = %d, want %d", rootCauseCount, failedRequests)
-	}
-	if rootCauseCount >= naivePropagatedCount {
-		t.Fatalf("root-cause count should stay below naive propagated count: root=%d naive=%d", rootCauseCount, naivePropagatedCount)
-	}
-}
diff --git a/internal/graph/analysis/rollup_test.go b/internal/graph/analysis/rollup_test.go
deleted file mode 100644
index 4c25a1f..0000000
--- a/internal/graph/analysis/rollup_test.go
+++ /dev/null
@@ -1,181 +0,0 @@
-package analysis
-
-import (
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-// ingestCascade emits a payment → checkout → api-gateway failing cascade for a
-// single trace and returns the trace id. Each span carries a different error
-// code; the root cause is PMT_502 (leaf).
-func ingestCascade(t *testing.T, s *store.Store, ts *tracestore.Store, b *build.Builder, idx int, when time.Time) string {
-	t.Helper()
-	traceID := fmt.Sprintf("cccc%028d", idx)
-	reqID := core.ID("request", traceID)
-
-	ingest := func(ev event.WideEvent) {
-		r := b.BuildResult(ev)
-		s.Merge(r.Graph)
-		if ts != nil && r.Span != nil {
-			ts.Upsert(traceID, reqID, r.Span)
-		}
-	}
-
-	// payment (leaf — deepest failing span)
-	ingest(testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID(fmt.Sprintf("p%015d", idx)),
-		testutil.WithParentSpanID(fmt.Sprintf("c%015d", idx)),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "payment failed"),
-		testutil.WithCallerService("checkout"),
-		testutil.WithTimestamp(when),
-	))
-	// checkout (middle)
-	ingest(testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID(fmt.Sprintf("c%015d", idx)),
-		testutil.WithParentSpanID(fmt.Sprintf("a%015d", idx)),
-		testutil.WithService("checkout"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("CHK_DOWNSTREAM", "downstream failed"),
-		testutil.WithCallerService("api-gateway"),
-		testutil.WithTimestamp(when.Add(1*time.Millisecond)),
-	))
-	// api-gateway (root)
-	ingest(testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID(fmt.Sprintf("a%015d", idx)),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("GW_DOWNSTREAM", "downstream failed"),
-		testutil.WithTimestamp(when.Add(2*time.Millisecond)),
-	))
-	return traceID
-}
-
-// TestRollupWindow_RootCauseCounted_PMT502IsThreeNotNine is the canonical
-// regression test for the root-cause aggregation bug. Three cascading
-// failures payment→checkout→api-gateway previously reported each code with
-// count=9 (3 requests × 3 propagated codes). The correct behavior counts the
-// deepest failing span once per request, so PMT_502 = 3 and the propagation
-// codes do not appear in the primary rollup.
-func TestRollupWindow_RootCauseCounted_PMT502IsThreeNotNine(t *testing.T) {
-	s := store.NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	now := time.Now().UTC()
-
-	for i := range 3 {
-		ingestCascade(t, s, ts, b, i, now.Add(-20*time.Second))
-	}
-
-	summary := RollupWindow(graphOf(s), s, ts, now.Add(-time.Minute), now.Add(time.Minute))
-
-	if summary.TotalRequests != 3 {
-		t.Errorf("TotalRequests = %d, want 3", summary.TotalRequests)
-	}
-	if summary.TotalFailures != 3 {
-		t.Errorf("TotalFailures = %d, want 3", summary.TotalFailures)
-	}
-	if got := summary.PrimaryErrorCount["PMT_502"]; got != 3 {
-		t.Errorf("PrimaryErrorCount[PMT_502] = %d, want 3 (root-cause counted)", got)
-	}
-	if got := summary.PrimaryErrorCount["CHK_DOWNSTREAM"]; got != 0 {
-		t.Errorf("PrimaryErrorCount[CHK_DOWNSTREAM] = %d, want 0 (propagation, not root cause)", got)
-	}
-	if got := summary.PrimaryErrorCount["GW_DOWNSTREAM"]; got != 0 {
-		t.Errorf("PrimaryErrorCount[GW_DOWNSTREAM] = %d, want 0 (propagation, not root cause)", got)
-	}
-	if len(summary.PrimaryErrorCount) != 1 {
-		t.Errorf("PrimaryErrorCount has %d entries, want 1 (only the root-cause code)", len(summary.PrimaryErrorCount))
-	}
-
-	// Every touched service (regardless of whether its name or hashed node
-	// ID winds up in the facts due to upstream builder stubbing) participates
-	// in every failed request exactly once.
-	if len(summary.ServiceFailureCount) != 3 {
-		t.Errorf("ServiceFailureCount has %d entries, want 3 (payment + checkout + api-gateway)", len(summary.ServiceFailureCount))
-	}
-	for svc, count := range summary.ServiceFailureCount {
-		if count != 3 {
-			t.Errorf("ServiceFailureCount[%s] = %d, want 3 (once per failed request)", svc, count)
-		}
-	}
-}
-
-// TestRollupWindow_EmptyStore covers the no-data path.
-func TestRollupWindow_EmptyStore(t *testing.T) {
-	s := store.NewStore()
-	ts := tracestore.NewStore()
-	now := time.Now().UTC()
-
-	summary := RollupWindow(graphOf(s), s, ts, now.Add(-time.Minute), now)
-	if summary.TotalRequests != 0 || summary.TotalFailures != 0 {
-		t.Errorf("empty store rollup: total=%d failures=%d, want 0/0", summary.TotalRequests, summary.TotalFailures)
-	}
-	if len(summary.PrimaryErrorCount) != 0 {
-		t.Errorf("empty store: PrimaryErrorCount has %d entries, want 0", len(summary.PrimaryErrorCount))
-	}
-}
-
-// TestRollupWindow_NilStoreReturnsEmpty verifies the defensive nil-store path.
-func TestRollupWindow_NilStoreReturnsEmpty(t *testing.T) {
-	now := time.Now().UTC()
-	summary := RollupWindow(nil, nil, nil, now.Add(-time.Minute), now)
-	if summary.TotalRequests != 0 {
-		t.Errorf("nil store: TotalRequests = %d, want 0", summary.TotalRequests)
-	}
-	if summary.PrimaryErrorCount == nil {
-		t.Error("PrimaryErrorCount should be non-nil even with nil store")
-	}
-}
-
-// TestRollupWindow_SuccessAndFailureMix verifies TotalRequests counts both
-// and the successful request does not contribute to PrimaryErrorCount.
-func TestRollupWindow_SuccessAndFailureMix(t *testing.T) {
-	s := store.NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	now := time.Now().UTC()
-
-	// One successful request
-	okEv := testutil.MakeEvent(
-		testutil.WithTraceID("aaaa00000000000000000000000000000"[:32]),
-		testutil.WithService("api-gateway"),
-		testutil.WithTimestamp(now.Add(-20*time.Second)),
-	)
-	r := b.BuildResult(okEv)
-	s.Merge(r.Graph)
-
-	// One failing cascade
-	ingestCascade(t, s, ts, b, 42, now.Add(-15*time.Second))
-
-	summary := RollupWindow(graphOf(s), s, ts, now.Add(-time.Minute), now)
-
-	if summary.TotalRequests != 2 {
-		t.Errorf("TotalRequests = %d, want 2", summary.TotalRequests)
-	}
-	if summary.TotalFailures != 1 {
-		t.Errorf("TotalFailures = %d, want 1", summary.TotalFailures)
-	}
-	if summary.PrimaryErrorCount["PMT_502"] != 1 {
-		t.Errorf("PrimaryErrorCount[PMT_502] = %d, want 1", summary.PrimaryErrorCount["PMT_502"])
-	}
-}
-
-// graphOf is a test helper that reaches into the store's graph via its
-// exposed accessor. Kept here so test intent stays local.
-func graphOf(s *store.Store) *core.Graph {
-	return s.Graph()
-}
diff --git a/internal/graph/analysis/rootcause.go b/internal/graph/analysis/rootcause.go
deleted file mode 100644
index 229ad2f..0000000
--- a/internal/graph/analysis/rootcause.go
+++ /dev/null
@@ -1,239 +0,0 @@
-package analysis
-
-import (
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-// RootCauseSpan selects the single root-cause span for a (usually failed)
-// request using a deterministic tie-break. It is the canonical selector used
-// by default user-facing rollups (see RollupWindow) and by failure
-// explanations (ExplainRequest / ExplainRequestWithTrace).
-//
-// Tie-break order:
-//  1. deepest failing span in the span tree wins
-//  2. earliest timestamp breaks depth ties
-//  3. lexicographic span ID breaks remaining ties
-//
-// Source precedence when resolving the cause:
-//  1. trace store (when non-nil and a record exists for the request's trace)
-//  2. graph span nodes reachable from the request
-//  3. request-level failed_with edge (no span attribution)
-//
-// Return values:
-//   - (spanID, errorCode, true) when a cause is found. spanID is empty when
-//     the cause is a request-level error with no span attribution.
-//   - ("", "", false) when the request has no failure information at all.
-func RootCauseSpan(g *core.Graph, ts *tracestore.Store, requestID string) (string, string, bool) {
-	if g == nil || requestID == "" {
-		return "", "", false
-	}
-	req, ok := g.Nodes[requestID]
-	if !ok {
-		return "", "", false
-	}
-
-	if ts != nil {
-		traceID := ""
-		if req.Attr != nil {
-			traceID, _ = req.Attr["trace_id"].(string)
-		}
-		if traceID != "" {
-			if rec, found := ts.Get(traceID); found {
-				if id, code, ok := rootCauseFromTraceRecord(rec); ok {
-					return id, code, true
-				}
-			}
-		}
-	}
-
-	if id, code, ok := rootCauseFromGraph(g, requestID); ok {
-		return id, code, true
-	}
-
-	for _, e := range g.OutEdges[requestID] {
-		if e.Type != core.EdgeFailedWith {
-			continue
-		}
-		errNode := g.Nodes[e.To]
-		if errNode.Attr == nil {
-			continue
-		}
-		if code, _ := errNode.Attr["code"].(string); code != "" {
-			return "", code, true
-		}
-	}
-	return "", "", false
-}
-
-type rootCauseCandidate struct {
-	id    string
-	depth int
-	ts    time.Time
-}
-
-// pickRootCauseCandidate applies the deepest→earliest→lex tie-break and
-// returns the winning id, or "" when candidates is empty.
-func pickRootCauseCandidate(candidates []rootCauseCandidate) string {
-	bestID := ""
-	bestDepth := -1
-	var bestTime time.Time
-	for _, c := range candidates {
-		switch {
-		case c.depth > bestDepth:
-			bestID, bestDepth, bestTime = c.id, c.depth, c.ts
-		case c.depth == bestDepth && !c.ts.IsZero() && (bestTime.IsZero() || c.ts.Before(bestTime)):
-			bestID, bestDepth, bestTime = c.id, c.depth, c.ts
-		case c.depth == bestDepth && c.ts.Equal(bestTime) && c.id < bestID:
-			bestID, bestDepth, bestTime = c.id, c.depth, c.ts
-		}
-	}
-	return bestID
-}
-
-func rootCauseFromTraceRecord(rec *tracestore.TraceRecord) (string, string, bool) {
-	if rec == nil || len(rec.Spans) == 0 {
-		return "", "", false
-	}
-	spans := make(map[string]tracestore.SpanRecord, len(rec.Spans))
-	parentOf := map[string]string{}
-	for _, span := range rec.Spans {
-		if span.SpanID == "" {
-			continue
-		}
-		spans[span.SpanID] = span
-		if span.ParentSpanID != "" {
-			parentOf[span.SpanID] = span.ParentSpanID
-		}
-	}
-	if len(spans) == 0 {
-		return "", "", false
-	}
-
-	depths := computeSpanDepths(spans, parentOf)
-
-	candidates := make([]rootCauseCandidate, 0, len(spans))
-	for id, span := range spans {
-		if span.Success || span.ErrorCode == "" {
-			continue
-		}
-		candidates = append(candidates, rootCauseCandidate{id: id, depth: depths[id], ts: span.Timestamp})
-	}
-	rootID := pickRootCauseCandidate(candidates)
-	if rootID == "" {
-		return "", "", false
-	}
-	return rootID, spans[rootID].ErrorCode, true
-}
-
-func rootCauseFromGraph(g *core.Graph, requestID string) (string, string, bool) {
-	type info struct {
-		node    core.Node
-		errCode string
-	}
-	spans := map[string]*info{}
-	for _, e := range g.OutEdges[requestID] {
-		if e.Type != core.EdgeRequestHasSpan {
-			continue
-		}
-		sn, ok := g.Nodes[e.To]
-		if !ok || sn.Type != core.NodeSpan {
-			continue
-		}
-		si := &info{node: sn}
-		for _, se := range g.OutEdges[sn.ID] {
-			if se.Type != core.EdgeFailedWith {
-				continue
-			}
-			en, ok := g.Nodes[se.To]
-			if !ok || en.Attr == nil {
-				break
-			}
-			if code, _ := en.Attr["code"].(string); code != "" {
-				si.errCode = code
-			}
-			break
-		}
-		spans[sn.ID] = si
-	}
-	if len(spans) == 0 {
-		return "", "", false
-	}
-
-	parentOf := map[string]string{}
-	for spanID, si := range spans {
-		for _, e := range g.OutEdges[spanID] {
-			if e.Type == core.EdgeSpanChildOf {
-				if _, ok := spans[e.To]; ok {
-					parentOf[spanID] = e.To
-				}
-				break
-			}
-		}
-		if _, found := parentOf[spanID]; !found {
-			if psid, ok := si.node.Attr["parent_span_id"].(string); ok && psid != "" {
-				traceID, _ := si.node.Attr["trace_id"].(string)
-				if traceID != "" {
-					parentNodeID := core.ID("span", traceID, psid)
-					if _, ok := spans[parentNodeID]; ok {
-						parentOf[spanID] = parentNodeID
-					}
-				}
-			}
-		}
-	}
-
-	depths := computeSpanDepths(spans, parentOf)
-
-	candidates := make([]rootCauseCandidate, 0, len(spans))
-	for id, si := range spans {
-		if si.errCode == "" {
-			continue
-		}
-		candidates = append(candidates, rootCauseCandidate{id: id, depth: depths[id], ts: spanTimestamp(si.node)})
-	}
-	rootID := pickRootCauseCandidate(candidates)
-	if rootID == "" {
-		return "", "", false
-	}
-	return rootID, spans[rootID].errCode, true
-}
-
-// computeSpanDepths walks the parent chain for each span and returns the depth
-// from root. Cycles and orphan parents (outside the member set) yield depth 0.
-func computeSpanDepths[T any](spans map[string]T, parentOf map[string]string) map[string]int {
-	depth := map[string]int{}
-	visiting := map[string]bool{}
-	var walk func(string) int
-	walk = func(id string) int {
-		if d, ok := depth[id]; ok {
-			return d
-		}
-		if visiting[id] {
-			depth[id] = 0
-			return 0
-		}
-		visiting[id] = true
-		pid, has := parentOf[id]
-		if !has || pid == "" {
-			depth[id] = 0
-			delete(visiting, id)
-			return 0
-		}
-		if _, ok := spans[pid]; !ok {
-			depth[id] = 0
-			delete(visiting, id)
-			return 0
-		}
-		d := walk(pid) + 1
-		depth[id] = d
-		delete(visiting, id)
-		return d
-	}
-	for id := range spans {
-		walk(id)
-	}
-	return depth
-}
diff --git a/internal/graph/analysis/rootcause_test.go b/internal/graph/analysis/rootcause_test.go
deleted file mode 100644
index 6503c08..0000000
--- a/internal/graph/analysis/rootcause_test.go
+++ /dev/null
@@ -1,278 +0,0 @@
-package analysis
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-const (
-	testTraceID    = "0123456789abcdef0123456789abcdef"
-	spanRoot       = "aaaaaaaaaaaaaaaa"
-	spanChild      = "bbbbbbbbbbbbbbbb"
-	spanGrandchild = "cccccccccccccccc"
-	spanSibling    = "dddddddddddddddd"
-)
-
-func testRequestNode() core.Node {
-	return testutil.MakeNode(
-		core.ID("request", testTraceID),
-		core.NodeRequest,
-		map[string]any{"trace_id": testTraceID},
-	)
-}
-
-func TestRootCauseSpan_TraceStore_DeepestFailingSpanWins(t *testing.T) {
-	reqID := core.ID("request", testTraceID)
-	g := testutil.MakeGraph([]core.Node{testRequestNode()}, nil)
-
-	ts := tracestore.NewStore()
-	base := time.Date(2026, 4, 18, 12, 0, 0, 0, time.UTC)
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:    spanRoot,
-		Service:   "gateway",
-		ErrorCode: "GW_DOWNSTREAM",
-		Success:   false,
-		Timestamp: base,
-	})
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanChild,
-		ParentSpanID: spanRoot,
-		Service:      "checkout",
-		ErrorCode:    "CHK_DOWNSTREAM",
-		Success:      false,
-		Timestamp:    base.Add(time.Millisecond),
-	})
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanGrandchild,
-		ParentSpanID: spanChild,
-		Service:      "payment",
-		ErrorCode:    "PMT_502",
-		Success:      false,
-		Timestamp:    base.Add(2 * time.Millisecond),
-	})
-
-	id, code, ok := RootCauseSpan(g, ts, reqID)
-	if !ok {
-		t.Fatal("RootCauseSpan returned ok=false; want ok=true")
-	}
-	if id != spanGrandchild {
-		t.Errorf("spanID = %s, want %s (deepest failing span)", id, spanGrandchild)
-	}
-	if code != "PMT_502" {
-		t.Errorf("errorCode = %s, want PMT_502", code)
-	}
-}
-
-func TestRootCauseSpan_TraceStore_EarliestTimestampBreaksDepthTie(t *testing.T) {
-	reqID := core.ID("request", testTraceID)
-	g := testutil.MakeGraph([]core.Node{testRequestNode()}, nil)
-
-	ts := tracestore.NewStore()
-	base := time.Date(2026, 4, 18, 12, 0, 0, 0, time.UTC)
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:    spanRoot,
-		Service:   "gateway",
-		Success:   true,
-		Timestamp: base,
-	})
-	// Two children at equal depth under root. The earlier one is the root cause.
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanChild,
-		ParentSpanID: spanRoot,
-		Service:      "checkout",
-		ErrorCode:    "ERR_LATE",
-		Success:      false,
-		Timestamp:    base.Add(20 * time.Millisecond),
-	})
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanSibling,
-		ParentSpanID: spanRoot,
-		Service:      "payment",
-		ErrorCode:    "ERR_EARLY",
-		Success:      false,
-		Timestamp:    base.Add(5 * time.Millisecond),
-	})
-
-	id, code, ok := RootCauseSpan(g, ts, reqID)
-	if !ok {
-		t.Fatal("RootCauseSpan returned ok=false; want ok=true")
-	}
-	if id != spanSibling {
-		t.Errorf("spanID = %s, want %s (earlier timestamp wins)", id, spanSibling)
-	}
-	if code != "ERR_EARLY" {
-		t.Errorf("errorCode = %s, want ERR_EARLY", code)
-	}
-}
-
-func TestRootCauseSpan_TraceStore_LexIDBreaksRemainingTies(t *testing.T) {
-	reqID := core.ID("request", testTraceID)
-	g := testutil.MakeGraph([]core.Node{testRequestNode()}, nil)
-
-	ts := tracestore.NewStore()
-	base := time.Date(2026, 4, 18, 12, 0, 0, 0, time.UTC)
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:    spanRoot,
-		Service:   "gateway",
-		Success:   true,
-		Timestamp: base,
-	})
-	// Two children at equal depth and timestamp — lex lowest id wins.
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanSibling, // "dddd..."
-		ParentSpanID: spanRoot,
-		Service:      "payment",
-		ErrorCode:    "ERR_D",
-		Success:      false,
-		Timestamp:    base.Add(5 * time.Millisecond),
-	})
-	ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-		SpanID:       spanChild, // "bbbb..."
-		ParentSpanID: spanRoot,
-		Service:      "checkout",
-		ErrorCode:    "ERR_B",
-		Success:      false,
-		Timestamp:    base.Add(5 * time.Millisecond),
-	})
-
-	id, _, ok := RootCauseSpan(g, ts, reqID)
-	if !ok {
-		t.Fatal("RootCauseSpan returned ok=false; want ok=true")
-	}
-	if id != spanChild {
-		t.Errorf("spanID = %s, want %s (lex-lowest id wins)", id, spanChild)
-	}
-}
-
-func TestRootCauseSpan_GraphFallback(t *testing.T) {
-	// No trace store — exercises the graph-span-node fallback path.
-	reqID := core.ID("request", testTraceID)
-	rootSpan := core.ID("span", testTraceID, spanRoot)
-	childSpan := core.ID("span", testTraceID, spanChild)
-	errShallow := core.ID("error", "SHALLOW_ERR")
-	errDeep := core.ID("error", "DEEP_ERR")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testRequestNode(),
-			testutil.MakeNode(rootSpan, core.NodeSpan, map[string]any{"trace_id": testTraceID}),
-			testutil.MakeNode(childSpan, core.NodeSpan, map[string]any{"trace_id": testTraceID, "parent_span_id": spanRoot}),
-			testutil.MakeNode(errShallow, core.NodeError, map[string]any{"code": "SHALLOW_ERR"}),
-			testutil.MakeNode(errDeep, core.NodeError, map[string]any{"code": "DEEP_ERR"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: rootSpan, Type: core.EdgeRequestHasSpan},
-			{From: reqID, To: childSpan, Type: core.EdgeRequestHasSpan},
-			{From: childSpan, To: rootSpan, Type: core.EdgeSpanChildOf},
-			{From: rootSpan, To: errShallow, Type: core.EdgeFailedWith},
-			{From: childSpan, To: errDeep, Type: core.EdgeFailedWith},
-		},
-	)
-
-	id, code, ok := RootCauseSpan(g, nil, reqID)
-	if !ok {
-		t.Fatal("RootCauseSpan returned ok=false; want ok=true")
-	}
-	if id != childSpan {
-		t.Errorf("spanID = %s, want %s (deepest graph span)", id, childSpan)
-	}
-	if code != "DEEP_ERR" {
-		t.Errorf("errorCode = %s, want DEEP_ERR", code)
-	}
-}
-
-func TestRootCauseSpan_RequestLevelErrorFallback(t *testing.T) {
-	// No span data at all — only an EdgeFailedWith from the request.
-	reqID := core.ID("request", testTraceID)
-	errID := core.ID("error", "UNATTRIBUTED")
-
-	g := testutil.MakeGraph(
-		[]core.Node{
-			testRequestNode(),
-			testutil.MakeNode(errID, core.NodeError, map[string]any{"code": "UNATTRIBUTED"}),
-		},
-		[]core.Edge{
-			{From: reqID, To: errID, Type: core.EdgeFailedWith},
-		},
-	)
-
-	id, code, ok := RootCauseSpan(g, nil, reqID)
-	if !ok {
-		t.Fatal("RootCauseSpan returned ok=false; want ok=true")
-	}
-	if id != "" {
-		t.Errorf("spanID = %s, want empty (request-level error)", id)
-	}
-	if code != "UNATTRIBUTED" {
-		t.Errorf("errorCode = %s, want UNATTRIBUTED", code)
-	}
-}
-
-func TestRootCauseSpan_NoFailures(t *testing.T) {
-	// Request exists but has no error information — returns ok=false.
-	reqID := core.ID("request", testTraceID)
-	g := testutil.MakeGraph([]core.Node{testRequestNode()}, nil)
-
-	_, _, ok := RootCauseSpan(g, nil, reqID)
-	if ok {
-		t.Error("RootCauseSpan returned ok=true; want ok=false for request with no errors")
-	}
-}
-
-func TestRootCauseSpan_StableUnderArrivalOrder(t *testing.T) {
-	// Insert the same three spans in two different orders — same root cause.
-	reqID := core.ID("request", testTraceID)
-	g := testutil.MakeGraph([]core.Node{testRequestNode()}, nil)
-	base := time.Date(2026, 4, 18, 12, 0, 0, 0, time.UTC)
-
-	build := func() *tracestore.Store {
-		ts := tracestore.NewStore()
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanRoot, Service: "gateway", ErrorCode: "GW", Success: false, Timestamp: base,
-		})
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanChild, ParentSpanID: spanRoot, Service: "checkout", ErrorCode: "CHK", Success: false, Timestamp: base.Add(time.Millisecond),
-		})
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanGrandchild, ParentSpanID: spanChild, Service: "payment", ErrorCode: "PMT", Success: false, Timestamp: base.Add(2 * time.Millisecond),
-		})
-		return ts
-	}
-	buildReverse := func() *tracestore.Store {
-		ts := tracestore.NewStore()
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanGrandchild, ParentSpanID: spanChild, Service: "payment", ErrorCode: "PMT", Success: false, Timestamp: base.Add(2 * time.Millisecond),
-		})
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanChild, ParentSpanID: spanRoot, Service: "checkout", ErrorCode: "CHK", Success: false, Timestamp: base.Add(time.Millisecond),
-		})
-		ts.Upsert(testTraceID, reqID, &tracestore.SpanRecord{
-			SpanID: spanRoot, Service: "gateway", ErrorCode: "GW", Success: false, Timestamp: base,
-		})
-		return ts
-	}
-
-	idA, codeA, okA := RootCauseSpan(g, build(), reqID)
-	idB, codeB, okB := RootCauseSpan(g, buildReverse(), reqID)
-	if !okA || !okB {
-		t.Fatalf("RootCauseSpan returned ok=false (A=%v, B=%v)", okA, okB)
-	}
-	if idA != idB || codeA != codeB {
-		t.Errorf("root cause unstable under arrival order: (A=%s/%s) vs (B=%s/%s)", idA, codeA, idB, codeB)
-	}
-	if idA != spanGrandchild {
-		t.Errorf("spanID = %s, want %s (deepest)", idA, spanGrandchild)
-	}
-}
-
-func TestRootCauseSpan_RequestNotFound(t *testing.T) {
-	g := core.New()
-	_, _, ok := RootCauseSpan(g, nil, "request:missing")
-	if ok {
-		t.Error("RootCauseSpan returned ok=true for missing request; want false")
-	}
-}
diff --git a/internal/graph/analysis/topology.go b/internal/graph/analysis/topology.go
deleted file mode 100644
index b3d05b9..0000000
--- a/internal/graph/analysis/topology.go
+++ /dev/null
@@ -1,235 +0,0 @@
-package analysis
-
-import (
-	"math"
-	"sort"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-type TopologyNode struct {
-	ID          string  `json:"id"`
-	Label       string  `json:"label"`
-	Status      string  `json:"status"` // "healthy", "degraded", "failing"
-	Invocations int     `json:"invocations"`
-	Errors      int     `json:"errors"`
-	ErrorRate   float64 `json:"error_rate"`
-}
-
-type TopologyEdge struct {
-	Source   string `json:"source"`
-	Target   string `json:"target"`
-	Requests int    `json:"requests"`
-	Failures int    `json:"failures"`
-}
-
-type TopologyResult struct {
-	Nodes []TopologyNode `json:"nodes"`
-	Edges []TopologyEdge `json:"edges"`
-}
-
-type CyNode struct {
-	Data map[string]any `json:"data"`
-}
-
-type CyEdge struct {
-	Data map[string]any `json:"data"`
-}
-
-type CytoscapeResult struct {
-	Nodes []CyNode `json:"nodes"`
-	Edges []CyEdge `json:"edges"`
-}
-
-type edgeKey struct {
-	source, target string
-}
-
-type edgeStats struct {
-	requests int
-	failures int
-}
-
-// BuildTopology aggregates service counters from the graph store and caller →
-// service edge counts from the trace store. When traceStore is nil, it falls
-// back to the legacy span graph for compatibility.
-func BuildTopology(graphStore *graphstore.Store, traceStore *tracestore.Store, start, end time.Time) TopologyResult {
-	services := map[string]graphstore.ServiceStats{}
-	edges := map[edgeKey]*edgeStats{}
-
-	if traceStore != nil {
-		// Primary path: span-level data from trace store.
-		traceStore.ForEachSpan(start, end, func(_ string, span tracestore.SpanRecord) {
-			if span.Service != "" {
-				stats := services[span.Service]
-				stats.Invocations++
-				if !span.Success {
-					stats.Errors++
-				}
-				services[span.Service] = stats
-			}
-			if span.CallerService == "" || span.Service == "" {
-				return
-			}
-			ek := edgeKey{source: span.CallerService, target: span.Service}
-			es := edges[ek]
-			if es == nil {
-				es = &edgeStats{}
-				edges[ek] = es
-			}
-			es.requests++
-			if !span.Success {
-				es.failures++
-			}
-			if _, ok := services[span.CallerService]; !ok {
-				services[span.CallerService] = graphstore.ServiceStats{}
-			}
-		})
-	} else if graphStore != nil {
-		// Flattened graph path: service stats from RequestFacts, edges from
-		// EdgeCalls in the graph (the builder still emits caller→service edges).
-		graphStore.ForEachRequestFact(start, end, func(f graphstore.RequestFacts) {
-			seen := map[string]bool{}
-			for _, svc := range f.Services {
-				if svc == "" || seen[svc] {
-					continue
-				}
-				seen[svc] = true
-				stats := services[svc]
-				stats.Invocations++
-				if len(f.Errors) > 0 {
-					stats.Errors++
-				}
-				services[svc] = stats
-			}
-			if len(f.Services) == 0 && f.RootService != "" {
-				stats := services[f.RootService]
-				stats.Invocations++
-				if len(f.Errors) > 0 {
-					stats.Errors++
-				}
-				services[f.RootService] = stats
-			}
-		})
-
-		// Derive caller→service edges from graph EdgeCalls edges.
-		g := graphStore.Snapshot()
-		for _, e := range g.Edges {
-			if e.Type != core.EdgeCalls {
-				continue
-			}
-			srcNode, srcOK := g.Nodes[e.From]
-			tgtNode, tgtOK := g.Nodes[e.To]
-			if !srcOK || !tgtOK {
-				continue
-			}
-			if srcNode.Type != core.NodeService || tgtNode.Type != core.NodeService {
-				continue
-			}
-			if srcNode.LastSeen.Before(start) && tgtNode.LastSeen.Before(start) {
-				continue
-			}
-			srcName, _ := srcNode.Attr["name"].(string)
-			tgtName, _ := tgtNode.Attr["name"].(string)
-			if srcName == "" || tgtName == "" {
-				continue
-			}
-			if _, ok := services[srcName]; !ok {
-				services[srcName] = graphstore.ServiceStats{}
-			}
-			ek := edgeKey{source: srcName, target: tgtName}
-			es := edges[ek]
-			if es == nil {
-				es = &edgeStats{}
-				edges[ek] = es
-			}
-			es.requests++
-		}
-	}
-
-	nodes := make([]TopologyNode, 0, len(services))
-	for id, ss := range services {
-		var errRate float64
-		if ss.Invocations > 0 {
-			errRate = float64(ss.Errors) / float64(ss.Invocations)
-		}
-		nodes = append(nodes, TopologyNode{
-			ID:          id,
-			Label:       id,
-			Status:      statusFromErrorRate(errRate),
-			Invocations: ss.Invocations,
-			Errors:      ss.Errors,
-			ErrorRate:   errRate,
-		})
-	}
-
-	edgeList := make([]TopologyEdge, 0, len(edges))
-	for ek, es := range edges {
-		edgeList = append(edgeList, TopologyEdge{
-			Source:   ek.source,
-			Target:   ek.target,
-			Requests: es.requests,
-			Failures: es.failures,
-		})
-	}
-
-	return TopologyResult{Nodes: nodes, Edges: edgeList}
-}
-
-// ToCytoscapeFormat converts a TopologyResult to Cytoscape JSON format
-// compatible with the dashboard Graph tab expectations (error_rate as
-// percentage, edges with "count" and "label":"calls").
-func ToCytoscapeFormat(result TopologyResult) CytoscapeResult {
-	nodes := make([]CyNode, 0, len(result.Nodes))
-	for _, n := range result.Nodes {
-		errPct := math.Round(n.ErrorRate*10000) / 100
-		nodes = append(nodes, CyNode{
-			Data: map[string]any{
-				"id":          n.ID,
-				"label":       n.Label,
-				"type":        "service",
-				"invocations": n.Invocations,
-				"errors":      n.Errors,
-				"error_rate":  errPct,
-			},
-		})
-	}
-	sort.Slice(nodes, func(i, j int) bool {
-		return nodes[i].Data["id"].(string) < nodes[j].Data["id"].(string)
-	})
-
-	edges := make([]CyEdge, 0, len(result.Edges))
-	for _, e := range result.Edges {
-		edges = append(edges, CyEdge{
-			Data: map[string]any{
-				"source": e.Source,
-				"target": e.Target,
-				"label":  "calls",
-				"count":  e.Requests,
-			},
-		})
-	}
-	sort.Slice(edges, func(i, j int) bool {
-		si := edges[i].Data["source"].(string)
-		sj := edges[j].Data["source"].(string)
-		if si != sj {
-			return si < sj
-		}
-		return edges[i].Data["target"].(string) < edges[j].Data["target"].(string)
-	})
-
-	return CytoscapeResult{Nodes: nodes, Edges: edges}
-}
-
-func statusFromErrorRate(rate float64) string {
-	if rate >= 0.5 {
-		return "failing"
-	}
-	if rate >= 0.1 {
-		return "degraded"
-	}
-	return "healthy"
-}
diff --git a/internal/graph/analysis/topology_test.go b/internal/graph/analysis/topology_test.go
deleted file mode 100644
index 59154a6..0000000
--- a/internal/graph/analysis/topology_test.go
+++ /dev/null
@@ -1,234 +0,0 @@
-package analysis
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-func TestBuildTopology_BasicGraph(t *testing.T) {
-	now := time.Now().UTC()
-	start := now.Add(-10 * time.Minute)
-	end := now.Add(time.Minute)
-
-	builder := build.NewBuilder()
-	st := graphstore.NewStore()
-	ts := tracestore.NewStore()
-
-	// Event 1: frontend calls api-gateway (success)
-	ev1 := testutil.MakeEvent(
-		testutil.WithService("api-gateway"),
-		testutil.WithCallerService("frontend"),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithTimestamp(now),
-	)
-	upsertTopologyEvent(st, ts, builder, ev1)
-
-	// Event 2: api-gateway calls checkout (success)
-	ev2 := testutil.MakeEvent(
-		testutil.WithService("checkout"),
-		testutil.WithCallerService("api-gateway"),
-		testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-		testutil.WithTraceID("abcdef01234567890abcdef012345678"),
-		testutil.WithTimestamp(now),
-	)
-	upsertTopologyEvent(st, ts, builder, ev2)
-
-	// Event 3: api-gateway calls checkout (error)
-	ev3 := testutil.MakeEvent(
-		testutil.WithService("checkout"),
-		testutil.WithCallerService("api-gateway"),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithTraceID("abcdef01234567890abcdef012345679"),
-		testutil.WithError("CHK_500", "internal error"),
-		testutil.WithTimestamp(now),
-	)
-	upsertTopologyEvent(st, ts, builder, ev3)
-
-	result := BuildTopology(st, ts, start, end)
-
-	// Should have 3 service nodes: frontend, api-gateway, checkout
-	if len(result.Nodes) != 3 {
-		t.Fatalf("expected 3 nodes, got %d", len(result.Nodes))
-	}
-
-	nodeByID := map[string]TopologyNode{}
-	for _, n := range result.Nodes {
-		nodeByID[n.ID] = n
-	}
-
-	// api-gateway: 1 invocation, 0 errors
-	gw, ok := nodeByID["api-gateway"]
-	if !ok {
-		t.Fatal("missing api-gateway node")
-	}
-	if gw.Invocations != 1 {
-		t.Errorf("api-gateway invocations: got %d, want 1", gw.Invocations)
-	}
-	if gw.Errors != 0 {
-		t.Errorf("api-gateway errors: got %d, want 0", gw.Errors)
-	}
-	if gw.Status != "healthy" {
-		t.Errorf("api-gateway status: got %q, want %q", gw.Status, "healthy")
-	}
-
-	// checkout: 2 invocations, 1 error -> error_rate 0.5 -> "failing"
-	ck, ok := nodeByID["checkout"]
-	if !ok {
-		t.Fatal("missing checkout node")
-	}
-	if ck.Invocations != 2 {
-		t.Errorf("checkout invocations: got %d, want 2", ck.Invocations)
-	}
-	if ck.Errors != 1 {
-		t.Errorf("checkout errors: got %d, want 1", ck.Errors)
-	}
-	if ck.Status != "failing" {
-		t.Errorf("checkout status: got %q, want %q", ck.Status, "failing")
-	}
-
-	// Should have 2 edges: frontend->api-gateway, api-gateway->checkout
-	if len(result.Edges) != 2 {
-		t.Fatalf("expected 2 edges, got %d", len(result.Edges))
-	}
-
-	edgeFound := map[string]bool{}
-	for _, e := range result.Edges {
-		edgeFound[e.Source+"->"+e.Target] = true
-	}
-	if !edgeFound["frontend->api-gateway"] {
-		t.Error("missing edge frontend->api-gateway")
-	}
-	if !edgeFound["api-gateway->checkout"] {
-		t.Error("missing edge api-gateway->checkout")
-	}
-
-	// Check edge stats for api-gateway->checkout
-	for _, e := range result.Edges {
-		if e.Source == "api-gateway" && e.Target == "checkout" {
-			if e.Requests != 2 {
-				t.Errorf("edge requests: got %d, want 2", e.Requests)
-			}
-			if e.Failures != 1 {
-				t.Errorf("edge failures: got %d, want 1", e.Failures)
-			}
-		}
-	}
-}
-
-func TestBuildTopology_EmptyGraph(t *testing.T) {
-	st := graphstore.NewStore()
-	ts := tracestore.NewStore()
-
-	now := time.Now().UTC()
-	result := BuildTopology(st, ts, now.Add(-time.Hour), now)
-
-	if result.Nodes == nil {
-		t.Error("Nodes should be non-nil empty slice")
-	}
-	if result.Edges == nil {
-		t.Error("Edges should be non-nil empty slice")
-	}
-	if len(result.Nodes) != 0 {
-		t.Errorf("expected 0 nodes, got %d", len(result.Nodes))
-	}
-	if len(result.Edges) != 0 {
-		t.Errorf("expected 0 edges, got %d", len(result.Edges))
-	}
-}
-
-func TestBuildTopology_WindowFiltering(t *testing.T) {
-	now := time.Now().UTC()
-	builder := build.NewBuilder()
-	st := graphstore.NewStore()
-	ts := tracestore.NewStore()
-
-	// Event inside window
-	ev1 := testutil.MakeEvent(
-		testutil.WithService("svc-a"),
-		testutil.WithCallerService("svc-b"),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithTimestamp(now),
-	)
-	upsertTopologyEvent(st, ts, builder, ev1)
-
-	// Event outside window (2 hours ago)
-	ev2 := testutil.MakeEvent(
-		testutil.WithService("svc-old"),
-		testutil.WithCallerService("svc-ancient"),
-		testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-		testutil.WithTraceID("abcdef01234567890abcdef012345678"),
-		testutil.WithTimestamp(now.Add(-2*time.Hour)),
-	)
-	upsertTopologyEvent(st, ts, builder, ev2)
-
-	// Window: last 30 minutes
-	result := BuildTopology(st, ts, now.Add(-30*time.Minute), now.Add(time.Minute))
-
-	// Only svc-a and svc-b should appear (not svc-old/svc-ancient)
-	if len(result.Nodes) != 2 {
-		t.Errorf("expected 2 nodes within window, got %d", len(result.Nodes))
-	}
-}
-
-func upsertTopologyEvent(st *graphstore.Store, ts *tracestore.Store, builder *build.Builder, ev event.WideEvent) {
-	result := builder.BuildResult(ev)
-	st.Merge(result.Graph)
-	if result.Span != nil {
-		ts.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-	}
-}
-
-func TestToCytoscapeFormat(t *testing.T) {
-	result := TopologyResult{
-		Nodes: []TopologyNode{
-			{ID: "svc-a", Label: "svc-a", Status: "healthy", Invocations: 10, Errors: 0, ErrorRate: 0},
-			{ID: "svc-b", Label: "svc-b", Status: "degraded", Invocations: 20, Errors: 3, ErrorRate: 0.15},
-		},
-		Edges: []TopologyEdge{
-			{Source: "svc-a", Target: "svc-b", Requests: 15, Failures: 3},
-		},
-	}
-
-	cy := ToCytoscapeFormat(result)
-
-	if len(cy.Nodes) != 2 {
-		t.Fatalf("expected 2 cytoscape nodes, got %d", len(cy.Nodes))
-	}
-	if len(cy.Edges) != 1 {
-		t.Fatalf("expected 1 cytoscape edge, got %d", len(cy.Edges))
-	}
-
-	// Check node data fields
-	n0 := cy.Nodes[0].Data
-	if n0["id"] != "svc-a" {
-		t.Errorf("node id: got %v, want svc-a", n0["id"])
-	}
-	if n0["type"] != "service" {
-		t.Errorf("node type: got %v, want service", n0["type"])
-	}
-	if n0["invocations"] != 10 {
-		t.Errorf("node invocations: got %v, want 10", n0["invocations"])
-	}
-
-	// Check edge data fields (Cytoscape format uses "count" and "label":"calls")
-	e0 := cy.Edges[0].Data
-	if e0["source"] != "svc-a" {
-		t.Errorf("edge source: got %v, want svc-a", e0["source"])
-	}
-	if e0["target"] != "svc-b" {
-		t.Errorf("edge target: got %v, want svc-b", e0["target"])
-	}
-	if e0["count"] != 15 {
-		t.Errorf("edge count: got %v, want 15", e0["count"])
-	}
-	if e0["label"] != "calls" {
-		t.Errorf("edge label: got %v, want calls", e0["label"])
-	}
-}
diff --git a/internal/graph/build/builder.go b/internal/graph/build/builder.go
deleted file mode 100644
index 090133a..0000000
--- a/internal/graph/build/builder.go
+++ /dev/null
@@ -1,227 +0,0 @@
-package build
-
-import (
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-type Builder struct{}
-
-type BuildResult struct {
-	Graph *core.Graph
-	Span  *tracestore.SpanRecord
-}
-
-func NewBuilder() *Builder {
-	return &Builder{}
-}
-
-func (b *Builder) Build(ev event.WideEvent) *core.Graph {
-	return b.BuildResult(ev).Graph
-}
-
-func (b *Builder) BuildResult(ev event.WideEvent) BuildResult {
-	g := core.New()
-	errID := ""
-	isRoot := ev.Request.SpanID != "" && ev.Request.ParentSpanID == ""
-	var span *tracestore.SpanRecord
-
-	// --------------------
-	// Request node
-	// --------------------
-	reqID := core.ID("request", ev.Request.TraceID)
-	req := core.Node{
-		ID:   reqID,
-		Type: core.NodeRequest,
-		Attr: map[string]any{
-			"event_name":     ev.EventName,
-			"trace_id":       ev.Request.TraceID,
-			"flow":           ev.Request.Flow,
-			"latency_ms":     ev.Metrics.LatencyMs,
-			"success":        ev.Outcome.Success,
-			"status_code":    ev.Outcome.StatusCode,
-			"service":        ev.System.Service,
-			"is_root":        isRoot,
-			"http_method":    ev.Request.HTTPMethod,
-			"route_template": ev.Request.RouteTemplate,
-			"version":        ev.System.Version,
-			"user_id":        ev.User.ID,
-			"user_tier":      ev.User.Tier,
-			"user_region":    ev.User.Region,
-			"user_vip":       ev.User.VIP,
-			"feature_flags":  append([]string(nil), ev.Request.FeatureFlags...),
-		},
-	}
-	if ev.Error != nil {
-		req.Attr["error_code"] = ev.Error.Code
-		if ev.Error.Path != "" {
-			req.Attr["error_path"] = ev.Error.Path
-		}
-		if ev.Error.Reason != "" {
-			req.Attr["error_reason"] = ev.Error.Reason
-		}
-	}
-	if ev.ParentRequestID != "" {
-		req.Attr["parent_request_id"] = ev.ParentRequestID
-	}
-	if ev.Retry != nil {
-		if ev.Retry.Of != 0 {
-			req.Attr["retry_of"] = ev.Retry.Of
-		}
-		if ev.Retry.PreviousAttemptID != "" {
-			req.Attr["retry_previous_attempt_id"] = ev.Retry.PreviousAttemptID
-		}
-	}
-	if len(ev.Metadata) > 0 {
-		req.Attr["metadata"] = ev.Metadata
-	}
-	touch(&req, ev.Timestamp)
-	g.AddNode(req)
-
-	// --------------------
-	// Service node
-	// --------------------
-	svcID := core.ID(
-		"service",
-		ev.System.Service,
-		ev.System.Env,
-	)
-	svc := core.Node{
-		ID:   svcID,
-		Type: core.NodeService,
-		Attr: map[string]any{
-			"name":          ev.System.Service,
-			"env":           ev.System.Env,
-			"version":       ev.System.Version,
-			"deployment_id": ev.System.DeploymentID,
-		},
-	}
-	touch(&svc, ev.Timestamp)
-	g.AddNode(svc)
-
-	g.AddEdge(core.Edge{
-		From: reqID,
-		To:   svcID,
-		Type: core.EdgeHandledBy,
-	})
-
-	// --------------------
-	// Service-to-service call edge
-	// --------------------
-	if ev.System.CallerService != "" {
-		callerID := core.ID("service", ev.System.CallerService, ev.System.Env)
-
-		// Ensure caller service node exists
-		caller := core.Node{
-			ID:   callerID,
-			Type: core.NodeService,
-			Attr: map[string]any{
-				"env": ev.System.Env,
-			},
-		}
-		touch(&caller, ev.Timestamp)
-		g.AddNode(caller)
-
-		// caller_service -> calls -> service
-		g.AddEdge(core.Edge{
-			From: callerID,
-			To:   svcID,
-			Type: core.EdgeCalls,
-		})
-	}
-	// --------------------
-	// Downstream service dependency
-	// --------------------
-	if ev.System.DownstreamService != "" {
-		downID := core.ID("service", ev.System.DownstreamService, ev.System.Env)
-
-		down := core.Node{
-			ID:   downID,
-			Type: core.NodeService,
-			Attr: map[string]any{
-				"env": ev.System.Env,
-			},
-		}
-		touch(&down, ev.Timestamp)
-		g.AddNode(down)
-
-		g.AddEdge(core.Edge{
-			From: svcID,
-			To:   downID,
-			Type: core.EdgeCalls,
-		})
-	}
-	// --------------------
-	// Trace store span record
-	// --------------------
-	if ev.Request.SpanID != "" {
-		span = &tracestore.SpanRecord{
-			SpanID:            ev.Request.SpanID,
-			ParentSpanID:      ev.Request.ParentSpanID,
-			Service:           ev.System.Service,
-			EventName:         ev.EventName,
-			StatusCode:        ev.Outcome.StatusCode,
-			Success:           ev.Outcome.Success,
-			LatencyMs:         ev.Metrics.LatencyMs,
-			CallerService:     ev.System.CallerService,
-			DownstreamService: ev.System.DownstreamService,
-			Timestamp:         ev.Timestamp,
-			HTTPMethod:        ev.Request.HTTPMethod,
-			RouteTemplate:     ev.Request.RouteTemplate,
-		}
-		if ev.Error != nil {
-			span.ErrorCode = ev.Error.Code
-			span.ErrorMessage = ev.Error.Message
-			span.ErrorPath = ev.Error.Path
-			span.ErrorReason = ev.Error.Reason
-		}
-		if ev.Retry != nil {
-			span.RetryOf = ev.Retry.Of
-			span.RetryPreviousID = ev.Retry.PreviousAttemptID
-		}
-		if len(ev.Metadata) > 0 {
-			span.Metadata = ev.Metadata
-		}
-	}
-
-	// --------------------
-	// Error node
-	// --------------------
-	if ev.Error != nil {
-		errID = core.ID("error", ev.Error.Code)
-		errNode := core.Node{
-			ID:   errID,
-			Type: core.NodeError,
-			Attr: map[string]any{
-				"code":    ev.Error.Code,
-				"message": ev.Error.Message,
-				"service": ev.System.Service,
-			},
-		}
-		touch(&errNode, ev.Timestamp)
-		g.AddNode(errNode)
-
-		g.AddEdge(core.Edge{
-			From: reqID,
-			To:   errID,
-			Type: core.EdgeFailedWith,
-		})
-	}
-
-	return BuildResult{Graph: g, Span: span}
-}
-
-func touch(n *core.Node, ts time.Time) {
-	if ts.IsZero() {
-		return
-	}
-	if n.FirstSeen.IsZero() || ts.Before(n.FirstSeen) {
-		n.FirstSeen = ts
-	}
-	if n.LastSeen.IsZero() || ts.After(n.LastSeen) {
-		n.LastSeen = ts
-	}
-}
diff --git a/internal/graph/build/builder_test.go b/internal/graph/build/builder_test.go
deleted file mode 100644
index ab63421..0000000
--- a/internal/graph/build/builder_test.go
+++ /dev/null
@@ -1,488 +0,0 @@
-package build
-
-import (
-	"testing"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func TestBuilder_Build_RequestNode(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID("0123456789abcdef0123456789abcdef"),
-		testutil.WithService("payment-service"),
-		testutil.WithLatency(100),
-		testutil.WithStatusCode(200),
-	)
-
-	g := builder.Build(ev)
-
-	// Check request node exists
-	reqID := core.ID("request", ev.Request.TraceID)
-	req, ok := g.Nodes[reqID]
-	if !ok {
-		t.Fatalf("expected request node %s to exist", reqID)
-	}
-	if req.Type != core.NodeRequest {
-		t.Errorf("expected node type %v, got %v", core.NodeRequest, req.Type)
-	}
-	if req.Attr["trace_id"] != ev.Request.TraceID {
-		t.Errorf("expected trace_id %s, got %v", ev.Request.TraceID, req.Attr["trace_id"])
-	}
-}
-
-func TestBuilder_Build_ErrorNode(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithError("ERR_PAYMENT_FAILED", "Payment processing failed"),
-	)
-
-	g := builder.Build(ev)
-
-	// Check error node exists
-	errID := core.ID("error", ev.Error.Code)
-	errNode, ok := g.Nodes[errID]
-	if !ok {
-		t.Fatalf("expected error node %s to exist", errID)
-	}
-	if errNode.Type != core.NodeError {
-		t.Errorf("expected node type %v, got %v", core.NodeError, errNode.Type)
-	}
-	if errNode.Attr["code"] != ev.Error.Code {
-		t.Errorf("expected code %s, got %v", ev.Error.Code, errNode.Attr["code"])
-	}
-
-	// Check request → error edge exists
-	reqID := core.ID("request", ev.Request.TraceID)
-	hasEdge := false
-	for _, e := range g.Edges {
-		if e.From == reqID && e.To == errID && e.Type == core.EdgeFailedWith {
-			hasEdge = true
-			break
-		}
-	}
-	if !hasEdge {
-		t.Error("expected request→error edge with EdgeFailedWith")
-	}
-
-	req, ok := g.Nodes[reqID]
-	if !ok {
-		t.Fatalf("expected request node %s to exist", reqID)
-	}
-	if got := req.Attr["error_code"]; got != ev.Error.Code {
-		t.Errorf("request attr error_code = %v, want %s", got, ev.Error.Code)
-	}
-}
-
-func TestBuilder_Build_SpanErrorEdge_OnlyWhenBothExist(t *testing.T) {
-	builder := NewBuilder()
-
-	t.Run("no span, with error", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID(""), // No span
-			testutil.WithError("ERR_TEST", "Test error"),
-		)
-
-		g := builder.Build(ev)
-
-		// Error node should exist
-		errID := core.ID("error", ev.Error.Code)
-		if _, ok := g.Nodes[errID]; !ok {
-			t.Fatal("expected error node to exist")
-		}
-
-		// No span→error edge should exist
-		for _, e := range g.Edges {
-			if e.Type == core.EdgeFailedWith && e.To == errID {
-				fromNode := g.Nodes[e.From]
-				if fromNode.Type == core.NodeSpan {
-					t.Error("should not have span→error edge when there's no span")
-				}
-			}
-		}
-
-		if got := builder.BuildResult(ev).Span; got != nil {
-			t.Fatalf("expected no trace-store span record, got %+v", got)
-		}
-	})
-
-	t.Run("with span, no error", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID("0123456789abcdef"),
-			// No error
-		)
-
-		result := builder.BuildResult(ev)
-		if result.Span == nil {
-			t.Fatal("expected trace-store span record to exist")
-		}
-		if result.Span.SpanID != ev.Request.SpanID {
-			t.Fatalf("span record span_id = %q, want %q", result.Span.SpanID, ev.Request.SpanID)
-		}
-	})
-
-	t.Run("with span and error", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID("0123456789abcdef"),
-			testutil.WithError("ERR_TEST", "Test error"),
-		)
-
-		g := builder.Build(ev)
-		errID := core.ID("error", ev.Error.Code)
-
-		if _, ok := g.Nodes[errID]; !ok {
-			t.Fatal("expected error node to exist")
-		}
-
-		hasRequestErrorEdge := false
-		for _, e := range g.Edges {
-			if e.To == errID && e.Type == core.EdgeFailedWith {
-				fromNode := g.Nodes[e.From]
-				if fromNode.Type == core.NodeSpan {
-					t.Fatalf("unexpected legacy span→error edge %q -> %q", e.From, e.To)
-				}
-				if fromNode.Type == core.NodeRequest {
-					hasRequestErrorEdge = true
-				}
-			}
-		}
-		if !hasRequestErrorEdge {
-			t.Error("expected request→error edge with EdgeFailedWith")
-		}
-
-		result := builder.BuildResult(ev)
-		if result.Span == nil {
-			t.Fatal("expected trace-store span record to exist")
-		}
-		if result.Span.ErrorCode != ev.Error.Code {
-			t.Fatalf("span record error_code = %q, want %q", result.Span.ErrorCode, ev.Error.Code)
-		}
-	})
-}
-
-func TestBuilder_Build_NoEmptyEdges(t *testing.T) {
-	builder := NewBuilder()
-
-	testCases := []struct {
-		name string
-		opts []testutil.EventOption
-	}{
-		{
-			name: "success event",
-			opts: []testutil.EventOption{testutil.WithStatusCode(200)},
-		},
-		{
-			name: "error event",
-			opts: []testutil.EventOption{testutil.WithError("ERR_TEST", "test")},
-		},
-		{
-			name: "error with span",
-			opts: []testutil.EventOption{
-				testutil.WithSpanID("0123456789abcdef"),
-				testutil.WithError("ERR_TEST", "test"),
-			},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			ev := testutil.MakeEvent(tc.opts...)
-			g := builder.Build(ev)
-
-			// Check no edge has empty From or To
-			for _, e := range g.Edges {
-				if e.From == "" {
-					t.Errorf("edge has empty From: %+v", e)
-				}
-				if e.To == "" {
-					t.Errorf("edge has empty To: %+v", e)
-				}
-			}
-		})
-	}
-}
-
-func TestBuilder_Build_ServiceNode(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithService("api-gateway"),
-	)
-
-	g := builder.Build(ev)
-
-	svcID := core.ID("service", ev.System.Service, ev.System.Env)
-	svc, ok := g.Nodes[svcID]
-	if !ok {
-		t.Fatalf("expected service node %s to exist", svcID)
-	}
-	if svc.Type != core.NodeService {
-		t.Errorf("expected node type %v, got %v", core.NodeService, svc.Type)
-	}
-	if svc.Attr["name"] != ev.System.Service {
-		t.Errorf("expected name %s, got %v", ev.System.Service, svc.Attr["name"])
-	}
-}
-
-func TestBuilder_Build_UserAttrsOnRequestNode(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithUser("user-456", "premium", "eu-west-1"),
-	)
-
-	g := builder.Build(ev)
-
-	reqID := core.ID("request", ev.Request.TraceID)
-	req, ok := g.Nodes[reqID]
-	if !ok {
-		t.Fatalf("expected request node %s to exist", reqID)
-	}
-	if req.Attr["user_tier"] != ev.User.Tier {
-		t.Errorf("expected user_tier %s, got %v", ev.User.Tier, req.Attr["user_tier"])
-	}
-	if req.Attr["user_id"] != ev.User.ID {
-		t.Errorf("expected user_id %s, got %v", ev.User.ID, req.Attr["user_id"])
-	}
-	if req.Attr["user_region"] != ev.User.Region {
-		t.Errorf("expected user_region %s, got %v", ev.User.Region, req.Attr["user_region"])
-	}
-	if _, ok := g.Nodes[core.ID("user", ev.User.ID)]; ok {
-		t.Fatal("legacy user node should not be present in flattened graph")
-	}
-}
-
-// TestBuilder_Build_EmptyUserID_NoUserNodeOrEdge is a regression guard for
-// OTLP ingestion, which produces events with no user concept. The flattened
-// graph stores user info as attributes on the request node (not as a
-// separate user node), so empty User.ID must not produce synthetic nodes
-// or edges. Downstream analysis (blast, explain) already guards on
-// non-empty user_id, so empty attrs are inert.
-func TestBuilder_Build_EmptyUserID_NoUserNodeOrEdge(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithUser("", "", ""),
-	)
-
-	g := builder.Build(ev)
-
-	// No user nodes at all (flattened graph invariant).
-	for _, n := range g.Nodes {
-		if n.Type == core.NodeUser {
-			t.Errorf("expected no user node when User.ID is empty, found %s", n.ID)
-		}
-	}
-	// No request_by edges at all (flattened graph invariant).
-	for _, e := range g.Edges {
-		if e.Type == core.EdgeRequestBy {
-			t.Errorf("expected no request_by edge when User.ID is empty, found %s→%s", e.From, e.To)
-		}
-	}
-	// Request and service nodes still build normally.
-	reqID := core.ID("request", ev.Request.TraceID)
-	if _, ok := g.Nodes[reqID]; !ok {
-		t.Error("expected request node to still be built")
-	}
-	svcID := core.ID("service", ev.System.Service, ev.System.Env)
-	if _, ok := g.Nodes[svcID]; !ok {
-		t.Error("expected service node to still be built")
-	}
-	// User attrs on the request node are empty strings, not synthetic values.
-	if req, ok := g.Nodes[reqID]; ok {
-		if v, _ := req.Attr["user_id"].(string); v != "" {
-			t.Errorf("expected empty user_id attr, got %q", v)
-		}
-		if v, _ := req.Attr["user_tier"].(string); v != "" {
-			t.Errorf("expected empty user_tier attr, got %q", v)
-		}
-	}
-}
-
-func TestBuilder_Build_CallerService(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithService("checkout"),
-		testutil.WithCallerService("frontend"),
-	)
-
-	g := builder.Build(ev)
-
-	callerID := core.ID("service", ev.System.CallerService, ev.System.Env)
-	if _, ok := g.Nodes[callerID]; !ok {
-		t.Fatalf("expected caller service node %s to exist", callerID)
-	}
-
-	// Check calls edge exists
-	svcID := core.ID("service", ev.System.Service, ev.System.Env)
-	hasEdge := false
-	for _, e := range g.Edges {
-		if e.From == callerID && e.To == svcID && e.Type == core.EdgeCalls {
-			hasEdge = true
-			break
-		}
-	}
-	if !hasEdge {
-		t.Error("expected caller→service edge with EdgeCalls")
-	}
-}
-
-func TestBuilder_Build_DownstreamService(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithService("checkout"),
-		testutil.WithDownstreamService("payment"),
-	)
-
-	g := builder.Build(ev)
-
-	downID := core.ID("service", ev.System.DownstreamService, ev.System.Env)
-	if _, ok := g.Nodes[downID]; !ok {
-		t.Fatalf("expected downstream service node %s to exist", downID)
-	}
-
-	// Check calls edge exists
-	svcID := core.ID("service", ev.System.Service, ev.System.Env)
-	hasEdge := false
-	for _, e := range g.Edges {
-		if e.From == svcID && e.To == downID && e.Type == core.EdgeCalls {
-			hasEdge = true
-			break
-		}
-	}
-	if !hasEdge {
-		t.Error("expected service→downstream edge with EdgeCalls")
-	}
-}
-
-func TestBuilder_Build_FeatureFlagsOnRequestNode(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithFeatureFlags("dark-mode", "new-checkout"),
-	)
-
-	g := builder.Build(ev)
-
-	reqID := core.ID("request", ev.Request.TraceID)
-	req := g.Nodes[reqID]
-	flags, ok := req.Attr["feature_flags"].([]string)
-	if !ok {
-		t.Fatalf("feature_flags attr should be []string, got %T", req.Attr["feature_flags"])
-	}
-	for i, flag := range ev.Request.FeatureFlags {
-		if flags[i] != flag {
-			t.Fatalf("feature_flags[%d] = %q, want %q", i, flags[i], flag)
-		}
-		if _, ok := g.Nodes[core.ID("feature_flag", flag)]; ok {
-			t.Fatalf("legacy feature_flag node for %q should not be present", flag)
-		}
-	}
-}
-
-func TestBuilder_Build_SpanRecordEnrichedAttrs(t *testing.T) {
-	builder := NewBuilder()
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID("0123456789abcdef0123456789abcdef"),
-		testutil.WithSpanID("0123456789abcdef"),
-		testutil.WithService("payment-service"),
-		testutil.WithLatency(42),
-		testutil.WithStatusCode(502),
-		testutil.WithCallerService("checkout"),
-		testutil.WithDownstreamService("stripe"),
-		testutil.WithError("PMT_502", "payment failed"),
-	)
-
-	result := builder.BuildResult(ev)
-	if result.Span == nil {
-		t.Fatal("expected trace-store span record")
-	}
-
-	checks := map[string]any{
-		"SpanID":            ev.Request.SpanID,
-		"ParentSpanID":      ev.Request.ParentSpanID,
-		"Service":           "payment-service",
-		"EventName":         ev.EventName,
-		"StatusCode":        502,
-		"Success":           false,
-		"LatencyMs":         int64(42),
-		"CallerService":     "checkout",
-		"DownstreamService": "stripe",
-		"ErrorCode":         "PMT_502",
-	}
-	if result.Span.SpanID != checks["SpanID"] {
-		t.Fatalf("SpanID = %q, want %q", result.Span.SpanID, checks["SpanID"])
-	}
-	if result.Span.ParentSpanID != checks["ParentSpanID"] {
-		t.Fatalf("ParentSpanID = %q, want %q", result.Span.ParentSpanID, checks["ParentSpanID"])
-	}
-	if result.Span.Service != checks["Service"] {
-		t.Fatalf("Service = %q, want %q", result.Span.Service, checks["Service"])
-	}
-	if result.Span.EventName != checks["EventName"] {
-		t.Fatalf("EventName = %q, want %q", result.Span.EventName, checks["EventName"])
-	}
-	if result.Span.StatusCode != checks["StatusCode"] {
-		t.Fatalf("StatusCode = %d, want %v", result.Span.StatusCode, checks["StatusCode"])
-	}
-	if result.Span.Success != checks["Success"] {
-		t.Fatalf("Success = %v, want %v", result.Span.Success, checks["Success"])
-	}
-	if result.Span.LatencyMs != checks["LatencyMs"] {
-		t.Fatalf("LatencyMs = %d, want %v", result.Span.LatencyMs, checks["LatencyMs"])
-	}
-	if result.Span.CallerService != checks["CallerService"] {
-		t.Fatalf("CallerService = %q, want %q", result.Span.CallerService, checks["CallerService"])
-	}
-	if result.Span.DownstreamService != checks["DownstreamService"] {
-		t.Fatalf("DownstreamService = %q, want %q", result.Span.DownstreamService, checks["DownstreamService"])
-	}
-	if result.Span.ErrorCode != checks["ErrorCode"] {
-		t.Fatalf("ErrorCode = %q, want %q", result.Span.ErrorCode, checks["ErrorCode"])
-	}
-	if result.Span.Timestamp.IsZero() {
-		t.Fatal("expected Timestamp to be populated")
-	}
-}
-
-func TestBuilder_Build_RequestIsRoot(t *testing.T) {
-	builder := NewBuilder()
-
-	t.Run("root span (no parent)", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID("0123456789abcdef"),
-			testutil.WithParentSpanID(""),
-		)
-		g := builder.Build(ev)
-		reqID := core.ID("request", ev.Request.TraceID)
-		req := g.Nodes[reqID]
-		isRoot, ok := req.Attr["is_root"].(bool)
-		if !ok || !isRoot {
-			t.Errorf("expected is_root=true for root span, got %v", req.Attr["is_root"])
-		}
-	})
-
-	t.Run("child span (has parent)", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID("0123456789abcdef"),
-			testutil.WithParentSpanID("fedcba9876543210"),
-		)
-		g := builder.Build(ev)
-		reqID := core.ID("request", ev.Request.TraceID)
-		req := g.Nodes[reqID]
-		isRoot, ok := req.Attr["is_root"].(bool)
-		if !ok || isRoot {
-			t.Errorf("expected is_root=false for child span, got %v", req.Attr["is_root"])
-		}
-	})
-
-	t.Run("no span id", func(t *testing.T) {
-		ev := testutil.MakeEvent(
-			testutil.WithSpanID(""),
-		)
-		g := builder.Build(ev)
-		reqID := core.ID("request", ev.Request.TraceID)
-		req := g.Nodes[reqID]
-		isRoot, ok := req.Attr["is_root"].(bool)
-		if !ok || isRoot {
-			t.Errorf("expected is_root=false when span_id is empty, got %v", req.Attr["is_root"])
-		}
-	})
-}
diff --git a/internal/graph/causal/infer.go b/internal/graph/causal/infer.go
deleted file mode 100644
index 11d6e78..0000000
--- a/internal/graph/causal/infer.go
+++ /dev/null
@@ -1,169 +0,0 @@
-package causal
-
-import (
-	"sort"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-const (
-	minAfterFailures  = 30
-	maxBeforeFailures = 5
-	minLift           = 3.0
-	maxDeployGap      = 30 * time.Minute
-)
-
-// InferIntroducedBy scans the graph for error spikes that follow a deployment
-// and returns causal claims linking error codes to the deploy that likely
-// introduced them. All returned claims have ShadowMode=true.
-//
-// deps must have their FirstSeen within [start, end]; callers should pre-filter
-// using coldstore.DeploymentsInWindow and convert to []DeploymentInfo.
-//
-// A claim is emitted when:
-//   - Exactly one deployment for the service falls inside [start, end].
-//   - After-failures >= minAfterFailures and before-failures <= maxBeforeFailures.
-//   - The first post-deploy failure occurs within maxDeployGap of the deploy.
-//   - Lift (after / before, Laplace-smoothed) >= minLift.
-func InferIntroducedBy(g *core.Graph, deps []DeploymentInfo, start, end time.Time) []Claim {
-	// Bucket deployments by service, filtered to the analysis window.
-	svcDeploys := map[string][]DeploymentInfo{}
-	for _, d := range deps {
-		if d.FirstSeen.Before(start) || d.FirstSeen.After(end) {
-			continue
-		}
-		svcDeploys[d.Service] = append(svcDeploys[d.Service], d)
-	}
-
-	// Keep only services with exactly one deployment (ambiguous otherwise).
-	uniqueDeploys := map[string]DeploymentInfo{}
-	for svc, ds := range svcDeploys {
-		if len(ds) == 1 {
-			uniqueDeploys[svc] = ds[0]
-		}
-	}
-
-	if len(uniqueDeploys) == 0 {
-		return nil
-	}
-
-	type failureKey struct {
-		service   string
-		errorCode string
-	}
-	type failureCounts struct {
-		before       int
-		after        int
-		firstFailure time.Time
-	}
-	counts := map[failureKey]*failureCounts{}
-
-	for _, e := range g.Edges {
-		if e.Type != core.EdgeFailedWith {
-			continue
-		}
-		reqNode, ok := g.Nodes[e.From]
-		if !ok || reqNode.Type != core.NodeRequest {
-			continue
-		}
-		errNode, ok := g.Nodes[e.To]
-		if !ok || errNode.Type != core.NodeError {
-			continue
-		}
-
-		if reqNode.LastSeen.Before(start) || reqNode.LastSeen.After(end) {
-			continue
-		}
-
-		svc := core.ServiceFromNode(reqNode)
-		if svc == "" {
-			continue
-		}
-
-		deploy, ok := uniqueDeploys[svc]
-		if !ok {
-			continue
-		}
-
-		code, _ := errNode.Attr["code"].(string)
-		if code == "" {
-			continue
-		}
-
-		key := failureKey{service: svc, errorCode: code}
-		fc, ok := counts[key]
-		if !ok {
-			fc = &failureCounts{}
-			counts[key] = fc
-		}
-
-		if reqNode.LastSeen.Before(deploy.FirstSeen) {
-			fc.before++
-		} else {
-			fc.after++
-			if fc.firstFailure.IsZero() || reqNode.LastSeen.Before(fc.firstFailure) {
-				fc.firstFailure = reqNode.LastSeen
-			}
-		}
-	}
-
-	var claims []Claim
-	for key, fc := range counts {
-		if fc.after < minAfterFailures {
-			continue
-		}
-		if fc.before > maxBeforeFailures {
-			continue
-		}
-
-		deploy := uniqueDeploys[key.service]
-
-		if fc.firstFailure.IsZero() || fc.firstFailure.Sub(deploy.FirstSeen) > maxDeployGap {
-			continue
-		}
-
-		// Laplace smoothing: avoid divide-by-zero; treat zero prior as 0.5.
-		beforeRate := float64(fc.before)
-		if beforeRate == 0 {
-			beforeRate = 0.5
-		}
-		lift := float64(fc.after) / beforeRate
-		if lift < minLift {
-			continue
-		}
-
-		timeDelta := fc.firstFailure.Sub(deploy.FirstSeen)
-		ev := Evidence{
-			BeforeFailures: fc.before,
-			AfterFailures:  fc.after,
-			Lift:           lift,
-			TimeDeltaMin:   timeDelta.Minutes(),
-			WindowMinutes:  end.Sub(start).Minutes(),
-		}
-
-		conf, tier := Score(ev)
-
-		claims = append(claims, Claim{
-			ClaimType:   ClaimIntroducedBy,
-			Subject:     key.errorCode,
-			Target:      deploy.ID,
-			Service:     key.service,
-			Confidence:  conf,
-			Tier:        tier,
-			Evidence:    ev,
-			WindowStart: start,
-			WindowEnd:   end,
-			ShadowMode:  true,
-		})
-	}
-
-	sort.Slice(claims, func(i, j int) bool {
-		if claims[i].Service != claims[j].Service {
-			return claims[i].Service < claims[j].Service
-		}
-		return claims[i].Subject < claims[j].Subject
-	})
-
-	return claims
-}
diff --git a/internal/graph/causal/infer_test.go b/internal/graph/causal/infer_test.go
deleted file mode 100644
index ca7412c..0000000
--- a/internal/graph/causal/infer_test.go
+++ /dev/null
@@ -1,314 +0,0 @@
-package causal
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-// baseTime is the anchor for all test timestamps.
-var baseTime = time.Date(2024, 1, 15, 12, 0, 0, 0, time.UTC)
-
-// buildGraph constructs a core.Graph with the given request→error edges.
-// Each entry in failures is (reqID, service, errorCode, reqTime).
-func buildGraph(failures []struct {
-	reqID     string
-	service   string
-	errorCode string
-	reqTime   time.Time
-}) *core.Graph {
-	g := core.New()
-	for i, f := range failures {
-		errID := f.reqID + "-err"
-		_ = i
-
-		g.AddNode(core.Node{
-			ID:   f.reqID,
-			Type: core.NodeRequest,
-			Attr: map[string]any{
-				"root_service": f.service,
-			},
-			FirstSeen: f.reqTime,
-			LastSeen:  f.reqTime,
-		})
-		g.AddNode(core.Node{
-			ID:   errID,
-			Type: core.NodeError,
-			Attr: map[string]any{
-				"code": f.errorCode,
-			},
-			FirstSeen: f.reqTime,
-			LastSeen:  f.reqTime,
-		})
-		g.AddEdge(core.Edge{
-			From: f.reqID,
-			To:   errID,
-			Type: core.EdgeFailedWith,
-		})
-	}
-	return g
-}
-
-// TestInferIntroducedBy_BasicClaim verifies that a deploy followed by a spike
-// of 30+ failures (0 before) produces a single claim with the right fields.
-func TestInferIntroducedBy_BasicClaim(t *testing.T) {
-	deployTime := baseTime.Add(10 * time.Minute)
-	start := baseTime
-	end := baseTime.Add(60 * time.Minute)
-
-	// Build 30 failures that occurred 5 minutes after the deploy.
-	var failures []struct {
-		reqID     string
-		service   string
-		errorCode string
-		reqTime   time.Time
-	}
-	failTime := deployTime.Add(5 * time.Minute)
-	for i := 0; i < 30; i++ {
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{
-			reqID:     "req-" + string(rune('a'+i%26)) + string(rune('0'+i/26)),
-			service:   "checkout",
-			errorCode: "PMT_502",
-			reqTime:   failTime.Add(time.Duration(i) * time.Second),
-		})
-	}
-
-	g := buildGraph(failures)
-	deps := []DeploymentInfo{
-		{ID: "deploy-1", Service: "checkout", FirstSeen: deployTime},
-	}
-
-	claims := InferIntroducedBy(g, deps, start, end)
-
-	if len(claims) != 1 {
-		t.Fatalf("expected 1 claim, got %d", len(claims))
-	}
-	c := claims[0]
-	if c.ClaimType != ClaimIntroducedBy {
-		t.Errorf("ClaimType = %q, want %q", c.ClaimType, ClaimIntroducedBy)
-	}
-	if c.Subject != "PMT_502" {
-		t.Errorf("Subject = %q, want PMT_502", c.Subject)
-	}
-	if c.Target != "deploy-1" {
-		t.Errorf("Target = %q, want deploy-1", c.Target)
-	}
-	if c.Service != "checkout" {
-		t.Errorf("Service = %q, want checkout", c.Service)
-	}
-	if !c.ShadowMode {
-		t.Error("ShadowMode should be true")
-	}
-	if c.Evidence.AfterFailures != 30 {
-		t.Errorf("AfterFailures = %d, want 30", c.Evidence.AfterFailures)
-	}
-	if c.Evidence.BeforeFailures != 0 {
-		t.Errorf("BeforeFailures = %d, want 0", c.Evidence.BeforeFailures)
-	}
-	// With 0 before → Laplace-smoothed lift = 30/0.5 = 60.
-	if c.Evidence.Lift < minLift {
-		t.Errorf("Lift = %.2f, want >= %.1f", c.Evidence.Lift, minLift)
-	}
-	if c.WindowStart != start || c.WindowEnd != end {
-		t.Error("window start/end mismatch")
-	}
-}
-
-// TestInferIntroducedBy_OutsideWindow ensures deployments whose FirstSeen falls
-// outside [start, end] produce no claims.
-func TestInferIntroducedBy_OutsideWindow(t *testing.T) {
-	start := baseTime
-	end := baseTime.Add(60 * time.Minute)
-
-	// Deploy is 2 hours before the window.
-	deployTime := baseTime.Add(-2 * time.Hour)
-
-	var failures []struct {
-		reqID     string
-		service   string
-		errorCode string
-		reqTime   time.Time
-	}
-	failTime := baseTime.Add(5 * time.Minute) // inside window
-	for i := 0; i < 35; i++ {
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{
-			reqID:     "req-out-" + string(rune('a'+i%26)),
-			service:   "payment",
-			errorCode: "PAY_404",
-			reqTime:   failTime.Add(time.Duration(i) * time.Second),
-		})
-	}
-
-	g := buildGraph(failures)
-	deps := []DeploymentInfo{
-		{ID: "deploy-old", Service: "payment", FirstSeen: deployTime},
-	}
-
-	claims := InferIntroducedBy(g, deps, start, end)
-
-	if len(claims) != 0 {
-		t.Errorf("expected 0 claims for out-of-window deploy, got %d", len(claims))
-	}
-}
-
-// TestInferIntroducedBy_TwoDeploysAmbiguous verifies that two deployments for
-// the same service in the window produce no claims (ambiguous attribution).
-func TestInferIntroducedBy_TwoDeploysAmbiguous(t *testing.T) {
-	start := baseTime
-	end := baseTime.Add(60 * time.Minute)
-
-	deploy1Time := baseTime.Add(5 * time.Minute)
-	deploy2Time := baseTime.Add(20 * time.Minute)
-
-	var failures []struct {
-		reqID     string
-		service   string
-		errorCode string
-		reqTime   time.Time
-	}
-	failTime := deploy1Time.Add(2 * time.Minute)
-	for i := 0; i < 40; i++ {
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{
-			reqID:     "req-amb-" + string(rune('a'+i%26)) + string(rune('0'+i/26)),
-			service:   "api-gateway",
-			errorCode: "GW_500",
-			reqTime:   failTime.Add(time.Duration(i) * time.Second),
-		})
-	}
-
-	g := buildGraph(failures)
-	deps := []DeploymentInfo{
-		{ID: "deploy-a", Service: "api-gateway", FirstSeen: deploy1Time},
-		{ID: "deploy-b", Service: "api-gateway", FirstSeen: deploy2Time},
-	}
-
-	claims := InferIntroducedBy(g, deps, start, end)
-
-	if len(claims) != 0 {
-		t.Errorf("expected 0 claims for ambiguous (2-deploy) service, got %d", len(claims))
-	}
-}
-
-// TestInferIntroducedBy_BelowThreshold verifies that fewer than minAfterFailures
-// failures after the deploy produce no claims.
-func TestInferIntroducedBy_BelowThreshold(t *testing.T) {
-	start := baseTime
-	end := baseTime.Add(60 * time.Minute)
-	deployTime := baseTime.Add(10 * time.Minute)
-
-	// Only 20 failures — below the required 30.
-	var failures []struct {
-		reqID     string
-		service   string
-		errorCode string
-		reqTime   time.Time
-	}
-	failTime := deployTime.Add(3 * time.Minute)
-	for i := 0; i < 20; i++ {
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{
-			reqID:     "req-low-" + string(rune('a'+i%26)),
-			service:   "db-service",
-			errorCode: "DB_TIMEOUT",
-			reqTime:   failTime.Add(time.Duration(i) * time.Second),
-		})
-	}
-
-	g := buildGraph(failures)
-	deps := []DeploymentInfo{
-		{ID: "deploy-db", Service: "db-service", FirstSeen: deployTime},
-	}
-
-	claims := InferIntroducedBy(g, deps, start, end)
-
-	if len(claims) != 0 {
-		t.Errorf("expected 0 claims (below minAfterFailures), got %d", len(claims))
-	}
-}
-
-// TestInferIntroducedBy_Deterministic verifies that the same input always
-// produces the same output in the same order.
-func TestInferIntroducedBy_Deterministic(t *testing.T) {
-	start := baseTime
-	end := baseTime.Add(120 * time.Minute)
-
-	// Two services each with one deployment and 35 post-deploy failures.
-	svcA := struct{ name, code, deployID string }{"svc-alpha", "ALPHA_ERR", "deploy-alpha"}
-	svcB := struct{ name, code, deployID string }{"svc-beta", "BETA_ERR", "deploy-beta"}
-
-	deployTimeA := baseTime.Add(10 * time.Minute)
-	deployTimeB := baseTime.Add(15 * time.Minute)
-
-	var failures []struct {
-		reqID     string
-		service   string
-		errorCode string
-		reqTime   time.Time
-	}
-	for i := 0; i < 35; i++ {
-		t_ := deployTimeA.Add(time.Duration(i+1) * time.Minute)
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{"req-a-" + string(rune('a'+i%26)), svcA.name, svcA.code, t_})
-	}
-	for i := 0; i < 35; i++ {
-		t_ := deployTimeB.Add(time.Duration(i+1) * time.Minute)
-		failures = append(failures, struct {
-			reqID     string
-			service   string
-			errorCode string
-			reqTime   time.Time
-		}{"req-b-" + string(rune('a'+i%26)), svcB.name, svcB.code, t_})
-	}
-
-	g := buildGraph(failures)
-	deps := []DeploymentInfo{
-		{ID: svcA.deployID, Service: svcA.name, FirstSeen: deployTimeA},
-		{ID: svcB.deployID, Service: svcB.name, FirstSeen: deployTimeB},
-	}
-
-	first := InferIntroducedBy(g, deps, start, end)
-	second := InferIntroducedBy(g, deps, start, end)
-
-	if len(first) != len(second) {
-		t.Fatalf("non-deterministic: first=%d claims, second=%d claims", len(first), len(second))
-	}
-	for i := range first {
-		if first[i].Service != second[i].Service || first[i].Subject != second[i].Subject {
-			t.Errorf("claim[%d] differs between runs: first=%+v second=%+v", i, first[i], second[i])
-		}
-	}
-
-	// Also verify ordering: svc-alpha < svc-beta lexicographically.
-	if len(first) == 2 {
-		if first[0].Service != svcA.name {
-			t.Errorf("expected first claim for %q, got %q", svcA.name, first[0].Service)
-		}
-		if first[1].Service != svcB.name {
-			t.Errorf("expected second claim for %q, got %q", svcB.name, first[1].Service)
-		}
-	}
-}
diff --git a/internal/graph/causal/model.go b/internal/graph/causal/model.go
deleted file mode 100644
index 995e106..0000000
--- a/internal/graph/causal/model.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package causal
-
-import "time"
-
-// DeploymentInfo carries the deployment fields needed for causal inference.
-// Callers convert from coldstore.Deployment before calling InferIntroducedBy
-// to avoid an import cycle (coldstore already imports this package).
-type DeploymentInfo struct {
-	ID        string
-	Service   string
-	FirstSeen time.Time
-}
-
-// ClaimType identifies the kind of causal inference.
-type ClaimType string
-
-const ClaimIntroducedBy ClaimType = "introduced_by"
-
-// ConfidenceTier categorizes confidence into actionable bands.
-type ConfidenceTier string
-
-const (
-	TierSupported    ConfidenceTier = "supported"
-	TierProvisional  ConfidenceTier = "provisional"
-	TierInsufficient ConfidenceTier = "insufficient"
-)
-
-// Evidence captures the raw signals behind a causal claim.
-type Evidence struct {
-	BeforeFailures int     `json:"before_failures"`
-	AfterFailures  int     `json:"after_failures"`
-	Lift           float64 `json:"lift"`
-	TimeDeltaMin   float64 `json:"time_delta_min"`
-	WindowMinutes  float64 `json:"window_minutes"`
-}
-
-// Claim is a single causal inference result.
-type Claim struct {
-	ClaimType   ClaimType      `json:"claim_type"`
-	Subject     string         `json:"subject"`
-	Target      string         `json:"target"`
-	Service     string         `json:"service"`
-	Confidence  float64        `json:"confidence"`
-	Tier        ConfidenceTier `json:"tier"`
-	Evidence    Evidence       `json:"evidence"`
-	WindowStart time.Time      `json:"window_start"`
-	WindowEnd   time.Time      `json:"window_end"`
-	ShadowMode  bool           `json:"shadow_mode"`
-}
diff --git a/internal/graph/causal/score.go b/internal/graph/causal/score.go
deleted file mode 100644
index b2c5daf..0000000
--- a/internal/graph/causal/score.go
+++ /dev/null
@@ -1,48 +0,0 @@
-package causal
-
-import "math"
-
-// Score computes a confidence value (0.0–1.0) and a tier from the evidence.
-func Score(ev Evidence) (float64, ConfidenceTier) {
-	liftScore := 0.0
-	if ev.Lift > 1 {
-		liftScore = math.Log2(ev.Lift) / math.Log2(100)
-		if liftScore > 1 {
-			liftScore = 1
-		}
-	}
-
-	proximityScore := 0.0
-	if ev.WindowMinutes > 0 {
-		proximityScore = 1.0 - (ev.TimeDeltaMin / ev.WindowMinutes)
-		if proximityScore < 0 {
-			proximityScore = 0
-		}
-	}
-
-	volumeScore := 0.0
-	if ev.AfterFailures > 1 {
-		volumeScore = math.Log10(float64(ev.AfterFailures)) / math.Log10(1000)
-		if volumeScore > 1 {
-			volumeScore = 1
-		}
-	}
-
-	conf := 0.50*liftScore + 0.25*proximityScore + 0.25*volumeScore
-
-	if conf > 1 {
-		conf = 1
-	}
-	if conf < 0 {
-		conf = 0
-	}
-
-	tier := TierInsufficient
-	if conf >= 0.85 {
-		tier = TierSupported
-	} else if conf >= 0.70 {
-		tier = TierProvisional
-	}
-
-	return conf, tier
-}
diff --git a/internal/graph/causal/score_test.go b/internal/graph/causal/score_test.go
deleted file mode 100644
index 26f1d57..0000000
--- a/internal/graph/causal/score_test.go
+++ /dev/null
@@ -1,98 +0,0 @@
-package causal
-
-import (
-	"testing"
-)
-
-func TestScore(t *testing.T) {
-	tests := []struct {
-		name        string
-		evidence    Evidence
-		wantTier    ConfidenceTier
-		wantMinConf float64
-		wantMaxConf float64
-	}{
-		{
-			name: "high lift strong signal → supported",
-			evidence: Evidence{
-				BeforeFailures: 2,
-				AfterFailures:  100,
-				Lift:           50.0,
-				TimeDeltaMin:   5.0,
-				WindowMinutes:  30.0,
-			},
-			wantTier:    TierProvisional,
-			wantMinConf: 0.75,
-			wantMaxConf: 0.85,
-		},
-		{
-			name: "moderate lift → provisional",
-			evidence: Evidence{
-				BeforeFailures: 3,
-				AfterFailures:  40,
-				Lift:           5.0,
-				TimeDeltaMin:   15.0,
-				WindowMinutes:  30.0,
-			},
-			wantTier:    TierInsufficient,
-			wantMinConf: 0.40,
-			wantMaxConf: 0.70,
-		},
-		{
-			name: "low lift → insufficient",
-			evidence: Evidence{
-				BeforeFailures: 4,
-				AfterFailures:  35,
-				Lift:           3.5,
-				TimeDeltaMin:   25.0,
-				WindowMinutes:  30.0,
-			},
-			wantTier:    TierInsufficient,
-			wantMinConf: 0.0,
-			wantMaxConf: 0.70,
-		},
-		{
-			name: "boundary at 0.85 exactly",
-			evidence: Evidence{
-				BeforeFailures: 1,
-				AfterFailures:  80,
-				Lift:           20.0,
-				TimeDeltaMin:   3.0,
-				WindowMinutes:  30.0,
-			},
-			wantTier:    TierProvisional,
-			wantMinConf: 0.70,
-			wantMaxConf: 0.80,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			conf, tier := Score(tt.evidence)
-			if tier != tt.wantTier {
-				t.Errorf("tier = %q, want %q (conf=%.4f)", tier, tt.wantTier, conf)
-			}
-			if conf < tt.wantMinConf || conf > tt.wantMaxConf {
-				t.Errorf("confidence = %.4f, want [%.2f, %.2f]", conf, tt.wantMinConf, tt.wantMaxConf)
-			}
-		})
-	}
-}
-
-func TestScoreTierBoundaries(t *testing.T) {
-	_, tierHigh := Score(Evidence{
-		BeforeFailures: 0, AfterFailures: 200, Lift: 100,
-		TimeDeltaMin: 1, WindowMinutes: 30,
-	})
-	if tierHigh != TierSupported {
-		t.Errorf("extreme signal should be supported, got %q", tierHigh)
-	}
-
-	_, tierZero := Score(Evidence{
-		BeforeFailures: 5, AfterFailures: 30, Lift: 3.0,
-		TimeDeltaMin: 29, WindowMinutes: 30,
-	})
-	if tierZero == TierSupported {
-		t.Errorf("borderline signal should not be supported, got %q", tierZero)
-	}
-}
diff --git a/internal/graph/causal/store.go b/internal/graph/causal/store.go
deleted file mode 100644
index bca740a..0000000
--- a/internal/graph/causal/store.go
+++ /dev/null
@@ -1,14 +0,0 @@
-package causal
-
-import "context"
-
-// ClaimQuery filters for ActiveClaims queries.
-type ClaimQuery struct {
-	ClaimType ClaimType
-}
-
-// ClaimStore persists and queries causal claims.
-type ClaimStore interface {
-	SaveClaims(ctx context.Context, claims []Claim) error
-	ActiveClaims(ctx context.Context, q ClaimQuery) ([]Claim, error)
-}
diff --git a/internal/graph/core/edge.go b/internal/graph/core/edge.go
deleted file mode 100644
index abe9b87..0000000
--- a/internal/graph/core/edge.go
+++ /dev/null
@@ -1,21 +0,0 @@
-package core
-
-type EdgeType string
-
-const (
-	EdgeRequestBy  EdgeType = "request_by"
-	EdgeHandledBy  EdgeType = "handled_by"
-	EdgeUsedFlag   EdgeType = "used_flag"
-	EdgeFailedWith EdgeType = "failed_with"
-	EdgeCalls      EdgeType = "calls"
-
-	EdgeRequestHasSpan EdgeType = "has_span"
-	EdgeSpanChildOf    EdgeType = "span_child_of"
-	EdgeSpanOnService  EdgeType = "span_on"
-)
-
-type Edge struct {
-	From string
-	To   string
-	Type EdgeType
-}
diff --git a/internal/graph/core/graph.go b/internal/graph/core/graph.go
deleted file mode 100644
index 5eaaad8..0000000
--- a/internal/graph/core/graph.go
+++ /dev/null
@@ -1,44 +0,0 @@
-package core
-
-type Graph struct {
-	Nodes    map[string]Node
-	Edges    []Edge
-	OutEdges map[string][]Edge `json:"-"`
-	InEdges  map[string][]Edge `json:"-"`
-}
-
-func New() *Graph {
-	return &Graph{
-		Nodes:    make(map[string]Node),
-		Edges:    make([]Edge, 0),
-		OutEdges: make(map[string][]Edge),
-		InEdges:  make(map[string][]Edge),
-	}
-}
-
-func (g *Graph) AddNode(n Node) {
-	g.Nodes[n.ID] = n
-}
-
-func (g *Graph) AddEdge(e Edge) {
-	g.Edges = append(g.Edges, e)
-	if g.OutEdges == nil {
-		g.OutEdges = make(map[string][]Edge)
-	}
-	if g.InEdges == nil {
-		g.InEdges = make(map[string][]Edge)
-	}
-	g.OutEdges[e.From] = append(g.OutEdges[e.From], e)
-	g.InEdges[e.To] = append(g.InEdges[e.To], e)
-}
-
-// RebuildIndexes rebuilds OutEdges and InEdges from the Edges slice.
-// Call after deserializing or manually modifying Edges.
-func (g *Graph) RebuildIndexes() {
-	g.OutEdges = make(map[string][]Edge, len(g.Nodes))
-	g.InEdges = make(map[string][]Edge, len(g.Nodes))
-	for _, e := range g.Edges {
-		g.OutEdges[e.From] = append(g.OutEdges[e.From], e)
-		g.InEdges[e.To] = append(g.InEdges[e.To], e)
-	}
-}
diff --git a/internal/graph/core/ids.go b/internal/graph/core/ids.go
deleted file mode 100644
index d6ae4ca..0000000
--- a/internal/graph/core/ids.go
+++ /dev/null
@@ -1,14 +0,0 @@
-package core
-
-import (
-	"crypto/sha1"
-	"encoding/hex"
-)
-
-func ID(parts ...string) string {
-	h := sha1.New()
-	for _, p := range parts {
-		h.Write([]byte(p))
-	}
-	return hex.EncodeToString(h.Sum(nil))
-}
diff --git a/internal/graph/core/node.go b/internal/graph/core/node.go
deleted file mode 100644
index 4a13a7c..0000000
--- a/internal/graph/core/node.go
+++ /dev/null
@@ -1,44 +0,0 @@
-package core
-
-import (
-	"strings"
-	"time"
-)
-
-type NodeType string
-
-const (
-	NodeRequest NodeType = "request"
-	NodeUser    NodeType = "user"
-	NodeService NodeType = "service"
-	NodeFlag    NodeType = "feature_flag"
-	NodeError   NodeType = "error"
-	NodeSpan    NodeType = "span"
-)
-
-type Node struct {
-	ID   string
-	Type NodeType
-	Attr map[string]any
-
-	//for time-window commands
-	FirstSeen time.Time
-	LastSeen  time.Time
-}
-
-// ServiceFromNode extracts the canonical service name from a request node.
-// Prefers root_service (set by root span merge), falls back to event_name prefix.
-func ServiceFromNode(n Node) string {
-	if n.Attr == nil {
-		return ""
-	}
-	if svc, ok := n.Attr["root_service"].(string); ok && svc != "" {
-		return svc
-	}
-	if name, ok := n.Attr["event_name"].(string); ok {
-		if idx := strings.IndexByte(name, '.'); idx > 0 {
-			return name[:idx]
-		}
-	}
-	return ""
-}
diff --git a/internal/graph/store/counters.go b/internal/graph/store/counters.go
deleted file mode 100644
index fc76b89..0000000
--- a/internal/graph/store/counters.go
+++ /dev/null
@@ -1,73 +0,0 @@
-package store
-
-import "time"
-
-// RequestFacts is the minimal per-request "semantic envelope" needed for fast summaries.
-// It is derived from the graph at merge time, not from raw events (semantics stay locked).
-type RequestFacts struct {
-	RequestID string
-	TraceID   string
-	SeenAt    time.Time // use Request.LastSeen as the request timestamp
-
-	// Canonical owner (root span's service). Empty until root merges.
-	RootService string
-
-	// Typically one service, but keep slice for future multi-service chains
-	Services     []string // service names from handled_by edges
-	Errors       []string // error codes from failed_with edges
-	FeatureFlags []string // flag names from request attrs or used_flag edges
-
-	UserID     string
-	UserTier   string
-	UserVIP    bool
-	UserRegion string
-
-	Version   string
-	LatencyMs int64
-	Status    string
-}
-
-// HasFeatureFlag reports whether the request fact contains the named flag.
-func (f RequestFacts) HasFeatureFlag(flag string) bool {
-	for _, current := range f.FeatureFlags {
-		if current == flag {
-			return true
-		}
-	}
-	return false
-}
-
-// HasError reports whether the request fact contains the named error code.
-func (f RequestFacts) HasError(code string) bool {
-	for _, current := range f.Errors {
-		if current == code {
-			return true
-		}
-	}
-	return false
-}
-
-type ServiceStats struct {
-	Invocations int
-	Errors      int
-}
-
-// Counters are all-time rollups. Optional but nice for non-windowed commands.
-type Counters struct {
-	// errorID -> count
-	ErrorCount map[string]int
-
-	// serviceID -> errorID -> count
-	ServiceErrorCount map[string]map[string]int
-
-	// flagID -> errorID -> count
-	FlagErrorCount map[string]map[string]int
-}
-
-func NewCounters() *Counters {
-	return &Counters{
-		ErrorCount:        map[string]int{},
-		ServiceErrorCount: map[string]map[string]int{},
-		FlagErrorCount:    map[string]map[string]int{},
-	}
-}
diff --git a/internal/graph/store/invariants_test.go b/internal/graph/store/invariants_test.go
deleted file mode 100644
index 6e7368e..0000000
--- a/internal/graph/store/invariants_test.go
+++ /dev/null
@@ -1,376 +0,0 @@
-package store
-
-import (
-	"fmt"
-	"sort"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-// mergeInto runs events through the canonical ingest pipeline: builder → store
-// merge + trace-store upsert. Mirrors internal/ingest/handler.go.
-func mergeInto(s *Store, ts *tracestore.Store, b *build.Builder, events ...event.WideEvent) {
-	for _, ev := range events {
-		res := b.BuildResult(ev)
-		s.Merge(res.Graph)
-		if res.Span != nil && ts != nil {
-			ts.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), res.Span)
-		}
-	}
-}
-
-// renderTree produces a stable deterministic rendering of a span forest.
-func renderTree(nodes []*tracestore.TreeNode) string {
-	var lines []string
-	var walk func(*tracestore.TreeNode, int)
-	walk = func(n *tracestore.TreeNode, depth int) {
-		if n == nil {
-			return
-		}
-		lines = append(lines, fmt.Sprintf("%s%s:%s", strings.Repeat("  ", depth), n.Span.SpanID, n.Span.Service))
-		for _, c := range n.Children {
-			walk(c, depth+1)
-		}
-	}
-	for _, r := range nodes {
-		walk(r, 0)
-	}
-	return strings.Join(lines, "\n")
-}
-
-// Invariant 1: Out-of-order span arrival yields the same tree as in-order arrival.
-func TestInvariant_OutOfOrderArrival_SameTree(t *testing.T) {
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa1"
-	base := time.Now().UTC()
-
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("gateway"),
-		testutil.WithEventName("gateway.request"),
-		testutil.WithTimestamp(base),
-	)
-	mid := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("mmmmmmmmmmmmmmmm"),
-		testutil.WithParentSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithService("checkout"),
-		testutil.WithEventName("checkout.request"),
-		testutil.WithTimestamp(base.Add(time.Millisecond)),
-	)
-	leaf := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("llllllllllllllll"),
-		testutil.WithParentSpanID("mmmmmmmmmmmmmmmm"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "downstream failed"),
-		testutil.WithTimestamp(base.Add(2*time.Millisecond)),
-	)
-
-	inOrder := buildForest(traceID, root, mid, leaf)
-	outOfOrder := buildForest(traceID, leaf, mid, root)
-
-	want := renderTree(inOrder)
-	got := renderTree(outOfOrder)
-	if want != got {
-		t.Fatalf("tree diverges on out-of-order arrival\nin-order:\n%s\nout-of-order:\n%s", want, got)
-	}
-}
-
-func buildForest(traceID string, events ...event.WideEvent) []*tracestore.TreeNode {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	mergeInto(s, ts, b, events...)
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		return nil
-	}
-	return tracestore.BuildTree(rec.Spans)
-}
-
-// Invariant 2: Root arrives late → root_service is upgraded on the request facts.
-func TestInvariant_RootLateMerge_UpgradesRootService(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa2"
-
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithParentSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "failed"),
-	)
-	mergeInto(s, ts, b, child)
-
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("gateway"),
-		testutil.WithEventName("gateway.request"),
-		testutil.WithStatusCode(200),
-	)
-	mergeInto(s, ts, b, root)
-
-	facts, ok := s.requestFacts[core.ID("request", traceID)]
-	if !ok {
-		t.Fatalf("request facts missing")
-	}
-	if facts.RootService != "gateway" {
-		t.Fatalf("root_service = %q, want %q (late root must win)", facts.RootService, "gateway")
-	}
-}
-
-// Invariant 3: Fan-out children all appear in the tree, and the error index
-// counts one entry per failed request (not per propagated hop).
-func TestInvariant_FanOut_ChildrenAllAppear_BlastRadiusStable(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa3"
-
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("gateway"),
-	)
-	left := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("lllllllllllllll1"),
-		testutil.WithParentSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithService("checkout"),
-	)
-	right := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("rrrrrrrrrrrrrrr2"),
-		testutil.WithParentSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithService("inventory"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("INV_500", "stock fetch failed"),
-	)
-	mergeInto(s, ts, b, root, left, right)
-
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		t.Fatalf("trace record missing")
-	}
-	roots := tracestore.BuildTree(rec.Spans)
-	if len(roots) != 1 {
-		t.Fatalf("expected 1 root, got %d", len(roots))
-	}
-	if len(roots[0].Children) != 2 {
-		t.Fatalf("expected 2 fan-out children, got %d", len(roots[0].Children))
-	}
-
-	ids, _ := s.ErrorIndex("INV_500")
-	if len(ids) != 1 {
-		t.Fatalf("error-index request count = %d, want 1 (blast_radius must not inflate by hop)", len(ids))
-	}
-}
-
-// Invariant 4: Retries land on separate request nodes and carry retry.of +
-// retry.previous_attempt_id so UIs can render them as sibling attempts.
-func TestInvariant_Retries_RenderAsSiblingAttempts(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-
-	firstTrace := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4"
-	retryTrace := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaab4"
-
-	first := testutil.MakeEvent(
-		testutil.WithTraceID(firstTrace),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "upstream reset"),
-	)
-	retry := testutil.MakeEvent(
-		testutil.WithTraceID(retryTrace),
-		testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(200),
-		testutil.WithRetry(1, firstTrace),
-	)
-	mergeInto(s, ts, b, first, retry)
-
-	snap := s.Snapshot()
-	retryReq, ok := snap.Nodes[core.ID("request", retryTrace)]
-	if !ok {
-		t.Fatalf("retry request node missing")
-	}
-	if got := retryReq.Attr["retry_of"]; got != 1 {
-		t.Fatalf("retry_of = %v, want 1", got)
-	}
-	if got, _ := retryReq.Attr["retry_previous_attempt_id"].(string); got != firstTrace {
-		t.Fatalf("retry_previous_attempt_id = %q, want %q", got, firstTrace)
-	}
-	if _, ok := snap.Nodes[core.ID("request", firstTrace)]; !ok {
-		t.Fatalf("original request node missing — retries must not replace the original")
-	}
-}
-
-// Invariant 5: Duplicate-span ingestion does not double-render the span.
-func TestInvariant_DuplicateSpan_NoDoubleRender(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa5"
-
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("gateway"),
-	)
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithParentSpanID("rrrrrrrrrrrrrrrr"),
-		testutil.WithService("checkout"),
-	)
-	mergeInto(s, ts, b, root, child, child)
-
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		t.Fatalf("trace record missing")
-	}
-	if len(rec.Spans) != 2 {
-		t.Fatalf("duplicate span was not deduped: got %d, want 2", len(rec.Spans))
-	}
-	roots := tracestore.BuildTree(rec.Spans)
-	if len(roots) != 1 || len(roots[0].Children) != 1 {
-		t.Fatalf("duplicate produced extra tree entry: %s", renderTree(roots))
-	}
-}
-
-// Invariant 6: parent_request_id is stored on the child request node only and
-// does NOT attach the child span as a span_child_of the parent trace.
-func TestInvariant_ParentRequestID_RendersAsSecondaryLink(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	parentTrace := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa6"
-	childTrace := "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
-
-	parent := testutil.MakeEvent(
-		testutil.WithTraceID(parentTrace),
-		testutil.WithSpanID("pppppppppppppppp"),
-		testutil.WithService("gateway"),
-	)
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(childTrace),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithService("worker"),
-		testutil.WithParentRequestID(core.ID("request", parentTrace)),
-	)
-	mergeInto(s, ts, b, parent, child)
-
-	snap := s.Snapshot()
-	childReq := snap.Nodes[core.ID("request", childTrace)]
-	got, _ := childReq.Attr["parent_request_id"].(string)
-	if want := core.ID("request", parentTrace); got != want {
-		t.Fatalf("parent_request_id = %q, want %q", got, want)
-	}
-
-	for _, e := range snap.Edges {
-		if e.Type == core.EdgeSpanChildOf && e.From == "cccccccccccccccc" {
-			t.Fatalf("child span was linked as span_child_of parent trace — must remain a secondary reference only")
-		}
-	}
-}
-
-// Invariant 7: A child whose parent span never arrived must not crash tree
-// rendering; orphans are promoted to roots.
-func TestInvariant_MissingParent_TreeStillRenders(t *testing.T) {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa7"
-
-	orphan := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("oooooooooooooooo"),
-		testutil.WithParentSpanID("nonexistentparent"),
-		testutil.WithService("payment"),
-	)
-	mergeInto(s, ts, b, orphan)
-
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		t.Fatalf("trace record missing")
-	}
-	roots := tracestore.BuildTree(rec.Spans)
-	if len(roots) != 1 || roots[0].Span.SpanID != "oooooooooooooooo" {
-		t.Fatalf("orphan was not promoted to root: %s", renderTree(roots))
-	}
-}
-
-// Invariant 8: Events with differing event_name shapes (native vs OTLP-style)
-// produce the same explain-style error-code set for the trace.
-func TestInvariant_MixedEventNames_StableExplainOutput(t *testing.T) {
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa8"
-
-	native := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("nnnnnnnnnnnnnnnn"),
-		testutil.WithService("payment"),
-		testutil.WithEventName("payment.request"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "downstream failed"),
-	)
-	otlpShaped := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("nnnnnnnnnnnnnnnn"),
-		testutil.WithService("payment"),
-		testutil.WithEventName("POST /charge"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "downstream failed"),
-	)
-
-	a := explainCodes(traceID, native)
-	b := explainCodes(traceID, otlpShaped)
-	if !equalStrings(a, b) {
-		t.Fatalf("explain diverged across event_name shapes: native=%v otlp=%v", a, b)
-	}
-}
-
-func explainCodes(traceID string, ev event.WideEvent) []string {
-	s := NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-	mergeInto(s, ts, b, ev)
-	facts, ok := s.requestFacts[core.ID("request", traceID)]
-	if !ok {
-		return nil
-	}
-	codes := append([]string(nil), facts.Errors...)
-	sort.Strings(codes)
-	return codes
-}
-
-func equalStrings(a, b []string) bool {
-	if len(a) != len(b) {
-		return false
-	}
-	for i := range a {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-}
diff --git a/internal/graph/store/requestfacts_test.go b/internal/graph/store/requestfacts_test.go
deleted file mode 100644
index 26563c9..0000000
--- a/internal/graph/store/requestfacts_test.go
+++ /dev/null
@@ -1,32 +0,0 @@
-package store
-
-import "testing"
-
-func TestRequestFacts_HasFeatureFlag(t *testing.T) {
-	f := RequestFacts{
-		UserTier:     "premium",
-		FeatureFlags: []string{"flag-a", "flag-b"},
-	}
-
-	if !f.HasFeatureFlag("flag-a") {
-		t.Fatal("HasFeatureFlag should match flag-a")
-	}
-	if !f.HasFeatureFlag("flag-b") {
-		t.Fatal("HasFeatureFlag should match flag-b")
-	}
-	if f.HasFeatureFlag("missing") {
-		t.Fatal("HasFeatureFlag should return false for unknown flags")
-	}
-}
-
-func TestRequestFacts_HasError(t *testing.T) {
-	f := RequestFacts{
-		Errors: []string{"ERR_500", "ERR_404"},
-	}
-	if !f.HasError("ERR_500") {
-		t.Fatal("HasError should match ERR_500")
-	}
-	if f.HasError("ERR_999") {
-		t.Fatal("HasError should return false for unknown error")
-	}
-}
diff --git a/internal/graph/store/store.go b/internal/graph/store/store.go
deleted file mode 100644
index e99f8b2..0000000
--- a/internal/graph/store/store.go
+++ /dev/null
@@ -1,881 +0,0 @@
-package store
-
-import (
-	"sort"
-	"sync"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/window"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-type Store struct {
-	mu    sync.RWMutex
-	graph *core.Graph
-	//for fast lookups
-	requestFacts    map[string]RequestFacts
-	seenRequests    map[string]struct{}
-	counters        *Counters
-	serviceStats    map[string]*ServiceStats
-	edgeSet         map[string]struct{}            // "from:to:type" for dedup
-	traceToRequest  map[string]string              // trace_id -> request node ID
-	traceToSpans    map[string][]string            // trace_id -> []span node IDs
-	errorIndex      map[string]map[string]struct{} // error_code -> set of request IDs
-	errorIndexReady bool
-}
-
-func NewStore() *Store {
-	return &Store{
-		graph:           core.New(),
-		requestFacts:    map[string]RequestFacts{},
-		seenRequests:    map[string]struct{}{},
-		counters:        NewCounters(),
-		serviceStats:    map[string]*ServiceStats{},
-		edgeSet:         map[string]struct{}{},
-		traceToRequest:  map[string]string{},
-		traceToSpans:    map[string][]string{},
-		errorIndex:      map[string]map[string]struct{}{},
-		errorIndexReady: true,
-	}
-}
-
-// ensureGraphLocked guarantees s.graph is non-nil.
-// Call ONLY while holding s.mu.
-func (s *Store) ensureGraphLocked() {
-	if s.graph == nil {
-		s.graph = core.New()
-	}
-}
-
-// Merge merges another graph into this store.
-// Node IDs are deterministic, so duplicates are avoided.
-// Edges are append-only.
-func (s *Store) Merge(g *core.Graph) {
-	if g == nil {
-		return
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.ensureGraphLocked()
-
-	// Merge nodes
-	for id, incoming := range g.Nodes {
-		existing, exists := s.graph.Nodes[id]
-		if !exists {
-			s.graph.Nodes[id] = incoming
-			continue
-		}
-
-		// Merge time ranges (imp!)
-		mergeNodeTime(&existing, &incoming)
-
-		// Deterministic merge for request and legacy span nodes.
-		if existing.Type == core.NodeRequest {
-			mergeRequestAttrs(&existing, &incoming)
-		}
-		if existing.Type == core.NodeSpan {
-			mergeSpanAttrs(&existing, &incoming)
-		}
-		// Backfill error node service attribute (added after initial release).
-		if existing.Type == core.NodeError {
-			if existing.Attr != nil && incoming.Attr != nil {
-				if _, ok := existing.Attr["service"]; !ok {
-					if svc, ok := incoming.Attr["service"].(string); ok && svc != "" {
-						existing.Attr["service"] = svc
-					}
-				}
-			}
-		}
-		s.graph.Nodes[id] = existing
-	}
-	// Merge edges (deduplicated); collect FailedWith edges for deferred error index update
-	var failedEdges []core.Edge
-	for _, e := range g.Edges {
-		key := e.From + ":" + e.To + ":" + string(e.Type)
-		if _, exists := s.edgeSet[key]; exists {
-			continue
-		}
-		s.edgeSet[key] = struct{}{}
-		s.graph.Edges = append(s.graph.Edges, e)
-		if s.graph.OutEdges == nil {
-			s.graph.OutEdges = make(map[string][]core.Edge)
-		}
-		if s.graph.InEdges == nil {
-			s.graph.InEdges = make(map[string][]core.Edge)
-		}
-		s.graph.OutEdges[e.From] = append(s.graph.OutEdges[e.From], e)
-		s.graph.InEdges[e.To] = append(s.graph.InEdges[e.To], e)
-
-		if e.Type == core.EdgeFailedWith {
-			failedEdges = append(failedEdges, e)
-		}
-	}
-
-	// Update trace indexes
-	for id, n := range g.Nodes {
-		traceID, _ := n.Attr["trace_id"].(string)
-		if traceID == "" {
-			continue
-		}
-		switch n.Type {
-		case core.NodeRequest:
-			s.traceToRequest[traceID] = id
-		case core.NodeSpan:
-			s.traceToSpans[traceID] = appendUniqueString(s.traceToSpans[traceID], id)
-		}
-	}
-
-	for id, n := range g.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-
-		// Always extract from the merged graph (not the delta)
-		facts, ok := extractRequestFactsFromGraph(s.graph, id)
-		if !ok {
-			continue
-		}
-
-		if _, seen := s.seenRequests[id]; seen {
-			oldFacts := s.requestFacts[id]
-			if !factsEqual(oldFacts, facts) {
-				s.reverseFactsFromCountersLocked(oldFacts)
-				s.applyFactsToCountersLocked(facts)
-			}
-			s.requestFacts[id] = facts
-			continue
-		}
-
-		s.seenRequests[id] = struct{}{}
-		s.requestFacts[id] = facts
-		s.applyFactsToCountersLocked(facts)
-	}
-
-	// Process error index with fully-populated traceToRequest
-	for _, e := range failedEdges {
-		s.addToErrorIndexLocked(e)
-	}
-}
-
-func (s *Store) applyFactsToCountersLocked(f RequestFacts) {
-	// error counts
-	for _, errID := range f.Errors {
-		s.counters.ErrorCount[errID]++
-		// service -> error
-		for _, svcID := range f.Services {
-			m := s.counters.ServiceErrorCount[svcID]
-			if m == nil {
-				m = map[string]int{}
-				s.counters.ServiceErrorCount[svcID] = m
-			}
-			m[errID]++
-		}
-		// flag -> error
-		for _, flagID := range f.FeatureFlags {
-			m := s.counters.FlagErrorCount[flagID]
-			if m == nil {
-				m = map[string]int{}
-				s.counters.FlagErrorCount[flagID] = m
-			}
-			m[errID]++
-		}
-	}
-
-	services := f.Services
-	if len(services) == 0 && f.RootService != "" {
-		services = []string{f.RootService}
-	}
-	for _, svcID := range services {
-		stats := s.serviceStats[svcID]
-		if stats == nil {
-			stats = &ServiceStats{}
-			s.serviceStats[svcID] = stats
-		}
-		stats.Invocations++
-		if len(f.Errors) > 0 {
-			stats.Errors++
-		}
-	}
-}
-
-func (s *Store) reverseFactsFromCountersLocked(f RequestFacts) {
-	for _, errID := range f.Errors {
-		s.counters.ErrorCount[errID]--
-		if s.counters.ErrorCount[errID] <= 0 {
-			delete(s.counters.ErrorCount, errID)
-		}
-		for _, svcID := range f.Services {
-			m := s.counters.ServiceErrorCount[svcID]
-			if m != nil {
-				m[errID]--
-				if m[errID] <= 0 {
-					delete(m, errID)
-				}
-				if len(m) == 0 {
-					delete(s.counters.ServiceErrorCount, svcID)
-				}
-			}
-		}
-		for _, flagID := range f.FeatureFlags {
-			m := s.counters.FlagErrorCount[flagID]
-			if m != nil {
-				m[errID]--
-				if m[errID] <= 0 {
-					delete(m, errID)
-				}
-				if len(m) == 0 {
-					delete(s.counters.FlagErrorCount, flagID)
-				}
-			}
-		}
-	}
-
-	services := f.Services
-	if len(services) == 0 && f.RootService != "" {
-		services = []string{f.RootService}
-	}
-	for _, svcID := range services {
-		stats := s.serviceStats[svcID]
-		if stats == nil {
-			continue
-		}
-		stats.Invocations--
-		if len(f.Errors) > 0 {
-			stats.Errors--
-		}
-		if stats.Invocations <= 0 && stats.Errors <= 0 {
-			delete(s.serviceStats, svcID)
-		}
-	}
-}
-
-func factsEqual(a, b RequestFacts) bool {
-	return sortedEqual(a.Services, b.Services) &&
-		sortedEqual(a.Errors, b.Errors) &&
-		sortedEqual(a.FeatureFlags, b.FeatureFlags)
-}
-
-func sortedEqual(a, b []string) bool {
-	if len(a) != len(b) {
-		return false
-	}
-	ac := make([]string, len(a))
-	copy(ac, a)
-	bc := make([]string, len(b))
-	copy(bc, b)
-	sort.Strings(ac)
-	sort.Strings(bc)
-	for i := range ac {
-		if ac[i] != bc[i] {
-			return false
-		}
-	}
-	return true
-}
-
-//helper for time-window commands
-// internal/graph/store.go
-
-func mergeNodeTime(dst, src *core.Node) {
-	if !src.FirstSeen.IsZero() &&
-		(dst.FirstSeen.IsZero() || src.FirstSeen.Before(dst.FirstSeen)) {
-		dst.FirstSeen = src.FirstSeen
-	}
-
-	if !src.LastSeen.IsZero() &&
-		(dst.LastSeen.IsZero() || src.LastSeen.After(dst.LastSeen)) {
-		dst.LastSeen = src.LastSeen
-	}
-}
-
-// mergeRequestAttrs applies deterministic merge rules for request nodes.
-// - success: AND (any failure makes the request failed)
-// - If incoming is from root span (is_root=true): overwrite status_code, latency_ms, event_name, flow, root_service
-// - error_codes: accumulated as deduplicated []string
-func mergeRequestAttrs(dst, src *core.Node) {
-	if dst.Attr == nil {
-		dst.Attr = map[string]any{}
-	}
-	if src.Attr == nil {
-		return
-	}
-
-	// success = AND: any false makes it false
-	if srcSuccess, ok := src.Attr["success"].(bool); ok && !srcSuccess {
-		dst.Attr["success"] = false
-	}
-
-	mergeRequestScalar(dst.Attr, src.Attr, "version")
-	mergeRequestScalar(dst.Attr, src.Attr, "user_id")
-	mergeRequestScalar(dst.Attr, src.Attr, "user_tier")
-	mergeRequestScalar(dst.Attr, src.Attr, "user_region")
-	if _, ok := dst.Attr["user_vip"]; !ok {
-		if v, ok := src.Attr["user_vip"]; ok {
-			dst.Attr["user_vip"] = v
-		}
-	}
-	mergeRequestStringSlice(dst.Attr, src.Attr, "feature_flags")
-
-	// If incoming event is from root span, its values become the trace-level summary
-	if isRoot, ok := src.Attr["is_root"].(bool); ok && isRoot {
-		if v, ok := src.Attr["status_code"]; ok {
-			dst.Attr["status_code"] = v
-		}
-		if v, ok := src.Attr["latency_ms"]; ok {
-			dst.Attr["latency_ms"] = v
-		}
-		if v, ok := src.Attr["event_name"]; ok {
-			dst.Attr["event_name"] = v
-		}
-		if v, ok := src.Attr["flow"]; ok {
-			dst.Attr["flow"] = v
-		}
-		if v, ok := src.Attr["service"]; ok {
-			dst.Attr["root_service"] = v
-		}
-		if v, ok := src.Attr["http_method"].(string); ok && v != "" {
-			dst.Attr["http_method"] = v
-		}
-		if v, ok := src.Attr["route_template"].(string); ok && v != "" {
-			dst.Attr["route_template"] = v
-		}
-		dst.Attr["is_root"] = true
-	}
-
-	// Accumulate error_codes as deduplicated []string
-	mergeErrorCodes(dst, src)
-}
-
-func mergeRequestScalar(dst, src map[string]any, key string) {
-	if _, ok := dst[key]; ok {
-		return
-	}
-	if v, ok := src[key]; ok {
-		dst[key] = v
-	}
-}
-
-func mergeRequestStringSlice(dst, src map[string]any, key string) {
-	values := append(AttrToStringSlice(dst[key]), AttrToStringSlice(src[key])...)
-	if len(values) == 0 {
-		return
-	}
-	dst[key] = dedupeStringSlice(values)
-}
-
-func dedupeStringSlice(in []string) []string {
-	seen := map[string]struct{}{}
-	out := make([]string, 0, len(in))
-	for _, v := range in {
-		if v == "" {
-			continue
-		}
-		if _, ok := seen[v]; ok {
-			continue
-		}
-		seen[v] = struct{}{}
-		out = append(out, v)
-	}
-	return out
-}
-
-func mergeErrorCodes(dst, src *core.Node) {
-	var codes []string
-	seen := map[string]struct{}{}
-	appendCode := func(code string) {
-		if code == "" {
-			return
-		}
-		if _, exists := seen[code]; exists {
-			return
-		}
-		codes = append(codes, code)
-		seen[code] = struct{}{}
-	}
-
-	// Include prior merged state first, then single-code attrs, then incoming values.
-	for _, c := range AttrToStringSlice(dst.Attr["error_codes"]) {
-		appendCode(c)
-	}
-	if dstErr, ok := dst.Attr["error_code"].(string); ok {
-		appendCode(dstErr)
-	}
-	for _, c := range AttrToStringSlice(src.Attr["error_codes"]) {
-		appendCode(c)
-	}
-	if srcErr, ok := src.Attr["error_code"].(string); ok {
-		appendCode(srcErr)
-	}
-
-	if len(codes) > 0 {
-		dst.Attr["error_codes"] = codes
-	}
-}
-
-// AttrToStringSlice extracts a []string from an attribute value that may be
-// typed as []string or []any (the latter occurs after JSON round-trip).
-func AttrToStringSlice(v any) []string {
-	switch values := v.(type) {
-	case []string:
-		return values
-	case []any:
-		out := make([]string, 0, len(values))
-		for _, item := range values {
-			if s, ok := item.(string); ok && s != "" {
-				out = append(out, s)
-			}
-		}
-		return out
-	default:
-		return nil
-	}
-}
-
-// mergeSpanAttrs enriches a stub span node with data from the real event.
-// Stubs are created when a child event arrives before its parent's own event.
-// When the parent's event arrives, its enriched fields fill in the gaps.
-func mergeSpanAttrs(dst, src *core.Node) {
-	if dst.Attr == nil {
-		dst.Attr = map[string]any{}
-	}
-	if src.Attr == nil {
-		return
-	}
-
-	// Fill in any attrs that dst is missing from src
-	enrichKeys := []string{
-		"event_name", "status_code", "success", "latency_ms",
-		"flow", "timestamp", "caller_service", "downstream_service",
-		"service", "error_code", "http_method", "route_template",
-	}
-	for _, key := range enrichKeys {
-		if _, hasDst := dst.Attr[key]; !hasDst {
-			if v, hasSrc := src.Attr[key]; hasSrc {
-				dst.Attr[key] = v
-			}
-		}
-	}
-}
-
-// Graph returns the live graph pointer.
-// IMPORTANT: callers MUST treat this as read-only.
-func (s *Store) Graph() *core.Graph {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	s.ensureGraphLocked()
-
-	return s.graph
-}
-
-// Snapshot returns a deep copy of the graph.
-// This is safe for persistence, debugging, and CLI reads.
-func (s *Store) Snapshot() *core.Graph {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	// imp! graph must never be nil
-	s.ensureGraphLocked()
-
-	// Deep copy nodes
-	nodes := make(map[string]core.Node, len(s.graph.Nodes))
-	for id, n := range s.graph.Nodes {
-		var attr map[string]any
-		if n.Attr != nil {
-			attr = make(map[string]any, len(n.Attr))
-			for k, v := range n.Attr {
-				attr[k] = v
-			}
-		}
-
-		nodes[id] = core.Node{
-			ID:        n.ID,
-			Type:      n.Type,
-			Attr:      attr,
-			FirstSeen: n.FirstSeen,
-			LastSeen:  n.LastSeen,
-		}
-	}
-
-	// Copy edges
-	edges := make([]core.Edge, len(s.graph.Edges))
-	copy(edges, s.graph.Edges)
-
-	snap := &core.Graph{
-		Nodes: nodes,
-		Edges: edges,
-	}
-	snap.RebuildIndexes()
-	return snap
-}
-
-// RequestIDForTrace returns the request node ID for a given trace ID.
-func (s *Store) RequestIDForTrace(traceID string) (string, bool) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	id, ok := s.traceToRequest[traceID]
-	return id, ok
-}
-
-// SpanIDsForTrace returns all span node IDs for a given trace ID.
-func (s *Store) SpanIDsForTrace(traceID string) []string {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	spanIDs := s.traceToSpans[traceID]
-	return append([]string(nil), spanIDs...)
-}
-
-// Restore replaces the current graph with a defensive copy of g.
-// This avoids memory aliasing with snapshot data.
-func (s *Store) Restore(g *core.Graph) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	if g == nil {
-		s.graph = core.New()
-		s.rebuildDerivedIndexesLocked()
-		return
-	}
-
-	// Deep-copy nodes
-	nodes := make(map[string]core.Node, len(g.Nodes))
-	for id, n := range g.Nodes {
-		var attrCopy map[string]any
-		if n.Attr != nil {
-			attrCopy = make(map[string]any, len(n.Attr))
-			for k, v := range n.Attr {
-				attrCopy[k] = v
-			}
-		}
-
-		nodes[id] = core.Node{
-			ID:        n.ID,
-			Type:      n.Type,
-			Attr:      attrCopy,
-			FirstSeen: n.FirstSeen,
-			LastSeen:  n.LastSeen,
-		}
-
-	}
-
-	// Copy edges
-	edges := make([]core.Edge, len(g.Edges))
-	copy(edges, g.Edges)
-
-	s.graph = &core.Graph{
-		Nodes: nodes,
-		Edges: edges,
-	}
-	s.backfillRequestTimestampsLocked()
-	s.rebuildDerivedIndexesLocked()
-}
-
-// PruneOlderThan drops requests with LastSeen before cutoff.
-// This keeps 1-hop context for remaining requests.
-func (s *Store) PruneOlderThan(cutoff time.Time) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	s.ensureGraphLocked()
-	s.graph = window.FilterByWindow(s.graph, cutoff, time.Now())
-	s.rebuildDerivedIndexesLocked()
-}
-
-func (s *Store) rebuildDerivedIndexesLocked() {
-	s.requestFacts = map[string]RequestFacts{}
-	s.seenRequests = map[string]struct{}{}
-	s.counters = NewCounters()
-	s.serviceStats = map[string]*ServiceStats{}
-	s.edgeSet = map[string]struct{}{}
-	s.traceToRequest = map[string]string{}
-	s.traceToSpans = map[string][]string{}
-	s.errorIndex = map[string]map[string]struct{}{}
-	s.errorIndexReady = false
-
-	// Rebuild edge set; collect FailedWith for deferred error index update
-	var failedEdges []core.Edge
-	for _, e := range s.graph.Edges {
-		key := e.From + ":" + e.To + ":" + string(e.Type)
-		s.edgeSet[key] = struct{}{}
-		if e.Type == core.EdgeFailedWith {
-			failedEdges = append(failedEdges, e)
-		}
-	}
-	s.graph.RebuildIndexes()
-
-	// Rebuild trace indexes (must precede error index for span→request lookup)
-	for id, n := range s.graph.Nodes {
-		traceID, _ := n.Attr["trace_id"].(string)
-		if traceID == "" {
-			continue
-		}
-		switch n.Type {
-		case core.NodeRequest:
-			s.traceToRequest[traceID] = id
-		case core.NodeSpan:
-			s.traceToSpans[traceID] = appendUniqueString(s.traceToSpans[traceID], id)
-		}
-	}
-
-	// Now build error index with fully-populated traceToRequest
-	for _, e := range failedEdges {
-		s.addToErrorIndexLocked(e)
-	}
-	s.errorIndexReady = true
-
-	for id, n := range s.graph.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		facts, ok := extractRequestFactsFromGraph(s.graph, id)
-		if !ok {
-			continue
-		}
-		s.seenRequests[id] = struct{}{}
-		s.requestFacts[id] = facts
-		s.applyFactsToCountersLocked(facts)
-	}
-}
-
-func (s *Store) backfillRequestTimestampsLocked() {
-	for id, n := range s.graph.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		if !n.LastSeen.IsZero() {
-			continue
-		}
-		ts := parseTimestampAttr(n.Attr)
-		if ts.IsZero() {
-			continue
-		}
-		n.FirstSeen = ts
-		n.LastSeen = ts
-		s.graph.Nodes[id] = n
-	}
-}
-
-func parseTimestampAttr(attr map[string]any) time.Time {
-	if attr == nil {
-		return time.Time{}
-	}
-	if v, ok := attr["timestamp"]; ok {
-		switch t := v.(type) {
-		case time.Time:
-			return t
-		case string:
-			if ts, err := time.Parse(time.RFC3339Nano, t); err == nil {
-				return ts
-			}
-			if ts, err := time.Parse(time.RFC3339, t); err == nil {
-				return ts
-			}
-		case float64:
-			return time.Unix(int64(t), 0).UTC()
-		case int64:
-			return time.Unix(t, 0).UTC()
-		case int:
-			return time.Unix(int64(t), 0).UTC()
-		}
-	}
-	return time.Time{}
-}
-
-func appendUniqueString(values []string, candidate string) []string {
-	for _, existing := range values {
-		if existing == candidate {
-			return values
-		}
-	}
-	return append(values, candidate)
-}
-
-// addToErrorIndexLocked adds a FailedWith edge to the error index.
-// Must be called with s.mu held.
-func (s *Store) addToErrorIndexLocked(e core.Edge) {
-	errNode, ok := s.graph.Nodes[e.To]
-	if !ok {
-		return
-	}
-	code, _ := errNode.Attr["code"].(string)
-	if code == "" {
-		return
-	}
-
-	// Determine request ID: edge.From is either a request or span node
-	reqID := ""
-	fromNode, ok := s.graph.Nodes[e.From]
-	if !ok {
-		return
-	}
-	switch fromNode.Type {
-	case core.NodeRequest:
-		reqID = e.From
-	case core.NodeSpan:
-		// Look up via traceToRequest
-		traceID, _ := fromNode.Attr["trace_id"].(string)
-		if traceID != "" {
-			reqID = s.traceToRequest[traceID]
-		}
-	}
-	if reqID == "" {
-		return
-	}
-
-	set := s.errorIndex[code]
-	if set == nil {
-		set = map[string]struct{}{}
-		s.errorIndex[code] = set
-	}
-	set[reqID] = struct{}{}
-}
-
-// ErrorIndex returns request IDs affected by a given error code.
-// The bool indicates whether the index is ready (valid).
-func (s *Store) ErrorIndex(errorCode string) ([]string, bool) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	if !s.errorIndexReady {
-		return nil, false
-	}
-	set := s.errorIndex[errorCode]
-	out := make([]string, 0, len(set))
-	for id := range set {
-		out = append(out, id)
-	}
-	return out, true
-}
-
-func extractRequestFactsFromGraph(g *core.Graph, reqID string) (RequestFacts, bool) {
-	reqNode, ok := g.Nodes[reqID]
-	if !ok || reqNode.Type != core.NodeRequest {
-		return RequestFacts{}, false
-	}
-
-	f := RequestFacts{
-		RequestID: reqID,
-		SeenAt:    reqNode.LastSeen,
-	}
-
-	// Extract attrs from request node when available. This supports the
-	// flattened graph shape while remaining compatible with legacy snapshots.
-	if reqNode.Attr != nil {
-		if rs, ok := reqNode.Attr["root_service"].(string); ok {
-			f.RootService = rs
-		}
-		switch v := reqNode.Attr["latency_ms"].(type) {
-		case int64:
-			f.LatencyMs = v
-		case int:
-			f.LatencyMs = int64(v)
-		case float64:
-			f.LatencyMs = int64(v)
-		}
-		if st, ok := reqNode.Attr["status"].(string); ok {
-			f.Status = st
-		} else if success, ok := reqNode.Attr["success"].(bool); ok {
-			if success {
-				f.Status = "success"
-			} else {
-				f.Status = "failure"
-			}
-		}
-		if version, ok := reqNode.Attr["version"].(string); ok {
-			f.Version = version
-		}
-		if uid, ok := reqNode.Attr["user_id"].(string); ok {
-			f.UserID = uid
-		}
-		if tier, ok := reqNode.Attr["user_tier"].(string); ok {
-			f.UserTier = tier
-		}
-		if vip, ok := reqNode.Attr["user_vip"].(bool); ok {
-			f.UserVIP = vip
-		}
-		if region, ok := reqNode.Attr["user_region"].(string); ok {
-			f.UserRegion = region
-		}
-		if flags := AttrToStringSlice(reqNode.Attr["feature_flags"]); len(flags) > 0 {
-			f.FeatureFlags = append([]string(nil), flags...)
-		}
-		if len(f.FeatureFlags) == 0 {
-			if legacyFlags := AttrToStringSlice(reqNode.Attr["flags"]); len(legacyFlags) > 0 {
-				f.FeatureFlags = append([]string(nil), legacyFlags...)
-			}
-		}
-	}
-
-	// Gather neighbors via adjacency indexes.
-	// Store human-readable names/codes (not node IDs) so queries like
-	// error_code=PMT_502 or service=checkout work directly.
-	for _, e := range g.OutEdges[reqID] {
-		if n, ok := g.Nodes[e.To]; ok {
-			switch n.Type {
-			case core.NodeService:
-				if name, _ := n.Attr["name"].(string); name != "" {
-					f.Services = append(f.Services, name)
-				} else {
-					f.Services = append(f.Services, n.ID)
-				}
-			case core.NodeError:
-				if code, _ := n.Attr["code"].(string); code != "" {
-					f.Errors = append(f.Errors, code)
-				} else {
-					f.Errors = append(f.Errors, n.ID)
-				}
-			case core.NodeFlag:
-				if name, _ := n.Attr["name"].(string); name != "" {
-					f.FeatureFlags = append(f.FeatureFlags, name)
-				} else {
-					f.FeatureFlags = append(f.FeatureFlags, n.ID)
-				}
-			}
-		}
-	}
-	for _, e := range g.InEdges[reqID] {
-		if n, ok := g.Nodes[e.From]; ok {
-			switch n.Type {
-			case core.NodeService:
-				if name, _ := n.Attr["name"].(string); name != "" {
-					f.Services = append(f.Services, name)
-				} else {
-					f.Services = append(f.Services, n.ID)
-				}
-			case core.NodeError:
-				if code, _ := n.Attr["code"].(string); code != "" {
-					f.Errors = append(f.Errors, code)
-				} else {
-					f.Errors = append(f.Errors, n.ID)
-				}
-			case core.NodeFlag:
-				if name, _ := n.Attr["name"].(string); name != "" {
-					f.FeatureFlags = append(f.FeatureFlags, name)
-				} else {
-					f.FeatureFlags = append(f.FeatureFlags, n.ID)
-				}
-			}
-		}
-	}
-
-	return f, true
-}
-
-// TraceStore returns nil. The graph store does not own a trace store; callers
-// that need one should inject it via a wrapper (e.g. frozenStore in the ingest
-// handler). This method exists so *Store satisfies the tools.Store interface.
-func (s *Store) TraceStore() *tracestore.Store { return nil }
-
-func (s *Store) ServiceStats() map[string]ServiceStats {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	out := make(map[string]ServiceStats, len(s.serviceStats))
-	for name, stats := range s.serviceStats {
-		if stats == nil {
-			continue
-		}
-		out[name] = *stats
-	}
-	return out
-}
diff --git a/internal/graph/store/store_test.go b/internal/graph/store/store_test.go
deleted file mode 100644
index 29efb7e..0000000
--- a/internal/graph/store/store_test.go
+++ /dev/null
@@ -1,718 +0,0 @@
-package store
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-func newStoreWithTraceStore() (*Store, *tracestore.Store) {
-	return NewStore(), tracestore.NewStore()
-}
-
-func TestStore_Merge_RequestDeterministicMerge(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa1"
-
-	// Child event arrives first (non-root)
-	childEv := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithParentSpanID("pppppppppppppppp"),
-		testutil.WithService("payment"),
-		testutil.WithFlow("payment"),
-		testutil.WithStatusCode(502),
-		testutil.WithLatency(12),
-		testutil.WithError("PMT_502", "payment failed"),
-		testutil.WithEventName("payment.error"),
-	)
-	s.Merge(b.Build(childEv))
-
-	// Root event arrives second
-	rootEv := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("pppppppppppppppp"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("api-gateway"),
-		testutil.WithFlow("purchase"),
-		testutil.WithStatusCode(200),
-		testutil.WithLatency(45),
-		testutil.WithEventName("api-gateway.request"),
-	)
-	s.Merge(b.Build(rootEv))
-
-	// Verify request node
-	snap := s.Snapshot()
-	reqID := core.ID("request", traceID)
-	req, ok := snap.Nodes[reqID]
-	if !ok {
-		t.Fatalf("request node %s not found", reqID)
-	}
-
-	// Root's values should win for status_code, latency_ms, event_name
-	if got := req.Attr["status_code"]; got != 200 {
-		t.Errorf("status_code = %v, want 200 (from root)", got)
-	}
-	if got := req.Attr["latency_ms"]; got != int64(45) {
-		t.Errorf("latency_ms = %v, want 45 (from root)", got)
-	}
-	if got := req.Attr["event_name"]; got != "api-gateway.request" {
-		t.Errorf("event_name = %v, want api-gateway.request (from root)", got)
-	}
-	if got := req.Attr["flow"]; got != "purchase" {
-		t.Errorf("flow = %v, want purchase (from root)", got)
-	}
-
-	// success should be AND: child was false, so overall false
-	if got := req.Attr["success"]; got != false {
-		t.Errorf("success = %v, want false (AND of child failure)", got)
-	}
-
-	// is_root should be true (root event set it)
-	if got := req.Attr["is_root"]; got != true {
-		t.Errorf("is_root = %v, want true", got)
-	}
-
-	codes, ok := req.Attr["error_codes"].([]string)
-	if !ok {
-		t.Fatalf("error_codes should be []string, got %T (%v)", req.Attr["error_codes"], req.Attr["error_codes"])
-	}
-	if len(codes) != 1 || codes[0] != "PMT_502" {
-		t.Errorf("error_codes = %v, want [PMT_502]", codes)
-	}
-}
-
-func TestStore_TraceStore_SpanMergeBySpanID(t *testing.T) {
-	s, ts := newStoreWithTraceStore()
-	b := build.NewBuilder()
-	traceID := "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
-	parentSpanID := "1111111111111111"
-	childSpanID := "2222222222222222"
-
-	childEv := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID(childSpanID),
-		testutil.WithParentSpanID(parentSpanID),
-		testutil.WithService("checkout"),
-		testutil.WithStatusCode(200),
-		testutil.WithLatency(32),
-	)
-	childResult := b.BuildResult(childEv)
-	s.Merge(childResult.Graph)
-	if childResult.Span != nil {
-		ts.Upsert(traceID, core.ID("request", traceID), childResult.Span)
-	}
-
-	parentEv := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID(parentSpanID),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(200),
-		testutil.WithLatency(45),
-		testutil.WithEventName("api-gateway.request"),
-	)
-	parentResult := b.BuildResult(parentEv)
-	s.Merge(parentResult.Graph)
-	if parentResult.Span != nil {
-		ts.Upsert(traceID, core.ID("request", traceID), parentResult.Span)
-	}
-
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		t.Fatalf("trace %s not found", traceID)
-	}
-	if len(rec.Spans) != 2 {
-		t.Fatalf("expected 2 span records, got %d", len(rec.Spans))
-	}
-	var parent tracestore.SpanRecord
-	found := false
-	for _, span := range rec.Spans {
-		if span.SpanID == parentSpanID {
-			parent = span
-			found = true
-			break
-		}
-	}
-	if !found {
-		t.Fatalf("parent span %s not found", parentSpanID)
-	}
-	if parent.Service != "api-gateway" {
-		t.Errorf("service = %v, want api-gateway", parent.Service)
-	}
-	if parent.StatusCode != 200 {
-		t.Errorf("status_code = %v, want 200", parent.StatusCode)
-	}
-	if parent.LatencyMs != int64(45) {
-		t.Errorf("latency_ms = %v, want 45", parent.LatencyMs)
-	}
-	if parent.EventName != "api-gateway.request" {
-		t.Errorf("event_name = %v, want api-gateway.request", parent.EventName)
-	}
-}
-
-func TestStore_EdgeDedup(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "cccccccccccccccccccccccccccccccc"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(200),
-	)
-
-	g := b.Build(ev)
-	s.Merge(g)
-	edgeCount1 := len(s.Snapshot().Edges)
-
-	// Merge the same graph again — edge count should not increase
-	s.Merge(g)
-	edgeCount2 := len(s.Snapshot().Edges)
-
-	if edgeCount2 != edgeCount1 {
-		t.Errorf("edge count increased from %d to %d after duplicate merge", edgeCount1, edgeCount2)
-	}
-}
-
-func TestStore_Index_TraceToRequest(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "dddddddddddddddddddddddddddddddd"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-	)
-	s.Merge(b.Build(ev))
-
-	reqID, ok := s.RequestIDForTrace(traceID)
-	if !ok {
-		t.Fatal("expected RequestIDForTrace to return true")
-	}
-	expectedReqID := core.ID("request", traceID)
-	if reqID != expectedReqID {
-		t.Errorf("RequestIDForTrace = %s, want %s", reqID, expectedReqID)
-	}
-
-	// Unknown trace
-	_, ok = s.RequestIDForTrace("0000000000000000000000000000000f")
-	if ok {
-		t.Error("expected RequestIDForTrace to return false for unknown trace")
-	}
-}
-
-func TestStore_TraceStore_Get_ReturnsUniqueSpans(t *testing.T) {
-	s, ts := newStoreWithTraceStore()
-	b := build.NewBuilder()
-	traceID := "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
-
-	// Two events for the same trace with different spans
-	ev1 := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("api-gateway"),
-	)
-	ev2 := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("2222222222222222"),
-		testutil.WithParentSpanID("1111111111111111"),
-		testutil.WithService("checkout"),
-	)
-
-	for _, current := range []struct {
-		traceID string
-		event   event.WideEvent
-	}{
-		{traceID: traceID, event: ev1},
-		{traceID: traceID, event: ev2},
-		{traceID: traceID, event: ev2},
-	} {
-		result := b.BuildResult(current.event)
-		s.Merge(result.Graph)
-		if result.Span != nil {
-			ts.Upsert(current.traceID, core.ID("request", current.traceID), result.Span)
-		}
-	}
-
-	rec, ok := ts.Get(traceID)
-	if !ok {
-		t.Fatal("expected trace record to exist")
-	}
-	if len(rec.Spans) != 2 {
-		t.Fatalf("expected 2 unique span records, got %d", len(rec.Spans))
-	}
-}
-
-func TestStore_TraceStore_Get_ReturnsCopy(t *testing.T) {
-	s, ts := newStoreWithTraceStore()
-	b := build.NewBuilder()
-	traceID := "abababababababababababababababab"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("3333333333333333"),
-		testutil.WithService("api-gateway"),
-	)
-	result := b.BuildResult(ev)
-	s.Merge(result.Graph)
-	if result.Span != nil {
-		ts.Upsert(traceID, core.ID("request", traceID), result.Span)
-	}
-	got, ok := ts.Get(traceID)
-	if !ok || len(got.Spans) == 0 {
-		t.Fatal("expected at least one span record")
-	}
-	got.Spans[0].SpanID = "mutated-by-caller"
-
-	fresh, ok := ts.Get(traceID)
-	if !ok || len(fresh.Spans) == 0 {
-		t.Fatal("expected at least one span record on second read")
-	}
-	if fresh.Spans[0].SpanID == "mutated-by-caller" {
-		t.Fatal("Get returned internal backing data; caller mutation leaked into store")
-	}
-}
-
-func TestStore_Merge_RecomputesFacts(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa02"
-
-	// First event: checkout service, no error
-	ev1 := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("checkout"),
-		testutil.WithStatusCode(200),
-		testutil.WithEventName("checkout.request"),
-	)
-	s.Merge(b.Build(ev1))
-
-	// Verify no errors in counters yet
-	if len(s.counters.ErrorCount) != 0 {
-		t.Fatalf("expected 0 errors after first merge, got %v", s.counters.ErrorCount)
-	}
-
-	// Second event: same trace, child span with an error
-	ev2 := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("2222222222222222"),
-		testutil.WithParentSpanID("1111111111111111"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("ERR_PAYMENT", "payment failed"),
-		testutil.WithEventName("payment.error"),
-	)
-	s.Merge(b.Build(ev2))
-
-	// The error code should now be counted (keyed by code string, not node ID).
-	count, ok := s.counters.ErrorCount["ERR_PAYMENT"]
-	if !ok || count < 1 {
-		t.Errorf("expected ErrorCount[ERR_PAYMENT] >= 1, got %d (ok=%v); full counters: %v",
-			count, ok, s.counters.ErrorCount)
-	}
-}
-
-func TestStore_Index_Restore_Rebuilds(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "ffffffffffffffffffffffffffffffff"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithService("api-gateway"),
-	)
-	s.Merge(b.Build(ev))
-
-	// Take snapshot and restore into a fresh store
-	snap := s.Snapshot()
-	s2 := NewStore()
-	s2.Restore(snap)
-
-	// Indexes should be rebuilt from restored graph
-	reqID, ok := s2.RequestIDForTrace(traceID)
-	if !ok {
-		t.Fatal("RequestIDForTrace should work after Restore")
-	}
-	if reqID != core.ID("request", traceID) {
-		t.Errorf("RequestIDForTrace = %s, want %s", reqID, core.ID("request", traceID))
-	}
-
-	// Edge dedup should also work after restore — merge same snapshot again
-	edgesBefore := len(s2.Snapshot().Edges)
-	s2.Merge(snap)
-	edgesAfter := len(s2.Snapshot().Edges)
-	if edgesAfter != edgesBefore {
-		t.Errorf("edges grew from %d to %d after duplicate merge post-Restore", edgesBefore, edgesAfter)
-	}
-}
-
-func TestStore_LateRootMerge_UpdatesRootService(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa03"
-	reqID := core.ID("request", traceID)
-
-	// Child span arrives first — same service set as root will use,
-	// so Services/Errors/Flags don't change when root merges.
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("2222222222222222"),
-		testutil.WithParentSpanID("1111111111111111"),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(200),
-		testutil.WithEventName("api-gateway.request"),
-	)
-	s.Merge(b.Build(child))
-
-	// Before root: RootService should be empty.
-	facts1, ok := s.requestFacts[reqID]
-	if !ok {
-		t.Fatal("requestFacts not found after child merge")
-	}
-	if facts1.RootService != "" {
-		t.Errorf("RootService = %q before root merge, want empty", facts1.RootService)
-	}
-
-	// Root span arrives — counter-relevant fields (Services/Errors/Flags) unchanged.
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(200),
-		testutil.WithEventName("api-gateway.request"),
-	)
-	s.Merge(b.Build(root))
-
-	// After root: RootService must be set even though counters didn't change.
-	facts2, ok := s.requestFacts[reqID]
-	if !ok {
-		t.Fatal("requestFacts not found after root merge")
-	}
-	if facts2.RootService != "api-gateway" {
-		t.Errorf("RootService = %q after root merge, want api-gateway", facts2.RootService)
-	}
-
-	// Counter-relevant fields should be identical (no spurious recompute).
-	if !factsEqual(facts1, facts2) {
-		t.Error("counter-relevant fields changed unexpectedly")
-	}
-}
-
-func TestStore_Merge_RootOverwritesHTTPMethodAndRouteTemplate(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa04"
-	reqID := core.ID("request", traceID)
-
-	// Child arrives first with method/template.
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("2222222222222222"),
-		testutil.WithParentSpanID("1111111111111111"),
-		testutil.WithService("payment"),
-		testutil.WithEventName("payment.request"),
-		testutil.WithHTTPMethod("GET"),
-		testutil.WithRouteTemplate("/payments/{id}"),
-	)
-	s.Merge(b.Build(child))
-
-	snap := s.Snapshot()
-	req := snap.Nodes[reqID]
-	if got := req.Attr["http_method"]; got != "GET" {
-		t.Fatalf("http_method = %v, want GET before root merge", got)
-	}
-	if got := req.Attr["route_template"]; got != "/payments/{id}" {
-		t.Fatalf("route_template = %v, want /payments/{id} before root merge", got)
-	}
-
-	// Root arrives later with new method/template and should overwrite.
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithService("api-gateway"),
-		testutil.WithEventName("api-gateway.request"),
-		testutil.WithHTTPMethod("POST"),
-		testutil.WithRouteTemplate("/checkout"),
-	)
-	s.Merge(b.Build(root))
-
-	snap = s.Snapshot()
-	req = snap.Nodes[reqID]
-	if got := req.Attr["http_method"]; got != "POST" {
-		t.Errorf("http_method = %v, want POST after root merge", got)
-	}
-	if got := req.Attr["route_template"]; got != "/checkout" {
-		t.Errorf("route_template = %v, want /checkout after root merge", got)
-	}
-}
-
-// ---------- ErrorIndex tests ----------
-
-func requireSetEqual(t *testing.T, label string, got []string, want []string) {
-	t.Helper()
-	if len(got) != len(want) {
-		t.Fatalf("%s: len = %d, want %d; got %v", label, len(got), len(want), got)
-	}
-	set := map[string]struct{}{}
-	for _, v := range want {
-		set[v] = struct{}{}
-	}
-	for _, v := range got {
-		if _, ok := set[v]; !ok {
-			t.Fatalf("%s: unexpected element %q; got %v, want %v", label, v, got, want)
-		}
-	}
-}
-
-func TestErrorIndex_UnknownCode(t *testing.T) {
-	s := NewStore()
-	ids, ready := s.ErrorIndex("NOPE")
-	if !ready {
-		t.Fatal("expected ready=true on fresh store")
-	}
-	if len(ids) != 0 {
-		t.Errorf("expected empty slice, got %v", ids)
-	}
-}
-
-func TestErrorIndex_BasicLookup(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aa00000000000000000000000000aa01"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1100000000000001"),
-		testutil.WithService("svc-a"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("SVC_500", "internal error"),
-		testutil.WithEventName("svc-a.error"),
-	)
-	s.Merge(b.Build(ev))
-
-	ids, ready := s.ErrorIndex("SVC_500")
-	if !ready {
-		t.Fatal("expected ready=true")
-	}
-	requireSetEqual(t, "ErrorIndex(SVC_500)", ids, []string{core.ID("request", traceID)})
-}
-
-func TestErrorIndex_DeduplicatesSameRequest_DuplicateEdges(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aa00000000000000000000000000aa02"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1100000000000002"),
-		testutil.WithService("svc-b"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("DUP_502", "bad gateway"),
-		testutil.WithEventName("svc-b.error"),
-	)
-	g := b.Build(ev)
-	s.Merge(g)
-	s.Merge(g) // duplicate merge
-
-	ids, ready := s.ErrorIndex("DUP_502")
-	if !ready {
-		t.Fatal("expected ready=true")
-	}
-	requireSetEqual(t, "ErrorIndex(DUP_502)", ids, []string{core.ID("request", traceID)})
-}
-
-func TestErrorIndex_DeduplicatesSameRequest_SpanAndRequestOrigin(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aa00000000000000000000000000aa03"
-
-	// Root span with error
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1100000000000003"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("svc-c"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("BOTH_500", "root error"),
-		testutil.WithEventName("svc-c.error"),
-	)
-	s.Merge(b.Build(root))
-
-	// Child span with same error code
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1100000000000004"),
-		testutil.WithParentSpanID("1100000000000003"),
-		testutil.WithService("svc-c"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("BOTH_500", "child error"),
-		testutil.WithEventName("svc-c.error"),
-	)
-	s.Merge(b.Build(child))
-
-	ids, ready := s.ErrorIndex("BOTH_500")
-	if !ready {
-		t.Fatal("expected ready=true")
-	}
-	requireSetEqual(t, "ErrorIndex(BOTH_500)", ids, []string{core.ID("request", traceID)})
-}
-
-func TestErrorIndex_MultiRequestSameCode(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceA := "aa00000000000000000000000000aa04"
-	traceB := "aa00000000000000000000000000aa05"
-
-	evA := testutil.MakeEvent(
-		testutil.WithTraceID(traceA),
-		testutil.WithSpanID("1100000000000005"),
-		testutil.WithService("svc-d"),
-		testutil.WithStatusCode(503),
-		testutil.WithError("SHARED_503", "service unavailable"),
-		testutil.WithEventName("svc-d.error"),
-	)
-	evB := testutil.MakeEvent(
-		testutil.WithTraceID(traceB),
-		testutil.WithSpanID("1100000000000006"),
-		testutil.WithService("svc-e"),
-		testutil.WithStatusCode(503),
-		testutil.WithError("SHARED_503", "service unavailable"),
-		testutil.WithEventName("svc-e.error"),
-	)
-	s.Merge(b.Build(evA))
-	s.Merge(b.Build(evB))
-
-	ids, ready := s.ErrorIndex("SHARED_503")
-	if !ready {
-		t.Fatal("expected ready=true")
-	}
-	requireSetEqual(t, "ErrorIndex(SHARED_503)", ids, []string{
-		core.ID("request", traceA),
-		core.ID("request", traceB),
-	})
-}
-
-func TestErrorIndex_ReadinessAfterRestore(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aa00000000000000000000000000aa06"
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("1100000000000007"),
-		testutil.WithService("svc-f"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("RESTORE_ERR", "fail"),
-		testutil.WithEventName("svc-f.error"),
-	)
-	s.Merge(b.Build(ev))
-
-	snap := s.Snapshot()
-
-	s2 := NewStore()
-	s2.Restore(snap)
-
-	ids, ready := s2.ErrorIndex("RESTORE_ERR")
-	if !ready {
-		t.Fatal("expected ready=true after Restore")
-	}
-	requireSetEqual(t, "ErrorIndex(RESTORE_ERR) after restore", ids, []string{core.ID("request", traceID)})
-}
-
-func TestErrorIndex_PruneRemovesStaleEntries(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-
-	// Use a fixed reference time to avoid any time.Now() sensitivity.
-	ref := time.Date(2025, 6, 1, 12, 0, 0, 0, time.UTC)
-	oldTrace := "aa00000000000000000000000000aa07"
-	newTrace := "aa00000000000000000000000000aa08"
-
-	oldEv := testutil.MakeEvent(
-		testutil.WithTraceID(oldTrace),
-		testutil.WithSpanID("1100000000000008"),
-		testutil.WithService("svc-old"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("OLD_ERR", "old failure"),
-		testutil.WithEventName("svc-old.error"),
-		testutil.WithTimestamp(ref.Add(-2*time.Hour)),
-		testutil.WithUser("user-old", "standard", "us-west-2"),
-	)
-	newEv := testutil.MakeEvent(
-		testutil.WithTraceID(newTrace),
-		testutil.WithSpanID("1100000000000009"),
-		testutil.WithService("svc-new"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("NEW_ERR", "new failure"),
-		testutil.WithEventName("svc-new.error"),
-		testutil.WithTimestamp(ref),
-		testutil.WithUser("user-new", "standard", "us-west-2"),
-	)
-
-	s.Merge(b.Build(oldEv))
-	s.Merge(b.Build(newEv))
-
-	// Prune with 1h cutoff — old entries should be removed
-	s.PruneOlderThan(ref.Add(-1 * time.Hour))
-
-	oldIDs, ready := s.ErrorIndex("OLD_ERR")
-	if !ready {
-		t.Fatal("expected ready=true after prune")
-	}
-	if len(oldIDs) != 0 {
-		t.Errorf("OLD_ERR should be empty after prune, got %v", oldIDs)
-	}
-
-	newIDs, ready := s.ErrorIndex("NEW_ERR")
-	if !ready {
-		t.Fatal("expected ready=true after prune")
-	}
-	requireSetEqual(t, "ErrorIndex(NEW_ERR) after prune", newIDs, []string{core.ID("request", newTrace)})
-}
-
-func TestErrorIndex_SpanToRequestResolution(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	traceID := "aa00000000000000000000000000aa09"
-
-	// Root span (no error) — creates request + span nodes
-	root := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("110000000000000a"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("svc-g"),
-		testutil.WithStatusCode(200),
-		testutil.WithEventName("svc-g.request"),
-	)
-	s.Merge(b.Build(root))
-
-	// Child span with error — FailedWith edge from span node to error node
-	child := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("110000000000000b"),
-		testutil.WithParentSpanID("110000000000000a"),
-		testutil.WithService("svc-h"),
-		testutil.WithStatusCode(500),
-		testutil.WithError("CHILD_ERR", "child failed"),
-		testutil.WithEventName("svc-h.error"),
-	)
-	s.Merge(b.Build(child))
-
-	ids, ready := s.ErrorIndex("CHILD_ERR")
-	if !ready {
-		t.Fatal("expected ready=true")
-	}
-	requireSetEqual(t, "ErrorIndex(CHILD_ERR) via span→request", ids, []string{core.ID("request", traceID)})
-}
diff --git a/internal/graph/store/summaries.go b/internal/graph/store/summaries.go
deleted file mode 100644
index 82f4d46..0000000
--- a/internal/graph/store/summaries.go
+++ /dev/null
@@ -1,151 +0,0 @@
-package store
-
-import (
-	"sort"
-	"time"
-)
-
-// WindowSummary is the PROPAGATION-COUNTED window summary: every error code
-// on a failed request is tallied once per occurrence, so a payment→checkout
-// →api-gateway cascade with three distinct codes contributes three entries
-// (not one) to ErrorCount.
-//
-// This spread is correct for detail surfaces — trace stories, blast radius,
-// failure chains — that intentionally show how a failure propagated. It is
-// WRONG for default user-facing rollups (top errors, overview KPIs,
-// compare_windows, spike detection), which should count one root-cause
-// error per failed request. Those surfaces MUST consume
-// analysis.RollupWindow / analysis.RollupSummary instead.
-//
-// New callers introducing default rollups on WindowSummary will re-introduce
-// the PMT_502=9-not-3 cascade-amplification bug. Bind the result to a
-// variable named propagationSummary (or equivalent) so the review trail
-// reflects the propagation-counted semantics.
-type WindowSummary struct {
-	Start time.Time
-	End   time.Time
-
-	TotalRequests       int
-	TotalFailures       int
-	ServiceRequestCount map[string]int
-	FlagRequestCount    map[string]int
-	LatencyP50          int64
-	LatencyP95          int64
-	LatencyP99          int64
-
-	ErrorCount        map[string]int
-	ServiceErrorCount map[string]map[string]int
-	FlagErrorCount    map[string]map[string]int
-}
-
-// SummarizeWindow returns the propagation-counted [WindowSummary]. See the
-// WindowSummary doc for when to use it vs analysis.RollupWindow.
-func (s *Store) SummarizeWindow(start, end time.Time) WindowSummary {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	out := WindowSummary{
-		Start:               start,
-		End:                 end,
-		ErrorCount:          map[string]int{},
-		ServiceErrorCount:   map[string]map[string]int{},
-		FlagErrorCount:      map[string]map[string]int{},
-		ServiceRequestCount: map[string]int{},
-		FlagRequestCount:    map[string]int{},
-	}
-
-	var latencies []int64
-
-	for _, f := range s.requestFacts {
-		t := f.SeenAt
-		if t.IsZero() {
-			continue // skip malformed facts
-		}
-		if t.Before(start) || t.After(end) {
-			continue
-		}
-
-		out.TotalRequests++
-		if len(f.Errors) > 0 {
-			out.TotalFailures++
-		}
-		latencies = append(latencies, f.LatencyMs)
-
-		seenSvc := map[string]bool{}
-		for _, svcID := range f.Services {
-			if !seenSvc[svcID] {
-				seenSvc[svcID] = true
-				out.ServiceRequestCount[svcID]++
-			}
-		}
-		seenFlag := map[string]bool{}
-		for _, flagID := range f.FeatureFlags {
-			if !seenFlag[flagID] {
-				seenFlag[flagID] = true
-				out.FlagRequestCount[flagID]++
-			}
-		}
-
-		for _, errID := range f.Errors {
-			out.ErrorCount[errID]++
-
-			for svcID := range seenSvc {
-				m := out.ServiceErrorCount[svcID]
-				if m == nil {
-					m = map[string]int{}
-					out.ServiceErrorCount[svcID] = m
-				}
-				m[errID]++
-			}
-
-			for flagID := range seenFlag {
-				m := out.FlagErrorCount[flagID]
-				if m == nil {
-					m = map[string]int{}
-					out.FlagErrorCount[flagID] = m
-				}
-				m[errID]++
-			}
-		}
-	}
-
-	sort.Slice(latencies, func(i, j int) bool { return latencies[i] < latencies[j] })
-	out.LatencyP50 = percentile(latencies, 50)
-	out.LatencyP95 = percentile(latencies, 95)
-	out.LatencyP99 = percentile(latencies, 99)
-
-	return out
-}
-func percentile(sorted []int64, pct int) int64 {
-	n := len(sorted)
-	if n == 0 {
-		return 0
-	}
-	// Nearest-rank: idx = ceil(pct/100 * n) - 1
-	idx := (pct*n + 99) / 100
-	if idx < 1 {
-		idx = 1
-	}
-	if idx > n {
-		idx = n
-	}
-	return sorted[idx-1]
-}
-
-func (s *Store) ForEachRequestFact(
-	start, end time.Time,
-	fn func(RequestFacts),
-) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	for _, f := range s.requestFacts {
-		if f.SeenAt.IsZero() {
-			continue
-		}
-		if f.SeenAt.Before(start) || f.SeenAt.After(end) {
-			continue
-		}
-		fn(f)
-	}
-}
diff --git a/internal/graph/store/summaries_test.go b/internal/graph/store/summaries_test.go
deleted file mode 100644
index 2823066..0000000
--- a/internal/graph/store/summaries_test.go
+++ /dev/null
@@ -1,195 +0,0 @@
-package store
-
-import (
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func TestSummarizeWindow_CountsAndQuantiles(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	now := time.Now()
-
-	// 5 requests with varying latencies: 10, 20, 30, 40, 50
-	for i, lat := range []int64{10, 20, 30, 40, 50} {
-		ev := testutil.MakeEvent(
-			testutil.WithTraceID(padHex(i)),
-			testutil.WithLatency(lat),
-			testutil.WithTimestamp(now.Add(-time.Duration(i)*time.Second)),
-		)
-		s.Merge(b.Build(ev))
-	}
-
-	// 2 failure requests
-	for i, code := range []string{"ERR_A", "ERR_B"} {
-		ev := testutil.MakeEvent(
-			testutil.WithTraceID(padHex(100+i)),
-			testutil.WithLatency(100),
-			testutil.WithError(code, "msg"),
-			testutil.WithStatusCode(500),
-			testutil.WithTimestamp(now.Add(-time.Duration(i)*time.Second)),
-		)
-		s.Merge(b.Build(ev))
-	}
-
-	summary := s.SummarizeWindow(now.Add(-time.Minute), now.Add(time.Minute))
-
-	if summary.TotalRequests != 7 {
-		t.Errorf("TotalRequests = %d, want 7", summary.TotalRequests)
-	}
-	if summary.TotalFailures != 2 {
-		t.Errorf("TotalFailures = %d, want 2", summary.TotalFailures)
-	}
-
-	// 7 latencies sorted: 10, 20, 30, 40, 50, 100, 100
-	// P50: index = 50*7/100 = 3 → 40
-	// P95: index = 95*7/100 = 6 → 100
-	// P99: index = 99*7/100 = 6 → 100
-	if summary.LatencyP50 != 40 {
-		t.Errorf("LatencyP50 = %d, want 40", summary.LatencyP50)
-	}
-	if summary.LatencyP95 != 100 {
-		t.Errorf("LatencyP95 = %d, want 100", summary.LatencyP95)
-	}
-	if summary.LatencyP99 != 100 {
-		t.Errorf("LatencyP99 = %d, want 100", summary.LatencyP99)
-	}
-}
-
-func TestSummarizeWindow_DedupServicesFlags(t *testing.T) {
-	s := NewStore()
-	b := build.NewBuilder()
-	now := time.Now()
-
-	// Create two events for the same trace (same service → duplicate edges)
-	traceID := padHex(1)
-	for i := 0; i < 2; i++ {
-		ev := testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithService("svc-a"),
-			testutil.WithFeatureFlags("flag-x"),
-			testutil.WithTimestamp(now),
-			testutil.WithSpanID(padSpan(i)),
-		)
-		s.Merge(b.Build(ev))
-	}
-
-	// Create a second request with different trace
-	ev2 := testutil.MakeEvent(
-		testutil.WithTraceID(padHex(2)),
-		testutil.WithService("svc-a"),
-		testutil.WithFeatureFlags("flag-x"),
-		testutil.WithTimestamp(now),
-	)
-	s.Merge(b.Build(ev2))
-
-	summary := s.SummarizeWindow(now.Add(-time.Minute), now.Add(time.Minute))
-
-	// svc-a should count per-request, not per-event-merge
-	// Each request fact has deduplicated services
-	if summary.TotalRequests != 2 {
-		t.Errorf("TotalRequests = %d, want 2", summary.TotalRequests)
-	}
-
-	// ServiceRequestCount: each request contributes 1 (deduped)
-	for svcID, count := range summary.ServiceRequestCount {
-		if count > 2 {
-			t.Errorf("ServiceRequestCount[%s] = %d, want <= 2 (deduped per request)", svcID, count)
-		}
-	}
-}
-
-func TestSummarizeWindow_UsesFlattenedFeatureFlags(t *testing.T) {
-	s := NewStore()
-	now := time.Now()
-	s.requestFacts["req-1"] = RequestFacts{
-		RequestID:    "req-1",
-		SeenAt:       now,
-		Services:     []string{"checkout"},
-		Errors:       []string{"ERR_A"},
-		FeatureFlags: []string{"flag-a", "flag-b", "flag-a"},
-		LatencyMs:    25,
-		Status:       "error",
-	}
-	s.requestFacts["req-2"] = RequestFacts{
-		RequestID:    "req-2",
-		SeenAt:       now.Add(time.Second),
-		Services:     []string{"checkout"},
-		FeatureFlags: []string{"flag-b"},
-		LatencyMs:    75,
-		Status:       "ok",
-	}
-
-	summary := s.SummarizeWindow(now.Add(-time.Minute), now.Add(time.Minute))
-
-	if summary.TotalRequests != 2 {
-		t.Fatalf("TotalRequests = %d, want 2", summary.TotalRequests)
-	}
-	if summary.TotalFailures != 1 {
-		t.Fatalf("TotalFailures = %d, want 1", summary.TotalFailures)
-	}
-	if got := summary.ServiceRequestCount["checkout"]; got != 2 {
-		t.Fatalf("ServiceRequestCount[checkout] = %d, want 2", got)
-	}
-	if got := summary.FlagRequestCount["flag-a"]; got != 1 {
-		t.Fatalf("FlagRequestCount[flag-a] = %d, want 1", got)
-	}
-	if got := summary.FlagRequestCount["flag-b"]; got != 2 {
-		t.Fatalf("FlagRequestCount[flag-b] = %d, want 2", got)
-	}
-	if got := summary.FlagErrorCount["flag-a"]["ERR_A"]; got != 1 {
-		t.Fatalf("FlagErrorCount[flag-a][ERR_A] = %d, want 1", got)
-	}
-	if got := summary.FlagErrorCount["flag-b"]["ERR_A"]; got != 1 {
-		t.Fatalf("FlagErrorCount[flag-b][ERR_A] = %d, want 1", got)
-	}
-}
-
-func TestPercentile(t *testing.T) {
-	tests := []struct {
-		name   string
-		sorted []int64
-		pct    int
-		want   int64
-	}{
-		{"empty", nil, 50, 0},
-		{"single", []int64{42}, 50, 42},
-		{"single p99", []int64{42}, 99, 42},
-		{"ten elements p50", []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 50, 5},
-		{"ten elements p95", []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 95, 10},
-		{"ten elements p99", []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 99, 10},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := percentile(tt.sorted, tt.pct)
-			if got != tt.want {
-				t.Errorf("percentile(%v, %d) = %d, want %d", tt.sorted, tt.pct, got, tt.want)
-			}
-		})
-	}
-}
-
-func padHex(n int) string {
-	s := ""
-	for len(s) < 31 {
-		s += "0"
-	}
-	hex := "0123456789abcdef"
-	if n < 16 {
-		return s + string(hex[n])
-	}
-	return s[:30] + string(hex[n/16]) + string(hex[n%16])
-}
-
-func padSpan(n int) string {
-	s := ""
-	for len(s) < 15 {
-		s += "0"
-	}
-	hex := "0123456789abcdef"
-	return s + string(hex[n])
-}
diff --git a/internal/graph/window/window.go b/internal/graph/window/window.go
deleted file mode 100644
index 5a8d830..0000000
--- a/internal/graph/window/window.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package window
-
-import (
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-// FilterByWindow returns a graph containing ONLY
-// requests active in [start, end] and their 1-hop context.
-func FilterByWindow(g *core.Graph, start, end time.Time) *core.Graph {
-	if g == nil {
-		return core.New()
-	}
-
-	out := core.New()
-	keep := map[string]bool{}
-
-	// 1. Keep request nodes active in window
-	for id, n := range g.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		if n.LastSeen.IsZero() {
-			continue
-		}
-		if !n.LastSeen.Before(start) && !n.LastSeen.After(end) {
-			keep[id] = true
-		}
-	}
-
-	// 2. Pull in 1-hop neighbors via adjacency indexes
-	for id := range keep {
-		for _, e := range g.OutEdges[id] {
-			keep[e.To] = true
-		}
-		for _, e := range g.InEdges[id] {
-			keep[e.From] = true
-		}
-	}
-
-	// 3. Copy nodes
-	for id := range keep {
-		if n, ok := g.Nodes[id]; ok {
-			out.Nodes[id] = n
-		}
-	}
-
-	// 4. Copy edges and rebuild indexes
-	for _, e := range g.Edges {
-		if keep[e.From] && keep[e.To] {
-			out.AddEdge(e)
-		}
-	}
-
-	return out
-}
diff --git a/internal/incidents/capture.go b/internal/incidents/capture.go
new file mode 100644
index 0000000..6e6cf4b
--- /dev/null
+++ b/internal/incidents/capture.go
@@ -0,0 +1,369 @@
+package incidents
+
+import (
+	"sort"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// pickAnchorTsStart returns the earliest wall-clock TsStart among the events
+// whose Anchor matches family; nil events and events without an Anchor are
+// skipped. Match tiers, in priority order (earliest TsStart wins per tier):
+//  1. Anchor.Step == family.Step AND Anchor.ErrorCode == family.ErrorCode
+//  2. Anchor.Step == family.Step (ignoring ErrorCode)
+//  3. no event matches family.Step: the zero time and ok=false
+func pickAnchorTsStart(events []*eventv2.Event, family apiv2.ErrorFamily) (time.Time, bool) {
+	var bestStrict, bestLoose time.Time
+	var hasStrict, hasLoose bool
+	for _, ev := range events {
+		if ev == nil || ev.Anchor == nil {
+			continue
+		}
+		if ev.Anchor.Step != family.Step {
+			continue
+		}
+		if !hasLoose || ev.TsStart.Before(bestLoose) {
+			bestLoose, hasLoose = ev.TsStart, true
+		}
+		if ev.Anchor.ErrorCode == family.ErrorCode {
+			if !hasStrict || ev.TsStart.Before(bestStrict) {
+				bestStrict, hasStrict = ev.TsStart, true
+			}
+		}
+	}
+	if hasStrict {
+		return bestStrict, true
+	}
+	if hasLoose {
+		return bestLoose, true
+	}
+	return time.Time{}, false
+}
+
+// newBlastEvidence projects an apiv2.BlastRadiusResponse into a BlastEvidence
+// snapshot stamped with capturedAt. TopServices and SampleTraces are copied so
+// the snapshot never aliases b's slices. The caller supplies status: CaptureOK
+// on a successful read (zero counts are a valid OK), or CaptureMissing if the
+// reader call faulted (panic-recovered upstream) and b is the zero response.
+func newBlastEvidence(b apiv2.BlastRadiusResponse, capturedAt time.Time, status EvidenceCaptureStatus) *BlastEvidence {
+	return &BlastEvidence{
+		AffectedRequests: b.AffectedRequests,
+		AffectedUsers:    b.AffectedUsers,
+		AffectedServices: b.AffectedServices,
+		TopServices:      append([]string(nil), b.TopServices...),
+		SampledTraces:    append([]string(nil), b.SampleTraces...),
+		CapturedAt:       capturedAt,
+		CaptureStatus:    status,
+	}
+}
+
+// newPropagationEvidence projects a StoryResponse into a PropagationEvidence
+// snapshot. CaptureStatus is derived here, not passed in: a nil story yields
+// CaptureMissing; a nil Anchor, an empty Path, or a nil firstSeenAt (the
+// wall-clock anchor TsStart) yields CapturePartial; anything else CaptureOK.
+func newPropagationEvidence(story *apiv2.StoryResponse, sampleTraceID string, firstSeenAt *time.Time, capturedAt time.Time) *PropagationEvidence {
+	if story == nil {
+		return &PropagationEvidence{
+			SampleTraceID: sampleTraceID,
+			CapturedAt:    capturedAt,
+			CaptureStatus: CaptureMissing,
+		}
+	}
+	status := CaptureOK
+	originStep := ""
+	if story.Anchor != nil {
+		originStep = story.Anchor.Step
+	} else {
+		status = CapturePartial
+	}
+	if len(story.Path) == 0 {
+		status = CapturePartial
+	}
+	if firstSeenAt == nil {
+		status = CapturePartial
+	}
+	path := make([]PropagationStep, 0, len(story.Path))
+	for _, s := range story.Path {
+		path = append(path, PropagationStep{
+			Service:    story.Service, // the story's service is stamped on every step
+			Step:       s.Name,
+			StartMS:    s.StartMS,
+			DurationMS: s.DurationMS,
+			Status:     s.Status,
+			ErrorCode:  s.ErrorCode,
+		})
+	}
+	return &PropagationEvidence{
+		OriginService: story.Service,
+		OriginStep:    originStep,
+		Path:          path,
+		SampleTraceID: sampleTraceID,
+		FirstSeenAt:   firstSeenAt,
+		CapturedAt:    capturedAt,
+		CaptureStatus: status,
+	}
+}
+
+// updatePropagationSnapshot applies the Opening/Latest lifecycle and returns a
+// new PropagationSnapshot; prior is never mutated and may be nil.
+//   - Latest always becomes fresh, even when fresh is partial or missing.
+//   - Opening is carried over from prior once set; otherwise it is taken from
+//     fresh, but only if fresh is a CaptureOK capture.
+func updatePropagationSnapshot(prior *PropagationSnapshot, fresh *PropagationEvidence) *PropagationSnapshot {
+	next := &PropagationSnapshot{Latest: fresh}
+	if prior != nil && prior.Opening != nil {
+		next.Opening = prior.Opening
+	} else if fresh != nil && fresh.CaptureStatus == CaptureOK {
+		next.Opening = fresh
+	}
+	return next
+}
+
+// updateBlastSnapshot applies the Opening/Latest lifecycle to BlastSnapshot,
+// exactly as updatePropagationSnapshot does for propagation: Latest is always
+// fresh, Opening is the first CaptureOK capture. The two are independent, so
+// this runs even if propagation capture failed. prior may be nil.
+func updateBlastSnapshot(prior *BlastSnapshot, fresh *BlastEvidence) *BlastSnapshot {
+	next := &BlastSnapshot{Latest: fresh}
+	if prior != nil && prior.Opening != nil {
+		next.Opening = prior.Opening
+	} else if fresh != nil && fresh.CaptureStatus == CaptureOK {
+		next.Opening = fresh
+	}
+	return next
+}
+
+// captureAlertEvidenceFromSignals collects every row that
+// matchAlertSignalToIncident accepts for inc. matchWindow defaults to 15m when
+// non-positive and is capped at 24h. CaptureStatus is CaptureOK when at least
+// one alert matched and CaptureMissing otherwise; Matches is never nil.
+func captureAlertEvidenceFromSignals(rows []signals.Signal, inc Incident, capturedAt time.Time, matchWindow time.Duration) *AlertEvidence {
+	switch {
+	case matchWindow <= 0:
+		matchWindow = 15 * time.Minute
+	case matchWindow > 24*time.Hour:
+		matchWindow = 24 * time.Hour
+	}
+	out := &AlertEvidence{Matches: make([]MatchedAlert, 0, len(rows)), CapturedAt: capturedAt, CaptureStatus: CaptureMissing}
+	for i := range rows {
+		if ok, strategy := matchAlertSignalToIncident(&rows[i], inc, matchWindow, capturedAt); ok {
+			out.Matches = append(out.Matches, matchedAlertFromSignal(rows[i], strategy))
+			out.CaptureStatus = CaptureOK
+		}
+	}
+	return out
+}
+
+// matchAlertSignalToIncident reports whether sig is an alert inside
+// [inc.StartedAt-matchWindow, now] that belongs to inc, and the strategy that
+// matched: "incident_id" (metadata names inc) or "family" (env/service do not
+// conflict, error_code equals the family's, and step is empty or equal).
+func matchAlertSignalToIncident(sig *signals.Signal, inc Incident, matchWindow time.Duration, now time.Time) (bool, string) {
+	if sig == nil || sig.Type != signals.TypeAlert {
+		return false, ""
+	}
+	// Undated signals (no Timestamp and no ReceivedAt) skip the window check.
+	ts := sig.Timestamp
+	if ts.IsZero() {
+		ts = sig.ReceivedAt
+	}
+	if !ts.IsZero() && (ts.Before(inc.StartedAt.Add(-matchWindow)) || ts.After(now)) {
+		return false, ""
+	}
+	// An explicit incident_id decides the match on its own, either way.
+	if id := signalMetaString(sig.Metadata, "incident_id"); id != "" {
+		if id == inc.IncidentID {
+			return true, "incident_id"
+		}
+		return false, ""
+	}
+	envMismatch := sig.Env != "" && inc.Env != "" && sig.Env != inc.Env
+	if envMismatch || (sig.Service != "" && sig.Service != inc.ErrorFamily.Service) {
+		return false, ""
+	}
+	code := signalMetaString(sig.Metadata, "error_code")
+	step := signalMetaString(sig.Metadata, "step")
+	familyHit := code != "" && code == inc.ErrorFamily.ErrorCode
+	if familyHit && (step == "" || step == inc.ErrorFamily.Step) {
+		return true, "family"
+	}
+	return false, ""
+}
+
+// matchedAlertFromSignal projects sig into a MatchedAlert tagged with strategy.
+// MatchedAt falls back to ReceivedAt; EvidenceIDs is set only with a SignalID.
+func matchedAlertFromSignal(sig signals.Signal, strategy string) MatchedAlert {
+	alert := MatchedAlert{
+		SignalID:    sig.SignalID,
+		AlertID:     signalMetaString(sig.Metadata, "alert_id"),
+		Source:      sig.Source,
+		Severity:    string(sig.Severity),
+		Reason:      sig.Reason,
+		ProviderURL: signalMetaString(sig.Metadata, "provider_url"),
+		MatchedAt:   sig.Timestamp,
+		Strategy:    strategy,
+	}
+	if alert.MatchedAt.IsZero() {
+		alert.MatchedAt = sig.ReceivedAt
+	}
+	if sig.SignalID != "" {
+		alert.EvidenceIDs = []string{sig.SignalID}
+	}
+	return alert
+}
+
+// signalMetaString returns m[key] when that value is a string. It returns ""
+// when m is nil or empty, when key is absent, or when the value has any other
+// dynamic type.
+func signalMetaString(m map[string]any, key string) string {
+	// Reading a nil map is safe, and the two-value assertion leaves s as ""
+	// on a miss, so no explicit emptiness guard is needed.
+	s, _ := m[key].(string)
+	return s
+}
+
+// updateAlertSnapshot sets Latest to fresh and keeps the first CaptureOK capture as Opening.
+func updateAlertSnapshot(prior *AlertSnapshot, fresh *AlertEvidence) *AlertSnapshot {
+	next := &AlertSnapshot{Latest: fresh}
+	if prior != nil && prior.Opening != nil {
+		next.Opening = prior.Opening
+	} else if fresh != nil && fresh.CaptureStatus == CaptureOK {
+		next.Opening = fresh
+	}
+	return next
+}
+
+// matchRuntimeSignalToIncident reports whether sig is a runtime or healthcheck
+// signal for inc.Service whose time (Timestamp, else ReceivedAt) lies within
+// [lo, hi]. An env mismatch excludes the signal only when both envs are set;
+// signals are already env-filtered at query time, so this is a defensive
+// second guard (Critical Design Decision 1: runtime signals must carry env).
+// NOTE(review): the service check uses inc.Service, whereas alert matching
+// uses inc.ErrorFamily.Service — confirm the difference is intended.
+func matchRuntimeSignalToIncident(sig signals.Signal, inc Incident, lo, hi time.Time) bool {
+	switch sig.Type {
+	case signals.TypeRuntime, signals.TypeHealthcheck:
+		// Only these two signal types count as runtime evidence.
+	default:
+		return false
+	}
+	if sig.Service != inc.Service || (sig.Env != "" && inc.Env != "" && sig.Env != inc.Env) {
+		return false
+	}
+	ts := sig.Timestamp
+	if ts.IsZero() {
+		ts = sig.ReceivedAt
+	}
+	return !ts.Before(lo) && !ts.After(hi)
+}
+
+// runtimeSeverityRank maps a severity string to a sort rank: critical=3,
+// warning=2, info=1, and 0 for anything else.
+func runtimeSeverityRank(s string) int {
+	ascending := [...]signals.Severity{signals.SeverityInfo, signals.SeverityWarning, signals.SeverityCritical}
+	for i, sev := range ascending {
+		if sev == signals.Severity(s) {
+			return i + 1
+		}
+	}
+	return 0
+}
+
+// sortRuntimeMatches sorts m in place into a deterministic order: severity
+// rank descending (critical > warning > info > anything else), then OccurredAt
+// ascending, then SignalID. Stable ordering keeps report_hash stable across ticks.
+func sortRuntimeMatches(m []RuntimeEvidence) {
+	sort.SliceStable(m, func(i, j int) bool {
+		if ri, rj := runtimeSeverityRank(m[i].Severity), runtimeSeverityRank(m[j].Severity); ri != rj {
+			return ri > rj
+		}
+		if !m[i].OccurredAt.Equal(m[j].OccurredAt) {
+			return m[i].OccurredAt.Before(m[j].OccurredAt)
+		}
+		return m[i].SignalID < m[j].SignalID
+	})
+}
+
+// runtimeEvidenceFromSignal projects sig into a CaptureOK RuntimeEvidence row.
+// OccurredAt falls back to ReceivedAt; Metadata is a shallow copy (nil if empty).
+func runtimeEvidenceFromSignal(sig signals.Signal, capturedAt time.Time) RuntimeEvidence {
+	out := RuntimeEvidence{
+		Subtype:       signalMetaString(sig.Metadata, "subtype"),
+		Service:       sig.Service,
+		Reason:        sig.Reason,
+		Severity:      string(sig.Severity),
+		Source:        sig.Source,
+		SignalID:      sig.SignalID,
+		OccurredAt:    sig.Timestamp,
+		CapturedAt:    capturedAt,
+		CaptureStatus: CaptureOK,
+	}
+	if out.OccurredAt.IsZero() {
+		out.OccurredAt = sig.ReceivedAt
+	}
+	if n := len(sig.Metadata); n > 0 {
+		out.Metadata = make(map[string]any, n)
+		for k, v := range sig.Metadata {
+			out.Metadata[k] = v
+		}
+	}
+	return out
+}
+
+// captureRuntimeEvidence returns the sorted RuntimeEvidence rows for every
+// signal in rows that matches inc within [StartedAt-matchWindow, capturedAt],
+// the same window shape alert capture uses. matchWindow defaults to 15m when
+// non-positive and is capped at 24h; the engine passes DeployCorrelationWindow,
+// matching the classifier's 15m runtime lookback at the default config.
+func captureRuntimeEvidence(rows []signals.Signal, inc Incident, capturedAt time.Time, matchWindow time.Duration) []RuntimeEvidence {
+	switch {
+	case matchWindow <= 0:
+		matchWindow = 15 * time.Minute
+	case matchWindow > 24*time.Hour:
+		matchWindow = 24 * time.Hour
+	}
+	windowStart := inc.StartedAt.Add(-matchWindow)
+	matched := make([]RuntimeEvidence, 0)
+	for _, sig := range rows {
+		if matchRuntimeSignalToIncident(sig, inc, windowStart, capturedAt) {
+			matched = append(matched, runtimeEvidenceFromSignal(sig, capturedAt))
+		}
+	}
+	sortRuntimeMatches(matched)
+	return matched
+}
+
+// updateRuntimeSnapshot builds the next RuntimeSnapshot from the prior one and
+// the fresh (already sorted) windowed capture. Matches is exactly fresh, so
+// live surfaces (API/dashboard/report) never show signals that have aged out
+// of the query window; an empty fresh leaves Matches nil. Opening (earliest
+// OccurredAt ever seen) and Latest (most recent ever seen) are carried over
+// from prior as historical provenance and only widened by fresh. It returns
+// nil when there is neither a prior snapshot nor any fresh match.
+func updateRuntimeSnapshot(prior *RuntimeSnapshot, fresh []RuntimeEvidence) *RuntimeSnapshot {
+	if prior == nil && len(fresh) == 0 {
+		return nil
+	}
+	next := &RuntimeSnapshot{}
+	if prior != nil {
+		next.Opening, next.Latest = prior.Opening, prior.Latest
+	}
+	if len(fresh) > 0 {
+		next.Matches = fresh
+	}
+	for i := range fresh {
+		at := fresh[i].OccurredAt
+		if next.Opening == nil || at.Before(next.Opening.OccurredAt) {
+			first := fresh[i]
+			next.Opening = &first
+		}
+		if next.Latest == nil || at.After(next.Latest.OccurredAt) {
+			last := fresh[i]
+			next.Latest = &last
+		}
+	}
+	return next
+}
diff --git a/internal/incidents/capture_status_test.go b/internal/incidents/capture_status_test.go
new file mode 100644
index 0000000..a0b4369
--- /dev/null
+++ b/internal/incidents/capture_status_test.go
@@ -0,0 +1,40 @@
+package incidents
+
+import (
+	"testing"
+	"time"
+
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// Invariant (e): capture status must never misreport completeness. A nil story
+// must be missing and a story lacking its anchor partial; only a full one is OK.
+func TestPropagationCaptureStatusHonesty(t *testing.T) {
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	firstSeen := now.Add(-time.Minute)
+
+	// nil story (reader faulted) => must be missing, never OK.
+	missing := newPropagationEvidence(nil, "trace-x", nil, now)
+	if missing.CaptureStatus != CaptureMissing {
+		t.Fatalf("nil story must be CaptureMissing, got %q", missing.CaptureStatus)
+	}
+
+	// complete story (anchor + path + firstSeen) => OK.
+	full := newPropagationEvidence(&apiv2.StoryResponse{
+		Service: "checkout",
+		Anchor:  &apiv2.StoryAnchor{Step: "charge"},
+		Path:    []apiv2.StoryStep{{Name: "charge"}},
+	}, "trace-x", &firstSeen, now)
+	if full.CaptureStatus != CaptureOK {
+		t.Fatalf("complete story must be CaptureOK, got %q", full.CaptureStatus)
+	}
+
+	// missing anchor => partial, never OK.
+	partial := newPropagationEvidence(&apiv2.StoryResponse{
+		Service: "checkout",
+		Path:    []apiv2.StoryStep{{Name: "charge"}},
+	}, "trace-x", &firstSeen, now)
+	if partial.CaptureStatus != CapturePartial {
+		t.Fatalf("missing anchor must be CapturePartial, got %q", partial.CaptureStatus)
+	}
+}
diff --git a/internal/incidents/capture_test.go b/internal/incidents/capture_test.go
new file mode 100644
index 0000000..e340d66
--- /dev/null
+++ b/internal/incidents/capture_test.go
@@ -0,0 +1,336 @@
+package incidents
+
+import (
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/signals"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+func ev(tsStart time.Time, step, code string) *eventv2.Event {
+	return &eventv2.Event{TsStart: tsStart, Anchor: &eventv2.Anchor{Step: step, ErrorCode: code}}
+}
+
+func TestPickAnchorTsStart_StrictMatchEarliestWins(t *testing.T) {
+	base := time.Unix(1700000000, 0)
+	events := []*eventv2.Event{
+		ev(base.Add(5*time.Second), "charge", "DB_TIMEOUT"),
+		ev(base.Add(2*time.Second), "charge", "DB_TIMEOUT"),
+		ev(base.Add(8*time.Second), "charge", "DB_TIMEOUT"),
+	}
+	got, ok := pickAnchorTsStart(events, apiv2.ErrorFamily{Step: "charge", ErrorCode: "DB_TIMEOUT"})
+	if !ok {
+		t.Fatal("ok=false")
+	}
+	if !got.Equal(base.Add(2 * time.Second)) {
+		t.Errorf("got %v, want %v", got, base.Add(2*time.Second))
+	}
+}
+
+func TestPickAnchorTsStart_FallsBackToStepOnly(t *testing.T) {
+	base := time.Unix(1700000000, 0)
+	events := []*eventv2.Event{
+		ev(base.Add(5*time.Second), "charge", "OTHER_CODE"),
+		ev(base.Add(3*time.Second), "charge", "ANOTHER"),
+	}
+	got, ok := pickAnchorTsStart(events, apiv2.ErrorFamily{Step: "charge", ErrorCode: "DB_TIMEOUT"})
+	if !ok {
+		t.Fatal("ok=false")
+	}
+	if !got.Equal(base.Add(3 * time.Second)) {
+		t.Errorf("got %v, want %v", got, base.Add(3*time.Second))
+	}
+}
+
+func TestPickAnchorTsStart_NoMatch(t *testing.T) {
+	events := []*eventv2.Event{ev(time.Now(), "other_step", "X")}
+	_, ok := pickAnchorTsStart(events, apiv2.ErrorFamily{Step: "charge", ErrorCode: "DB_TIMEOUT"})
+	if ok {
+		t.Fatal("expected ok=false on no step match")
+	}
+}
+
+func TestUpdatePropagationSnapshot_FirstCaptureOK_OpeningEqualsLatest(t *testing.T) {
+	now := time.Now()
+	fresh := &PropagationEvidence{CapturedAt: now, CaptureStatus: CaptureOK, SampleTraceID: "t1"}
+	snap := updatePropagationSnapshot(nil, fresh)
+	if snap.Opening != fresh {
+		t.Errorf("Opening != fresh")
+	}
+	if snap.Latest != fresh {
+		t.Errorf("Latest != fresh")
+	}
+}
+
+func TestUpdatePropagationSnapshot_FirstCaptureMissing_OpeningStaysNil_LatestRecorded(t *testing.T) {
+	now := time.Now()
+	fresh := &PropagationEvidence{CapturedAt: now, CaptureStatus: CaptureMissing}
+	snap := updatePropagationSnapshot(nil, fresh)
+	if snap.Opening != nil {
+		t.Errorf("Opening should be nil on first missing capture")
+	}
+	if snap.Latest == nil || snap.Latest.CaptureStatus != CaptureMissing {
+		t.Errorf("Latest should record missing capture; got %+v", snap.Latest)
+	}
+}
+
+func TestUpdatePropagationSnapshot_PriorOpeningSet_CarriesForward(t *testing.T) {
+	earlier := time.Now().Add(-time.Minute)
+	prior := &PropagationSnapshot{
+		Opening: &PropagationEvidence{CapturedAt: earlier, CaptureStatus: CaptureOK, SampleTraceID: "t_open"},
+		Latest:  &PropagationEvidence{CapturedAt: earlier, CaptureStatus: CaptureOK, SampleTraceID: "t_open"},
+	}
+	fresh := &PropagationEvidence{CapturedAt: time.Now(), CaptureStatus: CaptureOK, SampleTraceID: "t_new"}
+	snap := updatePropagationSnapshot(prior, fresh)
+	if snap.Opening.SampleTraceID != "t_open" {
+		t.Errorf("Opening should carry forward; got %q", snap.Opening.SampleTraceID)
+	}
+	if snap.Latest.SampleTraceID != "t_new" {
+		t.Errorf("Latest should overwrite; got %q", snap.Latest.SampleTraceID)
+	}
+}
+
+func TestUpdatePropagationSnapshot_PriorOpeningNil_NewOK_OpeningSet(t *testing.T) {
+	prior := &PropagationSnapshot{
+		Latest: &PropagationEvidence{CaptureStatus: CaptureMissing},
+	}
+	fresh := &PropagationEvidence{CapturedAt: time.Now(), CaptureStatus: CaptureOK, SampleTraceID: "t_ok"}
+	snap := updatePropagationSnapshot(prior, fresh)
+	if snap.Opening == nil || snap.Opening.SampleTraceID != "t_ok" {
+		t.Errorf("Opening should be promoted on first OK capture; got %+v", snap.Opening)
+	}
+}
+
+func TestUpdateBlastSnapshot_IndependentOfPropagation(t *testing.T) {
+	now := time.Now()
+	bfresh := &BlastEvidence{AffectedRequests: 5, CapturedAt: now, CaptureStatus: CaptureOK}
+	snap := updateBlastSnapshot(nil, bfresh)
+	if snap.Opening == nil || snap.Opening.AffectedRequests != 5 {
+		t.Errorf("Blast.Opening should be set independently; got %+v", snap.Opening)
+	}
+}
+
+func TestNewBlastEvidence_MissingStatusFromReaderFault(t *testing.T) {
+	e := newBlastEvidence(apiv2.BlastRadiusResponse{}, time.Now(), CaptureMissing)
+	if e.CaptureStatus != CaptureMissing {
+		t.Errorf("CaptureStatus = %s; want missing", e.CaptureStatus)
+	}
+	if e.AffectedRequests != 0 || e.AffectedServices != 0 {
+		t.Errorf("missing capture should carry zero counts; got %+v", e)
+	}
+}
+
+func TestNewPropagationEvidence_NilStory_Missing(t *testing.T) {
+	p := newPropagationEvidence(nil, "trace_x", nil, time.Now())
+	if p.CaptureStatus != CaptureMissing {
+		t.Errorf("CaptureStatus = %s; want missing", p.CaptureStatus)
+	}
+	if p.SampleTraceID != "trace_x" {
+		t.Errorf("SampleTraceID lost; got %q", p.SampleTraceID)
+	}
+}
+
+func TestNewPropagationEvidence_StoryWithoutAnchor_Partial(t *testing.T) {
+	story := &apiv2.StoryResponse{Service: "payment-service", Path: []apiv2.StoryStep{{Name: "charge", Status: "error"}}}
+	ts := time.Now()
+	p := newPropagationEvidence(story, "tx", &ts, time.Now())
+	if p.CaptureStatus != CapturePartial {
+		t.Errorf("CaptureStatus = %s; want partial (no anchor)", p.CaptureStatus)
+	}
+}
+
+func TestNewPropagationEvidence_StoryOK_FirstSeenNil_Partial(t *testing.T) {
+	story := &apiv2.StoryResponse{
+		Service: "payment-service",
+		Anchor:  &apiv2.StoryAnchor{Step: "charge"},
+		Path:    []apiv2.StoryStep{{Name: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}},
+	}
+	p := newPropagationEvidence(story, "tx", nil, time.Now())
+	if p.CaptureStatus != CapturePartial {
+		t.Errorf("CaptureStatus = %s; want partial (FirstSeenAt nil)", p.CaptureStatus)
+	}
+	if len(p.Path) != 1 || p.Path[0].Step != "charge" {
+		t.Errorf("Path lost: %+v", p.Path)
+	}
+}
+
+func TestCaptureAlertEvidence_FamilyMatchOK(t *testing.T) {
+	now := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC)
+	inc := Incident{
+		IncidentID:  "inc_1",
+		Env:         "demo",
+		ErrorFamily: apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		StartedAt:   now.Add(-time.Minute),
+	}
+	rows := []signals.Signal{{
+		SignalID:  "sig_1",
+		Type:      signals.TypeAlert,
+		Source:    "alertmanager",
+		Service:   "checkout",
+		Env:       "demo",
+		Severity:  signals.SeverityCritical,
+		Reason:    "PMT_502 spike",
+		Timestamp: now.Add(-30 * time.Second),
+		Metadata: map[string]any{
+			"alert_id":     "CheckoutPaymentFailure",
+			"error_code":   "PMT_502",
+			"step":         "payment.charge",
+			"provider_url": "https://alerts.example/inc",
+		},
+	}}
+
+	got := captureAlertEvidenceFromSignals(rows, inc, now, 15*time.Minute)
+	if got.CaptureStatus != CaptureOK {
+		t.Fatalf("CaptureStatus = %s, want ok", got.CaptureStatus)
+	}
+	if len(got.Matches) != 1 {
+		t.Fatalf("Matches len = %d, want 1", len(got.Matches))
+	}
+	m := got.Matches[0]
+	if m.SignalID != "sig_1" || m.AlertID != "CheckoutPaymentFailure" || m.Strategy != "family" {
+		t.Fatalf("match = %+v", m)
+	}
+	if len(m.EvidenceIDs) != 1 || m.EvidenceIDs[0] != "sig_1" {
+		t.Fatalf("EvidenceIDs = %+v", m.EvidenceIDs)
+	}
+}
+
+func TestCaptureAlertEvidence_MatchesOlderIncidentAlert(t *testing.T) {
+	now := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC)
+	inc := Incident{
+		IncidentID:  "inc_1",
+		Env:         "demo",
+		ErrorFamily: apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		StartedAt:   now.Add(-45 * time.Minute),
+	}
+	rows := []signals.Signal{{
+		SignalID:  "sig_old",
+		Type:      signals.TypeAlert,
+		Source:    "alertmanager",
+		Service:   "checkout",
+		Env:       "demo",
+		Severity:  signals.SeverityCritical,
+		Reason:    "PMT_502 spike",
+		Timestamp: now.Add(-40 * time.Minute),
+		Metadata:  map[string]any{"error_code": "PMT_502", "step": "payment.charge"},
+	}}
+
+	got := captureAlertEvidenceFromSignals(rows, inc, now, 15*time.Minute)
+	if got.CaptureStatus != CaptureOK {
+		t.Fatalf("CaptureStatus = %s, want ok", got.CaptureStatus)
+	}
+	if len(got.Matches) != 1 || got.Matches[0].SignalID != "sig_old" {
+		t.Fatalf("Matches = %+v", got.Matches)
+	}
+}
+
+func TestCaptureAlertEvidence_NoMatchMissing(t *testing.T) {
+	now := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC)
+	inc := Incident{
+		IncidentID:  "inc_1",
+		Env:         "demo",
+		ErrorFamily: apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		StartedAt:   now.Add(-time.Minute),
+	}
+	rows := []signals.Signal{{
+		SignalID:  "sig_other",
+		Type:      signals.TypeAlert,
+		Source:    "alertmanager",
+		Service:   "checkout",
+		Env:       "demo",
+		Severity:  signals.SeverityCritical,
+		Reason:    "other",
+		Timestamp: now.Add(-30 * time.Second),
+		Metadata:  map[string]any{"error_code": "OTHER"},
+	}}
+
+	got := captureAlertEvidenceFromSignals(rows, inc, now, 15*time.Minute)
+	if got.CaptureStatus != CaptureMissing {
+		t.Fatalf("CaptureStatus = %s, want missing", got.CaptureStatus)
+	}
+	if len(got.Matches) != 0 {
+		t.Fatalf("Matches = %+v, want none", got.Matches)
+	}
+}
+
+func TestUpdateAlertSnapshot_FirstOKSetsOpening(t *testing.T) {
+	fresh := &AlertEvidence{
+		Matches:       []MatchedAlert{{SignalID: "sig_1"}},
+		CapturedAt:    time.Now(),
+		CaptureStatus: CaptureOK,
+	}
+	snap := updateAlertSnapshot(nil, fresh)
+	if snap.Opening != fresh || snap.Latest != fresh {
+		t.Fatalf("snapshot = %+v, want opening/latest fresh", snap)
+	}
+}
+
+func TestCaptureRuntimeEvidence_SortsAndMatchesBothKinds(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	inc := Incident{Service: "payment", Env: "demo", StartedAt: now}
+	rows := []signals.Signal{
+		{SignalID: "panic", Type: signals.TypeRuntime, Service: "payment", Env: "demo",
+			Severity: signals.SeverityWarning, Reason: "panic", Source: "go-sdk",
+			Timestamp: now.Add(-time.Minute), Metadata: map[string]any{"subtype": "panic"}},
+		{SignalID: "oom", Type: signals.TypeRuntime, Service: "payment", Env: "demo",
+			Severity: signals.SeverityCritical, Reason: "OOMKilled", Source: "k8s-demo",
+			Timestamp: now.Add(-2 * time.Minute), Metadata: map[string]any{"subtype": "oom_killed"}},
+		// foreign service (same env as the incident) should be excluded
+		{SignalID: "other", Type: signals.TypeRuntime, Service: "checkout", Env: "demo",
+			Severity: signals.SeverityCritical, Timestamp: now.Add(-time.Minute)},
+	}
+	got := captureRuntimeEvidence(rows, inc, now, 15*time.Minute)
+	if len(got) != 2 {
+		t.Fatalf("want 2 matches, got %d: %+v", len(got), got)
+	}
+	// critical (oom) sorts before warning (panic)
+	if got[0].SignalID != "oom" || got[1].SignalID != "panic" {
+		t.Fatalf("sort wrong: %+v", got)
+	}
+}
+
+func TestUpdateRuntimeSnapshot_FreshMatchesPreservedOpening(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	infra := RuntimeEvidence{SignalID: "oom", Subtype: "oom_killed", Severity: "critical",
+		OccurredAt: now.Add(-3 * time.Minute), Source: "k8s-demo"}
+	prior := updateRuntimeSnapshot(nil, []RuntimeEvidence{infra})
+	app := RuntimeEvidence{SignalID: "panic", Subtype: "panic", Severity: "warning",
+		OccurredAt: now.Add(-time.Minute), Source: "go-sdk"}
+	// fresh capture has only the app match (infra aged out of the query window).
+	// Matches is bounded to the fresh set (no unbounded union), but Opening still
+	// points at the earliest event ever observed.
+	out := updateRuntimeSnapshot(prior, []RuntimeEvidence{app})
+	if len(out.Matches) != 1 || out.Matches[0].SignalID != "panic" {
+		t.Fatalf("Matches should be the fresh set [panic], got %+v", out.Matches)
+	}
+	if out.Opening == nil || out.Opening.SignalID != "oom" {
+		t.Fatalf("Opening should be the preserved earliest (oom): %+v", out.Opening)
+	}
+	if out.Latest == nil || out.Latest.SignalID != "panic" {
+		t.Fatalf("Latest should be most recent (panic): %+v", out.Latest)
+	}
+}
+
+func TestUpdateRuntimeSnapshot_EmptyFreshClearsMatchesKeepsProvenance(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	infra := RuntimeEvidence{SignalID: "oom", Subtype: "oom_killed", Severity: "critical",
+		OccurredAt: now.Add(-time.Minute), Source: "k8s-demo"}
+	prior := updateRuntimeSnapshot(nil, []RuntimeEvidence{infra})
+	// A tick with no matches must not leak the stale match into Matches, but
+	// keeps Opening/Latest as historical provenance.
+	got := updateRuntimeSnapshot(prior, nil)
+	if got == nil || len(got.Matches) != 0 {
+		t.Fatalf("empty fresh should clear Matches, got %+v", got)
+	}
+	if got.Opening == nil || got.Opening.SignalID != "oom" {
+		t.Fatalf("Opening provenance lost: %+v", got.Opening)
+	}
+	if got.Latest == nil || got.Latest.SignalID != "oom" {
+		t.Fatalf("Latest provenance lost: %+v", got.Latest)
+	}
+	if got := updateRuntimeSnapshot(nil, nil); got != nil {
+		t.Fatalf("no prior + empty fresh should stay nil, got %+v", got)
+	}
+}
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
index 6d57e82..b479af7 100644
--- a/internal/incidents/classifier.go
+++ b/internal/incidents/classifier.go
@@ -28,59 +28,184 @@ type Classification struct {
 
 func Classify(input ClassificationInput) Classification {
 	evidence := collectTraceEvidence(input.Events)
-	evidence = append(evidence, matchingAlertEvidence(input)...)
+	alerts := matchingAlerts(input)
+	for _, sig := range alerts {
+		evidence = append(evidence, signalEvidence(sig, alertLabel(sig)))
+	}
+	// Runtime signals are correlated evidence, not incident openers (Design
+	// Decision 2). Attach every matched runtime signal — infra AND app — as an
+	// evidence row regardless of the cause the classifier ultimately picks.
+	runtimeSigs := matchingRuntimeSignals(input)
+	for i := range runtimeSigs {
+		evidence = append(evidence, runtimeEvidence(runtimeSigs[i]))
+	}
 	warnings := instrumentationWarnings(input.Events, input.Signals)
+	ctx := NextCheckContext{
+		Service:                 input.Incident.Service,
+		ErrorCode:               input.Incident.ErrorFamily.ErrorCode,
+		Step:                    input.Incident.ErrorFamily.Step,
+		SampleTraceID:           sampleTraceID(input),
+		MissingServiceVersion:   containsString(warnings, "missing_service_version"),
+		MissingDependencySignal: containsString(warnings, "missing_dependency_signal"),
+		HasPartialTrace:         containsString(warnings, "partial_trace"),
+	}
+	if top := pickTopAlert(alerts); top != nil {
+		ctx.AlertSignalID = top.SignalID
+		ctx.AlertID = stringField(top.Metadata, "alert_id")
+		ctx.AlertSource = top.Source
+		ctx.AlertProviderURL = stringField(top.Metadata, "provider_url")
+	}
 
-	if dep := matchingDependencySignal(input); dep != nil {
-		evidence = append(evidence, signalEvidence(*dep, "Dependency signal overlaps first failing downstream"))
-		return classification(CauseDependency, ConfidenceHigh, evidence, warnings)
+	depSig := matchingDependencySignal(input)
+	downstream := firstFailingDownstream(input.Events)
+	deployment := matchingDeployment(input)
+	var deploySig *signals.Signal
+	if deployment == nil {
+		deploySig = matchingSignal(input, signals.TypeDeploy)
 	}
-	if downstream := firstFailingDownstream(input.Events); downstream != "" {
+	hasDeploy := deployment != nil || deploySig != nil
+
+	if depSig != nil {
+		ctx.Downstream = depSig.Service
+		ctx.DepSignalID = depSig.SignalID
+		ctx.DepSignalReason = depSig.Reason
+		evidence = append(evidence, signalEvidence(*depSig, dependencyLabel(*depSig)))
+	} else if downstream != "" {
+		ctx.Downstream = downstream
 		evidence = append(evidence, Evidence{
 			Kind:       EvidenceTrace,
-			Title:      "First failing step calls downstream service",
+			Title:      fmt.Sprintf("First failing step calls `%s`", downstream),
 			Detail:     downstream,
 			Service:    downstream,
 			OccurredAt: input.Incident.StartedAt,
 		})
-		return classification(CauseDependency, ConfidenceMedium, evidence, warnings)
 	}
-	if dep := matchingDeployment(input); dep != nil {
-		evidence = append(evidence, deploymentEvidence(*dep))
-		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+
+	// deployAt anchors the temporal tiebreak; a deploy signal without a
+	// timestamp leaves it zero (and therefore loses any tiebreak).
+	var deployAt time.Time
+	switch {
+	case deployment != nil:
+		deployAt = deployment.FirstSeen
+		ctx.DeployVersion = deployment.Version
+		ctx.DeployFirstSeen = deployment.FirstSeen
+		evidence = append(evidence, deploymentEvidence(*deployment))
+	case deploySig != nil:
+		ctx.DeployVersion = stringField(deploySig.Metadata, "version")
+		ctx.DeploySignalID = deploySig.SignalID
+		if !deploySig.Timestamp.IsZero() {
+			ctx.DeployFirstSeen = deploySig.Timestamp
+			deployAt = deploySig.Timestamp
+		}
+		evidence = append(evidence, signalEvidence(*deploySig, deployLabel(*deploySig)))
 	}
-	if sig := matchingSignal(input, signals.TypeDeploy); sig != nil {
-		evidence = append(evidence, signalEvidence(*sig, "Deploy signal overlaps incident window"))
-		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+
+	switch {
+	case depSig != nil && hasDeploy:
+		// Both causes are signal-backed: a cause anchored at/before onset beats
+		// one that lands after it (a change after the incident started cannot
+		// have caused it); among causes on the same side of onset, temporal
+		// proximity decides and an exact tie keeps the dependency (ADR 0001).
+		// Both evidence rows are already attached; the loser surfaces in next
+		// checks.
+		if deployBeatsDependency(deployAt, depSig.Timestamp, input.Incident.StartedAt) {
+			return classification(CauseDeploy, ConfidenceHigh, evidence, warnings, ctx)
+		}
+		return classification(CauseDependency, ConfidenceHigh, evidence, warnings, ctx)
+	case depSig != nil:
+		return classification(CauseDependency, ConfidenceHigh, evidence, warnings, ctx)
+	case hasDeploy:
+		// A correlated deploy beats the unconfirmed trace-only downstream
+		// inference (ADR 0001); the downstream stays as evidence + next check.
+		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings, ctx)
+	case downstream != "":
+		return classification(CauseDependency, ConfidenceMedium, evidence, warnings, ctx)
 	}
-	if sig := matchingRuntimeSignal(input); sig != nil {
-		evidence = append(evidence, signalEvidence(*sig, "Runtime signal overlaps incident window"))
-		return classification(CauseRuntime, ConfidenceHigh, evidence, warnings)
+	if len(runtimeSigs) > 0 {
+		top := runtimeSigs[0]
+		ctx.RuntimeSignalID = top.SignalID
+		ctx.RuntimeReason = top.Reason
+		ctx.RuntimeSubtype = stringField(top.Metadata, "subtype")
+		return classification(CauseRuntime, ConfidenceHigh, evidence, warnings, ctx)
 	}
 	if len(input.Events) > 0 && input.Incident.ErrorFamily.Step != "" && firstFailingDownstream(input.Events) == "" {
-		return classification(CauseApp, ConfidenceMedium, evidence, warnings)
+		return classification(CauseApp, ConfidenceMedium, evidence, warnings, ctx)
+	}
+	return classification(CauseUnknown, ConfidenceLow, evidence, warnings, ctx)
+}
+
+func sampleTraceID(input ClassificationInput) string {
+	if len(input.Incident.SampleTraces) > 0 && input.Incident.SampleTraces[0] != "" {
+		return input.Incident.SampleTraces[0]
+	}
+	for _, ev := range input.Events {
+		if ev != nil && ev.TraceID != "" {
+			return ev.TraceID
+		}
+	}
+	return ""
+}
+
+// deployBeatsDependency decides, when both a deploy and a dependency signal
+// correlate with the incident, whether the deploy is the better cause. A
+// non-zero anchor at/before onset is preferred over one strictly after onset
+// (a change that lands after the incident started cannot have caused it); when
+// both anchors are on the same side of onset (or either is missing), the one
+// closer to onset wins, and an exact tie keeps the dependency.
+func deployBeatsDependency(deployAt, depAt, onset time.Time) bool {
+	if !deployAt.IsZero() && !depAt.IsZero() {
+		deployPrecedes := !deployAt.After(onset)
+		depPrecedes := !depAt.After(onset)
+		if deployPrecedes != depPrecedes {
+			return deployPrecedes
+		}
+	}
+	return closerToOnset(deployAt, depAt, onset)
+}
+
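+// closerToOnset reports whether a is strictly closer to the incident onset than b.
+// A zero time loses: it means "no timestamp", never "at epoch" (no special case is needed: for any realistic onset, Time.Sub saturates at the maximum Duration, so a zero time's distance is never the smaller one).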
+func closerToOnset(a, b, onset time.Time) bool {
+	return absDuration(onset.Sub(a)) < absDuration(onset.Sub(b))
+}
+
+func absDuration(d time.Duration) time.Duration {
+	if d < 0 {
+		return -d
 	}
-	return classification(CauseUnknown, ConfidenceLow, evidence, warnings)
+	return d
 }
 
-func classification(cause Cause, confidence Confidence, evidence []Evidence, warnings []string) Classification {
+func containsString(haystack []string, needle string) bool {
+	for _, s := range haystack {
+		if s == needle {
+			return true
+		}
+	}
+	return false
+}
+
+func classification(cause Cause, confidence Confidence, evidence []Evidence, warnings []string, ctx NextCheckContext) Classification {
 	return Classification{
 		Cause:                   cause,
 		Confidence:              confidence,
-		Evidence:                normalizeEvidence(evidence, 8),
-		NextChecks:              NextChecks(cause, confidence),
+		Evidence:                normalizeEvidence(evidence, 12),
+		NextChecks:              NextChecks(cause, confidence, ctx),
 		InstrumentationWarnings: uniqueStrings(warnings),
 	}
 }
 
 func matchingDependencySignal(input ClassificationInput) *signals.Signal {
 	downstream := firstFailingDownstream(input.Events)
+	if downstream == "" {
+		return nil
+	}
 	for i := range input.Signals {
 		sig := input.Signals[i]
 		if sig.Type != signals.TypeDependency {
 			continue
 		}
-		if downstream != "" && sig.Service != downstream {
+		if sig.Service != downstream {
 			continue
 		}
 		return &input.Signals[i]
@@ -106,24 +231,36 @@ func matchingDeployment(input ClassificationInput) *Deployment {
 	return nil
 }
 
-func matchingRuntimeSignal(input ClassificationInput) *signals.Signal {
+// matchingRuntimeSignals returns every runtime/healthcheck signal matching the
+// incident's service and env within [StartedAt-15m, hi], sorted by severity
+// (highest first), then timestamp (earliest first), then signal_id. hi is Now, falling back to Incident.UpdatedAt when Now is zero and to StartedAt+1m when that bound precedes StartedAt.
+// Returning all matches (not the first) lets both infra (oom_killed) and app (panic) evidence coexist.
+func matchingRuntimeSignals(input ClassificationInput) []signals.Signal {
 	start := input.Incident.StartedAt
-	lo := start.Add(-5 * time.Minute)
-	hi := start.Add(time.Minute)
+	lo := start.Add(-15 * time.Minute)
+	hi := input.Now
+	if hi.IsZero() {
+		hi = input.Incident.UpdatedAt
+	}
+	if hi.Before(start) {
+		hi = start.Add(time.Minute)
+	}
+	var out []signals.Signal
 	for i := range input.Signals {
-		sig := input.Signals[i]
-		if sig.Type != signals.TypeRuntime && sig.Type != signals.TypeHealthcheck {
-			continue
+		if matchRuntimeSignalToIncident(input.Signals[i], input.Incident, lo, hi) {
+			out = append(out, input.Signals[i])
 		}
-		if sig.Service != input.Incident.Service {
-			continue
+	}
+	sort.SliceStable(out, func(i, j int) bool {
+		if ri, rj := severityRank(out[i].Severity), severityRank(out[j].Severity); ri != rj {
+			return ri > rj
 		}
-		if sig.Timestamp.Before(lo) || sig.Timestamp.After(hi) {
-			continue
+		if !out[i].Timestamp.Equal(out[j].Timestamp) {
+			return out[i].Timestamp.Before(out[j].Timestamp)
 		}
-		return &input.Signals[i]
-	}
-	return nil
+		return out[i].SignalID < out[j].SignalID
+	})
+	return out
 }
 
 func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal {
@@ -143,14 +280,14 @@ func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal
 	return nil
 }
 
-func matchingAlertEvidence(input ClassificationInput) []Evidence {
+func matchingAlerts(input ClassificationInput) []signals.Signal {
 	start := input.Incident.StartedAt
 	lo := start.Add(-15 * time.Minute)
 	hi := input.Now
 	if hi.IsZero() {
 		hi = input.Incident.UpdatedAt
 	}
-	out := []Evidence{}
+	var out []signals.Signal
 	for _, sig := range input.Signals {
 		if sig.Type != signals.TypeAlert {
 			continue
@@ -164,11 +301,40 @@ func matchingAlertEvidence(input ClassificationInput) []Evidence {
 		if sig.Timestamp.Before(lo) || sig.Timestamp.After(hi) {
 			continue
 		}
-		out = append(out, signalEvidence(sig, "External alert overlaps incident window"))
+		out = append(out, sig)
 	}
 	return out
 }
 
+func pickTopAlert(alerts []signals.Signal) *signals.Signal {
+	if len(alerts) == 0 {
+		return nil
+	}
+	best := &alerts[0]
+	bestRank := severityRank(best.Severity)
+	for i := 1; i < len(alerts); i++ {
+		cand := &alerts[i]
+		rank := severityRank(cand.Severity)
+		if rank > bestRank || (rank == bestRank && cand.Timestamp.After(best.Timestamp)) {
+			best = cand
+			bestRank = rank
+		}
+	}
+	return best
+}
+
+func severityRank(s signals.Severity) int {
+	switch s {
+	case signals.SeverityCritical:
+		return 3
+	case signals.SeverityWarning:
+		return 2
+	case signals.SeverityInfo:
+		return 1
+	}
+	return 0
+}
+
 func collectTraceEvidence(events []*eventv2.Event) []Evidence {
 	out := make([]Evidence, 0, 2)
 	for _, ev := range events {
@@ -191,7 +357,7 @@ func collectTraceEvidence(events []*eventv2.Event) []Evidence {
 func deploymentEvidence(dep Deployment) Evidence {
 	return Evidence{
 		Kind:       EvidenceDeployment,
-		Title:      "Deployment overlaps incident window",
+		Title:      deploymentLabel(dep),
 		Detail:     dep.Version,
 		Service:    dep.Service,
 		DeployID:   dep.ID,
@@ -199,6 +365,99 @@ func deploymentEvidence(dep Deployment) Evidence {
 	}
 }
 
+func alertLabel(sig signals.Signal) string {
+	sev := string(sig.Severity)
+	if sev == "" {
+		sev = "info"
+	}
+	reason := sig.Reason
+	if reason == "" {
+		reason = "external alert"
+	}
+	source := sig.Source
+	if source == "" {
+		source = "alert"
+	}
+	return fmt.Sprintf("%s: %s (%s)", sev, reason, source)
+}
+
+func dependencyLabel(sig signals.Signal) string {
+	service := sig.Service
+	if service == "" {
+		service = "downstream"
+	}
+	reason := sig.Reason
+	if reason == "" {
+		reason = "dependency signal"
+	}
+	return fmt.Sprintf("Dependency %s: %s", service, reason)
+}
+
+func deployLabel(sig signals.Signal) string {
+	service := sig.Service
+	if service == "" {
+		service = "service"
+	}
+	detail := stringField(sig.Metadata, "version")
+	if detail == "" {
+		detail = sig.Reason
+	}
+	if detail == "" {
+		detail = "deploy event"
+	}
+	return fmt.Sprintf("Deploy %s: %s", service, detail)
+}
+
+func deploymentLabel(dep Deployment) string {
+	service := dep.Service
+	if service == "" {
+		service = "service"
+	}
+	version := dep.Version
+	if version == "" {
+		version = "new revision"
+	}
+	return fmt.Sprintf("Deploy %s: %s", service, version)
+}
+
+func runtimeLabel(sig signals.Signal) string {
+	service := sig.Service
+	if service == "" {
+		service = "service"
+	}
+	reason := sig.Reason
+	if reason == "" {
+		reason = "runtime event"
+	}
+	if sig.Source != "" {
+		return fmt.Sprintf("Runtime %s: %s (%s)", service, reason, sig.Source)
+	}
+	return fmt.Sprintf("Runtime %s: %s", service, reason)
+}
+
+// runtimeEvidence builds a flat EvidenceRuntime row. subtype/source/severity
+// live in Fields so the generic Evidence struct stays unchanged while
+// acceptance assertions can query subtypes (Design Decision 4).
+func runtimeEvidence(sig signals.Signal) Evidence {
+	occurred := sig.Timestamp
+	if occurred.IsZero() {
+		occurred = sig.ReceivedAt
+	}
+	return Evidence{
+		Kind:       EvidenceRuntime,
+		Title:      runtimeLabel(sig),
+		Detail:     sig.Reason,
+		Service:    sig.Service,
+		SignalID:   sig.SignalID,
+		OccurredAt: occurred,
+		Fields: map[string]any{
+			"subtype":  stringField(sig.Metadata, "subtype"),
+			"source":   sig.Source,
+			"severity": string(sig.Severity),
+		},
+	}
+}
+
 func signalEvidence(sig signals.Signal, title string) Evidence {
 	fields := map[string]any{
 		"type":     string(sig.Type),
@@ -233,16 +492,38 @@ func normalizeEvidence(evidence []Evidence, limit int) []Evidence {
 		return evidence[i].Title < evidence[j].Title
 	})
 	seen := map[string]struct{}{}
-	out := make([]Evidence, 0, len(evidence))
+	deduped := make([]Evidence, 0, len(evidence))
 	for _, ev := range evidence {
 		key := string(ev.Kind) + "|" + ev.Title + "|" + ev.SignalID + "|" + ev.DeployID + "|" + ev.TraceID
 		if _, ok := seen[key]; ok {
 			continue
 		}
 		seen[key] = struct{}{}
-		out = append(out, ev)
-		if limit > 0 && len(out) == limit {
-			break
+		deduped = append(deduped, ev)
+	}
+	if limit <= 0 || len(deduped) <= limit {
+		return deduped
+	}
+	// Runtime evidence must never be truncated by the cap: the acceptance gate
+	// and the dashboard Runtime panel both rely on every matched runtime signal
+	// being present. Keep all EvidenceRuntime rows (so the result exceeds limit only when they alone outnumber it), then fill any remaining slots
+	// with the earliest non-runtime rows. Chronological order is preserved
+	// because deduped is already sorted and we append in order.
+	budget := limit
+	for _, ev := range deduped {
+		if ev.Kind == EvidenceRuntime {
+			budget--
+		}
+	}
+	out := make([]Evidence, 0, limit)
+	for _, ev := range deduped {
+		if ev.Kind == EvidenceRuntime {
+			out = append(out, ev)
+			continue
+		}
+		if budget > 0 {
+			out = append(out, ev)
+			budget--
 		}
 	}
 	return out
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
index 10ae22c..bb86c1b 100644
--- a/internal/incidents/classifier_test.go
+++ b/internal/incidents/classifier_test.go
@@ -1,6 +1,7 @@
 package incidents
 
 import (
+	"strings"
 	"testing"
 	"time"
 
@@ -116,6 +117,44 @@ func TestClassifierRuntime(t *testing.T) {
 		}
 	})
 
+	t.Run("infra and app runtime both present, deterministic order", func(t *testing.T) {
+		infra := signals.Signal{
+			SignalID: "sig_oom", Type: signals.TypeRuntime, Service: "checkout", Env: "prod",
+			Reason: "OOMKilled", Severity: signals.SeverityCritical, Timestamp: now.Add(-2 * time.Minute),
+			Source: "k8s-demo", Metadata: map[string]any{"subtype": "oom_killed"},
+		}
+		app := signals.Signal{
+			SignalID: "sig_panic", Type: signals.TypeRuntime, Service: "checkout", Env: "prod",
+			Reason: "runtime panic", Severity: signals.SeverityWarning, Timestamp: now.Add(-time.Minute),
+			Source: "go-sdk", Metadata: map[string]any{"subtype": "panic"},
+		}
+		// Provide app first to prove sort (not input order) drives the result.
+		got := Classify(ClassificationInput{
+			Incident: base, Now: now,
+			Events:  []*eventv2.Event{checkoutEvent},
+			Signals: []signals.Signal{app, infra},
+		})
+		var runtimeRows []Evidence
+		for _, ev := range got.Evidence {
+			if ev.Kind == EvidenceRuntime {
+				runtimeRows = append(runtimeRows, ev)
+			}
+		}
+		if len(runtimeRows) != 2 {
+			t.Fatalf("want 2 runtime evidence rows, got %d: %+v", len(runtimeRows), got.Evidence)
+		}
+		// normalizeEvidence sorts by OccurredAt asc; infra (-2m) precedes app (-1m).
+		if runtimeRows[0].SignalID != "sig_oom" || runtimeRows[1].SignalID != "sig_panic" {
+			t.Fatalf("runtime evidence order wrong: %+v", runtimeRows)
+		}
+		if st, _ := runtimeRows[0].Fields["subtype"].(string); st != "oom_killed" {
+			t.Fatalf("infra subtype wrong: %+v", runtimeRows[0].Fields)
+		}
+		if st, _ := runtimeRows[1].Fields["subtype"].(string); st != "panic" {
+			t.Fatalf("app subtype wrong: %+v", runtimeRows[1].Fields)
+		}
+	})
+
 	t.Run("alert with OOM reason does not classify runtime", func(t *testing.T) {
 		sig := runtimeSig
 		sig.Type = signals.TypeAlert
@@ -132,7 +171,7 @@ func TestClassifierRuntime(t *testing.T) {
 
 	t.Run("runtime signal outside window", func(t *testing.T) {
 		sig := runtimeSig
-		sig.Timestamp = now.Add(-6 * time.Minute)
+		sig.Timestamp = now.Add(-20 * time.Minute)
 		got := Classify(ClassificationInput{
 			Incident: base,
 			Events:   []*eventv2.Event{checkoutEvent},
@@ -196,11 +235,213 @@ func TestClassifierRuntime(t *testing.T) {
 	})
 }
 
+func TestClassifierTemporalTiebreak(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	paymentEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "payment.charge", "PMT_502", "payment")
+	depSig := func(ts time.Time) signals.Signal {
+		return signals.Signal{
+			SignalID: "sig_dep", Type: signals.TypeDependency, Service: "payment", Env: "prod",
+			Reason: "upstream_5xx", Severity: signals.SeverityCritical, Timestamp: ts,
+		}
+	}
+	deployAt := func(ts time.Time) Deployment {
+		return Deployment{ID: "dep_1", Service: "checkout", Version: "v1", Env: "prod", FirstSeen: ts}
+	}
+	hasEvidence := func(got Classification, match func(Evidence) bool) bool {
+		for _, ev := range got.Evidence {
+			if match(ev) {
+				return true
+			}
+		}
+		return false
+	}
+	hasCheck := func(got Classification, substr string) bool {
+		for _, line := range got.NextChecks {
+			if strings.Contains(line, substr) {
+				return true
+			}
+		}
+		return false
+	}
+
+	t.Run("deploy closer to onset wins over dependency signal", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{paymentEvent},
+			Signals:     []signals.Signal{depSig(now.Add(-10 * time.Minute))},
+			Deployments: []Deployment{deployAt(now.Add(-time.Minute))},
+		})
+		if got.Cause != CauseDeploy || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+		if !hasEvidence(got, func(ev Evidence) bool { return ev.DeployID == "dep_1" }) {
+			t.Fatalf("deployment evidence missing: %+v", got.Evidence)
+		}
+		if !hasEvidence(got, func(ev Evidence) bool { return ev.SignalID == "sig_dep" }) {
+			t.Fatalf("losing dependency-signal evidence must still attach: %+v", got.Evidence)
+		}
+		if !hasCheck(got, "Also verify downstream `payment`") {
+			t.Fatalf("next checks must cross-reference the losing dependency cause: %v", got.NextChecks)
+		}
+	})
+
+	t.Run("dependency signal closer to onset wins over deploy", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{paymentEvent},
+			Signals:     []signals.Signal{depSig(now.Add(-time.Minute))},
+			Deployments: []Deployment{deployAt(now.Add(-10 * time.Minute))},
+		})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+		if !hasEvidence(got, func(ev Evidence) bool { return ev.DeployID == "dep_1" }) {
+			t.Fatalf("losing deployment evidence must still attach: %+v", got.Evidence)
+		}
+		if !hasCheck(got, "Also verify recent deploy `v1`") {
+			t.Fatalf("next checks must cross-reference the losing deploy cause: %v", got.NextChecks)
+		}
+	})
+
+	t.Run("equal distance keeps dependency priority", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{paymentEvent},
+			Signals:     []signals.Signal{depSig(now.Add(-time.Minute))},
+			Deployments: []Deployment{deployAt(now.Add(-time.Minute))},
+		})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("deploy after onset does not beat a dependency signal before onset", func(t *testing.T) {
+		// A routine/rollback deploy that lands during the incident must not be
+		// blamed over a dependency signal that preceded onset, even though the
+		// deploy's absolute distance to onset is smaller.
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{paymentEvent},
+			Signals:     []signals.Signal{depSig(now.Add(-time.Minute))},
+			Deployments: []Deployment{deployAt(now.Add(30 * time.Second))},
+		})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceHigh {
+			t.Fatalf("post-onset deploy must not win over a pre-onset dependency signal: %+v", got)
+		}
+	})
+
+	t.Run("deploy beats trace-only downstream inference", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{paymentEvent},
+			Deployments: []Deployment{deployAt(now.Add(-time.Minute))},
+		})
+		if got.Cause != CauseDeploy || got.Confidence != ConfidenceHigh {
+			t.Fatalf("trace-only downstream must not beat a correlated deploy: %+v", got)
+		}
+		if !hasEvidence(got, func(ev Evidence) bool {
+			return ev.Kind == EvidenceTrace && strings.Contains(ev.Title, "First failing step calls `payment`")
+		}) {
+			t.Fatalf("downstream trace evidence must still attach: %+v", got.Evidence)
+		}
+		if !hasCheck(got, "Also verify downstream `payment`") {
+			t.Fatalf("next checks must cross-reference the downstream: %v", got.NextChecks)
+		}
+	})
+}
+
+func TestUnrelatedDependencySignalDoesNotAttachToLeafIncident(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "db", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	// Leaf-service event (no downstream call) — e.g. DB returning CART_NOT_FOUND.
+	leafEvent := testIncidentEvent("e1", "trace-a", now, "db", "cart.lookup", "CART_NOT_FOUND", "")
+	// An unrelated dependency signal that happens to be in the signal store for a different service.
+	unrelated := signals.Signal{
+		SignalID:  "sig_dep_unrelated",
+		Type:      signals.TypeDependency,
+		Service:   "payment",
+		Env:       "prod",
+		Reason:    "payment_gateway_5xx",
+		Severity:  signals.SeverityCritical,
+		Timestamp: now.Add(-time.Minute),
+	}
+
+	got := Classify(ClassificationInput{
+		Incident: base,
+		Events:   []*eventv2.Event{leafEvent},
+		Signals:  []signals.Signal{unrelated},
+	})
+
+	if got.Cause == CauseDependency {
+		t.Fatalf("leaf-service incident with unrelated dep signal should not classify as dependency: %+v", got)
+	}
+	for _, ev := range got.Evidence {
+		if ev.SignalID == "sig_dep_unrelated" {
+			t.Fatalf("unrelated dependency signal should not appear in evidence: %+v", ev)
+		}
+	}
+	for _, line := range got.NextChecks {
+		if strings.Contains(line, "`payment`") {
+			t.Fatalf("next checks should not reference unrelated `payment`: %q", line)
+		}
+	}
+}
+
 func TestNextChecksRuntime(t *testing.T) {
-	got := NextChecks(CauseRuntime, ConfidenceHigh)
+	got := NextChecks(CauseRuntime, ConfidenceHigh, NextCheckContext{
+		Service:         "payment",
+		RuntimeSignalID: "sig_rt_42",
+		RuntimeReason:   "OOMKilled",
+		RuntimeSubtype:  "oom_killed",
+	})
 	if len(got) == 0 {
 		t.Fatalf("expected non-empty next checks for runtime cause")
 	}
+	foundSignalLine := false
+	foundMemoryLine := false
+	for _, line := range got {
+		if strings.Contains(line, "runtime signal") && strings.Contains(line, "`OOMKilled`") && strings.Contains(line, "`payment`") {
+			foundSignalLine = true
+		}
+		if strings.Contains(line, "memory") && strings.Contains(line, "`payment`") {
+			foundMemoryLine = true
+		}
+	}
+	if !foundSignalLine {
+		t.Fatalf("expected runtime signal line referencing payment+OOMKilled, got %v", got)
+	}
+	if !foundMemoryLine {
+		t.Fatalf("expected memory-usage line for oom_killed subtype, got %v", got)
+	}
+}
+
+func TestNextChecksRuntimeWithoutSubtypeOmitsCategoryLine(t *testing.T) {
+	got := NextChecks(CauseRuntime, ConfidenceHigh, NextCheckContext{
+		Service:         "payment",
+		RuntimeSignalID: "sig_rt",
+		RuntimeReason:   "container restarted",
+	})
+	for _, line := range got {
+		if strings.Contains(line, "memory") || strings.Contains(line, "readiness") || strings.Contains(line, "liveness") {
+			t.Fatalf("runtime cause without subtype must not emit memory/probe lines, got %q", line)
+		}
+	}
+}
+
+func TestNextChecksFallbackWhenContextEmpty(t *testing.T) {
+	causes := []Cause{CauseDependency, CauseDeploy, CauseRuntime, CauseApp, CauseUnknown}
+	for _, cause := range causes {
+		got := NextChecks(cause, ConfidenceMedium, NextCheckContext{})
+		if len(got) == 0 {
+			t.Fatalf("%s: expected non-empty next checks", cause)
+		}
+		for _, line := range got {
+			if strings.Contains(line, "{") || strings.Contains(line, "``") {
+				t.Fatalf("%s: empty context produced unfilled placeholder in %q", cause, line)
+			}
+		}
+	}
 }
 
 func TestClassifyIncludesAlertEvidenceWithoutChangingCause(t *testing.T) {
@@ -241,7 +482,11 @@ func TestClassifyIncludesAlertEvidenceWithoutChangingCause(t *testing.T) {
 		if ev.SignalID == "sig_other_env" {
 			t.Fatalf("alert evidence from another env should not be included: %+v", ev)
 		}
-		if ev.SignalID == "sig_alert" && ev.Title == "External alert overlaps incident window" {
+		if ev.SignalID == "sig_alert" {
+			wantTitle := "critical: PMT_502 spike (grafana)"
+			if ev.Title != wantTitle {
+				t.Fatalf("alert evidence title %q, want %q", ev.Title, wantTitle)
+			}
 			if ev.Fields["alert_id"] != "alert_1" {
 				t.Fatalf("alert metadata missing: %+v", ev.Fields)
 			}
@@ -250,3 +495,37 @@ func TestClassifyIncludesAlertEvidenceWithoutChangingCause(t *testing.T) {
 	}
 	t.Fatalf("alert evidence missing: %+v", got.Evidence)
 }
+
+func TestNormalizeEvidence_RuntimeSurvivesCap(t *testing.T) {
+	base := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	var ev []Evidence
+	// 20 earlier alert rows, all occurring before the runtime events.
+	for i := 0; i < 20; i++ {
+		ev = append(ev, Evidence{
+			Kind: EvidenceSignal, Title: "alert", SignalID: "a" + string(rune('A'+i)),
+			OccurredAt: base.Add(time.Duration(i) * time.Second),
+		})
+	}
+	// Two runtime rows occurring AFTER all the alerts — would be truncated by a
+	// naive time-sorted cap.
+	ev = append(ev,
+		Evidence{Kind: EvidenceRuntime, Title: "Runtime checkout: OOMKilled", SignalID: "oom",
+			OccurredAt: base.Add(time.Hour), Fields: map[string]any{"subtype": "oom_killed"}},
+		Evidence{Kind: EvidenceRuntime, Title: "Runtime checkout: panic", SignalID: "panic",
+			OccurredAt: base.Add(2 * time.Hour), Fields: map[string]any{"subtype": "panic"}},
+	)
+	out := normalizeEvidence(ev, 12)
+	subtypes := map[string]bool{}
+	for _, e := range out {
+		if e.Kind == EvidenceRuntime {
+			st, _ := e.Fields["subtype"].(string)
+			subtypes[st] = true
+		}
+	}
+	if !subtypes["oom_killed"] || !subtypes["panic"] {
+		t.Fatalf("runtime rows truncated by cap; out kinds=%v", out)
+	}
+	if len(out) > 12 {
+		t.Fatalf("cap exceeded: %d rows", len(out))
+	}
+}
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
index 0680c23..24fd588 100644
--- a/internal/incidents/engine.go
+++ b/internal/incidents/engine.go
@@ -21,6 +21,7 @@ type Config struct {
 	Window                  time.Duration
 	MinCount                int
 	MinLift                 float64
+	MinRate                 float64 // errors/minute low-traffic guard; 0 disables
 	ResolveAfter            time.Duration
 	DeployCorrelationWindow time.Duration
 	SampleLimit             int
@@ -150,13 +151,22 @@ type derivedRow struct {
 // touching e.active or the store. Used by both live Tick and startup Rebuild.
 func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Incident, reader Reader) ([]derivedRow, error) {
 	currentStart := now.Add(-e.cfg.Window)
-	baselineStart := now.Add(-2 * e.cfg.Window)
 	statuses := failedStatuses()
 	current := reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
-	baseline := reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
-	baselineByFamily := map[string]int{}
-	for _, row := range baseline.Rows {
-		baselineByFamily[familyKey(row.ErrorFamily)] = row.Count
+	// Baseline = per-family median of the 3 prior windows (newest first); a
+	// family absent from a window counts 0. One anomalous prior window can
+	// neither suppress a spike nor fabricate lift (docs/internals.md).
+	baselineByFamily := map[string][3]int{}
+	for i := 0; i < 3; i++ {
+		until := now.Add(-time.Duration(i+1) * e.cfg.Window)
+		since := now.Add(-time.Duration(i+2) * e.cfg.Window)
+		res := reader.Errors(SearchFilter{Since: since, Until: until, Statuses: statuses}, 200)
+		for _, row := range res.Rows {
+			key := familyKey(row.ErrorFamily)
+			counts := baselineByFamily[key]
+			counts[i] = row.Count
+			baselineByFamily[key] = counts
+		}
 	}
 
 	seen := map[string]struct{}{}
@@ -165,7 +175,10 @@ func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Inci
 		if row.Count < e.cfg.MinCount {
 			continue
 		}
-		baselineCount := baselineByFamily[familyKey(row.ErrorFamily)]
+		if e.cfg.MinRate > 0 && float64(row.Count) < e.cfg.MinRate*e.cfg.Window.Minutes() {
+			continue
+		}
+		baselineCount := median3(baselineByFamily[familyKey(row.ErrorFamily)])
 		lift := computeLift(row.Count, baselineCount)
 		if baselineCount > 0 && lift < e.cfg.MinLift {
 			continue
@@ -311,6 +324,58 @@ func (e *Engine) TopActive(ctx context.Context) (*Incident, error) {
 	return &rows[0], nil
 }
 
+// safeBlastRadius wraps reader.BlastRadius with a panic recover so a reader
+// fault never propagates into the tick. ok=false means the reader call
+// faulted; callers should treat the returned response as zero-value and
+// record CaptureStatus=missing for downstream evidence.
+func safeBlastRadius(r Reader, f SearchFilter, k apiv2.BlastKey) (out apiv2.BlastRadiusResponse, ok bool) {
+	if r == nil {
+		return apiv2.BlastRadiusResponse{}, false
+	}
+	ok = true
+	defer func() {
+		if rec := recover(); rec != nil {
+			out, ok = apiv2.BlastRadiusResponse{}, false
+		}
+	}()
+	out = r.BlastRadius(f, k)
+	return out, ok
+}
+
+func safeTraceStory(r Reader, traceID string) (resp apiv2.StoryResponse, ok bool) {
+	if r == nil {
+		return apiv2.StoryResponse{}, false
+	}
+	defer func() {
+		if rec := recover(); rec != nil {
+			resp, ok = apiv2.StoryResponse{}, false
+		}
+	}()
+	return r.TraceStoryByTraceID(traceID)
+}
+
+func safeTraceEvents(r Reader, traceID string) (events []*eventv2.Event, ok bool) {
+	if r == nil {
+		return nil, false
+	}
+	defer func() {
+		if rec := recover(); rec != nil {
+			events, ok = nil, false
+		}
+	}()
+	return r.TraceEvents(traceID)
+}
+
+func filterEventsByTrace(events []*eventv2.Event, traceID string) []*eventv2.Event {
+	out := make([]*eventv2.Event, 0, 1)
+	for _, ev := range events {
+		if ev != nil && ev.TraceID == traceID {
+			out = append(out, ev)
+		}
+	}
+	return out
+}
+
 func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Incident, reader Reader, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, bool, error) {
 	events := sampleEventsFromReader(reader, row.ErrorFamily, since, now, 200)
 	startedAt := earliestEventTime(events, now)
@@ -327,11 +392,15 @@ func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Inci
 			hadExisting = true
 		}
 	}
-	blast := reader.BlastRadius(
+	blast, blastOK := safeBlastRadius(reader,
 		SearchFilter{Since: since, Until: now},
 		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
 	)
-	sigs, err := e.querySignals(ctx, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	signalSince := now.Add(-e.cfg.DeployCorrelationWindow)
+	if alertSince := startedAt.Add(-e.cfg.DeployCorrelationWindow); alertSince.Before(signalSince) {
+		signalSince = alertSince
+	}
+	sigs, err := e.querySignals(ctx, env, signalSince, now)
 	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
 		return Incident{}, false, err
 	}
@@ -362,6 +431,40 @@ func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Inci
 		inc.StartedAt = existing.StartedAt
 		inc.RecoveringAt = nil
 	}
+	if reader != nil && inc.Status != StatusResolved {
+		blastStatus := CaptureOK
+		if !blastOK {
+			blastStatus = CaptureMissing
+		}
+		inc.Blast = updateBlastSnapshot(existing.Blast, newBlastEvidence(blast, now, blastStatus))
+
+		var story *apiv2.StoryResponse
+		var firstSeenAt *time.Time
+		var sampleTraceID string
+		if blastOK && len(blast.SampleTraces) > 0 {
+			sampleTraceID = blast.SampleTraces[0]
+			if s, ok := safeTraceStory(reader, sampleTraceID); ok {
+				story = &s
+			}
+			// Prefer events already loaded for this family scan; fall back to a
+			// dedicated TraceEvents read only if the sample trace had no
+			// anchor-matching event in that scan.
+			traceEvts := filterEventsByTrace(events, sampleTraceID)
+			if len(traceEvts) == 0 {
+				if evts, ok := safeTraceEvents(reader, sampleTraceID); ok {
+					traceEvts = evts
+				}
+			}
+			if ts, ok2 := pickAnchorTsStart(traceEvts, inc.ErrorFamily); ok2 {
+				firstSeenAt = &ts
+			}
+		}
+		inc.Propagation = updatePropagationSnapshot(existing.Propagation, newPropagationEvidence(story, sampleTraceID, firstSeenAt, now))
+	}
+	if inc.Status != StatusResolved {
+		inc.Alerts = updateAlertSnapshot(existing.Alerts, captureAlertEvidenceFromSignals(sigs, inc, now, e.cfg.DeployCorrelationWindow))
+		inc.Runtime = updateRuntimeSnapshot(existing.Runtime, captureRuntimeEvidence(sigs, inc, now, e.cfg.DeployCorrelationWindow))
+	}
 	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
 	inc.Cause = class.Cause
 	inc.Confidence = class.Confidence
@@ -463,6 +566,12 @@ func computeLift(current, baseline int) float64 {
 	return float64(current) / float64(baseline)
 }
 
+func median3(c [3]int) int {
+	ordered := c[:] // c is a by-value copy of the caller's array, so sorting it in place is safe
+	sort.Ints(ordered)
+	return ordered[1]
+}
+
 func severity(count, services int, lift float64) int {
 	score := 1 + count/5 + services
 	if lift >= 10 {
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
index 6aea07f..1a3f5ac 100644
--- a/internal/incidents/engine_test.go
+++ b/internal/incidents/engine_test.go
@@ -120,7 +120,7 @@ func TestEngineUsesDownstreamDependencySignal(t *testing.T) {
 	if rows[0].Cause != CauseDependency || rows[0].Confidence != ConfidenceHigh {
 		t.Fatalf("classification = %s/%s, want dependency/high", rows[0].Cause, rows[0].Confidence)
 	}
-	if len(signalStore.filters) != 1 || signalStore.filters[0].Service != "" || signalStore.filters[0].Env != "prod" {
+	if len(signalStore.filters) < 1 || signalStore.filters[0].Service != "" || signalStore.filters[0].Env != "prod" {
 		t.Fatalf("signal filters = %+v", signalStore.filters)
 	}
 }
@@ -256,18 +256,29 @@ func TestRebuildOrchestratorUsesRebuildApply(t *testing.T) {
 }
 
 type fakeReader struct {
-	current ErrorsResult
-	base    ErrorsResult
-	blast   apiv2.BlastRadiusResponse
-	events  []*eventv2.Event
-	calls   int
+	current     ErrorsResult
+	base        ErrorsResult   // default for every baseline window
+	baseSeq     []ErrorsResult // optional per-window baselines, newest prior window first
+	blast       apiv2.BlastRadiusResponse
+	events      []*eventv2.Event
+	calls       int
+	story       apiv2.StoryResponse
+	storyOK     bool
+	traceEvts   []*eventv2.Event
+	traceEvtsOK bool
 }
 
+// Errors mirrors derive's query order: one current-window call followed by
+// three baseline-window calls (newest prior window first), repeating per tick.
 func (r *fakeReader) Errors(_ SearchFilter, _ int) ErrorsResult {
+	pos := r.calls % 4
 	r.calls++
-	if r.calls%2 == 1 {
+	if pos == 0 {
 		return r.current
 	}
+	if len(r.baseSeq) >= pos {
+		return r.baseSeq[pos-1]
+	}
 	return r.base
 }
 
@@ -281,6 +292,81 @@ func (r *fakeReader) SearchEvents(_ SearchFilter, _ int) []*eventv2.Event {
 	return r.events
 }
 
+func (r *fakeReader) TraceStoryByTraceID(_ string) (apiv2.StoryResponse, bool) {
+	return r.story, r.storyOK
+}
+
+func (r *fakeReader) TraceEvents(_ string) ([]*eventv2.Event, bool) {
+	return r.traceEvts, r.traceEvtsOK
+}
+
+func spikeReader(currentCount int, baselineCounts [3]int) *fakeReader {
+	row := func(n int) ErrorsResult {
+		if n == 0 {
+			return ErrorsResult{}
+		}
+		return ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily: testFamily(), Count: n, AffectedTraces: n, SampleTraces: []string{"trace-a"},
+		}}}
+	}
+	return &fakeReader{
+		current: row(currentCount),
+		baseSeq: []ErrorsResult{row(baselineCounts[0]), row(baselineCounts[1]), row(baselineCounts[2])},
+		blast:   apiv2.BlastRadiusResponse{AffectedRequests: currentCount, AffectedServices: 1},
+	}
+}
+
+func activeAfterTick(t *testing.T, reader *fakeReader, cfg Config) []Incident {
+	t.Helper()
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), cfg, nil, nil)
+	engine.now = func() time.Time { return time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) }
+	if err := engine.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	if err := engine.Tick(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	rows, err := engine.Active(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	return rows
+}
+
+func TestBaselineMedianResistsOneSpikyPriorWindow(t *testing.T) {
+	// One anomalous prior window (12) must not suppress a real spike: the
+	// median of [12, 0, 0] is 0, so the family is treated as a fresh spike.
+	rows := activeAfterTick(t, spikeReader(12, [3]int{12, 0, 0}), Config{MinCount: 5, MinLift: 3, SampleLimit: 2})
+	if len(rows) != 1 {
+		t.Fatalf("spiky baseline window suppressed a real incident: %+v", rows)
+	}
+	if rows[0].BaselineCount != 0 {
+		t.Fatalf("baseline must be the median (0), got %d", rows[0].BaselineCount)
+	}
+}
+
+func TestBaselineMedianSuppressesSteadyNoise(t *testing.T) {
+	// A steadily failing family (~10/window) with current 12 has lift 1.2 < 3:
+	// no incident.
+	rows := activeAfterTick(t, spikeReader(12, [3]int{10, 9, 11}), Config{MinCount: 5, MinLift: 3, SampleLimit: 2})
+	if len(rows) != 0 {
+		t.Fatalf("steady error noise must not open an incident: %+v", rows)
+	}
+}
+
+func TestMinRateGuardSuppressesLowTraffic(t *testing.T) {
+	// 6 failures in a 10m window = 0.6/min. With MIN_RATE=1 the family must
+	// not open; with the guard disabled (0) it must.
+	cfg := Config{MinCount: 5, MinLift: 3, SampleLimit: 2, Window: 10 * time.Minute, MinRate: 1}
+	if rows := activeAfterTick(t, spikeReader(6, [3]int{0, 0, 0}), cfg); len(rows) != 0 {
+		t.Fatalf("min-rate guard must suppress low-traffic family: %+v", rows)
+	}
+	cfg.MinRate = 0
+	if rows := activeAfterTick(t, spikeReader(6, [3]int{0, 0, 0}), cfg); len(rows) != 1 {
+		t.Fatalf("disabled min-rate guard must preserve current behavior: %+v", rows)
+	}
+}
+
 type fakeSignalStore struct {
 	rows    []signals.Signal
 	filters []signals.Filter
@@ -290,3 +376,88 @@ func (s *fakeSignalStore) Query(_ context.Context, f signals.Filter) ([]signals.
 	s.filters = append(s.filters, f)
 	return s.rows, nil
 }
+
+func TestEngine_PropagationOpeningSurvivesAcrossTicks(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	fam := testFamily()
+
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+			ErrorFamily:    fam,
+			Count:          6,
+			AffectedTraces: 6,
+			SampleTraces:   []string{"trace_a"},
+		}}},
+		blast: apiv2.BlastRadiusResponse{
+			AffectedRequests: 3,
+			AffectedServices: 2,
+			SampleTraces:     []string{"trace_a"},
+			TopServices:      []string{"checkout"},
+		},
+		events: []*eventv2.Event{
+			testIncidentEvent("anchor", "trace_a", now.Add(-time.Minute),
+				"checkout", fam.Step, fam.ErrorCode, fam.Service),
+		},
+		story: apiv2.StoryResponse{
+			Service: fam.Service,
+			Anchor:  &apiv2.StoryAnchor{Step: fam.Step},
+			Path:    []apiv2.StoryStep{{Name: fam.Step, Status: "error", ErrorCode: fam.ErrorCode}},
+		},
+		storyOK: true,
+		traceEvts: []*eventv2.Event{{
+			TsStart: now.Add(-90 * time.Second),
+			Anchor:  &eventv2.Anchor{Step: fam.Step, ErrorCode: fam.ErrorCode},
+		}},
+		traceEvtsOK: true,
+	}
+	store := NewMemoryStore()
+	engine := NewEngine(reader, nil, nil, store, Config{MinCount: 5, ResolveAfter: time.Minute, SampleLimit: 2}, nil, nil)
+	engine.now = func() time.Time { return now }
+	ctx := context.Background()
+	if err := engine.Bootstrap(ctx); err != nil {
+		t.Fatal(err)
+	}
+	if err := engine.Tick(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	actives, err := engine.Active(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(actives) != 1 {
+		t.Fatalf("expected 1 active incident, got %d", len(actives))
+	}
+	incID := actives[0].IncidentID
+	if actives[0].Propagation == nil || actives[0].Propagation.Opening == nil {
+		t.Fatalf("Propagation.Opening should be set after tick 1: %+v", actives[0].Propagation)
+	}
+	if actives[0].Blast == nil || actives[0].Blast.Opening == nil {
+		t.Fatalf("Blast.Opening should be set after tick 1: %+v", actives[0].Blast)
+	}
+
+	// Tick 2: blast still OK, but no sample traces -> propagation missing.
+	// Opening should carry forward through the engine merge + store persistence.
+	reader.blast.SampleTraces = nil
+	reader.storyOK = false
+	reader.traceEvtsOK = false
+	now = now.Add(30 * time.Second)
+	engine.now = func() time.Time { return now }
+	if err := engine.Tick(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := store.Get(ctx, incID)
+	if err != nil {
+		t.Fatalf("store get: %v", err)
+	}
+	if got.Propagation == nil || got.Propagation.Opening == nil {
+		t.Fatalf("Propagation.Opening lost after tick 2: %+v", got.Propagation)
+	}
+	if got.Propagation.Latest == nil || got.Propagation.Latest.CaptureStatus != CaptureMissing {
+		t.Errorf("Propagation.Latest should be missing: %+v", got.Propagation.Latest)
+	}
+	if got.Blast == nil || got.Blast.Opening == nil {
+		t.Errorf("Blast.Opening should still be set: %+v", got.Blast)
+	}
+}
diff --git a/internal/incidents/handler.go b/internal/incidents/handler.go
index d6058c2..a847c43 100644
--- a/internal/incidents/handler.go
+++ b/internal/incidents/handler.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"net/http"
 	"strings"
+	"time"
 
 	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 )
@@ -111,6 +112,161 @@ func toAPIIncident(inc Incident) apiv2.Incident {
 		Lift:                    inc.Lift,
 		BaselineCount:           inc.BaselineCount,
 		CurrentCount:            inc.CurrentCount,
+		Propagation:             toAPIPropagation(inc.Propagation),
+		Blast:                   toAPIBlast(inc.Blast),
+		Alerts:                  toAPIAlerts(inc.Alerts),
+		Runtime:                 toAPIRuntime(inc.Runtime),
+	}
+}
+
+func toAPIRuntime(s *RuntimeSnapshot) *apiv2.RuntimeSnapshot {
+	if s == nil {
+		return nil
+	}
+	matches := make([]apiv2.RuntimeEvidence, 0, len(s.Matches))
+	for i := range s.Matches {
+		matches = append(matches, toAPIRuntimeEvidenceVal(s.Matches[i]))
+	}
+	return &apiv2.RuntimeSnapshot{
+		Matches: matches,
+		Opening: toAPIRuntimeEvidence(s.Opening),
+		Latest:  toAPIRuntimeEvidence(s.Latest),
+	}
+}
+
+func toAPIRuntimeEvidence(r *RuntimeEvidence) *apiv2.RuntimeEvidence {
+	if r == nil {
+		return nil
+	}
+	out := toAPIRuntimeEvidenceVal(*r)
+	return &out
+}
+
+func toAPIRuntimeEvidenceVal(r RuntimeEvidence) apiv2.RuntimeEvidence {
+	var meta map[string]any
+	if len(r.Metadata) > 0 {
+		meta = make(map[string]any, len(r.Metadata))
+		for k, v := range r.Metadata {
+			meta[k] = v
+		}
+	}
+	return apiv2.RuntimeEvidence{
+		Subtype:       r.Subtype,
+		Service:       r.Service,
+		Reason:        r.Reason,
+		Severity:      r.Severity,
+		Source:        r.Source,
+		SignalID:      r.SignalID,
+		OccurredAt:    r.OccurredAt,
+		Metadata:      meta,
+		CapturedAt:    r.CapturedAt,
+		CaptureStatus: string(r.CaptureStatus),
+	}
+}
+
+func toAPIPropagation(s *PropagationSnapshot) *apiv2.PropagationSnapshot {
+	if s == nil {
+		return nil
+	}
+	return &apiv2.PropagationSnapshot{
+		Opening: toAPIPropagationEvidence(s.Opening),
+		Latest:  toAPIPropagationEvidence(s.Latest),
+	}
+}
+
+func toAPIPropagationEvidence(p *PropagationEvidence) *apiv2.PropagationEvidence {
+	if p == nil {
+		return nil
+	}
+	path := make([]apiv2.PropagationStep, 0, len(p.Path))
+	for _, s := range p.Path {
+		path = append(path, apiv2.PropagationStep{
+			Service:    s.Service,
+			Step:       s.Step,
+			StartMS:    s.StartMS,
+			DurationMS: s.DurationMS,
+			Status:     s.Status,
+			ErrorCode:  s.ErrorCode,
+		})
+	}
+	var firstSeen *time.Time
+	if p.FirstSeenAt != nil {
+		t := *p.FirstSeenAt
+		firstSeen = &t
+	}
+	return &apiv2.PropagationEvidence{
+		OriginService: p.OriginService,
+		OriginStep:    p.OriginStep,
+		Path:          path,
+		SampleTraceID: p.SampleTraceID,
+		FirstSeenAt:   firstSeen,
+		CapturedAt:    p.CapturedAt,
+		CaptureStatus: string(p.CaptureStatus),
+	}
+}
+
+func toAPIBlast(s *BlastSnapshot) *apiv2.BlastSnapshot {
+	if s == nil {
+		return nil
+	}
+	return &apiv2.BlastSnapshot{
+		Opening: toAPIBlastEvidence(s.Opening),
+		Latest:  toAPIBlastEvidence(s.Latest),
+	}
+}
+
+func toAPIBlastEvidence(b *BlastEvidence) *apiv2.BlastEvidence {
+	if b == nil {
+		return nil
+	}
+	var users *int
+	if b.AffectedUsers != nil {
+		u := *b.AffectedUsers
+		users = &u
+	}
+	return &apiv2.BlastEvidence{
+		AffectedRequests: b.AffectedRequests,
+		AffectedUsers:    users,
+		AffectedServices: b.AffectedServices,
+		TopServices:      append([]string(nil), b.TopServices...),
+		SampledTraces:    append([]string(nil), b.SampledTraces...),
+		CapturedAt:       b.CapturedAt,
+		CaptureStatus:    string(b.CaptureStatus),
+	}
+}
+
+func toAPIAlerts(s *AlertSnapshot) *apiv2.AlertSnapshot {
+	if s == nil {
+		return nil
+	}
+	return &apiv2.AlertSnapshot{
+		Opening: toAPIAlertEvidence(s.Opening),
+		Latest:  toAPIAlertEvidence(s.Latest),
+	}
+}
+
+func toAPIAlertEvidence(a *AlertEvidence) *apiv2.AlertEvidence {
+	if a == nil {
+		return nil
+	}
+	matches := make([]apiv2.MatchedAlert, 0, len(a.Matches))
+	for _, m := range a.Matches {
+		matches = append(matches, apiv2.MatchedAlert{
+			SignalID:    m.SignalID,
+			AlertID:     m.AlertID,
+			Source:      m.Source,
+			Severity:    m.Severity,
+			Reason:      m.Reason,
+			ProviderURL: m.ProviderURL,
+			EvidenceIDs: append([]string(nil), m.EvidenceIDs...),
+			MatchedAt:   m.MatchedAt,
+			Strategy:    m.Strategy,
+		})
+	}
+	return &apiv2.AlertEvidence{
+		Matches:       matches,
+		CapturedAt:    a.CapturedAt,
+		CaptureStatus: string(a.CaptureStatus),
 	}
 }
 
diff --git a/internal/incidents/handler_test.go b/internal/incidents/handler_test.go
index 670c19c..dddc9ab 100644
--- a/internal/incidents/handler_test.go
+++ b/internal/incidents/handler_test.go
@@ -61,3 +61,68 @@ func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
 		t.Fatalf("json snapshot status=%d body=%s", rec.Code, rec.Body.String())
 	}
 }
+
+func TestHandler_IncidentDetail_EmitsPropagationAndBlast(t *testing.T) {
+	ts := time.Now().UTC().Truncate(time.Second)
+	users := 12
+	inc := Incident{
+		IncidentID:  "inc_emit",
+		Env:         "demo",
+		Service:     "payment-service",
+		ErrorFamily: apiv2.ErrorFamily{Service: "payment-service", Step: "charge", ErrorCode: "DB_TIMEOUT"},
+		Status:      StatusActive,
+		Cause:       CauseUnknown,
+		Confidence:  ConfidenceMedium,
+		Severity:    2,
+		StartedAt:   ts,
+		UpdatedAt:   ts,
+		LastSeenAt:  ts,
+		Propagation: &PropagationSnapshot{
+			Latest: &PropagationEvidence{OriginService: "payment-service", OriginStep: "charge", SampleTraceID: "tx", CapturedAt: ts, CaptureStatus: CaptureOK},
+		},
+		Blast: &BlastSnapshot{
+			Latest: &BlastEvidence{AffectedRequests: 5, AffectedServices: 2, AffectedUsers: &users, TopServices: []string{"checkout"}, CapturedAt: ts, CaptureStatus: CaptureOK},
+		},
+		Alerts: &AlertSnapshot{
+			Latest: &AlertEvidence{
+				Matches:       []MatchedAlert{{SignalID: "sig_1", AlertID: "CheckoutPaymentFailure", Source: "alertmanager", Severity: "critical", Reason: "PMT_502 spike", MatchedAt: ts, Strategy: "family"}},
+				CapturedAt:    ts,
+				CaptureStatus: CaptureOK,
+			},
+		},
+	}
+	store := NewMemoryStore()
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	engine := NewEngine(&fakeReader{}, nil, nil, store, Config{}, nil, nil)
+	if err := engine.Bootstrap(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	h := NewHandler(engine)
+
+	rec := httptest.NewRecorder()
+	h.Incident(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/inc_emit", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	var got apiv2.IncidentDetailResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if got.Incident.Propagation == nil || got.Incident.Propagation.Latest == nil {
+		t.Errorf("response missing .incident.propagation.latest: %s", rec.Body.String())
+	}
+	if got.Incident.Blast == nil || got.Incident.Blast.Latest == nil {
+		t.Fatalf("response missing .incident.blast.latest: %s", rec.Body.String()) // fatal: Blast.Latest is dereferenced below
+	}
+	if got.Incident.Blast.Latest.AffectedRequests != 5 {
+		t.Errorf("Blast.Latest.AffectedRequests = %d; want 5", got.Incident.Blast.Latest.AffectedRequests)
+	}
+	if got.Incident.Alerts == nil || got.Incident.Alerts.Latest == nil {
+		t.Fatalf("response missing .incident.alerts.latest: %s", rec.Body.String()) // fatal: Alerts.Latest is dereferenced below
+	}
+	if len(got.Incident.Alerts.Latest.Matches) != 1 || got.Incident.Alerts.Latest.Matches[0].SignalID != "sig_1" {
+		t.Errorf("Alerts.Latest.Matches = %+v", got.Incident.Alerts.Latest.Matches)
+	}
+}
diff --git a/internal/incidents/interfaces.go b/internal/incidents/interfaces.go
index d2ce85e..cc6b405 100644
--- a/internal/incidents/interfaces.go
+++ b/internal/incidents/interfaces.go
@@ -13,6 +13,10 @@ type Reader interface {
 	Errors(f SearchFilter, limit int) ErrorsResult
 	BlastRadius(f SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse
 	SearchEvents(f SearchFilter, limit int) []*eventv2.Event
+
+	// Added for incident evidence capture (v1.0):
+	TraceStoryByTraceID(traceID string) (apiv2.StoryResponse, bool)
+	TraceEvents(traceID string) ([]*eventv2.Event, bool)
 }
 
 type SearchFilter struct {
diff --git a/internal/incidents/nextchecks.go b/internal/incidents/nextchecks.go
index 9f0d696..4fef791 100644
--- a/internal/incidents/nextchecks.go
+++ b/internal/incidents/nextchecks.go
@@ -1,37 +1,232 @@
 package incidents
 
-func NextChecks(cause Cause, confidence Confidence) []string {
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// NextCheckContext carries the captured evidence a check can legitimately
+// reference. Empty fields mean "Crux does not have this data" — emitters must
+// skip any line whose required field is empty rather than printing filler.
+// Instrumentation gaps (missing version, missing dep signal) are first-class
+// inputs so the panel can suggest closing the gap instead of pretending to
+// run a check it cannot run.
+type NextCheckContext struct {
+	Service   string
+	ErrorCode string
+	Step      string
+
+	SampleTraceID string
+
+	Downstream      string
+	DepSignalID     string
+	DepSignalReason string
+
+	DeployVersion   string
+	DeployFirstSeen time.Time
+	DeploySignalID  string
+
+	RuntimeSignalID string
+	RuntimeReason   string
+	RuntimeSubtype  string
+
+	AlertSignalID    string
+	AlertID          string
+	AlertSource      string
+	AlertProviderURL string
+
+	MissingServiceVersion   bool
+	MissingDependencySignal bool
+	HasPartialTrace         bool
+}
+
+func NextChecks(cause Cause, confidence Confidence, ctx NextCheckContext) []string {
+	var out []string
 	switch cause {
 	case CauseDeploy:
-		return []string{
-			"Compare error onset with the deployment timestamp.",
-			"Check whether the deployed service version appears on failing traces.",
-			"Roll back or canary-disable the deployment if the affected family is still rising.",
-		}
+		out = deployChecks(ctx)
 	case CauseDependency:
-		return []string{
-			"Check the downstream service health and recent deploys.",
-			"Inspect retries, timeouts, and circuit-breaker state for the failing step.",
-			"Notify the downstream owner with sample traces and affected service list.",
-		}
+		out = dependencyChecks(ctx)
 	case CauseRuntime:
-		return []string{
-			"Check the service for recent restarts or crashloops.",
-			"Inspect memory and CPU usage for OOM kills or resource pressure.",
-			"Review readiness and liveness probe results around the incident start.",
-			"Verify node and task health for the affected service instances.",
-		}
+		out = runtimeChecks(ctx)
 	case CauseApp:
-		return []string{
-			"Inspect the first failing step and recent application logs.",
-			"Compare failing request fields against recent successful requests.",
-			"Add instrumentation if the step lacks enough context to isolate the bad branch.",
-		}
+		out = appChecks(ctx)
 	default:
-		return []string{
-			"Inspect sample traces for missing downstream or deploy evidence.",
-			"Check whether production signals are being posted to /v1/signals.",
-			"Add service version and dependency health signals to improve classification.",
+		out = unknownChecks(ctx)
+	}
+	out = appendGapChecks(out, ctx)
+	if len(out) == 0 {
+		out = append(out, "Crux has no evidence-backed checks for this incident yet. Verify ingest is receiving events and signals for this service.")
+	}
+	return out
+}
+
+func deployChecks(ctx NextCheckContext) []string {
+	var out []string
+	service := backtick(ctx.Service, "")
+	if ctx.DeployVersion != "" && !ctx.DeployFirstSeen.IsZero() && service != "" {
+		out = append(out, fmt.Sprintf("Compare incident onset with deploy `%s` on %s, first seen at %s.",
+			ctx.DeployVersion, service, ctx.DeployFirstSeen.UTC().Format(time.RFC3339)))
+	} else if ctx.DeployVersion != "" && service != "" {
+		out = append(out, fmt.Sprintf("Compare incident onset with deploy `%s` on %s.", ctx.DeployVersion, service))
+	} else if ctx.DeployVersion != "" {
+		out = append(out, fmt.Sprintf("Compare incident onset with deploy `%s`.", ctx.DeployVersion))
+	}
+	if ctx.DeploySignalID != "" {
+		if service != "" {
+			out = append(out, fmt.Sprintf("Review deploy signal `%s` on %s.", shortRef(ctx.DeploySignalID), service))
+		} else {
+			out = append(out, fmt.Sprintf("Review deploy signal `%s`.", shortRef(ctx.DeploySignalID)))
+		}
+	}
+	if ctx.SampleTraceID != "" && service != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s` on %s for the deployed-version marker.", shortRef(ctx.SampleTraceID), service))
+	}
+	if ctx.Downstream != "" {
+		out = append(out, fmt.Sprintf("Also verify downstream %s — it was implicated in the same window.", backtick(ctx.Downstream, "")))
+	}
+	return out
+}
+
+func dependencyChecks(ctx NextCheckContext) []string {
+	var out []string
+	downstream := backtick(ctx.Downstream, "")
+	step := backtick(ctx.Step, "")
+	if ctx.SampleTraceID != "" && downstream != "" {
+		if step != "" {
+			out = append(out, fmt.Sprintf("Inspect sampled trace `%s` at %s for the failing call to %s.",
+				shortRef(ctx.SampleTraceID), step, downstream))
+		} else {
+			out = append(out, fmt.Sprintf("Inspect sampled trace `%s` for the failing call to %s.",
+				shortRef(ctx.SampleTraceID), downstream))
+		}
+	}
+	if ctx.DepSignalID != "" && ctx.DepSignalReason != "" && downstream != "" {
+		out = append(out, fmt.Sprintf("Review dependency signal `%s`: `%s` on %s.",
+			shortRef(ctx.DepSignalID), ctx.DepSignalReason, downstream))
+	}
+	if ctx.DeployVersion != "" {
+		line := fmt.Sprintf("Also verify recent deploy %s", backtick(ctx.DeployVersion, ""))
+		if ctx.Service != "" {
+			line += fmt.Sprintf(" on %s", backtick(ctx.Service, ""))
+		}
+		out = append(out, line+".")
+	}
+	if ctx.AlertSignalID != "" {
+		out = append(out, alertCheckLine(ctx))
+	}
+	return out
+}
+
+func runtimeChecks(ctx NextCheckContext) []string {
+	var out []string
+	service := backtick(ctx.Service, "")
+	if ctx.RuntimeSignalID != "" && ctx.RuntimeReason != "" && service != "" {
+		out = append(out, fmt.Sprintf("Review runtime signal `%s`: `%s` on %s.",
+			shortRef(ctx.RuntimeSignalID), ctx.RuntimeReason, service))
+	}
+	subtype := strings.ToLower(ctx.RuntimeSubtype)
+	if service != "" {
+		switch {
+		case strings.Contains(subtype, "oom") || strings.Contains(subtype, "memory"):
+			out = append(out, fmt.Sprintf("Inspect memory usage for %s around the runtime event.", service))
+		case strings.Contains(subtype, "readiness") || strings.Contains(subtype, "liveness") || strings.Contains(subtype, "probe"):
+			out = append(out, fmt.Sprintf("Review readiness/liveness probe history for %s around the runtime event.", service))
+		case strings.Contains(subtype, "crashloop") || strings.Contains(subtype, "restart"):
+			out = append(out, fmt.Sprintf("Check %s restart history around the runtime event.", service))
+		}
+	}
+	if ctx.SampleTraceID != "" && service != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s` on %s overlapping the runtime event.",
+			shortRef(ctx.SampleTraceID), service))
+	}
+	if ctx.AlertSignalID != "" {
+		out = append(out, alertCheckLine(ctx))
+	}
+	return out
+}
+
+func appChecks(ctx NextCheckContext) []string {
+	var out []string
+	step := backtick(ctx.Step, "")
+	errCode := backtick(ctx.ErrorCode, "")
+	if ctx.SampleTraceID != "" && step != "" && errCode != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s` at %s for %s.",
+			shortRef(ctx.SampleTraceID), step, errCode))
+	} else if ctx.SampleTraceID != "" && step != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s` at %s.",
+			shortRef(ctx.SampleTraceID), step))
+	} else if ctx.SampleTraceID != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s`.", shortRef(ctx.SampleTraceID)))
+	}
+	if ctx.AlertSignalID != "" {
+		out = append(out, alertCheckLine(ctx))
+	}
+	return out
+}
+
+func unknownChecks(ctx NextCheckContext) []string {
+	var out []string
+	if ctx.SampleTraceID != "" {
+		out = append(out, fmt.Sprintf("Inspect sampled trace `%s` to confirm the failure mode.", shortRef(ctx.SampleTraceID)))
+	}
+	if ctx.AlertSignalID != "" {
+		out = append(out, alertCheckLine(ctx))
+	}
+	return out
+}
+
+// appendGapChecks adds instrumentation-gap suggestions that are independent of
+// the classified cause — these are always real (Crux observed the gap).
+func appendGapChecks(in []string, ctx NextCheckContext) []string {
+	if ctx.MissingServiceVersion && ctx.Service != "" {
+		in = append(in, fmt.Sprintf("Add `service.version` to events from `%s` to enable deploy correlation.", ctx.Service))
+	}
+	if ctx.MissingDependencySignal {
+		switch {
+		case ctx.Service != "" && ctx.Downstream != "":
+			in = append(in, fmt.Sprintf("Add dependency signals for `%s` → `%s` to confirm the downstream cause.", ctx.Service, ctx.Downstream))
+		case ctx.Downstream != "":
+			in = append(in, fmt.Sprintf("Add dependency signals for `%s` to confirm the downstream cause.", ctx.Downstream))
+		case ctx.Service != "":
+			in = append(in, fmt.Sprintf("Add dependency signals for `%s` to confirm the downstream cause.", ctx.Service))
 		}
 	}
+	if ctx.HasPartialTrace {
+		in = append(in, "Some events arrived without complete span fan-out — verify trace propagation in the SDK middleware.")
+	}
+	return in
+}
+
+func alertCheckLine(ctx NextCheckContext) string {
+	id := ctx.AlertID
+	if id == "" {
+		id = shortRef(ctx.AlertSignalID)
+	}
+	source := ctx.AlertSource
+	switch {
+	case ctx.AlertProviderURL != "" && source != "":
+		return fmt.Sprintf("Open matched alert `%s` in %s (%s).", id, source, ctx.AlertProviderURL)
+	case source != "":
+		return fmt.Sprintf("Open matched alert `%s` from %s.", id, source)
+	case ctx.AlertProviderURL != "":
+		return fmt.Sprintf("Open matched alert `%s` (%s).", id, ctx.AlertProviderURL)
+	default:
+		return fmt.Sprintf("Open matched alert `%s`.", id)
+	}
+}
+
+func backtick(value, fallback string) string {
+	if value == "" {
+		return fallback
+	}
+	return "`" + value + "`"
+}
+
+func shortRef(id string) string {
+	if len(id) <= 12 { // byte length: IDs of at most 12 bytes are shown in full
+		return id
+	}
+	return strings.ToValidUTF8(id[:12], "") + "…" // the cut is byte-based; drop any rune it split in half
 }
diff --git a/internal/incidents/retention.go b/internal/incidents/retention.go
new file mode 100644
index 0000000..b6dd80f
--- /dev/null
+++ b/internal/incidents/retention.go
@@ -0,0 +1,47 @@
+package incidents
+
+import (
+	"context"
+	"log/slog"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+)
+
+// ResolvedPruner is the narrow view of the incident store that RunRetention depends on.
+type ResolvedPruner interface {
+	PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error) // RunRetention logs and counts the int as the number deleted
+}
+
+// RunRetention blocks until ctx is cancelled, pruning resolved incidents older
+// than retention on each interval tick. Active and recovering incidents are
+// never touched (the store's DELETE filters on status=resolved). It returns at
+// once for a nil store or non-positive duration. Mirrors signals.RunRetention.
+func RunRetention(ctx context.Context, store ResolvedPruner, retention, interval time.Duration, m *metrics.Metrics, log *slog.Logger) {
+	if store == nil || retention <= 0 || interval <= 0 {
+		return
+	}
+	if log == nil {
+		log = slog.Default()
+	}
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-tick.C:
+		}
+		olderThan := time.Now().UTC().Add(-retention)
+		n, err := store.PruneResolvedOlderThan(ctx, olderThan)
+		switch {
+		case err != nil: // keep looping; the next tick tries again
+			log.Warn("incident retention prune failed", "err", err)
+		case n > 0:
+			if m != nil {
+				m.IncidentRetentionPruned.Add(float64(n))
+			}
+			log.Info("incident retention pruned", "deleted", n, "cutoff", olderThan)
+		}
+	}
+}
diff --git a/internal/incidents/retention_test.go b/internal/incidents/retention_test.go
new file mode 100644
index 0000000..fa6e11e
--- /dev/null
+++ b/internal/incidents/retention_test.go
@@ -0,0 +1,72 @@
+package incidents
+
+import (
+	"context"
+	"log/slog"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+type fakePruneStore struct {
+	n          atomic.Int64 // prune calls so far; bumped last, so n > 0 implies lastCutoff is set
+	lastCutoff atomic.Value // time.Time
+}
+
+func (s *fakePruneStore) PruneResolvedOlderThan(_ context.Context, cutoff time.Time) (int, error) {
+	s.lastCutoff.Store(cutoff) // publish before the counter: the test reads lastCutoff as soon as it sees n > 0
+	s.n.Add(1)
+	return 2, nil
+}
+
+// TestRunRetentionPrunesResolvedAndStops: the pruner sees cutoff ~ now-retention and the loop exits on cancel.
+func TestRunRetentionPrunesResolvedAndStops(t *testing.T) {
+	store := &fakePruneStore{}
+	retention := time.Hour
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel() // also stops the retention loop when an assertion below fails first
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		RunRetention(ctx, store, retention, time.Millisecond, nil, slog.Default())
+	}()
+
+	giveUp := time.Now().Add(time.Second)
+	for store.n.Load() == 0 {
+		if time.Now().After(giveUp) {
+			t.Fatal("retention did not call prune")
+		}
+		time.Sleep(time.Millisecond)
+	}
+	cutoff, _ := store.lastCutoff.Load().(time.Time)
+	want := time.Now().UTC().Add(-retention)
+	if d := want.Sub(cutoff); d < -time.Minute || d > time.Minute {
+		t.Fatalf("cutoff %v not ~now-retention %v", cutoff, want)
+	}
+
+	cancel()
+	select {
+	case <-done:
+	case <-time.After(time.Second):
+		t.Fatal("retention did not stop on context cancel")
+	}
+}
+
+func TestRunRetentionDisabledConfigsReturnImmediately(t *testing.T) {
+	store := &fakePruneStore{}
+	finished := make(chan struct{})
+	go func() { // three disabled configs in turn: nil store, zero retention, zero interval
+		defer close(finished)
+		RunRetention(context.Background(), nil, time.Hour, time.Millisecond, nil, nil)
+		RunRetention(context.Background(), store, 0, time.Millisecond, nil, nil)
+		RunRetention(context.Background(), store, time.Hour, 0, nil, nil)
+	}()
+	select {
+	case <-finished:
+	case <-time.After(time.Second):
+		t.Fatal("disabled retention configs must return immediately")
+	}
+	if calls := store.n.Load(); calls != 0 {
+		t.Fatalf("disabled configs must never prune, got %d calls", calls)
+	}
+}
diff --git a/internal/incidents/store.go b/internal/incidents/store.go
index bd3fb5b..28282ba 100644
--- a/internal/incidents/store.go
+++ b/internal/incidents/store.go
@@ -115,5 +115,117 @@ func cloneIncident(in Incident) Incident {
 		v := *in.ResolvedAt
 		out.ResolvedAt = &v
 	}
+	out.Propagation = clonePropagationSnapshot(in.Propagation)
+	out.Blast = cloneBlastSnapshot(in.Blast)
+	out.Alerts = cloneAlertSnapshot(in.Alerts)
+	out.Runtime = cloneRuntimeSnapshot(in.Runtime)
+	return out
+}
+
+// clonePropagationEvidence deep-copies p so the result shares no slice or
+// pointer with it. Path keeps its nil-vs-empty state (JSON null vs []).
+func clonePropagationEvidence(p *PropagationEvidence) *PropagationEvidence {
+	if p == nil {
+		return nil
+	}
+	out := *p
+	out.Path = append(p.Path[:0:0], p.Path...) // zero-cap prefix forces a fresh array; nil stays nil, empty stays non-nil
+	if p.FirstSeenAt != nil {
+		t := *p.FirstSeenAt
+		out.FirstSeenAt = &t
+	}
+	return &out
+}
+
+// clonePropagationSnapshot deep-copies the Opening and Latest captures; nil in, nil out.
+func clonePropagationSnapshot(s *PropagationSnapshot) *PropagationSnapshot {
+	if s == nil {
+		return nil
+	}
+	var dup PropagationSnapshot
+	dup.Opening, dup.Latest = clonePropagationEvidence(s.Opening), clonePropagationEvidence(s.Latest)
+	return &dup
+}
+
+// cloneBlastEvidence deep-copies b: AffectedUsers, TopServices and
+// SampledTraces get fresh storage so callers cannot mutate the source value.
+// Slices keep their nil-vs-empty state, so "[]" never turns into JSON null.
+func cloneBlastEvidence(b *BlastEvidence) *BlastEvidence {
+	if b == nil {
+		return nil
+	}
+	out := *b
+	if b.AffectedUsers != nil {
+		u := *b.AffectedUsers
+		out.AffectedUsers = &u
+	}
+	// A zero-capacity prefix makes append allocate while preserving nil-ness.
+	out.TopServices = append(b.TopServices[:0:0], b.TopServices...)
+	out.SampledTraces = append(b.SampledTraces[:0:0], b.SampledTraces...)
+	return &out
+}
+
+// cloneBlastSnapshot deep-copies the Opening and Latest captures; nil in, nil out.
+func cloneBlastSnapshot(s *BlastSnapshot) *BlastSnapshot {
+	if s == nil {
+		return nil
+	}
+	var dup BlastSnapshot
+	dup.Opening, dup.Latest = cloneBlastEvidence(s.Opening), cloneBlastEvidence(s.Latest)
+	return &dup
+}
+
+// cloneAlertEvidence deep-copies a, including every match's EvidenceIDs.
+// Slices keep their nil-vs-empty state, so "matches": [] never becomes null.
+func cloneAlertEvidence(a *AlertEvidence) *AlertEvidence {
+	if a == nil {
+		return nil
+	}
+	out := *a
+	out.Matches = append(a.Matches[:0:0], a.Matches...) // zero-cap prefix: fresh array, nil stays nil
+	for i := range out.Matches {
+		out.Matches[i].EvidenceIDs = append(a.Matches[i].EvidenceIDs[:0:0], a.Matches[i].EvidenceIDs...)
+	}
+	return &out
+}
+
+// cloneAlertSnapshot deep-copies the Opening and Latest captures; nil in, nil out.
+func cloneAlertSnapshot(s *AlertSnapshot) *AlertSnapshot {
+	if s == nil {
+		return nil
+	}
+	var dup AlertSnapshot
+	dup.Opening, dup.Latest = cloneAlertEvidence(s.Opening), cloneAlertEvidence(s.Latest)
+	return &dup
+}
+
+func cloneRuntimeEvidence(r *RuntimeEvidence) *RuntimeEvidence {
+	if r == nil {
+		return nil
+	}
+	dup, meta := *r, r.Metadata // dup.Metadata still aliases r's map at this point
+	if meta != nil {
+		dup.Metadata = make(map[string]any, len(meta)) // keeps nil distinct from empty
+	}
+	for key, val := range meta {
+		dup.Metadata[key] = val // shallow: nested values remain shared
+	}
+	return &dup
+}
+
+// cloneRuntimeSnapshot deep-copies s: Opening, Latest and every entry of
+// Matches are cloned individually. A nil Matches stays nil.
+func cloneRuntimeSnapshot(s *RuntimeSnapshot) *RuntimeSnapshot {
+	if s == nil {
+		return nil
+	}
+	out := new(RuntimeSnapshot)
+	out.Opening, out.Latest = cloneRuntimeEvidence(s.Opening), cloneRuntimeEvidence(s.Latest)
+	if s.Matches != nil {
+		out.Matches = make([]RuntimeEvidence, len(s.Matches))
+	}
+	for idx := range s.Matches {
+		out.Matches[idx] = *cloneRuntimeEvidence(&s.Matches[idx])
+	}
 	return out
 }
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
index fed7350..4b22b72 100644
--- a/internal/incidents/types.go
+++ b/internal/incidents/types.go
@@ -35,12 +35,110 @@ const (
 type EvidenceKind string
 
 const (
-	EvidenceSignal     EvidenceKind = "signal"
-	EvidenceDeployment EvidenceKind = "deployment"
-	EvidenceTrace      EvidenceKind = "trace"
-	EvidenceMetric     EvidenceKind = "metric"
+	EvidenceSignal      EvidenceKind = "signal"
+	EvidenceDeployment  EvidenceKind = "deployment"
+	EvidenceTrace       EvidenceKind = "trace"
+	EvidenceMetric      EvidenceKind = "metric"
+	EvidencePropagation EvidenceKind = "propagation"
+	EvidenceBlast       EvidenceKind = "blast"
+	EvidenceRuntime     EvidenceKind = "runtime"
 )
 
+type EvidenceCaptureStatus string // type of the CaptureStatus field on PropagationEvidence, BlastEvidence, AlertEvidence and RuntimeEvidence
+
+const (
+	CaptureOK      EvidenceCaptureStatus = "ok"
+	CapturePartial EvidenceCaptureStatus = "partial"
+	CaptureMissing EvidenceCaptureStatus = "missing"
+)
+
+type PropagationStep struct { // one element of PropagationEvidence.Path
+	Service    string `json:"service"`
+	Step       string `json:"step"`
+	StartMS    int64  `json:"start_ms"`
+	DurationMS int64  `json:"duration_ms"`
+	Status     string `json:"status"`
+	ErrorCode  string `json:"error_code,omitempty"`
+}
+
+type PropagationEvidence struct { // FirstSeenAt is optional (omitted from JSON when nil); every other field is always serialized
+	OriginService string                `json:"origin_service"`
+	OriginStep    string                `json:"origin_step"`
+	Path          []PropagationStep     `json:"path"`
+	SampleTraceID string                `json:"sample_trace_id"`
+	FirstSeenAt   *time.Time            `json:"first_seen_at,omitempty"`
+	CapturedAt    time.Time             `json:"captured_at"`
+	CaptureStatus EvidenceCaptureStatus `json:"capture_status"`
+}
+
+type BlastEvidence struct { // AffectedUsers is optional (omitted from JSON when nil); every other field is always serialized
+	AffectedRequests int                   `json:"affected_requests"`
+	AffectedUsers    *int                  `json:"affected_users,omitempty"`
+	AffectedServices int                   `json:"affected_services"`
+	TopServices      []string              `json:"top_services"`
+	SampledTraces    []string              `json:"sampled_traces"`
+	CapturedAt       time.Time             `json:"captured_at"`
+	CaptureStatus    EvidenceCaptureStatus `json:"capture_status"`
+}
+
+type MatchedAlert struct { // element of AlertEvidence.Matches; alert_id, provider_url and evidence_ids are omitted from JSON when empty
+	SignalID    string    `json:"signal_id"`
+	AlertID     string    `json:"alert_id,omitempty"`
+	Source      string    `json:"source"`
+	Severity    string    `json:"severity"`
+	Reason      string    `json:"reason"`
+	ProviderURL string    `json:"provider_url,omitempty"`
+	EvidenceIDs []string  `json:"evidence_ids,omitempty"`
+	MatchedAt   time.Time `json:"matched_at"`
+	Strategy    string    `json:"strategy"`
+}
+
+type AlertEvidence struct { // Matches has no omitempty: a nil slice serializes as null, an empty one as []
+	Matches       []MatchedAlert        `json:"matches"`
+	CapturedAt    time.Time             `json:"captured_at"`
+	CaptureStatus EvidenceCaptureStatus `json:"capture_status"`
+}
+
+type PropagationSnapshot struct { // Opening and Latest are each omitted from JSON when nil
+	Opening *PropagationEvidence `json:"opening,omitempty"`
+	Latest  *PropagationEvidence `json:"latest,omitempty"`
+}
+
+type BlastSnapshot struct { // Opening and Latest are each omitted from JSON when nil
+	Opening *BlastEvidence `json:"opening,omitempty"`
+	Latest  *BlastEvidence `json:"latest,omitempty"`
+}
+
+type AlertSnapshot struct { // Opening and Latest are each omitted from JSON when nil
+	Opening *AlertEvidence `json:"opening,omitempty"`
+	Latest  *AlertEvidence `json:"latest,omitempty"`
+}
+
+// RuntimeEvidence is a single matched runtime signal, either infra (k8s
+// OOMKill, crashloop) or app (panic, unhandled rejection). Severity is one of
+// the accepted signal severities (critical|warning|info) and is never "error".
+type RuntimeEvidence struct {
+	Subtype       string                `json:"subtype"` // oom_killed, crashloop, readiness_fail, liveness_fail, panic, unhandled_rejection, uncaught_exception
+	Service       string                `json:"service"`
+	Reason        string                `json:"reason"`
+	Severity      string                `json:"severity"`
+	Source        string                `json:"source"` // k8s, k8s-demo, go-sdk, ts-sdk
+	SignalID      string                `json:"signal_id"`
+	OccurredAt    time.Time             `json:"occurred_at"` // sig.Timestamp — when the runtime event happened
+	Metadata      map[string]any        `json:"metadata,omitempty"`
+	CapturedAt    time.Time             `json:"captured_at"` // when captured — provenance only, never in report hash
+	CaptureStatus EvidenceCaptureStatus `json:"capture_status"`
+}
+
+// RuntimeSnapshot holds all matched runtime signals for an incident. Matches
+// keeps every match (infra AND app) so a later app panic does not erase an
+// earlier infra OOMKill. Opening and Latest are picked by OccurredAt (presumably earliest and most recent; confirm).
+type RuntimeSnapshot struct {
+	Matches []RuntimeEvidence `json:"matches,omitempty"`
+	Opening *RuntimeEvidence  `json:"opening,omitempty"`
+	Latest  *RuntimeEvidence  `json:"latest,omitempty"`
+}
+
 type Evidence struct {
 	Kind       EvidenceKind   `json:"kind"`
 	Title      string         `json:"title"`
@@ -54,30 +152,34 @@ type Evidence struct {
 }
 
 type Incident struct {
-	IncidentID              string            `json:"incident_id"`
-	Env                     string            `json:"env"`
-	Service                 string            `json:"service"`
-	ErrorFamily             apiv2.ErrorFamily `json:"error_family"`
-	Status                  Status            `json:"status"`
-	Cause                   Cause             `json:"cause"`
-	Confidence              Confidence        `json:"confidence"`
-	Severity                int               `json:"severity"`
-	StartedAt               time.Time         `json:"started_at"`
-	UpdatedAt               time.Time         `json:"updated_at"`
-	LastSeenAt              time.Time         `json:"last_seen_at"`
-	RecoveringAt            *time.Time        `json:"recovering_at,omitempty"`
-	ResolvedAt              *time.Time        `json:"resolved_at,omitempty"`
-	AffectedRequests        int               `json:"affected_requests"`
-	AffectedUsers           *int              `json:"affected_users,omitempty"`
-	AffectedServices        int               `json:"affected_services"`
-	TopServices             []string          `json:"top_services"`
-	SampleTraces            []string          `json:"sample_traces"`
-	Evidence                []Evidence        `json:"evidence"`
-	NextChecks              []string          `json:"next_checks"`
-	InstrumentationWarnings []string          `json:"instrumentation_warnings,omitempty"`
-	Lift                    float64           `json:"lift"`
-	BaselineCount           int               `json:"baseline_count"`
-	CurrentCount            int               `json:"current_count"`
+	IncidentID              string               `json:"incident_id"`
+	Env                     string               `json:"env"`
+	Service                 string               `json:"service"`
+	ErrorFamily             apiv2.ErrorFamily    `json:"error_family"`
+	Status                  Status               `json:"status"`
+	Cause                   Cause                `json:"cause"`
+	Confidence              Confidence           `json:"confidence"`
+	Severity                int                  `json:"severity"`
+	StartedAt               time.Time            `json:"started_at"`
+	UpdatedAt               time.Time            `json:"updated_at"`
+	LastSeenAt              time.Time            `json:"last_seen_at"`
+	RecoveringAt            *time.Time           `json:"recovering_at,omitempty"`
+	ResolvedAt              *time.Time           `json:"resolved_at,omitempty"`
+	AffectedRequests        int                  `json:"affected_requests"`
+	AffectedUsers           *int                 `json:"affected_users,omitempty"`
+	AffectedServices        int                  `json:"affected_services"`
+	TopServices             []string             `json:"top_services"`
+	SampleTraces            []string             `json:"sample_traces"`
+	Evidence                []Evidence           `json:"evidence"`
+	NextChecks              []string             `json:"next_checks"`
+	InstrumentationWarnings []string             `json:"instrumentation_warnings,omitempty"`
+	Lift                    float64              `json:"lift"`
+	BaselineCount           int                  `json:"baseline_count"`
+	CurrentCount            int                  `json:"current_count"`
+	Propagation             *PropagationSnapshot `json:"propagation,omitempty"`
+	Blast                   *BlastSnapshot       `json:"blast,omitempty"`
+	Alerts                  *AlertSnapshot       `json:"alerts,omitempty"`
+	Runtime                 *RuntimeSnapshot     `json:"runtime,omitempty"`
 }
 
 type Deployment struct {
diff --git a/internal/ingest/deploy_webhook.go b/internal/ingest/deploy_webhook.go
index e5d2035..6991d43 100644
--- a/internal/ingest/deploy_webhook.go
+++ b/internal/ingest/deploy_webhook.go
@@ -100,14 +100,6 @@ func (s *Server) DeployWebhook(w http.ResponseWriter, r *http.Request) {
 		s.metrics.DeployUpsertsTotal.Inc()
 	}
 
-	// Publish SSE event so dashboard clients see the new deployment immediately.
-	if s.sseHub != nil {
-		data := s.ComputeSSETopic(TopicDeployments)
-		if data != nil {
-			s.sseHub.Publish(TopicDeployments, data)
-		}
-	}
-
 	if wantsEnvelope(r) {
 		writeJSON(w, http.StatusCreated, map[string]string{"id": req.ID}, meta, nil)
 	} else {
diff --git a/internal/ingest/deploy_webhook_test.go b/internal/ingest/deploy_webhook_test.go
index 38f561b..4bb2207 100644
--- a/internal/ingest/deploy_webhook_test.go
+++ b/internal/ingest/deploy_webhook_test.go
@@ -20,8 +20,7 @@ func makeTestServerWithColdStore(t *testing.T) (*Server, *coldstore.SQLiteStore)
 	}
 	cs := managed.(*coldstore.SQLiteStore)
 	t.Cleanup(func() { cs.Close() })
-	srv := makeTestServer()
-	srv.coldStore = cs
+	srv := &Server{coldStore: cs}
 	return srv, cs
 }
 
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index 1c08487..f61af3d 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -14,7 +14,6 @@ import (
 	"net"
 	"net/http"
 	"net/url"
-	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -25,16 +24,10 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/config"
 	"github.com/sssmaran/WaylogCLI/internal/detect"
 	"github.com/sssmaran/WaylogCLI/internal/eventlog"
-	"github.com/sssmaran/WaylogCLI/internal/graph/analysis"
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
 	"github.com/sssmaran/WaylogCLI/internal/llm"
 	"github.com/sssmaran/WaylogCLI/internal/metrics"
 	"github.com/sssmaran/WaylogCLI/internal/sampler"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/internal/tracestory"
 	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
@@ -92,13 +85,10 @@ func (c *unsampledCounters) Sum(window time.Duration) (total, errs uint64) {
 //
 // Readiness semantics: /readyz gates on ingest availability, not replay
 // completeness. When replay fails the server becomes ready in degraded mode —
-// new events ingest correctly but historical reads (trace story, overview,
-// recent traces) may return partial results until the graph is rebuilt from
-// incoming traffic.
+// new events ingest correctly but historical reads (trace story, errors,
+// blast radius) may return partial results until the v2 reader is rebuilt
+// from incoming traffic.
 type Server struct {
-	store       *store.Store
-	traceStore  *tracestore.Store
-	builder     *build.Builder
 	sampler     *sampler.Sampler
 	metrics     *metrics.Metrics
 	EventLog    *eventlog.Writer
@@ -119,7 +109,6 @@ type Server struct {
 	dashboardRefreshSec int
 	prometheusURL       string
 	grafanaURL          string
-	graphUI             bool
 	dedupCache          *DedupCache
 	agentKey            string
 	trustProxy          bool
@@ -138,53 +127,20 @@ type Server struct {
 	otlpEnabled               bool
 	otlpGRPCEnabled           bool
 	otlpGRPCAddr              string
-	v2ReadsEnabled            bool
 	incidentsEnabled          bool
 	incidentsPersistent       bool
 	incidentsRebuildSupported bool
 	profile                   string
 
-	// SSE
-	sseHub               *SSEHub
-	sseHeartbeatInterval time.Duration // configurable for testing, defaults to 15s
-
-	// Causal engine status
-	causalMu        sync.Mutex
-	causalEnabled   bool
-	causalLastRun   time.Time
-	causalLastError string
-
 	// Anomaly detector
 	detector interface{ Current() *detect.Insight }
 }
 
-// SetSSEHub sets the SSE hub for real-time dashboard updates.
-func (s *Server) SetSSEHub(hub *SSEHub) { s.sseHub = hub }
-
 // SetDetector sets the anomaly detector for the /v1/insight endpoint.
 func (s *Server) SetDetector(d interface{ Current() *detect.Insight }) { s.detector = d }
 
-// SetCausalEnabled marks the causal engine as active.
-// Called once at startup before HTTP traffic, no lock needed.
-func (s *Server) SetCausalEnabled() { s.causalEnabled = true }
-
-// SetCausalRunResult records the result of a causal inference tick.
-// Called from the causal goroutine; reads happen from HTTP handlers (/healthz).
-func (s *Server) SetCausalRunResult(err error) {
-	s.causalMu.Lock()
-	s.causalLastRun = time.Now()
-	if err != nil {
-		s.causalLastError = err.Error()
-	} else {
-		s.causalLastError = ""
-	}
-	s.causalMu.Unlock()
-}
-
 // ServerConfig holds configuration for creating a new Server.
 type ServerConfig struct {
-	Store                    *store.Store
-	TraceStore               *tracestore.Store
 	Sampler                  *sampler.Sampler
 	Metrics                  *metrics.Metrics
 	MaxBodyBytes             int64
@@ -198,7 +154,6 @@ type ServerConfig struct {
 	DashboardRefreshSec      int
 	PrometheusURL            string
 	GrafanaURL               string
-	GraphUI                  bool
 	DedupCache               *DedupCache
 	AgentKey                 string
 	TrustProxy               bool
@@ -209,7 +164,6 @@ type ServerConfig struct {
 	OTLPEnabled              bool
 	OTLPGRPCEnabled          bool
 	OTLPGRPCAddr             string
-	V2ReadsEnabled           bool
 	IncidentsEnabled         bool
 	IncidentsPersistent      bool
 	IncidentRebuildSupported bool
@@ -227,9 +181,6 @@ func NewServer(cfg ServerConfig) *Server {
 		startTime = time.Now()
 	}
 	s := &Server{
-		store:                     cfg.Store,
-		traceStore:                cfg.TraceStore,
-		builder:                   build.NewBuilder(),
 		sampler:                   cfg.Sampler,
 		metrics:                   cfg.Metrics,
 		maxBodyBytes:              maxBody,
@@ -243,7 +194,6 @@ func NewServer(cfg ServerConfig) *Server {
 		dashboardRefreshSec:       cfg.DashboardRefreshSec,
 		prometheusURL:             cfg.PrometheusURL,
 		grafanaURL:                cfg.GrafanaURL,
-		graphUI:                   cfg.GraphUI,
 		dedupCache:                cfg.DedupCache,
 		agentKey:                  cfg.AgentKey,
 		trustProxy:                cfg.TrustProxy,
@@ -254,7 +204,6 @@ func NewServer(cfg ServerConfig) *Server {
 		otlpEnabled:               cfg.OTLPEnabled,
 		otlpGRPCEnabled:           cfg.OTLPGRPCEnabled,
 		otlpGRPCAddr:              cfg.OTLPGRPCAddr,
-		v2ReadsEnabled:            cfg.V2ReadsEnabled,
 		incidentsEnabled:          cfg.IncidentsEnabled,
 		incidentsPersistent:       cfg.IncidentsPersistent,
 		incidentsRebuildSupported: cfg.IncidentRebuildSupported,
@@ -264,9 +213,6 @@ func NewServer(cfg ServerConfig) *Server {
 	if s.sampler == nil {
 		s.sampler = sampler.New(sampler.LoadConfigFromEnv())
 	}
-	if s.traceStore == nil {
-		s.traceStore = tracestore.NewStore()
-	}
 	if s.graphHotWindow <= 0 {
 		s.graphHotWindow, _ = runtimeGraphHotWindow()
 	}
@@ -293,7 +239,7 @@ func NewServer(cfg ServerConfig) *Server {
 // Use /readyz for traffic gating, /livez for liveness.
 func (s *Server) Health(w http.ResponseWriter, r *http.Request) {
 	status := "ok"
-	if s.store == nil || s.replayStatus == "failed" {
+	if s.replayStatus == "failed" {
 		status = "degraded"
 	}
 
@@ -303,17 +249,6 @@ func (s *Server) Health(w http.ResponseWriter, r *http.Request) {
 		"ready":  s.ready.Load(),
 	}
 
-	if s.store != nil {
-		snap := s.store.Snapshot()
-		resp["store"] = map[string]any{
-			"configured": true,
-			"nodes":      len(snap.Nodes),
-			"edges":      len(snap.Edges),
-		}
-	} else {
-		resp["store"] = map[string]any{"configured": false}
-	}
-
 	resp["event_log"] = map[string]any{"enabled": s.EventLogDir != ""}
 
 	replay := map[string]any{"status": s.replayStatus}
@@ -328,17 +263,6 @@ func (s *Server) Health(w http.ResponseWriter, r *http.Request) {
 	}
 	resp["replay"] = replay
 
-	s.causalMu.Lock()
-	causal := map[string]any{"enabled": s.causalEnabled}
-	if !s.causalLastRun.IsZero() {
-		causal["last_run"] = s.causalLastRun.Format(time.RFC3339)
-	}
-	if s.causalLastError != "" {
-		causal["last_error"] = s.causalLastError
-	}
-	s.causalMu.Unlock()
-	resp["causal"] = causal
-
 	w.Header().Set("Content-Type", "application/json")
 	json.NewEncoder(w).Encode(resp)
 }
@@ -381,9 +305,6 @@ func (s *Server) SetReplayResult(err error) {
 }
 
 // Store returns the server's graph store.
-func (s *Server) Store() *store.Store {
-	return s.store
-}
 
 // AcceptedCount returns the number of accepted events.
 func (s *Server) AcceptedCount() uint64 {
@@ -391,17 +312,11 @@ func (s *Server) AcceptedCount() uint64 {
 }
 
 // Builder returns the server's graph builder.
-func (s *Server) Builder() *build.Builder {
-	return s.builder
-}
 
 // Sampler returns the server's sampler so external schema-1.x pipeline wiring
 // can share the same sampling policy.
 func (s *Server) Sampler() *sampler.Sampler { return s.sampler }
 
-// SSEHub returns the server's SSE hub for reuse as a Pipeline Notifier.
-func (s *Server) SSEHub() *SSEHub { return s.sseHub }
-
 // Counters returns the shared unsampled windowed counters for schema-1.x
 // pipeline wiring.
 func (s *Server) Counters() *unsampledCounters { return &s.counters }
@@ -420,161 +335,6 @@ func (s *Server) SetOTLPGRPC(enabled bool, addr string) {
 	s.otlpGRPCAddr = addr
 }
 
-// EventSearch handles GET /v1/events/search.
-// Both cold-store and JSONL paths return the same []coldstore.SearchResult shape.
-func (s *Server) EventSearch(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-	if s.coldStore == nil && s.EventLogDir == "" {
-		http.Error(w, "event search not configured", http.StatusServiceUnavailable)
-		return
-	}
-
-	q := r.URL.Query()
-	traceID := q.Get("trace_id")
-	userID := q.Get("user_id")
-	service := q.Get("service")
-	errorCode := q.Get("error_code")
-
-	if traceID == "" && userID == "" && service == "" && errorCode == "" {
-		http.Error(w, "at least one filter required (trace_id, user_id, service, error_code)", http.StatusBadRequest)
-		return
-	}
-
-	limit := parseBoundedPositiveInt(q, "limit", 50, 200)
-
-	cursorStr := q.Get("cursor")
-	var cursorID int64
-	if cursorStr != "" {
-		var err error
-		cursorID, err = decodeRowIDCursor(cursorStr)
-		if err != nil {
-			http.Error(w, "invalid cursor", http.StatusBadRequest)
-			return
-		}
-	}
-
-	var startTime, endTime time.Time
-	if v := q.Get("start"); v != "" {
-		t, err := parseFlexibleTime(v)
-		if err != nil {
-			http.Error(w, "invalid start: must be RFC3339", http.StatusBadRequest)
-			return
-		}
-		startTime = t
-	}
-	if v := q.Get("end"); v != "" {
-		t, err := parseFlexibleTime(v)
-		if err != nil {
-			http.Error(w, "invalid end: must be RFC3339", http.StatusBadRequest)
-			return
-		}
-		endTime = t
-	}
-
-	// Prefer cold store (SQLite) over JSONL scan
-	if s.coldStore != nil {
-		page, err := s.coldStore.SearchEvents(coldstore.SearchFilter{
-			TraceID:   traceID,
-			UserID:    userID,
-			Service:   service,
-			ErrorCode: errorCode,
-			Start:     startTime,
-			End:       endTime,
-			Limit:     limit,
-			Cursor:    cursorID,
-		})
-		if err != nil {
-			slog.Error("cold store search failed", "err", err)
-			if s.EventLogDir == "" {
-				http.Error(w, "search failed", http.StatusInternalServerError)
-				return
-			}
-			// Fall through to JSONL fallback
-		} else {
-			if page.Results == nil {
-				page.Results = []coldstore.SearchResult{}
-			}
-			resp := map[string]any{
-				"events":      page.Results,
-				"count":       len(page.Results),
-				"total_count": page.TotalCount,
-				"data_source": "sqlite",
-			}
-			if page.NextCursor > 0 {
-				resp["next_cursor"] = encodeRowIDCursor(page.NextCursor)
-			}
-			w.Header().Set("Content-Type", "application/json")
-			json.NewEncoder(w).Encode(resp)
-			return
-		}
-	}
-
-	if s.EventLogDir == "" {
-		http.Error(w, "event search not configured", http.StatusServiceUnavailable)
-		return
-	}
-
-	// JSONL fallback does not support cursor pagination.
-	if cursorID > 0 {
-		http.Error(w, "cursor pagination not supported for event log fallback", http.StatusBadRequest)
-		return
-	}
-
-	f := eventlog.SearchFilter{
-		TraceID:   traceID,
-		UserID:    userID,
-		Service:   service,
-		ErrorCode: errorCode,
-		Limit:     limit,
-		Start:     startTime,
-		End:       endTime,
-	}
-	events, err := eventlog.Search(s.EventLogDir, f)
-	if err != nil {
-		slog.Error("event search failed", "err", err)
-		http.Error(w, "search failed", http.StatusInternalServerError)
-		return
-	}
-
-	// Convert WideEvent to SearchResult for consistent API shape.
-	results := make([]coldstore.SearchResult, len(events))
-	for i, ev := range events {
-		var errCode, errMsg string
-		if ev.Error != nil {
-			errCode = ev.Error.Code
-			errMsg = ev.Error.Message
-		}
-		results[i] = coldstore.SearchResult{
-			TraceID:      ev.Request.TraceID,
-			SpanID:       ev.Request.SpanID,
-			EventName:    ev.EventName,
-			Service:      ev.System.Service,
-			Env:          ev.System.Env,
-			Version:      ev.System.Version,
-			DeploymentID: ev.System.DeploymentID,
-			UserID:       ev.User.ID,
-			StatusCode:   ev.Outcome.StatusCode,
-			Success:      ev.Outcome.Success,
-			ErrorCode:    errCode,
-			ErrorMessage: errMsg,
-			LatencyMs:    ev.Metrics.LatencyMs,
-			Timestamp:    ev.Timestamp,
-		}
-	}
-
-	resp := map[string]any{
-		"events":      results,
-		"count":       len(results),
-		"total_count": len(results),
-		"data_source": "event_log_fallback",
-	}
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(resp)
-}
-
 // Capabilities handles GET /v1/capabilities.
 // It returns runtime capabilities/config used by UI clients.
 func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
@@ -609,15 +369,11 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 			"prometheus": s.prometheusURL,
 			"grafana":    s.grafanaURL,
 		},
-		"graph": s.graphUI,
 		"otlp": map[string]any{
 			"http_traces": s.otlpEnabled,
 			"grpc_traces": s.otlpGRPCEnabled,
 			"grpc_addr":   s.otlpGRPCAddr,
 		},
-		"v2_reads": map[string]any{
-			"enabled": s.v2ReadsEnabled,
-		},
 		"profile": s.profile,
 		"incidents": map[string]any{
 			"enabled":    s.incidentsEnabled,
@@ -634,13 +390,6 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
 		},
 		"architecture": map[string]any{
 			"flattened": true,
-			"graph": map[string]any{
-				"nodes": []string{"request", "service", "error"},
-				"edges": []string{"handled_by", "failed_with", "calls"},
-			},
-			"trace_store": map[string]any{
-				"enabled": s.traceStore != nil,
-			},
 			"hot_window": map[string]any{
 				"enabled":       hotWindow > 0,
 				"duration":      hotWindow.String(),
@@ -674,9 +423,6 @@ func runtimeGraphHotWindow() (time.Duration, string) {
 	if hot := config.GetenvDuration("GRAPH_HOT_WINDOW", 0); hot > 0 {
 		return hot, "GRAPH_HOT_WINDOW"
 	}
-	if hot := config.GetenvDuration("GRAPH_RETENTION", 24*time.Hour); hot > 0 {
-		return hot, "GRAPH_RETENTION"
-	}
 	return 24 * time.Hour, "default"
 }
 
@@ -697,11 +443,8 @@ func (s *Server) Tools(w http.ResponseWriter, r *http.Request) {
 
 	registry := s.askRegistry
 	if registry == nil {
-		registry = tools.NewRegistry()
-		if err := tools.RegisterGraphTools(registry); err != nil {
-			respondError(w, r, http.StatusInternalServerError, "INTERNAL", "tool registry unavailable", true, APIMeta{RequestID: RequestIDFromContext(r.Context())})
-			return
-		}
+		respondError(w, r, http.StatusInternalServerError, "INTERNAL", "tool registry unavailable", true, APIMeta{RequestID: RequestIDFromContext(r.Context())})
+		return
 	}
 
 	type toolEntry struct {
@@ -768,21 +511,12 @@ type askToolStep struct {
 	Error      string `json:"error,omitempty"`
 }
 
-type overviewErrorEntry struct {
-	Code  string `json:"code"`
-	Count int    `json:"count"`
-}
-
-// Ask handles POST /v1/ask and returns an LLM answer backed by graph tools.
+// Ask handles POST /v1/ask and returns an LLM answer backed by the agent tools.
 func (s *Server) Ask(w http.ResponseWriter, r *http.Request) {
 	if r.Method != http.MethodPost {
 		respondError(w, r, http.StatusMethodNotAllowed, "METHOD_NOT_ALLOWED", "method not allowed", false, APIMeta{RequestID: RequestIDFromContext(r.Context())})
 		return
 	}
-	if s.store == nil {
-		respondError(w, r, http.StatusServiceUnavailable, "SERVICE_UNAVAILABLE", "store not configured", true, APIMeta{RequestID: RequestIDFromContext(r.Context())})
-		return
-	}
 
 	reqID := RequestIDFromContext(r.Context())
 
@@ -905,20 +639,16 @@ func (s *Server) Ask(w http.ResponseWriter, r *http.Request) {
 
 	registry := s.askRegistry
 	if registry == nil {
-		registry = tools.NewRegistry()
-		if err := tools.RegisterGraphTools(registry); err != nil {
-			slog.Error("ask tool registry init failed", "err", err)
-			askHTTPStatus = http.StatusInternalServerError
-			askErrCode = "INTERNAL"
-			if dedupIsExecutor {
-				dedupCompleted = true
-				principal := s.dedupPrincipal(r)
-				s.dedupCache.Complete(r.Method, r.URL.Path, principal, idempKey, body,
-					http.StatusInternalServerError, nil, &APIError{Code: "INTERNAL", Message: "tool registry unavailable", Retryable: true}, 0)
-			}
-			respondError(w, r, http.StatusInternalServerError, "INTERNAL", "tool registry unavailable", true, APIMeta{RequestID: reqID})
-			return
+		askHTTPStatus = http.StatusInternalServerError
+		askErrCode = "INTERNAL"
+		if dedupIsExecutor {
+			dedupCompleted = true
+			principal := s.dedupPrincipal(r)
+			s.dedupCache.Complete(r.Method, r.URL.Path, principal, idempKey, body,
+				http.StatusInternalServerError, nil, &APIError{Code: "INTERNAL", Message: "tool registry unavailable", Retryable: true}, 0)
 		}
+		respondError(w, r, http.StatusInternalServerError, "INTERNAL", "tool registry unavailable", true, APIMeta{RequestID: reqID})
+		return
 	}
 
 	defs := make([]llm.ToolDefinition, 0, len(registry.List()))
@@ -956,9 +686,8 @@ func (s *Server) Ask(w http.ResponseWriter, r *http.Request) {
 	ctx, cancel := context.WithTimeout(r.Context(), time.Duration(timeoutMs)*time.Millisecond)
 	defer cancel()
 
-	fs := &frozenStore{snap: s.store.Snapshot(), real: s.store, ts: s.traceStore}
 	answer, toolRecords, askErr := llm.Ask(ctx, provider, defs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) {
-		return registry.Call(ctx, fs, name, params)
+		return registry.Call(ctx, name, params)
 	}), req.Prompt, llm.AskOptions{MaxSteps: maxSteps, ErrorStrategy: req.ErrorStrategy})
 
 	// Convert ToolCallRecords to askToolSteps
@@ -1032,578 +761,6 @@ func (s *Server) Ask(w http.ResponseWriter, r *http.Request) {
 	json.NewEncoder(w).Encode(resp)
 }
 
-// TraceStory handles GET /v1/traces/story?trace_id=.
-func (s *Server) TraceStory(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-	traceID := r.URL.Query().Get("trace_id")
-	if traceID == "" {
-		http.Error(w, "trace_id required", http.StatusBadRequest)
-		return
-	}
-	format := r.URL.Query().Get("format")
-
-	snap, ok := s.snapshotOrServiceUnavailable(w)
-	if !ok {
-		return
-	}
-	story, ctx, err := tracestory.BuildWithFormat(snap, s.traceStore, traceID, format)
-	if err != nil {
-		http.Error(w, err.Error(), http.StatusNotFound)
-		return
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(map[string]any{
-		"story":   story,
-		"context": ctx,
-	})
-}
-
-// traceEntry is a summary of a single request for the recent traces list.
-type traceEntry struct {
-	TraceID        string    `json:"trace_id"`
-	Service        string    `json:"service,omitempty"`
-	FailureService string    `json:"failure_service,omitempty"`
-	Success        bool      `json:"success"`
-	StatusCode     int       `json:"status_code"`
-	LatencyMs      int64     `json:"latency_ms"`
-	EventName      string    `json:"event_name,omitempty"`
-	Timestamp      time.Time `json:"timestamp"`
-}
-
-// RecentTraces handles GET /v1/traces/recent?limit=&cursor=.
-func (s *Server) RecentTraces(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-
-	q := r.URL.Query()
-	limit := parseBoundedPositiveInt(q, "limit", 20, 100)
-	failuresOnly := parseOptionalBool(q, "failures_only")
-
-	cursorStr := q.Get("cursor")
-	var cursorTS time.Time
-	var cursorTraceID string
-	if cursorStr != "" {
-		var err error
-		cursorTS, cursorTraceID, err = decodeTimeCursor(cursorStr)
-		if err != nil {
-			http.Error(w, "invalid cursor", http.StatusBadRequest)
-			return
-		}
-	}
-
-	snap, ok := s.snapshotOrServiceUnavailable(w)
-	if !ok {
-		return
-	}
-	entries, totalCount, nextTS, nextTraceID := recentTracesFromGraphPaginated(snap, limit, failuresOnly, cursorTS, cursorTraceID)
-
-	resp := map[string]any{
-		"traces":      entries,
-		"total_count": totalCount,
-	}
-	if !nextTS.IsZero() {
-		resp["next_cursor"] = encodeTimeCursor(nextTS, nextTraceID)
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(resp)
-}
-
-func recentTracesFromGraph(g *core.Graph, limit int, failuresOnly bool) []traceEntry {
-	entries, _, _, _ := recentTracesFromGraphPaginated(g, limit, failuresOnly, time.Time{}, "")
-	return entries
-}
-
-func recentTracesFromGraphPaginated(g *core.Graph, limit int, failuresOnly bool, cursorTS time.Time, cursorTraceID string) ([]traceEntry, int, time.Time, string) {
-	var all []traceEntry
-	for reqID, n := range g.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		traceID, _ := n.Attr["trace_id"].(string)
-		if traceID == "" {
-			continue
-		}
-		failed := requestNodeFailed(n)
-		if failuresOnly && !failed {
-			continue
-		}
-		e := traceEntry{
-			TraceID:   traceID,
-			Timestamp: n.LastSeen,
-			Success:   !failed,
-		}
-		if v, ok := n.Attr["status_code"]; ok {
-			e.StatusCode = attrToInt(v)
-		}
-		if v, ok := n.Attr["latency_ms"]; ok {
-			e.LatencyMs = attrToInt64(v)
-		}
-		if v, ok := n.Attr["event_name"].(string); ok {
-			e.EventName = v
-		}
-		e.Service = requestOwnerService(n.Attr, e.EventName)
-		if failed {
-			e.FailureService = requestFailureService(g, reqID, n)
-		}
-		all = append(all, e)
-	}
-
-	// Sort by (Timestamp DESC, TraceID DESC) for stable ordering.
-	sort.Slice(all, func(i, j int) bool {
-		if !all[i].Timestamp.Equal(all[j].Timestamp) {
-			return all[i].Timestamp.After(all[j].Timestamp)
-		}
-		return all[i].TraceID > all[j].TraceID
-	})
-
-	totalCount := len(all)
-
-	// Apply cursor: skip entries at or "before" cursor in DESC order.
-	if !cursorTS.IsZero() {
-		idx := 0
-		for idx < len(all) {
-			e := all[idx]
-			if e.Timestamp.Before(cursorTS) || (e.Timestamp.Equal(cursorTS) && e.TraceID < cursorTraceID) {
-				break
-			}
-			idx++
-		}
-		all = all[idx:]
-	}
-
-	var nextTS time.Time
-	var nextTraceID string
-	if len(all) > limit {
-		all = all[:limit]
-		last := all[limit-1]
-		nextTS = last.Timestamp
-		nextTraceID = last.TraceID
-	}
-	return all, totalCount, nextTS, nextTraceID
-}
-
-// overviewPayload computes the overview data for a given window and trace limit.
-// Shared by the Overview REST handler and SSE computeOverviewJSON.
-func (s *Server) overviewPayload(dur time.Duration, limit int) map[string]any {
-	now := time.Now()
-	start := now.Add(-dur)
-	snap := s.store.Snapshot()
-
-	recent := recentTracesFromGraph(snap, limit, false)
-	rollup := analysis.RollupWindow(snap, s.store, s.traceStore, start, now)
-
-	errorRate := 0.0
-	if unsampledTotal, unsampledErrors := s.counters.Sum(dur); unsampledTotal > 0 {
-		errorRate = float64(unsampledErrors) / float64(unsampledTotal) * 100
-	} else if rollup.TotalRequests > 0 {
-		errorRate = float64(rollup.TotalFailures) / float64(rollup.TotalRequests) * 100
-	}
-
-	topErrors := make([]overviewErrorEntry, 0, len(rollup.PrimaryErrorCount))
-	for code, count := range rollup.PrimaryErrorCount {
-		topErrors = append(topErrors, overviewErrorEntry{Code: code, Count: count})
-	}
-	sort.Slice(topErrors, func(i, j int) bool {
-		if topErrors[i].Count == topErrors[j].Count {
-			return topErrors[i].Code < topErrors[j].Code
-		}
-		return topErrors[i].Count > topErrors[j].Count
-	})
-
-	latestFailedTraceID := latestFailedTrace(snap, start)
-
-	return map[string]any{
-		"window":                 dur.String(),
-		"total_requests":         rollup.TotalRequests,
-		"total_failures":         rollup.TotalFailures,
-		"error_rate":             errorRate,
-		"p50":                    rollup.LatencyP50,
-		"p95":                    rollup.LatencyP95,
-		"p99":                    rollup.LatencyP99,
-		"sampled":                s.sampleRatePct < 100,
-		"top_errors":             topErrors,
-		"recent_traces":          recent,
-		"latest_failed_trace_id": latestFailedTraceID,
-	}
-}
-
-// latestFailedTrace finds the most recent failed trace ID in the snapshot
-// that is newer than start.
-func latestFailedTrace(snap *core.Graph, start time.Time) string {
-	var latestID string
-	var latestTime time.Time
-	for _, n := range snap.Nodes {
-		if n.Type != core.NodeRequest || n.LastSeen.Before(start) {
-			continue
-		}
-		if success, _ := n.Attr["success"].(bool); success {
-			continue
-		}
-		if n.LastSeen.After(latestTime) {
-			latestTime = n.LastSeen
-			if tid, ok := n.Attr["trace_id"].(string); ok {
-				latestID = tid
-			}
-		}
-	}
-	return latestID
-}
-
-// Overview handles GET /v1/overview?window=&limit=.
-func (s *Server) Overview(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-
-	if s.store == nil {
-		http.Error(w, "store not configured", http.StatusServiceUnavailable)
-		return
-	}
-
-	q := r.URL.Query()
-	dur := parseLooseDuration(q, "window", 5*time.Minute)
-	limit := parseBoundedPositiveInt(q, "limit", 20, 100)
-
-	payload := s.overviewPayload(dur, limit)
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(payload)
-}
-
-// timeseriesPayload computes bucketed timeseries data for a given window and step.
-// Shared by the OverviewTimeseries REST handler and SSE computeTimeseriesJSON.
-func (s *Server) timeseriesPayload(window, step time.Duration) map[string]any {
-	points := int(window / step)
-
-	snap := s.store.Snapshot()
-	now := time.Now()
-	start := now.Add(-window)
-
-	type bucket struct {
-		Start     time.Time
-		End       time.Time
-		Total     int
-		Failures  int
-		Status2xx int
-		Status4xx int
-		Status5xx int
-		latencies []int64
-	}
-
-	buckets := make([]bucket, points)
-	for i := range buckets {
-		buckets[i].Start = start.Add(time.Duration(i) * step)
-		buckets[i].End = buckets[i].Start.Add(step)
-	}
-
-	for _, n := range snap.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		if n.LastSeen.Before(start) || n.LastSeen.After(now) {
-			continue
-		}
-		idx := int(n.LastSeen.Sub(start) / step)
-		if idx >= points {
-			idx = points - 1
-		}
-		b := &buckets[idx]
-		b.Total++
-		addStatusClassCount(attrToInt(n.Attr["status_code"]), &b.Status2xx, &b.Status4xx, &b.Status5xx)
-		if requestNodeFailed(n) {
-			b.Failures++
-		}
-		if lat := attrToInt64(n.Attr["latency_ms"]); lat > 0 {
-			b.latencies = append(b.latencies, lat)
-		}
-	}
-
-	type bucketOut struct {
-		Start     time.Time `json:"start"`
-		End       time.Time `json:"end"`
-		Total     int       `json:"total"`
-		Failures  int       `json:"failures"`
-		ErrorRate float64   `json:"error_rate"`
-		Status2xx int       `json:"status_2xx"`
-		Status4xx int       `json:"status_4xx"`
-		Status5xx int       `json:"status_5xx"`
-		P50       int64     `json:"p50"`
-		P95       int64     `json:"p95"`
-		P99       int64     `json:"p99"`
-	}
-
-	out := make([]bucketOut, points)
-	for i := range buckets {
-		b := &buckets[i]
-		out[i] = bucketOut{
-			Start: b.Start, End: b.End,
-			Total: b.Total, Failures: b.Failures,
-			Status2xx: b.Status2xx, Status4xx: b.Status4xx, Status5xx: b.Status5xx,
-		}
-		if b.Total > 0 {
-			out[i].ErrorRate = math.Round(float64(b.Failures)/float64(b.Total)*10000) / 100
-		}
-		if len(b.latencies) > 0 {
-			sort.Slice(b.latencies, func(a, c int) bool { return b.latencies[a] < b.latencies[c] })
-			out[i].P50 = percentile(b.latencies, 50)
-			out[i].P95 = percentile(b.latencies, 95)
-			out[i].P99 = percentile(b.latencies, 99)
-		}
-	}
-
-	return map[string]any{
-		"sampled": s.sampleRatePct < 100,
-		"buckets": out,
-	}
-}
-
-// OverviewTimeseries handles GET /v1/overview/timeseries?window=1h&step=5m.
-func (s *Server) OverviewTimeseries(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-
-	q := r.URL.Query()
-
-	window := 1 * time.Hour
-	if v := q.Get("window"); v != "" {
-		d, err := time.ParseDuration(v)
-		if err != nil || d <= 0 {
-			http.Error(w, "invalid window", http.StatusBadRequest)
-			return
-		}
-		if d > s.effectiveGraphHotWindow() {
-			http.Error(w, "window exceeds hot window", http.StatusBadRequest)
-			return
-		}
-		window = d
-	}
-
-	step := 5 * time.Minute
-	if v := q.Get("step"); v != "" {
-		d, err := time.ParseDuration(v)
-		if err != nil || d <= 0 {
-			http.Error(w, "invalid step", http.StatusBadRequest)
-			return
-		}
-		if d < 15*time.Second {
-			http.Error(w, "step min 15s", http.StatusBadRequest)
-			return
-		}
-		if d > 15*time.Minute {
-			http.Error(w, "step max 15m", http.StatusBadRequest)
-			return
-		}
-		step = d
-	}
-
-	points := int(window / step)
-	if points > 1440 {
-		http.Error(w, "too many points (window/step max 1440)", http.StatusBadRequest)
-		return
-	}
-
-	if s.store == nil {
-		http.Error(w, "store not available", http.StatusServiceUnavailable)
-		return
-	}
-
-	payload := s.timeseriesPayload(window, step)
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(payload)
-}
-
-func percentile(sorted []int64, pct int) int64 {
-	if len(sorted) == 0 {
-		return 0
-	}
-	idx := int(math.Ceil(float64(pct)/100*float64(len(sorted)))) - 1
-	if idx < 0 {
-		idx = 0
-	}
-	if idx >= len(sorted) {
-		idx = len(sorted) - 1
-	}
-	return sorted[idx]
-}
-
-// routesPayload computes per-route stats for a given window and limit.
-// Shared by the Routes REST handler and SSE computeRoutesJSON.
-func (s *Server) routesPayload(window time.Duration, limit int, failuresOnly bool) map[string]any {
-	snap := s.store.Snapshot()
-	now := time.Now()
-	start := now.Add(-window)
-
-	type routeStats struct {
-		Service       string
-		Method        string
-		RouteTemplate string
-		Total         int
-		Failures      int
-		Status2xx     int
-		Status4xx     int
-		Status5xx     int
-		latencies     []int64
-	}
-
-	groups := map[string]*routeStats{}
-
-	for _, n := range snap.Nodes {
-		if n.Type != core.NodeRequest {
-			continue
-		}
-		if n.LastSeen.Before(start) || n.LastSeen.After(now) {
-			continue
-		}
-		failed := requestNodeFailed(n)
-		if failuresOnly && !failed {
-			continue
-		}
-
-		eventName, _ := n.Attr["event_name"].(string)
-		if eventName == "" {
-			continue
-		}
-
-		svc := requestOwnerService(n.Attr, eventName)
-
-		method, _ := n.Attr["http_method"].(string)
-		if method == "" {
-			method = "UNKNOWN"
-		}
-		routeTemplate, _ := n.Attr["route_template"].(string)
-		if routeTemplate == "" {
-			routeTemplate = eventName
-		}
-
-		key := svc + "\x00" + method + "\x00" + routeTemplate
-		rs := groups[key]
-		if rs == nil {
-			rs = &routeStats{Service: svc, Method: method, RouteTemplate: routeTemplate}
-			groups[key] = rs
-		}
-
-		rs.Total++
-		addStatusClassCount(attrToInt(n.Attr["status_code"]), &rs.Status2xx, &rs.Status4xx, &rs.Status5xx)
-		if failed {
-			rs.Failures++
-		}
-		if lat := attrToInt64(n.Attr["latency_ms"]); lat > 0 {
-			rs.latencies = append(rs.latencies, lat)
-		}
-	}
-
-	type routeEntry struct {
-		Service       string  `json:"service"`
-		Method        string  `json:"method"`
-		RouteTemplate string  `json:"route_template"`
-		Route         string  `json:"route"`
-		Invocations   int     `json:"invocations"`
-		Errors        int     `json:"errors"`
-		ErrorRate     float64 `json:"error_rate"`
-		Status2xx     int     `json:"status_2xx"`
-		Status4xx     int     `json:"status_4xx"`
-		Status5xx     int     `json:"status_5xx"`
-		P75LatencyMs  int64   `json:"p75_latency_ms"`
-	}
-
-	routes := make([]routeEntry, 0, len(groups))
-	for _, rs := range groups {
-		re := routeEntry{
-			Service:       rs.Service,
-			Method:        rs.Method,
-			RouteTemplate: rs.RouteTemplate,
-			Route:         rs.RouteTemplate,
-			Invocations:   rs.Total,
-			Errors:        rs.Failures,
-			Status2xx:     rs.Status2xx,
-			Status4xx:     rs.Status4xx,
-			Status5xx:     rs.Status5xx,
-		}
-		re.ErrorRate = percentage(rs.Failures, rs.Total)
-		if len(rs.latencies) > 0 {
-			sort.Slice(rs.latencies, func(a, b int) bool { return rs.latencies[a] < rs.latencies[b] })
-			re.P75LatencyMs = percentile(rs.latencies, 75)
-		}
-		routes = append(routes, re)
-	}
-
-	sort.Slice(routes, func(i, j int) bool {
-		if routes[i].Invocations != routes[j].Invocations {
-			return routes[i].Invocations > routes[j].Invocations
-		}
-		if routes[i].Route != routes[j].Route {
-			return routes[i].Route < routes[j].Route
-		}
-		return routes[i].Method < routes[j].Method
-	})
-
-	if len(routes) > limit {
-		routes = routes[:limit]
-	}
-
-	return map[string]any{
-		"sampled": s.sampleRatePct < 100,
-		"routes":  routes,
-	}
-}
-
-// Routes handles GET /v1/routes?window=5m&limit=20.
-func (s *Server) Routes(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-
-	if _, ok := s.snapshotOrServiceUnavailable(w); !ok {
-		return
-	}
-
-	q := r.URL.Query()
-	window := parseLooseDuration(q, "window", 5*time.Minute)
-	limit := parseBoundedPositiveInt(q, "limit", 20, 100)
-	failuresOnly := parseOptionalBool(q, "failures_only")
-
-	payload := s.routesPayload(window, limit, failuresOnly)
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(payload)
-}
-
-// GraphTopology handles GET /v1/graph/topology?window=1h.
-// Returns Cytoscape-formatted service topology: service nodes with aggregate
-// stats and edges derived from span caller→service pairs.
-func (s *Server) GraphTopology(w http.ResponseWriter, r *http.Request) {
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-
-	q := r.URL.Query()
-	window := parseLooseDuration(q, "window", 1*time.Hour)
-	if maxWindow := s.effectiveGraphHotWindow(); window > maxWindow {
-		window = maxWindow
-	}
-
-	if _, ok := s.snapshotOrServiceUnavailable(w); !ok {
-		return
-	}
-
-	now := time.Now()
-	result := analysis.BuildTopology(s.store, s.traceStore, now.Add(-window), now)
-	cyto := analysis.ToCytoscapeFormat(result)
-
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(cyto)
-}
-
 // CORSWrap wraps a handler with CORS headers.
 // methods should be e.g. "GET, OPTIONS" or "POST, OPTIONS".
 func CORSWrap(allowOrigin string, methods string, h http.HandlerFunc) http.HandlerFunc {
@@ -1740,64 +897,6 @@ func anyToStringSlice(v any) []string {
 	}
 }
 
-func requestFailureService(g *core.Graph, reqID string, req core.Node) string {
-	if !requestNodeFailed(req) {
-		return ""
-	}
-	bestService := ""
-	bestTime := time.Time{}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeRequestHasSpan {
-			continue
-		}
-		span, ok := g.Nodes[e.To]
-		if !ok || span.Type != core.NodeSpan {
-			continue
-		}
-		if !requestNodeFailed(span) {
-			continue
-		}
-		svc, _ := span.Attr["service"].(string)
-		if svc == "" {
-			continue
-		}
-		ts := span.LastSeen
-		if bestService == "" || (!ts.IsZero() && (bestTime.IsZero() || ts.Before(bestTime))) {
-			bestService = svc
-			bestTime = ts
-		}
-	}
-	if bestService != "" {
-		return bestService
-	}
-	if svc, _ := req.Attr["root_service"].(string); svc != "" {
-		return svc
-	}
-	if svc, _ := req.Attr["service"].(string); svc != "" {
-		return svc
-	}
-	if eventName, _ := req.Attr["event_name"].(string); eventName != "" {
-		return serviceFromEventName(eventName)
-	}
-	return ""
-}
-
-func requestNodeFailed(n core.Node) bool {
-	if n.Attr == nil {
-		return false
-	}
-	if success, ok := n.Attr["success"].(bool); ok && !success {
-		return true
-	}
-	if statusCode := attrToInt(n.Attr["status_code"]); statusCode >= 500 {
-		return true
-	}
-	if code, ok := n.Attr["error_code"].(string); ok && code != "" {
-		return true
-	}
-	return len(anyToStringSlice(n.Attr["error_codes"])) > 0
-}
-
 func percentage(numerator, denominator int) float64 {
 	if denominator <= 0 {
 		return 0
@@ -1892,35 +991,6 @@ func errorCode(ev *event.WideEvent) string {
 	return ""
 }
 
-func (s *Server) snapshotOrServiceUnavailable(w http.ResponseWriter) (*core.Graph, bool) {
-	if s.store == nil {
-		http.Error(w, "store not configured", http.StatusServiceUnavailable)
-		return nil, false
-	}
-	return s.store.Snapshot(), true
-}
-
-// frozenStore captures a snapshot once and reuses it across tool calls within a single request.
-type frozenStore struct {
-	snap *core.Graph
-	real *store.Store
-	ts   *tracestore.Store
-}
-
-func (f *frozenStore) Snapshot() *core.Graph { return f.snap }
-func (f *frozenStore) SummarizeWindow(start, end time.Time) store.WindowSummary {
-	return f.real.SummarizeWindow(start, end)
-}
-func (f *frozenStore) ForEachRequestFact(start, end time.Time, fn func(store.RequestFacts)) {
-	f.real.ForEachRequestFact(start, end, fn)
-}
-func (f *frozenStore) ErrorIndex(errorCode string) ([]string, bool) {
-	return f.real.ErrorIndex(errorCode)
-}
-func (f *frozenStore) TraceStore() *tracestore.Store {
-	return f.ts
-}
-
 func toolErrorToHTTPStatus(te *tools.ToolError) int {
 	switch te.Code {
 	case tools.CodeInvalidParams:
@@ -1989,10 +1059,6 @@ func (s *Server) ToolCall(w http.ResponseWriter, r *http.Request) {
 		respondError(w, r, http.StatusMethodNotAllowed, "METHOD_NOT_ALLOWED", "method not allowed", false, APIMeta{RequestID: RequestIDFromContext(r.Context())})
 		return
 	}
-	if s.store == nil {
-		respondError(w, r, http.StatusServiceUnavailable, "SERVICE_UNAVAILABLE", "store not configured", true, APIMeta{RequestID: RequestIDFromContext(r.Context())})
-		return
-	}
 
 	toolName := strings.TrimPrefix(r.URL.Path, "/v1/tools/")
 	if toolName == "" {
@@ -2101,8 +1167,7 @@ func (s *Server) ToolCall(w http.ResponseWriter, r *http.Request) {
 		}()
 	}
 
-	fs := &frozenStore{snap: s.store.Snapshot(), real: s.store, ts: s.traceStore}
-	result, err := registry.Call(r.Context(), fs, toolName, params)
+	result, err := registry.Call(r.Context(), toolName, params)
 	duration := time.Since(start).Milliseconds()
 
 	if err != nil {
@@ -2187,75 +1252,6 @@ func clientIP(r *http.Request, trustProxy bool) string {
 	return host
 }
 
-// Topology handles GET /v1/topology — service-to-service edges with failure counts.
-func (s *Server) Topology(w http.ResponseWriter, r *http.Request) {
-	if r.Method == http.MethodOptions {
-		return
-	}
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-	reqStart := time.Now()
-	meta := APIMeta{RequestID: RequestIDFromContext(r.Context()), APIVersion: apiVersion}
-
-	q := r.URL.Query()
-	dur := parseLooseDuration(q, "window", time.Hour)
-	if maxWindow := s.effectiveGraphHotWindow(); dur > maxWindow {
-		dur = maxWindow
-	}
-	if _, ok := s.snapshotOrServiceUnavailable(w); !ok {
-		return
-	}
-
-	now := time.Now()
-	result := analysis.BuildTopology(s.store, s.traceStore, now.Add(-dur), now)
-
-	meta.DurationMs = time.Since(reqStart).Milliseconds()
-	meta.DataStatus = "complete"
-	if len(result.Nodes) == 0 {
-		meta.DataStatus = "empty"
-	}
-	writeJSON(w, http.StatusOK, result, meta, nil)
-}
-
-// BlastRadius handles GET /v1/blast_radius?error_code=X — impact analysis for an error code.
-func (s *Server) BlastRadius(w http.ResponseWriter, r *http.Request) {
-	if r.Method == http.MethodOptions {
-		return
-	}
-	if r.Method != http.MethodGet {
-		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
-		return
-	}
-	reqStart := time.Now()
-	meta := APIMeta{RequestID: RequestIDFromContext(r.Context()), APIVersion: apiVersion}
-
-	errorCode := r.URL.Query().Get("error_code")
-	if errorCode == "" {
-		respondError(w, r, http.StatusBadRequest, "INVALID_PARAMS", "error_code is required", false, meta)
-		return
-	}
-	q := r.URL.Query()
-	dur := parseLooseDuration(q, "window", time.Hour)
-	if maxWindow := s.effectiveGraphHotWindow(); dur > maxWindow {
-		dur = maxWindow
-	}
-	snap, ok := s.snapshotOrServiceUnavailable(w)
-	if !ok {
-		return
-	}
-	now := time.Now()
-	result := analysis.ComputeBlastRadius(snap, errorCode, now.Add(-dur), now)
-
-	meta.DurationMs = time.Since(reqStart).Milliseconds()
-	meta.DataStatus = "complete"
-	if result.AffectedRequests == 0 {
-		meta.DataStatus = "empty"
-	}
-	writeJSON(w, http.StatusOK, result, meta, nil)
-}
-
 // PlanExecute handles POST /v1/plans/execute — deterministic plan execution.
 func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
 	if r.Method != http.MethodPost {
@@ -2410,7 +1406,7 @@ func (s *Server) executePlanWithProgress(ctx context.Context, steps []PlanStep,
 			return result
 		}
 
-		toolResult, err := registry.Call(ctx, s.store, step.Tool, params)
+		toolResult, err := registry.Call(ctx, step.Tool, params)
 		stepResult.DurationMs = time.Since(stepStart).Milliseconds()
 
 		if err != nil {
@@ -2506,7 +1502,10 @@ func (s *Server) PlanStream(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	writeSSEHeaders(w)
+	w.Header().Set("Content-Type", "text/event-stream")
+	w.Header().Set("Cache-Control", "no-cache")
+	w.Header().Set("Connection", "keep-alive")
+	w.Header().Set("X-Accel-Buffering", "no")
 	flusher.Flush()
 
 	heartbeat := time.NewTicker(15 * time.Second)
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index c8775b4..bf9e248 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -14,248 +14,14 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/sssmaran/WaylogCLI/internal/eventlog"
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
 	"github.com/sssmaran/WaylogCLI/internal/llm"
 	"github.com/sssmaran/WaylogCLI/internal/metrics"
 	"github.com/sssmaran/WaylogCLI/internal/sampler"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
 const testTrace = "aaaa0000bbbb1111cccc2222dddd3333"
 
-func makeTestServer() *Server {
-	st := graphstore.NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(45),
-			testutil.WithTimestamp(time.Now().Add(-2*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithService("checkout"),
-			testutil.WithCallerService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(32),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("3333333333333333"),
-			testutil.WithParentSpanID("2222222222222222"),
-			testutil.WithService("payment"),
-			testutil.WithCallerService("checkout"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "payment failed"),
-			testutil.WithLatency(12),
-			testutil.WithTimestamp(time.Now().Add(-30*time.Second)),
-		),
-	}
-
-	for _, ev := range events {
-		result := b.BuildResult(ev)
-		st.Merge(result.Graph)
-		if result.Span != nil {
-			ts.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-		}
-	}
-
-	return &Server{store: st, traceStore: ts, builder: b}
-}
-
-func TestTraceStory_Success(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/story?trace_id="+testTrace, nil)
-	w := httptest.NewRecorder()
-	srv.TraceStory(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp map[string]json.RawMessage
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if _, ok := resp["story"]; !ok {
-		t.Fatal("missing 'story' key")
-	}
-	if _, ok := resp["context"]; !ok {
-		t.Fatal("missing 'context' key")
-	}
-
-	var story struct {
-		TraceID  string `json:"trace_id"`
-		Chain    []any  `json:"chain"`
-		HopCount int    `json:"hop_count"`
-	}
-	if err := json.Unmarshal(resp["story"], &story); err != nil {
-		t.Fatalf("invalid story json: %v", err)
-	}
-	if story.TraceID != testTrace {
-		t.Errorf("trace_id = %q, want %q", story.TraceID, testTrace)
-	}
-	if story.HopCount != 3 {
-		t.Errorf("hop_count = %d, want 3", story.HopCount)
-	}
-}
-
-func TestTraceStory_NotFound(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/story?trace_id=00000000000000000000000000000000", nil)
-	w := httptest.NewRecorder()
-	srv.TraceStory(w, req)
-
-	if w.Code != http.StatusNotFound {
-		t.Errorf("expected 404, got %d", w.Code)
-	}
-}
-
-func TestTraceStory_MissingParam(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/story", nil)
-	w := httptest.NewRecorder()
-	srv.TraceStory(w, req)
-
-	if w.Code != http.StatusBadRequest {
-		t.Errorf("expected 400, got %d", w.Code)
-	}
-}
-
-func TestRecentTraces_Ordering(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/recent?limit=10", nil)
-	w := httptest.NewRecorder()
-	srv.RecentTraces(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Traces     []traceEntry `json:"traces"`
-		TotalCount int          `json:"total_count"`
-		NextCursor string       `json:"next_cursor,omitempty"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	entries := resp.Traces
-	if len(entries) == 0 {
-		t.Fatal("expected at least one trace entry")
-	}
-	if resp.TotalCount < len(entries) {
-		t.Errorf("total_count %d < returned entries %d", resp.TotalCount, len(entries))
-	}
-
-	// Verify descending order by timestamp
-	for i := 1; i < len(entries); i++ {
-		if entries[i].Timestamp.After(entries[i-1].Timestamp) {
-			t.Errorf("entries not sorted desc: [%d].Timestamp > [%d].Timestamp", i, i-1)
-		}
-	}
-}
-
-func TestRecentTraces_Limit(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/recent?limit=1", nil)
-	w := httptest.NewRecorder()
-	srv.RecentTraces(w, req)
-
-	var resp struct {
-		Traces     []traceEntry `json:"traces"`
-		TotalCount int          `json:"total_count"`
-		NextCursor string       `json:"next_cursor,omitempty"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if len(resp.Traces) > 1 {
-		t.Errorf("expected at most 1 entry, got %d", len(resp.Traces))
-	}
-	if resp.TotalCount > 1 && resp.NextCursor == "" {
-		t.Error("expected next_cursor when total_count > limit")
-	}
-}
-
-func TestRecentTraces_FailuresOnlyAndFailureSource(t *testing.T) {
-	srv := makeTestServerMixed()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/recent?limit=10&failures_only=true", nil)
-	w := httptest.NewRecorder()
-	srv.RecentTraces(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Traces     []traceEntry `json:"traces"`
-		TotalCount int          `json:"total_count"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	entries := resp.Traces
-	if len(entries) == 0 {
-		t.Fatal("expected at least one failed trace entry")
-	}
-	for _, e := range entries {
-		if e.Success {
-			t.Fatalf("expected only failed traces, got success trace %s", e.TraceID)
-		}
-		if e.FailureService == "" {
-			t.Fatalf("expected failure_service for failed trace %s", e.TraceID)
-		}
-	}
-}
-
-func TestOverview_Stats(t *testing.T) {
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/overview?window=10m", nil)
-	w := httptest.NewRecorder()
-	srv.Overview(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp map[string]any
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	for _, key := range []string{"window", "total_requests", "total_failures", "error_rate", "p50", "p95", "p99", "sampled", "top_errors", "recent_traces"} {
-		if _, ok := resp[key]; !ok {
-			t.Errorf("missing key %q in overview response", key)
-		}
-	}
-
-	totalReqs := int(resp["total_requests"].(float64))
-	if totalReqs < 1 {
-		t.Errorf("expected total_requests >= 1, got %d", totalReqs)
-	}
-}
-
 func TestCORSWrap(t *testing.T) {
 	handler := CORSWrap("http://localhost:3000", "GET, OPTIONS", func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -308,9 +74,6 @@ func TestCapabilities_Defaults(t *testing.T) {
 		Dashboard struct {
 			RefreshIntervalSec int `json:"refresh_interval_sec"`
 		} `json:"dashboard"`
-		V2Reads struct {
-			Enabled bool `json:"enabled"`
-		} `json:"v2_reads"`
 	}
 	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
 		t.Fatalf("invalid json: %v", err)
@@ -324,29 +87,6 @@ func TestCapabilities_Defaults(t *testing.T) {
 	if resp.Dashboard.RefreshIntervalSec != 10 {
 		t.Errorf("refresh_interval_sec = %d, want 10", resp.Dashboard.RefreshIntervalSec)
 	}
-	if resp.V2Reads.Enabled {
-		t.Errorf("v2_reads.enabled = true, want false")
-	}
-}
-
-func TestCapabilities_V2ReadsEnabled(t *testing.T) {
-	srv := NewServer(ServerConfig{V2ReadsEnabled: true})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
-	w := httptest.NewRecorder()
-	srv.Capabilities(w, req)
-
-	var resp struct {
-		V2Reads struct {
-			Enabled bool `json:"enabled"`
-		} `json:"v2_reads"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if !resp.V2Reads.Enabled {
-		t.Fatal("v2_reads.enabled = false, want true")
-	}
 }
 
 func TestCapabilities_OTLPGRPCBlock(t *testing.T) {
@@ -442,344 +182,12 @@ func TestCapabilities_IncidentsBlock(t *testing.T) {
 
 const successTrace = "bbbb0000cccc1111dddd2222eeee3333"
 
-func makeTestServerMixed() *Server {
-	st := graphstore.NewStore()
-	ts := tracestore.NewStore()
-	b := build.NewBuilder()
-
-	events := []event.WideEvent{
-		// Trace 1: 3-hop failure (gateway->checkout->payment fails)
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(502),
-			testutil.WithLatency(45),
-			testutil.WithTimestamp(time.Now().Add(-2*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithService("checkout"),
-			testutil.WithCallerService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(32),
-			testutil.WithTimestamp(time.Now().Add(-2*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(testTrace),
-			testutil.WithSpanID("3333333333333333"),
-			testutil.WithParentSpanID("2222222222222222"),
-			testutil.WithService("payment"),
-			testutil.WithCallerService("checkout"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "payment failed"),
-			testutil.WithLatency(12),
-			testutil.WithTimestamp(time.Now().Add(-2*time.Minute)),
-		),
-		// Trace 2: 3-hop success (all 200)
-		testutil.MakeEvent(
-			testutil.WithTraceID(successTrace),
-			testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(40),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(successTrace),
-			testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithParentSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithService("checkout"),
-			testutil.WithCallerService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(25),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(successTrace),
-			testutil.WithSpanID("cccccccccccccccc"),
-			testutil.WithParentSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithService("payment"),
-			testutil.WithCallerService("checkout"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(10),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-	}
-
-	for _, ev := range events {
-		result := b.BuildResult(ev)
-		st.Merge(result.Graph)
-		if result.Span != nil {
-			ts.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-		}
-	}
-
-	return &Server{store: st, traceStore: ts, builder: b}
-}
-
-func TestOverview_MixedSuccessAndFailure(t *testing.T) {
-	srv := makeTestServerMixed()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/overview?window=10m", nil)
-	w := httptest.NewRecorder()
-	srv.Overview(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp map[string]any
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	totalReqs := int(resp["total_requests"].(float64))
-	totalFails := int(resp["total_failures"].(float64))
-	errorRate := resp["error_rate"].(float64)
-
-	if totalReqs != 2 {
-		t.Errorf("total_requests = %d, want 2", totalReqs)
-	}
-	if totalFails != 1 {
-		t.Errorf("total_failures = %d, want 1", totalFails)
-	}
-	if errorRate != 50.0 {
-		t.Errorf("error_rate = %.1f, want 50.0", errorRate)
-	}
-}
-
-func TestOverview_TopErrors_UniquePerFailedRequest(t *testing.T) {
-	st := graphstore.NewStore()
-	b := build.NewBuilder()
-	traceID := "cccc0000dddd1111eeee2222ffff3333"
-
-	events := []event.WideEvent{
-		// First failure in request lifecycle.
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("PMT_502", "payment failed"),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		// Later propagated failure on gateway for the same request.
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("GW_DOWNSTREAM", "downstream checkout failed"),
-			testutil.WithTimestamp(time.Now().Add(-30*time.Second)),
-		),
-	}
-
-	for _, ev := range events {
-		st.Merge(b.Build(ev))
-	}
-
-	srv := &Server{store: st, builder: b}
-	req := httptest.NewRequest(http.MethodGet, "/v1/overview?window=10m", nil)
-	w := httptest.NewRecorder()
-	srv.Overview(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		TotalFailures int `json:"total_failures"`
-		TopErrors     []struct {
-			Code  string `json:"code"`
-			Count int    `json:"count"`
-		} `json:"top_errors"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if resp.TotalFailures != 1 {
-		t.Fatalf("total_failures = %d, want 1", resp.TotalFailures)
-	}
-	if len(resp.TopErrors) != 1 {
-		t.Fatalf("top_errors len = %d, want 1 (one primary code per failed request)", len(resp.TopErrors))
-	}
-	if resp.TopErrors[0].Code != "PMT_502" {
-		t.Fatalf("top_errors[0].code = %q, want PMT_502", resp.TopErrors[0].Code)
-	}
-	if resp.TopErrors[0].Count != 1 {
-		t.Fatalf("top_errors[0].count = %d, want 1", resp.TopErrors[0].Count)
-	}
-}
-
-func TestTraceStory_SuccessTrace(t *testing.T) {
-	srv := makeTestServerMixed()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/traces/story?trace_id="+successTrace, nil)
-	w := httptest.NewRecorder()
-	srv.TraceStory(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Story struct {
-			TraceID      string `json:"trace_id"`
-			Success      bool   `json:"success"`
-			HopCount     int    `json:"hop_count"`
-			FirstFailHop *any   `json:"first_fail_hop"`
-		} `json:"story"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	if resp.Story.TraceID != successTrace {
-		t.Errorf("trace_id = %q, want %q", resp.Story.TraceID, successTrace)
-	}
-	if !resp.Story.Success {
-		t.Error("expected success=true for all-200 trace")
-	}
-	if resp.Story.HopCount != 3 {
-		t.Errorf("hop_count = %d, want 3", resp.Story.HopCount)
-	}
-	if resp.Story.FirstFailHop != nil {
-		t.Error("expected first_fail_hop to be nil for success trace")
-	}
-}
-
-func TestReadEndpoints_NoStore(t *testing.T) {
-	srv := NewServer(ServerConfig{})
-
-	t.Run("TraceStory", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/traces/story?trace_id="+testTrace, nil)
-		w := httptest.NewRecorder()
-		srv.TraceStory(w, req)
-		if w.Code != http.StatusServiceUnavailable {
-			t.Errorf("expected 503, got %d", w.Code)
-		}
-	})
-
-	t.Run("RecentTraces", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/traces/recent", nil)
-		w := httptest.NewRecorder()
-		srv.RecentTraces(w, req)
-		if w.Code != http.StatusServiceUnavailable {
-			t.Errorf("expected 503, got %d", w.Code)
-		}
-	})
-
-	t.Run("Overview", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/overview", nil)
-		w := httptest.NewRecorder()
-		srv.Overview(w, req)
-		if w.Code != http.StatusServiceUnavailable {
-			t.Errorf("expected 503, got %d", w.Code)
-		}
-	})
-}
-
-func TestEventSearch_NoFilter(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore(), EventLogDir: t.TempDir()})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/events/search", nil)
-	w := httptest.NewRecorder()
-	srv.EventSearch(w, req)
-
-	if w.Code != http.StatusBadRequest {
-		t.Errorf("expected 400 for no filters, got %d", w.Code)
-	}
-}
-
-func TestEventSearch_NoEventLog(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/events/search?service=x", nil)
-	w := httptest.NewRecorder()
-	srv.EventSearch(w, req)
-
-	if w.Code != http.StatusServiceUnavailable {
-		t.Errorf("expected 503, got %d", w.Code)
-	}
-}
-
-func TestEventSearch_WithResults(t *testing.T) {
-	dir := t.TempDir()
-
-	// Write test events directly
-	w2, err := newTestEventLog(dir)
-	if err != nil {
-		t.Fatal(err)
-	}
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(testTrace),
-		testutil.WithService("checkout"),
-		testutil.WithStatusCode(200),
-	)
-	if err := w2.Write(&ev, true); err != nil {
-		t.Fatal(err)
-	}
-	w2.Close()
-
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore(), EventLogDir: dir})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/events/search?service=checkout&limit=5", nil)
-	rec := httptest.NewRecorder()
-	srv.EventSearch(rec, req)
-
-	if rec.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", rec.Code, rec.Body.String())
-	}
-
-	var resp struct {
-		Events []event.WideEvent `json:"events"`
-		Count  int               `json:"count"`
-	}
-	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if resp.Count != 1 {
-		t.Errorf("expected count=1, got %d", resp.Count)
-	}
-	if len(resp.Events) != 1 {
-		t.Errorf("expected 1 event, got %d", len(resp.Events))
-	}
-}
-
 func newTestEventLog(dir string) (*eventlog.Writer, error) {
 	return eventlog.New(dir)
 }
 
-func TestEventSearch_BadStartReturns400(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore(), EventLogDir: t.TempDir()})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/events/search?service=x&start=garbage", nil)
-	w := httptest.NewRecorder()
-	srv.EventSearch(w, req)
-
-	if w.Code != http.StatusBadRequest {
-		t.Errorf("expected 400 for bad start, got %d", w.Code)
-	}
-}
-
-func TestEventSearch_BadEndReturns400(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore(), EventLogDir: t.TempDir()})
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/events/search?service=x&end=not-a-date", nil)
-	w := httptest.NewRecorder()
-	srv.EventSearch(w, req)
-
-	if w.Code != http.StatusBadRequest {
-		t.Errorf("expected 400 for bad end, got %d", w.Code)
-	}
-}
-
 func TestLivez(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	req := httptest.NewRequest(http.MethodGet, "/livez", nil)
 	w := httptest.NewRecorder()
 	srv.Livez(w, req)
@@ -793,7 +201,7 @@ func TestLivez(t *testing.T) {
 }
 
 func TestReadyz_NotReady(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
 	w := httptest.NewRecorder()
 	srv.Readyz(w, req)
@@ -804,7 +212,7 @@ func TestReadyz_NotReady(t *testing.T) {
 }
 
 func TestReadyz_Ready(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	srv.SetReady()
 
 	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
@@ -820,7 +228,7 @@ func TestReadyz_Ready(t *testing.T) {
 }
 
 func TestHealth_JSON(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	srv.SetReady()
 
 	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
@@ -841,43 +249,19 @@ func TestHealth_JSON(t *testing.T) {
 	if resp["ready"] != true {
 		t.Errorf("ready = %v, want true", resp["ready"])
 	}
-	for _, key := range []string{"status", "uptime", "ready", "store", "event_log", "replay"} {
+	for _, key := range []string{"status", "uptime", "ready", "event_log", "replay"} {
 		if _, ok := resp[key]; !ok {
 			t.Errorf("missing key %q", key)
 		}
 	}
-	storeInfo := resp["store"].(map[string]any)
-	if storeInfo["configured"] != true {
-		t.Errorf("store.configured = %v, want true", storeInfo["configured"])
-	}
 	replayInfo := resp["replay"].(map[string]any)
 	if replayInfo["status"] != "none" {
-		t.Errorf("replay.status = %q, want 'none'", replayInfo["status"])
-	}
-}
-
-func TestHealth_Degraded(t *testing.T) {
-	srv := NewServer(ServerConfig{})
-
-	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
-	w := httptest.NewRecorder()
-	srv.Health(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d", w.Code)
-	}
-
-	var resp map[string]any
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if resp["status"] != "degraded" {
-		t.Errorf("status = %q, want 'degraded'", resp["status"])
+		t.Errorf("replay.status = %q, want 'none'", replayInfo["status"])
 	}
 }
 
 func TestHealth_ReplaySuccess(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	srv.SetReplayResult(nil)
 	srv.SetReady()
 
@@ -907,7 +291,7 @@ func TestHealth_ReplaySuccess(t *testing.T) {
 }
 
 func TestHealth_ReplayFailed(t *testing.T) {
-	srv := NewServer(ServerConfig{Store: graphstore.NewStore()})
+	srv := NewServer(ServerConfig{})
 	srv.SetReplayResult(errors.New("corrupt eventlog"))
 	srv.SetReady()
 
@@ -939,500 +323,10 @@ func TestHealth_ReplayFailed(t *testing.T) {
 	}
 }
 
-func TestOverview_ErrorRateFromPresamplingCounters(t *testing.T) {
-	srv := NewServer(ServerConfig{
-		Store:   graphstore.NewStore(),
-		Sampler: keepAllSampler(),
-	})
-
-	// Seed 4 events into the graph and pre-sampling counters: 3 success + 1 error.
-	makeEvent := func(traceID string, success bool, code int, errCode string) event.WideEvent {
-		ev := testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithService("svc"),
-			testutil.WithStatusCode(code),
-		)
-		if !success {
-			ev.Outcome.Success = false
-			ev.Error = &event.ErrorContext{Code: errCode, Message: "fail"}
-			ev.EventName = "svc.error"
-		}
-		return ev
-	}
-
-	for _, ev := range []event.WideEvent{
-		makeEvent("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa1", true, 200, ""),
-		makeEvent("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa2", true, 200, ""),
-		makeEvent("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa3", true, 200, ""),
-		makeEvent("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4", false, 500, "ERR_X"),
-	} {
-		result := srv.builder.BuildResult(ev)
-		srv.store.Merge(result.Graph)
-		srv.counters.Inc(!ev.Outcome.Success)
-		if result.Span != nil {
-			srv.traceStore.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-		}
-	}
-
-	// Overview should use pre-sampling counters: 1/4 = 25%.
-	req := httptest.NewRequest(http.MethodGet, "/v1/overview?window=10m", nil)
-	w := httptest.NewRecorder()
-	srv.Overview(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp map[string]any
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	errorRate := resp["error_rate"].(float64)
-	if errorRate != 25.0 {
-		t.Errorf("error_rate = %.1f, want 25.0 (from pre-sampling counters)", errorRate)
-	}
-}
-
 func keepAllSampler() *sampler.Sampler {
 	return sampler.New(sampler.Config{HappySampleRatePct: 100})
 }
 
-func TestOverviewTimeseries(t *testing.T) {
-	srv := makeTestServer()
-
-	t.Run("success", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/overview/timeseries?window=10m&step=5m", nil)
-		w := httptest.NewRecorder()
-		srv.OverviewTimeseries(w, req)
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		var resp struct {
-			Sampled bool `json:"sampled"`
-			Buckets []struct {
-				Start     string  `json:"start"`
-				End       string  `json:"end"`
-				Total     int     `json:"total"`
-				Failures  int     `json:"failures"`
-				ErrorRate float64 `json:"error_rate"`
-				Status2xx int     `json:"status_2xx"`
-				Status4xx int     `json:"status_4xx"`
-				Status5xx int     `json:"status_5xx"`
-				P50       int64   `json:"p50"`
-				P95       int64   `json:"p95"`
-				P99       int64   `json:"p99"`
-			} `json:"buckets"`
-		}
-		if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-			t.Fatalf("invalid json: %v", err)
-		}
-		if len(resp.Buckets) != 2 {
-			t.Fatalf("expected 2 buckets, got %d", len(resp.Buckets))
-		}
-		// Should have at least one request across the buckets.
-		totalReqs := 0
-		for _, b := range resp.Buckets {
-			totalReqs += b.Total
-		}
-		if totalReqs < 1 {
-			t.Errorf("expected at least 1 request across buckets, got %d", totalReqs)
-		}
-	})
-
-	t.Run("guardrail_window_too_large", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/overview/timeseries?window=48h", nil)
-		w := httptest.NewRecorder()
-		srv.OverviewTimeseries(w, req)
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected 400, got %d", w.Code)
-		}
-	})
-
-	t.Run("guardrail_step_too_small", func(t *testing.T) {
-		req := httptest.NewRequest(http.MethodGet, "/v1/overview/timeseries?step=5s", nil)
-		w := httptest.NewRecorder()
-		srv.OverviewTimeseries(w, req)
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected 400, got %d", w.Code)
-		}
-	})
-}
-
-func TestRoutes(t *testing.T) {
-	st := graphstore.NewStore()
-	b := build.NewBuilder()
-
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithTraceID("aaaa0000bbbb1111cccc2222dddd0001"),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(40),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID("aaaa0000bbbb1111cccc2222dddd0002"),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(60),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID("aaaa0000bbbb1111cccc2222dddd0003"),
-			testutil.WithSpanID("3333333333333333"),
-			testutil.WithService("checkout"),
-			testutil.WithEventName("checkout.request"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("CHK_502", "checkout failed"),
-			testutil.WithLatency(100),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-	}
-
-	for _, ev := range events {
-		st.Merge(b.Build(ev))
-	}
-
-	srv := &Server{store: st, builder: b}
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/routes?window=10m", nil)
-	w := httptest.NewRecorder()
-	srv.Routes(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Sampled bool `json:"sampled"`
-		Routes  []struct {
-			Service      string  `json:"service"`
-			Route        string  `json:"route"`
-			Invocations  int     `json:"invocations"`
-			Errors       int     `json:"errors"`
-			ErrorRate    float64 `json:"error_rate"`
-			Status2xx    int     `json:"status_2xx"`
-			Status4xx    int     `json:"status_4xx"`
-			Status5xx    int     `json:"status_5xx"`
-			P75LatencyMs int64   `json:"p75_latency_ms"`
-		} `json:"routes"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	if len(resp.Routes) < 2 {
-		t.Fatalf("expected at least 2 routes, got %d", len(resp.Routes))
-	}
-
-	// First route should be api-gateway (2 invocations) sorted by invocations desc.
-	if resp.Routes[0].Service != "api-gateway" {
-		t.Errorf("first route service = %q, want api-gateway", resp.Routes[0].Service)
-	}
-	if resp.Routes[0].Invocations != 2 {
-		t.Errorf("api-gateway invocations = %d, want 2", resp.Routes[0].Invocations)
-	}
-	if resp.Routes[0].Status2xx != 2 {
-		t.Errorf("api-gateway status_2xx = %d, want 2", resp.Routes[0].Status2xx)
-	}
-
-	// Second route: checkout with 1 error.
-	found := false
-	for _, r := range resp.Routes {
-		if r.Service == "checkout" {
-			found = true
-			if r.Invocations != 1 {
-				t.Errorf("checkout invocations = %d, want 1", r.Invocations)
-			}
-			if r.Errors != 1 {
-				t.Errorf("checkout errors = %d, want 1", r.Errors)
-			}
-			if r.ErrorRate != 100.0 {
-				t.Errorf("checkout error_rate = %.1f, want 100.0", r.ErrorRate)
-			}
-			if r.Status5xx != 1 {
-				t.Errorf("checkout status_5xx = %d, want 1", r.Status5xx)
-			}
-		}
-	}
-	if !found {
-		t.Error("checkout route not found")
-	}
-}
-
-func TestRoutes_FailuresOnly(t *testing.T) {
-	st := graphstore.NewStore()
-	b := build.NewBuilder()
-
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithTraceID("aaaa0000bbbb1111cccc2222dddd1010"),
-			testutil.WithSpanID("1010101010101010"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(40),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID("aaaa0000bbbb1111cccc2222dddd1011"),
-			testutil.WithSpanID("1111111111111011"),
-			testutil.WithService("checkout"),
-			testutil.WithEventName("checkout.request"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("CHK_502", "checkout failed"),
-			testutil.WithLatency(80),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		),
-	}
-	for _, ev := range events {
-		st.Merge(b.Build(ev))
-	}
-
-	srv := &Server{store: st, builder: b}
-	req := httptest.NewRequest(http.MethodGet, "/v1/routes?window=10m&failures_only=true", nil)
-	w := httptest.NewRecorder()
-	srv.Routes(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Routes []struct {
-			Service string `json:"service"`
-			Route   string `json:"route"`
-		} `json:"routes"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if len(resp.Routes) != 1 {
-		t.Fatalf("expected exactly 1 failed route, got %d", len(resp.Routes))
-	}
-	if resp.Routes[0].Service != "checkout" {
-		t.Fatalf("service = %q, want checkout", resp.Routes[0].Service)
-	}
-}
-
-func TestRoutes_RootServiceAttribution(t *testing.T) {
-	traceID := "aaaa0000bbbb1111cccc2222dddd9999"
-
-	t.Run("root_arrives_later", func(t *testing.T) {
-		st := graphstore.NewStore()
-		b := build.NewBuilder()
-
-		// Child span arrives first — service=payment, not the root.
-		child := testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithService("payment"),
-			testutil.WithEventName("payment.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(10),
-			testutil.WithTimestamp(time.Now().Add(-2*time.Minute)),
-		)
-		st.Merge(b.Build(child))
-
-		// Root span arrives later — service=api-gateway.
-		root := testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(50),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		)
-		st.Merge(b.Build(root))
-
-		srv := &Server{store: st, builder: b}
-		req := httptest.NewRequest(http.MethodGet, "/v1/routes?window=10m", nil)
-		w := httptest.NewRecorder()
-		srv.Routes(w, req)
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		var resp struct {
-			Routes []struct {
-				Service string `json:"service"`
-				Route   string `json:"route"`
-			} `json:"routes"`
-		}
-		if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-			t.Fatalf("invalid json: %v", err)
-		}
-		if len(resp.Routes) != 1 {
-			t.Fatalf("expected 1 route, got %d", len(resp.Routes))
-		}
-		if resp.Routes[0].Service != "api-gateway" {
-			t.Errorf("service = %q, want api-gateway (root_service)", resp.Routes[0].Service)
-		}
-	})
-
-	t.Run("root_absent_fallback", func(t *testing.T) {
-		st := graphstore.NewStore()
-		b := build.NewBuilder()
-
-		// Only a child span — no root span arrives.
-		child := testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithService("payment"),
-			testutil.WithEventName("payment.request"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(10),
-			testutil.WithTimestamp(time.Now().Add(-1*time.Minute)),
-		)
-		st.Merge(b.Build(child))
-
-		srv := &Server{store: st, builder: b}
-		req := httptest.NewRequest(http.MethodGet, "/v1/routes?window=10m", nil)
-		w := httptest.NewRecorder()
-		srv.Routes(w, req)
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		var resp struct {
-			Routes []struct {
-				Service string `json:"service"`
-				Route   string `json:"route"`
-			} `json:"routes"`
-		}
-		if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-			t.Fatalf("invalid json: %v", err)
-		}
-		if len(resp.Routes) != 1 {
-			t.Fatalf("expected 1 route, got %d", len(resp.Routes))
-		}
-		// Fallback: derived from event_name prefix.
-		if resp.Routes[0].Service != "payment" {
-			t.Errorf("service = %q, want payment (event_name fallback)", resp.Routes[0].Service)
-		}
-	})
-}
-
-func TestRoutes_GroupByMethodAndRoute(t *testing.T) {
-	st := graphstore.NewStore()
-	b := build.NewBuilder()
-
-	now := time.Now().Add(-1 * time.Minute)
-	events := []event.WideEvent{
-		// Same service + same route template, different methods -> separate groups.
-		testutil.MakeEvent(
-			testutil.WithTraceID("11110000bbbb1111cccc2222dddd0001"),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithHTTPMethod("GET"),
-			testutil.WithRouteTemplate("/users/{id}"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(20),
-			testutil.WithTimestamp(now),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID("11110000bbbb1111cccc2222dddd0002"),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithHTTPMethod("POST"),
-			testutil.WithRouteTemplate("/users/{id}"),
-			testutil.WithStatusCode(201),
-			testutil.WithLatency(35),
-			testutil.WithTimestamp(now),
-		),
-		// Same service + same method, different route template -> separate groups.
-		testutil.MakeEvent(
-			testutil.WithTraceID("11110000bbbb1111cccc2222dddd0003"),
-			testutil.WithSpanID("3333333333333333"),
-			testutil.WithService("api-gateway"),
-			testutil.WithEventName("api-gateway.request"),
-			testutil.WithHTTPMethod("GET"),
-			testutil.WithRouteTemplate("/orders/{id}"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(18),
-			testutil.WithTimestamp(now),
-		),
-		// Legacy event: no method/template -> UNKNOWN + event_name fallback.
-		testutil.MakeEvent(
-			testutil.WithTraceID("11110000bbbb1111cccc2222dddd0004"),
-			testutil.WithSpanID("4444444444444444"),
-			testutil.WithService("checkout"),
-			testutil.WithEventName("checkout.request"),
-			testutil.WithStatusCode(502),
-			testutil.WithError("CHK_502", "checkout failed"),
-			testutil.WithLatency(90),
-			testutil.WithTimestamp(now),
-		),
-	}
-
-	for _, ev := range events {
-		st.Merge(b.Build(ev))
-	}
-
-	srv := &Server{store: st, builder: b}
-	req := httptest.NewRequest(http.MethodGet, "/v1/routes?window=10m&limit=10", nil)
-	w := httptest.NewRecorder()
-	srv.Routes(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Routes []struct {
-			Service       string `json:"service"`
-			Method        string `json:"method"`
-			RouteTemplate string `json:"route_template"`
-			Route         string `json:"route"`
-			Invocations   int    `json:"invocations"`
-		} `json:"routes"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-	if len(resp.Routes) < 4 {
-		t.Fatalf("expected at least 4 grouped routes, got %d", len(resp.Routes))
-	}
-
-	seen := map[string]bool{}
-	for _, r := range resp.Routes {
-		key := r.Service + "|" + r.Method + "|" + r.RouteTemplate
-		seen[key] = true
-		if r.Route != r.RouteTemplate {
-			t.Errorf("route alias mismatch: route=%q route_template=%q", r.Route, r.RouteTemplate)
-		}
-	}
-
-	if !seen["api-gateway|GET|/users/{id}"] {
-		t.Error("missing group api-gateway|GET|/users/{id}")
-	}
-	if !seen["api-gateway|POST|/users/{id}"] {
-		t.Error("missing group api-gateway|POST|/users/{id}")
-	}
-	if !seen["api-gateway|GET|/orders/{id}"] {
-		t.Error("missing group api-gateway|GET|/orders/{id}")
-	}
-	if !seen["checkout|UNKNOWN|checkout.error"] {
-		t.Error("missing legacy fallback group checkout|UNKNOWN|checkout.error")
-	}
-}
-
 func gatherMap(families []*dto.MetricFamily) map[string]*dto.MetricFamily {
 	m := make(map[string]*dto.MetricFamily, len(families))
 	for _, f := range families {
@@ -1461,212 +355,8 @@ func histogramCount(mf *dto.MetricFamily) uint64 {
 	return 0
 }
 
-func TestGraphTopology(t *testing.T) {
-	// makeTestServer creates: api-gateway -> checkout -> payment (PMT_502)
-	// Span nodes have caller_service attrs set for checkout and payment.
-	srv := makeTestServer()
-
-	req := httptest.NewRequest(http.MethodGet, "/v1/graph/topology?window=1h", nil)
-	w := httptest.NewRecorder()
-	srv.GraphTopology(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var resp struct {
-		Nodes []struct {
-			Data struct {
-				ID          string  `json:"id"`
-				Label       string  `json:"label"`
-				Type        string  `json:"type"`
-				Invocations int     `json:"invocations"`
-				Errors      int     `json:"errors"`
-				ErrorRate   float64 `json:"error_rate"`
-			} `json:"data"`
-		} `json:"nodes"`
-		Edges []struct {
-			Data struct {
-				Source string `json:"source"`
-				Target string `json:"target"`
-				Label  string `json:"label"`
-				Count  int    `json:"count"`
-			} `json:"data"`
-		} `json:"edges"`
-	}
-	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-		t.Fatalf("invalid json: %v", err)
-	}
-
-	// Should have service nodes (at least api-gateway, checkout, payment)
-	if len(resp.Nodes) < 3 {
-		t.Errorf("expected at least 3 nodes, got %d", len(resp.Nodes))
-	}
-	for _, n := range resp.Nodes {
-		if n.Data.Type != "service" {
-			t.Errorf("node %q has type %q, want service", n.Data.ID, n.Data.Type)
-		}
-		if n.Data.Label == "" {
-			t.Errorf("node %q has empty label", n.Data.ID)
-		}
-	}
-
-	// Should have edges from span caller_service -> service
-	if len(resp.Edges) < 2 {
-		t.Errorf("expected at least 2 edges, got %d", len(resp.Edges))
-	}
-	for _, e := range resp.Edges {
-		if e.Data.Source == "" || e.Data.Target == "" {
-			t.Errorf("edge has empty source or target")
-		}
-		if e.Data.Count < 1 {
-			t.Errorf("edge %s->%s has count %d, want >= 1", e.Data.Source, e.Data.Target, e.Data.Count)
-		}
-	}
-
-	// Error attribution: payment (status 502, success=false) should carry the
-	// error, NOT the root service api-gateway.
-	nodeByID := map[string]struct {
-		Invocations int
-		Errors      int
-		ErrorRate   float64
-	}{}
-	for _, n := range resp.Nodes {
-		nodeByID[n.Data.ID] = struct {
-			Invocations int
-			Errors      int
-			ErrorRate   float64
-		}{n.Data.Invocations, n.Data.Errors, n.Data.ErrorRate}
-	}
-	if pmt, ok := nodeByID["payment"]; !ok {
-		t.Error("payment node missing")
-	} else if pmt.Errors == 0 {
-		t.Errorf("payment errors = 0, want > 0 (failure should be attributed to originating service)")
-	}
-	// api-gateway's own span succeeded (200) — it must not inherit downstream errors.
-	gw, ok := nodeByID["api-gateway"]
-	if !ok {
-		t.Fatal("api-gateway node missing")
-	}
-	if gw.Errors != 0 {
-		t.Errorf("api-gateway errors = %d, want 0 (downstream failures must not inflate root service)", gw.Errors)
-	}
-}
-
-func TestGraphTopology_MethodNotAllowed(t *testing.T) {
-	srv := makeTestServer()
-	req := httptest.NewRequest(http.MethodPost, "/v1/graph/topology", nil)
-	w := httptest.NewRecorder()
-	srv.GraphTopology(w, req)
-	if w.Code != http.StatusMethodNotAllowed {
-		t.Fatalf("expected 405, got %d", w.Code)
-	}
-}
-
-func TestGraphTopology_NoStore(t *testing.T) {
-	srv := &Server{}
-	req := httptest.NewRequest(http.MethodGet, "/v1/graph/topology?window=5m", nil)
-	w := httptest.NewRecorder()
-	srv.GraphTopology(w, req)
-	if w.Code != http.StatusServiceUnavailable {
-		t.Fatalf("expected 503, got %d", w.Code)
-	}
-}
-
-func TestGraphTopology_WindowClamped(t *testing.T) {
-	srv := makeTestServer()
-	// Request a 48h window — should be clamped to 24h and still work.
-	req := httptest.NewRequest(http.MethodGet, "/v1/graph/topology?window=48h", nil)
-	w := httptest.NewRecorder()
-	srv.GraphTopology(w, req)
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-}
-
-func TestCapabilities_GraphFlagAndArchitecture(t *testing.T) {
+func TestCapabilities_Architecture(t *testing.T) {
 	t.Setenv("GRAPH_HOT_WINDOW", "90m")
-	t.Setenv("GRAPH_RETENTION", "24h")
-
-	tests := []struct {
-		name    string
-		graphUI bool
-		want    bool
-	}{
-		{"disabled", false, false},
-		{"enabled", true, true},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			srv := NewServer(ServerConfig{GraphUI: tt.graphUI})
-
-			req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
-			w := httptest.NewRecorder()
-			srv.Capabilities(w, req)
-
-			if w.Code != http.StatusOK {
-				t.Fatalf("expected 200, got %d", w.Code)
-			}
-
-			var resp map[string]any
-			if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-				t.Fatalf("invalid json: %v", err)
-			}
-			if got := resp["graph"]; got != tt.want {
-				t.Fatalf("graph = %v, want %v", got, tt.want)
-			}
-
-			arch, ok := resp["architecture"].(map[string]any)
-			if !ok {
-				t.Fatalf("missing architecture capability block: %#v", resp["architecture"])
-			}
-			if flattened, ok := arch["flattened"].(bool); !ok || !flattened {
-				t.Fatalf("architecture.flattened = %v, want true", arch["flattened"])
-			}
-			traceStore, ok := arch["trace_store"].(map[string]any)
-			if !ok {
-				t.Fatalf("missing architecture.trace_store block: %#v", arch["trace_store"])
-			}
-			if enabled, ok := traceStore["enabled"].(bool); !ok || !enabled {
-				t.Fatalf("architecture.trace_store.enabled = %v, want true", traceStore["enabled"])
-			}
-			graph, ok := arch["graph"].(map[string]any)
-			if !ok {
-				t.Fatalf("missing architecture.graph block: %#v", arch["graph"])
-			}
-			nodes, ok := graph["nodes"].([]any)
-			if !ok {
-				t.Fatalf("architecture.graph.nodes has unexpected type %T", graph["nodes"])
-			}
-			if len(nodes) != 3 {
-				t.Fatalf("architecture.graph.nodes len = %d, want 3", len(nodes))
-			}
-			if nodes[0] != "request" || nodes[1] != "service" || nodes[2] != "error" {
-				t.Fatalf("architecture.graph.nodes = %#v, want [request service error]", nodes)
-			}
-			hotWindow, ok := arch["hot_window"].(map[string]any)
-			if !ok {
-				t.Fatalf("missing architecture.hot_window block: %#v", arch["hot_window"])
-			}
-			if enabled, ok := hotWindow["enabled"].(bool); !ok || !enabled {
-				t.Fatalf("architecture.hot_window.enabled = %v, want true", hotWindow["enabled"])
-			}
-			if source, ok := hotWindow["source"].(string); !ok || source != "GRAPH_HOT_WINDOW" {
-				t.Fatalf("architecture.hot_window.source = %v, want GRAPH_HOT_WINDOW", hotWindow["source"])
-			}
-			if duration, ok := hotWindow["duration"].(string); !ok || duration != "1h30m0s" {
-				t.Fatalf("architecture.hot_window.duration = %v, want 1h30m0s", hotWindow["duration"])
-			}
-			if secs, ok := hotWindow["duration_secs"].(float64); !ok || int64(secs) != 5400 {
-				t.Fatalf("architecture.hot_window.duration_secs = %v, want 5400", hotWindow["duration_secs"])
-			}
-		})
-	}
-}
-
-func TestCapabilities_HotWindowFallbackToRetention(t *testing.T) {
-	t.Setenv("GRAPH_HOT_WINDOW", "")
-	t.Setenv("GRAPH_RETENTION", "2h")
 
 	srv := NewServer(ServerConfig{})
 
@@ -1682,29 +372,39 @@ func TestCapabilities_HotWindowFallbackToRetention(t *testing.T) {
 	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
 		t.Fatalf("invalid json: %v", err)
 	}
+	if _, present := resp["graph"]; present {
+		t.Fatalf("capabilities.graph field should be removed, got %#v", resp["graph"])
+	}
+
 	arch, ok := resp["architecture"].(map[string]any)
 	if !ok {
 		t.Fatalf("missing architecture capability block: %#v", resp["architecture"])
 	}
+	if flattened, ok := arch["flattened"].(bool); !ok || !flattened {
+		t.Fatalf("architecture.flattened = %v, want true", arch["flattened"])
+	}
 	hotWindow, ok := arch["hot_window"].(map[string]any)
 	if !ok {
 		t.Fatalf("missing architecture.hot_window block: %#v", arch["hot_window"])
 	}
-	if source, ok := hotWindow["source"].(string); !ok || source != "GRAPH_RETENTION" {
-		t.Fatalf("architecture.hot_window.source = %v, want GRAPH_RETENTION", hotWindow["source"])
+	if enabled, ok := hotWindow["enabled"].(bool); !ok || !enabled {
+		t.Fatalf("architecture.hot_window.enabled = %v, want true", hotWindow["enabled"])
 	}
-	if duration, ok := hotWindow["duration"].(string); !ok || duration != "2h0m0s" {
-		t.Fatalf("architecture.hot_window.duration = %v, want 2h0m0s", hotWindow["duration"])
+	if source, ok := hotWindow["source"].(string); !ok || source != "GRAPH_HOT_WINDOW" {
+		t.Fatalf("architecture.hot_window.source = %v, want GRAPH_HOT_WINDOW", hotWindow["source"])
 	}
-	if secs, ok := hotWindow["duration_secs"].(float64); !ok || int64(secs) != 7200 {
-		t.Fatalf("architecture.hot_window.duration_secs = %v, want 7200", hotWindow["duration_secs"])
+	if duration, ok := hotWindow["duration"].(string); !ok || duration != "1h30m0s" {
+		t.Fatalf("architecture.hot_window.duration = %v, want 1h30m0s", hotWindow["duration"])
+	}
+	if secs, ok := hotWindow["duration_secs"].(float64); !ok || int64(secs) != 5400 {
+		t.Fatalf("architecture.hot_window.duration_secs = %v, want 5400", hotWindow["duration_secs"])
 	}
 }
 
 // --- Agentic API fix tests ---
 
 func TestAsk_InvalidJSON_EnvelopeError(t *testing.T) {
-	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20}
+	srv := &Server{maxBodyBytes: 1 << 20}
 	r := httptest.NewRequest("POST", "/v1/ask?envelope=v2", strings.NewReader("{bad"))
 	r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
 	w := httptest.NewRecorder()
@@ -1727,9 +427,8 @@ func TestAsk_InvalidJSON_EnvelopeError(t *testing.T) {
 
 func TestToolCall_InvalidJSON_EnvelopeError(t *testing.T) {
 	reg := tools.NewRegistry()
-	tools.RegisterGraphTools(reg)
-	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg}
-	r := httptest.NewRequest("POST", "/v1/tools/graph_stats?envelope=v2", strings.NewReader("{bad"))
+	srv := &Server{maxBodyBytes: 1 << 20, askRegistry: reg}
+	r := httptest.NewRequest("POST", "/v1/tools/explain_request?envelope=v2", strings.NewReader("{bad"))
 	r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
 	w := httptest.NewRecorder()
 	srv.ToolCall(w, r)
@@ -1760,7 +459,7 @@ func TestPlanExecute_TriageTemplateExecutesAsPlan(t *testing.T) {
 				"snapshot":{"type":"boolean"}
 			}
 		}`),
-		Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			var got struct {
 				IncidentID string `json:"incident_id"`
 				Window     string `json:"window"`
@@ -1780,7 +479,7 @@ func TestPlanExecute_TriageTemplateExecutesAsPlan(t *testing.T) {
 		t.Fatalf("register: %v", err)
 	}
 	ps := NewPlanStore()
-	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg, planStore: ps}
+	srv := &Server{maxBodyBytes: 1 << 20, askRegistry: reg, planStore: ps}
 	body := `{"template":"triage","params":{"incident_id":"inc_abc","window":"15m","snapshot":true}}`
 	r := httptest.NewRequest(http.MethodPost, "/v1/plans/execute", strings.NewReader(body))
 	r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
@@ -1832,13 +531,13 @@ func TestPlanExecute_TemplateValidationErrors(t *testing.T) {
 		Name:        "triage_incident",
 		Description: "test triage",
 		InputSchema: json.RawMessage(`{"type":"object","required":["incident_id"],"properties":{"incident_id":{"type":"string"}}}`),
-		Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			return map[string]string{"ok": "true"}, nil
 		},
 	}); err != nil {
 		t.Fatalf("register: %v", err)
 	}
-	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg}
+	srv := &Server{maxBodyBytes: 1 << 20, askRegistry: reg}
 	cases := map[string]string{
 		"unknown template":        `{"template":"bogus","params":{"incident_id":"inc_abc"}}`,
 		"missing incident id":     `{"template":"triage","params":{"snapshot":true}}`,
@@ -1867,7 +566,7 @@ func TestPlanExecute_TemplateValidationErrors(t *testing.T) {
 func TestAsk_DedupSafetyNet_PreservesActualStatus(t *testing.T) {
 	dc := NewDedupCache()
 	srv := &Server{
-		store:        graphstore.NewStore(),
+
 		maxBodyBytes: 1 << 20,
 		dedupCache:   dc,
 	}
@@ -1900,7 +599,7 @@ func TestAsk_MissingProviderMessageIsProviderAgnostic(t *testing.T) {
 	t.Setenv("GEMINI_API_KEY", "")
 	t.Setenv("GOOGLE_API_KEY", "")
 	srv := &Server{
-		store:        graphstore.NewStore(),
+
 		maxBodyBytes: 1 << 20,
 		dedupCache:   NewDedupCache(),
 	}
@@ -1929,9 +628,17 @@ func TestAsk_MissingProviderMessageIsProviderAgnostic(t *testing.T) {
 func TestToolCall_DedupSafetyNet_Exists(t *testing.T) {
 	dc := NewDedupCache()
 	reg := tools.NewRegistry()
-	tools.RegisterGraphTools(reg)
+	if err := reg.Register(tools.Tool{
+		Name:        "explain_request",
+		Description: "stub for dedup test",
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
+			return nil, fmt.Errorf("trace not found")
+		},
+	}); err != nil {
+		t.Fatal(err)
+	}
 	srv := &Server{
-		store:        graphstore.NewStore(),
+
 		maxBodyBytes: 1 << 20,
 		dedupCache:   dc,
 		askRegistry:  reg,
@@ -1958,7 +665,7 @@ func TestToolCall_DedupSafetyNet_Exists(t *testing.T) {
 
 func TestAsk_WaiterTimeout_Logs(t *testing.T) {
 	dc := NewDedupCache()
-	srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, dedupCache: dc}
+	srv := &Server{maxBodyBytes: 1 << 20, dedupCache: dc}
 
 	body := `{"prompt":"test"}`
 	// Acquire inflight slot manually to force a waiter
@@ -2037,7 +744,7 @@ func TestTools_MethodNotAllowed_EnvelopeError(t *testing.T) {
 func TestAsk_Metrics_CountedOnValidationFailure(t *testing.T) {
 	reg := prometheus.NewRegistry()
 	m := metrics.New(reg)
-	srv := &Server{metrics: m, store: graphstore.NewStore(), maxBodyBytes: 1 << 20}
+	srv := &Server{metrics: m, maxBodyBytes: 1 << 20}
 
 	// Send invalid JSON — should still count in AskRequestsTotal
 	req := httptest.NewRequest("POST", "/v1/ask", strings.NewReader("not json"))
@@ -2072,7 +779,7 @@ func TestAsk_Metrics_CountedOnValidationFailure(t *testing.T) {
 
 func TestAsk_Idempotency_NotEnforcedForValidationErrors(t *testing.T) {
 	srv := &Server{
-		store:        graphstore.NewStore(),
+
 		maxBodyBytes: 1 << 20,
 		dedupCache:   NewDedupCache(),
 	}
@@ -2096,64 +803,6 @@ func TestAsk_Idempotency_NotEnforcedForValidationErrors(t *testing.T) {
 	}
 }
 
-func TestTopology_ReturnsNodesAndEdges(t *testing.T) {
-	srv := makeTestServer()
-	req := httptest.NewRequest("GET", "/v1/topology?window=1h", nil)
-	w := httptest.NewRecorder()
-	srv.Topology(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-	var resp APIResponse
-	json.NewDecoder(w.Body).Decode(&resp)
-	data, ok := resp.Data.(map[string]any)
-	if !ok {
-		t.Fatalf("data should be object, got %T", resp.Data)
-	}
-	if _, ok := data["nodes"]; !ok {
-		t.Fatal("missing nodes field")
-	}
-	if _, ok := data["edges"]; !ok {
-		t.Fatal("missing edges field")
-	}
-}
-
-func TestBlastRadiusEndpoint_RequiresErrorCode(t *testing.T) {
-	srv := makeTestServer()
-	req := httptest.NewRequest("GET", "/v1/blast_radius", nil)
-	w := httptest.NewRecorder()
-	srv.BlastRadius(w, req)
-
-	if w.Code != http.StatusBadRequest {
-		t.Fatalf("expected 400, got %d", w.Code)
-	}
-}
-
-func TestBlastRadiusEndpoint_ReturnsResult(t *testing.T) {
-	srv := makeTestServer()
-	req := httptest.NewRequest("GET", "/v1/blast_radius?error_code=DB_TIMEOUT&window=1h", nil)
-	w := httptest.NewRecorder()
-	srv.BlastRadius(w, req)
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-}
-
-func TestOverview_IncludesLatestFailedTraceID(t *testing.T) {
-	srv := makeTestServer()
-	req := httptest.NewRequest("GET", "/v1/overview?window=1h", nil)
-	w := httptest.NewRecorder()
-	srv.Overview(w, req)
-
-	var resp map[string]any
-	json.NewDecoder(w.Body).Decode(&resp)
-	if _, ok := resp["latest_failed_trace_id"]; !ok {
-		t.Fatal("overview response missing latest_failed_trace_id field")
-	}
-}
-
 type stubAskProvider struct{}
 
 func (stubAskProvider) Generate(ctx context.Context, prompt string, tools []llm.ToolDefinition, history []llm.Turn) (llm.Result, error) {
diff --git a/internal/ingest/pipeline.go b/internal/ingest/pipeline.go
index d2dc98c..77eb3bd 100644
--- a/internal/ingest/pipeline.go
+++ b/internal/ingest/pipeline.go
@@ -9,21 +9,11 @@ import (
 
 	"github.com/sssmaran/WaylogCLI/internal/coldstore"
 	"github.com/sssmaran/WaylogCLI/internal/eventlog"
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
 	"github.com/sssmaran/WaylogCLI/internal/metrics"
 	"github.com/sssmaran/WaylogCLI/internal/sampler"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
 	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
-// Notifier is a narrow interface for post-batch notification.
-// Matches SSEHub.MarkDirty. nil-safe — Pipeline operates normally if nil.
-type Notifier interface {
-	MarkDirty(topics ...string)
-}
-
 // Validator is the per-event validation function the Pipeline applies before
 // any durable work. The default path uses event.WideEvent.Validate; specialized
 // callers may provide a narrower validator.
@@ -33,9 +23,6 @@ type Validator func(ev *event.WideEvent) error
 // Most fields are optional — the pipeline degrades gracefully when a
 // dependency is nil (e.g., no EventLog means no WAL write).
 type PipelineConfig struct {
-	Store      *store.Store
-	TraceStore *tracestore.Store
-	Builder    *build.Builder
 	Sampler    *sampler.Sampler
 	EventLog   *eventlog.Writer
 	ColdWriter *coldstore.BatchWriter
@@ -43,19 +30,13 @@ type PipelineConfig struct {
 	Counters   *unsampledCounters
 	Accepted   *atomic.Uint64
 	Metrics    *metrics.Metrics
-	Notifier   Notifier
 	Validator  Validator
 }
 
-// Pipeline is the schema-1.x ingest core for old graph-derived APIs. Order of
-// operations per event:
+// Pipeline is the durable-write ingest core. Order of operations per event:
 //
-//	validate → WAL → counters → cold store → deployment upsert → sample →
-//	build → merge graph + tracestore → notify (once per batch)
+//	validate → WAL → counters → cold store → deployment upsert → sample
 type Pipeline struct {
-	store      *store.Store
-	traceStore *tracestore.Store
-	builder    *build.Builder
 	sampler    *sampler.Sampler
 	eventLog   *eventlog.Writer
 	coldWriter *coldstore.BatchWriter
@@ -63,7 +44,6 @@ type Pipeline struct {
 	counters   *unsampledCounters
 	accepted   *atomic.Uint64
 	metrics    *metrics.Metrics
-	notifier   Notifier
 	validator  Validator
 }
 
@@ -85,9 +65,6 @@ type EventError struct {
 // NewPipeline creates a Pipeline from the given configuration.
 func NewPipeline(cfg PipelineConfig) *Pipeline {
 	return &Pipeline{
-		store:      cfg.Store,
-		traceStore: cfg.TraceStore,
-		builder:    cfg.Builder,
 		sampler:    cfg.Sampler,
 		eventLog:   cfg.EventLog,
 		coldWriter: cfg.ColdWriter,
@@ -95,7 +72,6 @@ func NewPipeline(cfg PipelineConfig) *Pipeline {
 		counters:   cfg.Counters,
 		accepted:   cfg.Accepted,
 		metrics:    cfg.Metrics,
-		notifier:   cfg.Notifier,
 		validator:  cfg.Validator,
 	}
 }
@@ -207,25 +183,6 @@ func (p *Pipeline) IngestBatch(ctx context.Context, events []*event.WideEvent) (
 			continue
 		}
 
-		// Build graph + span and merge into derived views.
-		if p.builder != nil {
-			mergeStart := time.Now()
-			br := p.builder.BuildResult(*ev)
-			if p.store != nil && br.Graph != nil {
-				p.store.Merge(br.Graph)
-			}
-			if p.traceStore != nil && br.Span != nil {
-				traceStart := time.Now()
-				p.traceStore.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), br.Span)
-				if p.metrics != nil {
-					p.metrics.TraceUpsertDuration.Observe(time.Since(traceStart).Seconds())
-				}
-			}
-			if p.metrics != nil {
-				p.metrics.MergeLatency.Observe(time.Since(mergeStart).Seconds())
-			}
-		}
-
 		if p.accepted != nil {
 			p.accepted.Add(1)
 		}
@@ -233,10 +190,6 @@ func (p *Pipeline) IngestBatch(ctx context.Context, events []*event.WideEvent) (
 		result.SampledInGraph++
 	}
 
-	if p.notifier != nil && result.SampledInGraph > 0 {
-		p.notifier.MarkDirty(TopicOverview, TopicRoutes, TopicTimeseries)
-	}
-
 	return result, nil
 }
 
diff --git a/internal/ingest/pipeline_test.go b/internal/ingest/pipeline_test.go
deleted file mode 100644
index 7b9398f..0000000
--- a/internal/ingest/pipeline_test.go
+++ /dev/null
@@ -1,249 +0,0 @@
-package ingest
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/sssmaran/WaylogCLI/internal/eventlog"
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/metrics"
-	"github.com/sssmaran/WaylogCLI/internal/sampler"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-func validSDKEvent() *event.WideEvent {
-	return &event.WideEvent{
-		SchemaVersion: event.SchemaVersion,
-		EventName:     "svc.request",
-		Timestamp:     time.Now(),
-		User:          event.UserContext{ID: "u1"},
-		Request:       event.RequestContext{TraceID: "aaaabbbbccccddddeeeeffffaaaabbbb", SpanID: "aaaabbbbccccdddd"},
-		System:        event.SystemContext{Service: "svc", Env: "prod"},
-		Outcome:       event.OutcomeContext{Success: true, StatusCode: 200},
-		Metrics:       event.MetricsContext{LatencyMs: 50},
-	}
-}
-
-func validOTLPEvent() *event.WideEvent {
-	ev := validSDKEvent()
-	ev.User.ID = "" // OTLP has no user
-	return ev
-}
-
-type countingNotifier struct{ calls int }
-
-func (n *countingNotifier) MarkDirty(topics ...string) { n.calls++ }
-
-func sdkPipeline(s *store.Store) *Pipeline {
-	return NewPipeline(PipelineConfig{
-		Store:     s,
-		Builder:   build.NewBuilder(),
-		Sampler:   sampler.New(sampler.Config{HappySampleRatePct: 100}),
-		Validator: func(ev *event.WideEvent) error { return ev.Validate() },
-	})
-}
-
-func TestValidateAndIngestBatch_AllValid(t *testing.T) {
-	s := store.NewStore()
-	p := sdkPipeline(s)
-	evs := []*event.WideEvent{validSDKEvent(), validSDKEvent()}
-	res, err := p.ValidateAndIngestBatch(context.Background(), evs)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if res.Accepted != 2 {
-		t.Errorf("expected 2 accepted, got %d", res.Accepted)
-	}
-	if res.Rejected != 0 {
-		t.Errorf("expected 0 rejected, got %d", res.Rejected)
-	}
-}
-
-func TestValidateAndIngestBatch_MixedValidInvalid(t *testing.T) {
-	s := store.NewStore()
-	p := sdkPipeline(s)
-	invalid := &event.WideEvent{} // empty, fails validation
-	evs := []*event.WideEvent{validSDKEvent(), invalid}
-	res, err := p.ValidateAndIngestBatch(context.Background(), evs)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if res.Accepted != 1 {
-		t.Errorf("expected 1 accepted, got %d", res.Accepted)
-	}
-	if res.Rejected != 1 {
-		t.Errorf("expected 1 rejected, got %d", res.Rejected)
-	}
-	if len(res.Errors) != 1 {
-		t.Fatalf("expected 1 error entry, got %d", len(res.Errors))
-	}
-	if res.Errors[0].Index != 1 {
-		t.Errorf("expected error at index 1, got %d", res.Errors[0].Index)
-	}
-}
-
-func TestValidateAndIngestBatch_OTLPValidator_EmptyUser(t *testing.T) {
-	s := store.NewStore()
-	p := NewPipeline(PipelineConfig{
-		Store:     s,
-		Builder:   build.NewBuilder(),
-		Sampler:   sampler.New(sampler.Config{HappySampleRatePct: 100}),
-		Validator: OTLPValidator,
-	})
-	evs := []*event.WideEvent{validOTLPEvent()}
-	res, err := p.ValidateAndIngestBatch(context.Background(), evs)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if res.Accepted != 1 {
-		t.Errorf("expected 1 accepted, got %d", res.Accepted)
-	}
-	if res.Rejected != 0 {
-		t.Errorf("expected 0 rejected, got %d", res.Rejected)
-	}
-}
-
-func TestValidateAndIngestBatch_OTLPValidator_OtherErrors(t *testing.T) {
-	s := store.NewStore()
-	p := NewPipeline(PipelineConfig{
-		Store:     s,
-		Builder:   build.NewBuilder(),
-		Sampler:   sampler.New(sampler.Config{HappySampleRatePct: 100}),
-		Validator: OTLPValidator,
-	})
-	ev := validOTLPEvent()
-	ev.System.Service = "" // missing service in addition to empty user
-	evs := []*event.WideEvent{ev}
-	res, err := p.ValidateAndIngestBatch(context.Background(), evs)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if res.Rejected != 1 {
-		t.Errorf("expected 1 rejected, got %d", res.Rejected)
-	}
-}
-
-func TestIngestBatch_NotifierCalledOnce(t *testing.T) {
-	s := store.NewStore()
-	n := &countingNotifier{}
-	p := NewPipeline(PipelineConfig{
-		Store:     s,
-		Builder:   build.NewBuilder(),
-		Sampler:   sampler.New(sampler.Config{HappySampleRatePct: 100}),
-		Validator: func(ev *event.WideEvent) error { return ev.Validate() },
-		Notifier:  n,
-	})
-	evs := []*event.WideEvent{validSDKEvent(), validSDKEvent(), validSDKEvent()}
-	if _, err := p.ValidateAndIngestBatch(context.Background(), evs); err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if n.calls != 1 {
-		t.Errorf("expected notifier called once, got %d", n.calls)
-	}
-}
-
-func TestIngestBatch_SamplingAccounting(t *testing.T) {
-	s := store.NewStore()
-	p := NewPipeline(PipelineConfig{
-		Store: s,
-		// No builder means graph merge is skipped, but sampler still runs.
-		// HappySampleRatePct=1 + a low-probability trace id won't reliably
-		// drop. Instead use a real builder and a deterministic drop-all config
-		// by setting the slow_ms threshold absurdly high and rate pct to 1
-		// then picking a trace id whose hash bucket != 0.
-		Builder:   build.NewBuilder(),
-		Sampler:   sampler.New(sampler.Config{HappySampleRatePct: 1, SlowMs: 10000, Salt: "deterministic"}),
-		Validator: func(ev *event.WideEvent) error { return ev.Validate() },
-	})
-	// Drive enough events with distinct trace ids so at least one lands
-	// outside bucket 0 and gets sampled out.
-	sampledOut := 0
-	for i := 0; i < 20; i++ {
-		ev := validSDKEvent()
-		ev.Request.TraceID = traceIDForIndex(i)
-		res, err := p.ValidateAndIngestBatch(context.Background(), []*event.WideEvent{ev})
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if res.Accepted != 1 {
-			t.Errorf("expected 1 accepted, got %d", res.Accepted)
-		}
-		sampledOut += res.SampledOut
-	}
-	if sampledOut == 0 {
-		t.Error("expected at least one sampled-out event across 20 distinct trace ids")
-	}
-}
-
-func TestIngestBatch_AcceptedMetricCountsDurableSampledOutEvents(t *testing.T) {
-	reg := prometheus.NewRegistry()
-	m := metrics.New(reg)
-	el, err := eventlog.NewWithConfig(t.TempDir(), eventlog.WriterConfig{})
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer el.Close()
-
-	p := NewPipeline(PipelineConfig{
-		Store:    store.NewStore(),
-		Builder:  build.NewBuilder(),
-		Sampler:  sampler.New(sampler.Config{HappySampleRatePct: 1, SlowMs: 10000, Salt: "deterministic"}),
-		EventLog: el,
-		Metrics:  m,
-		Validator: func(ev *event.WideEvent) error {
-			return ev.Validate()
-		},
-	})
-
-	accepted, sampledOut := 0, 0
-	for i := 0; i < 20; i++ {
-		ev := validSDKEvent()
-		ev.Request.TraceID = traceIDForIndex(i)
-		res, err := p.ValidateAndIngestBatch(context.Background(), []*event.WideEvent{ev})
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		accepted += res.Accepted
-		sampledOut += res.SampledOut
-	}
-	if sampledOut == 0 {
-		t.Fatal("test did not exercise sampled-out durable events")
-	}
-	if got := counterMetric(t, reg, "waylog_events_accepted_total"); got != float64(accepted) {
-		t.Fatalf("events_accepted=%v want %d", got, accepted)
-	}
-}
-
-func counterMetric(t *testing.T, reg *prometheus.Registry, name string) float64 {
-	t.Helper()
-	families, err := reg.Gather()
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, mf := range families {
-		if mf.GetName() != name {
-			continue
-		}
-		var total float64
-		for _, metric := range mf.GetMetric() {
-			if counter := metric.GetCounter(); counter != nil {
-				total += counter.GetValue()
-			}
-		}
-		return total
-	}
-	return 0
-}
-
-// traceIDForIndex generates a distinct 32-hex trace id for each index.
-func traceIDForIndex(i int) string {
-	hex := "0123456789abcdef"
-	out := make([]byte, 32)
-	for j := 0; j < 32; j++ {
-		out[j] = hex[(i+j)%16]
-	}
-	return string(out)
-}
diff --git a/internal/ingest/plan_test.go b/internal/ingest/plan_test.go
index b1effe9..167f1dc 100644
--- a/internal/ingest/plan_test.go
+++ b/internal/ingest/plan_test.go
@@ -8,11 +8,12 @@ import (
 	"github.com/sssmaran/WaylogCLI/internal/tools"
 )
 
-// setupTestRegistry builds a registry with three mock tools for testing.
-//
-//	graph_insights  — requires window (string); outputs total_failures, schema_version
-//	graph_failures  — optional limit (integer); outputs failures array with trace_id, error_code
-//	explain_request — requires trace_id (string); outputs verdict
+// setupTestRegistry builds a registry of three mock tools that exercise
+// distinct plan-executor shapes: time-windowed, paginated-list, single-id.
+// The first two have mock_-prefixed names because their schemas don't
+// match any real surviving tool. The third reuses the real explain_request
+// name (its input schema does mirror the real v2 contract) so ref-chain
+// tests look natural.
 func setupTestRegistry(t *testing.T) *tools.Registry {
 	t.Helper()
 	reg := tools.NewRegistry()
@@ -25,7 +26,7 @@ func setupTestRegistry(t *testing.T) *tools.Registry {
 	}
 
 	mustRegister(tools.Tool{
-		Name:        "graph_insights",
+		Name:        "mock_window_tool",
 		Description: "Graph insights over a time window",
 		InputSchema: json.RawMessage(`{
 			"type": "object",
@@ -42,13 +43,13 @@ func setupTestRegistry(t *testing.T) *tools.Registry {
 				"schema_version": {"type": "string"}
 			}
 		}`),
-		Handler: func(_ context.Context, _ tools.Store, _ json.RawMessage) (any, error) {
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
 			return map[string]any{"total_failures": 5, "schema_version": "1.0"}, nil
 		},
 	})
 
 	mustRegister(tools.Tool{
-		Name:        "graph_failures",
+		Name:        "mock_failures_tool",
 		Description: "List recent failures",
 		InputSchema: json.RawMessage(`{
 			"type": "object",
@@ -72,7 +73,7 @@ func setupTestRegistry(t *testing.T) *tools.Registry {
 				}
 			}
 		}`),
-		Handler: func(_ context.Context, _ tools.Store, _ json.RawMessage) (any, error) {
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
 			return map[string]any{"failures": []any{}}, nil
 		},
 	})
@@ -94,7 +95,7 @@ func setupTestRegistry(t *testing.T) *tools.Registry {
 				"verdict": {"type": "string"}
 			}
 		}`),
-		Handler: func(_ context.Context, _ tools.Store, _ json.RawMessage) (any, error) {
+		Handler: func(_ context.Context, _ json.RawMessage) (any, error) {
 			return map[string]any{"verdict": "ok"}, nil
 		},
 	})
@@ -115,7 +116,7 @@ func TestValidatePlan_Structural(t *testing.T) {
 	t.Run("too many steps", func(t *testing.T) {
 		steps := make([]PlanStep, 11)
 		for i := range steps {
-			steps[i] = PlanStep{ID: string(rune('a' + i)), Tool: "graph_insights", Params: json.RawMessage(`{"window":"10m"}`)}
+			steps[i] = PlanStep{ID: string(rune('a' + i)), Tool: "mock_window_tool", Params: json.RawMessage(`{"window":"10m"}`)}
 		}
 		errs := ValidatePlan(steps, reg)
 		if len(errs) == 0 {
@@ -125,8 +126,8 @@ func TestValidatePlan_Structural(t *testing.T) {
 
 	t.Run("duplicate IDs", func(t *testing.T) {
 		steps := []PlanStep{
-			{ID: "a", Tool: "graph_insights", Params: json.RawMessage(`{"window":"10m"}`)},
-			{ID: "a", Tool: "graph_failures", Params: json.RawMessage(`{}`)},
+			{ID: "a", Tool: "mock_window_tool", Params: json.RawMessage(`{"window":"10m"}`)},
+			{ID: "a", Tool: "mock_failures_tool", Params: json.RawMessage(`{}`)},
 		}
 		errs := ValidatePlan(steps, reg)
 		if len(errs) == 0 {
@@ -146,7 +147,7 @@ func TestValidatePlan_Structural(t *testing.T) {
 
 	t.Run("empty ID", func(t *testing.T) {
 		steps := []PlanStep{
-			{ID: "", Tool: "graph_insights", Params: json.RawMessage(`{"window":"10m"}`)},
+			{ID: "", Tool: "mock_window_tool", Params: json.RawMessage(`{"window":"10m"}`)},
 		}
 		errs := ValidatePlan(steps, reg)
 		if len(errs) == 0 {
@@ -167,7 +168,7 @@ func TestValidatePlan_ForwardRef(t *testing.T) {
 		},
 		{
 			ID:     "b",
-			Tool:   "graph_failures",
+			Tool:   "mock_failures_tool",
 			Params: json.RawMessage(`{}`),
 		},
 	}
@@ -199,11 +200,11 @@ func TestValidatePlan_SelfRef(t *testing.T) {
 func TestValidatePlan_InvalidRefPath(t *testing.T) {
 	reg := setupTestRegistry(t)
 
-	// Step "b" refs step "a".nonexistent_field which doesn't exist in graph_insights output.
+	// Step "b" refs step "a".nonexistent_field which doesn't exist in mock_window_tool output.
 	steps := []PlanStep{
 		{
 			ID:     "a",
-			Tool:   "graph_insights",
+			Tool:   "mock_window_tool",
 			Params: json.RawMessage(`{"window": "10m"}`),
 		},
 		{
@@ -222,12 +223,12 @@ func TestValidatePlan_InvalidRefPath(t *testing.T) {
 func TestValidatePlan_NonRefParamValidation(t *testing.T) {
 	reg := setupTestRegistry(t)
 
-	// graph_insights has additionalProperties: false.
+	// mock_window_tool has additionalProperties: false.
 	// "unknown_param" is not in its properties — should be rejected.
 	steps := []PlanStep{
 		{
 			ID:     "a",
-			Tool:   "graph_insights",
+			Tool:   "mock_window_tool",
 			Params: json.RawMessage(`{"window": "10m", "unknown_param": "bad"}`),
 		},
 	}
@@ -248,7 +249,7 @@ func TestValidatePlan_RefFieldSkippedForInputValidation(t *testing.T) {
 	steps := []PlanStep{
 		{
 			ID:     "a",
-			Tool:   "graph_failures",
+			Tool:   "mock_failures_tool",
 			Params: json.RawMessage(`{}`),
 		},
 		{
@@ -270,12 +271,12 @@ func TestValidatePlan_Valid(t *testing.T) {
 	steps := []PlanStep{
 		{
 			ID:     "insights",
-			Tool:   "graph_insights",
+			Tool:   "mock_window_tool",
 			Params: json.RawMessage(`{"window": "10m"}`),
 		},
 		{
 			ID:     "failures",
-			Tool:   "graph_failures",
+			Tool:   "mock_failures_tool",
 			Params: json.RawMessage(`{"limit": 5}`),
 		},
 		{
@@ -323,7 +324,7 @@ func TestExpandPlanRequest_RejectsInvalidTemplateInput(t *testing.T) {
 	params := json.RawMessage(`{"incident_id":"inc_abc"}`)
 	cases := map[string]PlanExecuteRequest{
 		"both steps and template": {
-			Steps:    []PlanStep{{ID: "x", Tool: "graph_stats"}},
+			Steps:    []PlanStep{{ID: "x", Tool: "mock_stats_tool"}},
 			Template: "triage",
 			Params:   &params,
 		},
diff --git a/internal/ingest/sse.go b/internal/ingest/sse.go
deleted file mode 100644
index a9fd79e..0000000
--- a/internal/ingest/sse.go
+++ /dev/null
@@ -1,141 +0,0 @@
-package ingest
-
-import (
-	"errors"
-	"sync"
-)
-
-// SSE topic names.
-const (
-	TopicOverview    = "overview"
-	TopicTimeseries  = "timeseries"
-	TopicRoutes      = "routes"
-	TopicDeployments = "deployments"
-)
-
-// ErrMaxClients is returned by Subscribe when the hub is at capacity.
-var ErrMaxClients = errors.New("sse: max clients reached")
-
-// subscriber holds a notification channel and per-topic latest values.
-type subscriber struct {
-	ch     chan struct{}
-	mu     sync.Mutex
-	latest map[string][]byte // topic → most recent snapshot
-}
-
-// SSEHub is a pure pub/sub fan-out hub with per-subscriber coalescing.
-// It has no HTTP awareness. Every published value is a complete state
-// snapshot for its topic.
-type SSEHub struct {
-	mu        sync.RWMutex
-	subs      map[uint64]*subscriber
-	nextID    uint64
-	maxClient int
-
-	dirtyMu sync.Mutex
-	dirty   map[string]struct{}
-}
-
-// NewSSEHub creates a hub that allows at most maxClients concurrent subscribers.
-func NewSSEHub(maxClients int) *SSEHub {
-	return &SSEHub{
-		subs:      make(map[uint64]*subscriber),
-		maxClient: maxClients,
-		dirty:     make(map[string]struct{}),
-	}
-}
-
-// Subscribe registers a new subscriber. It returns the subscriber ID,
-// a notification channel (capacity 1), and an error if the hub is full.
-func (h *SSEHub) Subscribe() (uint64, <-chan struct{}, error) {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-
-	if len(h.subs) >= h.maxClient {
-		return 0, nil, ErrMaxClients
-	}
-
-	h.nextID++
-	id := h.nextID
-	ch := make(chan struct{}, 1)
-	h.subs[id] = &subscriber{
-		ch:     ch,
-		latest: make(map[string][]byte),
-	}
-	return id, ch, nil
-}
-
-// Unsubscribe removes a subscriber. Safe to call with an unknown ID.
-func (h *SSEHub) Unsubscribe(id uint64) {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-	delete(h.subs, id)
-}
-
-// Publish fans out data to all subscribers for the given topic.
-// Each subscriber's latest value for the topic is overwritten (coalesced),
-// and its notification channel is poked non-blocking.
-func (h *SSEHub) Publish(topic string, data []byte) {
-	h.mu.RLock()
-	defer h.mu.RUnlock()
-
-	for _, sub := range h.subs {
-		sub.mu.Lock()
-		sub.latest[topic] = data
-		sub.mu.Unlock()
-		// Non-blocking poke.
-		select {
-		case sub.ch <- struct{}{}:
-		default:
-		}
-	}
-}
-
-// Latest returns and clears all pending topic values for a subscriber.
-// Returns nil if the subscriber ID is unknown.
-func (h *SSEHub) Latest(id uint64) map[string][]byte {
-	h.mu.RLock()
-	defer h.mu.RUnlock()
-
-	sub, ok := h.subs[id]
-	if !ok {
-		return nil
-	}
-
-	sub.mu.Lock()
-	defer sub.mu.Unlock()
-
-	if len(sub.latest) == 0 {
-		return nil
-	}
-
-	out := sub.latest
-	sub.latest = make(map[string][]byte)
-	return out
-}
-
-// MarkDirty marks topics for recomputation by a ticker.
-func (h *SSEHub) MarkDirty(topics ...string) {
-	h.dirtyMu.Lock()
-	defer h.dirtyMu.Unlock()
-	for _, t := range topics {
-		h.dirty[t] = struct{}{}
-	}
-}
-
-// DrainDirty returns and clears the set of dirty topics.
-func (h *SSEHub) DrainDirty() []string {
-	h.dirtyMu.Lock()
-	defer h.dirtyMu.Unlock()
-
-	if len(h.dirty) == 0 {
-		return nil
-	}
-
-	out := make([]string, 0, len(h.dirty))
-	for t := range h.dirty {
-		out = append(out, t)
-	}
-	h.dirty = make(map[string]struct{})
-	return out
-}
diff --git a/internal/ingest/sse_handler.go b/internal/ingest/sse_handler.go
deleted file mode 100644
index bcca167..0000000
--- a/internal/ingest/sse_handler.go
+++ /dev/null
@@ -1,149 +0,0 @@
-package ingest
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"net/http"
-	"sync/atomic"
-	"time"
-)
-
-// writeSSEHeaders sets the standard headers for an SSE response.
-func writeSSEHeaders(w http.ResponseWriter) {
-	w.Header().Set("Content-Type", "text/event-stream")
-	w.Header().Set("Cache-Control", "no-cache")
-	w.Header().Set("Connection", "keep-alive")
-	w.Header().Set("X-Accel-Buffering", "no")
-}
-
-// SSEStream handles GET /v1/stream/dashboard.
-func (s *Server) SSEStream(w http.ResponseWriter, r *http.Request) {
-	flusher, ok := w.(http.Flusher)
-	if !ok {
-		http.Error(w, "streaming not supported", http.StatusInternalServerError)
-		return
-	}
-
-	id, ch, err := s.sseHub.Subscribe()
-	if err != nil {
-		http.Error(w, "too many connections", http.StatusServiceUnavailable)
-		return
-	}
-	defer s.sseHub.Unsubscribe(id)
-
-	writeSSEHeaders(w)
-
-	var eventID atomic.Uint64
-	writeEvent := func(topic string, data []byte) {
-		eid := eventID.Add(1)
-		fmt.Fprintf(w, "id: %d\nevent: %s\ndata: %s\n\n", eid, topic, data)
-		flusher.Flush()
-	}
-
-	// Initial snapshot in stable order
-	s.sendInitialSnapshot(writeEvent)
-
-	heartbeatInterval := s.sseHeartbeatInterval
-	if heartbeatInterval == 0 {
-		heartbeatInterval = 15 * time.Second
-	}
-	heartbeat := time.NewTicker(heartbeatInterval)
-	defer heartbeat.Stop()
-
-	ctx := r.Context()
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-ch:
-			events := s.sseHub.Latest(id)
-			for topic, data := range events {
-				writeEvent(topic, data)
-			}
-		case <-heartbeat.C:
-			fmt.Fprint(w, ": heartbeat\n\n")
-			flusher.Flush()
-		}
-	}
-}
-
-func (s *Server) sendInitialSnapshot(writeEvent func(string, []byte)) {
-	topics := []string{TopicOverview, TopicTimeseries, TopicDeployments, TopicRoutes}
-	for _, topic := range topics {
-		data := s.ComputeSSETopic(topic)
-		if data != nil {
-			writeEvent(topic, data)
-		}
-	}
-}
-
-// ComputeSSETopic generates JSON bytes for a given SSE topic by computing
-// the same data as the corresponding read endpoints.
-func (s *Server) ComputeSSETopic(topic string) []byte {
-	switch topic {
-	case TopicOverview:
-		return s.computeOverviewJSON()
-	case TopicTimeseries:
-		return s.computeTimeseriesJSON()
-	case TopicRoutes:
-		return s.computeRoutesJSON()
-	case TopicDeployments:
-		return s.computeDeploymentsJSON()
-	default:
-		return nil
-	}
-}
-
-func (s *Server) computeOverviewJSON() []byte {
-	if s.store == nil {
-		return nil
-	}
-	payload := s.overviewPayload(time.Hour, 20)
-	data, err := json.Marshal(payload)
-	if err != nil {
-		return nil
-	}
-	return data
-}
-
-func (s *Server) computeTimeseriesJSON() []byte {
-	if s.store == nil {
-		return nil
-	}
-	payload := s.timeseriesPayload(time.Hour, 5*time.Minute)
-	data, err := json.Marshal(payload)
-	if err != nil {
-		return nil
-	}
-	return data
-}
-
-func (s *Server) computeRoutesJSON() []byte {
-	if s.store == nil {
-		return nil
-	}
-	payload := s.routesPayload(time.Hour, 20, false)
-	data, err := json.Marshal(payload)
-	if err != nil {
-		return nil
-	}
-	return data
-}
-
-func (s *Server) computeDeploymentsJSON() []byte {
-	if s.coldStore == nil {
-		return []byte(`{"deployments":[]}`)
-	}
-	now := time.Now().UTC()
-	start := now.Add(-time.Hour)
-	out, err := s.deploymentsPayload(context.Background(), start, now, "")
-	if err != nil {
-		return []byte(`{"deployments":[]}`)
-	}
-	data, err := json.Marshal(map[string]any{"deployments": out})
-	if err != nil {
-		return []byte(`{"deployments":[]}`)
-	}
-	return data
-}
diff --git a/internal/ingest/sse_test.go b/internal/ingest/sse_test.go
deleted file mode 100644
index 4d44d50..0000000
--- a/internal/ingest/sse_test.go
+++ /dev/null
@@ -1,148 +0,0 @@
-package ingest
-
-import (
-	"testing"
-)
-
-func TestSSEHub_SubscribeUnsubscribe(t *testing.T) {
-	hub := NewSSEHub(10)
-
-	id, ch, err := hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe: %v", err)
-	}
-	if ch == nil {
-		t.Fatal("expected non-nil channel")
-	}
-
-	hub.Unsubscribe(id)
-
-	// Publish after unsubscribe should not panic.
-	hub.Publish("overview", []byte(`{"ok":true}`))
-}
-
-func TestSSEHub_FanOut(t *testing.T) {
-	hub := NewSSEHub(10)
-
-	id1, ch1, err := hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe 1: %v", err)
-	}
-	id2, ch2, err := hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe 2: %v", err)
-	}
-
-	data := []byte(`{"count":42}`)
-	hub.Publish("overview", data)
-
-	// Both channels should be notified.
-	select {
-	case <-ch1:
-	default:
-		t.Fatal("subscriber 1 not notified")
-	}
-	select {
-	case <-ch2:
-	default:
-		t.Fatal("subscriber 2 not notified")
-	}
-
-	// Both should see the data via Latest.
-	lat1 := hub.Latest(id1)
-	if got := string(lat1["overview"]); got != string(data) {
-		t.Fatalf("subscriber 1: got %q, want %q", got, string(data))
-	}
-	lat2 := hub.Latest(id2)
-	if got := string(lat2["overview"]); got != string(data) {
-		t.Fatalf("subscriber 2: got %q, want %q", got, string(data))
-	}
-}
-
-func TestSSEHub_PerSubscriberCoalescing(t *testing.T) {
-	hub := NewSSEHub(10)
-
-	id, ch, err := hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe: %v", err)
-	}
-
-	hub.Publish("overview", []byte(`{"v":1}`))
-	hub.Publish("overview", []byte(`{"v":2}`))
-
-	// Drain the notification channel (may have 1 item since cap=1).
-	select {
-	case <-ch:
-	default:
-	}
-
-	lat := hub.Latest(id)
-	if got := string(lat["overview"]); got != `{"v":2}` {
-		t.Fatalf("coalescing: got %q, want %q", got, `{"v":2}`)
-	}
-
-	// After Latest, pending should be empty.
-	lat2 := hub.Latest(id)
-	if len(lat2) != 0 {
-		t.Fatalf("expected empty after drain, got %d topics", len(lat2))
-	}
-}
-
-func TestSSEHub_MaxClients(t *testing.T) {
-	hub := NewSSEHub(2)
-
-	id1, _, err := hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe 1: %v", err)
-	}
-	_, _, err = hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe 2: %v", err)
-	}
-
-	// Third should fail.
-	_, _, err = hub.Subscribe()
-	if err != ErrMaxClients {
-		t.Fatalf("expected ErrMaxClients, got %v", err)
-	}
-
-	// Unsubscribe frees a slot.
-	hub.Unsubscribe(id1)
-
-	_, _, err = hub.Subscribe()
-	if err != nil {
-		t.Fatalf("Subscribe after unsubscribe: %v", err)
-	}
-}
-
-func TestSSEHub_PublishNoSubscribers(t *testing.T) {
-	hub := NewSSEHub(10)
-	// Must not panic.
-	hub.Publish("overview", []byte(`{}`))
-}
-
-func TestSSEHub_DirtyTopics(t *testing.T) {
-	hub := NewSSEHub(10)
-
-	hub.MarkDirty("overview", "routes")
-	hub.MarkDirty("overview") // duplicate
-
-	dirty := hub.DrainDirty()
-	if len(dirty) != 2 {
-		t.Fatalf("expected 2 dirty topics, got %d: %v", len(dirty), dirty)
-	}
-
-	got := make(map[string]bool)
-	for _, d := range dirty {
-		got[d] = true
-	}
-	if !got["overview"] || !got["routes"] {
-		t.Fatalf("unexpected dirty set: %v", dirty)
-	}
-
-	// After drain, should be empty.
-	dirty2 := hub.DrainDirty()
-	if len(dirty2) != 0 {
-		t.Fatalf("expected empty after drain, got %v", dirty2)
-	}
-}
diff --git a/internal/ingest/triage_surface_agreement_test.go b/internal/ingest/triage_surface_agreement_test.go
new file mode 100644
index 0000000..b1a678f
--- /dev/null
+++ b/internal/ingest/triage_surface_agreement_test.go
@@ -0,0 +1,111 @@
+package ingest_test
+
+// Data-correctness invariant (g): for one incident built by one triage.Engine,
+// every surface that emits a triage report must agree on the canonical
+// report_hash. This exercises the REAL surfaces — the REST handler, the
+// triage_incident tool, and the render_triage_report tool — and compares the
+// hash each produces, catching surface-specific drift (an envelope that drops a
+// hashed field, a re-marshal, or a stale embedded hash). It reuses the stub
+// triage dependencies defined in triage_route_test.go.
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/reports"
+	"github.com/sssmaran/WaylogCLI/internal/tools"
+	"github.com/sssmaran/WaylogCLI/internal/triage"
+	"github.com/sssmaran/WaylogCLI/internal/triagehttp"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// TestTriageSurfacesAgreeOnReportHash builds one triage.Engine from stub deps and checks that the
+// REST handler, triage_incident, and render_triage_report all expose the same report_hash.
+func TestTriageSurfacesAgreeOnReportHash(t *testing.T) {
+	eng, err := triage.NewEngine(triage.Deps{
+		Incidents:  stubTriageIncidents{},
+		Blast:      stubTriageBlast{},
+		Story:      stubTriageStory{},
+		Signals:    stubTriageSignals{},
+		NextChecks: stubTriageNextChecks{},
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	})
+	if err != nil {
+		t.Fatalf("NewEngine: %v", err)
+	}
+	ctx := context.Background()
+
+	// --- REST surface: GET /v1/triage/{id} returns the Report as JSON ---
+	mux := http.NewServeMux()
+	mux.Handle("/v1/triage/", http.HandlerFunc(triagehttp.NewHandler(eng).Triage))
+	srv := httptest.NewServer(mux)
+	t.Cleanup(srv.Close)
+
+	resp, err := srv.Client().Get(srv.URL + "/v1/triage/inc_abc") // server-scoped client, not http.DefaultClient
+	if err != nil {
+		t.Fatalf("GET: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("REST status = %d, want 200", resp.StatusCode)
+	}
+	var restReport pkgtriage.Report
+	if err := json.NewDecoder(resp.Body).Decode(&restReport); err != nil {
+		t.Fatalf("decode REST report: %v", err)
+	}
+	if restReport.ReportHash == "" {
+		t.Fatal("REST report missing report_hash")
+	}
+	// Recompute the hash from the decoded report: the embedded value must still match it.
+	recomputed, err := restReport.CanonicalHash()
+	if err != nil {
+		t.Fatalf("recompute canonical hash: %v", err)
+	}
+	if recomputed != restReport.ReportHash {
+		t.Fatalf("REST embedded hash desynced from canonical: embedded=%q canonical=%q", restReport.ReportHash, recomputed)
+	}
+
+	// --- Tool surfaces: triage_incident (raw Report) + render_triage_report (markdown) ---
+	reg := tools.NewRegistry()
+	if err := tools.RegisterTriageTool(reg, eng); err != nil {
+		t.Fatalf("register triage_incident: %v", err)
+	}
+	if err := tools.RegisterTriageReportTool(reg, eng); err != nil {
+		t.Fatalf("register render_triage_report: %v", err)
+	}
+
+	out, err := reg.Call(ctx, "triage_incident", json.RawMessage(`{"incident_id":"inc_abc","window":"15m"}`))
+	if err != nil {
+		t.Fatalf("triage_incident call: %v", err)
+	}
+	toolReport, ok := out.(*pkgtriage.Report)
+	if !ok || toolReport == nil { // a typed-nil *Report would panic on the field read below
+		t.Fatalf("triage_incident returned %T, want non-nil *pkgtriage.Report", out)
+	}
+
+	rendered, err := reg.Call(ctx, "render_triage_report", json.RawMessage(`{"incident_id":"inc_abc","format":"markdown"}`))
+	if err != nil {
+		t.Fatalf("render_triage_report call: %v", err)
+	}
+	r, ok := rendered.(reports.Rendered)
+	if !ok {
+		t.Fatalf("render_triage_report returned %T, want reports.Rendered", rendered)
+	}
+	md, ok := r.Body.(string)
+	if !ok {
+		t.Fatalf("markdown body is %T, want string", r.Body)
+	}
+
+	// --- Agreement across all three surfaces ---
+	if toolReport.ReportHash != restReport.ReportHash {
+		t.Fatalf("triage_incident vs REST hash drift: tool=%q rest=%q", toolReport.ReportHash, restReport.ReportHash)
+	}
+	if !strings.Contains(md, restReport.ReportHash) {
+		t.Fatalf("render_triage_report markdown must embed report_hash %q", restReport.ReportHash)
+	}
+}
diff --git a/internal/ingest/v2/handler.go b/internal/ingest/v2/handler.go
index 1155424..4a04c92 100644
--- a/internal/ingest/v2/handler.go
+++ b/internal/ingest/v2/handler.go
@@ -123,6 +123,9 @@ func (h *Handler) handle(w http.ResponseWriter, r *http.Request, durable bool) {
 	}
 	env, err := h.IngestRaw(r.Context(), parsed.events, durable)
 	if err != nil {
+		// Backpressure: tell well-behaved clients when to retry so a WAL
+		// outage doesn't turn into an immediate retry herd.
+		w.Header().Set("Retry-After", "1")
 		switch {
 		case errors.Is(err, errDurabilityUnavailable):
 			http.Error(w, "durability unavailable", http.StatusServiceUnavailable)
diff --git a/internal/ingest/v2/handler_test.go b/internal/ingest/v2/handler_test.go
index 733868c..908399a 100644
--- a/internal/ingest/v2/handler_test.go
+++ b/internal/ingest/v2/handler_test.go
@@ -413,6 +413,9 @@ func TestEventsWALFailureReturnsPlain503(t *testing.T) {
 	if !strings.Contains(rec.Body.String(), "durability unavailable") {
 		t.Fatalf("body=%q", rec.Body.String())
 	}
+	if rec.Header().Get("Retry-After") != "1" {
+		t.Fatalf("503 must signal backpressure with Retry-After: 1, got %q", rec.Header().Get("Retry-After"))
+	}
 	fm := gatherMap(t, reg)
 	if got := counterWithLabel(fm["waylog_events_rejected_total"], "reason", ReasonDurabilityUnavailable); got != 1 {
 		t.Fatalf("durability_unavailable=%v want 1", got)
diff --git a/internal/llm/gemini.go b/internal/llm/gemini.go
index 24dd2fc..1f75be6 100644
--- a/internal/llm/gemini.go
+++ b/internal/llm/gemini.go
@@ -471,34 +471,22 @@ func filterToolsForPrompt(tools []ToolDefinition, prompt string) []ToolDefinitio
 		}
 	}
 
+	// Order is most-specific-first: "report" beats "triage" (so
+	// "render the triage report" → render_triage_report), and
+	// "triage|incident" beats "trace|explain" (so "explain incident X"
+	// → triage_incident, not explain_request).
 	switch {
-	case strings.Contains(p, "trace"):
-		add("trace_summary")
-		add("trace_graph")
-		add("explain_request")
-	case strings.Contains(p, "service path") || strings.Contains(p, "path"):
-		add("trace_summary")
-		add("failure_chain")
-	case strings.Contains(p, "root cause") || strings.Contains(p, "why did") || strings.Contains(p, "why is"):
-		add("explain_request")
-		add("failure_chain")
-	case strings.Contains(p, "explain") || strings.Contains(p, "info"):
-		add("explain_request")
-	case strings.Contains(p, "impact") || strings.Contains(p, "affected") || strings.Contains(p, "blast") || strings.Contains(p, "radius"):
+	case strings.Contains(p, "report"):
+		add("render_triage_report")
+	case strings.Contains(p, "triage") || strings.Contains(p, "incident"):
+		add("triage_incident")
+	case strings.Contains(p, "impact") || strings.Contains(p, "affected") ||
+		strings.Contains(p, "blast") || strings.Contains(p, "radius"):
 		add("blast_radius")
-	case strings.Contains(p, "pattern"):
-		add("failure_patterns")
-	case strings.Contains(p, "diff") || strings.Contains(p, "compare"):
-		add("compare_windows")
-	case strings.Contains(p, "query"):
-		add("graph_query")
-	case strings.Contains(p, "insight") || strings.Contains(p, "top") || strings.Contains(p, "stats") ||
-		strings.Contains(p, "overview") || strings.Contains(p, "summary") || strings.Contains(p, "health") ||
-		strings.Contains(p, "what happened"):
-		add("graph_insights")
-	case strings.Contains(p, "failure") || strings.Contains(p, "error"):
-		add("graph_failures")
-		add("graph_insights")
+	case strings.Contains(p, "trace") || strings.Contains(p, "explain") ||
+		strings.Contains(p, "info") || strings.Contains(p, "root cause") ||
+		strings.Contains(p, "why did") || strings.Contains(p, "why is"):
+		add("explain_request")
 	}
 
 	return out
@@ -523,13 +511,8 @@ func fillToolArgsFromPrompt(tool string, raw json.RawMessage, prompt string) (js
 	}
 
 	switch tool {
-	case "explain_request", "failure_chain":
-		setIfMissing("request_id", extractRequestID(prompt))
-	case "trace_graph":
-		setIfMissing("trace_id", extractTraceID(prompt))
-	case "trace_summary":
+	case "explain_request":
 		setIfMissing("trace_id", extractTraceID(prompt))
-		setIfMissing("request_id", extractRequestID(prompt))
 	}
 
 	if len(args) == 0 {
@@ -542,13 +525,6 @@ func fillToolArgsFromPrompt(tool string, raw json.RawMessage, prompt string) (js
 	return out, true
 }
 
-func extractRequestID(prompt string) string {
-	if id := extractHexIDAfterKeyword(prompt, "request"); id != "" {
-		return id
-	}
-	return extractFirstHexID(prompt)
-}
-
 func extractTraceID(prompt string) string {
 	if id := extractUUIDAfterKeyword(prompt, "trace"); id != "" {
 		return id
diff --git a/internal/llm/gemini_test.go b/internal/llm/gemini_test.go
index 6d3e0eb..d768ea7 100644
--- a/internal/llm/gemini_test.go
+++ b/internal/llm/gemini_test.go
@@ -9,20 +9,13 @@ import (
 	"testing"
 )
 
-// allTools returns the full tool set matching what RegisterGraphTools creates.
+// allTools returns the v1.0 surviving tool ledger.
 func allTools() []ToolDefinition {
 	names := []string{
-		"graph_stats",
 		"explain_request",
-		"trace_graph",
-		"trace_summary",
-		"graph_failures",
-		"failure_patterns",
 		"blast_radius",
-		"failure_chain",
-		"graph_query",
-		"compare_windows",
-		"graph_insights",
+		"triage_incident",
+		"render_triage_report",
 	}
 	tools := make([]ToolDefinition, len(names))
 	for i, n := range names {
@@ -45,135 +38,31 @@ func TestFilterToolsForPrompt(t *testing.T) {
 		prompt   string
 		expected []string
 	}{
-		// --- Keyword matches ---
 		{
-			name:     "trace keyword selects trace tools",
+			name:     "trace keyword routes to explain_request",
 			prompt:   "show me the trace for abc123",
-			expected: []string{"trace_summary", "trace_graph", "explain_request"},
+			expected: []string{"explain_request"},
 		},
 		{
-			name:     "explain keyword selects explain_request",
+			name:     "explain keyword routes to explain_request",
 			prompt:   "explain why checkout failed",
 			expected: []string{"explain_request"},
 		},
 		{
-			name:     "info keyword selects explain_request",
-			prompt:   "info about request abc",
+			name:     "root cause routes to explain_request",
+			prompt:   "what is the root cause of the checkout failure",
 			expected: []string{"explain_request"},
 		},
 		{
-			name:     "blast keyword selects blast_radius",
-			prompt:   "what is the blast radius of PMT_502",
-			expected: []string{"blast_radius"},
-		},
-		{
-			name:     "pattern keyword selects failure_patterns",
-			prompt:   "show me failure pattern in the last hour",
-			expected: []string{"failure_patterns"},
-		},
-		{
-			name:     "diff keyword selects compare_windows",
-			prompt:   "diff errors between now and 1h ago",
-			expected: []string{"compare_windows"},
-		},
-		{
-			name:     "compare keyword selects compare_windows",
-			prompt:   "compare errors in last 10m vs 1h ago",
-			expected: []string{"compare_windows"},
-		},
-		{
-			name:     "query keyword selects graph_query",
-			prompt:   "query for error_code=PMT_502 in last 10m",
-			expected: []string{"graph_query"},
-		},
-		{
-			name:     "insight keyword selects graph_insights",
-			prompt:   "show insights for the last hour",
-			expected: []string{"graph_insights"},
-		},
-		{
-			name:     "top keyword selects graph_insights",
-			prompt:   "top errors in the last 10 minutes",
-			expected: []string{"graph_insights"},
-		},
-		{
-			name:     "stats keyword selects graph_insights",
-			prompt:   "show me stats",
-			expected: []string{"graph_insights"},
-		},
-		{
-			name:     "failure keyword selects failures + insights",
-			prompt:   "list all failures",
-			expected: []string{"graph_failures", "graph_insights"},
-		},
-		{
-			name:     "error keyword selects failures + insights",
-			prompt:   "what errors happened recently",
-			expected: []string{"graph_failures", "graph_insights"},
-		},
-		{
-			name:     "service path selects trace_summary + failure_chain",
-			prompt:   "show the service path for this request",
-			expected: []string{"trace_summary", "failure_chain"},
-		},
-		{
-			name:     "path keyword selects trace_summary + failure_chain",
-			prompt:   "what is the path of the request",
-			expected: []string{"trace_summary", "failure_chain"},
-		},
-
-		// --- Case insensitivity ---
-		{
-			name:     "case insensitive TRACE",
-			prompt:   "Show TRACE for abc123",
-			expected: []string{"trace_summary", "trace_graph", "explain_request"},
+			name:     "why did routes to explain_request",
+			prompt:   "why did checkout return 502",
+			expected: []string{"explain_request"},
 		},
 		{
-			name:     "case insensitive Blast Radius",
-			prompt:   "Blast Radius for PMT_502",
+			name:     "blast keyword routes to blast_radius",
+			prompt:   "what is the blast radius of PMT_502",
 			expected: []string{"blast_radius"},
 		},
-
-		// --- No match → empty (fallback to full list happens in caller) ---
-		{
-			name:     "why did checkout break routes to explain_request",
-			prompt:   "why did checkout break",
-			expected: []string{"explain_request", "failure_chain"},
-		},
-		{
-			name:     "empty prompt returns empty",
-			prompt:   "",
-			expected: nil,
-		},
-		{
-			name:     "unrelated prompt returns empty",
-			prompt:   "hello how are you",
-			expected: nil,
-		},
-
-		// --- Priority: first matching case wins (switch statement) ---
-		{
-			name:     "trace wins over error when both present",
-			prompt:   "trace the error in payment service",
-			expected: []string{"trace_summary", "trace_graph", "explain_request"},
-		},
-		{
-			name:     "trace wins over explain when both present",
-			prompt:   "explain this trace abc123",
-			expected: []string{"trace_summary", "trace_graph", "explain_request"},
-		},
-		{
-			name:     "path wins over error",
-			prompt:   "show the path of the error",
-			expected: []string{"trace_summary", "failure_chain"},
-		},
-		{
-			name:     "explain wins over failure",
-			prompt:   "explain the failure",
-			expected: []string{"explain_request"},
-		},
-
-		// --- Previously known gaps, now fixed ---
 		{
 			name:     "impact keyword routes to blast_radius",
 			prompt:   "what is the impact of PMT_502",
@@ -181,61 +70,37 @@ func TestFilterToolsForPrompt(t *testing.T) {
 		},
 		{
 			name:     "affected keyword routes to blast_radius",
-			prompt:   "which users are affected by the payment outage",
+			prompt:   "which users are affected",
 			expected: []string{"blast_radius"},
 		},
 		{
-			name:     "what happened routes to graph_insights",
-			prompt:   "what happened in the last 10 minutes",
-			expected: []string{"graph_insights"},
+			name:     "triage keyword routes to triage_incident",
+			prompt:   "triage incident inc_42",
+			expected: []string{"triage_incident"},
 		},
 		{
-			name:     "root cause routes to explain_request",
-			prompt:   "what is the root cause of the checkout failure",
-			expected: []string{"explain_request", "failure_chain"},
-		},
-		{
-			name:     "overview routes to graph_insights",
-			prompt:   "give me an overview of the system health",
-			expected: []string{"graph_insights"},
+			name:     "incident keyword routes to triage_incident",
+			prompt:   "show me the latest incident",
+			expected: []string{"triage_incident"},
 		},
-
-		// --- New synonym coverage ---
 		{
-			name:     "why did routes to explain_request",
-			prompt:   "why did checkout return 502",
-			expected: []string{"explain_request", "failure_chain"},
+			name:     "report keyword routes to render_triage_report",
+			prompt:   "render the triage report",
+			expected: []string{"render_triage_report"},
 		},
 		{
-			name:     "why is routes to explain_request",
-			prompt:   "why is the payment service failing",
-			expected: []string{"explain_request", "failure_chain"},
-		},
-		{
-			name:     "summary routes to graph_insights",
-			prompt:   "give me a summary of errors",
-			expected: []string{"graph_insights"},
-		},
-		{
-			name:     "health routes to graph_insights",
-			prompt:   "how is system health right now",
-			expected: []string{"graph_insights"},
-		},
-		{
-			name:     "radius keyword routes to blast_radius",
-			prompt:   "show the error radius for DB_TIMEOUT",
+			name:     "case insensitive Blast Radius",
+			prompt:   "Blast Radius for PMT_502",
 			expected: []string{"blast_radius"},
 		},
-
-		// --- Remaining gaps (no keyword match) ---
 		{
-			name:     "GAP: vague question returns empty",
-			prompt:   "is anything wrong with my services",
+			name:     "empty prompt returns empty",
+			prompt:   "",
 			expected: nil,
 		},
 		{
-			name:     "GAP: latency question returns empty",
-			prompt:   "which endpoints are slow right now",
+			name:     "unrelated prompt returns empty",
+			prompt:   "hello how are you",
 			expected: nil,
 		},
 	}
@@ -272,44 +137,26 @@ func TestFillToolArgsFromPrompt(t *testing.T) {
 		wantFilled bool
 	}{
 		{
-			name:       "extract trace_id for trace_graph",
-			tool:       "trace_graph",
-			rawArgs:    `{}`,
-			prompt:     "show trace abcdef1234567890abcdef1234567890",
-			wantKey:    "trace_id",
-			wantVal:    "abcdef1234567890abcdef1234567890",
-			wantFilled: true,
-		},
-		{
-			name:       "extract trace_id for trace_summary",
-			tool:       "trace_summary",
+			name:       "extract trace_id for explain_request",
+			tool:       "explain_request",
 			rawArgs:    `{}`,
-			prompt:     "trace summary for abcdef1234567890abcdef1234567890",
+			prompt:     "explain request abcdef1234567890abcdef1234567890",
 			wantKey:    "trace_id",
 			wantVal:    "abcdef1234567890abcdef1234567890",
 			wantFilled: true,
 		},
 		{
-			name:       "extract request_id for explain_request",
+			name:       "UUID trace_id extracted",
 			tool:       "explain_request",
 			rawArgs:    `{}`,
-			prompt:     "explain request abcdef1234567890abcdef1234567890aabbccdd",
-			wantKey:    "request_id",
-			wantVal:    "abcdef1234567890abcdef1234567890aabbccdd",
-			wantFilled: true,
-		},
-		{
-			name:       "extract request_id for failure_chain",
-			tool:       "failure_chain",
-			rawArgs:    `{}`,
-			prompt:     "failure chain for request abcdef1234567890abcdef1234567890aabbccdd",
-			wantKey:    "request_id",
-			wantVal:    "abcdef1234567890abcdef1234567890aabbccdd",
+			prompt:     "trace 550e8400-e29b-41d4-a716-446655440000",
+			wantKey:    "trace_id",
+			wantVal:    "550e8400-e29b-41d4-a716-446655440000",
 			wantFilled: true,
 		},
 		{
 			name:       "does not overwrite existing arg",
-			tool:       "trace_graph",
+			tool:       "explain_request",
 			rawArgs:    `{"trace_id":"existing_id_abcdef1234567890"}`,
 			prompt:     "show trace 0000000000000000aaaaaaaaaaaaaaaa",
 			wantKey:    "trace_id",
@@ -318,27 +165,18 @@ func TestFillToolArgsFromPrompt(t *testing.T) {
 		},
 		{
 			name:       "no hex ID in prompt returns unchanged",
-			tool:       "trace_graph",
+			tool:       "explain_request",
 			rawArgs:    `{}`,
 			prompt:     "show me the trace",
 			wantFilled: false,
 		},
 		{
 			name:       "unrelated tool returns unchanged",
-			tool:       "graph_stats",
+			tool:       "blast_radius",
 			rawArgs:    `{}`,
-			prompt:     "show stats for abcdef1234567890abcdef1234567890",
+			prompt:     "blast radius for abcdef1234567890abcdef1234567890",
 			wantFilled: false,
 		},
-		{
-			name:       "UUID trace_id extracted",
-			tool:       "trace_graph",
-			rawArgs:    `{}`,
-			prompt:     "trace 550e8400-e29b-41d4-a716-446655440000",
-			wantKey:    "trace_id",
-			wantVal:    "550e8400-e29b-41d4-a716-446655440000",
-			wantFilled: true,
-		},
 	}
 
 	for _, tt := range tests {
@@ -390,27 +228,6 @@ func TestExtractTraceID(t *testing.T) {
 	}
 }
 
-func TestExtractRequestID(t *testing.T) {
-	tests := []struct {
-		name   string
-		prompt string
-		want   string
-	}{
-		{"hex after request keyword", "request abcdef1234567890abcdef1234567890aabbccdd", "abcdef1234567890abcdef1234567890aabbccdd"},
-		{"standalone hex", "explain abcdef1234567890abcdef1234567890aabbccdd", "abcdef1234567890abcdef1234567890aabbccdd"},
-		{"no ID", "explain the failure", ""},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := extractRequestID(tt.prompt)
-			if got != tt.want {
-				t.Fatalf("extractRequestID(%q) = %q, want %q", tt.prompt, got, tt.want)
-			}
-		})
-	}
-}
-
 func TestIsHex(t *testing.T) {
 	tests := []struct {
 		input string
diff --git a/internal/mcp/stdio/server.go b/internal/mcp/stdio/server.go
index d83ff91..62e16a5 100644
--- a/internal/mcp/stdio/server.go
+++ b/internal/mcp/stdio/server.go
@@ -64,13 +64,10 @@ type toolsCallResult struct {
 	Content []toolContent `json:"content"`
 }
 
-func Serve(ctx context.Context, in io.Reader, out io.Writer, reg *tools.Registry, store tools.Store, info ServerInfo) error {
+func Serve(ctx context.Context, in io.Reader, out io.Writer, reg *tools.Registry, info ServerInfo) error {
 	if reg == nil {
 		return fmt.Errorf("registry required")
 	}
-	if store == nil {
-		return fmt.Errorf("store required")
-	}
 
 	enc := json.NewEncoder(out)
 	enc.SetEscapeHTML(false)
@@ -103,7 +100,7 @@ func Serve(ctx context.Context, in io.Reader, out io.Writer, reg *tools.Registry
 		}
 
 		if isNotification(req.ID) {
-			handleNotification(ctx, req, reg, store, info)
+			handleNotification(ctx, req, reg, info)
 			continue
 		}
 
@@ -145,7 +142,7 @@ func Serve(ctx context.Context, in io.Reader, out io.Writer, reg *tools.Registry
 			if len(params.Arguments) == 0 {
 				params.Arguments = json.RawMessage("{}")
 			}
-			result, err := reg.Call(ctx, store, params.Name, params.Arguments)
+			result, err := reg.Call(ctx, params.Name, params.Arguments)
 			if err != nil {
 				writeError(enc, req.ID, -32000, "tool error", err.Error())
 				continue
@@ -164,10 +161,9 @@ func Serve(ctx context.Context, in io.Reader, out io.Writer, reg *tools.Registry
 	}
 }
 
-func handleNotification(ctx context.Context, req rpcRequest, reg *tools.Registry, store tools.Store, info ServerInfo) {
+func handleNotification(ctx context.Context, req rpcRequest, reg *tools.Registry, info ServerInfo) {
 	_ = ctx
 	_ = reg
-	_ = store
 	_ = info
 }
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 21d7680..d28b823 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -15,10 +15,10 @@ type Metrics struct {
 
 	IngestLatency          prometheus.Histogram
 	IngestBatchSize        prometheus.Histogram
-	MergeLatency           prometheus.Histogram
 	EventsAccepted         prometheus.Counter
 	EventsDuplicate        prometheus.Counter
 	EventsRejected         *prometheus.CounterVec
+	RateLimited            *prometheus.CounterVec
 	EventlogFails          prometheus.Counter
 	EventDedupCacheSize    prometheus.Gauge
 	EventDedupReplayLoaded prometheus.Counter
@@ -41,16 +41,6 @@ type Metrics struct {
 	ReplayFailuresTotal prometheus.Counter
 	Ready               prometheus.Gauge
 	InFlightRequests    prometheus.Gauge
-	SnapshotLastSuccess prometheus.Gauge
-	SnapshotLastError   prometheus.Gauge
-	GraphNodes          prometheus.Gauge
-	GraphEdges          prometheus.Gauge
-	GraphPrunedTotal    prometheus.Counter
-	TraceUpsertDuration prometheus.Histogram
-	TraceStoreRecords   prometheus.Gauge
-	TraceStoreSpans     prometheus.Gauge
-	TraceStoreCohorts   prometheus.Gauge
-	TraceStorePruned    prometheus.Counter
 
 	AskRequestsTotal     *prometheus.CounterVec
 	AskDuration          prometheus.Histogram
@@ -67,9 +57,10 @@ type Metrics struct {
 	DeployUpsertsTotal prometheus.Counter
 	DeployUpsertErrors prometheus.Counter
 
-	SignalsAccepted       prometheus.Counter
-	SignalsRejected       *prometheus.CounterVec
-	SignalRetentionPruned prometheus.Counter
+	SignalsAccepted         prometheus.Counter
+	SignalsRejected         *prometheus.CounterVec
+	SignalRetentionPruned   prometheus.Counter
+	IncidentRetentionPruned prometheus.Counter
 
 	IncidentOpened          prometheus.Counter
 	IncidentUpdated         prometheus.Counter
@@ -83,11 +74,6 @@ type Metrics struct {
 	IncidentRebuildFailures prometheus.Counter
 	IncidentRebuildReplayed prometheus.Counter
 
-	CausalRunsTotal   prometheus.Counter
-	CausalRunDuration prometheus.Histogram
-	CausalRunFailures prometheus.Counter
-	CausalClaimsTotal *prometheus.CounterVec // labels: type, tier
-
 	// OTLP ingestion metrics
 	OTLPRequestsTotal     *prometheus.CounterVec // labels: status
 	OTLPSpansReceived     prometheus.Counter
@@ -114,11 +100,6 @@ func New(reg *prometheus.Registry) *Metrics {
 		Help:    "Number of events parsed from each ingest request.",
 		Buckets: []float64{1, 2, 4, 8, 16, 32, 64, 128, 256},
 	})
-	m.MergeLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name:    "waylog_merge_latency_seconds",
-		Help:    "Build + Merge time.",
-		Buckets: defaultBuckets,
-	})
 	m.EventsAccepted = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_events_accepted_total",
 		Help: "Events accepted after each ingest path's durability contract is satisfied.",
@@ -134,6 +115,10 @@ func New(reg *prometheus.Registry) *Metrics {
 	for _, reason := range []string{"validation", "sampling"} {
 		m.EventsRejected.WithLabelValues(reason).Add(0)
 	}
+	m.RateLimited = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "waylog_rate_limited_total",
+		Help: "Requests rejected with 429 by the per-key rate limiter.",
+	}, []string{"scope"})
 	m.EventlogFails = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_eventlog_write_failures_total",
 		Help: "Failed eventlog writes.",
@@ -234,48 +219,6 @@ func New(reg *prometheus.Registry) *Metrics {
 		Name: "waylog_inflight_requests",
 		Help: "Concurrent Events handler calls.",
 	})
-	m.SnapshotLastSuccess = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_snapshot_last_success_timestamp",
-		Help: "Unix epoch of last successful save.",
-	})
-	m.SnapshotLastError = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_snapshot_last_error_timestamp",
-		Help: "Unix epoch of last failed save.",
-	})
-	m.GraphNodes = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_graph_nodes",
-		Help: "Current node count.",
-	})
-	m.GraphEdges = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_graph_edges",
-		Help: "Current edge count.",
-	})
-	m.GraphPrunedTotal = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "waylog_graph_pruned_total",
-		Help: "Number of retention prune cycles executed.",
-	})
-	m.TraceUpsertDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name:    "waylog_trace_upsert_duration_seconds",
-		Help:    "Trace store upsert time.",
-		Buckets: defaultBuckets,
-	})
-	m.TraceStoreRecords = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_trace_store_records",
-		Help: "Current trace record count.",
-	})
-	m.TraceStoreSpans = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_trace_store_spans",
-		Help: "Current total span count in trace store.",
-	})
-	m.TraceStoreCohorts = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "waylog_trace_store_cohorts",
-		Help: "Current trace-store time cohort count.",
-	})
-	m.TraceStorePruned = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "waylog_trace_store_pruned_total",
-		Help: "Total trace records pruned from the trace store.",
-	})
-
 	m.AskRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: "waylog_ask_requests_total",
 		Help: "Ask endpoint requests.",
@@ -351,6 +294,10 @@ func New(reg *prometheus.Registry) *Metrics {
 		Name: "waylog_signal_retention_pruned_total",
 		Help: "Production-context signals pruned by retention.",
 	})
+	m.IncidentRetentionPruned = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "waylog_incident_retention_pruned_total",
+		Help: "Resolved incidents pruned by retention.",
+	})
 
 	m.IncidentOpened = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "waylog_incidents_opened_total",
@@ -404,24 +351,6 @@ func New(reg *prometheus.Registry) *Metrics {
 		Help: "Schema-2.0 events replayed for startup hot-window incident rebuild.",
 	})
 
-	m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "waylog_causal_runs_total",
-		Help: "Total causal inference runs.",
-	})
-	m.CausalRunDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name:    "waylog_causal_run_duration_seconds",
-		Help:    "Duration of causal inference runs.",
-		Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5},
-	})
-	m.CausalRunFailures = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "waylog_causal_run_failures_total",
-		Help: "Total failed causal inference runs.",
-	})
-	m.CausalClaimsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Name: "waylog_causal_claims_total",
-		Help: "Total causal claims produced.",
-	}, []string{"type", "tier"})
-
 	m.OTLPRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: "waylog_otlp_requests_total",
 		Help: "Total OTLP trace ingestion requests.",
@@ -462,8 +391,8 @@ func New(reg *prometheus.Registry) *Metrics {
 	})
 
 	reg.MustRegister(
-		m.IngestLatency, m.IngestBatchSize, m.MergeLatency,
-		m.EventsAccepted, m.EventsDuplicate, m.EventsRejected, m.EventlogFails,
+		m.IngestLatency, m.IngestBatchSize,
+		m.EventsAccepted, m.EventsDuplicate, m.EventsRejected, m.RateLimited, m.EventlogFails,
 		m.EventDedupCacheSize, m.EventDedupReplayLoaded,
 		m.V2EventsProjected, m.V2IndexSize, m.V2IndexPruned, m.V2ReplayProjected,
 		m.V2ReplaySkipped,
@@ -472,19 +401,15 @@ func New(reg *prometheus.Registry) *Metrics {
 		m.V2ReadLatency, m.V2ReadEmpty, m.V2ReadNotFound,
 		m.ReplayLagSeconds, m.ReplayInProgress, m.ReplayFailuresTotal, m.Ready,
 		m.InFlightRequests,
-		m.SnapshotLastSuccess, m.SnapshotLastError,
-		m.GraphNodes, m.GraphEdges, m.GraphPrunedTotal,
-		m.TraceUpsertDuration, m.TraceStoreRecords, m.TraceStoreSpans, m.TraceStoreCohorts, m.TraceStorePruned,
 		m.AskRequestsTotal, m.AskDuration,
 		m.AskToolCallsTotal, m.AskToolDuration,
 		m.ToolDirectCallsTotal, m.DedupReplayTotal, m.DedupCacheSize,
 		m.ColdEventsWritten, m.ColdEventsDropped, m.ColdBatchLatency,
 		m.DeployUpsertsTotal, m.DeployUpsertErrors,
-		m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
+		m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned, m.IncidentRetentionPruned,
 		m.IncidentOpened, m.IncidentUpdated, m.IncidentRecovered, m.IncidentResolved,
 		m.IncidentTickLatency, m.IncidentActive, m.IncidentClassifications,
 		m.IncidentRebuildDuration, m.IncidentRebuildRows, m.IncidentRebuildFailures, m.IncidentRebuildReplayed,
-		m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
 		m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
 		m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,
 		m.OTLPInfraFailures, m.OTLPRequestDuration, m.OTLPRequestSizeBytes,
diff --git a/internal/otel/deploys.go b/internal/otel/deploys.go
new file mode 100644
index 0000000..cdd935e
--- /dev/null
+++ b/internal/otel/deploys.go
@@ -0,0 +1,98 @@
+// OTLP deploy auto-registration: spans carry service.version, and a version
+// change for a (service, env) pair is the strongest deploy evidence an
+// OTel-only install can produce. Registering it as a deployment makes the
+// incident classifier's deploy correlation work without the deploy webhook.
+
+package otel
+
+import (
+	"context"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/coldstore"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// DeploymentUpserter is the slice of the cold store the tracker needs.
+type DeploymentUpserter interface {
+	UpsertDeployment(ctx context.Context, d coldstore.Deployment) error
+}
+
+// DeployTracker registers a deployment when a (service, env) tuple changes
+// version within the process lifetime. The first version seen per tuple is
+// tracked but never registered: after a restart, steady-state traffic must
+// not fabricate a deploy anchored at boot time and poison deploy correlation.
+//
+// Entries in lastVersion are never evicted, so memory grows with the number
+// of distinct (service, env) pairs observed.
+type DeployTracker struct {
+	store DeploymentUpserter
+
+	mu          sync.Mutex        // guards lastVersion
+	lastVersion map[string]string // service + "\x00" + env → version
+}
+
+// NewDeployTracker returns a tracker that writes detected deployments to
+// store. A nil store is tolerated: Observe then does nothing.
+func NewDeployTracker(store DeploymentUpserter) *DeployTracker {
+	return &DeployTracker{store: store, lastVersion: map[string]string{}}
+}
+
+// Observe scans successfully ingested events and upserts a deployment for
+// every version change. Upsert failures are logged, not propagated: deploy
+// registration must never fail span ingestion. A nil tracker or a tracker
+// without a store is a no-op.
+//
+// A failed upsert is not retried: lastVersion has already advanced, so that
+// version is only registered again if the pair flips away and back.
+func (t *DeployTracker) Observe(ctx context.Context, events []*eventv2.Event) {
+	// A nil store would panic on the first version change, which would break
+	// the "never fail ingestion" contract; treat it like a disabled tracker.
+	if t == nil || t.store == nil {
+		return
+	}
+	type change struct{ service, env, version string }
+	var changes []change
+
+	// Only the map scan runs under the lock; store I/O happens after unlock so
+	// a slow upsert never blocks other Observe calls.
+	t.mu.Lock()
+	for _, ev := range events {
+		if ev == nil || ev.Service == "" || ev.Version == "" {
+			continue
+		}
+		key := ev.Service + "\x00" + ev.Env
+		last, seen := t.lastVersion[key]
+		if last == ev.Version {
+			continue
+		}
+		t.lastVersion[key] = ev.Version
+		// First sighting of a pair is recorded but not reported as a deploy.
+		if seen {
+			changes = append(changes, change{ev.Service, ev.Env, ev.Version})
+		}
+	}
+	t.mu.Unlock()
+
+	if len(changes) == 0 {
+		return
+	}
+
+	now := time.Now().UTC()
+	for _, c := range changes {
+		dep := coldstore.Deployment{
+			ID:        "otlp:" + c.service + ":" + c.env + ":" + c.version,
+			Service:   c.service,
+			Version:   c.version,
+			Env:       c.env,
+			FirstSeen: now,
+			LastSeen:  now,
+			Metadata:  map[string]string{"source": "otlp"},
+		}
+		if err := t.store.UpsertDeployment(ctx, dep); err != nil {
+			slog.Warn("otlp: deploy auto-registration failed", "service", c.service, "env", c.env, "version", c.version, "err", err)
+		}
+	}
+}
diff --git a/internal/otel/deploys_test.go b/internal/otel/deploys_test.go
new file mode 100644
index 0000000..4a04a11
--- /dev/null
+++ b/internal/otel/deploys_test.go
@@ -0,0 +1,196 @@
+package otel
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/coldstore"
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+	eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+	coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+	commonpb "go.opentelemetry.io/proto/otlp/common/v1"
+	resourcepb "go.opentelemetry.io/proto/otlp/resource/v1"
+	tracepb "go.opentelemetry.io/proto/otlp/trace/v1"
+)
+
+type fakeDeployStore struct {
+	upserts []coldstore.Deployment // every deployment passed to UpsertDeployment, in call order
+}
+
+func (f *fakeDeployStore) UpsertDeployment(_ context.Context, d coldstore.Deployment) error {
+	f.upserts = append(f.upserts, d)
+	return nil
+}
+
+// requestWithVersion builds a one-span OTLP request for test-svc/prod carrying
+// the given service.version. seq makes trace/span IDs unique across calls so
+// ingest dedup never drops the event.
+func requestWithVersion(version string, seq byte) *coltracepb.ExportTraceServiceRequest {
+	traceID := make([]byte, 16)
+	spanID := make([]byte, 8)
+	traceID[15] = seq
+	traceID[0] = 0x0f
+	spanID[7] = seq
+	spanID[0] = 0x0f
+	return &coltracepb.ExportTraceServiceRequest{
+		ResourceSpans: []*tracepb.ResourceSpans{{
+			Resource: &resourcepb.Resource{Attributes: []*commonpb.KeyValue{
+				strAttr("service.name", "test-svc"),
+				strAttr("service.version", version),
+				strAttr("deployment.environment", "prod"),
+			}},
+			ScopeSpans: []*tracepb.ScopeSpans{{
+				Spans: []*tracepb.Span{{
+					TraceId:           traceID,
+					SpanId:            spanID,
+					Name:              "test-op",
+					StartTimeUnixNano: 1000000000,
+					EndTimeUnixNano:   1050000000,
+					Attributes: []*commonpb.KeyValue{
+						strAttr("http.request.method", "GET"),
+						strAttr("http.route", "/test"),
+						intAttr("http.response.status_code", 200),
+					},
+					Status: &tracepb.Status{Code: tracepb.Status_STATUS_CODE_OK},
+				}},
+			}},
+		}},
+	}
+}
+
+func TestExportRegistersDeploymentOnVersionChange(t *testing.T) {
+	store := &fakeDeployStore{}
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, NewDeployTracker(store))
+	ctx := context.Background()
+
+	// First version seen for a (service, env) is tracked, not registered:
+	// steady-state traffic after a restart must not fabricate a deploy
+	// anchored at boot time.
+	if _, err := h.Export(ctx, requestWithVersion("v1", 1)); err != nil {
+		t.Fatalf("export v1: %v", err)
+	}
+	if len(store.upserts) != 0 {
+		t.Fatalf("first-seen version must not register a deployment: %+v", store.upserts)
+	}
+
+	// A version change registers exactly one deployment.
+	if _, err := h.Export(ctx, requestWithVersion("v2", 2)); err != nil {
+		t.Fatalf("export v2: %v", err)
+	}
+	if len(store.upserts) != 1 {
+		t.Fatalf("version change must register one deployment, got %d", len(store.upserts))
+	}
+	dep := store.upserts[0]
+	if dep.Service != "test-svc" || dep.Env != "prod" || dep.Version != "v2" {
+		t.Fatalf("deployment fields wrong: %+v", dep)
+	}
+	if dep.ID != "otlp:test-svc:prod:v2" {
+		t.Fatalf("deployment ID must be deterministic, got %q", dep.ID)
+	}
+	if dep.FirstSeen.IsZero() || dep.LastSeen.IsZero() {
+		t.Fatalf("first/last seen must be set: %+v", dep)
+	}
+
+	// Repeats of the same version are a no-op.
+	if _, err := h.Export(ctx, requestWithVersion("v2", 3)); err != nil {
+		t.Fatalf("export v2 again: %v", err)
+	}
+	if len(store.upserts) != 1 {
+		t.Fatalf("unchanged version must not re-register, got %d upserts", len(store.upserts))
+	}
+
+	// Rolling back (or mixed replicas during rollout) re-registers the other
+	// version; the store's MIN(first_seen) keeps the original anchor.
+	if _, err := h.Export(ctx, requestWithVersion("v1", 4)); err != nil {
+		t.Fatalf("export v1 rollback: %v", err)
+	}
+	if len(store.upserts) != 2 || store.upserts[1].Version != "v1" {
+		t.Fatalf("rollback must register v1: %+v", store.upserts)
+	}
+}
+
+// End-to-end scenario: an OTel-only install (no SDK, no deploy webhook) sees a
+// service.version change on spans; the auto-registered deployment must be
+// queryable from the real cold store and make the incident classifier pick
+// cause=deploy for a subsequent error burst.
+func TestOTLPVersionChangeEnablesDeployClassification(t *testing.T) {
+	db, err := coldstore.Open(":memory:")
+	if err != nil {
+		t.Fatalf("coldstore.Open: %v", err)
+	}
+	defer db.Close()
+	store := db.(*coldstore.SQLiteStore)
+
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, NewDeployTracker(store))
+	ctx := context.Background()
+	if _, err := h.Export(ctx, requestWithVersion("v1", 21)); err != nil {
+		t.Fatalf("export v1: %v", err)
+	}
+	if _, err := h.Export(ctx, requestWithVersion("v2", 22)); err != nil {
+		t.Fatalf("export v2: %v", err)
+	}
+
+	now := time.Now().UTC()
+	rows, err := store.DeploymentsInWindow(ctx, now.Add(-time.Minute), now.Add(time.Minute), "test-svc")
+	if err != nil {
+		t.Fatalf("DeploymentsInWindow: %v", err)
+	}
+	if len(rows) != 1 || rows[0].Version != "v2" {
+		t.Fatalf("want one v2 deployment, got %+v", rows)
+	}
+
+	// Same conversion the engine's coldDeployAdapter performs.
+	dep := incidents.Deployment{
+		ID: rows[0].ID, Service: rows[0].Service, Version: rows[0].Version,
+		Env: rows[0].Env, FirstSeen: rows[0].FirstSeen,
+	}
+	errEvent := &eventv2.Event{
+		SchemaVersion: eventv2.SchemaVersion2,
+		EventID:       "e-burst", TraceID: "trace-burst", SpanID: "span-burst",
+		TsStart: now, TsEnd: now, Kind: "http",
+		Service: "test-svc", Env: "prod", Version: "v2",
+		Status: eventv2.StatusError,
+		Anchor: &eventv2.Anchor{Step: "op", ErrorCode: "HTTP_500"},
+		Steps: []eventv2.Step{{
+			Name: "op", Status: eventv2.StepStatusError,
+			Error: &eventv2.StepError{Code: "HTTP_500", Reason: "boom"},
+		}},
+	}
+	got := incidents.Classify(incidents.ClassificationInput{
+		Incident: incidents.Incident{
+			Service: "test-svc", Env: "prod", StartedAt: now,
+			ErrorFamily: apiv2.ErrorFamily{Service: "test-svc", Step: "op", ErrorCode: "HTTP_500"},
+		},
+		Events:      []*eventv2.Event{errEvent},
+		Deployments: []incidents.Deployment{dep},
+		Now:         now,
+	})
+	if got.Cause != incidents.CauseDeploy || got.Confidence != incidents.ConfidenceHigh {
+		t.Fatalf("OTel-only deploy correlation failed: %+v", got)
+	}
+}
+
+func TestExportWithoutTrackerOrVersionIsSafe(t *testing.T) {
+	ctx := context.Background()
+
+	// nil tracker: no panic.
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
+	if _, err := h.Export(ctx, requestWithVersion("v1", 9)); err != nil {
+		t.Fatalf("export with nil tracker: %v", err)
+	}
+
+	// events without service.version never register.
+	store := &fakeDeployStore{}
+	h2 := NewHandler(testV2Ingest(t), nil, 1<<20, NewDeployTracker(store))
+	if _, err := h2.Export(ctx, validOTLPRequest()); err != nil {
+		t.Fatalf("export without version: %v", err)
+	}
+	if _, err := h2.Export(ctx, validOTLPRequest()); err != nil {
+		t.Fatalf("export without version again: %v", err)
+	}
+	if len(store.upserts) != 0 {
+		t.Fatalf("versionless events must not register deployments: %+v", store.upserts)
+	}
+}
diff --git a/internal/otel/grpc.go b/internal/otel/grpc.go
index 886be3b..fab3912 100644
--- a/internal/otel/grpc.go
+++ b/internal/otel/grpc.go
@@ -25,9 +25,9 @@ type TraceServiceServer struct {
 	metrics *metrics.Metrics
 }
 
-func NewTraceServiceServer(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *TraceServiceServer {
+func NewTraceServiceServer(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64, deploys *DeployTracker) *TraceServiceServer {
 	return &TraceServiceServer{
-		handler: NewHandler(v2Ingest, m, maxBodyBytes),
+		handler: NewHandler(v2Ingest, m, maxBodyBytes, deploys),
 		metrics: m,
 	}
 }
diff --git a/internal/otel/grpc_test.go b/internal/otel/grpc_test.go
index 7e9beb1..521197e 100644
--- a/internal/otel/grpc_test.go
+++ b/internal/otel/grpc_test.go
@@ -20,7 +20,7 @@ func newBufconnClient(t *testing.T, keys []string) (coltracepb.TraceServiceClien
 	t.Helper()
 	lis := bufconn.Listen(bufSize)
 	srv := grpc.NewServer(grpc.UnaryInterceptor(AuthUnaryInterceptor(keys)))
-	coltracepb.RegisterTraceServiceServer(srv, NewTraceServiceServer(testV2Ingest(t), nil, 1<<20))
+	coltracepb.RegisterTraceServiceServer(srv, NewTraceServiceServer(testV2Ingest(t), nil, 1<<20, nil))
 	go func() {
 		_ = srv.Serve(lis)
 	}()
diff --git a/internal/otel/handler.go b/internal/otel/handler.go
index 1bcac11..da13ce3 100644
--- a/internal/otel/handler.go
+++ b/internal/otel/handler.go
@@ -33,6 +33,7 @@ type Handler struct {
 	v2Ingest     *ingestv2.Handler
 	metrics      *metrics.Metrics
 	maxBodyBytes int64
+	deploys      *DeployTracker
 }
 
 // ExportError is returned when decoded OTLP spans cannot be processed after
@@ -58,12 +59,14 @@ func (e *ExportError) Unwrap() error {
 	return e.Cause
 }
 
-// NewHandler constructs an OTLP traces handler.
-func NewHandler(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *Handler {
+// NewHandler constructs an OTLP traces handler. deploys may be nil to
+// disable deploy auto-registration (no cold store configured).
+func NewHandler(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64, deploys *DeployTracker) *Handler {
 	return &Handler{
 		v2Ingest:     v2Ingest,
 		metrics:      m,
 		maxBodyBytes: maxBodyBytes,
+		deploys:      deploys,
 	}
 }
 
@@ -213,6 +216,7 @@ func (h *Handler) Export(ctx context.Context, req *coltracepb.ExportTraceService
 		if h.metrics != nil && len(env.Rejected) > 0 {
 			h.metrics.OTLPValidationRejects.Add(float64(len(env.Rejected)))
 		}
+		h.deploys.Observe(ctx, convResult.Events)
 	}
 
 	resp := &coltracepb.ExportTraceServiceResponse{}
diff --git a/internal/otel/handler_test.go b/internal/otel/handler_test.go
index dab8afc..9dd2d19 100644
--- a/internal/otel/handler_test.go
+++ b/internal/otel/handler_test.go
@@ -78,7 +78,7 @@ func postOTLP(handler http.Handler, body []byte, contentType, contentEncoding st
 }
 
 func TestHandler_HappyPath(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	rr := postOTLP(h, body, "application/x-protobuf", "")
 	if rr.Code != 200 {
@@ -94,7 +94,7 @@ func TestHandler_HappyPath(t *testing.T) {
 }
 
 func TestHandler_GzipCompressed(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	var buf bytes.Buffer
 	gw := gzip.NewWriter(&buf)
@@ -107,7 +107,7 @@ func TestHandler_GzipCompressed(t *testing.T) {
 }
 
 func TestHandler_ContentTypeWithParams(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	rr := postOTLP(h, body, "application/x-protobuf; charset=utf-8", "")
 	if rr.Code != 200 {
@@ -116,7 +116,7 @@ func TestHandler_ContentTypeWithParams(t *testing.T) {
 }
 
 func TestHandler_WrongContentType(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	rr := postOTLP(h, []byte("{}"), "application/json", "")
 	if rr.Code != http.StatusUnsupportedMediaType {
 		t.Errorf("status = %d, want 415", rr.Code)
@@ -124,7 +124,7 @@ func TestHandler_WrongContentType(t *testing.T) {
 }
 
 func TestHandler_UnsupportedContentEncoding(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	rr := postOTLP(h, body, "application/x-protobuf", "deflate")
 	if rr.Code != http.StatusUnsupportedMediaType {
@@ -133,7 +133,7 @@ func TestHandler_UnsupportedContentEncoding(t *testing.T) {
 }
 
 func TestHandler_WrongMethod(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	req := httptest.NewRequest(http.MethodGet, "/v1/otlp/v1/traces", nil)
 	rr := httptest.NewRecorder()
 	h.ServeHTTP(rr, req)
@@ -143,7 +143,7 @@ func TestHandler_WrongMethod(t *testing.T) {
 }
 
 func TestHandler_MalformedProtobuf(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	rr := postOTLP(h, []byte("not protobuf"), "application/x-protobuf", "")
 	if rr.Code != http.StatusBadRequest {
 		t.Errorf("status = %d, want 400", rr.Code)
@@ -151,7 +151,7 @@ func TestHandler_MalformedProtobuf(t *testing.T) {
 }
 
 func TestHandler_BodyTooLarge(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 10)
+	h := NewHandler(testV2Ingest(t), nil, 10, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	rr := postOTLP(h, body, "application/x-protobuf", "")
 	if rr.Code != http.StatusRequestEntityTooLarge {
@@ -160,7 +160,7 @@ func TestHandler_BodyTooLarge(t *testing.T) {
 }
 
 func TestHandler_EmptyRequest(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	body, _ := proto.Marshal(&coltracepb.ExportTraceServiceRequest{})
 	rr := postOTLP(h, body, "application/x-protobuf", "")
 	if rr.Code != 200 {
@@ -169,7 +169,7 @@ func TestHandler_EmptyRequest(t *testing.T) {
 }
 
 func TestHandler_MissingV2IngestReturns503ForConvertedSpans(t *testing.T) {
-	h := NewHandler(nil, nil, 1<<20)
+	h := NewHandler(nil, nil, 1<<20, nil)
 	body, _ := proto.Marshal(validOTLPRequest())
 	rr := postOTLP(h, body, "application/x-protobuf", "")
 	if rr.Code != http.StatusServiceUnavailable {
@@ -178,7 +178,7 @@ func TestHandler_MissingV2IngestReturns503ForConvertedSpans(t *testing.T) {
 }
 
 func TestHandler_FutureTimestampDropped(t *testing.T) {
-	h := NewHandler(testV2Ingest(t), nil, 1<<20)
+	h := NewHandler(testV2Ingest(t), nil, 1<<20, nil)
 	req := validOTLPRequest()
 	// Stamp the span 10 minutes in the future — should be dropped with
 	// partial_success rather than skewing recent traces / overview.
diff --git a/internal/persist/jsonSnapshot.go b/internal/persist/jsonSnapshot.go
deleted file mode 100644
index 3ad437e..0000000
--- a/internal/persist/jsonSnapshot.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package persist
-
-import (
-	"crypto/sha256"
-	"encoding/hex"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-// SnapshotVersion is bumped when the on-disk snapshot schema changes.
-// Version 2 corresponds to the flattened graph architecture.
-const SnapshotVersion = "2"
-
-var ErrSnapshotMissing = errors.New("snapshot missing")
-var ErrSnapshotVersionMismatch = errors.New("snapshot version mismatch")
-
-type Snapshot struct {
-	Version  string      `json:"version"`
-	SavedAt  time.Time   `json:"saved_at"`
-	Checksum string      `json:"checksum"`
-	Graph    *core.Graph `json:"graph"`
-
-	NodeCount int `json:"node_count"`
-	EdgeCount int `json:"edge_count"`
-}
-
-func Save(path string, g *core.Graph) error {
-	if path == "" {
-		return errors.New("snapshot path is empty")
-	}
-
-	dir := filepath.Dir(path)
-	if err := os.MkdirAll(dir, 0o755); err != nil {
-		return fmt.Errorf("mkdir snapshot dir: %w", err)
-	}
-
-	tmp := Snapshot{
-		Version:   SnapshotVersion,
-		SavedAt:   time.Now().UTC(),
-		Graph:     g,
-		NodeCount: len(g.Nodes),
-		EdgeCount: len(g.Edges),
-	}
-
-	raw, err := json.Marshal(tmp.Graph)
-	if err != nil {
-		return err
-	}
-
-	sum := sha256.Sum256(raw)
-	tmp.Checksum = hex.EncodeToString(sum[:])
-
-	out, err := json.MarshalIndent(tmp, "", "  ")
-	if err != nil {
-		return err
-	}
-
-	// Atomic write: write to temp file in same directory, then rename
-	tmpPath := path + ".tmp"
-
-	// 1. Write to temp file
-	if err := os.WriteFile(tmpPath, out, 0644); err != nil {
-		return fmt.Errorf("write temp snapshot: %w", err)
-	}
-
-	// 2. Fsync temp file to ensure data is on disk
-	f, err := os.Open(tmpPath)
-	if err == nil {
-		_ = f.Sync()
-		_ = f.Close()
-	}
-
-	// 3. Backup existing file BEFORE rename (so .bak = previous good state)
-	if _, err := os.Stat(path); err == nil {
-		_ = copyFile(path, path+".bak")
-	}
-
-	// 4. Atomically replace primary file
-	if err := os.Rename(tmpPath, path); err != nil {
-		return fmt.Errorf("rename temp snapshot: %w", err)
-	}
-
-	return nil
-}
-
-func Load(path string) (*Snapshot, error) {
-	snap, _, err := LoadWithSource(path)
-	return snap, err
-}
-
-func LoadWithSource(path string) (*Snapshot, string, error) {
-	if snap, err := loadSnapshot(path); err == nil {
-		return snap, "primary", nil
-	} else if bakSnap, err2 := loadSnapshot(path + ".bak"); err2 == nil {
-		return bakSnap, "backup", nil
-	} else {
-		if isMissing(err) && isMissing(err2) {
-			return nil, "", ErrSnapshotMissing
-		}
-		return nil, "", fmt.Errorf("load snapshot failed: %w; backup failed: %w", err, err2)
-	}
-}
-
-func loadSnapshot(path string) (*Snapshot, error) {
-	b, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-
-	var snap Snapshot
-	if err := json.Unmarshal(b, &snap); err != nil {
-		return nil, err
-	}
-
-	if snap.Version != SnapshotVersion {
-		return nil, fmt.Errorf("%w: got %q want %q", ErrSnapshotVersionMismatch, snap.Version, SnapshotVersion)
-	}
-
-	raw, err := json.Marshal(snap.Graph)
-	if err != nil {
-		return nil, err
-	}
-
-	sum := sha256.Sum256(raw)
-	if hex.EncodeToString(sum[:]) != snap.Checksum {
-		return nil, errors.New("snapshot checksum mismatch")
-	}
-
-	return &snap, nil
-}
-
-func copyFile(src, dst string) error {
-	in, err := os.Open(src)
-	if err != nil {
-		return err
-	}
-	defer in.Close()
-
-	out, err := os.Create(dst)
-	if err != nil {
-		return err
-	}
-	defer out.Close()
-
-	if _, err := io.Copy(out, in); err != nil {
-		return err
-	}
-	return out.Sync()
-}
-
-func isMissing(err error) bool {
-	return err != nil && errors.Is(err, os.ErrNotExist)
-}
diff --git a/internal/persist/jsonSnapshot_test.go b/internal/persist/jsonSnapshot_test.go
deleted file mode 100644
index 081448a..0000000
--- a/internal/persist/jsonSnapshot_test.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package persist
-
-import (
-	"crypto/sha256"
-	"encoding/hex"
-	"encoding/json"
-	"errors"
-	"os"
-	"path/filepath"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-func TestLoadWithSourceRejectsVersionMismatch(t *testing.T) {
-	dir := t.TempDir()
-	path := filepath.Join(dir, "graph_snapshot.json")
-	writeSnapshotFile(t, path, "1")
-
-	snap, source, err := LoadWithSource(path)
-	if err == nil {
-		t.Fatalf("LoadWithSource returned snapshot=%v source=%q, want version mismatch error", snap, source)
-	}
-	if !errors.Is(err, ErrSnapshotVersionMismatch) {
-		t.Fatalf("errors.Is(err, ErrSnapshotVersionMismatch) = false, err=%v", err)
-	}
-	if errors.Is(err, ErrSnapshotMissing) {
-		t.Fatalf("version mismatch should not be reported as missing, err=%v", err)
-	}
-}
-
-func TestLoadWithSourceMissing(t *testing.T) {
-	dir := t.TempDir()
-	path := filepath.Join(dir, "graph_snapshot.json")
-
-	snap, source, err := LoadWithSource(path)
-	if !errors.Is(err, ErrSnapshotMissing) {
-		t.Fatalf("errors.Is(err, ErrSnapshotMissing) = false, err=%v", err)
-	}
-	if snap != nil || source != "" {
-		t.Fatalf("LoadWithSource returned snapshot=%v source=%q, want nil/empty on missing", snap, source)
-	}
-}
-
-func writeSnapshotFile(t *testing.T, path, version string) {
-	t.Helper()
-
-	g := core.New()
-	raw, err := json.Marshal(g)
-	if err != nil {
-		t.Fatalf("marshal graph: %v", err)
-	}
-
-	sum := sha256.Sum256(raw)
-	snap := Snapshot{
-		Version:   version,
-		SavedAt:   time.Unix(1700000000, 0).UTC(),
-		Checksum:  hex.EncodeToString(sum[:]),
-		Graph:     g,
-		NodeCount: len(g.Nodes),
-		EdgeCount: len(g.Edges),
-	}
-
-	out, err := json.MarshalIndent(snap, "", "  ")
-	if err != nil {
-		t.Fatalf("marshal snapshot: %v", err)
-	}
-	if err := os.WriteFile(path, out, 0o644); err != nil {
-		t.Fatalf("write snapshot: %v", err)
-	}
-}
diff --git a/internal/query/atoms.go b/internal/query/atoms.go
deleted file mode 100644
index 084e490..0000000
--- a/internal/query/atoms.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package query
-
-import (
-	"strconv"
-	"strings"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-type EqualsPredicate struct {
-	Field string
-	Value string
-}
-
-func (p EqualsPredicate) Eval(f store.RequestFacts) bool {
-	switch p.Field {
-	case "service":
-		for _, s := range f.Services {
-			if s == p.Value {
-				return true
-			}
-		}
-	case "error", "error_code":
-		for _, e := range f.Errors {
-			if e == p.Value {
-				return true
-			}
-		}
-	case "tier", "user_tier":
-		return f.UserTier == p.Value
-	case "user_id":
-		return f.UserID == p.Value
-	case "user_region":
-		return f.UserRegion == p.Value
-	case "user_vip":
-		expected, err := strconv.ParseBool(strings.TrimSpace(p.Value))
-		if err != nil {
-			return false
-		}
-		return f.UserVIP == expected
-	case "flag", "feature_flag":
-		return f.HasFeatureFlag(p.Value)
-	case "version":
-		return f.Version == p.Value
-	case "status":
-		return f.Status == p.Value
-	}
-	return false
-}
diff --git a/internal/query/atoms_test.go b/internal/query/atoms_test.go
deleted file mode 100644
index 10b030f..0000000
--- a/internal/query/atoms_test.go
+++ /dev/null
@@ -1,63 +0,0 @@
-package query
-
-import (
-	"testing"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-func TestEqualsPredicateEval_FlattenedFacts(t *testing.T) {
-	f := store.RequestFacts{
-		Services:     []string{"checkout"},
-		Errors:       []string{"PMT_502"},
-		UserID:       "user-123",
-		UserTier:     "premium",
-		UserVIP:      true,
-		UserRegion:   "us-west-2",
-		FeatureFlags: []string{"flag-a", "flag-b"},
-		Status:       "failed",
-		Version:      "v2",
-	}
-
-	cases := []struct {
-		name  string
-		field string
-		value string
-		want  bool
-	}{
-		{"service", "service", "checkout", true},
-		{"error", "error_code", "PMT_502", true},
-		{"feature_flag", "feature_flag", "flag-a", true},
-		{"flag alias", "flag", "flag-b", true},
-		{"tier alias", "tier", "premium", true},
-		{"user_tier", "user_tier", "premium", true},
-		{"user_id", "user_id", "user-123", true},
-		{"user_region", "user_region", "us-west-2", true},
-		{"user_vip true", "user_vip", "true", true},
-		{"user_vip false", "user_vip", "false", false},
-		{"version", "version", "v2", true},
-		{"status", "status", "failed", true},
-		{"missing", "feature_flag", "missing", false},
-	}
-
-	for _, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			got := EqualsPredicate{Field: tc.field, Value: tc.value}.Eval(f)
-			if got != tc.want {
-				t.Fatalf("Eval(%s=%s) = %v, want %v", tc.field, tc.value, got, tc.want)
-			}
-		})
-	}
-}
-
-func TestEqualsPredicateEval_TierAlias(t *testing.T) {
-	f := store.RequestFacts{
-		UserTier: "standard",
-	}
-	if !(EqualsPredicate{Field: "tier", Value: "standard"}.Eval(f)) {
-		t.Fatal("tier should match UserTier")
-	}
-	if !(EqualsPredicate{Field: "user_tier", Value: "standard"}.Eval(f)) {
-		t.Fatal("user_tier should match UserTier")
-	}
-}
diff --git a/internal/query/logical.go b/internal/query/logical.go
deleted file mode 100644
index 27a4fda..0000000
--- a/internal/query/logical.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package query
-
-import "github.com/sssmaran/WaylogCLI/internal/graph/store"
-
-type AndPredicate struct {
-	Left, Right Predicate
-}
-
-func (p AndPredicate) Eval(f store.RequestFacts) bool {
-	return p.Left.Eval(f) && p.Right.Eval(f)
-}
-
-type OrPredicate struct {
-	Left, Right Predicate
-}
-
-func (p OrPredicate) Eval(f store.RequestFacts) bool {
-	return p.Left.Eval(f) || p.Right.Eval(f)
-}
diff --git a/internal/query/parser.go b/internal/query/parser.go
deleted file mode 100644
index d731431..0000000
--- a/internal/query/parser.go
+++ /dev/null
@@ -1,82 +0,0 @@
-package query
-
-import (
-	"fmt"
-	"strings"
-)
-
-func Parse(expr string) (Predicate, error) {
-	tokens := tokenize(expr)
-	if len(tokens) == 0 {
-		return nil, fmt.Errorf("empty query")
-	}
-	p, rest, err := parseOr(tokens)
-	if err != nil {
-		return nil, err
-	}
-	if len(rest) != 0 {
-		return nil, fmt.Errorf("unexpected tokens: %v", rest)
-	}
-	return p, nil
-}
-
-func tokenize(s string) []string {
-	s = strings.ReplaceAll(s, "(", " ( ")
-	s = strings.ReplaceAll(s, ")", " ) ")
-	return strings.Fields(s)
-}
-
-// helper funcs
-func parseOr(tokens []string) (Predicate, []string, error) {
-	left, rest, err := parseAnd(tokens)
-	if err != nil {
-		return nil, nil, err
-	}
-	for len(rest) > 0 && strings.ToUpper(rest[0]) == "OR" {
-		var right Predicate
-		right, rest, err = parseAnd(rest[1:])
-		if err != nil {
-			return nil, nil, err
-		}
-		left = OrPredicate{Left: left, Right: right}
-	}
-	return left, rest, nil
-}
-
-func parseAnd(tokens []string) (Predicate, []string, error) {
-	left, rest, err := parseAtom(tokens)
-	if err != nil {
-		return nil, nil, err
-	}
-	for len(rest) > 0 && strings.ToUpper(rest[0]) == "AND" {
-		var right Predicate
-		right, rest, err = parseAtom(rest[1:])
-		if err != nil {
-			return nil, nil, err
-		}
-		left = AndPredicate{Left: left, Right: right}
-	}
-	return left, rest, nil
-}
-
-func parseAtom(tokens []string) (Predicate, []string, error) {
-	if tokens[0] == "(" {
-		p, rest, err := parseOr(tokens[1:])
-		if err != nil {
-			return nil, nil, err
-		}
-		if len(rest) == 0 || rest[0] != ")" {
-			return nil, nil, fmt.Errorf("missing )")
-		}
-		return p, rest[1:], nil
-	}
-
-	parts := strings.Split(tokens[0], "=")
-	if len(parts) != 2 {
-		return nil, nil, fmt.Errorf("invalid expression: %s", tokens[0])
-	}
-	return EqualsPredicate{
-		Field: parts[0],
-		Value: parts[1],
-	}, tokens[1:], nil
-}
diff --git a/internal/query/predicate.go b/internal/query/predicate.go
deleted file mode 100644
index 52b708f..0000000
--- a/internal/query/predicate.go
+++ /dev/null
@@ -1,7 +0,0 @@
-package query
-
-import "github.com/sssmaran/WaylogCLI/internal/graph/store"
-
-type Predicate interface {
-	Eval(f store.RequestFacts) bool
-}
diff --git a/internal/ratelimit/ratelimit.go b/internal/ratelimit/ratelimit.go
new file mode 100644
index 0000000..d65666c
--- /dev/null
+++ b/internal/ratelimit/ratelimit.go
@@ -0,0 +1,109 @@
+// Package ratelimit provides a per-key token-bucket limiter for the HTTP
+// surface. Requests are keyed by the presented credential (Bearer or
+// X-API-Key), valid or not, so one leaked or misbehaving key cannot starve
+// others; requests without a credential are keyed by client IP. Each distinct
+// credential gets its own bucket, so guessing many keys is not throttled here.
+package ratelimit
+
+import (
+	"container/list"
+	"log/slog"
+	"net"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/metrics"
+	"golang.org/x/time/rate"
+)
+
+// maxKeys bounds the bucket map. When it is full, the least-recently-used key
+// is evicted to make room — so a flood of attacker-generated keys drops the
+// cold attacker entries rather than wiping the rate state of hot legitimate
+// keys.
+const maxKeys = 10000
+
+type bucket struct { // one LRU entry
+	key string        // the rate-limit key; lets eviction delete the matching map entry
+	lim *rate.Limiter // token bucket for key
+}
+
+// Limiter is a per-key token bucket at rps requests/second with burst = rps,
+// backed by an LRU bounded at maxKeys entries. New returns nil for rps <= 0,
+// and a nil *Limiter disables limiting (Allow always returns true).
+type Limiter struct {
+	rps     int                      // per-key refill rate (tokens/second) and burst size
+	mu      sync.Mutex               // guards lru and buckets
+	lru     *list.List               // *bucket, most-recently-used at front
+	buckets map[string]*list.Element // key -> its element in lru
+}
+
+func New(rps int) *Limiter { // New builds a Limiter allowing rps requests/second per key
+	if rps <= 0 {
+		return nil // a nil *Limiter is the "disabled" value: Allow always returns true
+	}
+	return &Limiter{rps: rps, lru: list.New(), buckets: map[string]*list.Element{}}
+}
+
+func (l *Limiter) Allow(key string, now time.Time) bool { // reports whether one request for key is admitted at now
+	if l == nil {
+		return true
+	}
+	l.mu.Lock()
+	var lim *rate.Limiter
+	if el, ok := l.buckets[key]; ok {
+		l.lru.MoveToFront(el) // known key: refresh its recency
+		lim = el.Value.(*bucket).lim
+	} else {
+		if l.lru.Len() >= maxKeys { // at capacity: evict the least-recently-used key first
+			if oldest := l.lru.Back(); oldest != nil {
+				l.lru.Remove(oldest)
+				delete(l.buckets, oldest.Value.(*bucket).key)
+			}
+		}
+		lim = rate.NewLimiter(rate.Limit(l.rps), l.rps) // rate and burst are both rps
+		l.buckets[key] = l.lru.PushFront(&bucket{key: key, lim: lim})
+	}
+	l.mu.Unlock()
+	// lim is used after unlocking: the pointer stays valid even if another goroutine evicts its bucket first.
+	return lim.AllowN(now, 1)
+}
+
+// Middleware throttles requests through l; a nil l returns next unwrapped. On
+// rejection it responds 429 + Retry-After: 1 (plain text, matching the auth
+// middleware style) and, if m is non-nil, bumps m.RateLimited for scope.
+func Middleware(l *Limiter, scope string, m *metrics.Metrics) func(http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		if l == nil {
+			return next
+		}
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if !l.Allow(keyFromRequest(r), time.Now()) {
+				if m != nil {
+					m.RateLimited.WithLabelValues(scope).Inc()
+				}
+				slog.Debug("rate limited", "scope", scope, "path", r.URL.Path)
+				w.Header().Set("Retry-After", "1") // set before http.Error writes the status line
+				http.Error(w, "rate limit exceeded", http.StatusTooManyRequests)
+				return
+			}
+			next.ServeHTTP(w, r)
+		})
+	}
+}
+
+func keyFromRequest(r *http.Request) string { // key precedence: bearer token, X-API-Key, client IP
+	if auth := r.Header.Get("Authorization"); auth != "" {
+		if idx := strings.IndexByte(auth, ' '); idx > 0 && strings.EqualFold(auth[:idx], "bearer") {
+			return auth[idx+1:] // everything after the first space; the scheme match is case-insensitive
+		}
+	}
+	if k := r.Header.Get("X-API-Key"); k != "" {
+		return k
+	}
+	if host, _, err := net.SplitHostPort(r.RemoteAddr); err == nil {
+		return host // port dropped, so the key is the client host only
+	}
+	return r.RemoteAddr // not in host:port form; use it verbatim
+}
diff --git a/internal/ratelimit/ratelimit_test.go b/internal/ratelimit/ratelimit_test.go
new file mode 100644
index 0000000..ebe07eb
--- /dev/null
+++ b/internal/ratelimit/ratelimit_test.go
@@ -0,0 +1,155 @@
+package ratelimit
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+func TestAllowPerKeyIsolation(t *testing.T) {
+	l := New(2) // 2 rps, burst 2
+	now := time.Now()
+	if !l.Allow("a", now) || !l.Allow("a", now) {
+		t.Fatal("requests within burst must be admitted")
+	}
+	if l.Allow("a", now) {
+		t.Fatal("request over budget must be denied")
+	}
+	if !l.Allow("b", now) {
+		t.Fatal("an exhausted key must not affect other keys")
+	}
+}
+
+func TestAllowRefill(t *testing.T) {
+	l := New(1)
+	now := time.Now()
+	if !l.Allow("k", now) {
+		t.Fatal("first request must pass")
+	}
+	if l.Allow("k", now) {
+		t.Fatal("second immediate request must be denied at 1 rps")
+	}
+	if !l.Allow("k", now.Add(1100*time.Millisecond)) {
+		t.Fatal("token must refill after ~1s")
+	}
+}
+
+func TestDisabledLimiterAlwaysAllows(t *testing.T) {
+	for _, l := range []*Limiter{New(0), New(-1), nil} {
+		for i := 0; i < 50; i++ {
+			if !l.Allow("k", time.Now()) {
+				t.Fatal("disabled limiter must always allow")
+			}
+		}
+	}
+}
+
+func TestBucketCountIsBounded(t *testing.T) {
+	l := New(1)
+	now := time.Now()
+	for i := 0; i < maxKeys+10; i++ {
+		l.Allow(fmt.Sprintf("k%d", i), now)
+	}
+	l.mu.Lock()
+	n := len(l.buckets)
+	l.mu.Unlock()
+	if n > maxKeys {
+		t.Fatalf("bucket map must stay bounded: %d > %d", n, maxKeys)
+	}
+}
+
+func TestEvictionKeepsRecentlyUsedKeyThrottled(t *testing.T) {
+	l := New(1) // 1 rps, burst 1
+	now := time.Now()
+
+	// Exhaust a legitimate hot key.
+	if !l.Allow("real", now) {
+		t.Fatal("first request for hot key should pass")
+	}
+	if l.Allow("real", now) {
+		t.Fatal("hot key should be throttled after consuming its single token")
+	}
+
+	// An attacker churns far more than maxKeys distinct fake credentials while
+	// the legitimate key keeps receiving traffic. LRU eviction must drop the
+	// cold attacker keys, never the hot one — so the hot key stays throttled.
+	for i := 0; i < maxKeys*2; i++ {
+		l.Allow(fmt.Sprintf("fake-%d", i), now)
+		if i%10 == 0 {
+			if l.Allow("real", now) {
+				t.Fatalf("hot key's bucket was reset by eviction churn at i=%d", i)
+			}
+		}
+	}
+}
+
+func TestMiddlewareReturns429WithRetryAfter(t *testing.T) {
+	l := New(1)
+	var hits int
+	h := Middleware(l, "write", nil)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		hits++
+		w.WriteHeader(http.StatusAccepted)
+	}))
+
+	send := func(key string) *httptest.ResponseRecorder {
+		req := httptest.NewRequest(http.MethodPost, "/v1/events", nil)
+		if key != "" {
+			req.Header.Set("Authorization", "Bearer "+key)
+		}
+		rr := httptest.NewRecorder()
+		h.ServeHTTP(rr, req)
+		return rr
+	}
+
+	if rr := send("key1"); rr.Code != http.StatusAccepted {
+		t.Fatalf("first request: %d", rr.Code)
+	}
+	rr := send("key1")
+	if rr.Code != http.StatusTooManyRequests {
+		t.Fatalf("second request must be throttled, got %d", rr.Code)
+	}
+	if rr.Header().Get("Retry-After") != "1" {
+		t.Fatalf("429 must carry Retry-After: 1, got %q", rr.Header().Get("Retry-After"))
+	}
+	if rr := send("key2"); rr.Code != http.StatusAccepted {
+		t.Fatalf("other key must not be throttled: %d", rr.Code)
+	}
+	if hits != 2 {
+		t.Fatalf("handler hits = %d, want 2", hits)
+	}
+}
+
+func TestMiddlewareNilLimiterPassesThrough(t *testing.T) {
+	h := Middleware(nil, "read", nil)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	for i := 0; i < 20; i++ {
+		rr := httptest.NewRecorder()
+		h.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, "/", nil))
+		if rr.Code != http.StatusOK {
+			t.Fatalf("nil limiter must pass through, got %d", rr.Code)
+		}
+	}
+}
+
+func TestKeyFromRequestFallsBackToClientIP(t *testing.T) {
+	bearer := httptest.NewRequest(http.MethodGet, "/", nil)
+	bearer.Header.Set("Authorization", "Bearer tok123")
+	if got := keyFromRequest(bearer); got != "tok123" {
+		t.Fatalf("bearer key = %q", got)
+	}
+
+	apiKey := httptest.NewRequest(http.MethodGet, "/", nil)
+	apiKey.Header.Set("X-API-Key", "xk1")
+	if got := keyFromRequest(apiKey); got != "xk1" {
+		t.Fatalf("x-api-key = %q", got)
+	}
+
+	anon := httptest.NewRequest(http.MethodGet, "/", nil)
+	anon.RemoteAddr = "10.1.2.3:5544"
+	if got := keyFromRequest(anon); got != "10.1.2.3" {
+		t.Fatalf("ip fallback = %q", got)
+	}
+}
diff --git a/internal/reports/reports.go b/internal/reports/reports.go
index 243ee11..f275d87 100644
--- a/internal/reports/reports.go
+++ b/internal/reports/reports.go
@@ -45,8 +45,9 @@ func Markdown(rep *pkgtriage.Report) string {
 	fmt.Fprintln(&b)
 	fmt.Fprintf(&b, "## Summary\n\n")
 	fmt.Fprintf(&b, "- Incident: `%s` (report `%s`)\n", nz(rep.IncidentRef.ID), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Evidence fingerprint: `%s` (stable across ticks until evidence changes)\n", nz(rep.EvidenceFingerprint))
 	fmt.Fprintf(&b, "- Confidence: `%s` (incident `%s`, report `%s`)\n", nz(string(rep.Confidence)), nz(rep.IncidentRef.ID), nz(rep.ReportHash))
-	fmt.Fprintf(&b, "- Evidence status: alert=%s trace=%s signal=%s (report `%s`)\n", availability(len(rep.Alerts) > 0), availability(len(rep.SampleTraces) > 0), availability(len(rep.Signals) > 0), nz(rep.ReportHash))
+	fmt.Fprintf(&b, "- Evidence status: alert=%s trace=%s signal=%s runtime=%s (report `%s`)\n", availability(len(rep.Alerts) > 0), availability(len(rep.SampleTraces) > 0), availability(len(rep.Signals) > 0), availability(len(rep.Runtime) > 0), nz(rep.ReportHash))
 	fmt.Fprintf(&b, "- Window: `%s` (incident `%s`)\n\n", nz(rep.IncidentRef.Window), nz(rep.IncidentRef.ID))
 
 	fmt.Fprintf(&b, "## Impact\n\n")
@@ -78,6 +79,15 @@ func Markdown(rep *pkgtriage.Report) string {
 		}
 	}
 
+	fmt.Fprintf(&b, "\n## Runtime Evidence\n\n")
+	if len(rep.Runtime) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
+	} else {
+		for _, r := range rep.Runtime {
+			fmt.Fprintf(&b, "- `%s` %s on `%s`: %s (source `%s`, signal `%s`, report `%s`)\n", nz(r.Severity), nz(r.Subtype), nz(r.Service), nz(r.Reason), nz(r.Source), nz(r.SignalID), nz(rep.ReportHash))
+		}
+	}
+
 	fmt.Fprintf(&b, "\n## Signals\n\n")
 	if len(rep.Signals) == 0 {
 		fmt.Fprintf(&b, "- not available (report `%s`)\n", nz(rep.ReportHash))
@@ -105,6 +115,7 @@ func Slack(rep *pkgtriage.Report) map[string]any {
 		{"type": "mrkdwn", "text": "*Impact*\n" + impactSummary(rep)},
 		{"type": "mrkdwn", "text": "*Trace*\n" + firstTrace(rep)},
 		{"type": "mrkdwn", "text": "*Report hash*\n`" + nz(rep.ReportHash) + "`"},
+		{"type": "mrkdwn", "text": "*Evidence fingerprint*\n`" + nz(rep.EvidenceFingerprint) + "`"},
 	}
 	alertText := "not available"
 	if len(rep.Alerts) > 0 {
@@ -116,6 +127,7 @@ func Slack(rep *pkgtriage.Report) map[string]any {
 			{"type": "header", "text": map[string]string{"type": "plain_text", "text": "Waylog operator report"}},
 			{"type": "section", "fields": fields},
 			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Alert evidence*\n" + alertText}},
+			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Runtime evidence*\n" + runtimeSummary(rep)}},
 			{"type": "section", "text": map[string]string{"type": "mrkdwn", "text": "*Next check*\n" + firstCheck(rep)}},
 		},
 	}
@@ -127,8 +139,8 @@ func PagerDuty(rep *pkgtriage.Report) string {
 		a := rep.Alerts[0]
 		alert = fmt.Sprintf("%s alert %s via signal %s provider=%s", nz(a.Source), nz(a.AlertID), nz(a.SignalID), nz(a.ProviderURL))
 	}
-	return fmt.Sprintf("Waylog operator report: incident=%s confidence=%s impact=%s trace=%s report_hash=%s alert=%s next_check=%s",
-		nz(rep.IncidentRef.ID), nz(string(rep.Confidence)), impactSummary(rep), firstTrace(rep), nz(rep.ReportHash), alert, firstCheck(rep))
+	return fmt.Sprintf("Waylog operator report: incident=%s confidence=%s impact=%s trace=%s report_hash=%s evidence_fingerprint=%s alert=%s runtime=%s next_check=%s",
+		nz(rep.IncidentRef.ID), nz(string(rep.Confidence)), impactSummary(rep), firstTrace(rep), nz(rep.ReportHash), nz(rep.EvidenceFingerprint), alert, runtimeSummary(rep), firstCheck(rep))
 }
 
 func EncodeBody(r Rendered) ([]byte, error) {
@@ -155,6 +167,19 @@ func firstTrace(rep *pkgtriage.Report) string {
 	return "`" + nz(rep.SampleTraces[0].TraceID) + "` (incident `" + nz(rep.IncidentRef.ID) + "`, report `" + nz(rep.ReportHash) + "`)"
 }
 
+// runtimeSummary renders matched runtime evidence as a compact "; "-joined list
+// (infra and app rows together), citing the report hash. Used by Slack and PagerDuty.
+func runtimeSummary(rep *pkgtriage.Report) string {
+	if len(rep.Runtime) == 0 {
+		return "not available (report `" + nz(rep.ReportHash) + "`)"
+	}
+	parts := make([]string, 0, len(rep.Runtime))
+	for _, r := range rep.Runtime {
+		parts = append(parts, fmt.Sprintf("`%s` %s on %s (%s)", nz(r.Severity), nz(r.Subtype), nz(r.Service), nz(r.Source)))
+	}
+	return strings.Join(parts, "; ") + " (report `" + nz(rep.ReportHash) + "`)"
+}
+
 func impactSummary(rep *pkgtriage.Report) string {
 	return fmt.Sprintf("%d requests, %d users, %d services (incident `%s`, report `%s`)",
 		rep.BlastSnapshot.Requests, rep.BlastSnapshot.Users, rep.BlastSnapshot.Services, nz(rep.IncidentRef.ID), nz(rep.ReportHash))
diff --git a/internal/reports/reports_test.go b/internal/reports/reports_test.go
index 40fe239..fc8be13 100644
--- a/internal/reports/reports_test.go
+++ b/internal/reports/reports_test.go
@@ -12,10 +12,12 @@ func TestMarkdownReportCitesEvidence(t *testing.T) {
 	out := Markdown(testReport())
 	for _, want := range []string{
 		"# Waylog Operator Report",
-		"Evidence status: alert=present trace=present signal=present (report `sha256:test`)",
+		"Evidence status: alert=present trace=present signal=present runtime=present (report `sha256:test`)",
 		"Requests: 12 (incident `inc_abc`, report `sha256:test`)",
 		"trace `trace_1`: checkout payment failure (incident `inc_abc`, report `sha256:test`)",
 		"`critical` from `grafana`: PMT_502 spike; provider=https://grafana/alert/1 (signal `sig_alert`, alert `alert_1`, report `sha256:test`)",
+		"## Runtime Evidence",
+		"`critical` oom_killed on `checkout`: OOMKilled (source `k8s-demo`, signal `sig_oom`, report `sha256:test`)",
 		"Check payment health (check `check_0`, report `sha256:test`)",
 	} {
 		if !strings.Contains(out, want) {
@@ -36,7 +38,7 @@ func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
 	if !json.Valid(raw) {
 		t.Fatalf("invalid json: %s", raw)
 	}
-	for _, want := range []string{"Waylog operator report", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test"} {
+	for _, want := range []string{"Waylog operator report", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test", "Runtime evidence", "oom_killed on checkout"} {
 		if !strings.Contains(string(raw), want) {
 			t.Fatalf("slack payload missing %q:\n%s", want, raw)
 		}
@@ -45,7 +47,7 @@ func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
 
 func TestPagerDutyReportCitesEvidence(t *testing.T) {
 	out := PagerDuty(testReport())
-	for _, want := range []string{"Waylog operator report", "inc_abc", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test"} {
+	for _, want := range []string{"Waylog operator report", "inc_abc", "12 requests, 2 users, 3 services", "trace_1", "sig_alert", "alert_1", "sha256:test", "runtime=", "oom_killed on checkout"} {
 		if !strings.Contains(out, want) {
 			t.Fatalf("pagerduty missing %q:\n%s", want, out)
 		}
@@ -67,6 +69,7 @@ func testReport() *pkgtriage.Report {
 		SampleTraces: []pkgtriage.TraceSample{{TraceID: "trace_1", Summary: "checkout payment failure"}},
 		Signals:      []pkgtriage.SignalRef{{ID: "sig_alert", Type: "alert", EvidenceIDs: []string{"sig_alert"}}},
 		Alerts:       []pkgtriage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", ProviderURL: "https://grafana/alert/1", EvidenceIDs: []string{"sig_alert"}}},
+		Runtime:      []pkgtriage.RuntimeRef{{SignalID: "sig_oom", Subtype: "oom_killed", Service: "checkout", Source: "k8s-demo", Severity: "critical", Reason: "OOMKilled", OccurredAt: "2026-05-10T11:59:00Z"}},
 		NextChecks:   []pkgtriage.NextCheck{{ID: "check_0", Prompt: "Check payment health"}},
 		Confidence:   pkgtriage.ConfidenceHigh,
 		GeneratedAt:  "2026-05-10T12:00:00Z",
diff --git a/internal/testutil/testutil.go b/internal/testutil/testutil.go
index 6d504d8..14a5b2e 100644
--- a/internal/testutil/testutil.go
+++ b/internal/testutil/testutil.go
@@ -3,7 +3,6 @@ package testutil
 import (
 	"time"
 
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
 	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
@@ -188,24 +187,3 @@ func WithParentRequestID(id string) EventOption {
 		ev.ParentRequestID = id
 	}
 }
-
-// MakeGraph creates a graph with the given nodes and edges.
-func MakeGraph(nodes []core.Node, edges []core.Edge) *core.Graph {
-	g := core.New()
-	for _, n := range nodes {
-		g.AddNode(n)
-	}
-	for _, e := range edges {
-		g.AddEdge(e)
-	}
-	return g
-}
-
-// MakeNode creates a node with the given ID and type.
-func MakeNode(id string, nodeType core.NodeType, attr map[string]any) core.Node {
-	return core.Node{
-		ID:   id,
-		Type: nodeType,
-		Attr: attr,
-	}
-}
diff --git a/internal/tools/failures.go b/internal/tools/failures.go
deleted file mode 100644
index 7c9a288..0000000
--- a/internal/tools/failures.go
+++ /dev/null
@@ -1,553 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"sort"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/config"
-	"github.com/sssmaran/WaylogCLI/internal/graph/analysis"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-type failuresInput struct {
-	Tier   string `json:"tier"`
-	Limit  int    `json:"limit,omitempty"`
-	Offset int    `json:"offset,omitempty"`
-}
-
-type failureEntry struct {
-	RequestID   string `json:"request_id"`
-	TraceID     string `json:"trace_id,omitempty"`
-	LatencyMs   any    `json:"latency_ms,omitempty"`
-	Tier        string `json:"tier,omitempty"`
-	ErrorCode   string `json:"error_code,omitempty"`
-	ErrorPath   string `json:"error_path,omitempty"`
-	ErrorReason string `json:"error_reason,omitempty"`
-	RetryOf     int    `json:"retry_of,omitempty"`
-}
-
-type failuresOutput struct {
-	SchemaVersion string         `json:"schema_version"`
-	Failures      []failureEntry `json:"failures"`
-	TotalCount    int            `json:"total_count"`
-	HasMore       bool           `json:"has_more"`
-}
-
-type failureRecord struct {
-	entry  failureEntry
-	seenAt time.Time
-}
-
-func handleFailures(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input failuresInput
-	if len(params) > 0 {
-		if err := json.Unmarshal(params, &input); err != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-		}
-	}
-
-	g := store.Snapshot()
-	now := time.Now()
-	var out []failureRecord
-
-	store.ForEachRequestFact(time.Time{}, now, func(f graphstore.RequestFacts) {
-		userTier := f.UserTier
-		if input.Tier != "" && userTier != input.Tier {
-			return
-		}
-
-		errorCodes := uniqueStrings(f.Errors)
-		if len(errorCodes) == 0 {
-			errorCodes = requestErrorCodesFromGraph(g, f.RequestID)
-		}
-		if len(errorCodes) == 0 {
-			return
-		}
-
-		traceID := f.TraceID
-		if traceID == "" {
-			traceID = traceIDForRequest(g, f.RequestID)
-		}
-		latency := any(f.LatencyMs)
-		errPath, errReason, retryOf := requestErrorContext(g, f.RequestID)
-		for _, errorCode := range errorCodes {
-			out = append(out, failureRecord{
-				entry: failureEntry{
-					RequestID:   f.RequestID,
-					TraceID:     traceID,
-					LatencyMs:   latency,
-					Tier:        userTier,
-					ErrorCode:   errorCode,
-					ErrorPath:   errPath,
-					ErrorReason: errReason,
-					RetryOf:     retryOf,
-				},
-				seenAt: f.SeenAt,
-			})
-		}
-	})
-
-	sort.SliceStable(out, func(i, j int) bool {
-		if !out[i].seenAt.Equal(out[j].seenAt) {
-			return out[i].seenAt.After(out[j].seenAt)
-		}
-		if out[i].entry.RequestID != out[j].entry.RequestID {
-			return out[i].entry.RequestID < out[j].entry.RequestID
-		}
-		return out[i].entry.ErrorCode < out[j].entry.ErrorCode
-	})
-
-	failures := make([]failureEntry, 0, len(out))
-	for _, record := range out {
-		failures = append(failures, record.entry)
-	}
-
-	page, totalCount, hasMore := applyPagination(failures, input.Limit, input.Offset)
-	return failuresOutput{SchemaVersion: "1.0", Failures: page, TotalCount: totalCount, HasMore: hasMore}, nil
-}
-
-type patternsInput struct {
-	Window string `json:"window,omitempty"`
-	Limit  int    `json:"limit,omitempty"`
-	Offset int    `json:"offset,omitempty"`
-}
-
-type patternsOutput struct {
-	SchemaVersion string                    `json:"schema_version"`
-	Patterns      []analysis.FailurePattern `json:"patterns"`
-	TotalCount    int                       `json:"total_count"`
-	HasMore       bool                      `json:"has_more"`
-}
-
-func handleFailurePatterns(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input patternsInput
-	if len(params) > 0 {
-		if err := json.Unmarshal(params, &input); err != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-		}
-	}
-
-	if input.Window != "" {
-		d, err := time.ParseDuration(input.Window)
-		if err != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid window: %v", err)}
-		}
-		end := time.Now()
-		start := end.Add(-d)
-		g := store.Snapshot()
-		rollup := analysis.RollupWindow(g, store, traceStoreFrom(store), start, end)
-		patterns := analysis.FailurePatternsFromRollup(rollup)
-		page, totalCount, hasMore := applyPagination(patterns, input.Limit, input.Offset)
-		return patternsOutput{SchemaVersion: "1.0", Patterns: page, TotalCount: totalCount, HasMore: hasMore}, nil
-	}
-
-	g := store.Snapshot()
-	patterns := analysis.DetectFailurePatterns(g)
-	page, totalCount, hasMore := applyPagination(patterns, input.Limit, input.Offset)
-	return patternsOutput{SchemaVersion: "1.0", Patterns: page, TotalCount: totalCount, HasMore: hasMore}, nil
-}
-
-type blastInput struct {
-	ErrorCode       string `json:"error_code"`
-	IncludeServices bool   `json:"include_services,omitempty"`
-	TopUsers        int    `json:"top_users,omitempty"`
-	ByTier          bool   `json:"by_tier,omitempty"`
-	Limit           int    `json:"limit,omitempty"`
-	Offset          int    `json:"offset,omitempty"`
-}
-
-type blastService struct {
-	Service string `json:"service"`
-	Count   int    `json:"count"`
-}
-
-type blastTier struct {
-	Tier  string `json:"tier"`
-	Count int    `json:"count"`
-}
-
-type blastUser struct {
-	UserID string `json:"user_id"`
-	Count  int    `json:"count"`
-}
-
-type blastOutput struct {
-	SchemaVersion    string         `json:"schema_version"`
-	ErrorCode        string         `json:"error_code"`
-	AffectedRequests int            `json:"affected_requests"`
-	AffectedUsers    int            `json:"affected_users"`
-	VIPUsers         int            `json:"vip_users"`
-	SeverityScore    float64        `json:"severity_score"`
-	Services         []blastService `json:"services,omitempty"`
-	Tiers            []blastTier    `json:"tiers,omitempty"`
-	TopUsers         []blastUser    `json:"top_users,omitempty"`
-	FeatureFlags     []string       `json:"feature_flags,omitempty"`
-	TotalCount       int            `json:"total_count"`
-	HasMore          bool           `json:"has_more"`
-}
-
-func handleBlastRadius(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input blastInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.ErrorCode == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "error_code required"}
-	}
-
-	g := store.Snapshot()
-	ids, ready := store.ErrorIndex(input.ErrorCode)
-	requestIDs := map[string]struct{}{}
-	if ready {
-		for _, id := range ids {
-			requestIDs[id] = struct{}{}
-		}
-	}
-
-	weightRequest := config.GetenvFloat("BLAST_WEIGHT_REQUEST", 1.0)
-	weightVIP := config.GetenvFloat("BLAST_WEIGHT_VIP", 10.0)
-	weightPremium := config.GetenvFloat("BLAST_WEIGHT_PREMIUM", 3.0)
-	weightService := config.GetenvFloat("BLAST_WEIGHT_SERVICE", 5.0)
-
-	requests := map[string]bool{}
-	users := map[string]int{}
-	services := map[string]int{}
-	tiers := map[string]int{}
-	flags := map[string]bool{}
-	vipUsers := map[string]bool{}
-	premiumUsers := map[string]bool{}
-	now := time.Now()
-	store.ForEachRequestFact(time.Time{}, now, func(f graphstore.RequestFacts) {
-		if ready {
-			if _, ok := requestIDs[f.RequestID]; !ok {
-				return
-			}
-		} else if !f.HasError(input.ErrorCode) && !requestHasErrorCode(g, f.RequestID, input.ErrorCode) {
-			return
-		}
-
-		requests[f.RequestID] = true
-		collectBlastNeighbors(g, f, users, services, tiers, flags, vipUsers, premiumUsers)
-	})
-
-	out := blastOutput{
-		SchemaVersion:    "1.0",
-		ErrorCode:        input.ErrorCode,
-		AffectedRequests: len(requests),
-		AffectedUsers:    len(users),
-		VIPUsers:         len(vipUsers),
-		FeatureFlags:     sortedKeys(flags),
-	}
-	out.SeverityScore = float64(out.AffectedRequests)*weightRequest +
-		float64(out.VIPUsers)*weightVIP +
-		float64(len(premiumUsers))*weightPremium +
-		float64(len(services))*weightService
-
-	if input.IncludeServices {
-		allServices := mapCountToSortedServices(services)
-		out.TotalCount = len(allServices)
-		var svcHasMore bool
-		out.Services, _, svcHasMore = applyPagination(allServices, input.Limit, input.Offset)
-		out.HasMore = svcHasMore
-	} else {
-		out.TotalCount = out.AffectedRequests
-	}
-	if input.ByTier {
-		out.Tiers = mapCountToSortedTiers(tiers)
-	}
-	if input.TopUsers > 0 {
-		out.TopUsers = mapCountToTopUsers(users, input.TopUsers)
-	}
-
-	return out, nil
-}
-
-func collectBlastNeighbors(g *core.Graph, fact graphstore.RequestFacts, users map[string]int, services map[string]int, tiers map[string]int, flags map[string]bool, vipUsers map[string]bool, premiumUsers map[string]bool) {
-	serviceNames := uniqueStrings(fact.Services)
-	if len(serviceNames) == 0 {
-		serviceNames = requestServicesFromGraph(g, fact.RequestID)
-	}
-	for _, name := range serviceNames {
-		services[name]++
-	}
-
-	flagNames := uniqueStrings(fact.FeatureFlags)
-	if len(flagNames) == 0 {
-		flagNames = requestFeatureFlagsFromGraph(g, fact.RequestID)
-	}
-	for _, name := range flagNames {
-		flags[name] = true
-	}
-
-	userID := fact.UserID
-	userTier := fact.UserTier
-	userRegion := fact.UserRegion
-	userVIP := fact.UserVIP
-	if userID == "" || userTier == "" || userRegion == "" {
-		if fallbackID, fallbackTier, fallbackRegion, fallbackVIP, ok := requestUserInfoFromGraph(g, fact.RequestID); ok {
-			if userID == "" {
-				userID = fallbackID
-			}
-			if userTier == "" {
-				userTier = fallbackTier
-			}
-			if userRegion == "" {
-				userRegion = fallbackRegion
-			}
-			if !userVIP {
-				userVIP = fallbackVIP
-			}
-		}
-	}
-
-	if userID != "" {
-		users[userID]++
-	}
-	if userTier != "" {
-		tiers[userTier]++
-		if userTier == "premium" && userID != "" {
-			premiumUsers[userID] = true
-		}
-	}
-	if userVIP && userID != "" {
-		vipUsers[userID] = true
-	}
-}
-
-type chainInput struct {
-	RequestID string `json:"request_id"`
-}
-
-type chainOutput struct {
-	SchemaVersion string   `json:"schema_version"`
-	RequestID     string   `json:"request_id"`
-	Services      []string `json:"services"`
-}
-
-func handleFailureChain(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input chainInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.RequestID == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "request_id required"}
-	}
-
-	g := store.Snapshot()
-	var serviceID string
-	for _, e := range g.OutEdges[input.RequestID] {
-		if e.Type == core.EdgeHandledBy {
-			serviceID = e.To
-			break
-		}
-	}
-	if serviceID == "" {
-		return chainOutput{SchemaVersion: "1.0", RequestID: input.RequestID, Services: []string{}}, nil
-	}
-
-	visited := map[string]bool{}
-	var svcs []string
-	curr := serviceID
-
-	for {
-		if visited[curr] {
-			break
-		}
-		visited[curr] = true
-
-		svc, ok := g.Nodes[curr]
-		if !ok {
-			break
-		}
-		svcs = append(svcs, serviceNameForNode(svc))
-
-		next := ""
-		for _, e := range g.OutEdges[curr] {
-			if e.Type == core.EdgeCalls {
-				next = e.To
-				break
-			}
-		}
-		if next == "" {
-			break
-		}
-		curr = next
-	}
-
-	return chainOutput{SchemaVersion: "1.0", RequestID: input.RequestID, Services: svcs}, nil
-}
-
-func uniqueStrings(values []string) []string {
-	seen := map[string]struct{}{}
-	out := make([]string, 0, len(values))
-	for _, v := range values {
-		if v == "" {
-			continue
-		}
-		if _, ok := seen[v]; ok {
-			continue
-		}
-		seen[v] = struct{}{}
-		out = append(out, v)
-	}
-	return out
-}
-
-func traceIDForRequest(g *core.Graph, reqID string) string {
-	if g == nil {
-		return ""
-	}
-	req, ok := g.Nodes[reqID]
-	if !ok || req.Attr == nil {
-		return ""
-	}
-	traceID, _ := req.Attr["trace_id"].(string)
-	return traceID
-}
-
-func requestErrorContext(g *core.Graph, reqID string) (path, reason string, retryOf int) {
-	if g == nil {
-		return
-	}
-	req, ok := g.Nodes[reqID]
-	if !ok || req.Attr == nil {
-		return
-	}
-	if p, ok := req.Attr["error_path"].(string); ok {
-		path = p
-	}
-	if r, ok := req.Attr["error_reason"].(string); ok {
-		reason = r
-	}
-	switch v := req.Attr["retry_of"].(type) {
-	case int:
-		retryOf = v
-	case float64:
-		retryOf = int(v)
-	}
-	return
-}
-
-func requestErrorCodesFromGraph(g *core.Graph, reqID string) []string {
-	if g == nil {
-		return nil
-	}
-	seen := map[string]struct{}{}
-	var out []string
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeFailedWith {
-			continue
-		}
-		errNode, ok := g.Nodes[e.To]
-		if !ok || errNode.Attr == nil {
-			continue
-		}
-		code, _ := errNode.Attr["code"].(string)
-		if code == "" {
-			continue
-		}
-		if _, ok := seen[code]; ok {
-			continue
-		}
-		seen[code] = struct{}{}
-		out = append(out, code)
-	}
-	return out
-}
-
-func requestHasErrorCode(g *core.Graph, reqID, code string) bool {
-	for _, current := range requestErrorCodesFromGraph(g, reqID) {
-		if current == code {
-			return true
-		}
-	}
-	return false
-}
-
-func requestServicesFromGraph(g *core.Graph, reqID string) []string {
-	if g == nil {
-		return nil
-	}
-	seen := map[string]struct{}{}
-	var out []string
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeHandledBy {
-			continue
-		}
-		svc, ok := g.Nodes[e.To]
-		if !ok {
-			continue
-		}
-		name := serviceNameForNode(svc)
-		if name == "" {
-			continue
-		}
-		if _, ok := seen[name]; ok {
-			continue
-		}
-		seen[name] = struct{}{}
-		out = append(out, name)
-	}
-	return out
-}
-
-func requestFeatureFlagsFromGraph(g *core.Graph, reqID string) []string {
-	if g == nil {
-		return nil
-	}
-	seen := map[string]struct{}{}
-	var out []string
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeUsedFlag {
-			continue
-		}
-		flagNode, ok := g.Nodes[e.To]
-		if !ok || flagNode.Attr == nil {
-			continue
-		}
-		name, _ := flagNode.Attr["name"].(string)
-		if name == "" {
-			name = e.To
-		}
-		if _, ok := seen[name]; ok {
-			continue
-		}
-		seen[name] = struct{}{}
-		out = append(out, name)
-	}
-	return out
-}
-
-func requestUserInfoFromGraph(g *core.Graph, reqID string) (userID, tier, region string, vip bool, ok bool) {
-	if g == nil {
-		return "", "", "", false, false
-	}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeRequestBy {
-			continue
-		}
-		userNode, found := g.Nodes[e.To]
-		if !found {
-			return "", "", "", false, false
-		}
-		if userNode.Attr != nil {
-			userID, _ = userNode.Attr["id"].(string)
-			if userID == "" {
-				userID = e.To
-			}
-			tier, _ = userNode.Attr["tier"].(string)
-			region, _ = userNode.Attr["region"].(string)
-			vip, _ = userNode.Attr["vip"].(bool)
-		}
-		return userID, tier, region, vip, true
-	}
-	return "", "", "", false, false
-}
diff --git a/internal/tools/failures_dedup_test.go b/internal/tools/failures_dedup_test.go
deleted file mode 100644
index 3280f30..0000000
--- a/internal/tools/failures_dedup_test.go
+++ /dev/null
@@ -1,85 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"testing"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-)
-
-func makeStoreWithSingleSpanFailure() *graphstore.Store {
-	s := graphstore.NewStore()
-	b := build.NewBuilder()
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-		testutil.WithSpanID("1111111111111111"),
-		testutil.WithService("checkout"),
-		testutil.WithStatusCode(502),
-		testutil.WithError("PMT_502", "payment failed"),
-	)
-	s.Merge(b.Build(ev))
-	return s
-}
-
-func TestHandleInsightsDeduplicatesRequestAndSpanFailures(t *testing.T) {
-	store := makeStoreWithSingleSpanFailure()
-
-	outAny, err := handleInsights(context.Background(), store, json.RawMessage(`{}`))
-	if err != nil {
-		t.Fatalf("handleInsights returned error: %v", err)
-	}
-	out, ok := outAny.(insightsOutput)
-	if !ok {
-		t.Fatalf("unexpected output type %T", outAny)
-	}
-
-	if out.TotalFailures != 1 {
-		t.Fatalf("total_failures = %d, want 1", out.TotalFailures)
-	}
-	if len(out.TopErrors) != 1 {
-		t.Fatalf("top_errors len = %d, want 1", len(out.TopErrors))
-	}
-	if out.TopErrors[0].ErrorCode != "PMT_502" || out.TopErrors[0].Count != 1 {
-		t.Fatalf("top error = %+v, want PMT_502 count=1", out.TopErrors[0])
-	}
-	if len(out.TopServices) != 1 {
-		t.Fatalf("top_services len = %d, want 1", len(out.TopServices))
-	}
-	if out.TopServices[0].Service != "checkout" || out.TopServices[0].Count != 1 {
-		t.Fatalf("top service = %+v, want checkout count=1", out.TopServices[0])
-	}
-}
-
-func TestHandleBlastRadiusDeduplicatesRequestAndSpanFailures(t *testing.T) {
-	store := makeStoreWithSingleSpanFailure()
-
-	params := json.RawMessage(`{"error_code":"PMT_502","include_services":true,"by_tier":true,"top_users":5}`)
-	outAny, err := handleBlastRadius(context.Background(), store, params)
-	if err != nil {
-		t.Fatalf("handleBlastRadius returned error: %v", err)
-	}
-	out, ok := outAny.(blastOutput)
-	if !ok {
-		t.Fatalf("unexpected output type %T", outAny)
-	}
-
-	if out.AffectedRequests != 1 {
-		t.Fatalf("affected_requests = %d, want 1", out.AffectedRequests)
-	}
-	if out.AffectedUsers != 1 {
-		t.Fatalf("affected_users = %d, want 1", out.AffectedUsers)
-	}
-	if len(out.Services) != 1 || out.Services[0].Count != 1 {
-		t.Fatalf("services = %+v, want one entry with count=1", out.Services)
-	}
-	if len(out.Tiers) != 1 || out.Tiers[0].Count != 1 {
-		t.Fatalf("tiers = %+v, want one entry with count=1", out.Tiers)
-	}
-	if len(out.TopUsers) != 1 || out.TopUsers[0].Count != 1 {
-		t.Fatalf("top_users = %+v, want one entry with count=1", out.TopUsers)
-	}
-}
diff --git a/internal/tools/failures_test.go b/internal/tools/failures_test.go
deleted file mode 100644
index 368aaf8..0000000
--- a/internal/tools/failures_test.go
+++ /dev/null
@@ -1,243 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-type factStore struct {
-	facts    []store.RequestFacts
-	errors   map[string][]string
-	snapshot *core.Graph
-}
-
-func (s factStore) Snapshot() *core.Graph {
-	if s.snapshot != nil {
-		return s.snapshot
-	}
-	return core.New()
-}
-
-func (s factStore) SummarizeWindow(start, end time.Time) store.WindowSummary {
-	return store.WindowSummary{}
-}
-
-func (s factStore) ForEachRequestFact(start, end time.Time, fn func(store.RequestFacts)) {
-	for _, f := range s.facts {
-		if !f.SeenAt.IsZero() {
-			if f.SeenAt.Before(start) || f.SeenAt.After(end) {
-				continue
-			}
-		}
-		fn(f)
-	}
-}
-
-func (s factStore) ErrorIndex(errorCode string) ([]string, bool) {
-	ids, ok := s.errors[errorCode]
-	if !ok {
-		return nil, false
-	}
-	return append([]string(nil), ids...), true
-}
-
-func (s factStore) TraceStore() *tracestore.Store { return nil }
-
-func TestBlastRadius_SeverityScore(t *testing.T) {
-	s := store.NewStore()
-	b := build.NewBuilder()
-
-	// 3 requests failing with same error, one VIP user, one premium user
-	ev1 := testutil.MakeEvent(
-		testutil.WithTraceID("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-		testutil.WithUser("user-vip", "premium", "us"),
-		testutil.WithVIP(true),
-		testutil.WithError("BLAST_ERR", "boom"),
-		testutil.WithStatusCode(500),
-		testutil.WithService("svc-a"),
-	)
-	ev2 := testutil.MakeEvent(
-		testutil.WithTraceID("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-		testutil.WithUser("user-premium", "premium", "us"),
-		testutil.WithError("BLAST_ERR", "boom"),
-		testutil.WithStatusCode(500),
-		testutil.WithService("svc-a"),
-	)
-	ev3 := testutil.MakeEvent(
-		testutil.WithTraceID("cccccccccccccccccccccccccccccccc"),
-		testutil.WithUser("user-standard", "standard", "us"),
-		testutil.WithError("BLAST_ERR", "boom"),
-		testutil.WithStatusCode(500),
-		testutil.WithService("svc-b"),
-	)
-
-	s.Merge(b.Build(ev1))
-	s.Merge(b.Build(ev2))
-	s.Merge(b.Build(ev3))
-
-	params, _ := json.Marshal(blastInput{ErrorCode: "BLAST_ERR", IncludeServices: true})
-	result, err := handleBlastRadius(context.Background(), s, params)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	out := result.(blastOutput)
-
-	if out.AffectedRequests != 3 {
-		t.Errorf("AffectedRequests = %d, want 3", out.AffectedRequests)
-	}
-	if out.VIPUsers != 1 {
-		t.Errorf("VIPUsers = %d, want 1", out.VIPUsers)
-	}
-
-	// Default weights: request=1, vip=10, premium=3, service=5
-	// Score = 3*1 + 1*10 + 2*3 + 2*5 = 3 + 10 + 6 + 10 = 29
-	if out.SeverityScore != 29 {
-		t.Errorf("SeverityScore = %f, want 29", out.SeverityScore)
-	}
-}
-
-func TestBlastRadius_VIPAndPremiumUniqueUsers(t *testing.T) {
-	s := store.NewStore()
-	b := build.NewBuilder()
-
-	// Same VIP user fails twice with same error → should count as 1 VIP user
-	for i, tid := range []string{
-		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-		"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
-	} {
-		_ = i
-		ev := testutil.MakeEvent(
-			testutil.WithTraceID(tid),
-			testutil.WithUser("user-vip", "premium", "us"),
-			testutil.WithVIP(true),
-			testutil.WithError("DUP_ERR", "boom"),
-			testutil.WithStatusCode(500),
-			testutil.WithService("svc-a"),
-		)
-		s.Merge(b.Build(ev))
-	}
-
-	params, _ := json.Marshal(blastInput{ErrorCode: "DUP_ERR"})
-	result, err := handleBlastRadius(context.Background(), s, params)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	out := result.(blastOutput)
-
-	if out.VIPUsers != 1 {
-		t.Errorf("VIPUsers = %d, want 1 (unique)", out.VIPUsers)
-	}
-	if out.AffectedUsers != 1 {
-		t.Errorf("AffectedUsers = %d, want 1 (unique)", out.AffectedUsers)
-	}
-}
-
-func TestHandleBlastRadius_UsesFlattenedFacts(t *testing.T) {
-	now := time.Now()
-	store := factStore{
-		facts: []store.RequestFacts{
-			{
-				RequestID:    "req-1",
-				SeenAt:       now,
-				TraceID:      "trace-1",
-				Services:     []string{"checkout"},
-				Errors:       []string{"BLAST_ERR"},
-				UserID:       "user-vip",
-				UserTier:     "premium",
-				UserVIP:      true,
-				FeatureFlags: []string{"flag-a", "flag-b"},
-			},
-			{
-				RequestID:    "req-2",
-				SeenAt:       now.Add(-time.Second),
-				TraceID:      "trace-2",
-				Services:     []string{"payment"},
-				Errors:       []string{"BLAST_ERR"},
-				UserID:       "user-standard",
-				UserTier:     "standard",
-				FeatureFlags: []string{"flag-b"},
-			},
-		},
-		errors: map[string][]string{
-			"BLAST_ERR": []string{"req-1", "req-2"},
-		},
-	}
-
-	params, _ := json.Marshal(blastInput{ErrorCode: "BLAST_ERR", IncludeServices: true, ByTier: true, TopUsers: 5})
-	result, err := handleBlastRadius(context.Background(), store, params)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	out := result.(blastOutput)
-	if out.AffectedRequests != 2 {
-		t.Fatalf("AffectedRequests = %d, want 2", out.AffectedRequests)
-	}
-	if out.AffectedUsers != 2 {
-		t.Fatalf("AffectedUsers = %d, want 2", out.AffectedUsers)
-	}
-	if out.VIPUsers != 1 {
-		t.Fatalf("VIPUsers = %d, want 1", out.VIPUsers)
-	}
-	if len(out.FeatureFlags) != 2 || out.FeatureFlags[0] != "flag-a" || out.FeatureFlags[1] != "flag-b" {
-		t.Fatalf("FeatureFlags = %+v, want [flag-a flag-b]", out.FeatureFlags)
-	}
-	if len(out.Services) != 2 {
-		t.Fatalf("Services = %+v, want 2 entries", out.Services)
-	}
-	if len(out.Tiers) != 2 {
-		t.Fatalf("Tiers = %+v, want 2 entries", out.Tiers)
-	}
-	if len(out.TopUsers) != 2 {
-		t.Fatalf("TopUsers = %+v, want 2 entries", out.TopUsers)
-	}
-}
-
-func TestHandleFailures_UsesRequestFactsTraceAndTier(t *testing.T) {
-	now := time.Now()
-	store := factStore{
-		facts: []store.RequestFacts{
-			{
-				RequestID: "req-1",
-				TraceID:   "trace-1",
-				SeenAt:    now,
-				LatencyMs: 91,
-				UserTier:  "premium",
-				Errors:    []string{"ERR_A"},
-			},
-		},
-	}
-
-	params, _ := json.Marshal(failuresInput{Tier: "premium"})
-	result, err := handleFailures(context.Background(), store, params)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	out := result.(failuresOutput)
-	if out.TotalCount != 1 {
-		t.Fatalf("TotalCount = %d, want 1", out.TotalCount)
-	}
-	if len(out.Failures) != 1 {
-		t.Fatalf("Failures len = %d, want 1", len(out.Failures))
-	}
-	if out.Failures[0].TraceID != "trace-1" {
-		t.Fatalf("TraceID = %q, want trace-1", out.Failures[0].TraceID)
-	}
-	if out.Failures[0].Tier != "premium" {
-		t.Fatalf("Tier = %q, want premium", out.Failures[0].Tier)
-	}
-	if out.Failures[0].ErrorCode != "ERR_A" {
-		t.Fatalf("ErrorCode = %q, want ERR_A", out.Failures[0].ErrorCode)
-	}
-}
diff --git a/internal/tools/graph_tools.go b/internal/tools/graph_tools.go
deleted file mode 100644
index c810488..0000000
--- a/internal/tools/graph_tools.go
+++ /dev/null
@@ -1,144 +0,0 @@
-package tools
-
-import (
-	"encoding/json"
-)
-
-const (
-	toolGraphStatsName   = "graph_stats"
-	toolExplainReqName   = "explain_request"
-	toolTraceGraphName   = "trace_graph"
-	toolTraceSummaryName = "trace_summary"
-	toolFailuresName     = "graph_failures"
-	toolPatternsName     = "failure_patterns"
-	toolBlastName        = "blast_radius"
-	toolChainName        = "failure_chain"
-	toolQueryName        = "graph_query"
-	toolDiffName         = "compare_windows"
-	toolInsightsName     = "graph_insights"
-)
-
-func RegisterGraphTools(reg *Registry) error {
-	if err := reg.Register(Tool{
-		Name:         toolGraphStatsName,
-		Description:  "Return entity and relationship counts for the current graph snapshot.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(graphStatsInputSchema),
-		OutputSchema: json.RawMessage(graphStatsOutputSchema),
-		Handler:      handleGraphStats,
-		Examples:     []string{"show graph stats"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolExplainReqName,
-		Description:  "Explain why a request failed using deterministic graph evidence.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(explainRequestInputSchema),
-		OutputSchema: json.RawMessage(explainRequestOutputSchema),
-		Handler:      handleExplainRequest,
-		Examples:     []string{"explain request ", "why did checkout fail"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolTraceGraphName,
-		Description:  "Return the span tree for a trace ID from the graph snapshot.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(traceGraphInputSchema),
-		OutputSchema: json.RawMessage(traceGraphOutputSchema),
-		Handler:      handleTraceGraph,
-		Examples:     []string{"show trace "},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolTraceSummaryName,
-		Description:  "Summarize a trace with request type, latency, and service path.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(traceSummaryInputSchema),
-		OutputSchema: json.RawMessage(traceSummaryOutputSchema),
-		Handler:      handleTraceSummary,
-		Examples:     []string{"trace summary for "},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolFailuresName,
-		Description:  "List failed requests with optional tier filtering.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(failuresInputSchema),
-		OutputSchema: json.RawMessage(failuresOutputSchema),
-		Handler:      handleFailures,
-		Examples:     []string{"list all failures"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolPatternsName,
-		Description:  "Detect recurring failure patterns in the graph or a time window.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(patternsInputSchema),
-		OutputSchema: json.RawMessage(patternsOutputSchema),
-		Handler:      handleFailurePatterns,
-		Examples:     []string{"show failure patterns in the last hour"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolBlastName,
-		Description:  "Compute the blast radius for a specific error code.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(blastInputSchema),
-		OutputSchema: json.RawMessage(blastOutputSchema),
-		Handler:      handleBlastRadius,
-		Examples:     []string{"what is the blast radius of PMT_502", "which users are affected"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolChainName,
-		Description:  "Return the downstream service chain for a request.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(chainInputSchema),
-		OutputSchema: json.RawMessage(chainOutputSchema),
-		Handler:      handleFailureChain,
-		Examples:     []string{"failure chain for request "},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolQueryName,
-		Description:  "Evaluate a query expression over a time window.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(queryInputSchema),
-		OutputSchema: json.RawMessage(queryOutputSchema),
-		Handler:      handleGraphQuery,
-		Examples:     []string{"graph_query expr='error_code=PMT_502' window='10m'"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolDiffName,
-		Description:  "Compare error counts between two time windows.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(diffInputSchema),
-		OutputSchema: json.RawMessage(diffOutputSchema),
-		Handler:      handleCompareWindows,
-		Examples:     []string{"compare errors in last 10m vs 1h ago"},
-	}); err != nil {
-		return err
-	}
-	if err := reg.Register(Tool{
-		Name:         toolInsightsName,
-		Description:  "Summarize failures with top errors and services.",
-		Version:      "1.0",
-		InputSchema:  json.RawMessage(insightsInputSchema),
-		OutputSchema: json.RawMessage(insightsOutputSchema),
-		Handler:      handleInsights,
-		Examples:     []string{"show top errors", "what happened in the last 10 minutes"},
-	}); err != nil {
-		return err
-	}
-	return nil
-}
diff --git a/internal/tools/graph_tools_v2.go b/internal/tools/graph_tools_v2.go
new file mode 100644
index 0000000..d1aa623
--- /dev/null
+++ b/internal/tools/graph_tools_v2.go
@@ -0,0 +1,106 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+const (
+	toolExplainReqName = "explain_request"
+	toolBlastName      = "blast_radius"
+)
+
+// RegisterExplainRequestTool adds the explain_request tool to reg, answering
+// from incidents.Reader. Output shape is apiv2.StoryResponse — see
+// docs/superpowers/specs/2026-05-18-graph-to-incident-evidence-design.md.
+func RegisterExplainRequestTool(reg *Registry, reader incidents.Reader) error {
+	explain := func(_ context.Context, params json.RawMessage) (any, error) {
+		var in struct {
+			TraceID string `json:"trace_id"`
+		}
+		if err := json.Unmarshal(params, &in); err != nil {
+			return nil, fmt.Errorf("explain_request: bad params: %w", err)
+		}
+		if in.TraceID == "" {
+			return nil, fmt.Errorf("explain_request: trace_id required")
+		}
+		if story, found := reader.TraceStoryByTraceID(in.TraceID); found {
+			return story, nil
+		}
+		return nil, fmt.Errorf("explain_request: trace not found: %s", in.TraceID)
+	}
+	return reg.Register(Tool{
+		Name:        toolExplainReqName,
+		Description: "Return the trace story (per-step path, anchor, downstream) for a given trace_id.",
+		Version:     "explain.v2",
+		InputSchema: json.RawMessage(explainRequestV2InputSchema),
+		Handler:     explain,
+		Examples:    []string{"explain request "},
+	})
+}
+
+// RegisterBlastRadiusTool registers the blast_radius tool backed by
+// incidents.Reader. Output shape is apiv2.BlastRadiusResponse. The optional
+// window must be a positive Go duration; it defaults to 15m when omitted.
+func RegisterBlastRadiusTool(reg *Registry, reader incidents.Reader) error {
+	return reg.Register(Tool{
+		Name:        toolBlastName,
+		Description: "Aggregate impact (affected requests, users, services, top services, sample traces) for an error family in a window.",
+		Version:     "blast.v2",
+		InputSchema: json.RawMessage(blastRadiusV2InputSchema),
+		Handler: func(_ context.Context, params json.RawMessage) (any, error) {
+			var p struct {
+				Service   string `json:"service"`
+				Step      string `json:"step"`
+				ErrorCode string `json:"error_code"`
+				Window    string `json:"window"`
+			}
+			if err := json.Unmarshal(params, &p); err != nil {
+				return nil, fmt.Errorf("blast_radius: bad params: %w", err)
+			}
+			if p.Service == "" || p.Step == "" || p.ErrorCode == "" {
+				return nil, fmt.Errorf("blast_radius: service, step, error_code all required")
+			}
+			if p.Window == "" {
+				p.Window = "15m"
+			}
+			window, err := time.ParseDuration(p.Window)
+			if err != nil {
+				return nil, fmt.Errorf("blast_radius: bad window: %w", err)
+			}
+			if window <= 0 { // zero/negative would place Since at or after Until
+				return nil, fmt.Errorf("blast_radius: window must be positive, got %s", p.Window)
+			}
+			now := time.Now()
+			filter := incidents.SearchFilter{Since: now.Add(-window), Until: now}
+			return reader.BlastRadius(filter, apiv2.BlastKey{Service: p.Service, Step: p.Step, ErrorCode: p.ErrorCode}), nil
+		},
+		Examples: []string{"blast radius for payment-service/charge/DB_TIMEOUT in 15m"},
+	})
+}
+
+const explainRequestV2InputSchema = `{
+  "type": "object",
+  "required": ["trace_id"],
+  "properties": {
+    "trace_id": { "type": "string" }
+  },
+  "additionalProperties": false
+}`
+
+const blastRadiusV2InputSchema = `{
+  "type": "object",
+  "required": ["service", "step", "error_code"],
+  "properties": {
+    "service":    { "type": "string" },
+    "step":       { "type": "string" },
+    "error_code": { "type": "string" },
+    "window":     { "type": "string", "description": "Go duration (default 15m)" }
+  },
+  "additionalProperties": false
+}`
diff --git a/internal/tools/helpers.go b/internal/tools/helpers.go
deleted file mode 100644
index a09a593..0000000
--- a/internal/tools/helpers.go
+++ /dev/null
@@ -1,197 +0,0 @@
-package tools
-
-import (
-	"fmt"
-	"sort"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-// serviceChainForRequest returns the chain of services for a request.
-func serviceChainForRequest(g *core.Graph, reqID string) []string {
-	serviceID := ""
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type == core.EdgeHandledBy {
-			serviceID = e.To
-			break
-		}
-	}
-	if serviceID == "" {
-		return nil
-	}
-	visited := map[string]bool{}
-	var services []string
-	curr := serviceID
-	for {
-		if visited[curr] {
-			break
-		}
-		visited[curr] = true
-		svc, ok := g.Nodes[curr]
-		if !ok {
-			break
-		}
-		services = append(services, serviceNameForNode(svc))
-		next := ""
-		for _, e := range g.OutEdges[curr] {
-			if e.Type == core.EdgeCalls {
-				next = e.To
-				break
-			}
-		}
-		if next == "" {
-			break
-		}
-		curr = next
-	}
-	return services
-}
-
-// errorCodeForID returns the error code attribute for an error node ID.
-func errorCodeForID(g *core.Graph, id string) string {
-	n, ok := g.Nodes[id]
-	if !ok || n.Attr == nil {
-		return id
-	}
-	if code, ok := n.Attr["code"].(string); ok && code != "" {
-		return code
-	}
-	return id
-}
-
-// serviceNameForID returns the service name for a service node ID.
-func serviceNameForID(g *core.Graph, id string) string {
-	n, ok := g.Nodes[id]
-	if !ok {
-		return id
-	}
-	return serviceNameForNode(n)
-}
-
-// serviceNameForNode extracts the service name from a node's attributes.
-func serviceNameForNode(n core.Node) string {
-	if n.Attr == nil {
-		return n.ID
-	}
-	if name, ok := n.Attr["service"]; ok && name != nil {
-		return fmt.Sprintf("%v", name)
-	}
-	if name, ok := n.Attr["name"]; ok && name != nil {
-		return fmt.Sprintf("%v", name)
-	}
-	return n.ID
-}
-
-// sortedKeys returns sorted keys from a map.
-func sortedKeys(m map[string]bool) []string {
-	out := make([]string, 0, len(m))
-	for k := range m {
-		out = append(out, k)
-	}
-	sort.Strings(out)
-	return out
-}
-
-func mapCountToSortedServices(m map[string]int) []blastService {
-	type pair struct {
-		name  string
-		count int
-	}
-	var pairs []pair
-	for name, count := range m {
-		pairs = append(pairs, pair{name: name, count: count})
-	}
-	sort.Slice(pairs, func(i, j int) bool {
-		return pairs[i].count > pairs[j].count
-	})
-	out := make([]blastService, 0, len(pairs))
-	for _, p := range pairs {
-		out = append(out, blastService{Service: p.name, Count: p.count})
-	}
-	return out
-}
-
-func mapCountToSortedTiers(m map[string]int) []blastTier {
-	type pair struct {
-		name  string
-		count int
-	}
-	var pairs []pair
-	for name, count := range m {
-		pairs = append(pairs, pair{name: name, count: count})
-	}
-	sort.Slice(pairs, func(i, j int) bool {
-		return pairs[i].count > pairs[j].count
-	})
-	out := make([]blastTier, 0, len(pairs))
-	for _, p := range pairs {
-		out = append(out, blastTier{Tier: p.name, Count: p.count})
-	}
-	return out
-}
-
-func mapCountToTopUsers(m map[string]int, n int) []blastUser {
-	type pair struct {
-		id    string
-		count int
-	}
-	var pairs []pair
-	for id, count := range m {
-		pairs = append(pairs, pair{id: id, count: count})
-	}
-	sort.Slice(pairs, func(i, j int) bool {
-		return pairs[i].count > pairs[j].count
-	})
-	if n > len(pairs) {
-		n = len(pairs)
-	}
-	out := make([]blastUser, 0, n)
-	for i := 0; i < n; i++ {
-		out = append(out, blastUser{UserID: pairs[i].id, Count: pairs[i].count})
-	}
-	return out
-}
-
-func mapCountToTopErrors(m map[string]int, n int) []insightError {
-	type pair struct {
-		code  string
-		count int
-	}
-	var pairs []pair
-	for code, count := range m {
-		pairs = append(pairs, pair{code: code, count: count})
-	}
-	sort.Slice(pairs, func(i, j int) bool {
-		return pairs[i].count > pairs[j].count
-	})
-	if n > len(pairs) {
-		n = len(pairs)
-	}
-	out := make([]insightError, 0, n)
-	for i := 0; i < n; i++ {
-		out = append(out, insightError{ErrorCode: pairs[i].code, Count: pairs[i].count})
-	}
-	return out
-}
-
-func mapCountToTopServices(m map[string]int, n int) []insightService {
-	type pair struct {
-		name  string
-		count int
-	}
-	var pairs []pair
-	for name, count := range m {
-		pairs = append(pairs, pair{name: name, count: count})
-	}
-	sort.Slice(pairs, func(i, j int) bool {
-		return pairs[i].count > pairs[j].count
-	})
-	if n > len(pairs) {
-		n = len(pairs)
-	}
-	out := make([]insightService, 0, n)
-	for i := 0; i < n; i++ {
-		out = append(out, insightService{Service: pairs[i].name, Count: pairs[i].count})
-	}
-	return out
-}
diff --git a/internal/tools/insights.go b/internal/tools/insights.go
deleted file mode 100644
index 98abb5b..0000000
--- a/internal/tools/insights.go
+++ /dev/null
@@ -1,128 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/analysis"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-type explainRequestInput struct {
-	RequestID string `json:"request_id"`
-	TraceID   string `json:"trace_id"`
-}
-
-type explainRequestOutput struct {
-	SchemaVersion string                 `json:"schema_version"`
-	RequestID     string                 `json:"request_id"`
-	LatencyMs     any                    `json:"latency_ms,omitempty"`
-	Flow          any                    `json:"flow,omitempty"`
-	UserID        string                 `json:"user_id,omitempty"`
-	UserTier      any                    `json:"user_tier,omitempty"`
-	FeatureFlags  []string               `json:"feature_flags,omitempty"`
-	SpanID        string                 `json:"span_id,omitempty"`
-	SpanService   any                    `json:"span_service,omitempty"`
-	SpanDepth     string                 `json:"span_depth,omitempty"`
-	Service       any                    `json:"service,omitempty"`
-	ErrorCode     any                    `json:"error_code,omitempty"`
-	ErrorMsg      any                    `json:"error_msg,omitempty"`
-	SpanChain     []analysis.SpanSummary `json:"span_chain,omitempty"`
-}
-
-func handleExplainRequest(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input explainRequestInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.RequestID == "" && input.TraceID == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "request_id or trace_id required"}
-	}
-	requestID := input.RequestID
-	if requestID == "" {
-		requestID = core.ID("request", input.TraceID)
-	}
-	g := store.Snapshot()
-	ex, err := analysis.ExplainRequestWithTrace(g, traceStoreFrom(store), requestID)
-	if err != nil {
-		return nil, &ToolError{Code: CodeNotFound, Message: err.Error()}
-	}
-	return explainRequestOutput{
-		SchemaVersion: "1.0",
-		RequestID:     ex.RequestID,
-		LatencyMs:     ex.LatencyMs,
-		Flow:          ex.Flow,
-		UserID:        ex.UserID,
-		UserTier:      ex.UserTier,
-		FeatureFlags:  ex.FeatureFlags,
-		SpanID:        ex.SpanID,
-		SpanService:   ex.SpanService,
-		SpanDepth:     ex.SpanDepth,
-		Service:       ex.Service,
-		ErrorCode:     ex.ErrorCode,
-		ErrorMsg:      ex.ErrorMsg,
-		SpanChain:     ex.SpanChain,
-	}, nil
-}
-
-type insightsInput struct {
-	Window      string `json:"window,omitempty"`
-	TopErrors   int    `json:"top_errors,omitempty"`
-	TopServices int    `json:"top_services,omitempty"`
-}
-
-type insightError struct {
-	ErrorCode string `json:"error_code"`
-	Count     int    `json:"count"`
-}
-
-type insightService struct {
-	Service string `json:"service"`
-	Count   int    `json:"count"`
-}
-
-type insightsOutput struct {
-	SchemaVersion string           `json:"schema_version"`
-	TotalFailures int              `json:"total_failures"`
-	TopErrors     []insightError   `json:"top_errors,omitempty"`
-	TopServices   []insightService `json:"top_services,omitempty"`
-}
-
-func handleInsights(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input insightsInput
-	if len(params) > 0 {
-		if err := json.Unmarshal(params, &input); err != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-		}
-	}
-
-	if input.TopErrors == 0 {
-		input.TopErrors = 5
-	}
-	if input.TopServices == 0 {
-		input.TopServices = 5
-	}
-
-	g := store.Snapshot()
-	end := time.Now()
-	var start time.Time
-	if input.Window != "" {
-		d, err := time.ParseDuration(input.Window)
-		if err != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid window: %v", err)}
-		}
-		start = end.Add(-d)
-	}
-
-	sum := analysis.RollupWindow(g, store, traceStoreFrom(store), start, end)
-	return insightsOutput{
-		SchemaVersion: "1.0",
-		TotalFailures: sum.TotalFailures,
-		TopErrors:     mapCountToTopErrors(sum.PrimaryErrorCount, input.TopErrors),
-		TopServices:   mapCountToTopServices(sum.ServiceFailureCount, input.TopServices),
-	}, nil
-}
diff --git a/internal/tools/pagination.go b/internal/tools/pagination.go
deleted file mode 100644
index 76086e5..0000000
--- a/internal/tools/pagination.go
+++ /dev/null
@@ -1,28 +0,0 @@
-package tools
-
-// applyPagination slices a sorted list by offset/limit and returns
-// pagination metadata. Default limit: 100, max limit: 1000.
-func applyPagination[T any](items []T, limit, offset int) (page []T, totalCount int, hasMore bool) {
-	totalCount = len(items)
-
-	if limit <= 0 {
-		limit = 100
-	}
-	if limit > 1000 {
-		limit = 1000
-	}
-
-	if offset < 0 {
-		offset = 0
-	}
-	if offset >= totalCount {
-		return []T{}, totalCount, false
-	}
-
-	end := offset + limit
-	if end > totalCount {
-		end = totalCount
-	}
-
-	return items[offset:end], totalCount, end < totalCount
-}
diff --git a/internal/tools/pagination_test.go b/internal/tools/pagination_test.go
deleted file mode 100644
index 615fba1..0000000
--- a/internal/tools/pagination_test.go
+++ /dev/null
@@ -1,87 +0,0 @@
-package tools
-
-import "testing"
-
-func TestApplyPagination(t *testing.T) {
-	items := []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
-
-	t.Run("first page", func(t *testing.T) {
-		page, total, hasMore := applyPagination(items, 3, 0)
-		if total != 10 {
-			t.Errorf("total = %d, want 10", total)
-		}
-		if !hasMore {
-			t.Error("expected has_more=true")
-		}
-		if len(page) != 3 {
-			t.Errorf("len = %d, want 3", len(page))
-		}
-	})
-
-	t.Run("middle page", func(t *testing.T) {
-		page, _, hasMore := applyPagination(items, 3, 3)
-		if !hasMore {
-			t.Error("expected has_more=true")
-		}
-		if len(page) != 3 {
-			t.Errorf("len = %d, want 3", len(page))
-		}
-		if page[0] != 4 {
-			t.Errorf("first = %d, want 4", page[0])
-		}
-	})
-
-	t.Run("last page", func(t *testing.T) {
-		page, _, hasMore := applyPagination(items, 3, 9)
-		if hasMore {
-			t.Error("expected has_more=false")
-		}
-		if len(page) != 1 {
-			t.Errorf("len = %d, want 1", len(page))
-		}
-	})
-
-	t.Run("offset beyond length", func(t *testing.T) {
-		page, total, hasMore := applyPagination(items, 3, 20)
-		if hasMore {
-			t.Error("expected has_more=false")
-		}
-		if len(page) != 0 {
-			t.Errorf("len = %d, want 0", len(page))
-		}
-		if total != 10 {
-			t.Errorf("total = %d, want 10", total)
-		}
-	})
-
-	t.Run("zero limit uses default", func(t *testing.T) {
-		page, _, _ := applyPagination(items, 0, 0)
-		if len(page) != 10 {
-			t.Errorf("len = %d, want 10 (default 100, clamped to len)", len(page))
-		}
-	})
-
-	t.Run("negative offset treated as zero", func(t *testing.T) {
-		page, total, hasMore := applyPagination(items, 3, -1)
-		if total != 10 {
-			t.Errorf("total = %d, want 10", total)
-		}
-		if !hasMore {
-			t.Error("expected has_more=true")
-		}
-		if len(page) != 3 {
-			t.Errorf("len = %d, want 3", len(page))
-		}
-		if page[0] != 1 {
-			t.Errorf("first = %d, want 1", page[0])
-		}
-	})
-
-	t.Run("limit exceeds max", func(t *testing.T) {
-		big := make([]int, 2000)
-		page, _, _ := applyPagination(big, 1500, 0)
-		if len(page) != 1000 {
-			t.Errorf("len = %d, want 1000 (max)", len(page))
-		}
-	})
-}
diff --git a/internal/tools/query.go b/internal/tools/query.go
deleted file mode 100644
index 8e72117..0000000
--- a/internal/tools/query.go
+++ /dev/null
@@ -1,178 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/analysis"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/query"
-)
-
-// mapRollupDiffEntries converts DiffEntry rows that already carry canonical
-// error codes (as produced by DiffRollups) into the tool-layer response shape.
-// No id→code translation is needed.
-func mapRollupDiffEntries(entries []analysis.DiffEntry) []diffEntry {
-	out := make([]diffEntry, 0, len(entries))
-	for _, e := range entries {
-		out = append(out, diffEntry{
-			ErrorCode: e.ErrorCode,
-			Before:    e.Before,
-			After:     e.After,
-			Delta:     e.Delta,
-		})
-	}
-	return out
-}
-
-type queryInput struct {
-	Expr   string `json:"expr"`
-	Window string `json:"window"`
-}
-
-type queryOutput struct {
-	SchemaVersion   string `json:"schema_version"`
-	MatchedRequests int    `json:"matched_requests"`
-}
-
-func handleGraphQuery(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input queryInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.Expr == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "expr required"}
-	}
-	if input.Window == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "window required"}
-	}
-
-	d, err := time.ParseDuration(input.Window)
-	if err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid window: %v", err)}
-	}
-	pred, err := query.Parse(input.Expr)
-	if err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("query parse error: %v", err)}
-	}
-
-	end := time.Now()
-	start := end.Add(-d)
-	matched := 0
-
-	store.ForEachRequestFact(start, end, func(f graphstore.RequestFacts) {
-		if pred.Eval(f) {
-			matched++
-		}
-	})
-
-	return queryOutput{SchemaVersion: "1.0", MatchedRequests: matched}, nil
-}
-
-type diffInput struct {
-	Current  string `json:"current"`
-	Baseline string `json:"baseline"`
-	Offset   string `json:"offset"`
-	Anchor   string `json:"anchor"`
-}
-
-type diffEntry struct {
-	ErrorCode string `json:"error_code"`
-	Before    int    `json:"before,omitempty"`
-	After     int    `json:"after,omitempty"`
-	Delta     int    `json:"delta"`
-}
-
-type diffOutput struct {
-	SchemaVersion string      `json:"schema_version"`
-	New           []diffEntry `json:"new,omitempty"`
-	Removed       []diffEntry `json:"removed,omitempty"`
-	Increased     []diffEntry `json:"increased,omitempty"`
-	Decreased     []diffEntry `json:"decreased,omitempty"`
-
-	TotalRequestsBefore int   `json:"total_requests_before"`
-	TotalRequestsAfter  int   `json:"total_requests_after"`
-	TotalFailuresBefore int   `json:"total_failures_before"`
-	TotalFailuresAfter  int   `json:"total_failures_after"`
-	LatencyP50Before    int64 `json:"latency_p50_before"`
-	LatencyP50After     int64 `json:"latency_p50_after"`
-	LatencyP95Before    int64 `json:"latency_p95_before"`
-	LatencyP95After     int64 `json:"latency_p95_after"`
-	LatencyP99Before    int64 `json:"latency_p99_before"`
-	LatencyP99After     int64 `json:"latency_p99_after"`
-}
-
-func handleCompareWindows(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input diffInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.Current == "" || input.Baseline == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "current and baseline required"}
-	}
-	if input.Anchor == "" && input.Offset == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "either offset or anchor required"}
-	}
-	if input.Anchor != "" && input.Offset != "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "offset and anchor are mutually exclusive"}
-	}
-
-	currDur, err := time.ParseDuration(input.Current)
-	if err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid current: %v", err)}
-	}
-	baseDur, err := time.ParseDuration(input.Baseline)
-	if err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid baseline: %v", err)}
-	}
-
-	var currStart, currEnd, baseStart, baseEnd time.Time
-	if input.Anchor != "" {
-		anchor, parseErr := time.Parse(time.RFC3339, input.Anchor)
-		if parseErr != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid anchor: %v", parseErr)}
-		}
-		currStart = anchor
-		currEnd = anchor.Add(currDur)
-		baseEnd = anchor
-		baseStart = anchor.Add(-baseDur)
-	} else {
-		offDur, parseErr := time.ParseDuration(input.Offset)
-		if parseErr != nil {
-			return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid offset: %v", parseErr)}
-		}
-		now := time.Now()
-		currEnd = now
-		currStart = currEnd.Add(-currDur)
-		baseEnd = currEnd.Add(-offDur)
-		baseStart = baseEnd.Add(-baseDur)
-	}
-
-	g := store.Snapshot()
-	ts := traceStoreFrom(store)
-	curr := analysis.RollupWindow(g, store, ts, currStart, currEnd)
-	base := analysis.RollupWindow(g, store, ts, baseStart, baseEnd)
-	diff := analysis.DiffRollups(base, curr)
-
-	return diffOutput{
-		SchemaVersion:       "1.0",
-		New:                 mapRollupDiffEntries(diff.New),
-		Removed:             mapRollupDiffEntries(diff.Removed),
-		Increased:           mapRollupDiffEntries(diff.Increased),
-		Decreased:           mapRollupDiffEntries(diff.Decreased),
-		TotalRequestsBefore: diff.TotalRequestsBefore,
-		TotalRequestsAfter:  diff.TotalRequestsAfter,
-		TotalFailuresBefore: diff.TotalFailuresBefore,
-		TotalFailuresAfter:  diff.TotalFailuresAfter,
-		LatencyP50Before:    diff.LatencyP50Before,
-		LatencyP50After:     diff.LatencyP50After,
-		LatencyP95Before:    diff.LatencyP95Before,
-		LatencyP95After:     diff.LatencyP95After,
-		LatencyP99Before:    diff.LatencyP99Before,
-		LatencyP99After:     diff.LatencyP99After,
-	}, nil
-}
diff --git a/internal/tools/registry.go b/internal/tools/registry.go
index ed92dbd..772f3f1 100644
--- a/internal/tools/registry.go
+++ b/internal/tools/registry.go
@@ -8,7 +8,7 @@ import (
 	"strings"
 )
 
-type ToolHandler func(ctx context.Context, store Store, params json.RawMessage) (any, error)
+type ToolHandler func(ctx context.Context, params json.RawMessage) (any, error)
 
 type Tool struct {
 	Name         string
@@ -138,7 +138,7 @@ func (r *Registry) List() []Tool {
 	return out
 }
 
-func (r *Registry) Call(ctx context.Context, store Store, name string, params json.RawMessage) (result any, err error) {
+func (r *Registry) Call(ctx context.Context, name string, params json.RawMessage) (result any, err error) {
 	t, ok := r.tools[name]
 	if !ok {
 		return nil, &ToolError{Code: CodeNotFound, Message: fmt.Sprintf("unknown tool: %s", name), Retryable: false}
@@ -151,7 +151,7 @@ func (r *Registry) Call(ctx context.Context, store Store, name string, params js
 		}
 	}()
 
-	result, err = t.Handler(ctx, store, params)
+	result, err = t.Handler(ctx, params)
 	if err != nil {
 		if _, ok := AsToolError(err); ok {
 			return nil, err
diff --git a/internal/tools/registry_test.go b/internal/tools/registry_test.go
index 970191a..c5d935e 100644
--- a/internal/tools/registry_test.go
+++ b/internal/tools/registry_test.go
@@ -15,12 +15,12 @@ func TestCall_PanicRecovery(t *testing.T) {
 	reg := NewRegistry()
 	reg.Register(Tool{
 		Name: "panicker",
-		Handler: func(ctx context.Context, store Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			panic("kaboom")
 		},
 	})
 
-	_, err := reg.Call(context.Background(), nil, "panicker", nil)
+	_, err := reg.Call(context.Background(), "panicker", nil)
 	te, ok := AsToolError(err)
 	if !ok {
 		t.Fatalf("expected ToolError, got %T: %v", err, err)
@@ -37,12 +37,12 @@ func TestCall_RawErrorWrapping(t *testing.T) {
 	reg := NewRegistry()
 	reg.Register(Tool{
 		Name: "raw",
-		Handler: func(ctx context.Context, store Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			return nil, errors.New("plain error")
 		},
 	})
 
-	_, err := reg.Call(context.Background(), nil, "raw", nil)
+	_, err := reg.Call(context.Background(), "raw", nil)
 	te, ok := AsToolError(err)
 	if !ok {
 		t.Fatalf("expected ToolError, got %T: %v", err, err)
@@ -56,12 +56,12 @@ func TestCall_ToolErrorPassthrough(t *testing.T) {
 	reg := NewRegistry()
 	reg.Register(Tool{
 		Name: "typed",
-		Handler: func(ctx context.Context, store Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			return nil, &ToolError{Code: CodeInvalidParams, Message: "bad"}
 		},
 	})
 
-	_, err := reg.Call(context.Background(), nil, "typed", nil)
+	_, err := reg.Call(context.Background(), "typed", nil)
 	te, ok := AsToolError(err)
 	if !ok {
 		t.Fatalf("expected ToolError, got %T: %v", err, err)
@@ -73,7 +73,7 @@ func TestCall_ToolErrorPassthrough(t *testing.T) {
 
 func TestCall_UnknownTool(t *testing.T) {
 	reg := NewRegistry()
-	_, err := reg.Call(context.Background(), nil, "nope", nil)
+	_, err := reg.Call(context.Background(), "nope", nil)
 	te, ok := AsToolError(err)
 	if !ok {
 		t.Fatalf("expected ToolError, got %T: %v", err, err)
@@ -87,12 +87,12 @@ func TestCall_Success(t *testing.T) {
 	reg := NewRegistry()
 	reg.Register(Tool{
 		Name: "ok",
-		Handler: func(ctx context.Context, store Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			return map[string]int{"x": 1}, nil
 		},
 	})
 
-	result, err := reg.Call(context.Background(), nil, "ok", nil)
+	result, err := reg.Call(context.Background(), "ok", nil)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
diff --git a/internal/tools/report.go b/internal/tools/report.go
index 57206c7..b89193b 100644
--- a/internal/tools/report.go
+++ b/internal/tools/report.go
@@ -42,7 +42,7 @@ func RegisterTriageReportTool(reg *Registry, engine *triage.Engine) error {
 			`{"incident_id":"inc_01HX...","format":"markdown","snapshot":true}`,
 			`{"incident_id":"inc_01HX...","format":"slack"}`,
 		},
-		Handler: func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			var p struct {
 				IncidentID string `json:"incident_id"`
 				Format     string `json:"format"`
diff --git a/internal/tools/report_test.go b/internal/tools/report_test.go
index 29df20f..187e4f9 100644
--- a/internal/tools/report_test.go
+++ b/internal/tools/report_test.go
@@ -15,7 +15,7 @@ func TestRenderTriageReportToolReturnsRenderedReport(t *testing.T) {
 	if err := tools.RegisterTriageReportTool(reg, eng); err != nil {
 		t.Fatalf("register: %v", err)
 	}
-	out, err := reg.Call(context.Background(), nil, "render_triage_report", json.RawMessage(`{"incident_id":"inc_abc","format":"markdown"}`))
+	out, err := reg.Call(context.Background(), "render_triage_report", json.RawMessage(`{"incident_id":"inc_abc","format":"markdown"}`))
 	if err != nil {
 		t.Fatalf("call: %v", err)
 	}
diff --git a/internal/tools/schema_test.go b/internal/tools/schema_test.go
deleted file mode 100644
index 7a2f319..0000000
--- a/internal/tools/schema_test.go
+++ /dev/null
@@ -1,392 +0,0 @@
-package tools
-
-import (
-	"encoding/json"
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-)
-
-func TestOutputSchemas_ValidJSON(t *testing.T) {
-	schemas := map[string]string{
-		"graph_stats":      graphStatsOutputSchema,
-		"explain_request":  explainRequestOutputSchema,
-		"trace_graph":      traceGraphOutputSchema,
-		"trace_summary":    traceSummaryOutputSchema,
-		"graph_failures":   failuresOutputSchema,
-		"failure_patterns": patternsOutputSchema,
-		"blast_radius":     blastOutputSchema,
-		"failure_chain":    chainOutputSchema,
-		"graph_query":      queryOutputSchema,
-		"compare_windows":  diffOutputSchema,
-		"graph_insights":   insightsOutputSchema,
-	}
-
-	for name, raw := range schemas {
-		t.Run(name, func(t *testing.T) {
-			var schema map[string]any
-			if err := json.Unmarshal([]byte(raw), &schema); err != nil {
-				t.Fatalf("invalid JSON: %v", err)
-			}
-
-			// Must have additionalProperties: false
-			if ap, ok := schema["additionalProperties"]; !ok || ap != false {
-				t.Error("missing or non-false additionalProperties")
-			}
-
-			// Must have schema_version in properties
-			props, ok := schema["properties"].(map[string]any)
-			if !ok {
-				t.Fatal("missing properties")
-			}
-			if _, ok := props["schema_version"]; !ok {
-				t.Error("missing schema_version property")
-			}
-
-			// schema_version must be required
-			required, ok := schema["required"].([]any)
-			if !ok {
-				t.Fatal("missing required array")
-			}
-			found := false
-			for _, r := range required {
-				if r == "schema_version" {
-					found = true
-					break
-				}
-			}
-			if !found {
-				t.Error("schema_version not in required list")
-			}
-		})
-	}
-}
-
-func TestAllToolsHaveVersion(t *testing.T) {
-	reg := NewRegistry()
-	if err := RegisterGraphTools(reg); err != nil {
-		t.Fatal(err)
-	}
-	for _, tool := range reg.List() {
-		if tool.Version == "" {
-			t.Errorf("tool %q has no version", tool.Name)
-		}
-	}
-}
-
-// TestHandlerOutputMatchesSchema calls each handler with fixture data and
-// validates that output keys, required fields, and types match the declared schema.
-func TestHandlerOutputMatchesSchema(t *testing.T) {
-	traceID := "0123456789abcdef0123456789abcdef"
-	now := time.Now()
-	reqID := core.ID("request", traceID)
-	spanID := core.ID("span", traceID, "0123456789abcdef")
-	userID := core.ID("user", "user-123")
-	svcID := core.ID("service", "test-service")
-	errID := core.ID("error", "TEST_ERR")
-
-	store := graphstore.NewStore()
-	g := core.New()
-
-	g.AddNode(core.Node{
-		ID: reqID, Type: core.NodeRequest,
-		FirstSeen: now, LastSeen: now,
-		Attr: map[string]any{
-			"trace_id":    traceID,
-			"success":     false,
-			"status_code": 500,
-			"latency_ms":  int64(42),
-			"event_name":  "test-service.error",
-			"flow":        "checkout",
-			"is_root":     true,
-		},
-	})
-	g.AddNode(core.Node{
-		ID: spanID, Type: core.NodeSpan,
-		FirstSeen: now, LastSeen: now,
-		Attr: map[string]any{
-			"trace_id": traceID,
-			"span_id":  "0123456789abcdef",
-			"service":  "test-service",
-		},
-	})
-	g.AddNode(core.Node{
-		ID: userID, Type: core.NodeUser,
-		FirstSeen: now, LastSeen: now,
-		Attr: map[string]any{"tier": "standard"},
-	})
-	g.AddNode(core.Node{
-		ID: svcID, Type: core.NodeService,
-		FirstSeen: now, LastSeen: now,
-		Attr: map[string]any{"name": "test-service"},
-	})
-	g.AddNode(core.Node{
-		ID: errID, Type: core.NodeError,
-		FirstSeen: now, LastSeen: now,
-		Attr: map[string]any{"code": "TEST_ERR", "message": "test error"},
-	})
-
-	// Edges
-	g.AddEdge(core.Edge{From: reqID, To: userID, Type: core.EdgeRequestBy})
-	g.AddEdge(core.Edge{From: reqID, To: svcID, Type: core.EdgeHandledBy})
-	g.AddEdge(core.Edge{From: reqID, To: errID, Type: core.EdgeFailedWith})
-	g.AddEdge(core.Edge{From: reqID, To: spanID, Type: core.EdgeRequestHasSpan})
-
-	store.Merge(g)
-
-	reg := NewRegistry()
-	if err := RegisterGraphTools(reg); err != nil {
-		t.Fatal(err)
-	}
-
-	// Tool name -> params to call with
-	cases := map[string]json.RawMessage{
-		"graph_stats":      json.RawMessage(`{}`),
-		"explain_request":  json.RawMessage(fmt.Sprintf(`{"trace_id":%q}`, traceID)),
-		"trace_graph":      json.RawMessage(fmt.Sprintf(`{"trace_id":%q}`, traceID)),
-		"trace_summary":    json.RawMessage(fmt.Sprintf(`{"trace_id":%q}`, traceID)),
-		"graph_failures":   json.RawMessage(`{}`),
-		"failure_patterns": json.RawMessage(`{}`),
-		"blast_radius":     json.RawMessage(`{"error_code":"TEST_ERR"}`),
-		"failure_chain":    json.RawMessage(fmt.Sprintf(`{"request_id":%q}`, reqID)),
-		"graph_query":      json.RawMessage(`{"expr":"error_code=TEST_ERR","window":"1h"}`),
-		"compare_windows":  json.RawMessage(`{"current":"1h","baseline":"1h","offset":"2h"}`),
-		"graph_insights":   json.RawMessage(`{}`),
-	}
-
-	for _, tool := range reg.List() {
-		params, ok := cases[tool.Name]
-		if !ok {
-			t.Errorf("no test case for tool %q", tool.Name)
-			continue
-		}
-		t.Run(tool.Name, func(t *testing.T) {
-			result, err := reg.Call(t.Context(), store, tool.Name, params)
-			if err != nil {
-				t.Fatalf("handler returned error: %v", err)
-			}
-
-			// Marshal result to JSON and back to map
-			b, err := json.Marshal(result)
-			if err != nil {
-				t.Fatalf("marshal result: %v", err)
-			}
-			var output map[string]any
-			if err := json.Unmarshal(b, &output); err != nil {
-				t.Fatalf("unmarshal result: %v", err)
-			}
-
-			// Parse schema
-			var schema map[string]any
-			if err := json.Unmarshal(tool.OutputSchema, &schema); err != nil {
-				t.Fatalf("parse output schema: %v", err)
-			}
-
-			// Validate
-			validateObject(t, "", output, schema, schema)
-		})
-	}
-}
-
-// resolveRef resolves a $ref like "#/$defs/span_node" against the root schema.
-func resolveRef(schema map[string]any, root map[string]any) map[string]any {
-	ref, ok := schema["$ref"].(string)
-	if !ok || root == nil {
-		return schema
-	}
-	// Only support "#/$defs/" format
-	const prefix = "#/$defs/"
-	if len(ref) <= len(prefix) || ref[:len(prefix)] != prefix {
-		return schema
-	}
-	name := ref[len(prefix):]
-	defs, _ := root["$defs"].(map[string]any)
-	if resolved, ok := defs[name].(map[string]any); ok {
-		return resolved
-	}
-	return schema
-}
-
-// validateObject checks that output matches the schema at the given path.
-func validateObject(t *testing.T, path string, output map[string]any, schema map[string]any, root map[string]any) {
-	t.Helper()
-	props, _ := schema["properties"].(map[string]any)
-
-	// Check additionalProperties: false
-	if ap, ok := schema["additionalProperties"]; ok && ap == false {
-		for key := range output {
-			if _, defined := props[key]; !defined {
-				t.Errorf("%s: unexpected key %q not in schema properties", path, key)
-			}
-		}
-	}
-
-	// Check required fields
-	if required, ok := schema["required"].([]any); ok {
-		for _, r := range required {
-			key, _ := r.(string)
-			if _, exists := output[key]; !exists {
-				t.Errorf("%s: required field %q missing from output", path, key)
-			}
-		}
-	}
-
-	// Check schema_version value
-	if path == "" {
-		if sv, ok := output["schema_version"]; ok {
-			if sv != "1.0" {
-				t.Errorf("schema_version = %v, want \"1.0\"", sv)
-			}
-		}
-	}
-
-	// Type-check each field present in output
-	for key, val := range output {
-		propSchema, ok := props[key]
-		if !ok {
-			continue // already reported above if additionalProperties:false
-		}
-		propMap, ok := propSchema.(map[string]any)
-		if !ok {
-			continue
-		}
-		fieldPath := key
-		if path != "" {
-			fieldPath = path + "." + key
-		}
-		validateType(t, fieldPath, val, resolveRef(propMap, root), root)
-	}
-}
-
-// validateType checks that val matches the declared schema type.
-func validateType(t *testing.T, path string, val any, schema map[string]any, root map[string]any) {
-	t.Helper()
-	schemaType := schema["type"]
-	if schemaType == nil {
-		return
-	}
-
-	switch st := schemaType.(type) {
-	case string:
-		checkSingleType(t, path, val, st, schema, root)
-	case []any:
-		// Nullable: e.g. ["string", "null"]
-		if val == nil {
-			// null is ok if "null" is in the type list
-			for _, typ := range st {
-				if typ == "null" {
-					return
-				}
-			}
-			t.Errorf("%s: got null, but type %v does not include null", path, st)
-			return
-		}
-		ok := false
-		for _, typ := range st {
-			s, _ := typ.(string)
-			if s == "null" {
-				continue
-			}
-			if typMatches(val, s) {
-				ok = true
-				break
-			}
-		}
-		if !ok {
-			t.Errorf("%s: value %T does not match any of %v", path, val, st)
-		}
-	}
-}
-
-func checkSingleType(t *testing.T, path string, val any, typ string, schema map[string]any, root map[string]any) {
-	t.Helper()
-	if val == nil {
-		t.Errorf("%s: got null for non-nullable type %q", path, typ)
-		return
-	}
-	if !typMatches(val, typ) {
-		t.Errorf("%s: expected type %q, got %T", path, typ, val)
-		return
-	}
-	// Recurse into arrays
-	if typ == "array" {
-		arr, ok := val.([]any)
-		if !ok {
-			return
-		}
-		items, _ := schema["items"].(map[string]any)
-		if items == nil {
-			return
-		}
-		items = resolveRef(items, root)
-		for i, elem := range arr {
-			elemPath := fmt.Sprintf("%s[%d]", path, i)
-			if itemType, _ := items["type"].(string); itemType == "object" {
-				m, ok := elem.(map[string]any)
-				if !ok {
-					t.Errorf("%s: expected object, got %T", elemPath, elem)
-					continue
-				}
-				validateObject(t, elemPath, m, items, root)
-			} else {
-				validateType(t, elemPath, elem, items, root)
-			}
-		}
-	}
-}
-
-func typMatches(val any, typ string) bool {
-	switch typ {
-	case "string":
-		_, ok := val.(string)
-		return ok
-	case "integer":
-		f, ok := val.(float64)
-		return ok && f == float64(int64(f))
-	case "number":
-		_, ok := val.(float64)
-		return ok
-	case "boolean":
-		_, ok := val.(bool)
-		return ok
-	case "array":
-		_, ok := val.([]any)
-		return ok
-	case "object":
-		_, ok := val.(map[string]any)
-		return ok
-	}
-	return false
-}
-
-func TestTraceGraphOutputSchema_RecursiveDefs(t *testing.T) {
-	var schema map[string]any
-	if err := json.Unmarshal([]byte(traceGraphOutputSchema), &schema); err != nil {
-		t.Fatalf("invalid JSON: %v", err)
-	}
-	defs, ok := schema["$defs"].(map[string]any)
-	if !ok {
-		t.Fatal("expected $defs in schema")
-	}
-	spanNode, ok := defs["span_node"].(map[string]any)
-	if !ok {
-		t.Fatal("expected span_node in $defs")
-	}
-	// Verify children self-references
-	props := spanNode["properties"].(map[string]any)
-	children := props["children"].(map[string]any)
-	items := children["items"].(map[string]any)
-	if ref, ok := items["$ref"]; !ok || ref != "#/$defs/span_node" {
-		t.Errorf("children.items.$ref = %v, want #/$defs/span_node", ref)
-	}
-	// Verify roots uses $ref
-	rootProps := schema["properties"].(map[string]any)
-	roots := rootProps["roots"].(map[string]any)
-	rootItems := roots["items"].(map[string]any)
-	if ref, ok := rootItems["$ref"]; !ok || ref != "#/$defs/span_node" {
-		t.Errorf("roots.items.$ref = %v, want #/$defs/span_node", ref)
-	}
-}
diff --git a/internal/tools/schemas.go b/internal/tools/schemas.go
deleted file mode 100644
index 2897a15..0000000
--- a/internal/tools/schemas.go
+++ /dev/null
@@ -1,375 +0,0 @@
-package tools
-
-const graphStatsInputSchema = `{
-  "type": "object",
-  "properties": {},
-  "additionalProperties": false
-}`
-
-const graphStatsOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "nodes": { "type": "integer" },
-    "edges": { "type": "integer" },
-    "requests": { "type": "integer" },
-    "users": { "type": "integer" },
-    "services": { "type": "integer" },
-    "feature_flags": { "type": "integer" },
-    "failures": { "type": "integer" }
-  },
-  "required": ["schema_version", "nodes", "edges", "requests", "users", "services", "feature_flags", "failures"],
-  "additionalProperties": false
-}`
-
-const explainRequestInputSchema = `{
-  "type": "object",
-  "properties": {
-    "request_id": { "type": "string" },
-    "trace_id": { "type": "string" }
-  },
-  "additionalProperties": false
-}`
-
-const explainRequestOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "request_id": { "type": "string" },
-    "latency_ms": { "type": ["number", "null"] },
-    "flow": { "type": ["string", "null"] },
-    "user_id": { "type": "string" },
-    "user_tier": { "type": ["string", "null"] },
-    "feature_flags": { "type": "array", "items": { "type": "string" } },
-    "span_id": { "type": "string" },
-    "span_service": { "type": ["string", "null"] },
-    "span_depth": { "type": "string" },
-    "service": { "type": ["string", "null"] },
-    "error_code": { "type": ["string", "null"] },
-    "error_msg": { "type": ["string", "null"] },
-    "span_chain": { "type": "array", "items": { "type": "object", "properties": { "span_id": { "type": "string" }, "service": { "type": "string" }, "error_code": { "type": "string" }, "latency_ms": { "type": ["number", "null"] }, "depth": { "type": "integer" } }, "required": ["span_id", "service", "depth"], "additionalProperties": false } }
-  },
-  "required": ["schema_version", "request_id"],
-  "additionalProperties": false
-}`
-
-const traceGraphInputSchema = `{
-  "type": "object",
-  "properties": {
-    "trace_id": { "type": "string" }
-  },
-  "required": ["trace_id"],
-  "additionalProperties": false
-}`
-
-const traceGraphOutputSchema = `{
-  "type": "object",
-  "$defs": {
-    "span_node": {
-      "type": "object",
-      "properties": {
-        "span_id": { "type": "string" },
-        "service": { "type": ["string", "null"] },
-        "children": { "type": "array", "items": { "$ref": "#/$defs/span_node" } }
-      },
-      "additionalProperties": false
-    }
-  },
-  "properties": {
-    "schema_version": { "type": "string" },
-    "trace_id": { "type": "string" },
-    "roots": { "type": "array", "items": { "$ref": "#/$defs/span_node" } }
-  },
-  "required": ["schema_version", "trace_id", "roots"],
-  "additionalProperties": false
-}`
-
-const traceSummaryInputSchema = `{
-  "type": "object",
-  "properties": {
-    "trace_id": { "type": "string" }
-  },
-  "required": ["trace_id"],
-  "additionalProperties": false
-}`
-
-const traceSummaryOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "trace_id": { "type": "string" },
-    "request_id": { "type": "string" },
-    "event_name": { "type": "string" },
-    "flow": { "type": "string" },
-    "latency_ms": { "type": ["number", "null"] },
-    "root_span_ids": { "type": "array", "items": { "type": "string" } },
-    "paths": {
-      "type": "array",
-      "items": { "type": "array", "items": { "type": "string" } }
-    }
-  },
-  "required": ["schema_version", "trace_id", "request_id"],
-  "additionalProperties": false
-}`
-
-const failuresInputSchema = `{
-  "type": "object",
-  "properties": {
-    "tier": { "type": "string" },
-    "limit": { "type": "integer", "description": "Max results (default 100, max 1000)" },
-    "offset": { "type": "integer", "description": "Skip N results (default 0)" }
-  },
-  "additionalProperties": false
-}`
-
-const failuresOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "failures": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "request_id": { "type": "string" },
-          "trace_id": { "type": "string" },
-          "latency_ms": { "type": ["number", "null"] },
-          "tier": { "type": "string" },
-          "error_code": { "type": "string" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "total_count": { "type": "integer" },
-    "has_more": { "type": "boolean" }
-  },
-  "required": ["schema_version", "failures", "total_count", "has_more"],
-  "additionalProperties": false
-}`
-
-const patternsInputSchema = `{
-  "type": "object",
-  "properties": {
-    "window": { "type": "string" },
-    "limit": { "type": "integer", "description": "Max results (default 100, max 1000)" },
-    "offset": { "type": "integer", "description": "Skip N results (default 0)" }
-  },
-  "additionalProperties": false
-}`
-
-const patternsOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "patterns": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "error_code": { "type": "string" },
-          "flow": { "type": "string" },
-          "user_tier": { "type": "string" },
-          "feature_flags": { "type": "array", "items": { "type": "string" } },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "total_count": { "type": "integer" },
-    "has_more": { "type": "boolean" }
-  },
-  "required": ["schema_version", "patterns", "total_count", "has_more"],
-  "additionalProperties": false
-}`
-
-const blastInputSchema = `{
-  "type": "object",
-  "properties": {
-    "error_code": { "type": "string" },
-    "include_services": { "type": "boolean" },
-    "top_users": { "type": "integer" },
-    "by_tier": { "type": "boolean" },
-    "limit": { "type": "integer", "description": "Max results (default 100, max 1000)" },
-    "offset": { "type": "integer", "description": "Skip N results (default 0)" }
-  },
-  "required": ["error_code"],
-  "additionalProperties": false
-}`
-
-const blastOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "error_code": { "type": "string" },
-    "affected_requests": { "type": "integer" },
-    "affected_users": { "type": "integer" },
-    "vip_users": { "type": "integer" },
-    "severity_score": { "type": "number" },
-    "services": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "service": { "type": "string" },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "tiers": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "tier": { "type": "string" },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "top_users": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "user_id": { "type": "string" },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "feature_flags": { "type": "array", "items": { "type": "string" } },
-    "total_count": { "type": "integer" },
-    "has_more": { "type": "boolean" }
-  },
-  "required": ["schema_version", "error_code", "affected_requests", "affected_users", "vip_users", "severity_score", "total_count", "has_more"],
-  "additionalProperties": false
-}`
-
-const chainInputSchema = `{
-  "type": "object",
-  "properties": {
-    "request_id": { "type": "string" }
-  },
-  "required": ["request_id"],
-  "additionalProperties": false
-}`
-
-const chainOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "request_id": { "type": "string" },
-    "services": { "type": "array", "items": { "type": "string" } }
-  },
-  "required": ["schema_version", "request_id", "services"],
-  "additionalProperties": false
-}`
-
-const queryInputSchema = `{
-  "type": "object",
-  "properties": {
-    "expr": { "type": "string" },
-    "window": { "type": "string" }
-  },
-  "required": ["expr", "window"],
-  "additionalProperties": false
-}`
-
-const queryOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "matched_requests": { "type": "integer" }
-  },
-  "required": ["schema_version", "matched_requests"],
-  "additionalProperties": false
-}`
-
-const diffInputSchema = `{
-  "type": "object",
-  "properties": {
-    "current": { "type": "string" },
-    "baseline": { "type": "string" },
-    "offset": { "type": "string" },
-    "anchor": { "type": "string", "description": "ISO 8601 timestamp to anchor windows around (mutually exclusive with offset)" }
-  },
-  "required": ["current", "baseline"],
-  "additionalProperties": false
-}`
-
-const diffEntryItemSchema = `{
-  "type": "object",
-  "properties": {
-    "error_code": { "type": "string" },
-    "before": { "type": "integer" },
-    "after": { "type": "integer" },
-    "delta": { "type": "integer" }
-  },
-  "additionalProperties": false
-}`
-
-const diffOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "new": { "type": "array", "items": ` + diffEntryItemSchema + ` },
-    "removed": { "type": "array", "items": ` + diffEntryItemSchema + ` },
-    "increased": { "type": "array", "items": ` + diffEntryItemSchema + ` },
-    "decreased": { "type": "array", "items": ` + diffEntryItemSchema + ` },
-    "total_requests_before": { "type": "integer" },
-    "total_requests_after": { "type": "integer" },
-    "total_failures_before": { "type": "integer" },
-    "total_failures_after": { "type": "integer" },
-    "latency_p50_before": { "type": "integer" },
-    "latency_p50_after": { "type": "integer" },
-    "latency_p95_before": { "type": "integer" },
-    "latency_p95_after": { "type": "integer" },
-    "latency_p99_before": { "type": "integer" },
-    "latency_p99_after": { "type": "integer" }
-  },
-  "required": ["schema_version"],
-  "additionalProperties": false
-}`
-
-const insightsInputSchema = `{
-  "type": "object",
-  "properties": {
-    "window": { "type": "string" },
-    "top_errors": { "type": "integer" },
-    "top_services": { "type": "integer" }
-  },
-  "additionalProperties": false
-}`
-
-const insightsOutputSchema = `{
-  "type": "object",
-  "properties": {
-    "schema_version": { "type": "string" },
-    "total_failures": { "type": "integer" },
-    "top_errors": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "error_code": { "type": "string" },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    },
-    "top_services": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "service": { "type": "string" },
-          "count": { "type": "integer" }
-        },
-        "additionalProperties": false
-      }
-    }
-  },
-  "required": ["schema_version", "total_failures"],
-  "additionalProperties": false
-}`
diff --git a/internal/tools/stats.go b/internal/tools/stats.go
deleted file mode 100644
index d39480c..0000000
--- a/internal/tools/stats.go
+++ /dev/null
@@ -1,46 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-)
-
-type graphStatsOutput struct {
-	SchemaVersion string `json:"schema_version"`
-	Nodes         int    `json:"nodes"`
-	Edges         int    `json:"edges"`
-	Requests      int    `json:"requests"`
-	Users         int    `json:"users"`
-	Services      int    `json:"services"`
-	FeatureFlags  int    `json:"feature_flags"`
-	Failures      int    `json:"failures"`
-}
-
-func handleGraphStats(ctx context.Context, store Store, _ json.RawMessage) (any, error) {
-	_ = ctx
-	g := store.Snapshot()
-	out := graphStatsOutput{
-		SchemaVersion: "1.0",
-		Nodes:         len(g.Nodes),
-		Edges:         len(g.Edges),
-	}
-
-	for _, n := range g.Nodes {
-		switch n.Type {
-		case core.NodeRequest:
-			out.Requests++
-		case core.NodeUser:
-			out.Users++
-		case core.NodeService:
-			out.Services++
-		case core.NodeFlag:
-			out.FeatureFlags++
-		case core.NodeError:
-			out.Failures++
-		}
-	}
-
-	return out, nil
-}
diff --git a/internal/tools/store.go b/internal/tools/store.go
deleted file mode 100644
index 9534ceb..0000000
--- a/internal/tools/store.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package tools
-
-import (
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-type Store interface {
-	Snapshot() *core.Graph
-	SummarizeWindow(start, end time.Time) graphstore.WindowSummary
-	ForEachRequestFact(start, end time.Time, fn func(graphstore.RequestFacts))
-	ErrorIndex(errorCode string) ([]string, bool)
-	TraceStore() *tracestore.Store
-}
diff --git a/internal/tools/trace.go b/internal/tools/trace.go
deleted file mode 100644
index 8985d7d..0000000
--- a/internal/tools/trace.go
+++ /dev/null
@@ -1,302 +0,0 @@
-package tools
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-type traceGraphInput struct {
-	TraceID string `json:"trace_id"`
-}
-
-type traceSpan struct {
-	SpanID   string      `json:"span_id,omitempty"`
-	Service  any         `json:"service,omitempty"`
-	Children []traceSpan `json:"children,omitempty"`
-}
-
-type traceGraphOutput struct {
-	SchemaVersion string      `json:"schema_version"`
-	TraceID       string      `json:"trace_id"`
-	Roots         []traceSpan `json:"roots"`
-}
-
-func handleTraceGraph(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input traceGraphInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.TraceID == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "trace_id required"}
-	}
-
-	g := store.Snapshot()
-	reqID := core.ID("request", input.TraceID)
-	var roots []traceSpan
-
-	if ts := traceStoreFrom(store); ts != nil {
-		if rec, ok := ts.Get(input.TraceID); ok {
-			roots = traceTreeToSpans(tracestore.BuildTree(rec.Spans))
-		}
-	}
-	if len(roots) == 0 {
-		for _, spanID := range rootSpanIDsFromGraph(g, reqID) {
-			roots = append(roots, buildTraceSpanFromGraph(g, spanID, map[string]bool{}))
-		}
-	}
-
-	return traceGraphOutput{
-		SchemaVersion: "1.0",
-		TraceID:       input.TraceID,
-		Roots:         roots,
-	}, nil
-}
-
-func buildTraceSpanFromGraph(g *core.Graph, spanID string, visited map[string]bool) traceSpan {
-	if visited[spanID] {
-		return traceSpan{}
-	}
-	visited[spanID] = true
-
-	n, ok := g.Nodes[spanID]
-	if !ok {
-		return traceSpan{}
-	}
-
-	out := traceSpan{
-		Service: n.Attr["service"],
-	}
-	if span, ok := n.Attr["span_id"]; ok && span != nil {
-		out.SpanID = fmt.Sprintf("%v", span)
-	}
-
-	for _, e := range g.InEdges[spanID] {
-		if e.Type == core.EdgeSpanChildOf {
-			out.Children = append(out.Children, buildTraceSpanFromGraph(g, e.From, visited))
-		}
-	}
-
-	return out
-}
-
-type traceSummaryInput struct {
-	TraceID string `json:"trace_id"`
-}
-
-type traceSummaryOutput struct {
-	SchemaVersion   string     `json:"schema_version"`
-	TraceID         string     `json:"trace_id"`
-	RequestID       string     `json:"request_id"`
-	EventName       string     `json:"event_name,omitempty"`
-	Flow            string     `json:"flow,omitempty"`
-	LatencyMs       any        `json:"latency_ms,omitempty"`
-	ErrorCode       string     `json:"error_code,omitempty"`
-	ErrorPath       string     `json:"error_path,omitempty"`
-	ErrorReason     string     `json:"error_reason,omitempty"`
-	RetryOf         int        `json:"retry_of,omitempty"`
-	RetryPreviousID string     `json:"retry_previous_attempt_id,omitempty"`
-	ParentRequestID string     `json:"parent_request_id,omitempty"`
-	RootSpanIDs     []string   `json:"root_span_ids,omitempty"`
-	Paths           [][]string `json:"paths,omitempty"`
-}
-
-func handleTraceSummary(ctx context.Context, store Store, params json.RawMessage) (any, error) {
-	_ = ctx
-	var input traceSummaryInput
-	if err := json.Unmarshal(params, &input); err != nil {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: fmt.Sprintf("invalid params: %v", err)}
-	}
-	if input.TraceID == "" {
-		return nil, &ToolError{Code: CodeInvalidParams, Message: "trace_id required"}
-	}
-
-	g := store.Snapshot()
-	reqID := core.ID("request", input.TraceID)
-
-	out := traceSummaryOutput{
-		SchemaVersion: "1.0",
-		TraceID:       input.TraceID,
-		RequestID:     reqID,
-	}
-
-	if req, ok := g.Nodes[reqID]; ok {
-		if req.Attr != nil {
-			if name, ok := req.Attr["event_name"].(string); ok {
-				out.EventName = name
-			}
-			if flow, ok := req.Attr["flow"].(string); ok {
-				out.Flow = flow
-			}
-			out.LatencyMs = req.Attr["latency_ms"]
-			if code, ok := req.Attr["error_code"].(string); ok {
-				out.ErrorCode = code
-			}
-			if path, ok := req.Attr["error_path"].(string); ok {
-				out.ErrorPath = path
-			}
-			if reason, ok := req.Attr["error_reason"].(string); ok {
-				out.ErrorReason = reason
-			}
-			if parent, ok := req.Attr["parent_request_id"].(string); ok {
-				out.ParentRequestID = parent
-			}
-			if prev, ok := req.Attr["retry_previous_attempt_id"].(string); ok {
-				out.RetryPreviousID = prev
-			}
-			switch v := req.Attr["retry_of"].(type) {
-			case int:
-				out.RetryOf = v
-			case float64:
-				out.RetryOf = int(v)
-			}
-		}
-	}
-
-	if ts := traceStoreFrom(store); ts != nil {
-		if rec, ok := ts.Get(input.TraceID); ok {
-			out.RequestID = rec.RequestID
-			roots := tracestore.BuildTree(rec.Spans)
-			out.RootSpanIDs = traceRootIDs(roots)
-			out.Paths = traceTreePaths(roots)
-		}
-	}
-	if len(out.RootSpanIDs) == 0 {
-		rootSpans := rootSpanIDsFromGraph(g, reqID)
-		out.RootSpanIDs = rootSpans
-		out.Paths = spanPathsForRootsFromGraph(g, rootSpans)
-	}
-
-	if len(out.Paths) == 0 {
-		if chain := serviceChainForRequest(g, reqID); len(chain) > 0 {
-			out.Paths = [][]string{chain}
-		}
-	}
-
-	return out, nil
-}
-
-func traceTreeToSpans(nodes []*tracestore.TreeNode) []traceSpan {
-	if len(nodes) == 0 {
-		return nil
-	}
-	out := make([]traceSpan, 0, len(nodes))
-	for _, node := range nodes {
-		out = append(out, traceSpan{
-			SpanID:   node.Span.SpanID,
-			Service:  node.Span.Service,
-			Children: traceTreeToSpans(node.Children),
-		})
-	}
-	return out
-}
-
-func traceRootIDs(nodes []*tracestore.TreeNode) []string {
-	if len(nodes) == 0 {
-		return nil
-	}
-	ids := make([]string, 0, len(nodes))
-	for _, node := range nodes {
-		if node.Span.SpanID != "" {
-			ids = append(ids, node.Span.SpanID)
-		}
-	}
-	return ids
-}
-
-func traceTreePaths(nodes []*tracestore.TreeNode) [][]string {
-	var paths [][]string
-	var walk func(node *tracestore.TreeNode, prefix []string)
-	walk = func(node *tracestore.TreeNode, prefix []string) {
-		if node == nil {
-			return
-		}
-		service := node.Span.Service
-		if service == "" {
-			service = node.Span.SpanID
-		}
-		next := append(prefix, service)
-		if len(node.Children) == 0 {
-			paths = append(paths, next)
-			return
-		}
-		for _, child := range node.Children {
-			walk(child, next)
-		}
-	}
-	for _, root := range nodes {
-		walk(root, nil)
-	}
-	return paths
-}
-
-func rootSpanIDsFromGraph(g *core.Graph, reqID string) []string {
-	hasParent := map[string]bool{}
-	for _, e := range g.Edges {
-		if e.Type == core.EdgeSpanChildOf {
-			hasParent[e.From] = true
-		}
-	}
-	var roots []string
-	seen := map[string]bool{}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeRequestHasSpan {
-			continue
-		}
-		if seen[e.To] {
-			continue
-		}
-		seen[e.To] = true
-		if !hasParent[e.To] {
-			roots = append(roots, e.To)
-		}
-	}
-	return roots
-}
-
-func spanPathsForRootsFromGraph(g *core.Graph, roots []string) [][]string {
-	if len(roots) == 0 {
-		return nil
-	}
-	children := map[string][]string{}
-	for _, e := range g.Edges {
-		if e.Type == core.EdgeSpanChildOf {
-			children[e.To] = append(children[e.To], e.From)
-		}
-	}
-
-	var paths [][]string
-	for _, root := range roots {
-		dfsSpanPathsFromGraph(g, root, children, nil, &paths)
-	}
-	return paths
-}
-
-func dfsSpanPathsFromGraph(g *core.Graph, spanID string, children map[string][]string, prefix []string, out *[][]string) {
-	n, ok := g.Nodes[spanID]
-	if !ok {
-		return
-	}
-	service := ""
-	if n.Attr != nil {
-		if s, ok := n.Attr["service"].(string); ok {
-			service = s
-		}
-	}
-	if service == "" {
-		service = spanID
-	}
-	path := append(prefix, service)
-	kids := children[spanID]
-	if len(kids) == 0 {
-		*out = append(*out, path)
-		return
-	}
-	for _, child := range kids {
-		dfsSpanPathsFromGraph(g, child, children, path, out)
-	}
-}
diff --git a/internal/tools/tracestore.go b/internal/tools/tracestore.go
deleted file mode 100644
index 4c3493c..0000000
--- a/internal/tools/tracestore.go
+++ /dev/null
@@ -1,10 +0,0 @@
-package tools
-
-import "github.com/sssmaran/WaylogCLI/internal/tracestore"
-
-func traceStoreFrom(store Store) *tracestore.Store {
-	if store == nil {
-		return nil
-	}
-	return store.TraceStore()
-}
diff --git a/internal/tools/triage.go b/internal/tools/triage.go
index 7ad969d..78116ad 100644
--- a/internal/tools/triage.go
+++ b/internal/tools/triage.go
@@ -35,7 +35,7 @@ func RegisterTriageTool(reg *Registry, engine *triage.Engine) error {
 			`{"incident_id":"inc_01HX...","window":"15m"}`,
 			`{"incident_id":"inc_01HX...","snapshot":true}`,
 		},
-		Handler: func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+		Handler: func(ctx context.Context, params json.RawMessage) (any, error) {
 			var p struct {
 				IncidentID string `json:"incident_id"`
 				Window     string `json:"window"`
diff --git a/internal/tools/triage_test.go b/internal/tools/triage_test.go
index 560e802..5aec2fb 100644
--- a/internal/tools/triage_test.go
+++ b/internal/tools/triage_test.go
@@ -29,7 +29,7 @@ func TestTriageToolHandlerReturnsReport(t *testing.T) {
 		t.Fatalf("register: %v", err)
 	}
 	params := json.RawMessage(`{"incident_id":"inc_abc","window":"15m","snapshot":false}`)
-	out, err := reg.Call(context.Background(), nil /* graph store unused by triage */, "triage_incident", params)
+	out, err := reg.Call(context.Background(), "triage_incident", params)
 	if err != nil {
 		t.Fatalf("call: %v", err)
 	}
diff --git a/internal/tracestore/store.go b/internal/tracestore/store.go
deleted file mode 100644
index b038cdb..0000000
--- a/internal/tracestore/store.go
+++ /dev/null
@@ -1,279 +0,0 @@
-package tracestore
-
-import (
-	"slices"
-	"sync"
-	"time"
-)
-
-type SpanRecord struct {
-	SpanID            string
-	ParentSpanID      string
-	Service           string
-	EventName         string
-	StatusCode        int
-	Success           bool
-	LatencyMs         int64
-	ErrorCode         string
-	ErrorMessage      string
-	ErrorPath         string
-	ErrorReason       string
-	CallerService     string
-	DownstreamService string
-	Timestamp         time.Time
-	HTTPMethod        string
-	RouteTemplate     string
-	RetryOf           int
-	RetryPreviousID   string
-	Metadata          map[string]any
-}
-
-type TraceRecord struct {
-	TraceID   string
-	RequestID string
-	Spans     []SpanRecord
-	UpdatedAt time.Time
-}
-
-type Store struct {
-	mu              sync.RWMutex
-	traces          map[string]*TraceRecord
-	traceLastBucket map[string]time.Time
-	cohorts         []*cohort
-}
-
-type cohort struct {
-	bucket   time.Time
-	traceIDs map[string]struct{}
-}
-
-func NewStore() *Store {
-	return &Store{
-		traces:          map[string]*TraceRecord{},
-		traceLastBucket: map[string]time.Time{},
-	}
-}
-
-func (s *Store) Get(traceID string) (*TraceRecord, bool) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	record, ok := s.traces[traceID]
-	if !ok {
-		return nil, false
-	}
-	return cloneTraceRecord(record), true
-}
-
-func (s *Store) Upsert(traceID, requestID string, span *SpanRecord) {
-	if traceID == "" || span == nil || span.SpanID == "" {
-		return
-	}
-
-	now := time.Now().UTC()
-
-	// Use the span's event timestamp for cohort bucketing so that replayed
-	// and late-arriving events land in the correct time bucket instead of
-	// inflating the current cohort.
-	ts := span.Timestamp
-	if ts.IsZero() {
-		ts = now
-	}
-	bucket := ts.Truncate(time.Minute)
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	record := s.traces[traceID]
-	if record == nil {
-		record = &TraceRecord{TraceID: traceID}
-		s.traces[traceID] = record
-	}
-	if record.RequestID == "" && requestID != "" {
-		record.RequestID = requestID
-	}
-
-	merged := false
-	for i := range record.Spans {
-		if record.Spans[i].SpanID != span.SpanID {
-			continue
-		}
-		mergeSpanRecord(&record.Spans[i], *span)
-		merged = true
-		break
-	}
-	if !merged {
-		record.Spans = append(record.Spans, *span)
-	}
-
-	record.UpdatedAt = now
-	s.moveTraceToBucketLocked(traceID, bucket)
-}
-
-func (s *Store) ForEachSpan(start, end time.Time, fn func(traceID string, span SpanRecord)) {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-
-	for traceID, record := range s.traces {
-		for _, span := range record.Spans {
-			ts := span.Timestamp
-			if ts.IsZero() {
-				ts = record.UpdatedAt
-			}
-			if ts.IsZero() || ts.Before(start) || ts.After(end) {
-				continue
-			}
-			fn(traceID, span)
-		}
-	}
-}
-
-func (s *Store) Count() int {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	return len(s.traces)
-}
-
-func (s *Store) SpanCount() int {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	n := 0
-	for _, rec := range s.traces {
-		n += len(rec.Spans)
-	}
-	return n
-}
-
-func (s *Store) CohortCount() int {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	return len(s.cohorts)
-}
-
-func (s *Store) PruneOlderThan(cutoff time.Time) (deletedTraces int, deletedCohorts int) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	idx := 0
-	for idx < len(s.cohorts) {
-		c := s.cohorts[idx]
-		if !c.bucket.Before(cutoff) {
-			break
-		}
-		for traceID := range c.traceIDs {
-			delete(s.traces, traceID)
-			delete(s.traceLastBucket, traceID)
-			deletedTraces++
-		}
-		idx++
-	}
-	if idx > 0 {
-		s.cohorts = slices.Delete(s.cohorts, 0, idx)
-		deletedCohorts = idx
-	}
-	return deletedTraces, deletedCohorts
-}
-
-func cloneTraceRecord(record *TraceRecord) *TraceRecord {
-	if record == nil {
-		return nil
-	}
-	out := &TraceRecord{
-		TraceID:   record.TraceID,
-		RequestID: record.RequestID,
-		UpdatedAt: record.UpdatedAt,
-	}
-	if len(record.Spans) > 0 {
-		out.Spans = append([]SpanRecord(nil), record.Spans...)
-	}
-	return out
-}
-
-func mergeSpanRecord(dst *SpanRecord, src SpanRecord) {
-	if dst.ParentSpanID == "" && src.ParentSpanID != "" {
-		dst.ParentSpanID = src.ParentSpanID
-	}
-	if dst.Service == "" && src.Service != "" {
-		dst.Service = src.Service
-	}
-	if dst.EventName == "" && src.EventName != "" {
-		dst.EventName = src.EventName
-	}
-	if dst.StatusCode == 0 && src.StatusCode != 0 {
-		dst.StatusCode = src.StatusCode
-	}
-	if !dst.Success && src.Success {
-		dst.Success = src.Success
-	}
-	if dst.LatencyMs == 0 && src.LatencyMs != 0 {
-		dst.LatencyMs = src.LatencyMs
-	}
-	if dst.ErrorCode == "" && src.ErrorCode != "" {
-		dst.ErrorCode = src.ErrorCode
-	}
-	if dst.ErrorMessage == "" && src.ErrorMessage != "" {
-		dst.ErrorMessage = src.ErrorMessage
-	}
-	if dst.ErrorPath == "" && src.ErrorPath != "" {
-		dst.ErrorPath = src.ErrorPath
-	}
-	if dst.ErrorReason == "" && src.ErrorReason != "" {
-		dst.ErrorReason = src.ErrorReason
-	}
-	if dst.CallerService == "" && src.CallerService != "" {
-		dst.CallerService = src.CallerService
-	}
-	if dst.DownstreamService == "" && src.DownstreamService != "" {
-		dst.DownstreamService = src.DownstreamService
-	}
-	if dst.Timestamp.IsZero() && !src.Timestamp.IsZero() {
-		dst.Timestamp = src.Timestamp
-	}
-	if dst.HTTPMethod == "" && src.HTTPMethod != "" {
-		dst.HTTPMethod = src.HTTPMethod
-	}
-	if dst.RouteTemplate == "" && src.RouteTemplate != "" {
-		dst.RouteTemplate = src.RouteTemplate
-	}
-}
-
-func (s *Store) moveTraceToBucketLocked(traceID string, bucket time.Time) {
-	if old, ok := s.traceLastBucket[traceID]; ok && old.Equal(bucket) {
-		return
-	} else if ok {
-		s.removeTraceFromBucketLocked(traceID, old)
-	}
-
-	s.traceLastBucket[traceID] = bucket
-	cohort := s.cohortForBucketLocked(bucket)
-	cohort.traceIDs[traceID] = struct{}{}
-}
-
-func (s *Store) removeTraceFromBucketLocked(traceID string, bucket time.Time) {
-	for i := range s.cohorts {
-		if !s.cohorts[i].bucket.Equal(bucket) {
-			continue
-		}
-		delete(s.cohorts[i].traceIDs, traceID)
-		if len(s.cohorts[i].traceIDs) == 0 {
-			s.cohorts = slices.Delete(s.cohorts, i, i+1)
-		}
-		return
-	}
-}
-
-func (s *Store) cohortForBucketLocked(bucket time.Time) *cohort {
-	for i := range s.cohorts {
-		if s.cohorts[i].bucket.Equal(bucket) {
-			return s.cohorts[i]
-		}
-		if s.cohorts[i].bucket.After(bucket) {
-			c := &cohort{bucket: bucket, traceIDs: map[string]struct{}{}}
-			s.cohorts = slices.Insert(s.cohorts, i, c)
-			return c
-		}
-	}
-	c := &cohort{bucket: bucket, traceIDs: map[string]struct{}{}}
-	s.cohorts = append(s.cohorts, c)
-	return c
-}
diff --git a/internal/tracestore/store_test.go b/internal/tracestore/store_test.go
deleted file mode 100644
index 7e9dc1d..0000000
--- a/internal/tracestore/store_test.go
+++ /dev/null
@@ -1,152 +0,0 @@
-package tracestore
-
-import (
-	"testing"
-	"time"
-)
-
-func TestUpsert_NewTrace(t *testing.T) {
-	s := NewStore()
-	s.Upsert("trace-1", "req-1", &SpanRecord{
-		SpanID:  "span-a",
-		Service: "api-gateway",
-		Success: true,
-	})
-	rec, ok := s.Get("trace-1")
-	if !ok {
-		t.Fatal("expected trace to exist")
-	}
-	if rec.TraceID != "trace-1" {
-		t.Errorf("got TraceID=%q, want trace-1", rec.TraceID)
-	}
-	if rec.RequestID != "req-1" {
-		t.Errorf("got RequestID=%q, want req-1", rec.RequestID)
-	}
-	if len(rec.Spans) != 1 {
-		t.Fatalf("got %d spans, want 1", len(rec.Spans))
-	}
-	if rec.Spans[0].SpanID != "span-a" {
-		t.Errorf("got SpanID=%q, want span-a", rec.Spans[0].SpanID)
-	}
-}
-
-func TestUpsert_AppendNewSpan(t *testing.T) {
-	s := NewStore()
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-a", Service: "gw"})
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-b", Service: "checkout"})
-	rec, _ := s.Get("trace-1")
-	if len(rec.Spans) != 2 {
-		t.Fatalf("got %d spans, want 2", len(rec.Spans))
-	}
-}
-
-func TestUpsert_EnrichStub(t *testing.T) {
-	s := NewStore()
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-a"})
-	s.Upsert("trace-1", "req-1", &SpanRecord{
-		SpanID:    "span-a",
-		Service:   "checkout",
-		LatencyMs: 42,
-		Success:   true,
-	})
-	rec, _ := s.Get("trace-1")
-	if len(rec.Spans) != 1 {
-		t.Fatalf("got %d spans, want 1", len(rec.Spans))
-	}
-	if rec.Spans[0].Service != "checkout" {
-		t.Errorf("stub not enriched: Service=%q", rec.Spans[0].Service)
-	}
-	if rec.Spans[0].LatencyMs != 42 {
-		t.Errorf("stub not enriched: LatencyMs=%d", rec.Spans[0].LatencyMs)
-	}
-}
-
-func TestUpsert_FirstNonZeroWins(t *testing.T) {
-	s := NewStore()
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-a", Service: "first"})
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-a", Service: "second"})
-	rec, _ := s.Get("trace-1")
-	if rec.Spans[0].Service != "first" {
-		t.Errorf("first non-zero should win, got Service=%q", rec.Spans[0].Service)
-	}
-}
-
-func TestGet_UnknownTrace(t *testing.T) {
-	s := NewStore()
-	_, ok := s.Get("nonexistent")
-	if ok {
-		t.Error("expected ok=false for unknown trace")
-	}
-}
-
-func TestUpsert_UpdatesUpdatedAt(t *testing.T) {
-	s := NewStore()
-	s.Upsert("trace-1", "req-1", &SpanRecord{SpanID: "span-a"})
-	rec, _ := s.Get("trace-1")
-	if rec.UpdatedAt.IsZero() {
-		t.Error("UpdatedAt should be set after upsert")
-	}
-}
-
-func TestPruneOlderThan(t *testing.T) {
-	s := NewStore()
-
-	old := time.Now().Add(-3 * time.Hour)
-	s.mu.Lock()
-	rec := &TraceRecord{TraceID: "old-trace", RequestID: "req-old", UpdatedAt: old}
-	rec.Spans = []SpanRecord{{SpanID: "s1", Service: "svc", Timestamp: old}}
-	s.traces["old-trace"] = rec
-	bucket := old.Truncate(time.Minute)
-	s.traceLastBucket["old-trace"] = bucket
-	s.cohorts = append([]*cohort{{bucket: bucket, traceIDs: map[string]struct{}{"old-trace": {}}}}, s.cohorts...)
-	s.mu.Unlock()
-
-	s.Upsert("new-trace", "req-new", &SpanRecord{SpanID: "s2", Service: "svc"})
-
-	cutoff := time.Now().Add(-1 * time.Hour)
-	s.PruneOlderThan(cutoff)
-
-	_, ok := s.Get("old-trace")
-	if ok {
-		t.Error("old trace should be pruned")
-	}
-	_, ok = s.Get("new-trace")
-	if !ok {
-		t.Error("new trace should still exist")
-	}
-}
-
-func TestForEachSpan(t *testing.T) {
-	s := NewStore()
-	s.Upsert("t1", "r1", &SpanRecord{SpanID: "a", Service: "gw"})
-	s.Upsert("t1", "r1", &SpanRecord{SpanID: "b", Service: "checkout"})
-	s.Upsert("t2", "r2", &SpanRecord{SpanID: "c", Service: "payment"})
-
-	var count int
-	start := time.Now().Add(-1 * time.Minute)
-	end := time.Now().Add(1 * time.Minute)
-	s.ForEachSpan(start, end, func(traceID string, span SpanRecord) {
-		if traceID == "" {
-			t.Fatal("traceID should be populated")
-		}
-		count++
-	})
-	if count != 3 {
-		t.Errorf("got %d spans, want 3", count)
-	}
-}
-
-func TestForEachSpan_TimeFiltered(t *testing.T) {
-	s := NewStore()
-	s.Upsert("t1", "r1", &SpanRecord{SpanID: "a", Service: "gw"})
-
-	var count int
-	start := time.Now().Add(-2 * time.Hour)
-	end := time.Now().Add(-1 * time.Hour)
-	s.ForEachSpan(start, end, func(traceID string, span SpanRecord) {
-		count++
-	})
-	if count != 0 {
-		t.Errorf("got %d spans, want 0 (outside window)", count)
-	}
-}
diff --git a/internal/tracestore/tree.go b/internal/tracestore/tree.go
deleted file mode 100644
index 9ee1aa6..0000000
--- a/internal/tracestore/tree.go
+++ /dev/null
@@ -1,108 +0,0 @@
-package tracestore
-
-import "sort"
-
-type TreeNode struct {
-	Span     SpanRecord
-	Children []*TreeNode
-}
-
-// BuildTree reconstructs a span tree from a flat span list.
-// Orphans and unresolved cycles are promoted to roots so callers always get a
-// stable forest instead of losing evidence.
-func BuildTree(spans []SpanRecord) []*TreeNode {
-	if len(spans) == 0 {
-		return nil
-	}
-
-	byID := make(map[string]SpanRecord, len(spans))
-	order := make([]string, 0, len(spans))
-	for _, span := range spans {
-		if span.SpanID == "" {
-			continue
-		}
-		if existing, ok := byID[span.SpanID]; ok {
-			mergeSpanRecord(&existing, span)
-			byID[span.SpanID] = existing
-			continue
-		}
-		byID[span.SpanID] = span
-		order = append(order, span.SpanID)
-	}
-	children := make(map[string][]string, len(spans))
-	for _, span := range byID {
-		if span.ParentSpanID != "" {
-			children[span.ParentSpanID] = append(children[span.ParentSpanID], span.SpanID)
-		}
-	}
-
-	sort.Strings(order)
-	sortChildren := func(ids []string) {
-		sort.Slice(ids, func(i, j int) bool {
-			left := byID[ids[i]]
-			right := byID[ids[j]]
-			if left.Timestamp.Equal(right.Timestamp) {
-				return left.SpanID < right.SpanID
-			}
-			if left.Timestamp.IsZero() {
-				return false
-			}
-			if right.Timestamp.IsZero() {
-				return true
-			}
-			return left.Timestamp.Before(right.Timestamp)
-		})
-	}
-	for parentID := range children {
-		sortChildren(children[parentID])
-	}
-
-	roots := make([]string, 0, len(order))
-	for _, id := range order {
-		span := byID[id]
-		if span.ParentSpanID == "" {
-			roots = append(roots, id)
-			continue
-		}
-		if _, ok := byID[span.ParentSpanID]; !ok {
-			roots = append(roots, id)
-			continue
-		}
-	}
-	if len(roots) == 0 {
-		roots = append(roots, order...)
-	}
-
-	stack := make(map[string]bool, len(spans))
-	var build func(string) *TreeNode
-	build = func(id string) *TreeNode {
-		span, ok := byID[id]
-		if !ok {
-			return nil
-		}
-		if stack[id] {
-			return &TreeNode{Span: span}
-		}
-		stack[id] = true
-
-		node := &TreeNode{Span: span}
-		for _, childID := range children[id] {
-			if stack[childID] {
-				continue
-			}
-			if child := build(childID); child != nil {
-				node.Children = append(node.Children, child)
-			}
-		}
-		delete(stack, id)
-		return node
-	}
-
-	out := make([]*TreeNode, 0, len(roots))
-	for _, id := range roots {
-		if node := build(id); node != nil {
-			out = append(out, node)
-		}
-	}
-	return out
-}
diff --git a/internal/tracestore/tree_test.go b/internal/tracestore/tree_test.go
deleted file mode 100644
index 4f03cba..0000000
--- a/internal/tracestore/tree_test.go
+++ /dev/null
@@ -1,73 +0,0 @@
-package tracestore
-
-import "testing"
-
-func TestBuildTree_SingleRoot(t *testing.T) {
-	spans := []SpanRecord{
-		{SpanID: "root", ParentSpanID: "", Service: "gw"},
-		{SpanID: "child1", ParentSpanID: "root", Service: "checkout"},
-		{SpanID: "child2", ParentSpanID: "root", Service: "payment"},
-	}
-	roots := BuildTree(spans)
-	if len(roots) != 1 {
-		t.Fatalf("got %d roots, want 1", len(roots))
-	}
-	if roots[0].Span.SpanID != "root" {
-		t.Errorf("root SpanID=%q", roots[0].Span.SpanID)
-	}
-	if len(roots[0].Children) != 2 {
-		t.Errorf("root has %d children, want 2", len(roots[0].Children))
-	}
-}
-
-func TestBuildTree_OrphanBecomesRoot(t *testing.T) {
-	spans := []SpanRecord{
-		{SpanID: "child", ParentSpanID: "missing-parent", Service: "svc"},
-	}
-	roots := BuildTree(spans)
-	if len(roots) != 1 {
-		t.Fatalf("orphan should become root, got %d roots", len(roots))
-	}
-	if roots[0].Span.SpanID != "child" {
-		t.Errorf("got SpanID=%q", roots[0].Span.SpanID)
-	}
-}
-
-func TestBuildTree_EmptySpans(t *testing.T) {
-	roots := BuildTree(nil)
-	if len(roots) != 0 {
-		t.Errorf("expected 0 roots for nil spans, got %d", len(roots))
-	}
-}
-
-func TestBuildTree_CycleDetection(t *testing.T) {
-	spans := []SpanRecord{
-		{SpanID: "a", ParentSpanID: "b", Service: "svc1"},
-		{SpanID: "b", ParentSpanID: "a", Service: "svc2"},
-	}
-	roots := BuildTree(spans)
-	if len(roots) == 0 {
-		t.Error("expected at least one root from cyclic spans")
-	}
-}
-
-func TestBuildTree_DeepChain(t *testing.T) {
-	spans := []SpanRecord{
-		{SpanID: "r", ParentSpanID: "", Service: "gw"},
-		{SpanID: "c1", ParentSpanID: "r", Service: "auth"},
-		{SpanID: "c2", ParentSpanID: "c1", Service: "db"},
-	}
-	roots := BuildTree(spans)
-	if len(roots) != 1 {
-		t.Fatalf("got %d roots, want 1", len(roots))
-	}
-	if len(roots[0].Children) != 1 {
-		t.Fatalf("root children=%d, want 1", len(roots[0].Children))
-	}
-	if len(roots[0].Children[0].Children) != 1 {
-		t.Fatalf("depth-2 children=%d, want 1", len(roots[0].Children[0].Children))
-	}
-	if roots[0].Children[0].Children[0].Span.Service != "db" {
-		t.Error("deepest node should be db")
-	}
-}
diff --git a/internal/tracestory/story.go b/internal/tracestory/story.go
deleted file mode 100644
index c4f46d3..0000000
--- a/internal/tracestory/story.go
+++ /dev/null
@@ -1,432 +0,0 @@
-package tracestory
-
-import (
-	"fmt"
-	"sort"
-	"strconv"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-)
-
-// Hop represents a single service hop in a trace.
-type Hop struct {
-	SpanID     string    `json:"span_id"`
-	Service    string    `json:"service"`
-	StatusCode int       `json:"status_code"`
-	LatencyMs  int64     `json:"latency_ms"`
-	Success    bool      `json:"success"`
-	ErrorCode  string    `json:"error_code,omitempty"`
-	IsRoot     bool      `json:"is_root"`
-	Timestamp  time.Time `json:"timestamp,omitempty"`
-}
-
-// Story is the full trace narrative: an ordered chain of hops.
-// Tree is populated only when the caller requests the tree format; the flat
-// Chain always reflects root-first DFS order for back-compat with clients.
-type Story struct {
-	TraceID      string                 `json:"trace_id"`
-	Chain        []Hop                  `json:"chain"`
-	Success      bool                   `json:"success"`
-	FirstFailHop *Hop                   `json:"first_fail_hop,omitempty"`
-	HopCount     int                    `json:"hop_count"`
-	Tree         []*tracestore.TreeNode `json:"tree,omitempty"`
-}
-
-// Context provides user and request metadata for the trace.
-type Context struct {
-	RequestID    string   `json:"request_id,omitempty"`
-	RequestEvent string   `json:"request_event,omitempty"`
-	ErrorCodes   []string `json:"error_codes,omitempty"`
-	UserID       string   `json:"user_id,omitempty"`
-	UserTier     string   `json:"user_tier,omitempty"`
-	UserRegion   string   `json:"user_region,omitempty"`
-	Flow         string   `json:"flow,omitempty"`
-	Flags        []string `json:"flags,omitempty"`
-}
-
-// Build constructs a Story and Context from a graph for the given traceID.
-// Prefer BuildWithTraceStore when a trace store is available.
-func Build(g *core.Graph, traceID string) (Story, Context, error) {
-	return BuildWithTraceStore(g, nil, traceID)
-}
-
-// BuildWithTraceStore constructs a Story and Context from a graph plus an
-// optional trace store. The flat Chain is populated; Tree is not.
-func BuildWithTraceStore(g *core.Graph, traceStore *tracestore.Store, traceID string) (Story, Context, error) {
-	return BuildWithFormat(g, traceStore, traceID, "")
-}
-
-// BuildWithFormat behaves like BuildWithTraceStore and additionally populates
-// Story.Tree when format == "tree". Any other format value yields the default
-// flat-Chain response so existing callers stay unchanged.
-func BuildWithFormat(g *core.Graph, traceStore *tracestore.Store, traceID, format string) (Story, Context, error) {
-	if g == nil {
-		return Story{}, Context{}, fmt.Errorf("graph is nil")
-	}
-
-	reqID := core.ID("request", traceID)
-	reqNode, ok := g.Nodes[reqID]
-	if !ok {
-		return Story{}, Context{}, fmt.Errorf("trace %s not found", traceID)
-	}
-
-	var (
-		story Story
-		ctx   Context
-		tree  []*tracestore.TreeNode
-		err   error
-	)
-	if traceStore != nil {
-		if rec, ok := traceStore.Get(traceID); ok {
-			story, ctx, tree, err = buildFromTraceRecord(g, traceID, reqID, reqNode, rec)
-		} else {
-			story, ctx, err = buildFromGraph(g, reqID, reqNode)
-		}
-	} else {
-		story, ctx, err = buildFromGraph(g, reqID, reqNode)
-	}
-	if err != nil {
-		return Story{}, Context{}, err
-	}
-	if format == "tree" {
-		story.Tree = tree
-	}
-	return story, ctx, nil
-}
-
-func buildFromTraceRecord(g *core.Graph, traceID, reqID string, reqNode core.Node, rec *tracestore.TraceRecord) (Story, Context, []*tracestore.TreeNode, error) {
-	roots := tracestore.BuildTree(rec.Spans)
-
-	var chain []Hop
-	var walk func(*tracestore.TreeNode)
-	walk = func(node *tracestore.TreeNode) {
-		if node == nil {
-			return
-		}
-		chain = append(chain, hopFromRecord(node.Span))
-		for _, child := range node.Children {
-			walk(child)
-		}
-	}
-	for _, root := range roots {
-		walk(root)
-	}
-
-	story := Story{
-		TraceID:  traceID,
-		Chain:    chain,
-		Success:  true,
-		HopCount: len(chain),
-	}
-	for i := range chain {
-		if !chain[i].Success {
-			story.Success = false
-			if story.FirstFailHop == nil {
-				hop := chain[i] // copy
-				story.FirstFailHop = &hop
-			}
-		}
-	}
-
-	ctx := buildContext(g, reqID, reqNode)
-
-	return story, ctx, roots, nil
-}
-
-func buildFromGraph(g *core.Graph, reqID string, reqNode core.Node) (Story, Context, error) {
-	roots := rootSpanIDs(g, reqID)
-	children := map[string][]string{}
-	for _, e := range g.Edges {
-		if e.Type == core.EdgeSpanChildOf {
-			children[e.To] = append(children[e.To], e.From)
-		}
-	}
-	sortSpanIDsByTime(g, roots)
-	for parentID := range children {
-		sortSpanIDsByTime(g, children[parentID])
-	}
-
-	var chain []Hop
-	visited := map[string]bool{}
-	for _, root := range roots {
-		dfsHops(g, root, children, visited, &chain)
-	}
-
-	story := Story{
-		TraceID:  stringAttr(reqNode.Attr["trace_id"]),
-		Chain:    chain,
-		Success:  true,
-		HopCount: len(chain),
-	}
-	for i := range chain {
-		if !chain[i].Success {
-			story.Success = false
-			if story.FirstFailHop == nil {
-				hop := chain[i]
-				story.FirstFailHop = &hop
-			}
-		}
-	}
-
-	ctx := buildContext(g, reqID, reqNode)
-	return story, ctx, nil
-}
-
-// rootSpanIDs finds spans connected to the request that are NOT children of other spans.
-func rootSpanIDs(g *core.Graph, reqID string) []string {
-	// Collect all span IDs that are children (have a span_child_of edge FROM them)
-	hasParent := map[string]bool{}
-	for _, e := range g.Edges {
-		if e.Type == core.EdgeSpanChildOf {
-			hasParent[e.From] = true
-		}
-	}
-
-	// Find spans connected to this request that have no parent
-	var roots []string
-	seen := map[string]bool{}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type != core.EdgeRequestHasSpan {
-			continue
-		}
-		if seen[e.To] {
-			continue
-		}
-		seen[e.To] = true
-		if !hasParent[e.To] {
-			roots = append(roots, e.To)
-		}
-	}
-	return roots
-}
-
-// dfsHops traverses the span tree depth-first, building hops in root-first order.
-func dfsHops(g *core.Graph, spanID string, children map[string][]string, visited map[string]bool, chain *[]Hop) {
-	if visited[spanID] {
-		return
-	}
-	visited[spanID] = true
-
-	n, ok := g.Nodes[spanID]
-	if !ok {
-		return
-	}
-
-	*chain = append(*chain, hopFromNode(n))
-
-	for _, childID := range children[spanID] {
-		dfsHops(g, childID, children, visited, chain)
-	}
-}
-
-// hopFromNode extracts a Hop from a span node's enriched attributes.
-func hopFromNode(n core.Node) Hop {
-	h := Hop{}
-	if n.Attr == nil {
-		return h
-	}
-
-	h.SpanID = stringAttr(n.Attr["span_id"])
-	h.Service = stringAttr(n.Attr["service"])
-	h.StatusCode, _ = intAttr(n.Attr["status_code"])
-	h.LatencyMs, _ = int64Attr(n.Attr["latency_ms"])
-	h.Success, _ = boolAttr(n.Attr["success"])
-	h.ErrorCode = stringAttr(n.Attr["error_code"])
-	h.IsRoot = stringAttr(n.Attr["parent_span_id"]) == ""
-	h.Timestamp, _ = timeAttr(n.Attr["timestamp"])
-	return h
-}
-
-func hopFromRecord(span tracestore.SpanRecord) Hop {
-	return Hop{
-		SpanID:     span.SpanID,
-		Service:    span.Service,
-		StatusCode: span.StatusCode,
-		LatencyMs:  span.LatencyMs,
-		Success:    span.Success,
-		ErrorCode:  span.ErrorCode,
-		IsRoot:     span.ParentSpanID == "",
-		Timestamp:  span.Timestamp,
-	}
-}
-
-// buildContext extracts user and request metadata for a trace.
-func buildContext(g *core.Graph, reqID string, reqNode core.Node) Context {
-	ctx := Context{RequestID: reqID}
-
-	// Flow from request node
-	if reqNode.Attr != nil {
-		ctx.RequestEvent = stringAttr(reqNode.Attr["event_name"])
-		ctx.Flow = stringAttr(reqNode.Attr["flow"])
-		ctx.ErrorCodes = append(ctx.ErrorCodes, stringSliceAttr(reqNode.Attr["error_codes"])...)
-		if len(ctx.ErrorCodes) == 0 {
-			ctx.ErrorCodes = append(ctx.ErrorCodes, stringSliceAttr(reqNode.Attr["errors"])...)
-		}
-		if code := stringAttr(reqNode.Attr["error_code"]); code != "" && len(ctx.ErrorCodes) == 0 {
-			ctx.ErrorCodes = append(ctx.ErrorCodes, code)
-		}
-		if flags := stringSliceAttr(reqNode.Attr["feature_flags"]); len(flags) > 0 {
-			ctx.Flags = append(ctx.Flags, flags...)
-		}
-	}
-
-	// Find user node via request_by edge
-	if reqNode.Attr != nil {
-		if v := stringAttr(reqNode.Attr["user_id"]); v != "" {
-			ctx.UserID = v
-		}
-		if v := stringAttr(reqNode.Attr["user_tier"]); v != "" {
-			ctx.UserTier = v
-		}
-		if v := stringAttr(reqNode.Attr["user_region"]); v != "" {
-			ctx.UserRegion = v
-		}
-	}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type == core.EdgeRequestBy {
-			if userNode, ok := g.Nodes[e.To]; ok && userNode.Attr != nil {
-				ctx.UserTier = stringAttr(userNode.Attr["tier"])
-				ctx.UserRegion = stringAttr(userNode.Attr["region"])
-				ctx.UserID = e.To
-				if uid := stringAttr(userNode.Attr["id"]); uid != "" {
-					ctx.UserID = uid
-				}
-			}
-			break
-		}
-	}
-
-	// Find feature flags via used_flag edges
-	if len(ctx.Flags) == 0 {
-		ctx.Flags = append(ctx.Flags, stringSliceAttr(reqNode.Attr["feature_flags"])...)
-	}
-	for _, e := range g.OutEdges[reqID] {
-		if e.Type == core.EdgeUsedFlag {
-			if flagNode, ok := g.Nodes[e.To]; ok && flagNode.Attr != nil {
-				if name := stringAttr(flagNode.Attr["name"]); name != "" {
-					ctx.Flags = append(ctx.Flags, name)
-				}
-			}
-		}
-	}
-
-	return ctx
-}
-
-func sortSpanIDsByTime(g *core.Graph, spanIDs []string) {
-	sort.Slice(spanIDs, func(i, j int) bool {
-		leftTime := spanSortTime(g, spanIDs[i])
-		rightTime := spanSortTime(g, spanIDs[j])
-		if !leftTime.Equal(rightTime) {
-			if leftTime.IsZero() {
-				return false
-			}
-			if rightTime.IsZero() {
-				return true
-			}
-			return leftTime.Before(rightTime)
-		}
-		return spanIDs[i] < spanIDs[j]
-	})
-}
-
-func spanSortTime(g *core.Graph, spanID string) time.Time {
-	n, ok := g.Nodes[spanID]
-	if !ok {
-		return time.Time{}
-	}
-	if ts, ok := timeAttr(n.Attr["timestamp"]); ok && !ts.IsZero() {
-		return ts
-	}
-	if !n.FirstSeen.IsZero() {
-		return n.FirstSeen
-	}
-	return n.LastSeen
-}
-
-func stringSliceAttr(v any) []string {
-	switch values := v.(type) {
-	case []string:
-		return append([]string(nil), values...)
-	case []any:
-		out := make([]string, 0, len(values))
-		for _, item := range values {
-			if s, ok := item.(string); ok && s != "" {
-				out = append(out, s)
-			}
-		}
-		return out
-	default:
-		return nil
-	}
-}
-
-func stringAttr(v any) string {
-	if s, ok := v.(string); ok {
-		return s
-	}
-	if v == nil {
-		return ""
-	}
-	return fmt.Sprintf("%v", v)
-}
-
-func int64Attr(v any) (int64, bool) {
-	switch t := v.(type) {
-	case int:
-		return int64(t), true
-	case int64:
-		return t, true
-	case float64:
-		return int64(t), true
-	case string:
-		i, err := strconv.ParseInt(t, 10, 64)
-		if err == nil {
-			return i, true
-		}
-	}
-	return 0, false
-}
-
-func intAttr(v any) (int, bool) {
-	n, ok := int64Attr(v)
-	return int(n), ok
-}
-
-func boolAttr(v any) (bool, bool) {
-	switch t := v.(type) {
-	case bool:
-		return t, true
-	case string:
-		b, err := strconv.ParseBool(t)
-		if err == nil {
-			return b, true
-		}
-	}
-	if n, ok := int64Attr(v); ok {
-		return n != 0, true
-	}
-	return false, false
-}
-
-func timeAttr(v any) (time.Time, bool) {
-	switch t := v.(type) {
-	case time.Time:
-		return t, true
-	case string:
-		ts, err := time.Parse(time.RFC3339Nano, t)
-		if err == nil {
-			return ts, true
-		}
-		ts, err = time.Parse(time.RFC3339, t)
-		if err == nil {
-			return ts, true
-		}
-	}
-	if n, ok := int64Attr(v); ok {
-		return time.Unix(n, 0).UTC(), true
-	}
-	return time.Time{}, false
-}
diff --git a/internal/tracestory/story_test.go b/internal/tracestory/story_test.go
deleted file mode 100644
index 2efba6e..0000000
--- a/internal/tracestory/story_test.go
+++ /dev/null
@@ -1,231 +0,0 @@
-package tracestory
-
-import (
-	"slices"
-	"testing"
-	"time"
-
-	"github.com/sssmaran/WaylogCLI/internal/graph/build"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	"github.com/sssmaran/WaylogCLI/internal/graph/store"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
-)
-
-func TestBuildWithTraceStore_SuccessChain(t *testing.T) {
-	traceID := "11111111111111111111111111111111"
-	graphStore, traceStore := buildThreeHopTrace(t, traceID, false)
-
-	story, ctx, err := BuildWithTraceStore(graphStore.Snapshot(), traceStore, traceID)
-	if err != nil {
-		t.Fatalf("BuildWithTraceStore() error: %v", err)
-	}
-	if story.HopCount != 3 {
-		t.Fatalf("HopCount = %d, want 3", story.HopCount)
-	}
-	if !story.Success || story.FirstFailHop != nil {
-		t.Fatalf("unexpected story success state: %+v", story)
-	}
-	want := []string{"api-gateway", "checkout", "payment"}
-	for i := range want {
-		if story.Chain[i].Service != want[i] {
-			t.Fatalf("Chain[%d].Service = %q, want %q", i, story.Chain[i].Service, want[i])
-		}
-	}
-	if ctx.UserID != "user-42" || ctx.UserTier != "premium" || ctx.UserRegion != "us-west-2" {
-		t.Fatalf("unexpected context: %+v", ctx)
-	}
-	if !slices.Contains(ctx.Flags, "dark-mode") {
-		t.Fatalf("expected dark-mode in context flags, got %v", ctx.Flags)
-	}
-}
-
-func TestBuildWithTraceStore_FailureChain(t *testing.T) {
-	traceID := "22222222222222222222222222222222"
-	graphStore, traceStore := buildThreeHopTrace(t, traceID, true)
-
-	story, ctx, err := BuildWithTraceStore(graphStore.Snapshot(), traceStore, traceID)
-	if err != nil {
-		t.Fatalf("BuildWithTraceStore() error: %v", err)
-	}
-	if story.Success {
-		t.Fatal("expected failed trace")
-	}
-	if story.FirstFailHop == nil {
-		t.Fatal("expected first failing hop")
-	}
-	if story.FirstFailHop.Service != "payment" {
-		t.Fatalf("FirstFailHop.Service = %q, want payment", story.FirstFailHop.Service)
-	}
-	if story.FirstFailHop.ErrorCode != "PMT_502" {
-		t.Fatalf("FirstFailHop.ErrorCode = %q, want PMT_502", story.FirstFailHop.ErrorCode)
-	}
-	if !slices.Contains(ctx.ErrorCodes, "PMT_502") {
-		t.Fatalf("expected PMT_502 in context error codes, got %v", ctx.ErrorCodes)
-	}
-}
-
-func TestBuildWithTraceStore_SingleHop(t *testing.T) {
-	traceID := "33333333333333333333333333333333"
-	graphStore := store.NewStore()
-	traceStore := tracestore.NewStore()
-
-	builder := build.NewBuilder()
-
-	ev := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-		testutil.WithParentSpanID(""),
-		testutil.WithService("api-gateway"),
-		testutil.WithStatusCode(200),
-		testutil.WithLatency(10),
-	)
-	upsertEvent(t, graphStore, traceStore, builder, ev)
-
-	story, _, err := BuildWithTraceStore(graphStore.Snapshot(), traceStore, traceID)
-	if err != nil {
-		t.Fatalf("BuildWithTraceStore() error: %v", err)
-	}
-	if story.HopCount != 1 || story.Chain[0].Service != "api-gateway" {
-		t.Fatalf("unexpected story: %+v", story)
-	}
-}
-
-func TestBuildWithTraceStore_OrdersSiblingHopsByTimestamp(t *testing.T) {
-	traceID := "66666666666666666666666666666666"
-	graphStore := store.NewStore()
-	traceStore := tracestore.NewStore()
-
-	builder := build.NewBuilder()
-	base := time.Now().UTC()
-
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithParentSpanID(""),
-			testutil.WithService("api-gateway"),
-			testutil.WithTimestamp(base.Add(1*time.Millisecond)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithParentSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithService("checkout"),
-			testutil.WithTimestamp(base.Add(2*time.Millisecond)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("dddddddddddddddd"),
-			testutil.WithParentSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithService("payment"),
-			testutil.WithTimestamp(base.Add(4*time.Millisecond)),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("cccccccccccccccc"),
-			testutil.WithParentSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithService("db"),
-			testutil.WithTimestamp(base.Add(3*time.Millisecond)),
-		),
-	}
-
-	for _, ev := range events {
-		upsertEvent(t, graphStore, traceStore, builder, ev)
-	}
-
-	story, _, err := BuildWithTraceStore(graphStore.Snapshot(), traceStore, traceID)
-	if err != nil {
-		t.Fatalf("BuildWithTraceStore() error: %v", err)
-	}
-	want := []string{"api-gateway", "checkout", "db", "payment"}
-	if len(story.Chain) != len(want) {
-		t.Fatalf("chain length = %d, want %d", len(story.Chain), len(want))
-	}
-	for i := range want {
-		if story.Chain[i].Service != want[i] {
-			t.Fatalf("Chain[%d].Service = %q, want %q", i, story.Chain[i].Service, want[i])
-		}
-	}
-}
-
-func TestBuild_UnknownTrace(t *testing.T) {
-	graphStore := store.NewStore()
-	_, _, err := Build(graphStore.Snapshot(), "00000000000000000000000000000000")
-	if err == nil {
-		t.Fatal("expected error for unknown trace")
-	}
-}
-
-func buildThreeHopTrace(t *testing.T, traceID string, paymentFail bool) (*store.Store, *tracestore.Store) {
-	t.Helper()
-
-	graphStore := store.NewStore()
-	traceStore := tracestore.NewStore()
-
-	builder := build.NewBuilder()
-
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithParentSpanID(""),
-			testutil.WithService("api-gateway"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(45),
-			testutil.WithUser("user-42", "premium", "us-west-2"),
-			testutil.WithFlow("checkout"),
-			testutil.WithFeatureFlags("dark-mode"),
-		),
-		testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithParentSpanID("aaaaaaaaaaaaaaaa"),
-			testutil.WithService("checkout"),
-			testutil.WithStatusCode(200),
-			testutil.WithLatency(32),
-			testutil.WithUser("user-42", "premium", "us-west-2"),
-			testutil.WithCallerService("api-gateway"),
-		),
-	}
-
-	payment := testutil.MakeEvent(
-		testutil.WithTraceID(traceID),
-		testutil.WithSpanID("cccccccccccccccc"),
-		testutil.WithParentSpanID("bbbbbbbbbbbbbbbb"),
-		testutil.WithService("payment"),
-		testutil.WithStatusCode(200),
-		testutil.WithLatency(12),
-		testutil.WithUser("user-42", "premium", "us-west-2"),
-		testutil.WithCallerService("checkout"),
-	)
-	if paymentFail {
-		payment = testutil.MakeEvent(
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("cccccccccccccccc"),
-			testutil.WithParentSpanID("bbbbbbbbbbbbbbbb"),
-			testutil.WithService("payment"),
-			testutil.WithStatusCode(502),
-			testutil.WithLatency(12),
-			testutil.WithUser("user-42", "premium", "us-west-2"),
-			testutil.WithCallerService("checkout"),
-			testutil.WithError("PMT_502", "payment failed"),
-		)
-	}
-	events = append(events, payment)
-
-	for _, ev := range events {
-		upsertEvent(t, graphStore, traceStore, builder, ev)
-	}
-	return graphStore, traceStore
-}
-
-func upsertEvent(t *testing.T, graphStore *store.Store, traceStore *tracestore.Store, builder *build.Builder, ev event.WideEvent) {
-	t.Helper()
-	result := builder.BuildResult(ev)
-	graphStore.Merge(result.Graph)
-	if result.Span != nil {
-		traceStore.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-	}
-}
diff --git a/internal/triage/adapter.go b/internal/triage/adapter.go
index 1723880..bf5cd2f 100644
--- a/internal/triage/adapter.go
+++ b/internal/triage/adapter.go
@@ -15,9 +15,9 @@ import (
 )
 
 // Upstream collaborator interfaces. Defined narrowly so adapters are testable
-// without instantiating real engines/stores. Production wiring (Task 11)
-// satisfies these with *incidents.Engine (Get / BlastRadius+Errors), the
-// signal store, and a closure over (*core.Graph, *tracestore.Store).
+// without instantiating real engines/stores. Production wiring satisfies
+// these with *incidents.Engine (Get / BlastRadius+Errors), the signal store,
+// and a closure over the v2 reader's TraceStoryByTraceID.
 
 // IncidentReader returns a single incident by ID. *incidents.Engine satisfies
 // this via its Get method.
@@ -64,16 +64,20 @@ func (a incidentLookupAdapter) GetIncident(ctx context.Context, id string) (Inci
 		return IncidentSummary{}, err
 	}
 	return IncidentSummary{
-		ID:         inc.IncidentID,
-		Window:     defaultWindowLabel,
-		Env:        inc.Env,
-		StartedAt:  inc.StartedAt,
-		UpdatedAt:  inc.UpdatedAt,
-		Service:    inc.ErrorFamily.Service,
-		Step:       inc.ErrorFamily.Step,
-		ErrorCode:  inc.ErrorFamily.ErrorCode,
-		Confidence: mapConfidence(inc.Confidence),
-		NextChecks: append([]string(nil), inc.NextChecks...),
+		ID:          inc.IncidentID,
+		Window:      defaultWindowLabel,
+		Env:         inc.Env,
+		StartedAt:   inc.StartedAt,
+		UpdatedAt:   inc.UpdatedAt,
+		Service:     inc.ErrorFamily.Service,
+		Step:        inc.ErrorFamily.Step,
+		ErrorCode:   inc.ErrorFamily.ErrorCode,
+		Confidence:  mapConfidence(inc.Confidence),
+		NextChecks:  append([]string(nil), inc.NextChecks...),
+		Propagation: inc.Propagation,
+		Blast:       inc.Blast,
+		Alerts:      inc.Alerts,
+		Runtime:     inc.Runtime,
 	}, nil
 }
 
@@ -100,20 +104,31 @@ func NewBlastQueryAdapter(r BlastReader) BlastQuery {
 }
 
 func (a blastQueryAdapter) BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
-	end := opts.Now
-	if end.IsZero() {
-		end = inc.UpdatedAt
-	}
-	window := opts.Window
-	if window <= 0 {
-		window = defaultWindow
-	}
-	filter := incidents.SearchFilter{
-		Service:   inc.Service,
-		ErrorCode: inc.ErrorCode,
-		Since:     end.Add(-window),
-		Until:     end,
+	if inc.Blast != nil && inc.Blast.Latest != nil {
+		bl := inc.Blast.Latest
+		users := 0
+		if bl.AffectedUsers != nil {
+			users = *bl.AffectedUsers
+		}
+		families, err := a.topErrorFamilies(inc, opts)
+		if err != nil {
+			return BlastSnapshotResult{}, err
+		}
+		return BlastSnapshotResult{
+			Requests:         bl.AffectedRequests,
+			Users:            users,
+			Services:         bl.AffectedServices,
+			TopErrorFamilies: families,
+		}, nil
 	}
+	return a.blastSnapshotFromReader(ctx, inc, opts)
+}
+
+// blastSnapshotFromReader is the pre-v1.0 computation path. Called when the
+// incident has no Blast.Latest snapshot (legacy stored incidents, or a tick
+// where capture failed and Latest is missing entirely).
+func (a blastQueryAdapter) blastSnapshotFromReader(_ context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
+	filter := blastFilter(inc, opts)
 	br := a.r.BlastRadius(filter, apiv2.BlastKey{
 		Service:   inc.Service,
 		Step:      inc.Step,
@@ -123,6 +138,23 @@ func (a blastQueryAdapter) BlastSnapshot(ctx context.Context, inc IncidentSummar
 	if br.AffectedUsers != nil {
 		users = *br.AffectedUsers
 	}
+	families, err := a.topErrorFamiliesWithFilter(filter)
+	if err != nil {
+		return BlastSnapshotResult{}, err
+	}
+	return BlastSnapshotResult{
+		Requests:         br.AffectedRequests,
+		Users:            users,
+		Services:         br.AffectedServices,
+		TopErrorFamilies: families,
+	}, nil
+}
+
+func (a blastQueryAdapter) topErrorFamilies(inc IncidentSummary, opts BuildOptions) ([]pkgtriage.ErrorFamily, error) {
+	return a.topErrorFamiliesWithFilter(blastFilter(inc, opts))
+}
+
+func (a blastQueryAdapter) topErrorFamiliesWithFilter(filter incidents.SearchFilter) ([]pkgtriage.ErrorFamily, error) {
 	rows := a.r.Errors(filter, 5).Rows
 	families := make([]pkgtriage.ErrorFamily, 0, len(rows))
 	for _, row := range rows {
@@ -133,12 +165,24 @@ func (a blastQueryAdapter) BlastSnapshot(ctx context.Context, inc IncidentSummar
 			Count:     row.Count,
 		})
 	}
-	return BlastSnapshotResult{
-		Requests:         br.AffectedRequests,
-		Users:            users,
-		Services:         br.AffectedServices,
-		TopErrorFamilies: families,
-	}, nil
+	return families, nil
+}
+
+func blastFilter(inc IncidentSummary, opts BuildOptions) incidents.SearchFilter {
+	end := opts.Now
+	if end.IsZero() {
+		end = inc.UpdatedAt
+	}
+	window := opts.Window
+	if window <= 0 {
+		window = defaultWindow
+	}
+	return incidents.SearchFilter{
+		Service:   inc.Service,
+		ErrorCode: inc.ErrorCode,
+		Since:     end.Add(-window),
+		Until:     end,
+	}
 }
 
 type storyBuilderAdapter struct {
@@ -148,14 +192,63 @@ type storyBuilderAdapter struct {
 
 // NewStoryBuilderAdapter wraps an upstream incident reader (to discover the
 // first-failure trace ID) and a story-build function (production: closure
-// over tracestory.BuildWithTraceStore). The trace selected is the first
+// over the v2 reader's TraceStoryByTraceID). The trace selected is the first
 // SampleTraces entry on the underlying incident; if none exists, returns an
 // empty result rather than erroring (M1).
 func NewStoryBuilderAdapter(r IncidentReader, build StoryBuildFunc) StoryBuilder {
 	return storyBuilderAdapter{r: r, build: build}
 }
 
-func (a storyBuilderAdapter) FirstFailureStory(ctx context.Context, inc IncidentSummary, _ BuildOptions) (FirstFailureResult, error) {
+func (a storyBuilderAdapter) FirstFailureStory(ctx context.Context, inc IncidentSummary, opts BuildOptions) (FirstFailureResult, error) {
+	if inc.Propagation != nil && inc.Propagation.Latest != nil {
+		return a.firstFailureFromSnapshot(inc) // snapshot present: project it, no reader call
+	}
+	return a.firstFailureFromReader(ctx, inc) // no snapshot: reader-driven path
+}
+
+// firstFailureFromSnapshot projects from Incident.Propagation.Latest + Blast.Latest.
+// FirstFailure is a compact JSON object (origin_service / origin_step /
+// first_failing_step / error_code / sample_trace_id). Per spec, SampleTraces come
+// from Blast.Latest.SampledTraces, else fall back to Propagation's SampleTraceID.
+func (a storyBuilderAdapter) firstFailureFromSnapshot(inc IncidentSummary) (FirstFailureResult, error) {
+	p := inc.Propagation.Latest // non-nil: FirstFailureStory checks before calling
+	firstFailing, errCode := firstErrorStep(p)
+	payload := struct {
+		OriginService    string `json:"origin_service"`
+		OriginStep       string `json:"origin_step"`
+		FirstFailingStep string `json:"first_failing_step,omitempty"`
+		ErrorCode        string `json:"error_code,omitempty"`
+		SampleTraceID    string `json:"sample_trace_id,omitempty"`
+	}{
+		OriginService:    p.OriginService,
+		OriginStep:       p.OriginStep,
+		FirstFailingStep: firstFailing,
+		ErrorCode:        errCode,
+		SampleTraceID:    p.SampleTraceID,
+	}
+	raw, err := json.Marshal(&payload)
+	if err != nil {
+		return FirstFailureResult{}, fmt.Errorf("triage: project first failure: %w", err)
+	}
+	var samples []pkgtriage.TraceSample
+	if inc.Blast != nil && inc.Blast.Latest != nil {
+		for _, traceID := range inc.Blast.Latest.SampledTraces {
+			summary := ""
+			if traceID == p.SampleTraceID { // only the propagation sample gets a path summary
+				summary = storySummaryFromPath(p)
+			}
+			samples = append(samples, pkgtriage.TraceSample{TraceID: traceID, Summary: summary})
+		}
+	}
+	if len(samples) == 0 && p.SampleTraceID != "" { // no blast samples: fall back to the propagation sample
+		samples = []pkgtriage.TraceSample{{TraceID: p.SampleTraceID, Summary: storySummaryFromPath(p)}}
+	}
+	return FirstFailureResult{Payload: raw, SampleTraces: samples}, nil
+}
+
+// firstFailureFromReader is the pre-v1.0 computation path. Called when the
+// incident has no Propagation.Latest snapshot.
+func (a storyBuilderAdapter) firstFailureFromReader(ctx context.Context, inc IncidentSummary) (FirstFailureResult, error) {
 	upstream, err := a.r.Get(ctx, inc.ID)
 	if err != nil {
 		if errors.Is(err, incidents.ErrNotFound) {
@@ -182,6 +275,27 @@ func (a storyBuilderAdapter) FirstFailureStory(ctx context.Context, inc Incident
 	}, nil
 }
 
+// firstErrorStep returns the step name and error code of the first entry in
+// p.Path whose Status is "error", or "","" when p is nil or no step failed.
+func firstErrorStep(p *incidents.PropagationEvidence) (step, code string) {
+	if p == nil {
+		return "", ""
+	}
+	for _, s := range p.Path {
+		if s.Status == "error" {
+			return s.Step, s.ErrorCode
+		}
+	}
+	return "", ""
+}
+
+func storySummaryFromPath(p *incidents.PropagationEvidence) string {
+	if p == nil || len(p.Path) == 0 {
+		return "" // nothing to summarize without a path
+	}
+	return fmt.Sprintf("%s/%s → %s", p.OriginService, p.OriginStep, p.Path[len(p.Path)-1].Step) // origin → last step on the path
+}
+
 func storySummary(s apiv2.StoryResponse, inc IncidentSummary) string {
 	svc := s.Service
 	step := ""
@@ -321,9 +435,9 @@ type nextChecksAdapter struct{}
 
 // NewNextChecksAdapter returns a passthrough that converts the incident's
 // own NextChecks list (already populated by the incidents engine via
-// internal/incidents.NextChecks(cause, confidence)) into the typed
-// NextCheckSpec entries the report consumes. Stable IDs (check_)
-// keep the report deterministic across runs.
+// internal/incidents.NextChecks) into the typed NextCheckSpec entries the
+// report consumes. Stable IDs (check_) keep the report deterministic
+// across runs.
 func NewNextChecksAdapter() NextChecksProvider {
 	return nextChecksAdapter{}
 }
diff --git a/internal/triage/determinism_test.go b/internal/triage/determinism_test.go
new file mode 100644
index 0000000..5f3ab36
--- /dev/null
+++ b/internal/triage/determinism_test.go
@@ -0,0 +1,230 @@
+package triage
+
+import (
+	"context"
+	"reflect"
+	"testing"
+	"time"
+
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// buildFixtureReport builds the report for "inc_abc" from the in-memory rich*
+// fixtures with a fixed clock and a "15m" window; no server, no live incident.
+func buildFixtureReport(t *testing.T) *pkgtriage.Report {
+	t.Helper()
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now()) // NOTE(review): parse error is discarded; assumes "15m" always parses
+	r, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+	return r
+}
+
+// Invariant (b1): provenance fields must not enter the canonical hash.
+func TestHashExcludesProvenanceFields(t *testing.T) {
+	r := buildFixtureReport(t)
+	base, err := r.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash: %v", err)
+	}
+	r.GeneratedAt = "2099-01-01T00:00:00Z"
+	r.PlanRunID = "plan_zzz"
+	r.ReportHash = "sha256:deadbeef"
+	got, err := r.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash: %v", err)
+	}
+	if got != base {
+		t.Fatalf("provenance fields must not affect hash: base=%q got=%q", base, got)
+	}
+}
+
+// Invariant (b2): the evidence projections that actually reach the report
+// (runtime and alert) must drop CapturedAt, so a fresh capture can't churn
+// report_hash. This test covers the runtime projection. NOTE(review): the
+// snapshot-backed blast and first-failure adapters also read the incident's
+// Latest snapshots; they copy counts, path fields and trace IDs but not
+// CapturedAt, and have no dedicated assertion here.
+func TestRuntimeProjectionDropsCapturedAt(t *testing.T) {
+	mk := func(capturedAt time.Time) *incidents.RuntimeSnapshot {
+		ev := incidents.RuntimeEvidence{
+			Subtype:    "oom_killed",
+			Service:    "checkout",
+			Source:     "k8s",
+			Severity:   "critical",
+			Reason:     "OOMKilled",
+			SignalID:   "sig_1",
+			OccurredAt: time.Date(2026, 5, 6, 11, 0, 0, 0, time.UTC),
+			CapturedAt: capturedAt,
+		}
+		return &incidents.RuntimeSnapshot{Matches: []incidents.RuntimeEvidence{ev}, Latest: &ev}
+	}
+	a := runtimeFromSnapshot(mk(time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)))
+	b := runtimeFromSnapshot(mk(time.Date(2026, 5, 6, 18, 30, 0, 0, time.UTC))) // same evidence, later CapturedAt only
+	if !reflect.DeepEqual(a, b) {
+		t.Fatalf("CapturedAt must not affect runtime projection: %+v vs %+v", a, b)
+	}
+}
+
+// Invariant (b2, alert): the alert projection must also drop CapturedAt.
+func TestAlertProjectionDropsCapturedAt(t *testing.T) {
+	mk := func(capturedAt time.Time) *incidents.AlertSnapshot {
+		return &incidents.AlertSnapshot{Latest: &incidents.AlertEvidence{
+			CapturedAt:    capturedAt,
+			CaptureStatus: incidents.CaptureOK,
+			Matches: []incidents.MatchedAlert{{
+				SignalID: "sig_1", Source: "alertmanager", Severity: "critical", Reason: "service down",
+			}},
+		}}
+	}
+	a, aok := alertsFromSnapshot(mk(time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)))
+	b, bok := alertsFromSnapshot(mk(time.Date(2026, 5, 6, 18, 30, 0, 0, time.UTC)))
+	if !aok || !bok {
+		t.Fatalf("alertsFromSnapshot should report fromSnapshot=true (a=%v b=%v)", aok, bok)
+	}
+	if !reflect.DeepEqual(a, b) {
+		t.Fatalf("CapturedAt must not affect alert projection: %+v vs %+v", a, b)
+	}
+}
+
+// Invariant (c): material fields must change the canonical hash.
+func TestHashIncludesMaterialFields(t *testing.T) {
+	base, err := buildFixtureReport(t).CanonicalHash()
+	if err != nil {
+		t.Fatalf("base hash: %v", err)
+	}
+	cases := map[string]func(*pkgtriage.Report){
+		"blast_requests": func(x *pkgtriage.Report) { x.BlastSnapshot.Requests++ },
+		"top_error_family": func(x *pkgtriage.Report) {
+			x.BlastSnapshot.TopErrorFamilies = append(x.BlastSnapshot.TopErrorFamilies,
+				pkgtriage.ErrorFamily{Service: "svc", Step: "step", ErrorCode: "X", Count: 1})
+		},
+		"next_check": func(x *pkgtriage.Report) {
+			x.NextChecks = append(x.NextChecks, pkgtriage.NextCheck{ID: "n_new", Prompt: "new"})
+		},
+	}
+	for name, mutate := range cases {
+		t.Run(name, func(t *testing.T) {
+			r := buildFixtureReport(t)
+			mutate(r)
+			got, err := r.CanonicalHash()
+			if err != nil {
+				t.Fatalf("hash: %v", err)
+			}
+			if got == base {
+				t.Fatalf("mutating %s must change the hash but did not", name)
+			}
+		})
+	}
+}
+
+// tickBlast simulates the recent-index window sliding between engine ticks:
+// every call returns different counts.
+type tickBlast struct{ calls *int }
+
+func (b tickBlast) BlastSnapshot(_ context.Context, _ IncidentSummary, _ BuildOptions) (BlastSnapshotResult, error) {
+	*b.calls++
+	return BlastSnapshotResult{
+		Requests: 10 + *b.calls, Users: 5 + *b.calls, Services: 4,
+		TopErrorFamilies: []pkgtriage.ErrorFamily{
+			{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 10 + *b.calls},
+		},
+	}, nil
+}
+
+// extraSignals returns the rich signal set plus one newly attached signal.
+type extraSignals struct{}
+
+func (extraSignals) SignalsFor(_ context.Context, _ IncidentSummary, _ BuildOptions) ([]SignalEvidence, error) {
+	return []SignalEvidence{
+		{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}},
+		{ID: "sig_2", Type: "dependency", EvidenceIDs: []string{"e2"}},
+	}, nil
+}
+
+// Invariant (ADR 0002): evidence_fingerprint is stable across ticks while the
+// evidence set is unchanged, even though report_hash legitimately drifts with
+// the window; attaching evidence changes the fingerprint.
+func TestEvidenceFingerprintStableAcrossTicks(t *testing.T) {
+	calls := 0
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: tickBlast{calls: &calls}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		Now: func() time.Time { return now },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	build := func() *pkgtriage.Report {
+		opts, _ := ParseBuildOptions("15m", false, deps.Now())
+		r, err := eng.Build(context.Background(), "inc_abc", opts)
+		if err != nil {
+			t.Fatalf("build: %v", err)
+		}
+		return r
+	}
+
+	first := build()
+	now = now.Add(30 * time.Second) // next tick: window slid, counts changed
+	second := build()
+
+	if first.ReportHash == second.ReportHash {
+		t.Fatal("fixture error: blast drift should have changed report_hash between ticks")
+	}
+	if first.EvidenceFingerprint == "" {
+		t.Fatal("engine must populate evidence_fingerprint")
+	}
+	if first.EvidenceFingerprint != second.EvidenceFingerprint {
+		t.Fatalf("fingerprint must survive tick drift: %s vs %s",
+			first.EvidenceFingerprint, second.EvidenceFingerprint)
+	}
+
+	// A newly attached signal changes the fingerprint.
+	deps.Signals = extraSignals{}
+	eng2, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+	third, err := eng2.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+	if third.EvidenceFingerprint == first.EvidenceFingerprint {
+		t.Fatal("attaching a signal must change the evidence fingerprint")
+	}
+}
+
+// Invariant (d): canonical hash is repeatable — identical across repeated calls
+// on the same report. Report has no map fields, so there is no map-key ordering
+// to canonicalize; this guards marshal determinism, not a deeper canonical-key
+// normalization.
+func TestCanonicalHashIsRepeatable(t *testing.T) {
+	r := buildFixtureReport(t)
+	first, err := r.CanonicalHash()
+	if err != nil {
+		t.Fatalf("hash: %v", err)
+	}
+	for i := 0; i < 100; i++ {
+		got, err := r.CanonicalHash()
+		if err != nil {
+			t.Fatalf("hash iter %d: %v", i, err)
+		}
+		if got != first {
+			t.Fatalf("canonical hash unstable at iter %d: %q vs %q", i, got, first)
+		}
+	}
+}
diff --git a/internal/triage/engine.go b/internal/triage/engine.go
index 3df3ad2..9479745 100644
--- a/internal/triage/engine.go
+++ b/internal/triage/engine.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"time"
 
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
 	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
 )
 
@@ -15,16 +16,20 @@ var ErrUnknownIncident = errors.New("triage: unknown incident")
 // IncidentSummary is the minimal incident shape this package needs.
 // Adapter types in the wiring layer convert from internal/incidents.Incident.
 type IncidentSummary struct {
-	ID         string
-	Window     string
-	Env        string
-	StartedAt  time.Time
-	UpdatedAt  time.Time
-	Service    string
-	Step       string
-	ErrorCode  string
-	Confidence pkgtriage.Confidence
-	NextChecks []string
+	ID          string
+	Window      string
+	Env         string
+	StartedAt   time.Time
+	UpdatedAt   time.Time
+	Service     string
+	Step        string
+	ErrorCode   string
+	Confidence  pkgtriage.Confidence
+	NextChecks  []string
+	Propagation *incidents.PropagationSnapshot
+	Blast       *incidents.BlastSnapshot
+	Alerts      *incidents.AlertSnapshot
+	Runtime     *incidents.RuntimeSnapshot
 }
 
 type BlastSnapshotResult struct {
@@ -112,8 +117,8 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
 	if err != nil {
 		return nil, fmt.Errorf("triage: signals: %w", err)
 	}
-	var alerts []pkgtriage.AlertRef
-	if e.deps.Alerts != nil {
+	alerts, fromSnapshot := alertsFromSnapshot(inc.Alerts)
+	if !fromSnapshot && e.deps.Alerts != nil {
 		alerts, err = e.deps.Alerts.AlertsFor(ctx, inc, opts)
 		if err != nil {
 			return nil, fmt.Errorf("triage: alerts: %w", err)
@@ -135,6 +140,7 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
 		SampleTraces: story.SampleTraces,
 		Signals:      sigs,
 		Alerts:       alerts,
+		Runtime:      runtimeFromSnapshot(inc.Runtime),
 		NextChecks:   checks,
 		Confidence:   inc.Confidence,
 		GeneratedAt:  e.deps.Now().UTC().Format(time.RFC3339Nano),
@@ -145,8 +151,59 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
 		return nil, fmt.Errorf("triage: hash: %w", err)
 	}
 	r.ReportHash = hash
+	r.EvidenceFingerprint = r.CanonicalEvidenceFingerprint()
 	if err := r.Validate(); err != nil {
 		return nil, fmt.Errorf("triage: produced invalid report: %w", err)
 	}
 	return r, nil
 }
+
+func alertsFromSnapshot(s *incidents.AlertSnapshot) ([]pkgtriage.AlertRef, bool) {
+	if s == nil || s.Latest == nil {
+		return nil, false // no snapshot: Build falls back to deps.Alerts when configured
+	}
+	out := make([]pkgtriage.AlertRef, 0, len(s.Latest.Matches))
+	for _, m := range s.Latest.Matches {
+		evidenceIDs := append([]string(nil), m.EvidenceIDs...) // copy so the report does not alias the snapshot slice
+		if len(evidenceIDs) == 0 && m.SignalID != "" { // no explicit evidence: default to the signal itself
+			evidenceIDs = []string{m.SignalID}
+		}
+		out = append(out, pkgtriage.AlertRef{
+			SignalID:    m.SignalID,
+			AlertID:     m.AlertID,
+			Source:      m.Source,
+			Severity:    m.Severity,
+			Reason:      m.Reason,
+			ProviderURL: m.ProviderURL,
+			EvidenceIDs: evidenceIDs,
+		})
+	}
+	return out, true // true even when Matches is empty: a present snapshot suppresses the provider fallback
+}
+
+// runtimeFromSnapshot projects all matched runtime evidence (infra AND app)
+// into report RuntimeRefs. Uses RuntimeSnapshot.Matches (not Opening/Latest) so
+// both infra and app rows survive into the report. OccurredAt is stable, so the
+// rows participate in report_hash; CapturedAt is deliberately excluded.
+func runtimeFromSnapshot(s *incidents.RuntimeSnapshot) []pkgtriage.RuntimeRef {
+	if s == nil || len(s.Matches) == 0 {
+		return nil // nil (not an empty slice) when there is nothing to project
+	}
+	out := make([]pkgtriage.RuntimeRef, 0, len(s.Matches))
+	for _, m := range s.Matches {
+		occurred := "" // a zero OccurredAt stays an empty string in the report
+		if !m.OccurredAt.IsZero() {
+			occurred = m.OccurredAt.UTC().Format(time.RFC3339Nano)
+		}
+		out = append(out, pkgtriage.RuntimeRef{
+			SignalID:   m.SignalID,
+			Subtype:    m.Subtype,
+			Service:    m.Service,
+			Source:     m.Source,
+			Severity:   m.Severity,
+			Reason:     m.Reason,
+			OccurredAt: occurred,
+		})
+	}
+	return out
+}
diff --git a/internal/triage/engine_test.go b/internal/triage/engine_test.go
index b316830..5030c19 100644
--- a/internal/triage/engine_test.go
+++ b/internal/triage/engine_test.go
@@ -7,6 +7,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/sssmaran/WaylogCLI/internal/incidents"
+	apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
 	pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
 )
 
@@ -315,3 +317,250 @@ func itoa(i int) string {
 	// Tests only exercise small indices.
 	return "n"
 }
+
+// --- snapshot-projection tests (Task 13) ---
+
+// fixedLookup returns a fixed IncidentSummary regardless of ID.
+type fixedLookup struct{ inc IncidentSummary }
+
+func (s fixedLookup) GetIncident(_ context.Context, _ string) (IncidentSummary, error) {
+	return s.inc, nil
+}
+
+// noOpBlastReader satisfies BlastReader with zero results. Both production
+// methods are exercised by blastQueryAdapter: when Blast.Latest is set, the
+// adapter still calls Errors() to compute TopErrorFamilies.
+type noOpBlastReader struct{}
+
+func (noOpBlastReader) BlastRadius(_ incidents.SearchFilter, _ apiv2.BlastKey) apiv2.BlastRadiusResponse {
+	return apiv2.BlastRadiusResponse{}
+}
+func (noOpBlastReader) Errors(_ incidents.SearchFilter, _ int) incidents.ErrorsResult {
+	return incidents.ErrorsResult{}
+}
+
+// noOpIncidentReader satisfies IncidentReader with ErrNotFound. The story
+// adapter's reader-driven path is gated behind Propagation.Latest == nil; when
+// the projection runs, this is never called.
+type noOpIncidentReader struct{}
+
+func (noOpIncidentReader) Get(_ context.Context, _ string) (incidents.Incident, error) {
+	return incidents.Incident{}, incidents.ErrNotFound
+}
+
+type stubAlertsResult struct{ out []pkgtriage.AlertRef }
+
+func (s stubAlertsResult) AlertsFor(_ context.Context, _ IncidentSummary, _ BuildOptions) ([]pkgtriage.AlertRef, error) {
+	return s.out, nil
+}
+
+func makeFixedSummary(t *testing.T, ts, firstSeen time.Time) IncidentSummary {
+	t.Helper()
+	users := 47
+	return IncidentSummary{
+		ID:         "inc_golden",
+		Window:     "15m",
+		Env:        "demo",
+		StartedAt:  ts,
+		UpdatedAt:  ts,
+		Service:    "payment-service",
+		Step:       "charge",
+		ErrorCode:  "DB_TIMEOUT",
+		Confidence: pkgtriage.ConfidenceMedium,
+		NextChecks: []string{"Verify payment-service health"},
+		Propagation: &incidents.PropagationSnapshot{
+			Latest: &incidents.PropagationEvidence{
+				OriginService: "payment-service",
+				OriginStep:    "charge",
+				Path: []incidents.PropagationStep{
+					{Service: "payment-service", Step: "validate", Status: "ok", StartMS: 0, DurationMS: 5},
+					{Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT", StartMS: 5, DurationMS: 50},
+				},
+				SampleTraceID: "7a3fb2",
+				FirstSeenAt:   &firstSeen,
+				CapturedAt:    ts,
+				CaptureStatus: incidents.CaptureOK,
+			},
+		},
+		Blast: &incidents.BlastSnapshot{
+			Latest: &incidents.BlastEvidence{
+				AffectedRequests: 184,
+				AffectedUsers:    &users,
+				AffectedServices: 3,
+				TopServices:      []string{"checkout", "api-gateway", "mobile-api"},
+				SampledTraces:    []string{"7a3fb2", "1c4d5e", "9f8a7b"},
+				CapturedAt:       ts,
+				CaptureStatus:    incidents.CaptureOK,
+			},
+		},
+	}
+}
+
+func newSnapshotProjectionEngine(t *testing.T, inc IncidentSummary, now time.Time) *Engine {
+	t.Helper()
+	eng, err := NewEngine(Deps{
+		Incidents:  fixedLookup{inc: inc},
+		Blast:      NewBlastQueryAdapter(noOpBlastReader{}),
+		Story:      NewStoryBuilderAdapter(noOpIncidentReader{}, func(_ string) (apiv2.StoryResponse, bool) { return apiv2.StoryResponse{}, false }),
+		Signals:    stubSignalsResult{},
+		Alerts:     stubAlertsResult{},
+		NextChecks: stubNextChecksResult{},
+		Now:        func() time.Time { return now },
+	})
+	if err != nil {
+		t.Fatalf("NewEngine: %v", err)
+	}
+	return eng
+}
+
+func TestEngine_Build_GoldenHash_FromIncidentSnapshots(t *testing.T) {
+	ts := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
+	firstSeen := ts.Add(-30 * time.Second)
+	inc := makeFixedSummary(t, ts, firstSeen)
+	eng := newSnapshotProjectionEngine(t, inc, ts)
+
+	rpt, err := eng.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+
+	const want = "sha256:857bb8f5682044d0cd80e8d26d79cf946a3ff9f355b85e122e78c77a0d7af572"
+	if rpt.ReportHash != want {
+		t.Fatalf("ReportHash = %s\nwant       = %s\n\n(If this is the first run, copy the actual hash above into the const.)", rpt.ReportHash, want)
+	}
+}
+
+func TestEngine_Build_GoldenHash_OpeningNotInHashSurface(t *testing.T) {
+	// Same Latest, different Opening — hash must not change. Opening is not
+	// projected into the Report; only Latest is.
+	ts := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
+	firstSeen := ts.Add(-30 * time.Second)
+	incA := makeFixedSummary(t, ts, firstSeen)
+	openUsers := 1
+	incB := makeFixedSummary(t, ts, firstSeen)
+	incB.Blast.Opening = &incidents.BlastEvidence{
+		AffectedRequests: 1,
+		AffectedUsers:    &openUsers,
+		AffectedServices: 1,
+		TopServices:      []string{"early"},
+		SampledTraces:    []string{"early_trace"},
+		CapturedAt:       ts.Add(-time.Minute),
+		CaptureStatus:    incidents.CaptureOK,
+	}
+
+	engA := newSnapshotProjectionEngine(t, incA, ts)
+	rptA, err := engA.Build(context.Background(), incA.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build A: %v", err)
+	}
+
+	engB := newSnapshotProjectionEngine(t, incB, ts)
+	rptB, err := engB.Build(context.Background(), incB.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build B: %v", err)
+	}
+	if rptA.ReportHash != rptB.ReportHash {
+		t.Fatalf("ReportHash differs but only Opening did:\nA: %s\nB: %s", rptA.ReportHash, rptB.ReportHash)
+	}
+}
+
+func TestEngine_Build_ProjectionIsByteStable(t *testing.T) {
+	ts := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
+	firstSeen := ts.Add(-30 * time.Second)
+	inc := makeFixedSummary(t, ts, firstSeen)
+	eng := newSnapshotProjectionEngine(t, inc, ts)
+
+	rpt1, err := eng.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build 1: %v", err)
+	}
+	rpt2, err := eng.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build 2: %v", err)
+	}
+	j1, _ := json.Marshal(rpt1)
+	j2, _ := json.Marshal(rpt2)
+	if string(j1) != string(j2) {
+		t.Fatalf("Report projection drifted between runs:\nfirst:  %s\nsecond: %s", j1, j2)
+	}
+	if rpt1.ReportHash != rpt2.ReportHash {
+		t.Fatalf("ReportHash drifted: %s vs %s", rpt1.ReportHash, rpt2.ReportHash)
+	}
+}
+
+func TestEngine_Build_UsesAlertSnapshotWhenPresent(t *testing.T) {
+	ts := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC)
+	firstSeen := ts.Add(-30 * time.Second)
+	inc := makeFixedSummary(t, ts, firstSeen)
+	inc.Alerts = &incidents.AlertSnapshot{
+		Latest: &incidents.AlertEvidence{
+			Matches: []incidents.MatchedAlert{{
+				SignalID:    "sig_snapshot",
+				AlertID:     "CheckoutPaymentFailure",
+				Source:      "alertmanager",
+				Severity:    "critical",
+				Reason:      "PMT_502 spike",
+				ProviderURL: "https://alerts.example/inc",
+				EvidenceIDs: []string{"sig_snapshot"},
+				MatchedAt:   ts,
+				Strategy:    "family",
+			}},
+			CapturedAt:    ts,
+			CaptureStatus: incidents.CaptureOK,
+		},
+	}
+	eng := newSnapshotProjectionEngine(t, inc, ts)
+
+	rpt, err := eng.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(rpt.Alerts) != 1 {
+		t.Fatalf("Alerts len = %d, want 1: %+v", len(rpt.Alerts), rpt.Alerts)
+	}
+	got := rpt.Alerts[0]
+	if got.SignalID != "sig_snapshot" || got.AlertID != "CheckoutPaymentFailure" || got.Source != "alertmanager" {
+		t.Fatalf("alert ref = %+v", got)
+	}
+}
+
+func TestEngine_Build_IncludesRuntimeFromSnapshot(t *testing.T) {
+	ts := time.Date(2026, 5, 24, 12, 0, 0, 0, time.UTC)
+	firstSeen := ts.Add(-30 * time.Second)
+	inc := makeFixedSummary(t, ts, firstSeen)
+	inc.Runtime = &incidents.RuntimeSnapshot{
+		Matches: []incidents.RuntimeEvidence{
+			{SignalID: "sig_oom", Subtype: "oom_killed", Service: "checkout", Source: "k8s-demo",
+				Severity: "critical", Reason: "OOMKilled", OccurredAt: ts.Add(-2 * time.Minute), CapturedAt: ts},
+			{SignalID: "sig_panic", Subtype: "panic", Service: "checkout", Source: "go-sdk",
+				Severity: "warning", Reason: "runtime panic", OccurredAt: ts.Add(-time.Minute), CapturedAt: ts},
+		},
+	}
+	eng := newSnapshotProjectionEngine(t, inc, ts)
+
+	rpt, err := eng.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if len(rpt.Runtime) != 2 {
+		t.Fatalf("Runtime len = %d, want 2: %+v", len(rpt.Runtime), rpt.Runtime)
+	}
+	if rpt.Runtime[0].Subtype != "oom_killed" || rpt.Runtime[1].Subtype != "panic" {
+		t.Fatalf("runtime subtypes wrong: %+v", rpt.Runtime)
+	}
+	// CapturedAt must not leak into the report (and thus not into the hash).
+	rawA, _ := json.Marshal(rpt)
+	if strings.Contains(string(rawA), "captured_at") {
+		t.Fatalf("report leaked captured_at: %s", rawA)
+	}
+	// Changing CapturedAt only must not change report_hash.
+	inc.Runtime.Matches[0].CapturedAt = ts.Add(time.Hour)
+	eng2 := newSnapshotProjectionEngine(t, inc, ts)
+	rpt2, err := eng2.Build(context.Background(), inc.ID, BuildOptions{Window: 15 * time.Minute})
+	if err != nil {
+		t.Fatalf("Build 2: %v", err)
+	}
+	if rpt.ReportHash != rpt2.ReportHash {
+		t.Fatalf("report_hash changed when only CapturedAt differed: %s vs %s", rpt.ReportHash, rpt2.ReportHash)
+	}
+}
diff --git a/internal/tui/api.go b/internal/tui/api.go
deleted file mode 100644
index ce0ee6b..0000000
--- a/internal/tui/api.go
+++ /dev/null
@@ -1,219 +0,0 @@
-package tui
-
-import (
-	"bufio"
-	"context"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"net/http"
-	"net/url"
-	"strings"
-	"time"
-
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-// APIClient talks to the ingest server's read APIs.
-type APIClient struct {
-	BaseURL    string
-	HTTPClient *http.Client
-}
-
-func NewAPIClient(baseURL string) *APIClient {
-	return &APIClient{
-		BaseURL:    baseURL,
-		HTTPClient: &http.Client{Timeout: 5 * time.Second},
-	}
-}
-
-// Response types matching the Phase 4 JSON shapes.
-
-type OverviewResponse struct {
-	Window        string       `json:"window"`
-	TotalRequests int          `json:"total_requests"`
-	TotalFailures int          `json:"total_failures"`
-	ErrorRate     float64      `json:"error_rate"`
-	Sampled       bool         `json:"sampled"`
-	TopErrors     []ErrorCount `json:"top_errors"`
-	RecentTraces  []TraceEntry `json:"recent_traces"`
-}
-
-type ErrorCount struct {
-	Code  string `json:"code"`
-	Count int    `json:"count"`
-}
-
-type TraceEntry struct {
-	TraceID    string    `json:"trace_id"`
-	Success    bool      `json:"success"`
-	StatusCode int       `json:"status_code"`
-	LatencyMs  int64     `json:"latency_ms"`
-	EventName  string    `json:"event_name,omitempty"`
-	Timestamp  time.Time `json:"timestamp"`
-}
-
-type StoryResponse struct {
-	Story   Story        `json:"story"`
-	Context TraceContext `json:"context"`
-}
-
-type Story struct {
-	TraceID      string `json:"trace_id"`
-	Chain        []Hop  `json:"chain"`
-	Success      bool   `json:"success"`
-	FirstFailHop *Hop   `json:"first_fail_hop,omitempty"`
-	HopCount     int    `json:"hop_count"`
-}
-
-type Hop struct {
-	SpanID     string    `json:"span_id"`
-	Service    string    `json:"service"`
-	StatusCode int       `json:"status_code"`
-	LatencyMs  int64     `json:"latency_ms"`
-	Success    bool      `json:"success"`
-	ErrorCode  string    `json:"error_code,omitempty"`
-	IsRoot     bool      `json:"is_root"`
-	Timestamp  time.Time `json:"timestamp,omitempty"`
-}
-
-type TraceContext struct {
-	RequestID    string   `json:"request_id,omitempty"`
-	RequestEvent string   `json:"request_event,omitempty"`
-	ErrorCodes   []string `json:"error_codes,omitempty"`
-	UserID       string   `json:"user_id,omitempty"`
-	UserTier     string   `json:"user_tier,omitempty"`
-	UserRegion   string   `json:"user_region,omitempty"`
-	Flow         string   `json:"flow,omitempty"`
-	Flags        []string `json:"flags,omitempty"`
-}
-
-// Message types for bubbletea.
-type overviewMsg OverviewResponse
-type storyMsg StoryResponse
-type errMsg struct{ err error }
-
-func (e errMsg) Error() string { return e.err.Error() }
-
-// FetchOverview fetches the overview endpoint (includes recent traces).
-func (c *APIClient) FetchOverview(window string, limit int) tea.Cmd {
-	return func() tea.Msg {
-		endpoint := fmt.Sprintf("%s/v1/overview?window=%s&limit=%d", c.BaseURL, url.QueryEscape(window), limit)
-		resp, err := c.HTTPClient.Get(endpoint)
-		if err != nil {
-			return errMsg{err}
-		}
-		defer resp.Body.Close()
-		if resp.StatusCode != http.StatusOK {
-			return errMsg{fmt.Errorf("overview request failed: %s", resp.Status)}
-		}
-		var result OverviewResponse
-		if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
-			return errMsg{err}
-		}
-		return overviewMsg(result)
-	}
-}
-
-// StartDashboardStream opens an SSE connection to /v1/stream/dashboard and
-// returns a channel of tea.Msg values. The underlying goroutine runs until ctx
-// is canceled or the server closes the stream.
-func (c *APIClient) StartDashboardStream(ctx context.Context) (<-chan tea.Msg, error) {
-	endpoint := c.BaseURL + "/v1/stream/dashboard"
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("Accept", "text/event-stream")
-
-	streamClient := &http.Client{} // no timeout: long-lived stream
-	resp, err := streamClient.Do(req)
-	if err != nil {
-		return nil, err
-	}
-	if resp.StatusCode != http.StatusOK {
-		resp.Body.Close()
-		return nil, fmt.Errorf("stream request failed: %s", resp.Status)
-	}
-
-	ch := make(chan tea.Msg, 8)
-	go func() {
-		defer resp.Body.Close()
-		defer close(ch)
-
-		reader := bufio.NewReader(resp.Body)
-		var event string
-		var dataBuf strings.Builder
-		send := func(msg tea.Msg) bool {
-			select {
-			case ch <- msg:
-				return true
-			case <-ctx.Done():
-				return false
-			}
-		}
-
-		for {
-			line, err := reader.ReadString('\n')
-			if err != nil {
-				return
-			}
-			line = strings.TrimRight(line, "\r\n")
-			if line == "" {
-				if event == "overview" && dataBuf.Len() > 0 {
-					var ov OverviewResponse
-					if jerr := json.Unmarshal([]byte(dataBuf.String()), &ov); jerr == nil {
-						if !send(overviewMsg(ov)) {
-							return
-						}
-					}
-				}
-				event = ""
-				dataBuf.Reset()
-				continue
-			}
-			switch {
-			case strings.HasPrefix(line, "event: "):
-				event = line[len("event: "):]
-			case strings.HasPrefix(line, "data: "):
-				dataBuf.WriteString(line[len("data: "):])
-			}
-		}
-	}()
-	return ch, nil
-}
-
-// WaitForStream returns a tea.Cmd that blocks on the next message from ch.
-// When ch closes, it emits an errMsg so the caller can fall back to polling.
-func WaitForStream(ch <-chan tea.Msg) tea.Cmd {
-	return func() tea.Msg {
-		msg, ok := <-ch
-		if !ok {
-			return errMsg{errors.New("dashboard stream closed")}
-		}
-		return msg
-	}
-}
-
-// FetchStory fetches the trace story for a given trace ID.
-func (c *APIClient) FetchStory(traceID string) tea.Cmd {
-	return func() tea.Msg {
-		endpoint := fmt.Sprintf("%s/v1/traces/story?trace_id=%s", c.BaseURL, url.QueryEscape(traceID))
-		resp, err := c.HTTPClient.Get(endpoint)
-		if err != nil {
-			return errMsg{err}
-		}
-		defer resp.Body.Close()
-		if resp.StatusCode == http.StatusNotFound {
-			return errMsg{fmt.Errorf("trace %s not found", traceID)}
-		}
-		if resp.StatusCode != http.StatusOK {
-			return errMsg{fmt.Errorf("story request failed: %s", resp.Status)}
-		}
-		var result StoryResponse
-		if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
-			return errMsg{err}
-		}
-		return storyMsg(result)
-	}
-}
diff --git a/internal/tui/api_test.go b/internal/tui/api_test.go
deleted file mode 100644
index a6688ad..0000000
--- a/internal/tui/api_test.go
+++ /dev/null
@@ -1,81 +0,0 @@
-package tui
-
-import (
-	"context"
-	"fmt"
-	"net/http"
-	"net/http/httptest"
-	"testing"
-	"time"
-)
-
-func TestStartDashboardStream_ParsesOverviewEvents(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		flusher, ok := w.(http.Flusher)
-		if !ok {
-			t.Fatalf("test server cannot flush")
-		}
-		w.Header().Set("Content-Type", "text/event-stream")
-		w.WriteHeader(http.StatusOK)
-
-		// Two overview events followed by stream close.
-		fmt.Fprint(w, "id: 1\nevent: overview\ndata: {\"window\":\"5m\",\"total_requests\":10,\"total_failures\":1}\n\n")
-		flusher.Flush()
-		fmt.Fprint(w, "id: 2\nevent: overview\ndata: {\"window\":\"5m\",\"total_requests\":25,\"total_failures\":3}\n\n")
-		flusher.Flush()
-	}))
-	defer srv.Close()
-
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-	defer cancel()
-
-	client := NewAPIClient(srv.URL)
-	ch, err := client.StartDashboardStream(ctx)
-	if err != nil {
-		t.Fatalf("StartDashboardStream: %v", err)
-	}
-
-	first, ok := (<-ch).(overviewMsg)
-	if !ok {
-		t.Fatalf("first message was not overviewMsg")
-	}
-	if first.TotalRequests != 10 || first.TotalFailures != 1 {
-		t.Fatalf("first event = %+v, want 10/1", first)
-	}
-
-	second, ok := (<-ch).(overviewMsg)
-	if !ok {
-		t.Fatalf("second message was not overviewMsg")
-	}
-	if second.TotalRequests != 25 || second.TotalFailures != 3 {
-		t.Fatalf("second event = %+v, want 25/3", second)
-	}
-}
-
-func TestStartDashboardStream_ClosedStreamEmitsErrMsg(t *testing.T) {
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "text/event-stream")
-		w.WriteHeader(http.StatusOK)
-		// Close immediately.
-	}))
-	defer srv.Close()
-
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-	defer cancel()
-
-	client := NewAPIClient(srv.URL)
-	ch, err := client.StartDashboardStream(ctx)
-	if err != nil {
-		t.Fatalf("StartDashboardStream: %v", err)
-	}
-
-	// Drain any pre-close messages then expect a closed channel.
-	for range ch {
-	}
-
-	// WaitForStream must produce an errMsg when the channel is closed.
-	msg := WaitForStream(ch)()
-	if _, ok := msg.(errMsg); !ok {
-		t.Fatalf("WaitForStream returned %T, want errMsg", msg)
-	}
-}
diff --git a/internal/tui/dashboard.go b/internal/tui/dashboard.go
deleted file mode 100644
index af3fb89..0000000
--- a/internal/tui/dashboard.go
+++ /dev/null
@@ -1,254 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"strconv"
-	"strings"
-
-	"github.com/charmbracelet/bubbles/key"
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-)
-
-// DashboardModel holds the dashboard view state.
-type DashboardModel struct {
-	overview    OverviewResponse
-	selectedIdx int
-	filterText  string
-	filtering   bool
-	err         error
-}
-
-func NewDashboardModel() DashboardModel {
-	return DashboardModel{}
-}
-
-// UpdateOverview updates all stats and traces from an overview response.
-func (d *DashboardModel) UpdateOverview(resp OverviewResponse) {
-	d.overview = resp
-	d.err = nil
-	if d.selectedIdx >= len(d.overview.RecentTraces) {
-		d.selectedIdx = 0
-	}
-}
-
-// SetError sets the error state displayed in the footer.
-func (d *DashboardModel) SetError(err error) { d.err = err }
-
-// CanQuit returns true if the dashboard is in a state where quit is allowed.
-func (d *DashboardModel) CanQuit() bool { return !d.filtering }
-
-// HandleKey processes key input for the dashboard.
-// onInspect is called when the user selects a trace (returns a fetch command).
-// onRefresh is called when the user requests a data refresh.
-func (d *DashboardModel) HandleKey(msg tea.KeyMsg, onInspect func(string) tea.Cmd, onRefresh func() tea.Cmd) (tea.Cmd, *activeView) {
-	if d.filtering {
-		return d.handleFilterKey(msg), nil
-	}
-
-	switch {
-	case key.Matches(msg, Keys.Down):
-		d.moveDown()
-	case key.Matches(msg, Keys.Up):
-		d.moveUp()
-	case key.Matches(msg, Keys.Enter):
-		if id := d.selectedTraceID(); id != "" {
-			return onInspect(id), nil
-		}
-	case key.Matches(msg, Keys.Filter):
-		d.filtering = true
-		d.filterText = ""
-	case key.Matches(msg, Keys.Refresh):
-		return onRefresh(), nil
-	case key.Matches(msg, Keys.Help):
-		v := viewHelp
-		return nil, &v
-	}
-	return nil, nil
-}
-
-func (d *DashboardModel) handleFilterKey(msg tea.KeyMsg) tea.Cmd {
-	switch {
-	case key.Matches(msg, Keys.Back):
-		d.filtering = false
-		d.filterText = ""
-	case key.Matches(msg, Keys.Enter):
-		d.filtering = false
-	default:
-		if msg.Type == tea.KeyBackspace && len(d.filterText) > 0 {
-			d.filterText = d.filterText[:len(d.filterText)-1]
-		} else if msg.Type == tea.KeyRunes {
-			d.filterText += string(msg.Runes)
-		}
-	}
-	return nil
-}
-
-func (d *DashboardModel) moveDown() {
-	if n := len(d.filteredTraces()); n > 0 && d.selectedIdx < n-1 {
-		d.selectedIdx++
-	}
-}
-
-func (d *DashboardModel) moveUp() {
-	if d.selectedIdx > 0 {
-		d.selectedIdx--
-	}
-}
-
-func (d *DashboardModel) selectedTraceID() string {
-	filtered := d.filteredTraces()
-	if d.selectedIdx < len(filtered) {
-		return filtered[d.selectedIdx].TraceID
-	}
-	return ""
-}
-
-func (d *DashboardModel) filteredTraces() []TraceEntry {
-	if d.filterText == "" {
-		return d.overview.RecentTraces
-	}
-	needle := strings.ToLower(d.filterText)
-	var result []TraceEntry
-	for _, t := range d.overview.RecentTraces {
-		if strings.Contains(strings.ToLower(t.TraceID), needle) ||
-			strings.Contains(strings.ToLower(t.EventName), needle) ||
-			strings.Contains(strconv.Itoa(t.StatusCode), needle) {
-			result = append(result, t)
-		}
-	}
-	return result
-}
-
-// View renders the dashboard.
-func (d *DashboardModel) View(width, height int) string {
-	var b strings.Builder
-
-	// Header
-	b.WriteString(renderBanner(width))
-	stats := fmt.Sprintf("Requests: %d  Failures: %d  Error Rate: %.1f%%",
-		d.overview.TotalRequests, d.overview.TotalFailures, d.overview.ErrorRate)
-	if d.overview.Sampled {
-		stats += "  (sampled)"
-	}
-	b.WriteString(liveIndicator + "  " + statusBarStyle.Render(stats))
-	b.WriteString("\n" + separator(width) + "\n")
-
-	// Two-column layout
-	leftWidth := width/2 - 2
-	rightWidth := width - leftWidth - 3
-	b.WriteString(renderColumns(
-		d.renderTraces(leftWidth, height-9),
-		d.renderTopErrors(rightWidth, height-9),
-		leftWidth,
-	))
-	b.WriteString(separator(width) + "\n")
-
-	// Footer
-	if d.err != nil {
-		b.WriteString(failStyle.Render(fmt.Sprintf("Error: %v", d.err)))
-	} else if d.filtering {
-		b.WriteString(labelStyle.Render("filter: ") + d.filterText + "█")
-	} else {
-		b.WriteString(helpBarStyle.Render("j/k: navigate  enter: inspect  /: filter  r: refresh  q: quit  ?: help"))
-	}
-
-	return b.String()
-}
-
-func (d *DashboardModel) renderTraces(width, maxRows int) string {
-	var b strings.Builder
-	const (
-		traceColWidth   = 8
-		statusColWidth  = 6
-		codeColWidth    = 4
-		latencyColWidth = 7
-		nameColWidth    = 20
-	)
-
-	col := func(w int, s string) string {
-		return lipgloss.NewStyle().Width(w).Render(s)
-	}
-
-	b.WriteString(labelStyle.Render("Recent Traces") + "\n")
-	header := "  " +
-		col(traceColWidth, "ID") + "  " +
-		col(statusColWidth, "STATUS") + "  " +
-		col(codeColWidth, "CODE") + "  " +
-		col(latencyColWidth, "LATENCY") + "  " +
-		col(nameColWidth, "NAME")
-	b.WriteString(statusBarStyle.Render(header) + "\n")
-
-	filtered := d.filteredTraces()
-	if len(filtered) == 0 {
-		b.WriteString(statusBarStyle.Render("  No traces"))
-		return b.String()
-	}
-
-	for i, t := range filtered {
-		if i >= maxRows-1 {
-			break
-		}
-
-		traceShort := t.TraceID
-		if len(traceShort) > 8 {
-			traceShort = traceShort[:8]
-		}
-
-		status := successStyle.Render("OK")
-		if !t.Success {
-			status = failStyle.Render("FAIL")
-		}
-
-		name := t.EventName
-		if len(name) > nameColWidth {
-			name = name[:nameColWidth]
-		}
-
-		line := "  " +
-			col(traceColWidth, traceShort) + "  " +
-			col(statusColWidth, status) + "  " +
-			col(codeColWidth, StatusColor(t.StatusCode).Render(fmt.Sprintf("%d", t.StatusCode))) + "  " +
-			col(latencyColWidth, fmt.Sprintf("%dms", t.LatencyMs)) + "  " +
-			col(nameColWidth, name)
-
-		if i == d.selectedIdx {
-			line = selectedRowStyle.Render("▸" + line[1:])
-		}
-
-		b.WriteString(line + "\n")
-	}
-
-	return b.String()
-}
-
-func (d *DashboardModel) renderTopErrors(width, maxRows int) string {
-	var b strings.Builder
-	b.WriteString(labelStyle.Render("Top Errors") + "\n")
-
-	if len(d.overview.TopErrors) == 0 {
-		b.WriteString(statusBarStyle.Render("No errors"))
-		return b.String()
-	}
-
-	maxCount := 1
-	for _, e := range d.overview.TopErrors {
-		if e.Count > maxCount {
-			maxCount = e.Count
-		}
-	}
-
-	for i, e := range d.overview.TopErrors {
-		if i >= maxRows {
-			break
-		}
-		barWidth := (e.Count * 10) / maxCount
-		if barWidth < 1 && e.Count > 0 {
-			barWidth = 1
-		}
-		b.WriteString(fmt.Sprintf("%-12s  %s  %d\n",
-			e.Code, errorBarStyle.Render(strings.Repeat("█", barWidth)), e.Count))
-	}
-
-	return b.String()
-}
diff --git a/internal/tui/help.go b/internal/tui/help.go
deleted file mode 100644
index d35106e..0000000
--- a/internal/tui/help.go
+++ /dev/null
@@ -1,32 +0,0 @@
-package tui
-
-import (
-	"strings"
-
-	"github.com/charmbracelet/lipgloss"
-)
-
-func HelpView(width, height int) string {
-	var lines []string
-	lines = append(lines, lipgloss.NewStyle().Bold(true).Foreground(colorCyan).Render("Keybindings"))
-	lines = append(lines, "")
-	lines = append(lines, "  j / ↓        Move down")
-	lines = append(lines, "  k / ↑        Move up")
-	lines = append(lines, "  enter        Inspect trace")
-	lines = append(lines, "  esc          Back / clear filter")
-	lines = append(lines, "  /            Filter traces")
-	lines = append(lines, "  r            Refresh")
-	lines = append(lines, "  ?            Toggle help")
-	lines = append(lines, "  q            Quit")
-	lines = append(lines, "")
-	lines = append(lines, lipgloss.NewStyle().Foreground(colorDim).Render("Press any key to close"))
-
-	content := strings.Join(lines, "\n")
-	boxed := lipgloss.NewStyle().
-		Border(lipgloss.RoundedBorder()).
-		BorderForeground(colorCyan).
-		Padding(1, 2).
-		Render(content)
-
-	return lipgloss.Place(width, height, lipgloss.Center, lipgloss.Center, boxed)
-}
diff --git a/internal/tui/keymap.go b/internal/tui/keymap.go
deleted file mode 100644
index 6974760..0000000
--- a/internal/tui/keymap.go
+++ /dev/null
@@ -1,50 +0,0 @@
-package tui
-
-import "github.com/charmbracelet/bubbles/key"
-
-// KeyMap defines the key bindings for the TUI.
-type KeyMap struct {
-	Up      key.Binding
-	Down    key.Binding
-	Enter   key.Binding
-	Back    key.Binding
-	Filter  key.Binding
-	Refresh key.Binding
-	Help    key.Binding
-	Quit    key.Binding
-}
-
-var Keys = KeyMap{
-	Up: key.NewBinding(
-		key.WithKeys("k", "up"),
-		key.WithHelp("k/↑", "up"),
-	),
-	Down: key.NewBinding(
-		key.WithKeys("j", "down"),
-		key.WithHelp("j/↓", "down"),
-	),
-	Enter: key.NewBinding(
-		key.WithKeys("enter"),
-		key.WithHelp("enter", "inspect"),
-	),
-	Back: key.NewBinding(
-		key.WithKeys("esc"),
-		key.WithHelp("esc", "back"),
-	),
-	Filter: key.NewBinding(
-		key.WithKeys("/"),
-		key.WithHelp("/", "filter"),
-	),
-	Refresh: key.NewBinding(
-		key.WithKeys("r"),
-		key.WithHelp("r", "refresh"),
-	),
-	Help: key.NewBinding(
-		key.WithKeys("?"),
-		key.WithHelp("?", "help"),
-	),
-	Quit: key.NewBinding(
-		key.WithKeys("q", "ctrl+c"),
-		key.WithHelp("q", "quit"),
-	),
-}
diff --git a/internal/tui/model.go b/internal/tui/model.go
deleted file mode 100644
index 6408a0c..0000000
--- a/internal/tui/model.go
+++ /dev/null
@@ -1,126 +0,0 @@
-package tui
-
-import (
-	"time"
-
-	"github.com/charmbracelet/bubbles/key"
-	tea "github.com/charmbracelet/bubbletea"
-)
-
-type activeView int
-
-const (
-	viewDashboard activeView = iota
-	viewStory
-	viewHelp
-)
-
-type tickMsg time.Time
-
-// Model is the root bubbletea model. It dispatches messages to sub-models
-// and handles view switching.
-type Model struct {
-	activeView    activeView
-	prevView      activeView
-	width, height int
-	dashboard     DashboardModel
-	story         StoryModel
-	api           *APIClient
-	pollInterval  time.Duration
-	stream        <-chan tea.Msg // non-nil in dev mode
-}
-
-func NewModel(client *APIClient, interval time.Duration) Model {
-	return Model{
-		activeView:   viewDashboard,
-		dashboard:    NewDashboardModel(),
-		story:        NewStoryModel(),
-		api:          client,
-		pollInterval: interval,
-	}
-}
-
-// WithStream enables dev mode: the model reads live overview events from ch
-// instead of polling /v1/overview on a fixed interval.
-func (m Model) WithStream(ch <-chan tea.Msg) Model {
-	m.stream = ch
-	return m
-}
-
-func (m Model) Init() tea.Cmd {
-	if m.stream != nil {
-		return tea.Batch(m.api.FetchOverview("5m", 20), WaitForStream(m.stream))
-	}
-	return tea.Batch(m.api.FetchOverview("5m", 20), m.tick())
-}
-
-func (m Model) tick() tea.Cmd {
-	return tea.Tick(m.pollInterval, func(t time.Time) tea.Msg { return tickMsg(t) })
-}
-
-func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	switch msg := msg.(type) {
-	case tea.WindowSizeMsg:
-		m.width, m.height = msg.Width, msg.Height
-	case tea.KeyMsg:
-		return m.handleKey(msg)
-	case tickMsg:
-		return m, tea.Batch(m.api.FetchOverview("5m", 20), m.tick())
-	case overviewMsg:
-		m.dashboard.UpdateOverview(OverviewResponse(msg))
-		if m.stream != nil {
-			return m, WaitForStream(m.stream)
-		}
-	case storyMsg:
-		m.story.UpdateStory(StoryResponse(msg))
-		m.activeView = viewStory
-	case errMsg:
-		m.dashboard.SetError(msg.err)
-		if m.stream != nil {
-			// Stream died — fall back to polling so the TUI keeps working.
-			m.stream = nil
-			return m, m.tick()
-		}
-	}
-	return m, nil
-}
-
-func (m Model) handleKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
-	// Quit from dashboard (when not filtering)
-	if key.Matches(msg, Keys.Quit) && m.activeView == viewDashboard && m.dashboard.CanQuit() {
-		return m, tea.Quit
-	}
-
-	switch m.activeView {
-	case viewDashboard:
-		cmd, nav := m.dashboard.HandleKey(msg, m.api.FetchStory, func() tea.Cmd {
-			return m.api.FetchOverview("5m", 20)
-		})
-		if nav != nil {
-			m.prevView = viewDashboard
-			m.activeView = *nav
-		}
-		return m, cmd
-
-	case viewStory:
-		if nav := m.story.HandleKey(msg); nav != nil {
-			m.prevView = viewStory
-			m.activeView = *nav
-		}
-
-	case viewHelp:
-		m.activeView = m.prevView // any key exits help
-	}
-	return m, nil
-}
-
-func (m Model) View() string {
-	switch m.activeView {
-	case viewStory:
-		return m.story.View(m.width, m.height)
-	case viewHelp:
-		return HelpView(m.width, m.height)
-	default:
-		return m.dashboard.View(m.width, m.height)
-	}
-}
diff --git a/internal/tui/story.go b/internal/tui/story.go
deleted file mode 100644
index 221f63e..0000000
--- a/internal/tui/story.go
+++ /dev/null
@@ -1,192 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"strings"
-
-	"github.com/charmbracelet/bubbles/key"
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-)
-
-// StoryModel holds story view state.
-type StoryModel struct {
-	story   Story
-	context TraceContext
-	loaded  bool
-}
-
-func NewStoryModel() StoryModel {
-	return StoryModel{}
-}
-
-func (s *StoryModel) UpdateStory(resp StoryResponse) {
-	s.story = resp.Story
-	s.context = resp.Context
-	s.loaded = true
-}
-
-// HandleKey processes key input for the story view.
-// Returns a view to switch to, or nil to stay.
-func (s *StoryModel) HandleKey(msg tea.KeyMsg) *activeView {
-	switch {
-	case key.Matches(msg, Keys.Back), key.Matches(msg, Keys.Quit):
-		v := viewDashboard
-		return &v
-	case key.Matches(msg, Keys.Help):
-		v := viewHelp
-		return &v
-	}
-	return nil
-}
-
-func (s StoryModel) View(width, height int) string {
-	if !s.loaded {
-		return lipgloss.Place(width, height, lipgloss.Center, lipgloss.Center,
-			helpBarStyle.Render("Loading trace story..."))
-	}
-
-	var b strings.Builder
-
-	// Title
-	traceID := s.story.TraceID
-	if len(traceID) > 16 {
-		traceID = traceID[:12] + "..."
-	}
-	b.WriteString(titleStyle.Render("Trace "+traceID) + "\n" + separator(width) + "\n")
-
-	// Two-column layout via shared renderColumns
-	leftWidth := width/2 - 2
-	b.WriteString(renderColumns(s.renderHopChain(), s.renderContext(), leftWidth))
-
-	// Summary
-	b.WriteString(separator(width) + "\n")
-	status := successStyle.Render("SUCCESS")
-	if !s.story.Success {
-		label := "FAILED"
-		if s.story.FirstFailHop != nil && s.story.FirstFailHop.Service != "" {
-			label += fmt.Sprintf(" (first observed fail: %s)", s.story.FirstFailHop.Service)
-		}
-		status = failStyle.Render(label)
-	}
-	b.WriteString(fmt.Sprintf("Overall: %s  Hops: %d\n", status, s.story.HopCount))
-	if summary := s.rootCauseSummary(); summary != "" {
-		b.WriteString(summary + "\n")
-	}
-	b.WriteString(helpBarStyle.Render("esc: back  q: dashboard  ?: help"))
-
-	return b.String()
-}
-
-func (s StoryModel) renderHopChain() string {
-	var b strings.Builder
-	b.WriteString(labelStyle.Render("Hop Chain (timestamp-ordered)") + "\n\n")
-
-	for i, hop := range s.story.Chain {
-		icon := successStyle.Render("✓")
-		statusStr := StatusColor(hop.StatusCode).Render(fmt.Sprintf("%d", hop.StatusCode))
-		if !hop.Success {
-			icon = failStyle.Render("✗")
-			statusStr = failStyle.Render(fmt.Sprintf("%d", hop.StatusCode))
-		}
-
-		service := hop.Service
-		if s.story.FirstFailHop != nil && hop.SpanID == s.story.FirstFailHop.SpanID {
-			service = lipgloss.NewStyle().Foreground(colorRed).Bold(true).Render(service)
-		}
-
-		b.WriteString(fmt.Sprintf("[%d] %s %s\n     %s  %dms\n", i+1, icon, service, statusStr, hop.LatencyMs))
-		if hop.ErrorCode != "" {
-			b.WriteString(failStyle.Render("     └ "+hop.ErrorCode) + "\n")
-		}
-		if i < len(s.story.Chain)-1 {
-			b.WriteString("     │\n")
-		}
-	}
-
-	return b.String()
-}
-
-func (s StoryModel) renderContext() string {
-	var b strings.Builder
-	b.WriteString(labelStyle.Render("Context") + "\n\n")
-
-	ctx := s.context
-	if s.story.TraceID != "" {
-		b.WriteString("Trace ID: " + s.story.TraceID + "\n")
-	}
-	if ctx.RequestID != "" {
-		b.WriteString("Request Node: " + ctx.RequestID + "\n")
-	}
-	if ctx.RequestEvent != "" {
-		b.WriteString("Event: " + ctx.RequestEvent + "\n")
-	}
-	if len(ctx.ErrorCodes) > 0 {
-		b.WriteString("Error Codes: " + strings.Join(ctx.ErrorCodes, ", ") + "\n")
-	}
-	if ctx.UserID != "" {
-		line := "User: " + ctx.UserID
-		if ctx.UserTier != "" {
-			line += fmt.Sprintf(" (%s)", ctx.UserTier)
-		}
-		b.WriteString(line + "\n")
-	}
-	if ctx.UserRegion != "" {
-		b.WriteString("Region: " + ctx.UserRegion + "\n")
-	}
-	if ctx.Flow != "" {
-		b.WriteString("Flow: " + ctx.Flow + "\n")
-	}
-	if len(ctx.Flags) > 0 {
-		b.WriteString("Flags: " + strings.Join(ctx.Flags, ", ") + "\n")
-	}
-
-	b.WriteString("\n" + labelStyle.Render("Commands") + "\n")
-	cmdStyle := helpBarStyle
-	b.WriteString(cmdStyle.Render(fmt.Sprintf("waylog \"trace summary %s\"", s.story.TraceID)) + "\n")
-	if ctx.RequestID != "" {
-		b.WriteString(cmdStyle.Render(fmt.Sprintf("waylog \"explain request %s\"", ctx.RequestID)) + "\n")
-	}
-
-	return b.String()
-}
-
-func (s StoryModel) rootCauseSummary() string {
-	if s.story.Success {
-		return ""
-	}
-	root, rootIdx := s.likelyRootCause()
-	if root == nil {
-		return ""
-	}
-	code := root.ErrorCode
-	if code == "" {
-		code = fmt.Sprintf("HTTP_%d", root.StatusCode)
-	}
-
-	var propagated []string
-	for i := rootIdx - 1; i >= 0; i-- {
-		hop := s.story.Chain[i]
-		if hop.Service == "" || hop.Service == root.Service {
-			continue
-		}
-		if hop.StatusCode >= 500 {
-			propagated = append(propagated, hop.Service)
-		}
-	}
-	if len(propagated) == 0 {
-		return failStyle.Render(fmt.Sprintf("Root Cause: %s at %s", code, root.Service))
-	}
-	return failStyle.Render(fmt.Sprintf("Root Cause: %s at %s (propagated via %s)", code, root.Service, strings.Join(propagated, " -> ")))
-}
-
-func (s StoryModel) likelyRootCause() (*Hop, int) {
-	for i := len(s.story.Chain) - 1; i >= 0; i-- {
-		hop := s.story.Chain[i]
-		if hop.Success {
-			continue
-		}
-		return &hop, i
-	}
-	return nil, -1
-}
diff --git a/internal/tui/styles.go b/internal/tui/styles.go
deleted file mode 100644
index 2c723c8..0000000
--- a/internal/tui/styles.go
+++ /dev/null
@@ -1,204 +0,0 @@
-package tui
-
-import (
-	"fmt"
-	"strings"
-
-	"github.com/charmbracelet/lipgloss"
-)
-
-// Colors used across views.
-var (
-	colorGreen  = lipgloss.Color("#00CC00")
-	colorYellow = lipgloss.Color("#CCCC00")
-	colorRed    = lipgloss.Color("#CC0000")
-	colorCyan   = lipgloss.Color("#00CCCC")
-	colorDim    = lipgloss.Color("#666666")
-	colorHighBg = lipgloss.Color("#333366")
-	colorWhite  = lipgloss.Color("#FFFFFF")
-)
-
-// ASCII art banner for the dashboard header.
-var bannerLines = []string{
-	`██╗    ██╗ █████╗ ██╗   ██╗██╗      ██████╗  ██████╗`,
-	`██║    ██║██╔══██╗╚██╗ ██╔╝██║     ██╔═══██╗██╔════╝`,
-	`██║ █╗ ██║███████║ ╚████╔╝ ██║     ██║   ██║██║  ███╗`,
-	`██║███╗██║██╔══██║  ╚██╔╝  ██║     ██║   ██║██║   ██║`,
-	`╚███╔███╔╝██║  ██║   ██║   ███████╗╚██████╔╝╚██████╔╝`,
-	` ╚══╝╚══╝ ╚═╝  ╚═╝   ╚═╝   ╚══════╝ ╚═════╝  ╚═════╝`,
-}
-
-var gradientBannerLines = buildGradientBanner(bannerLines)
-
-// Shared styles used by dashboard, story, and help views.
-var (
-	titleStyle       = lipgloss.NewStyle().Bold(true).Foreground(colorCyan).Padding(0, 1)
-	statusBarStyle   = lipgloss.NewStyle().Foreground(colorDim).Padding(0, 1)
-	helpBarStyle     = lipgloss.NewStyle().Foreground(colorDim)
-	labelStyle       = lipgloss.NewStyle().Foreground(colorCyan).Bold(true)
-	successStyle     = lipgloss.NewStyle().Foreground(colorGreen)
-	failStyle        = lipgloss.NewStyle().Foreground(colorRed)
-	errorBarStyle    = lipgloss.NewStyle().Foreground(colorRed)
-	selectedRowStyle = lipgloss.NewStyle().Background(colorHighBg).Foreground(colorWhite).Bold(true)
-	liveIndicator    = lipgloss.NewStyle().Foreground(colorGreen).Bold(true).Render("● Live")
-)
-
-// StatusColor returns the appropriate style for a status code.
-func StatusColor(code int) lipgloss.Style {
-	switch {
-	case code >= 500:
-		return failStyle
-	case code >= 400:
-		return lipgloss.NewStyle().Foreground(colorYellow)
-	default:
-		return successStyle
-	}
-}
-
-// renderColumns renders two text blocks side-by-side with a │ divider.
-func renderColumns(left, right string, leftWidth int) string {
-	leftLines := strings.Split(left, "\n")
-	rightLines := strings.Split(right, "\n")
-	maxLines := len(leftLines)
-	if len(rightLines) > maxLines {
-		maxLines = len(rightLines)
-	}
-
-	var b strings.Builder
-	padder := lipgloss.NewStyle().Width(leftWidth)
-	for i := 0; i < maxLines; i++ {
-		var l, r string
-		if i < len(leftLines) {
-			l = leftLines[i]
-		}
-		if i < len(rightLines) {
-			r = rightLines[i]
-		}
-		b.WriteString(padder.Render(l))
-		b.WriteString(" │ ")
-		b.WriteString(r)
-		b.WriteString("\n")
-	}
-	return b.String()
-}
-
-// separator returns a horizontal line.
-func separator(width int) string {
-	return strings.Repeat("─", width)
-}
-
-// renderBanner renders a centered dashboard title with a narrow-screen fallback.
-func renderBanner(width int) string {
-	artWidth := 0
-	for _, line := range gradientBannerLines {
-		if w := lipgloss.Width(line); w > artWidth {
-			artWidth = w
-		}
-	}
-
-	if width <= 0 || width < artWidth+2 {
-		return titleStyle.Render("WAYLOG") + "\n"
-	}
-
-	var b strings.Builder
-	for _, line := range gradientBannerLines {
-		pad := (width - lipgloss.Width(line)) / 2
-		if pad < 0 {
-			pad = 0
-		}
-		b.WriteString(strings.Repeat(" ", pad))
-		b.WriteString(line)
-		b.WriteString("\n")
-	}
-	return b.String()
-}
-
-func buildGradientBanner(lines []string) []string {
-	h := len(lines)
-	if h == 0 {
-		return nil
-	}
-	maxW := 1
-	for _, line := range lines {
-		if w := lipgloss.Width(line); w > maxW {
-			maxW = w
-		}
-	}
-
-	// Strong 3-stop cyan gradient for a filled neon look.
-	top := rgb{r: 140, g: 255, b: 255}
-	mid := rgb{r: 58, g: 224, b: 246}
-	bottom := rgb{r: 0, g: 170, b: 235}
-
-	out := make([]string, 0, h)
-	for y, line := range lines {
-		runes := []rune(line)
-		var b strings.Builder
-		for x, ch := range runes {
-			if ch == ' ' {
-				b.WriteRune(ch)
-				continue
-			}
-			vy := 0.0
-			if h > 1 {
-				vy = float64(y) / float64(h-1)
-			}
-			vx := 0.0
-			if maxW > 1 {
-				vx = float64(x) / float64(maxW-1)
-			}
-			t := 0.80*vy + 0.20*vx
-			c := threeStopColor(top, mid, bottom, t)
-			b.WriteString(lipgloss.NewStyle().Foreground(lipgloss.Color(c.Hex())).Render(string(ch)))
-		}
-		out = append(out, b.String())
-	}
-	return out
-}
-
-type rgb struct {
-	r int
-	g int
-	b int
-}
-
-func (c rgb) Hex() string {
-	return fmt.Sprintf("#%02X%02X%02X", clamp(c.r), clamp(c.g), clamp(c.b))
-}
-
-func mixColor(a, b rgb, t float64) rgb {
-	if t < 0 {
-		t = 0
-	}
-	if t > 1 {
-		t = 1
-	}
-	return rgb{
-		r: int(float64(a.r) + (float64(b.r)-float64(a.r))*t),
-		g: int(float64(a.g) + (float64(b.g)-float64(a.g))*t),
-		b: int(float64(a.b) + (float64(b.b)-float64(a.b))*t),
-	}
-}
-
-func threeStopColor(a, m, b rgb, t float64) rgb {
-	if t < 0 {
-		t = 0
-	}
-	if t > 1 {
-		t = 1
-	}
-	if t <= 0.5 {
-		return mixColor(a, m, t*2)
-	}
-	return mixColor(m, b, (t-0.5)*2)
-}
-
-func clamp(v int) int {
-	if v < 0 {
-		return 0
-	}
-	if v > 255 {
-		return 255
-	}
-	return v
-}
diff --git a/packages/waylog-ts/src/__tests__/runtime.test.ts b/packages/waylog-ts/src/__tests__/runtime.test.ts
new file mode 100644
index 0000000..5445e1e
--- /dev/null
+++ b/packages/waylog-ts/src/__tests__/runtime.test.ts
@@ -0,0 +1,123 @@
+import { describe, expect, it, afterEach, vi } from "vitest";
+import { init, shutdown, postSignal, installGlobalHandlers } from "../index.js";
+
+type AnyListener = (...args: unknown[]) => void;
+
+function initWithHooks(enabled: boolean, fetchMock: typeof fetch) {
+  init({
+    service: "checkout",
+    env: "demo",
+    ingestUrl: "http://localhost:8080",
+    apiKey: "k1",
+    enableRuntimeHooks: enabled,
+    fetch: fetchMock,
+  });
+}
+
+function okFetch() {
+  return vi.fn(async () => new Response(null, { status: 201 })) as unknown as typeof fetch;
+}
+
+function calls(fetchMock: typeof fetch) {
+  return (fetchMock as unknown as ReturnType<typeof vi.fn>).mock.calls;
+}
+
+// captureHandlers installs the global hooks while spying on process.on so we can
+// invoke the registered listeners directly. Calling them avoids process.emit(),
+// which would trip vitest's own unhandledRejection listener and fail the run.
+function captureHandlers(): { handlers: Record<string, AnyListener>; uninstall: () => void } {
+  const onSpy = vi.spyOn(process, "on");
+  const uninstall = installGlobalHandlers();
+  const handlers: Record<string, AnyListener> = {};
+  for (const call of onSpy.mock.calls) {
+    handlers[call[0] as string] = call[1] as AnyListener;
+  }
+  onSpy.mockRestore();
+  return { handlers, uninstall };
+}
+
+async function flush() {
+  await new Promise((r) => setTimeout(r, 0));
+}
+
+afterEach(async () => {
+  await shutdown(0);
+});
+
+describe("postSignal", () => {
+  it("posts to /v1/signals with bearer auth and config service/env/timestamp", async () => {
+    const fetchMock = okFetch();
+    initWithHooks(true, fetchMock);
+
+    await postSignal({
+      type: "runtime",
+      service: "",
+      env: "",
+      severity: "critical",
+      reason: "panic: boom",
+      source: "ts-sdk",
+      metadata: { subtype: "panic" },
+    });
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const [url, opts] = calls(fetchMock)[0];
+    expect(url).toBe("http://localhost:8080/v1/signals");
+    expect((opts as RequestInit).headers).toMatchObject({ Authorization: "Bearer k1" });
+    const body = JSON.parse((opts as RequestInit).body as string);
+    expect(body.service).toBe("checkout");
+    expect(body.env).toBe("demo");
+    expect(body.timestamp).toBeTruthy();
+    expect(body.metadata.subtype).toBe("panic");
+  });
+});
+
+describe("installGlobalHandlers", () => {
+  it("posts an unhandled_rejection runtime signal with env from config", async () => {
+    const fetchMock = okFetch();
+    initWithHooks(true, fetchMock);
+    const existingHandler = () => {};
+    process.on("unhandledRejection", existingHandler);
+    const { handlers, uninstall } = captureHandlers();
+    try {
+      handlers["unhandledRejection"]!(new Error("boom"), Promise.resolve());
+      await flush();
+    } finally {
+      uninstall();
+      process.off("unhandledRejection", existingHandler);
+    }
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const body = JSON.parse((calls(fetchMock)[0][1] as RequestInit).body as string);
+    expect(body.type).toBe("runtime");
+    expect(body.source).toBe("ts-sdk");
+    expect(body.env).toBe("demo");
+    expect(body.metadata.subtype).toBe("unhandled_rejection");
+  });
+
+  it("posts an uncaught_exception runtime signal", async () => {
+    const fetchMock = okFetch();
+    initWithHooks(true, fetchMock);
+    const { handlers, uninstall } = captureHandlers();
+    try {
+      handlers["uncaughtExceptionMonitor"]!(new Error("kaboom"));
+      await flush();
+    } finally {
+      uninstall();
+    }
+
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+    const body = JSON.parse((calls(fetchMock)[0][1] as RequestInit).body as string);
+    expect(body.metadata.subtype).toBe("uncaught_exception");
+  });
+
+  it("is a no-op when runtime hooks are disabled", () => {
+    const fetchMock = okFetch();
+    initWithHooks(false, fetchMock);
+    const { handlers, uninstall } = captureHandlers();
+    uninstall();
+
+    expect(handlers["unhandledRejection"]).toBeUndefined();
+    expect(handlers["uncaughtExceptionMonitor"]).toBeUndefined();
+    expect(fetchMock).not.toHaveBeenCalled();
+  });
+});
diff --git a/packages/waylog-ts/src/index.ts b/packages/waylog-ts/src/index.ts
index a53c7f0..afeea6e 100644
--- a/packages/waylog-ts/src/index.ts
+++ b/packages/waylog-ts/src/index.ts
@@ -13,6 +13,7 @@ export {
   newSpanId,
   newTraceId,
   parseTraceparent,
+  postSignal,
   recordOutgoingSpan,
   runWithContext,
   setField,
@@ -26,6 +27,7 @@ export {
   suppress,
   traceId,
 } from "./logger.js";
+export { installGlobalHandlers } from "./runtime.js";
 export { Transport, normalizeIngestUrl } from "./transport.js";
 export { SCHEMA_VERSION, WaylogError } from "./types.js";
 export type {
@@ -39,6 +41,7 @@ export type {
   Fields,
   Log,
   Logger,
+  Signal,
   Stats,
   Status,
   Step,
diff --git a/packages/waylog-ts/src/logger.ts b/packages/waylog-ts/src/logger.ts
index 28735a4..bf52b73 100644
--- a/packages/waylog-ts/src/logger.ts
+++ b/packages/waylog-ts/src/logger.ts
@@ -17,12 +17,14 @@ import {
   type LogLevel,
   type Logger,
   type Stats,
+  type Signal,
   type Status,
   type Step,
   type StepError,
   type WaylogConfig,
   type WideEvent,
 } from "./types.js";
+import { postSignal as postSignalTransport } from "./signal-transport.js";
 
 const defaultMaxSteps = 128;
 const defaultMaxLogs = 256;
@@ -133,6 +135,15 @@ export function stats(): Stats {
   };
 }
 
+export function runtimeHooksEnabled(): boolean {
+  return sdk?.cfg.enableRuntimeHooks === true;
+}
+
+export function postSignal(signal: Signal): Promise<void> {
+  if (!sdk) return Promise.resolve();
+  return postSignalTransport(sdk.cfg, signal);
+}
+
 export function begin(ctx: Context = {}, opts: BeginOptions = {}): Context {
   const s = ensureSDK();
   const now = opts.now ?? new Date();
diff --git a/packages/waylog-ts/src/runtime.ts b/packages/waylog-ts/src/runtime.ts
new file mode 100644
index 0000000..6a4400c
--- /dev/null
+++ b/packages/waylog-ts/src/runtime.ts
@@ -0,0 +1,85 @@
+import { postSignal, runtimeHooksEnabled } from "./logger.js";
+
+const rethrownUnhandledRejections = new WeakSet();
+
+/**
+ * installGlobalHandlers registers process-level handlers that post a "runtime"
+ * signal when the process hits an uncaught exception or unhandled promise
+ * rejection, so crashes correlate with incidents during triage.
+ *
+ * No-op unless runtime hooks are enabled in config. Returns an uninstall
+ * function that removes both listeners.
+ *
+ * Exceptions are observed through `uncaughtExceptionMonitor`, which does not
+ * prevent Node's default crash. The rejection handler is itself an
+ * `unhandledRejection` listener, so when no other listener existed at install
+ * time and the configured rejection mode crashes, it rethrows the reason once
+ * the POST settles. Signals are best-effort: Node may exit before a POST lands.
+ */
+export function installGlobalHandlers(): () => void {
+  if (!runtimeHooksEnabled()) {
+    return () => {};
+  }
+
+  const onException = (err: Error): void => {
+    if (rethrownUnhandledRejections.has(err)) return; // already posted as unhandled_rejection
+    void emit("uncaught_exception", err);
+  };
+  const rethrowUnhandledRejections =
+    process.listenerCount("unhandledRejection") === 0 && nodeUnhandledRejectionsModeCrashes();
+  const onRejection = (reason: unknown): void => {
+    const posted = emit("unhandled_rejection", reason);
+    if (rethrowUnhandledRejections) {
+      void posted.finally(() => {
+        setImmediate(() => { // throw outside the promise chain so it is not another rejection
+          throw markRethrown(reason);
+        });
+      });
+    }
+  };
+
+  process.on("uncaughtExceptionMonitor", onException);
+  process.on("unhandledRejection", onRejection);
+
+  return () => {
+    process.off("uncaughtExceptionMonitor", onException);
+    process.off("unhandledRejection", onRejection);
+  };
+}
+
+function emit(subtype: string, reason: unknown): Promise<void> {
+  return postSignal({
+    type: "runtime",
+    service: "", // left empty so the signal transport fills it from config
+    env: "", // left empty so the signal transport fills it from config
+    severity: "critical",
+    reason: `${subtype}: ${reasonText(reason)}`,
+    message: stackOf(reason),
+    source: "ts-sdk",
+    metadata: { subtype },
+  }).catch(() => {}); // best-effort: a failed POST must never throw from a crash handler
+}
+
+function reasonText(reason: unknown): string {
+  return reason instanceof Error ? reason.message : String(reason); // non-Error reasons are stringified
+}
+
+function stackOf(reason: unknown): string {
+  return reason instanceof Error && reason.stack ? reason.stack : String(reason); // no stack: stringify
+}
+
+// markRethrown records the value that is about to be rethrown so the exception
+// monitor can recognise it and skip posting a second signal for the same failure.
+// Non-object reasons are wrapped in an Error before being tracked and returned.
+function markRethrown(reason: unknown): unknown {
+  const isObject = typeof reason === "object" && reason !== null;
+  const marked: object = isObject ? (reason as object) : new Error(String(reason));
+  rethrownUnhandledRejections.add(marked);
+  return marked;
+}
+
+function nodeUnhandledRejectionsModeCrashes(): boolean {
+  // No flag means Node's default "throw" mode; "warn", "warn-with-error-code" and "none" never crash.
+  const mode = process.execArgv.find((a) => a.startsWith("--unhandled-rejections="))?.split("=", 2)[1];
+  return mode !== "warn" && mode !== "warn-with-error-code" && mode !== "none";
+}
diff --git a/packages/waylog-ts/src/signal-transport.ts b/packages/waylog-ts/src/signal-transport.ts
new file mode 100644
index 0000000..a9f273c
--- /dev/null
+++ b/packages/waylog-ts/src/signal-transport.ts
@@ -0,0 +1,58 @@
+import type { Signal, WaylogConfig } from "./types.js";
+import { normalizeIngestUrl } from "./transport.js";
+
+const maxSignalReasonLen = 512;
+const maxSignalMessageLen = 4096;
+const signalPostTimeoutMs = 5000;
+
+/**
+ * signalUrl resolves the /v1/signals endpoint. cfg.signalUrl wins when set;
+ * otherwise a trailing /v1/events path on the normalized cfg.ingestUrl becomes
+ * /v1/signals, preserving any query parameters. Returns "" when neither is set.
+ */
+export function signalUrl(cfg: WaylogConfig): string {
+  if (cfg.signalUrl) return cfg.signalUrl;
+  if (!cfg.ingestUrl) return "";
+  const u = new URL(normalizeIngestUrl(cfg.ingestUrl));
+  u.pathname = u.pathname.replace(/\/v1\/events$/, "/v1/signals");
+  return u.toString();
+}
+
+/**
+ * postSignal POSTs a production signal as JSON to the resolved signal URL. It is
+ * a no-op when neither signalUrl nor ingestUrl is configured. service, env and
+ * timestamp default to config / now when unset; reason and message are truncated
+ * to fixed maximum lengths. Honors an injected cfg.fetch and sends cfg.apiKey as
+ * a Bearer token. Resolves on any 2xx status and rejects otherwise.
+ *
+ * The request is aborted after signalPostTimeoutMs so a hung endpoint cannot
+ * hold up a caller that waits for this POST to settle before crashing.
+ */
+export async function postSignal(cfg: WaylogConfig, signal: Signal): Promise<void> {
+  const url = signalUrl(cfg);
+  if (!url) return;
+  const fetchImpl = cfg.fetch ?? fetch;
+  const body: Signal = {
+    ...signal,
+    service: signal.service || cfg.service,
+    env: signal.env || cfg.env || "",
+    timestamp: signal.timestamp || new Date().toISOString(),
+    ...(signal.reason ? { reason: truncate(signal.reason, maxSignalReasonLen) } : {}),
+    ...(signal.message ? { message: truncate(signal.message, maxSignalMessageLen) } : {}),
+  };
+  const headers: Record<string, string> = { "Content-Type": "application/json" };
+  if (cfg.apiKey) headers.Authorization = `Bearer ${cfg.apiKey}`;
+  const resp = await fetchImpl(url, {
+    method: "POST",
+    headers,
+    body: JSON.stringify(body),
+    signal: AbortSignal.timeout(signalPostTimeoutMs),
+  });
+  if (resp.status < 200 || resp.status >= 300) {
+    throw new Error(`waylog signals error ${resp.status}`);
+  }
+}
+
+function truncate(s: string, n: number): string {
+  return s.length > n ? s.slice(0, n) : s; // length/slice count UTF-16 code units, not characters
+}
diff --git a/packages/waylog-ts/src/types.ts b/packages/waylog-ts/src/types.ts
index 7205162..fa26d12 100644
--- a/packages/waylog-ts/src/types.ts
+++ b/packages/waylog-ts/src/types.ts
@@ -1,6 +1,12 @@
 export const SCHEMA_VERSION = "2.0";
 
-export type Status = "ok" | "error" | "timeout" | "partial" | "aborted" | "suppressed";
+export type Status =
+  | "ok"
+  | "error"
+  | "timeout"
+  | "partial"
+  | "aborted"
+  | "suppressed";
 export type StepStatus = "ok" | "error";
 export type LogLevel = "info" | "warn" | "error";
 export type Fields = Record;
@@ -22,6 +28,36 @@ export interface WaylogConfig {
   batchMode?: boolean;
   redactor?: (fields: Fields) => Fields;
   fetch?: typeof fetch;
+  /**
+   * Enable runtime signal reporting. When on, installGlobalHandlers() posts a
+   * "runtime" signal to /v1/signals on an uncaught exception or unhandled
+   * rejection so crashes correlate with incidents. Off by default.
+   */
+  enableRuntimeHooks?: boolean;
+  /**
+   * Override the signal endpoint. Defaults to the ingestUrl host's /v1/signals
+   * path. Set only when signals go to a different host than events.
+   */
+  signalUrl?: string;
+}
+
+/**
+ * Signal is a production-context signal posted to /v1/signals. The SDK emits
+ * "runtime" signals (uncaught exceptions, unhandled rejections) so they
+ * correlate with incidents during triage. type/service/env/source/severity/
+ * reason/timestamp are required by the server; postSignal fills service, env and
+ * timestamp from config when unset.
+ */
+export interface Signal {
+  type: string;
+  service?: string;
+  env?: string;
+  severity?: string;
+  reason?: string;
+  message?: string;
+  source?: string;
+  timestamp?: string;
+  metadata?: Record;
 }
 
 export interface Stats {
diff --git a/pkg/api/v2/types.go b/pkg/api/v2/types.go
index e14af86..9cb37df 100644
--- a/pkg/api/v2/types.go
+++ b/pkg/api/v2/types.go
@@ -16,6 +16,12 @@ const (
 
 	BlastViewSingleFamily = "single_family"
 	BlastViewCrossFamily  = "cross_family"
+
+	// Wire-level capture statuses for {Propagation,Blast,Alert}Evidence.CaptureStatus.
+	// Internal incidents.EvidenceCaptureStatus values cast to these strings.
+	CaptureStatusOK      = "ok"
+	CaptureStatusPartial = "partial"
+	CaptureStatusMissing = "missing"
 )
 
 type EventSearchResponse struct {
@@ -132,30 +138,115 @@ type IncidentEvidence struct {
 }
 
 type Incident struct {
-	IncidentID              string             `json:"incident_id"`
-	Env                     string             `json:"env"`
-	Service                 string             `json:"service"`
-	ErrorFamily             ErrorFamily        `json:"error_family"`
-	Status                  string             `json:"status"`
-	Cause                   string             `json:"cause"`
-	Confidence              string             `json:"confidence"`
-	Severity                int                `json:"severity"`
-	StartedAt               time.Time          `json:"started_at"`
-	UpdatedAt               time.Time          `json:"updated_at"`
-	LastSeenAt              time.Time          `json:"last_seen_at"`
-	RecoveringAt            *time.Time         `json:"recovering_at,omitempty"`
-	ResolvedAt              *time.Time         `json:"resolved_at,omitempty"`
-	AffectedRequests        int                `json:"affected_requests"`
-	AffectedUsers           *int               `json:"affected_users,omitempty"`
-	AffectedServices        int                `json:"affected_services"`
-	TopServices             []string           `json:"top_services"`
-	SampleTraces            []string           `json:"sample_traces"`
-	Evidence                []IncidentEvidence `json:"evidence"`
-	NextChecks              []string           `json:"next_checks"`
-	InstrumentationWarnings []string           `json:"instrumentation_warnings,omitempty"`
-	Lift                    float64            `json:"lift"`
-	BaselineCount           int                `json:"baseline_count"`
-	CurrentCount            int                `json:"current_count"`
+	IncidentID              string               `json:"incident_id"`
+	Env                     string               `json:"env"`
+	Service                 string               `json:"service"`
+	ErrorFamily             ErrorFamily          `json:"error_family"`
+	Status                  string               `json:"status"`
+	Cause                   string               `json:"cause"`
+	Confidence              string               `json:"confidence"`
+	Severity                int                  `json:"severity"`
+	StartedAt               time.Time            `json:"started_at"`
+	UpdatedAt               time.Time            `json:"updated_at"`
+	LastSeenAt              time.Time            `json:"last_seen_at"`
+	RecoveringAt            *time.Time           `json:"recovering_at,omitempty"`
+	ResolvedAt              *time.Time           `json:"resolved_at,omitempty"`
+	AffectedRequests        int                  `json:"affected_requests"`
+	AffectedUsers           *int                 `json:"affected_users,omitempty"`
+	AffectedServices        int                  `json:"affected_services"`
+	TopServices             []string             `json:"top_services"`
+	SampleTraces            []string             `json:"sample_traces"`
+	Evidence                []IncidentEvidence   `json:"evidence"`
+	NextChecks              []string             `json:"next_checks"`
+	InstrumentationWarnings []string             `json:"instrumentation_warnings,omitempty"`
+	Lift                    float64              `json:"lift"`
+	BaselineCount           int                  `json:"baseline_count"`
+	CurrentCount            int                  `json:"current_count"`
+	Propagation             *PropagationSnapshot `json:"propagation,omitempty"`
+	Blast                   *BlastSnapshot       `json:"blast,omitempty"`
+	Alerts                  *AlertSnapshot       `json:"alerts,omitempty"`
+	Runtime                 *RuntimeSnapshot     `json:"runtime,omitempty"`
+}
+
+type PropagationSnapshot struct {
+	Opening *PropagationEvidence `json:"opening,omitempty"`
+	Latest  *PropagationEvidence `json:"latest,omitempty"`
+}
+
+type PropagationEvidence struct {
+	OriginService string            `json:"origin_service"`
+	OriginStep    string            `json:"origin_step"`
+	Path          []PropagationStep `json:"path"`
+	SampleTraceID string            `json:"sample_trace_id"`
+	FirstSeenAt   *time.Time        `json:"first_seen_at,omitempty"`
+	CapturedAt    time.Time         `json:"captured_at"`
+	CaptureStatus string            `json:"capture_status"`
+}
+
+type PropagationStep struct {
+	Service    string `json:"service"`
+	Step       string `json:"step"`
+	StartMS    int64  `json:"start_ms"`
+	DurationMS int64  `json:"duration_ms"`
+	Status     string `json:"status"`
+	ErrorCode  string `json:"error_code,omitempty"`
+}
+
+type BlastSnapshot struct {
+	Opening *BlastEvidence `json:"opening,omitempty"`
+	Latest  *BlastEvidence `json:"latest,omitempty"`
+}
+
+type BlastEvidence struct {
+	AffectedRequests int       `json:"affected_requests"`
+	AffectedUsers    *int      `json:"affected_users,omitempty"`
+	AffectedServices int       `json:"affected_services"`
+	TopServices      []string  `json:"top_services"`
+	SampledTraces    []string  `json:"sampled_traces"`
+	CapturedAt       time.Time `json:"captured_at"`
+	CaptureStatus    string    `json:"capture_status"`
+}
+
+type AlertSnapshot struct {
+	Opening *AlertEvidence `json:"opening,omitempty"`
+	Latest  *AlertEvidence `json:"latest,omitempty"`
+}
+
+type AlertEvidence struct {
+	Matches       []MatchedAlert `json:"matches"`
+	CapturedAt    time.Time      `json:"captured_at"`
+	CaptureStatus string         `json:"capture_status"`
+}
+
+type MatchedAlert struct {
+	SignalID    string    `json:"signal_id"`
+	AlertID     string    `json:"alert_id,omitempty"`
+	Source      string    `json:"source"`
+	Severity    string    `json:"severity"`
+	Reason      string    `json:"reason"`
+	ProviderURL string    `json:"provider_url,omitempty"`
+	EvidenceIDs []string  `json:"evidence_ids,omitempty"`
+	MatchedAt   time.Time `json:"matched_at"`
+	Strategy    string    `json:"strategy"`
+}
+
+type RuntimeSnapshot struct {
+	Matches []RuntimeEvidence `json:"matches,omitempty"`
+	Opening *RuntimeEvidence  `json:"opening,omitempty"`
+	Latest  *RuntimeEvidence  `json:"latest,omitempty"`
+}
+
+type RuntimeEvidence struct {
+	Subtype       string         `json:"subtype"`
+	Service       string         `json:"service"`
+	Reason        string         `json:"reason"`
+	Severity      string         `json:"severity"`
+	Source        string         `json:"source"`
+	SignalID      string         `json:"signal_id"`
+	OccurredAt    time.Time      `json:"occurred_at"`
+	Metadata      map[string]any `json:"metadata,omitempty"`
+	CapturedAt    time.Time      `json:"captured_at"`
+	CaptureStatus string         `json:"capture_status"`
 }
 
 type IncidentListResponse struct {
diff --git a/pkg/triage/report.go b/pkg/triage/report.go
index c1920b6..0fe93fc 100644
--- a/pkg/triage/report.go
+++ b/pkg/triage/report.go
@@ -6,6 +6,8 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
+	"sort"
+	"strings"
 )
 
 type Confidence string
@@ -24,11 +26,17 @@ type Report struct {
 	SampleTraces  []TraceSample   `json:"sample_traces,omitempty"`
 	Signals       []SignalRef     `json:"signals,omitempty"`
 	Alerts        []AlertRef      `json:"alerts,omitempty"`
+	Runtime       []RuntimeRef    `json:"runtime,omitempty"`
 	NextChecks    []NextCheck     `json:"next_checks,omitempty"`
 	Confidence    Confidence      `json:"confidence"`
 	GeneratedAt   string          `json:"generated_at"`
 	PlanRunID     string          `json:"plan_run_id,omitempty"`
 	ReportHash    string          `json:"report_hash"`
+	// EvidenceFingerprint identifies the evidence set grounding this report;
+	// unlike ReportHash it is stable across engine ticks until evidence is
+	// attached or removed (ADR 0002). omitempty keeps pre-fingerprint
+	// report_hash values unchanged.
+	EvidenceFingerprint string `json:"evidence_fingerprint,omitempty"`
 }
 
 type IncidentRef struct {
@@ -71,6 +79,20 @@ type AlertRef struct {
 	EvidenceIDs []string `json:"evidence_ids"`
 }
 
+// RuntimeRef is a runtime evidence row in the report — infra (k8s OOMKill,
+// crashloop) or app (panic, unhandled rejection). It deliberately omits the
+// capture timestamp: only stable signal fields participate in report_hash so
+// the hash does not churn as fresh captures update CapturedAt.
+type RuntimeRef struct {
+	SignalID   string `json:"signal_id"`
+	Subtype    string `json:"subtype"`
+	Service    string `json:"service"`
+	Source     string `json:"source"`
+	Severity   string `json:"severity"`
+	Reason     string `json:"reason"`
+	OccurredAt string `json:"occurred_at"`
+}
+
 type NextCheck struct {
 	ID     string `json:"id"`
 	Prompt string `json:"prompt"`
@@ -104,6 +126,7 @@ func (r *Report) CanonicalHash() (string, error) {
 	clone.GeneratedAt = ""
 	clone.PlanRunID = ""
 	clone.ReportHash = ""
+	clone.EvidenceFingerprint = ""
 	raw, err := json.Marshal(&clone)
 	if err != nil {
 		return "", fmt.Errorf("triage: canonical marshal: %w", err)
@@ -111,3 +134,38 @@ func (r *Report) CanonicalHash() (string, error) {
 	sum := sha256.Sum256(raw)
 	return "sha256:" + hex.EncodeToString(sum[:]), nil
 }
+
+// CanonicalEvidenceFingerprint returns "sha256:" plus the hex SHA-256 of the
+// report's evidence identity set: sorted, deduplicated kind:id tuples for the
+// incident, signals, alerts, runtime events, and sample traces, joined by
+// newlines. Empty ids are skipped. Volatile fields (counts, confidence, next
+// checks, payloads, timestamps) are never read, so the fingerprint is stable
+// across engine ticks until evidence is attached or removed (ADR 0002).
+func (r *Report) CanonicalEvidenceFingerprint() string {
+	set := map[string]struct{}{}
+	add := func(kind, id string) {
+		if id != "" {
+			set[kind+":"+id] = struct{}{}
+		}
+	}
+	add("incident", r.IncidentRef.ID)
+	for _, s := range r.Signals {
+		add("signal", s.ID)
+	}
+	for _, a := range r.Alerts {
+		add("alert", a.SignalID) // alerts are keyed by their SignalID
+	}
+	for _, rt := range r.Runtime {
+		add("runtime", rt.SignalID)
+	}
+	for _, t := range r.SampleTraces {
+		add("trace", t.TraceID)
+	}
+	keys := make([]string, 0, len(set))
+	for k := range set {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys) // map iteration order is random; sorting makes the digest deterministic
+	sum := sha256.Sum256([]byte(strings.Join(keys, "\n")))
+	return "sha256:" + hex.EncodeToString(sum[:])
+}
diff --git a/pkg/triage/report_test.go b/pkg/triage/report_test.go
index db8bd4d..d1c1878 100644
--- a/pkg/triage/report_test.go
+++ b/pkg/triage/report_test.go
@@ -157,3 +157,80 @@ func TestCanonicalHashFormat(t *testing.T) {
 		t.Fatalf("hash length wrong: got %d (%q)", len(h), h)
 	}
 }
+
+func fingerprintFixture() triage.Report {
+	return triage.Report{
+		SchemaVersion: "triage.v1",
+		IncidentRef:   triage.IncidentRef{ID: "inc_fp", Window: "15m"},
+		BlastSnapshot: triage.BlastSnapshot{Requests: 12, Users: 8, Services: 4},
+		SampleTraces:  []triage.TraceSample{{TraceID: "trace_a", Summary: "checkout 502"}},
+		Signals:       []triage.SignalRef{{ID: "sig_dep", Type: "dependency", EvidenceIDs: []string{"e1"}}},
+		Alerts:        []triage.AlertRef{{SignalID: "sig_alert", Source: "grafana", Severity: "critical", Reason: "spike"}},
+		Runtime:       []triage.RuntimeRef{{SignalID: "sig_rt", Subtype: "oom_killed", Service: "checkout"}},
+		NextChecks:    []triage.NextCheck{{ID: "check_0", Prompt: "verify x"}},
+		Confidence:    triage.ConfidenceHigh,
+		GeneratedAt:   "2026-06-12T00:00:00Z",
+	}
+}
+
+func TestEvidenceFingerprintStableAcrossVolatileChanges(t *testing.T) {
+	a := fingerprintFixture()
+	b := fingerprintFixture()
+	// Everything that legitimately drifts between engine ticks changes…
+	b.BlastSnapshot = triage.BlastSnapshot{Requests: 99, Users: 70, Services: 9}
+	b.Confidence = triage.ConfidenceLow
+	b.NextChecks = []triage.NextCheck{{ID: "check_9", Prompt: "different"}}
+	b.GeneratedAt = "2026-06-12T00:05:00Z"
+	b.PlanRunID = "plan_123"
+	b.SampleTraces[0].Summary = "different summary"
+	b.FirstFailure = []byte(`{"step":"other"}`)
+	// …but the evidence identity set is the same, so the fingerprint must match.
+	if a.CanonicalEvidenceFingerprint() != b.CanonicalEvidenceFingerprint() {
+		t.Fatalf("fingerprint must ignore volatile fields:\n a=%s\n b=%s",
+			a.CanonicalEvidenceFingerprint(), b.CanonicalEvidenceFingerprint())
+	}
+}
+
+func TestEvidenceFingerprintChangesWhenEvidenceChanges(t *testing.T) {
+	a := fingerprintFixture()
+	b := fingerprintFixture()
+	b.Signals = append(b.Signals, triage.SignalRef{ID: "sig_new", Type: "deploy"})
+	if a.CanonicalEvidenceFingerprint() == b.CanonicalEvidenceFingerprint() {
+		t.Fatal("attaching a new signal must change the fingerprint")
+	}
+	c := fingerprintFixture()
+	c.IncidentRef.ID = "inc_other"
+	if a.CanonicalEvidenceFingerprint() == c.CanonicalEvidenceFingerprint() {
+		t.Fatal("a different incident must have a different fingerprint")
+	}
+}
+
+func TestEvidenceFingerprintIsOrderIndependentAndDeduped(t *testing.T) {
+	a := fingerprintFixture()
+	a.Signals = []triage.SignalRef{{ID: "sig_1"}, {ID: "sig_2"}}
+	b := fingerprintFixture()
+	b.Signals = []triage.SignalRef{{ID: "sig_2"}, {ID: "sig_1"}, {ID: "sig_1"}}
+	if a.CanonicalEvidenceFingerprint() != b.CanonicalEvidenceFingerprint() {
+		t.Fatal("fingerprint must be order-independent and deduplicated")
+	}
+	if !strings.HasPrefix(a.CanonicalEvidenceFingerprint(), "sha256:") {
+		t.Fatalf("fingerprint format: %s", a.CanonicalEvidenceFingerprint())
+	}
+}
+
+func TestEvidenceFingerprintFieldExcludedFromReportHash(t *testing.T) {
+	a := fingerprintFixture()
+	b := fingerprintFixture()
+	b.EvidenceFingerprint = b.CanonicalEvidenceFingerprint()
+	ha, err := a.CanonicalHash()
+	if err != nil {
+		t.Fatal(err)
+	}
+	hb, err := b.CanonicalHash()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if ha != hb {
+		t.Fatal("evidence_fingerprint is derived metadata and must not feed report_hash")
+	}
+}
diff --git a/pkg/waylog/http/middleware.go b/pkg/waylog/http/middleware.go
index 7b79c5b..ceee86a 100644
--- a/pkg/waylog/http/middleware.go
+++ b/pkg/waylog/http/middleware.go
@@ -59,7 +59,7 @@ func ServeHTTP(w http.ResponseWriter, r *http.Request, route string, next func(h
 	sw := wrapResponseWriter(w, ctx)
 	var sealed atomic.Bool
 
-	deliver := func(kind lifecycleKind) {
+	deliver := func(kind lifecycleKind, recovered ...any) {
 		if !sealed.CompareAndSwap(false, true) {
 			_, _ = waylogv2.Finalize(ctx)
 			return
@@ -68,7 +68,7 @@ func ServeHTTP(w http.ResponseWriter, r *http.Request, route string, next func(h
 		case lifecycleTimeout:
 			_, _ = waylogv2.FinalizeTimeout(ctx)
 		case lifecyclePanic:
-			_, _ = waylogv2.FinalizePanic(ctx)
+			_, _ = waylogv2.FinalizePanic(ctx, recovered...)
 		case lifecycleAborted:
 			_, _ = waylogv2.FinalizeAborted(ctx)
 		default:
@@ -88,7 +88,7 @@ func ServeHTTP(w http.ResponseWriter, r *http.Request, route string, next func(h
 			if !sw.WroteHeader() {
 				sw.WriteHeader(http.StatusInternalServerError)
 			}
-			deliver(lifecyclePanic)
+			deliver(lifecyclePanic, rec)
 		}
 	}()
 
diff --git a/pkg/waylog/http/runtime_test.go b/pkg/waylog/http/runtime_test.go
new file mode 100644
index 0000000..088349c
--- /dev/null
+++ b/pkg/waylog/http/runtime_test.go
@@ -0,0 +1,80 @@
+package wayloghttp_test
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	wayloghttp "github.com/sssmaran/WaylogCLI/pkg/waylog/http"
+	waylogv2 "github.com/sssmaran/WaylogCLI/pkg/waylog/v2"
+)
+
+// TestPanicPostsRuntimeSignal verifies the real ingest path: a panic inside an
+// instrumented handler is recovered by the adapter, returns 500, and (with
+// runtime hooks enabled) posts a "runtime" signal with subtype=panic carrying
+// the SDK's service/env so it correlates with the incident.
+func TestPanicPostsRuntimeSignal(t *testing.T) {
+	t.Cleanup(func() {
+		ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+		defer cancel()
+		_ = waylogv2.Shutdown(ctx)
+	})
+
+	signals := make(chan waylogv2.Signal, 4)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/v1/signals" {
+			var sig waylogv2.Signal
+			_ = json.NewDecoder(r.Body).Decode(&sig)
+			signals <- sig
+			w.WriteHeader(http.StatusCreated)
+			return
+		}
+		// /v1/events
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte(`{"accepted":1}`))
+	}))
+	defer srv.Close()
+
+	if err := waylogv2.Init(waylogv2.Config{
+		Service:            "checkout",
+		Env:                "demo",
+		IngestURL:          srv.URL,
+		APIKey:             "k",
+		EnableRuntimeHooks: true,
+	}); err != nil {
+		t.Fatalf("Init: %v", err)
+	}
+
+	h := wayloghttp.HTTP(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		panic("handler boom")
+	}))
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/checkout", nil)
+	h.ServeHTTP(rec, req) // must not propagate the panic
+
+	if rec.Code != http.StatusInternalServerError {
+		t.Errorf("status = %d, want 500", rec.Code)
+	}
+
+	select {
+	case sig := <-signals:
+		if sig.Type != "runtime" || sig.Source != "go-sdk" {
+			t.Errorf("type/source = %q/%q, want runtime/go-sdk", sig.Type, sig.Source)
+		}
+		if sig.Service != "checkout" || sig.Env != "demo" {
+			t.Errorf("service/env = %q/%q, want checkout/demo", sig.Service, sig.Env)
+		}
+		if sig.Metadata["subtype"] != "panic" {
+			t.Errorf("subtype = %v, want panic", sig.Metadata["subtype"])
+		}
+		if !strings.Contains(sig.Reason, "panic") {
+			t.Errorf("reason = %q, want it to mention the panic", sig.Reason)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for runtime signal from panic")
+	}
+}
diff --git a/pkg/waylog/v2/assemble.go b/pkg/waylog/v2/assemble.go
index 0564f3c..674eba5 100644
--- a/pkg/waylog/v2/assemble.go
+++ b/pkg/waylog/v2/assemble.go
@@ -30,9 +30,25 @@ func Finalize(ctx context.Context) (*eventv2.Event, error) {
 	return finalize(ctx, lifecycleNormal)
 }
 
-// FinalizePanic seals the request as a panic-owned lifecycle emit.
-func FinalizePanic(ctx context.Context) (*eventv2.Event, error) {
-	return finalize(ctx, lifecyclePanic)
+// FinalizePanic seals the request as a panic-owned lifecycle emit and returns
+// that emit's event and error unchanged. When an event was produced and runtime
+// hooks are enabled, it also posts a "runtime" signal built from the first
+// recovered value (the result of recover()), if any. The post runs on its own
+// goroutine so the recovering caller (the HTTP middleware's recover path) never
+// waits on it, and it is attempted whether or not the returned error is nil.
+func FinalizePanic(ctx context.Context, recovered ...any) (*eventv2.Event, error) {
+	ev, err := finalize(ctx, lifecyclePanic)
+	if ev == nil {
+		return nil, err
+	}
+	if s := getState(); s != nil && s.cfg.EnableRuntimeHooks {
+		var rec any
+		if len(recovered) > 0 {
+			rec = recovered[0]
+		}
+		go postPanicSignal(s.cfg, rec)
+	}
+	return ev, err
+}
 
 // FinalizeAborted seals the request as an aborted lifecycle emit unless the
diff --git a/pkg/waylog/v2/runtime.go b/pkg/waylog/v2/runtime.go
new file mode 100644
index 0000000..24c2fc3
--- /dev/null
+++ b/pkg/waylog/v2/runtime.go
@@ -0,0 +1,50 @@
+package waylogv2
+
+import (
+	"context"
+	"time"
+)
+
+const signalPostTimeout = 5 * time.Second
+
+// SafeGo runs fn on a new goroutine and recovers any panic it raises, so the
+// panic cannot crash the process; with runtime hooks on it posts a "runtime" signal.
+func SafeGo(fn func()) {
+	go func() {
+		defer func() {
+			rec := recover()
+			if rec == nil {
+				return
+			}
+			s := getState()
+			if s == nil || !s.cfg.EnableRuntimeHooks {
+				return
+			}
+			// Post from a separate goroutine so a slow endpoint can't delay this teardown.
+			go postPanicSignal(s.cfg, rec)
+		}()
+		fn()
+	}()
+}
+
+// postPanicSignal makes one best-effort attempt to post a "runtime" signal for a
+// recovered panic; the post error is deliberately dropped. It builds its own
+// background context with signalPostTimeout instead of taking a request context.
+func postPanicSignal(cfg Config, recovered any) {
+	sig := Signal{
+		Type:     "runtime",
+		Service:  cfg.Service,
+		Env:      cfg.Env,
+		Severity: "critical",
+		Reason:   "panic",
+		Source:   "go-sdk",
+		Metadata: map[string]any{"subtype": "panic"},
+	}
+	if recovered != nil {
+		sig.Reason = "panic: " + sanitizeReason(recovered)
+	}
+	sig.Message = sig.Reason
+	ctx, cancel := context.WithTimeout(context.Background(), signalPostTimeout)
+	defer cancel()
+	_ = postSignalWithConfig(ctx, cfg, sig)
+}
diff --git a/pkg/waylog/v2/runtime_test.go b/pkg/waylog/v2/runtime_test.go
new file mode 100644
index 0000000..c08bc58
--- /dev/null
+++ b/pkg/waylog/v2/runtime_test.go
@@ -0,0 +1,66 @@
+package waylogv2
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+// signalRecorder is an httptest server that decodes posted signals onto a channel.
+func signalRecorder(t *testing.T) (*httptest.Server, chan Signal) {
+	t.Helper()
+	ch := make(chan Signal, 4)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/v1/signals" {
+			var sig Signal
+			_ = json.NewDecoder(r.Body).Decode(&sig)
+			ch <- sig
+		}
+		w.WriteHeader(http.StatusCreated)
+	}))
+	t.Cleanup(srv.Close)
+	return srv, ch
+}
+
+func TestSafeGoRecoversPanicAndPostsSignal(t *testing.T) {
+	t.Cleanup(resetForTest)
+	srv, ch := signalRecorder(t)
+	if err := Init(Config{Service: "checkout", Env: "demo", IngestURL: srv.URL, APIKey: "k", EnableRuntimeHooks: true}); err != nil {
+		t.Fatalf("Init: %v", err)
+	}
+
+	SafeGo(func() { panic("boom") })
+
+	select {
+	case sig := <-ch:
+		if sig.Type != "runtime" || sig.Source != "go-sdk" {
+			t.Errorf("type/source = %q/%q, want runtime/go-sdk", sig.Type, sig.Source)
+		}
+		if sig.Service != "checkout" || sig.Env != "demo" {
+			t.Errorf("service/env = %q/%q, want checkout/demo", sig.Service, sig.Env)
+		}
+		if sig.Metadata["subtype"] != "panic" {
+			t.Errorf("subtype = %v, want panic", sig.Metadata["subtype"])
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for SafeGo panic signal")
+	}
+}
+
+func TestSafeGoNoSignalWhenHooksDisabled(t *testing.T) {
+	t.Cleanup(resetForTest)
+	srv, ch := signalRecorder(t)
+	if err := Init(Config{Service: "checkout", Env: "demo", IngestURL: srv.URL, APIKey: "k"}); err != nil {
+		t.Fatalf("Init: %v", err)
+	}
+
+	SafeGo(func() { panic("boom") })
+
+	select {
+	case sig := <-ch:
+		t.Fatalf("unexpected signal with hooks disabled: %+v", sig)
+	case <-time.After(250 * time.Millisecond):
+	}
+}
diff --git a/pkg/waylog/v2/sdk_test.go b/pkg/waylog/v2/sdk_test.go
index c52ce5b..011c968 100644
--- a/pkg/waylog/v2/sdk_test.go
+++ b/pkg/waylog/v2/sdk_test.go
@@ -311,7 +311,7 @@ func TestLifecyclePanicAndTimeoutPreserveExistingExplicitFailAnchor(t *testing.T
 		finalize  func(context.Context) (*eventv2.Event, error)
 		wantState eventv2.Status
 	}{
-		{name: "panic", finalize: FinalizePanic, wantState: eventv2.StatusError},
+		{name: "panic", finalize: func(ctx context.Context) (*eventv2.Event, error) { return FinalizePanic(ctx) }, wantState: eventv2.StatusError},
 		{name: "timeout", finalize: FinalizeTimeout, wantState: eventv2.StatusTimeout},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
diff --git a/pkg/waylog/v2/signal.go b/pkg/waylog/v2/signal.go
new file mode 100644
index 0000000..b160b54
--- /dev/null
+++ b/pkg/waylog/v2/signal.go
@@ -0,0 +1,141 @@
+package waylogv2
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+const (
+	maxSignalReasonLen  = 512
+	maxSignalMessageLen = 4096
+)
+
+// signalClient is shared across signal posts. Signals are rare, so a small
+// pooled client with a short timeout is plenty.
+var signalClient = &http.Client{Timeout: 5 * time.Second}
+
+// Signal is a production-context signal posted to /v1/signals. The SDK emits
+// "runtime" signals (recovered panics, uncaught errors) so they correlate with
+// incidents during triage. The server requires Type, Service, Env, Source,
+// Reason and Timestamp; PostSignal fills only Service, Env and Timestamp when
+// unset, so callers must supply Type, Source and Reason themselves.
+type Signal struct {
+	Type      string         `json:"type"`
+	Service   string         `json:"service"`
+	Env       string         `json:"env"`
+	Severity  string         `json:"severity,omitempty"`
+	Reason    string         `json:"reason,omitempty"`
+	Message   string         `json:"message,omitempty"`
+	Source    string         `json:"source,omitempty"`
+	Timestamp time.Time      `json:"timestamp"`
+	Metadata  map[string]any `json:"metadata,omitempty"`
+}
+
+// PostSignal delivers sig to the configured ingest server and returns any
+// marshal, transport or non-2xx status error. It returns nil without sending
+// when the SDK is not initialized or no SignalURL/IngestURL is configured.
+// Blank Service, Env and Timestamp default to the SDK config and the current
+// time. The call is synchronous, so run it off any latency-sensitive path.
+func PostSignal(ctx context.Context, sig Signal) error {
+	if st := getState(); st != nil {
+		return postSignalWithConfig(ctx, st.cfg, sig)
+	}
+	return nil
+}
+
+// postSignalWithConfig defaults and caps sig, then POSTs it as JSON to cfg's signal URL.
+func postSignalWithConfig(ctx context.Context, cfg Config, sig Signal) error {
+	target := signalURL(cfg)
+	if target == "" {
+		return nil
+	}
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if sig.Service == "" {
+		sig.Service = cfg.Service
+	}
+	if sig.Env == "" {
+		sig.Env = cfg.Env
+	}
+	if sig.Timestamp.IsZero() {
+		sig.Timestamp = time.Now().UTC()
+	}
+	// The only place the size cap is enforced (parity with the TS SDK transport).
+	sig.Reason = boundString(sig.Reason, maxSignalReasonLen)
+	sig.Message = boundString(sig.Message, maxSignalMessageLen)
+
+	payload, err := json.Marshal(sig)
+	if err != nil {
+		return fmt.Errorf("waylog: marshal signal: %w", err)
+	}
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, target, bytes.NewReader(payload))
+	if err != nil {
+		return fmt.Errorf("waylog: build signal request: %w", err)
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+	if cfg.APIKey != "" {
+		httpReq.Header.Set("Authorization", "Bearer "+cfg.APIKey)
+	}
+	res, err := signalClient.Do(httpReq)
+	if err != nil {
+		return fmt.Errorf("waylog: post signal: %w", err)
+	}
+	defer res.Body.Close()
+	// Drain the body so the transport can reuse the connection.
+	_, _ = io.Copy(io.Discard, res.Body)
+	if code := res.StatusCode; code < 200 || code >= 300 {
+		return fmt.Errorf("waylog: signal endpoint returned HTTP %d", code)
+	}
+	return nil
+}
+
+// signalURL picks the endpoint signals are posted to. A non-empty SignalURL is
+// returned verbatim. Otherwise IngestURL's path loses trailing slashes and a
+// trailing /v1/events, gains /v1/signals, and keeps its query string. An empty
+// or unparseable IngestURL yields "".
+func signalURL(cfg Config) string {
+	switch {
+	case cfg.SignalURL != "":
+		return cfg.SignalURL
+	case cfg.IngestURL == "":
+		return ""
+	}
+	parsed, err := url.Parse(cfg.IngestURL)
+	if err != nil {
+		return ""
+	}
+	base := strings.TrimRight(parsed.Path, "/")
+	// TrimSuffix leaves base untouched when the suffix is absent.
+	base = strings.TrimSuffix(base, "/v1/events")
+	parsed.Path = base + "/v1/signals"
+	return parsed.String()
+}
+
+// sanitizeReason renders a recovered panic value with default formatting and
+// strips surrounding whitespace. It does not limit length; the cap is
+// applied centrally in postSignalWithConfig. Redaction is not performed here.
+func sanitizeReason(v any) string {
+	return strings.TrimSpace(fmt.Sprint(v))
+}
+
+// boundString truncates s to at most n bytes without splitting a multibyte
+// UTF-8 rune. When byte n is a continuation byte (10xxxxxx) the cut would land
+// mid-rune, so the cut point backs up to that rune's lead byte and the whole
+// rune is dropped. Strings already within the limit are returned unchanged.
+func boundString(s string, n int) string {
+	cut := n
+	if len(s) <= cut {
+		return s
+	}
+	for ; cut > 0 && s[cut]&0xC0 == 0x80; cut-- {
+	}
+	return s[:cut]
+}
diff --git a/pkg/waylog/v2/signal_test.go b/pkg/waylog/v2/signal_test.go
new file mode 100644
index 0000000..757ca7f
--- /dev/null
+++ b/pkg/waylog/v2/signal_test.go
@@ -0,0 +1,159 @@
+package waylogv2
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"testing"
+	"unicode/utf8"
+)
+
+func TestPostSignalPostsToSignalsEndpoint(t *testing.T) {
+	t.Cleanup(resetForTest)
+
+	var (
+		mu      sync.Mutex
+		gotPath string
+		gotAuth string
+		gotBody Signal
+	)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		mu.Lock()
+		defer mu.Unlock()
+		gotPath = r.URL.Path
+		gotAuth = r.Header.Get("Authorization")
+		_ = json.NewDecoder(r.Body).Decode(&gotBody)
+		w.WriteHeader(http.StatusCreated)
+	}))
+	defer srv.Close()
+
+	if err := Init(Config{Service: "checkout", Env: "demo", IngestURL: srv.URL, APIKey: "k1", EnableRuntimeHooks: true}); err != nil {
+		t.Fatalf("Init: %v", err)
+	}
+
+	err := PostSignal(context.Background(), Signal{
+		Type:     "runtime",
+		Severity: "critical",
+		Reason:   "panic: boom",
+		Source:   "go-sdk",
+		Metadata: map[string]any{"subtype": "panic"},
+	})
+	if err != nil {
+		t.Fatalf("PostSignal: %v", err)
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+	if gotPath != "/v1/signals" {
+		t.Errorf("path = %q, want /v1/signals", gotPath)
+	}
+	if gotAuth != "Bearer k1" {
+		t.Errorf("auth = %q, want Bearer k1", gotAuth)
+	}
+	if gotBody.Service != "checkout" || gotBody.Env != "demo" {
+		t.Errorf("service/env not filled from config: %q/%q", gotBody.Service, gotBody.Env)
+	}
+	if gotBody.Timestamp.IsZero() {
+		t.Error("timestamp not auto-filled (server requires it)")
+	}
+	if gotBody.Metadata["subtype"] != "panic" {
+		t.Errorf("subtype = %v, want panic", gotBody.Metadata["subtype"])
+	}
+}
+
+// Parity with the TS SDK (which truncates reason→512 and message→4096 in its
+// signal transport): PostSignal itself must bound both fields for every
+// signal, because sanitizeReason (the panic path) applies no length cap.
+func TestPostSignalBoundsReasonAndMessage(t *testing.T) {
+	t.Cleanup(resetForTest)
+
+	var (
+		mu      sync.Mutex
+		gotBody Signal
+	)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		mu.Lock()
+		defer mu.Unlock()
+		_ = json.NewDecoder(r.Body).Decode(&gotBody)
+		w.WriteHeader(http.StatusCreated)
+	}))
+	defer srv.Close()
+
+	if err := Init(Config{Service: "checkout", Env: "demo", IngestURL: srv.URL}); err != nil {
+		t.Fatalf("Init: %v", err)
+	}
+
+	err := PostSignal(context.Background(), Signal{
+		Type:    "runtime",
+		Source:  "go-sdk",
+		Reason:  strings.Repeat("r", maxSignalReasonLen+500),
+		Message: strings.Repeat("m", maxSignalMessageLen+500),
+	})
+	if err != nil {
+		t.Fatalf("PostSignal: %v", err)
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+	if len(gotBody.Reason) != maxSignalReasonLen {
+		t.Errorf("reason len = %d, want bounded to %d", len(gotBody.Reason), maxSignalReasonLen)
+	}
+	if len(gotBody.Message) != maxSignalMessageLen {
+		t.Errorf("message len = %d, want bounded to %d", len(gotBody.Message), maxSignalMessageLen)
+	}
+}
+
+// boundString must cap on a rune boundary so a truncated reason/message never
+// ships split UTF-8 (which json.Marshal would otherwise replace with U+FFFD).
+func TestBoundStringKeepsRuneBoundary(t *testing.T) {
+	s := strings.Repeat("→", 5) // U+2192 = 3 bytes each → 15 bytes
+	// Cap at 7 bytes: lands inside the 3rd rune (bytes 6,7,8); must step back to 6.
+	got := boundString(s, 7)
+	if len(got) != 6 {
+		t.Fatalf("expected step-back to 6 bytes, got %d (%q)", len(got), got)
+	}
+	if !utf8.ValidString(got) {
+		t.Fatalf("boundString produced invalid UTF-8: %q", got)
+	}
+	if strings.ContainsRune(got, '�') {
+		t.Fatalf("boundString produced a replacement char: %q", got)
+	}
+	if asciiCap := boundString(strings.Repeat("a", 10), 4); asciiCap != "aaaa" {
+		t.Fatalf("ascii cap: got %q, want aaaa", asciiCap)
+	}
+	if noop := boundString("short", 100); noop != "short" {
+		t.Fatalf("under-cap must be unchanged: got %q", noop)
+	}
+}
+
+func TestPostSignalNoOpWhenUninitialized(t *testing.T) {
+	resetForTest()
+	if err := PostSignal(context.Background(), Signal{Type: "runtime", Source: "go-sdk", Reason: "x"}); err != nil {
+		t.Fatalf("PostSignal with no SDK should be a no-op, got %v", err)
+	}
+}
+
+func TestSignalURLDerivation(t *testing.T) {
+	cases := []struct {
+		name string
+		cfg  Config
+		want string
+	}{
+		{"bare host", Config{IngestURL: "http://localhost:8080"}, "http://localhost:8080/v1/signals"},
+		{"events path", Config{IngestURL: "http://localhost:8080/v1/events"}, "http://localhost:8080/v1/signals"},
+		{"trailing slash", Config{IngestURL: "http://localhost:8080/"}, "http://localhost:8080/v1/signals"},
+		{"preserve query", Config{IngestURL: "http://h/v1/events?x=1"}, "http://h/v1/signals?x=1"},
+		{"override wins with empty ingest", Config{SignalURL: "http://other/v1/signals"}, "http://other/v1/signals"},
+		{"empty both", Config{}, ""},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := signalURL(tc.cfg); got != tc.want {
+				t.Errorf("signalURL = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/waylog/v2/waylog.go b/pkg/waylog/v2/waylog.go
index 9fa18ab..bcc8e77 100644
--- a/pkg/waylog/v2/waylog.go
+++ b/pkg/waylog/v2/waylog.go
@@ -50,6 +50,16 @@ type Config struct {
 	MaxEventsPerSec  int
 
 	Redactor func(F) F
+
+	// EnableRuntimeHooks turns on runtime signal reporting: recovered panics
+	// (via the HTTP middleware and SafeGo) post a "runtime" signal to
+	// /v1/signals so they correlate with incidents during triage. Off by default.
+	EnableRuntimeHooks bool
+
+	// SignalURL overrides the signal endpoint. When empty, signals are posted to
+	// the IngestURL host's /v1/signals path. Set this only when signals go to a
+	// different host than events.
+	SignalURL string
 }
 
 const (
diff --git a/scripts/check-rollup-contract.sh b/scripts/check-rollup-contract.sh
deleted file mode 100755
index 97ca2c3..0000000
--- a/scripts/check-rollup-contract.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env bash
-# check-rollup-contract.sh — enforce the canonical rollup contract:
-# default user-facing surfaces (top errors, overview KPIs, compare_windows,
-# spike detection, failure_patterns) must consume analysis.RollupWindow,
-# not the propagation-counted store.SummarizeWindow / analysis.DiffSummaries
-# / analysis.DetectFailurePatternsFromSummary.
-#
-# The allow-list below pins every legitimate reference. Any NEW mention of
-# these propagation-counted APIs outside the allow-list fails CI — that's
-# how we prevent the PMT_502=9-not-3 cascade-amplification bug from
-# coming back.
-#
-# If you are adding a NEW detail surface that genuinely needs propagation
-# spread (trace stories, blast radius, failure chains), bind the result
-# to a variable named `propagationSummary` and extend the allow-list with
-# a short justification.
-set -uo pipefail
-
-ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-cd "$ROOT"
-
-# Files allowed to reference propagation-counted APIs.
-allowlist=(
-  "internal/graph/store/summaries.go"            # defines WindowSummary/SummarizeWindow
-  "internal/graph/store/summaries_test.go"       # tests the definition
-  "internal/graph/analysis/diff.go"              # defines DiffSummaries alongside DiffRollups
-  "internal/graph/analysis/diff_test.go"         # tests the definition
-  "internal/graph/analysis/patterns.go"          # defines DetectFailurePatternsFromSummary
-  "internal/graph/analysis/rollup.go"            # doc contract referencing both paths
-  "internal/tools/store.go"                      # interface surface (preserved)
-  "internal/tools/failures_test.go"              # test stub implements interface
-  "internal/ingest/handler.go"                   # frozenStore delegator for tools.Store
-)
-
-is_allowed() {
-  local path="$1"
-  for allowed in "${allowlist[@]}"; do
-    if [ "$path" = "$allowed" ]; then
-      return 0
-    fi
-  done
-  return 1
-}
-
-violations=0
-pattern='\b(SummarizeWindow|DiffSummaries|DetectFailurePatternsFromSummary)\b'
-
-while IFS= read -r path; do
-  # Skip worktree scratch dirs and vendored code.
-  case "$path" in
-    .claude/*|.git/*|vendor/*) continue ;;
-  esac
-  if ! is_allowed "$path"; then
-    matches=$(grep -nE "$pattern" "$path" || true)
-    if [ -n "$matches" ]; then
-      echo "VIOLATION: $path references propagation-counted rollup API"
-      echo "$matches" | sed 's/^/  /'
-      violations=1
-    fi
-  fi
-done < <(find . -type f -name '*.go' \
-  -not -path './.claude/*' \
-  -not -path './.git/*' \
-  -not -path './vendor/*' \
-  | sed 's|^\./||' \
-  | sort)
-
-if [ "$violations" -ne 0 ]; then
-  echo ""
-  echo "FAIL: default rollups must consume analysis.RollupWindow."
-  echo "See internal/graph/analysis/rollup.go for the contract."
-  echo "If this is a legitimate detail surface, extend the allow-list in"
-  echo "scripts/check-rollup-contract.sh with a short justification."
-  exit 1
-fi
-
-echo "OK: rollup contract honored"
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index 558eee8..9358f07 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -101,7 +101,7 @@ type blastResponse struct {
 
 func main() {
 	if len(os.Args) < 2 {
-		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json  [arg]")
+		fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json  [arg]")
 		os.Exit(2)
 	}
 
@@ -142,6 +142,10 @@ func main() {
 		}
 	case "first-incident-id":
 		fmt.Println(firstIncidentID(body))
+	case "active-incident-ids":
+		for _, id := range activeIncidentIDs(body) {
+			fmt.Println(id)
+		}
 	case "triage-report-hash":
 		fmt.Println(triageReportHash(body))
 	case "plan-triage-report-hash":
@@ -297,6 +301,20 @@ func firstIncidentID(body []byte) string {
 	return ""
 }
 
+func activeIncidentIDs(body []byte) []string {
+	var parsed incidentsResponse
+	if json.Unmarshal(body, &parsed) != nil {
+		return nil
+	}
+	active := make([]string, 0, len(parsed.Incidents)) // non-nil even when nothing matches
+	for _, item := range parsed.Incidents {
+		if item.Status == "active" && item.IncidentID != "" {
+			active = append(active, item.IncidentID)
+		}
+	}
+	return active
+}
+
 func incidentCauseIsDependency(body []byte, incidentID string) bool {
 	var resp incidentsResponse
 	if err := json.Unmarshal(body, &resp); err != nil {
diff --git a/scripts/demo-acceptance-json/main_test.go b/scripts/demo-acceptance-json/main_test.go
index 6bad1d8..02fdcbb 100644
--- a/scripts/demo-acceptance-json/main_test.go
+++ b/scripts/demo-acceptance-json/main_test.go
@@ -22,6 +22,28 @@ func TestDependencyIncidentHelpers(t *testing.T) {
 	}
 }
 
+func TestActiveIncidentIDs(t *testing.T) {
+	body := []byte(`{"incidents":[
+		{"incident_id":"inc_a","status":"active"},
+		{"incident_id":"inc_b","status":"resolved"},
+		{"incident_id":"inc_c","status":"active"},
+		{"incident_id":"","status":"active"}
+	]}`)
+	got := activeIncidentIDs(body)
+	want := []string{"inc_a", "inc_c"}
+	if len(got) != len(want) {
+		t.Fatalf("activeIncidentIDs len = %d (%v), want %d (%v)", len(got), got, len(want), want)
+	}
+	for i, id := range want {
+		if got[i] != id {
+			t.Fatalf("activeIncidentIDs[%d] = %q, want %q", i, got[i], id)
+		}
+	}
+	if got := activeIncidentIDs([]byte(`{not-json`)); got != nil {
+		t.Fatalf("malformed input should return nil, got %v", got)
+	}
+}
+
 func TestTriageReportHash(t *testing.T) {
 	body := []byte(`{"schema_version":"triage.v1","incident_ref":{"id":"inc_x"},"confidence":"medium","generated_at":"t","report_hash":"sha256:deadbeef"}`)
 	if got := triageReportHash(body); got != "sha256:deadbeef" {
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index 9e78dcc..1e2096a 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -8,6 +8,7 @@ WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
 REQUESTS="${REQUESTS:-20}"
 CONCURRENCY="${CONCURRENCY:-5}"
 TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
+WAYLOG_DEMO_EXPECT_ALERTS="${WAYLOG_DEMO_EXPECT_ALERTS:-1}"
 CLI_BIN="${WAYLOG_CLI_BIN:-}"
 JSON_BIN="${WAYLOG_JSON_HELPER_BIN:-}"
 
@@ -44,6 +45,10 @@ json_first_incident_id() {
   "$JSON_BIN" first-incident-id
 }
 
+json_active_incident_ids() {
+  "$JSON_BIN" active-incident-ids
+}
+
 json_triage_report_hash() {
   "$JSON_BIN" triage-report-hash
 }
@@ -145,6 +150,65 @@ incident_id="$(json_first_incident_id <<<"$incidents_json")"
 "${CLI[@]}" --json incident "$incident_id" >/dev/null || fail "waylog incident failed for incident $incident_id"
 echo "PASS: waylog incident"
 
+# --- v1.0 incident evidence: propagation.latest + blast.latest (+ alerts.latest after A2) ---
+# MVP gate: at least one active incident must have the current expected
+# evidence snapshots captured successfully. The demo auto-fire loop can create
+# fresh active incidents while acceptance is running, so requiring every active
+# incident to be fully captured is intentionally too strict.
+# Pin all evidence/snapshot/triage assertions to the PMT_502 dependency incident
+# (incident_id above). The deterministic checkout panic opens a separate
+# checkout:request:WAYLOG_PANIC incident, but its runtime signal correlates by
+# service+env onto this dependency incident too, so this incident must carry BOTH
+# infra (oom_killed) and app (panic) runtime evidence.
+evidence_ok=0
+runtime_subtypes=""
+inc_detail=""
+prop_status="missing"
+blast_status="missing"
+alert_status="missing"
+alert_ids=""
+runtime_status="missing"
+ev_kinds=""
+ev_runtime_subtypes=""
+for _ in $(seq 1 20); do
+  inc_detail="$(curl -fsS -H "Authorization: Bearer ${WAYLOG_READ_KEY}" "${INGEST_URL}/v1/incidents/${incident_id}")" || fail "GET /v1/incidents/${incident_id} failed"
+  prop_status="$(echo "$inc_detail" | jq -r '.incident.propagation.latest.capture_status // "missing"')"
+  blast_status="$(echo "$inc_detail" | jq -r '.incident.blast.latest.capture_status // "missing"')"
+  alert_status="$(echo "$inc_detail" | jq -r '.incident.alerts.latest.capture_status // "missing"')"
+  alert_ids="$(echo "$inc_detail" | jq -r '[.incident.alerts.latest.matches[]?.alert_id // empty] | unique | join(" ")')"
+  runtime_status="$(echo "$inc_detail" | jq -r '.incident.runtime.latest.capture_status // "missing"')"
+  runtime_subtypes="$(echo "$inc_detail" | jq -r '[.incident.runtime.matches[]?.subtype // empty] | unique | join(" ")')"
+  has_infra=0; case " $runtime_subtypes " in *" oom_killed "*|*" crashloop "*) has_infra=1 ;; esac
+  has_app=0; case " $runtime_subtypes " in *" panic "*|*" unhandled_rejection "*) has_app=1 ;; esac
+  has_expected_alert=0; case " $alert_ids " in *" ${alert_id} "*) has_expected_alert=1 ;; esac
+  # Flat evidence[] rows must carry the same evidence kinds as the snapshots, so
+  # the API/dashboard/report surfaces cannot drift apart silently. Snapshot checks
+  # above prove capture; these prove the same evidence reached the incident's flat
+  # evidence list (what the triage report and CLI render from).
+  ev_kinds="$(echo "$inc_detail" | jq -r '[.incident.evidence[]?.kind] | unique | join(" ")')"
+  ev_runtime_subtypes="$(echo "$inc_detail" | jq -r '[.incident.evidence[]? | select(.kind == "runtime") | .fields.subtype // empty] | unique | join(" ")')"
+  row_trace=0; case " $ev_kinds " in *" trace "*) row_trace=1 ;; esac
+  row_signal=0; case " $ev_kinds " in *" signal "*) row_signal=1 ;; esac
+  row_infra=0; case " $ev_runtime_subtypes " in *" oom_killed "*|*" crashloop "*) row_infra=1 ;; esac
+  row_app=0; case " $ev_runtime_subtypes " in *" panic "*|*" unhandled_rejection "*) row_app=1 ;; esac
+  if [[ "$prop_status" == "ok" && "$blast_status" == "ok" && "$runtime_status" == "ok" && "$has_infra" -eq 1 && "$has_app" -eq 1 \
+        && "$row_trace" -eq 1 && "$row_infra" -eq 1 && "$row_app" -eq 1 ]] \
+     && { [[ "$WAYLOG_DEMO_EXPECT_ALERTS" != "1" ]] || [[ "$alert_status" == "ok" && "$has_expected_alert" -eq 1 && "$row_signal" -eq 1 ]]; }; then
+    evidence_ok=1
+    break
+  fi
+  sleep 5
+done
+# jq's `//` only substitutes for null/false, and a string interpolated from a
+# null snapshot is still a string, so test for null explicitly or MISSING never prints.
+echo "  ${incident_id} propagation: $(echo "$inc_detail" | jq -r '.incident.propagation.latest | if . == null then "MISSING" else "origin=\(.origin_service)/\(.origin_step) status=\(.capture_status)" end')"
+echo "  ${incident_id} blast:       $(echo "$inc_detail" | jq -r '.incident.blast.latest | if . == null then "MISSING" else "req=\(.affected_requests) svc=\(.affected_services) users=\(.affected_users // 0) status=\(.capture_status)" end')"
+echo "  ${incident_id} runtime:     status=${runtime_status} subtypes=[${runtime_subtypes}]"
+echo "  ${incident_id} evidence[]:  kinds=[${ev_kinds}] runtime_subtypes=[${ev_runtime_subtypes}]"
+[[ "$WAYLOG_DEMO_EXPECT_ALERTS" != "1" ]] || echo "  ${incident_id} alerts:      $(echo "$inc_detail" | jq -r '.incident.alerts.latest | if . == null then "MISSING" else "matches=\(.matches | length) status=\(.capture_status)" end') ids=[${alert_ids}]"
+[[ "$evidence_ok" -eq 1 ]] || fail "incident ${incident_id} did not reach propagation+blast+alerts ok AND flat evidence[] rows (trace + signal + runtime infra + runtime app) with expected alert ${alert_id} and BOTH infra (oom_killed) and app (panic) runtime evidence; got snapshot runtime subtypes=[${runtime_subtypes}] evidence[] kinds=[${ev_kinds}] evidence[] runtime subtypes=[${ev_runtime_subtypes}] alert ids=[${alert_ids}]"
+echo "PASS: incident ${incident_id} evidence (snapshots propagation+blast+alerts ok + runtime infra+app; flat evidence[] kinds=[${ev_kinds}])"
+
 snapshot="$("${CLI[@]}" incident "$incident_id" --snapshot)" || fail "waylog incident snapshot failed for incident $incident_id"
 [[ "$snapshot" == *"payment.charge"* ]] || fail "incident snapshot did not mention payment.charge"
 echo "PASS: waylog incident snapshot"
@@ -168,4 +232,9 @@ grep -q "$hash_a" /tmp/waylog-demo-triage-report.md || fail "triage markdown rep
 grep -q "$alert_id" /tmp/waylog-demo-triage-report.md || fail "triage markdown report did not cite alert evidence"
 echo "PASS: triage markdown report cites alert evidence"
 
+# Runtime evidence must reach the deterministic report too, not just the API/dashboard.
+grep -qiE 'oom' /tmp/waylog-demo-triage-report.md || fail "triage report did not cite infra runtime (oom) evidence"
+grep -qiE 'panic' /tmp/waylog-demo-triage-report.md || fail "triage report did not cite app runtime (panic) evidence"
+echo "PASS: triage report cites infra (oom) and app (panic) runtime evidence"
+
 echo "Demo acceptance passed."
diff --git a/scripts/demo-fire-burst.sh b/scripts/demo-fire-burst.sh
new file mode 100755
index 0000000..0fa84f8
--- /dev/null
+++ b/scripts/demo-fire-burst.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
+INGEST_URL="${INGEST_URL:-http://localhost:8080}"
+WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
+REQUESTS="${REQUESTS:-20}"
+CONCURRENCY="${CONCURRENCY:-5}"
+ALERT_TIMESTAMP="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+
+curl -fsS -X POST "${GATEWAY_URL}/demo/burst" \
+  -H "Content-Type: application/json" \
+  --data "{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}" \
+  >/dev/null
+
+curl -fsS -X POST "${INGEST_URL}/v1/alerts" \
+  -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+  -H "Content-Type: application/json" \
+  --data "{\"receiver\":\"crux-demo\",\"status\":\"firing\",\"alerts\":[{\"status\":\"firing\",\"labels\":{\"alertname\":\"CheckoutPaymentFailure\",\"service\":\"checkout\",\"step\":\"payment.charge\",\"env\":\"demo\",\"severity\":\"critical\",\"error_code\":\"PMT_502\"},\"annotations\":{\"summary\":\"PMT_502 spike in checkout payment flow\",\"description\":\"Synthetic Crux demo alert for checkout payment failures\",\"runbook_url\":\"http://localhost:9081/demo\"},\"startsAt\":\"${ALERT_TIMESTAMP}\",\"generatorURL\":\"http://localhost:9081/demo\"}],\"commonLabels\":{\"alertname\":\"CheckoutPaymentFailure\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"error_code\":\"PMT_502\"},\"commonAnnotations\":{\"summary\":\"PMT_502 spike in checkout payment flow\"}}" \
+  >/dev/null
+
+# Infra runtime evidence: a K8s OOMKill on checkout. env MUST be "demo" or the
+# incident engine filters it out before correlation. Targets checkout so it
+# lands on the same incident as the alert/dependency evidence above.
+curl -fsS -X POST "${INGEST_URL}/v1/signals" \
+  -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+  -H "Content-Type: application/json" \
+  --data "{\"type\":\"runtime\",\"source\":\"k8s-demo\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"reason\":\"OOMKilled\",\"message\":\"Container checkout killed by OOM (limit: 256Mi, usage: 312Mi).\",\"resource\":{\"service\":\"checkout\",\"container\":\"checkout\"},\"metadata\":{\"subtype\":\"oom_killed\",\"pod\":\"checkout-7f8b9c-x2k\",\"container\":\"checkout\"},\"timestamp\":\"${ALERT_TIMESTAMP}\"}" \
+  >/dev/null
+
+echo "burst fired: demo deploy/dependency/runtime signals + alert + ${REQUESTS} payment_502 requests (${CONCURRENCY} concurrency)"
diff --git a/scripts/demo.sh b/scripts/demo.sh
index 6009381..0bf61db 100755
--- a/scripts/demo.sh
+++ b/scripts/demo.sh
@@ -31,11 +31,9 @@ else
   export WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
 fi
 export WAYLOG_PROFILE="${WAYLOG_PROFILE:-demo}"
-export WAYLOG_V2_READS="${WAYLOG_V2_READS:-true}"
 export WAYLOG_INCIDENT_TICK_INTERVAL="${WAYLOG_INCIDENT_TICK_INTERVAL:-5s}"
 export EVENT_LOG_DIR="${EVENT_LOG_DIR:-${STATE_DIR}/eventlog}"
 export EVENT_LOG_V2_DIR="${EVENT_LOG_V2_DIR:-${STATE_DIR}/eventlog-v2}"
-export SNAPSHOT_PATH="${SNAPSHOT_PATH:-${STATE_DIR}/graph_snapshot.json}"
 export SQLITE_PATH="${SQLITE_PATH:-${STATE_DIR}/waylog.db}"
 
 start() {
@@ -135,30 +133,43 @@ wait_for_tcp 127.0.0.1 9082 checkout-demo
 start api-gateway "${BIN_DIR}/api-gateway"
 wait_for_http "http://localhost:9081/demo" "api-gateway"
 
+# Auto-fire loop: produces a fresh burst every DEMO_FIRE_INTERVAL_SEC.
+# Opt out with DEMO_AUTO_FIRE=0 before running `make demo`.
+auto_fire_msg="Auto-fire disabled. Run ./scripts/demo-fire-burst.sh for an immediate incident."
+if [[ "${DEMO_AUTO_FIRE:-1}" == "1" ]]; then
+  export DEMO_FIRE_INTERVAL_SEC="${DEMO_FIRE_INTERVAL_SEC:-60}"
+  start auto-fire bash -c '
+    set -euo pipefail
+    sleep 5
+    while true; do
+      "'"${ROOT}"'"/scripts/demo-fire-burst.sh >/dev/null 2>&1 || true
+      sleep "${DEMO_FIRE_INTERVAL_SEC}"
+    done
+  '
+  auto_fire_msg="First incident in ~30 seconds (auto-fire loop active)."
+fi
+
 cat < --snapshot
-
-Logs:
-  ${LOG_DIR}
-
-Stop:
-  make demo-stop
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+  Crux demo
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  Dashboard:  http://localhost:8080/ui/
+  API:        http://localhost:8080
+  Demo UI:    http://localhost:9081/demo
+
+  ${auto_fire_msg}
+  Set DEMO_AUTO_FIRE=0 before \`make demo\` to disable.
+
+  Stop:       make demo-stop
+  Acceptance: make demo-acceptance
+  Logs:       ${LOG_DIR}
+
+  Useful CLI:
+    ./waylog incidents
+    ./waylog incident  --snapshot
+    ./waylog errors --window 15m
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 INFO
diff --git a/scripts/micro-demo-smoke.sh b/scripts/micro-demo-smoke.sh
index 7fd535e..6ea01c7 100755
--- a/scripts/micro-demo-smoke.sh
+++ b/scripts/micro-demo-smoke.sh
@@ -98,6 +98,34 @@ check_read "Trace story 404" "${INGEST_URL}/v1/traces/story?trace_id=00000000000
 # Test 10: Trace story 400 for missing param
 check_read "Trace story 400" "${INGEST_URL}/v1/traces/story" "400"
 
+# --- v1.0 incident evidence smoke ---
+# Every active incident must have non-nil propagation.latest and blast.latest
+# after the engine has run at least one tick. Latest captures even on
+# partial/missing status, so the only way these are nil is if the engine
+# never ran or the response shape is broken.
+if command -v jq &>/dev/null; then
+  echo ""
+  echo "=== Incident evidence: propagation.latest + blast.latest ==="
+  active_json=$(curl -fsS -H "Authorization: Bearer ${WAYLOG_READ_KEY}" "${INGEST_URL}/v1/incidents/active" 2>/dev/null || echo '{"incidents":[]}')
+  inc_count=$(echo "$active_json" | jq '.incidents | length' 2>/dev/null || echo 0)
+  if [[ "$inc_count" -eq 0 ]]; then
+    echo "WARN: no active incidents to check (demo may not have triggered failure scenarios yet)"
+  else
+    for inc_id in $(echo "$active_json" | jq -r '.incidents[].incident_id'); do
+      body=$(curl -fsS -H "Authorization: Bearer ${WAYLOG_READ_KEY}" "${INGEST_URL}/v1/incidents/${inc_id}")
+      ok=$(echo "$body" | jq '.incident.blast.latest != null and .incident.propagation.latest != null' 2>/dev/null || echo false)
+      if [[ "$ok" != "true" ]]; then
+        echo "FAIL: incident ${inc_id} missing latest snapshots"
+        echo "  body: $(echo "$body" | jq -c '{propagation:.incident.propagation,blast:.incident.blast}')"
+        failed=$((failed + 1))
+      else
+        echo "PASS: incident ${inc_id} has propagation.latest + blast.latest"
+        passed=$((passed + 1))
+      fi
+    done
+  fi
+fi
+
 echo ""
 echo "=== Results: $passed passed, $failed failed ==="
 
diff --git a/scripts/micro-demo-stop.sh b/scripts/micro-demo-stop.sh
index 37773bb..b1b58f8 100755
--- a/scripts/micro-demo-stop.sh
+++ b/scripts/micro-demo-stop.sh
@@ -7,4 +7,3 @@ pkill -f "go run ./examples/cmd/checkout-demo" >/dev/null 2>&1 || true
 pkill -f "go run ./examples/cmd/payment-demo" >/dev/null 2>&1 || true
 pkill -f "go run ./examples/cmd/db-demo" >/dev/null 2>&1 || true
 pkill -f "go run ./cmd/ingest" >/dev/null 2>&1 || true
-pkill -f "go run ./cmd/waylog-live" >/dev/null 2>&1 || true
diff --git a/scripts/micro-demo.sh b/scripts/micro-demo.sh
index 8ac28d3..366d693 100755
--- a/scripts/micro-demo.sh
+++ b/scripts/micro-demo.sh
@@ -4,7 +4,7 @@ set -euo pipefail
 GOCACHE_DIR="${GOCACHE:-/tmp/go-build}"
 export GOCACHE="$GOCACHE_DIR"
 
-# v2 demo path: Kafka and cmd/bridge are intentionally unused here.
+# v2 demo path: no Kafka or bridge process — HTTP/OTLP ingest only.
 unset KAFKA_BROKERS
 
 export INGEST_ADDR="${INGEST_ADDR:-:8080}"
@@ -12,7 +12,6 @@ export INGEST_URL="${INGEST_URL:-http://localhost:8080}"
 export WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
 export WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
 export DASHBOARD_AUTH="${DASHBOARD_AUTH:-key:demo}"
-export WAYLOG_V2_READS="${WAYLOG_V2_READS:-true}"
 export EVENT_LOG_V2_DIR="${EVENT_LOG_V2_DIR:-./data/eventlog-v2-demo}"
 
 pids=()
diff --git a/scripts/ratelimit-smoke.sh b/scripts/ratelimit-smoke.sh
new file mode 100755
index 0000000..e01fc1a
--- /dev/null
+++ b/scripts/ratelimit-smoke.sh
@@ -0,0 +1,96 @@
+#!/bin/sh
+# Rate-limit smoke: boots a throwaway ingest with a 5 rps write limit, floods
+# /v1/events, and verifies 429 + Retry-After, per-key isolation, and recovery.
+#
+# Env: RATELIMIT_SMOKE_PORT overrides the listen port (default 18099).
+# Requires go and curl on PATH and a sleep(1) that accepts fractional seconds.
+set -eu
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+PORT="${RATELIMIT_SMOKE_PORT:-18099}"
+ADDR="127.0.0.1:$PORT"
+TMP="$(mktemp -d)"
+SRV_PID=""
+# cleanup stops the throwaway server (if one was started) and removes the
+# scratch directory. It waits for the server to exit first so the process is
+# no longer writing under $TMP while the directory is being deleted.
+cleanup() {
+  if [ -n "$SRV_PID" ]; then
+    kill "$SRV_PID" 2>/dev/null || true
+    wait "$SRV_PID" 2>/dev/null || true
+  fi
+  rm -rf "$TMP"
+}
+trap cleanup EXIT
+
+go build -o "$TMP/ingest" ./cmd/ingest
+
+WAYLOG_RATE_LIMIT_WRITE_RPS=5 \
+INGEST_ADDR="$ADDR" \
+WAYLOG_WRITE_KEY=loadkey \
+EVENT_LOG_DIR="$TMP/eventlog" \
+EVENT_LOG_SYNC=false \
+"$TMP/ingest" >"$TMP/ingest.log" 2>&1 &
+SRV_PID=$!
+
+# Poll /readyz for up to ~10s (50 x 0.2s); give up early if the server died.
+ready=0
+i=0
+while [ "$i" -lt 50 ]; do
+  if curl -fsS "http://$ADDR/readyz" >/dev/null 2>&1; then ready=1; break; fi
+  kill -0 "$SRV_PID" 2>/dev/null || break
+  sleep 0.2
+  i=$((i + 1))
+done
+[ "$ready" = 1 ] || { echo "FAIL: ingest did not become ready"; cat "$TMP/ingest.log"; exit 1; }
+
+post() { # post <key> -> prints http status code
+  curl -s -o /dev/null -w '%{http_code}' -X POST "http://$ADDR/v1/events" \
+    -H "Authorization: Bearer $1" -H 'Content-Type: application/json' -d '{}'
+}
+
+# Flood: 20 rapid requests against a 5 rps / burst-5 budget.
+codes=""
+i=0
+while [ "$i" -lt 20 ]; do
+  codes="$codes $(post loadkey)"
+  i=$((i + 1))
+done
+n429=$(echo "$codes" | tr ' ' '\n' | grep -c '^429$' || true)
+[ "$n429" -ge 5 ] || { echo "FAIL: expected >=5 throttled requests, codes:$codes"; exit 1; }
+echo "PASS: flood throttled ($n429/20 requests got 429)"
+
+# A throttled response must carry Retry-After: 1. Assuming token-bucket refill
+# at 5 rps (one token per ~200ms), a single probe can slip through unthrottled;
+# keep probing until a 429 is actually observed and read that response's header.
+status=""
+retry_after=""
+i=0
+while [ "$i" -lt 10 ]; do
+  headers=$(curl -s -D - -o /dev/null -X POST "http://$ADDR/v1/events" \
+    -H 'Authorization: Bearer loadkey' -H 'Content-Type: application/json' -d '{}' | tr -d '\r')
+  status=$(printf '%s\n' "$headers" | awk 'NR == 1 { print $2 }')
+  if [ "$status" = "429" ]; then
+    retry_after=$(printf '%s\n' "$headers" | awk 'tolower($1) == "retry-after:" { print $2 }')
+    break
+  fi
+  i=$((i + 1))
+done
+[ "$status" = "429" ] || { echo "FAIL: no 429 observed while probing for Retry-After (last status '$status')"; exit 1; }
+[ "$retry_after" = "1" ] || { echo "FAIL: Retry-After header missing on 429 (got '$retry_after')"; exit 1; }
+echo "PASS: 429 carries Retry-After: 1"
+
+# Per-key isolation: a different presented key must not be throttled
+# (it fails auth with 401, never 429).
+other=$(post otherkey)
+[ "$other" != "429" ] || { echo "FAIL: other key was throttled by loadkey's bucket"; exit 1; }
+echo "PASS: per-key isolation (other key got $other, not 429)"
+
+# Recovery: after >1s the bucket refills and requests pass again.
+sleep 1.5
+recovered=$(post loadkey)
+[ "$recovered" != "429" ] || { echo "FAIL: limiter did not recover after refill window"; exit 1; }
+echo "PASS: clean recovery after refill (got $recovered)"
+
+echo "ratelimit-smoke: all checks passed"
diff --git a/tests/integration/agent_test.go b/tests/integration/agent_test.go
index 618fbea..8c0679d 100644
--- a/tests/integration/agent_test.go
+++ b/tests/integration/agent_test.go
@@ -2,132 +2,9 @@ package integration
 
 import (
 	"net/http"
-	"reflect"
 	"testing"
 )
 
-func TestAgent_TriageWorkflow(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-
-	// Inject failure scenario: 50 healthy + 20 PMT_502 + 10 CHK_TIMEOUT.
-	ingestEvents(t, srv, makeHealthyEvents(50, "api-gateway"))
-	ingestEvents(t, srv, makeFailureEvents(20, "payment-service", "PMT_502"))
-	ingestEvents(t, srv, makeFailureEvents(10, "checkout-service", "CHK_TIMEOUT"))
-
-	// Step 1: graph_insights
-	iw := httpPOST(t, srv.ToolCall, "/v1/tools/graph_insights",
-		map[string]string{"window": "10m"})
-	if iw.Code != http.StatusOK {
-		t.Fatalf("graph_insights: expected 200, got %d: %s", iw.Code, iw.Body.String())
-	}
-	var insights struct {
-		SchemaVersion string `json:"schema_version"`
-		TotalFailures int    `json:"total_failures"`
-	}
-	decodeJSON(t, iw, &insights)
-	if insights.SchemaVersion != "1.0" {
-		t.Errorf("expected schema_version 1.0, got %q", insights.SchemaVersion)
-	}
-	if insights.TotalFailures < 20 {
-		t.Errorf("expected >= 20 failures, got %d", insights.TotalFailures)
-	}
-
-	// Step 2: failure_patterns with pagination
-	pw := httpPOST(t, srv.ToolCall, "/v1/tools/failure_patterns",
-		map[string]any{"window": "10m", "limit": 5})
-	if pw.Code != http.StatusOK {
-		t.Fatalf("failure_patterns: expected 200, got %d: %s", pw.Code, pw.Body.String())
-	}
-	var patterns struct {
-		Patterns []struct {
-			ErrorCode string `json:"error_code"`
-			Count     int    `json:"count"`
-		} `json:"patterns"`
-		TotalCount int  `json:"total_count"`
-		HasMore    bool `json:"has_more"`
-	}
-	decodeJSON(t, pw, &patterns)
-	if len(patterns.Patterns) == 0 {
-		t.Fatal("expected at least one failure pattern")
-	}
-	foundPMT := false
-	for _, p := range patterns.Patterns {
-		if p.ErrorCode == "PMT_502" {
-			foundPMT = true
-			if p.Count < 20 {
-				t.Errorf("expected PMT_502 count >= 20, got %d", p.Count)
-			}
-		}
-	}
-	if !foundPMT {
-		t.Error("PMT_502 not found in failure patterns")
-	}
-
-	// Step 3: blast_radius for PMT_502
-	bw := httpPOST(t, srv.ToolCall, "/v1/tools/blast_radius",
-		map[string]any{"error_code": "PMT_502", "window": "10m", "include_services": true})
-	if bw.Code != http.StatusOK {
-		t.Fatalf("blast_radius: expected 200, got %d: %s", bw.Code, bw.Body.String())
-	}
-	var blast struct {
-		AffectedRequests int `json:"affected_requests"`
-		AffectedUsers    int `json:"affected_users"`
-	}
-	decodeJSON(t, bw, &blast)
-	if blast.AffectedRequests < 20 {
-		t.Errorf("expected >= 20 affected requests, got %d", blast.AffectedRequests)
-	}
-}
-
-func TestAgent_IdempotencyReplay(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeFailureEvents(10, "payment-service", "PMT_502"))
-
-	body := map[string]string{"window": "10m"}
-	headers := map[string]string{"Idempotency-Key": "test-key-001"}
-
-	// First call — executes.
-	w1 := httpPOSTWithHeaders(t, srv.ToolCall, "/v1/tools/graph_insights", body, headers)
-	if w1.Code != http.StatusOK {
-		t.Fatalf("first call: expected 200, got %d: %s", w1.Code, w1.Body.String())
-	}
-
-	// Second call — same key+body → cached replay.
-	w2 := httpPOSTWithHeaders(t, srv.ToolCall, "/v1/tools/graph_insights", body, headers)
-	if w2.Code != http.StatusOK {
-		t.Fatalf("replay call: expected 200, got %d: %s", w2.Code, w2.Body.String())
-	}
-
-	// Verify same response content (compare decoded maps to handle key order differences).
-	var r1, r2 map[string]any
-	decodeJSON(t, w1, &r1)
-	decodeJSON(t, w2, &r2)
-	if !reflect.DeepEqual(r1, r2) {
-		t.Errorf("replay response differs:\n  first:  %v\n  replay: %v", r1, r2)
-	}
-}
-
-func TestAgent_IdempotencyConflict(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeFailureEvents(10, "payment-service", "PMT_502"))
-
-	headers := map[string]string{"Idempotency-Key": "test-key-002"}
-
-	// First call.
-	w1 := httpPOSTWithHeaders(t, srv.ToolCall, "/v1/tools/graph_insights",
-		map[string]string{"window": "10m"}, headers)
-	if w1.Code != http.StatusOK {
-		t.Fatalf("first call: expected 200, got %d", w1.Code)
-	}
-
-	// Same key, different body → 409.
-	w2 := httpPOSTWithHeaders(t, srv.ToolCall, "/v1/tools/graph_insights",
-		map[string]string{"window": "5m"}, headers)
-	if w2.Code != http.StatusConflict {
-		t.Fatalf("conflict call: expected 409, got %d: %s", w2.Code, w2.Body.String())
-	}
-}
-
 func TestAgent_ToolNotFound(t *testing.T) {
 	srv, _, _ := newIntegrationServer(t)
 
diff --git a/tests/integration/helpers_test.go b/tests/integration/helpers_test.go
index b51ec6c..220ed59 100644
--- a/tests/integration/helpers_test.go
+++ b/tests/integration/helpers_test.go
@@ -11,18 +11,14 @@ import (
 	"time"
 
 	"github.com/sssmaran/WaylogCLI/internal/coldstore"
-	"github.com/sssmaran/WaylogCLI/internal/graph/core"
-	graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
 	"github.com/sssmaran/WaylogCLI/internal/ingest"
 	"github.com/sssmaran/WaylogCLI/internal/testutil"
 	"github.com/sssmaran/WaylogCLI/internal/tools"
-	"github.com/sssmaran/WaylogCLI/internal/tracestore"
 	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
 type integrationServer struct {
 	*ingest.Server
-	traceStore *tracestore.Store
 	coldStore  *coldstore.SQLiteStore
 	coldWriter *coldstore.BatchWriter
 }
@@ -45,16 +41,10 @@ func newIntegrationServer(t *testing.T) (*integrationServer, *coldstore.SQLiteSt
 	t.Cleanup(func() { bw.Stop() })
 
 	reg := tools.NewRegistry()
-	if err := tools.RegisterGraphTools(reg); err != nil {
-		t.Fatal(err)
-	}
 
 	dedup := ingest.NewDedupCache()
 
-	ts := tracestore.NewStore()
 	srv := ingest.NewServer(ingest.ServerConfig{
-		Store:         graphstore.NewStore(),
-		TraceStore:    ts,
 		AskRegistry:   reg,
 		DedupCache:    dedup,
 		ColdWriter:    bw,
@@ -63,16 +53,11 @@ func newIntegrationServer(t *testing.T) (*integrationServer, *coldstore.SQLiteSt
 		PlanStore:     ingest.NewPlanStore(),
 	})
 
-	return &integrationServer{Server: srv, traceStore: ts, coldStore: cs, coldWriter: bw}, cs, bw
+	return &integrationServer{Server: srv, coldStore: cs, coldWriter: bw}, cs, bw
 }
 
 func ingestEvent(t *testing.T, srv *integrationServer, ev event.WideEvent) int {
 	t.Helper()
-	result := srv.Builder().BuildResult(ev)
-	srv.Store().Merge(result.Graph)
-	if result.Span != nil {
-		srv.traceStore.Upsert(ev.Request.TraceID, core.ID("request", ev.Request.TraceID), result.Span)
-	}
 	srv.Counters().Inc(!ev.Outcome.Success)
 	srv.AcceptedPtr().Add(1)
 	if srv.coldWriter != nil {
diff --git a/tests/integration/incident_test.go b/tests/integration/incident_test.go
index e5c4aed..5c25f58 100644
--- a/tests/integration/incident_test.go
+++ b/tests/integration/incident_test.go
@@ -5,8 +5,6 @@ import (
 	"testing"
 
 	"github.com/sssmaran/WaylogCLI/internal/coldstore"
-	"github.com/sssmaran/WaylogCLI/internal/testutil"
-	"github.com/sssmaran/WaylogCLI/pkg/event"
 )
 
 func TestIncident_DeployFailure(t *testing.T) {
@@ -29,41 +27,7 @@ func TestIncident_DeployFailure(t *testing.T) {
 	// 3. Inject 50 failure events with error_code=PMT_502.
 	ingestEvents(t, srv, makeFailureEvents(50, "payment-service", "PMT_502"))
 
-	// 4. Verify overview shows error rate increased.
-	ow := httpGET(t, srv.Overview, "/v1/overview?window=10m")
-	if ow.Code != http.StatusOK {
-		t.Fatalf("overview: expected 200, got %d", ow.Code)
-	}
-	var overview map[string]any
-	decodeJSON(t, ow, &overview)
-
-	errorRate, _ := overview["error_rate"].(float64)
-	if errorRate < 20 {
-		t.Errorf("expected error_rate >= 20%% (50/150), got %f", errorRate)
-	}
-
-	// 5. Verify recent traces include failures.
-	rw := httpGET(t, srv.RecentTraces, "/v1/traces/recent?limit=10&failures_only=true")
-	if rw.Code != http.StatusOK {
-		t.Fatalf("recent traces: expected 200, got %d", rw.Code)
-	}
-	var recentResp struct {
-		Traces []struct {
-			TraceID string `json:"trace_id"`
-			Success bool   `json:"success"`
-			Service string `json:"service"`
-		} `json:"traces"`
-		TotalCount int `json:"total_count"`
-	}
-	decodeJSON(t, rw, &recentResp)
-	if len(recentResp.Traces) == 0 {
-		t.Fatal("expected at least one failed trace")
-	}
-	if recentResp.Traces[0].Success {
-		t.Error("expected first trace to be a failure")
-	}
-
-	// 6. Verify deployment is listed via GET /v1/deployments.
+	// 4. Verify deployment is listed via GET /v1/deployments.
 	dw := httpGET(t, srv.DeployRoute, "/v1/deployments?window=1h")
 	if dw.Code != http.StatusOK {
 		t.Fatalf("deployments GET: expected 200, got %d", dw.Code)
@@ -86,7 +50,7 @@ func TestIncident_DeployFailure(t *testing.T) {
 		t.Error("deploy_v2.1 not found in deployments list")
 	}
 
-	// 7. Verify cold store has events (flush batch writer first).
+	// 5. Verify cold store has events (flush batch writer first).
 	flushColdWriter(t, bw)
 	page, err := cs.SearchEvents(coldstore.SearchFilter{
 		Service:   "payment-service",
@@ -100,55 +64,3 @@ func TestIncident_DeployFailure(t *testing.T) {
 		t.Errorf("expected >= 50 PMT_502 events in cold store, got %d", page.TotalCount)
 	}
 }
-
-func TestIncident_TraceStory(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-
-	// Ingest a multi-span trace: api-gateway → checkout → payment (fails).
-	traceID := "aaaa1111bbbb2222cccc3333dddd4444"
-	events := []event.WideEvent{
-		testutil.MakeEvent(
-			testutil.WithService("api-gateway"),
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("1111111111111111"),
-			testutil.WithError("GW_502", "upstream failure"),
-			testutil.WithStatusCode(502),
-			testutil.WithCallerService(""),
-		),
-		testutil.MakeEvent(
-			testutil.WithService("checkout"),
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("2222222222222222"),
-			testutil.WithParentSpanID("1111111111111111"),
-			testutil.WithError("CHK_502", "downstream failure"),
-			testutil.WithStatusCode(502),
-			testutil.WithCallerService("api-gateway"),
-		),
-		testutil.MakeEvent(
-			testutil.WithService("payment"),
-			testutil.WithTraceID(traceID),
-			testutil.WithSpanID("3333333333333333"),
-			testutil.WithParentSpanID("2222222222222222"),
-			testutil.WithError("PMT_502", "payment timeout"),
-			testutil.WithStatusCode(502),
-			testutil.WithCallerService("checkout"),
-		),
-	}
-	ingestEvents(t, srv, events)
-
-	// Verify trace story.
-	sw := httpGET(t, srv.TraceStory, "/v1/traces/story?trace_id="+traceID)
-	if sw.Code != http.StatusOK {
-		t.Fatalf("trace story: expected 200, got %d: %s", sw.Code, sw.Body.String())
-	}
-
-	var storyResp map[string]any
-	decodeJSON(t, sw, &storyResp)
-	story, ok := storyResp["story"].(map[string]any)
-	if !ok {
-		t.Fatal("trace story missing 'story' field")
-	}
-	if _, ok := story["chain"]; !ok {
-		t.Error("trace story missing 'chain' field")
-	}
-}
diff --git a/tests/integration/otlp_test.go b/tests/integration/otlp_test.go
index 20d9933..19e9172 100644
--- a/tests/integration/otlp_test.go
+++ b/tests/integration/otlp_test.go
@@ -56,9 +56,9 @@ func newOTLPV2Stack(t *testing.T) otlpV2Stack {
 		t.Fatalf("ingestv2.New: %v", err)
 	}
 	return otlpV2Stack{
-		otlp: otelhttp.NewHandler(v2, nil, 1<<20),
+		otlp: otelhttp.NewHandler(v2, nil, 1<<20, nil),
 		read: ingestv2.NewReadHandler(ingestv2.NewReader(index), nil, 24*time.Hour),
-		caps: ingest.NewServer(ingest.ServerConfig{OTLPEnabled: true, V2ReadsEnabled: true}),
+		caps: ingest.NewServer(ingest.ServerConfig{OTLPEnabled: true}),
 	}
 }
 
@@ -120,7 +120,7 @@ func TestOTLP_EndToEnd(t *testing.T) {
 	assertErrorsContainPaymentFamily(t, stack.read)
 	assertStoryShowsPaymentFailure(t, stack.read, wantTraceID)
 	assertBlastShowsPaymentImpact(t, stack.read)
-	assertCapabilitiesAdvertiseOTLPAndV2Reads(t, stack.caps)
+	assertCapabilitiesAdvertiseOTLP(t, stack.caps)
 }
 
 func httpSpan(traceID, spanID, parentSpanID []byte, name string, start, end uint64, status int64, attrs ...*commonpb.KeyValue) *tracepb.Span {
@@ -212,7 +212,7 @@ func assertBlastShowsPaymentImpact(t *testing.T, h *ingestv2.ReadHandler) {
 	}
 }
 
-func assertCapabilitiesAdvertiseOTLPAndV2Reads(t *testing.T, srv *ingest.Server) {
+func assertCapabilitiesAdvertiseOTLP(t *testing.T, srv *ingest.Server) {
 	t.Helper()
 	cw := httpGET(t, srv.Capabilities, "/v1/capabilities")
 	if cw.Code != http.StatusOK {
@@ -222,17 +222,11 @@ func assertCapabilitiesAdvertiseOTLPAndV2Reads(t *testing.T, srv *ingest.Server)
 		OTLP struct {
 			HTTPTraces bool `json:"http_traces"`
 		} `json:"otlp"`
-		V2Reads struct {
-			Enabled bool `json:"enabled"`
-		} `json:"v2_reads"`
 	}
 	decodeJSON(t, cw, &caps)
 	if !caps.OTLP.HTTPTraces {
 		t.Fatal("expected otlp.http_traces=true")
 	}
-	if !caps.V2Reads.Enabled {
-		t.Fatal("expected v2_reads.enabled=true")
-	}
 }
 
 type fakeV2WAL struct {
diff --git a/tests/integration/plan_test.go b/tests/integration/plan_test.go
index a0a2cdc..2825534 100644
--- a/tests/integration/plan_test.go
+++ b/tests/integration/plan_test.go
@@ -1,99 +1,11 @@
 package integration
 
 import (
-	"bufio"
 	"net/http"
 	"net/http/httptest"
-	"strings"
 	"testing"
 )
 
-func TestPlan_HappyPath(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeFailureEvents(20, "payment-service", "PMT_502"))
-
-	body := map[string]any{
-		"steps": []map[string]any{
-			{"id": "insights", "tool": "graph_insights", "params": map[string]any{"window": "10m"}},
-			{"id": "patterns", "tool": "failure_patterns", "params": map[string]any{"window": "10m", "limit": 5}},
-		},
-	}
-	w := httpPOST(t, srv.PlanExecute, "/v1/plans/execute", body)
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var result struct {
-		PlanID    string `json:"plan_id"`
-		Status    string `json:"status"`
-		Completed int    `json:"completed"`
-		Total     int    `json:"total"`
-		Steps     []struct {
-			ID   string `json:"id"`
-			Tool string `json:"tool"`
-		} `json:"steps"`
-	}
-	decodeJSON(t, w, &result)
-
-	if result.Status != "complete" {
-		t.Errorf("status = %q, want complete", result.Status)
-	}
-	if result.Completed != 2 {
-		t.Errorf("completed = %d, want 2", result.Completed)
-	}
-	if result.Total != 2 {
-		t.Errorf("total = %d, want 2", result.Total)
-	}
-	if result.PlanID == "" {
-		t.Error("plan_id should not be empty")
-	}
-	if w.Header().Get("X-Plan-ID") == "" {
-		t.Error("X-Plan-ID header should be set")
-	}
-}
-
-func TestPlan_RefChain(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeFailureEvents(10, "payment-service", "PMT_502"))
-
-	body := map[string]any{
-		"steps": []map[string]any{
-			{"id": "list_failures", "tool": "graph_failures", "params": map[string]any{"limit": 1}},
-			{"id": "explain", "tool": "explain_request", "params": map[string]any{
-				"trace_id": `$steps["list_failures"].result.failures[0].trace_id`,
-			}},
-		},
-	}
-	w := httpPOST(t, srv.PlanExecute, "/v1/plans/execute", body)
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	var result struct {
-		Status    string `json:"status"`
-		Completed int    `json:"completed"`
-		Steps     []struct {
-			ID    string `json:"id"`
-			Error *struct {
-				Code string `json:"code"`
-			} `json:"error"`
-		} `json:"steps"`
-	}
-	decodeJSON(t, w, &result)
-
-	if result.Completed != 2 {
-		t.Errorf("completed = %d, want 2", result.Completed)
-	}
-	if result.Status != "complete" {
-		t.Errorf("status = %q, want complete", result.Status)
-	}
-	for _, step := range result.Steps {
-		if step.Error != nil {
-			t.Errorf("step %q has error: %s", step.ID, step.Error.Code)
-		}
-	}
-}
-
 func TestPlan_ValidationErrors(t *testing.T) {
 	srv, _, _ := newIntegrationServer(t)
 
@@ -114,8 +26,8 @@ func TestPlan_ValidationErrors(t *testing.T) {
 		{
 			name: "duplicate IDs",
 			body: map[string]any{"steps": []map[string]any{
-				{"id": "a", "tool": "graph_insights", "params": map[string]any{"window": "10m"}},
-				{"id": "a", "tool": "graph_failures", "params": map[string]any{}},
+				{"id": "a", "tool": "explain_request", "params": map[string]any{"trace_id": "x"}},
+				{"id": "a", "tool": "blast_radius", "params": map[string]any{}},
 			}},
 		},
 	}
@@ -130,96 +42,6 @@ func TestPlan_ValidationErrors(t *testing.T) {
 	}
 }
 
-func TestPlan_Idempotency(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeHealthyEvents(5, "api-gateway"))
-
-	body := map[string]any{
-		"steps": []map[string]any{
-			{"id": "insights", "tool": "graph_insights", "params": map[string]any{"window": "10m"}},
-		},
-	}
-
-	// First call
-	w1 := httpPOSTWithHeaders(t, srv.PlanExecute, "/v1/plans/execute", body, map[string]string{
-		"Idempotency-Key": "plan-test-1",
-	})
-	if w1.Code != http.StatusOK {
-		t.Fatalf("first call: expected 200, got %d: %s", w1.Code, w1.Body.String())
-	}
-
-	// Replay with same key
-	w2 := httpPOSTWithHeaders(t, srv.PlanExecute, "/v1/plans/execute", body, map[string]string{
-		"Idempotency-Key": "plan-test-1",
-	})
-	if w2.Code != http.StatusOK {
-		t.Fatalf("replay: expected 200, got %d: %s", w2.Code, w2.Body.String())
-	}
-
-	// Conflict: same key, different body
-	differentBody := map[string]any{
-		"steps": []map[string]any{
-			{"id": "insights", "tool": "graph_insights", "params": map[string]any{"window": "5m"}},
-		},
-	}
-	w3 := httpPOSTWithHeaders(t, srv.PlanExecute, "/v1/plans/execute", differentBody, map[string]string{
-		"Idempotency-Key": "plan-test-1",
-	})
-	if w3.Code != http.StatusConflict {
-		t.Errorf("conflict: expected 409, got %d: %s", w3.Code, w3.Body.String())
-	}
-}
-
-func TestPlan_SSEStream(t *testing.T) {
-	srv, _, _ := newIntegrationServer(t)
-	ingestEvents(t, srv, makeFailureEvents(10, "payment-service", "PMT_502"))
-
-	// Execute a plan first to get a plan ID
-	body := map[string]any{
-		"steps": []map[string]any{
-			{"id": "insights", "tool": "graph_insights", "params": map[string]any{"window": "10m"}},
-		},
-	}
-	w := httpPOST(t, srv.PlanExecute, "/v1/plans/execute", body)
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	planID := w.Header().Get("X-Plan-ID")
-	if planID == "" {
-		t.Fatal("no X-Plan-ID header")
-	}
-
-	// Subscribe to the completed plan — should get replay
-	req := httptest.NewRequest(http.MethodGet, "/v1/stream/plans/"+planID, nil)
-	rec := httptest.NewRecorder()
-	srv.PlanStream(rec, req)
-
-	if rec.Code != http.StatusOK {
-		t.Fatalf("SSE: expected 200, got %d: %s", rec.Code, rec.Body.String())
-	}
-
-	// Parse SSE events
-	scanner := bufio.NewScanner(strings.NewReader(rec.Body.String()))
-	var eventTypes []string
-	for scanner.Scan() {
-		line := scanner.Text()
-		if strings.HasPrefix(line, "event: ") {
-			eventTypes = append(eventTypes, strings.TrimPrefix(line, "event: "))
-		}
-	}
-
-	// Should have step_start, step_complete, done
-	if len(eventTypes) < 3 {
-		t.Errorf("expected at least 3 SSE events, got %d: %v", len(eventTypes), eventTypes)
-	}
-
-	// Last event should be "done"
-	if len(eventTypes) > 0 && eventTypes[len(eventTypes)-1] != "done" {
-		t.Errorf("last event should be 'done', got %q", eventTypes[len(eventTypes)-1])
-	}
-}
-
 func TestPlan_SSEStream_NotFound(t *testing.T) {
 	srv, _, _ := newIntegrationServer(t)
 
@@ -231,4 +53,3 @@ func TestPlan_SSEStream_NotFound(t *testing.T) {
 		t.Errorf("expected 404, got %d", rec.Code)
 	}
 }
-