From e8b5e3dcebd3947b7d47ffc3fc8ca835762ef865 Mon Sep 17 00:00:00 2001 From: skota-hash Date: Tue, 19 May 2026 05:14:21 -0400 Subject: [PATCH 01/12] feat: incident evidence snapshots (propagation + blast) Wires Propagation and Blast evidence snapshots through the full incident-triage stack. Each active incident now carries: - PropagationSnapshot{Opening, Latest}: origin service/step, causal path of spans, sample trace, capture status - BlastSnapshot{Opening, Latest}: affected requests/users/services, top services, sampled traces, capture status Stack changes: - Schema (pkg/api/v2/types.go): new Propagation/Blast types, CaptureStatus{OK,Partial,Missing} constants, optional fields on Incident. - Storage: migration 005_incident_evidence.sql adds propagation_json and blast_json columns. Coldstore Migrate() now tracks applied migrations in schema_migrations and wraps each file + record insert in a single transaction so a crash mid-ALTER cannot leave the DB half-applied. 001_initial.sql loses its PRAGMAs (already set via writer DSN; journal_mode / synchronous cannot run inside a transaction). - Engine: internal/incidents/capture.go builds snapshots from the live graph; engine.go captures Opening on incident open and refreshes Latest every tick, reusing the already-fetched events slice for the anchor. - Triage: adapter prefers stored snapshots, falls back to a fresh capture for legacy incidents lacking evidence. - API + CLI + dashboard + MCP tools surface the new fields. CLI render adds "Where did it start?" and "How bad is it?" sections. - Tests: round-trip storage, capture, engine, triage, handler, render, dashboard, and registry tests added or extended. Demo acceptance gate (scripts/demo-acceptance.sh) iterates every active incident_id and requires propagation.latest + blast.latest on each, backed by a new active-incident-ids JSON helper and unit test. --- cmd/ingest/main.go | 26 +++ internal/cli/v2/render.go | 104 ++++++++++ internal/cli/v2/render_test.go | 63 ++++++ internal/coldstore/coldstore.go | 54 ++++- internal/coldstore/incident_store.go | 28 ++- internal/coldstore/incident_store_test.go | 192 ++++++++++++++++++ internal/coldstore/migrations/001_initial.sql | 9 +- .../migrations/005_incident_evidence.sql | 6 + internal/dashboard/static/index.html | 60 ++++++ internal/dashboard/static_test.go | 7 + internal/incidents/capture.go | 135 ++++++++++++ internal/incidents/capture_test.go | 156 ++++++++++++++ internal/incidents/engine.go | 84 +++++++- internal/incidents/engine_test.go | 107 +++++++++- internal/incidents/handler.go | 74 +++++++ internal/incidents/handler_test.go | 52 +++++ internal/incidents/interfaces.go | 4 + internal/incidents/store.go | 55 +++++ internal/incidents/types.go | 107 +++++++--- internal/tools/graph_tools_v2.go | 106 ++++++++++ internal/tools/registry.go | 24 +++ internal/tools/registry_test.go | 28 +++ internal/triage/adapter.go | 172 +++++++++++++--- internal/triage/engine.go | 23 ++- internal/triage/engine_test.go | 172 ++++++++++++++++ pkg/api/v2/types.go | 95 ++++++--- scripts/demo-acceptance-json/main.go | 20 +- scripts/demo-acceptance-json/main_test.go | 22 ++ scripts/demo-acceptance.sh | 28 +++ scripts/micro-demo-smoke.sh | 28 +++ tests/integration/plan_test.go | 1 - 31 files changed, 1930 insertions(+), 112 deletions(-) create mode 100644 internal/coldstore/migrations/005_incident_evidence.sql create mode 100644 internal/incidents/capture.go create mode 100644 internal/incidents/capture_test.go create mode 100644 internal/tools/graph_tools_v2.go diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go index e9c9ea1..06ad216 100644 --- a/cmd/ingest/main.go +++ b/cmd/ingest/main.go @@ -447,6 +447,20 @@ func main() { mux.Handle("/v1/events/", readCORS(v2ReadHandler.EventByID)) mux.Handle("/v1/traces/", readCORS(v2ReadHandler.TraceByID)) slog.Info("v2 read endpoints enabled") + // Replace legacy graph-backed explain_request / blast_radius with the + // v2-reader-backed handlers now that v2Reader exists. Other graph tools + // remain registered (Step 2 of the deletion plan removes them). + { + v2ToolReader := incidentReaderAdapter{reader: v2Reader} + if err := tools.RegisterExplainRequestTool(reg, v2ToolReader); err != nil { + slog.Error("register explain_request v2", "err", err) + os.Exit(1) + } + if err := tools.RegisterBlastRadiusTool(reg, v2ToolReader); err != nil { + slog.Error("register blast_radius v2", "err", err) + os.Exit(1) + } + } if incidentsEnabled { if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok { incidentStore := coldstore.NewIncidentStore(sqlite) @@ -1084,6 +1098,18 @@ func (a incidentReaderAdapter) SearchEvents(f incidents.SearchFilter, limit int) return res.Events } +func (a incidentReaderAdapter) TraceStoryByTraceID(traceID string) (apiv2.StoryResponse, bool) { + return a.reader.TraceStoryByTraceID(traceID) +} + +func (a incidentReaderAdapter) TraceEvents(traceID string) ([]*eventv2.Event, bool) { + result, ok := a.reader.GetTrace(traceID) + if !ok { + return nil, false + } + return result.Events, true +} + func toV2SearchFilter(f incidents.SearchFilter) ingestv2.SearchFilter { return ingestv2.SearchFilter{ Service: f.Service, diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go index 2e61981..9e0a503 100644 --- a/internal/cli/v2/render.go +++ b/internal/cli/v2/render.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "io" + "slices" "strings" "text/tabwriter" "time" @@ -124,6 +125,9 @@ func renderIncidentBody(w io.Writer, inc Incident) { fmt.Fprintf(w, "baseline_count: %d\n", inc.BaselineCount) fmt.Fprintf(w, "current_count: %d\n", inc.CurrentCount) + renderPropagationBlock(w, inc.Propagation) + renderBlastBlock(w, inc.Blast) + fmt.Fprintln(w, "\nevidence:") if len(inc.Evidence) == 0 { fmt.Fprintln(w, " none") @@ -379,6 +383,106 @@ func formatTime(t time.Time) string { return t.Format(time.RFC3339) } +func renderPropagationBlock(w io.Writer, p *apiv2.PropagationSnapshot) { + if p == nil || p.Latest == nil { + fmt.Fprintln(w, "\nWhere did it start?") + fmt.Fprintln(w, " Propagation evidence unavailable") + return + } + fmt.Fprintln(w, "\nWhere did it start?") + if p.Latest.CaptureStatus != apiv2.CaptureStatusOK { + fmt.Fprintf(w, " Propagation evidence unavailable (%s) — retrying\n", p.Latest.CaptureStatus) + return + } + fmt.Fprintf(w, " Origin: %s / %s\n", p.Latest.OriginService, p.Latest.OriginStep) + firstFailing, errCode := firstErrorStep(p.Latest) + if firstFailing != "" { + fmt.Fprintf(w, " First failing step: %s %s\n", firstFailing, errCode) + } + if len(p.Latest.Path) > 0 { + names := make([]string, 0, len(p.Latest.Path)) + for _, s := range p.Latest.Path { + names = append(names, s.Step) + } + fmt.Fprintf(w, " %s\n", strings.Join(names, " → ")) + } + fmt.Fprintf(w, " sample trace: %s · captured %s ago\n", + p.Latest.SampleTraceID, time.Since(p.Latest.CapturedAt).Round(time.Second)) +} + +func renderBlastBlock(w io.Writer, b *apiv2.BlastSnapshot) { + if b == nil || b.Latest == nil { + fmt.Fprintln(w, "\nHow bad is it?") + fmt.Fprintln(w, " Blast evidence unavailable") + return + } + fmt.Fprintln(w, "\nHow bad is it?") + if b.Opening != nil && blastDelta(b.Opening, b.Latest) { + fmt.Fprintf(w, " At open: %d req · %d svc · %s users\n", + b.Opening.AffectedRequests, b.Opening.AffectedServices, usersStr(b.Opening.AffectedUsers)) + fmt.Fprintf(w, " Now: %d req · %d svc · %s users\n", + b.Latest.AffectedRequests, b.Latest.AffectedServices, usersStr(b.Latest.AffectedUsers)) + } else { + fmt.Fprintf(w, " Now: %d req · %d svc · %s users\n", + b.Latest.AffectedRequests, b.Latest.AffectedServices, usersStr(b.Latest.AffectedUsers)) + } + if len(b.Latest.TopServices) > 0 { + fmt.Fprintf(w, " Top services: %s\n", strings.Join(b.Latest.TopServices, ", ")) + } + fmt.Fprintf(w, " captured %s ago\n", time.Since(b.Latest.CapturedAt).Round(time.Second)) +} + +func firstErrorStep(p *apiv2.PropagationEvidence) (step, code string) { + if p == nil { + return "", "" + } + for _, s := range p.Path { + if s.Status == "error" { + return s.Step, s.ErrorCode + } + } + return "", "" +} + +// blastDelta returns true if Opening and Latest differ on any user-visible +// impact field. CapturedAt and CaptureStatus are excluded — they change every +// tick and would force a permanent delta. +func blastDelta(o, l *apiv2.BlastEvidence) bool { + if o == nil || l == nil { + return false + } + if o.AffectedRequests != l.AffectedRequests { + return true + } + if o.AffectedServices != l.AffectedServices { + return true + } + if usersInt(o.AffectedUsers) != usersInt(l.AffectedUsers) { + return true + } + if !slices.Equal(o.TopServices, l.TopServices) { + return true + } + if len(o.SampledTraces) != len(l.SampledTraces) { + return true + } + return false +} + +func usersInt(u *int) int { + if u == nil { + return 0 + } + return *u +} + +func usersStr(u *int) string { + if u == nil { + return "?" + } + return fmt.Sprintf("%d", *u) +} + func RenderTriage(w io.Writer, rep *TriageReport) int { fmt.Fprintf(w, "Triage report incident=%s window=%s confidence=%s\n", rep.IncidentRef.ID, rep.IncidentRef.Window, rep.Confidence) diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go index 4fa4f34..b678230 100644 --- a/internal/cli/v2/render_test.go +++ b/internal/cli/v2/render_test.go @@ -228,3 +228,66 @@ func TestRenderTriageHeaderAndSections(t *testing.T) { } } } + +func TestRenderIncident_WithPropagationAndBlast(t *testing.T) { + ts := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC) + users := 47 + openUsers := 5 + resp := IncidentDetailResponse{ + Incident: Incident{ + IncidentID: "inc_render", + Service: "payment-service", + Status: "active", + Propagation: &apiv2.PropagationSnapshot{ + Latest: &apiv2.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []apiv2.PropagationStep{ + {Service: "payment-service", Step: "validate", Status: "ok"}, + {Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "7a3fb2", + CapturedAt: ts, + CaptureStatus: "ok", + }, + }, + Blast: &apiv2.BlastSnapshot{ + Opening: &apiv2.BlastEvidence{AffectedRequests: 3, AffectedServices: 1, AffectedUsers: &openUsers, CapturedAt: ts, CaptureStatus: "ok"}, + Latest: &apiv2.BlastEvidence{AffectedRequests: 184, AffectedServices: 3, AffectedUsers: &users, TopServices: []string{"checkout", "api-gateway"}, CapturedAt: ts, CaptureStatus: "ok"}, + }, + }, + } + var buf bytes.Buffer + RenderIncident(&buf, resp) + out := buf.String() + for _, want := range []string{ + "Where did it start?", + "Origin: payment-service / charge", + "First failing step: charge DB_TIMEOUT", + "validate → charge", + "How bad is it?", + "At open: 3 req", + "Now: 184 req", + "Top services: checkout, api-gateway", + } { + if !strings.Contains(out, want) { + t.Errorf("rendered output missing %q\n\nFull output:\n%s", want, out) + } + } +} + +func TestRenderIncident_PropagationMissing_ShowsRetryLine(t *testing.T) { + resp := IncidentDetailResponse{ + Incident: Incident{ + IncidentID: "inc_missing", + Propagation: &apiv2.PropagationSnapshot{ + Latest: &apiv2.PropagationEvidence{CapturedAt: time.Now(), CaptureStatus: "missing"}, + }, + }, + } + var buf bytes.Buffer + RenderIncident(&buf, resp) + if !strings.Contains(buf.String(), "Propagation evidence unavailable") { + t.Errorf("missing-state render missing the retry line:\n%s", buf.String()) + } +} diff --git a/internal/coldstore/coldstore.go b/internal/coldstore/coldstore.go index d97da0b..97affc2 100644 --- a/internal/coldstore/coldstore.go +++ b/internal/coldstore/coldstore.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "embed" + "errors" "fmt" "log/slog" "time" @@ -115,24 +116,71 @@ func Open(path string) (ManagedStore, error) { return s, nil } -// Migrate runs all embedded SQL migration files idempotently. +// Migrate runs all embedded SQL migration files idempotently. Applied +// migrations are tracked in the schema_migrations table; files already +// recorded there are skipped on subsequent calls. This lets non-idempotent +// statements (e.g. ALTER TABLE ADD COLUMN) live in a migration file and +// still satisfy the Migrate-twice contract. func (s *SQLiteStore) Migrate() error { + if _, err := s.writer.Exec(`CREATE TABLE IF NOT EXISTS schema_migrations ( + name TEXT PRIMARY KEY, + applied_at TEXT NOT NULL + )`); err != nil { + return fmt.Errorf("create schema_migrations: %w", err) + } entries, err := migrationsFS.ReadDir("migrations") if err != nil { return fmt.Errorf("read migrations dir: %w", err) } for _, entry := range entries { + var applied string + err := s.writer.QueryRow(`SELECT applied_at FROM schema_migrations WHERE name = ?`, entry.Name()).Scan(&applied) + if err == nil { + continue + } + if !errors.Is(err, sql.ErrNoRows) { + return fmt.Errorf("check migration %s: %w", entry.Name(), err) + } data, err := migrationsFS.ReadFile("migrations/" + entry.Name()) if err != nil { return fmt.Errorf("read migration %s: %w", entry.Name(), err) } - if _, err := s.writer.Exec(string(data)); err != nil { - return fmt.Errorf("exec migration %s: %w", entry.Name(), err) + if err := s.applyMigration(entry.Name(), data); err != nil { + return err } } return nil } +// applyMigration runs one migration file and records it in schema_migrations +// inside a single transaction. If either statement fails, the entire change +// is rolled back so a non-idempotent migration (e.g. ALTER TABLE ADD COLUMN) +// is never left half-applied across a crash or partial failure. +func (s *SQLiteStore) applyMigration(name string, data []byte) (err error) { + tx, err := s.writer.Begin() + if err != nil { + return fmt.Errorf("begin migration %s: %w", name, err) + } + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() + if _, err = tx.Exec(string(data)); err != nil { + return fmt.Errorf("exec migration %s: %w", name, err) + } + if _, err = tx.Exec( + `INSERT INTO schema_migrations (name, applied_at) VALUES (?, ?)`, + name, time.Now().UTC().Format(tsFormat), + ); err != nil { + return fmt.Errorf("record migration %s: %w", name, err) + } + if err = tx.Commit(); err != nil { + return fmt.Errorf("commit migration %s: %w", name, err) + } + return nil +} + // Close closes both writer and reader database handles. func (s *SQLiteStore) Close() error { var firstErr error diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go index 9a05201..8e20f01 100644 --- a/internal/coldstore/incident_store.go +++ b/internal/coldstore/incident_store.go @@ -77,14 +77,22 @@ func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.In if err != nil { return fmt.Errorf("coldstore incident warnings: %w", err) } + propagation, err := jsonText(inc.Propagation) + if err != nil { + return fmt.Errorf("coldstore incident propagation: %w", err) + } + blast, err := jsonText(inc.Blast) + if err != nil { + return fmt.Errorf("coldstore incident blast: %w", err) + } _, err = execer.ExecContext(ctx, ` INSERT INTO incidents ( incident_id, env, service, error_service, error_step, error_code, status, cause, confidence, severity, started_at, updated_at, last_seen_at, recovering_at, resolved_at, affected_requests, affected_users, affected_services, top_services, sample_traces, evidence, next_checks, instrumentation_warnings, - lift, baseline_count, current_count - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + lift, baseline_count, current_count, propagation_json, blast_json + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(incident_id) DO UPDATE SET status = excluded.status, cause = excluded.cause, @@ -104,13 +112,16 @@ func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.In instrumentation_warnings = excluded.instrumentation_warnings, lift = excluded.lift, baseline_count = excluded.baseline_count, - current_count = excluded.current_count`, + current_count = excluded.current_count, + propagation_json = excluded.propagation_json, + blast_json = excluded.blast_json`, inc.IncidentID, inc.Env, inc.Service, inc.ErrorFamily.Service, inc.ErrorFamily.Step, inc.ErrorFamily.ErrorCode, string(inc.Status), string(inc.Cause), string(inc.Confidence), inc.Severity, formatTime(inc.StartedAt), formatTime(inc.UpdatedAt), formatTime(inc.LastSeenAt), nullableTime(inc.RecoveringAt), nullableTime(inc.ResolvedAt), inc.AffectedRequests, nullableInt(inc.AffectedUsers), inc.AffectedServices, topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount, + propagation, blast, ) if err != nil { return err @@ -166,7 +177,8 @@ func incidentSelectSQL() string { affected_requests, affected_users, affected_services, COALESCE(top_services, ''), COALESCE(sample_traces, ''), COALESCE(evidence, ''), COALESCE(next_checks, ''), COALESCE(instrumentation_warnings, ''), - lift, baseline_count, current_count + lift, baseline_count, current_count, + COALESCE(propagation_json, ''), COALESCE(blast_json, '') FROM incidents` } @@ -176,11 +188,13 @@ func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, var startedAt, updatedAt, lastSeenAt, recoveringAt, resolvedAt string var affectedUsers sql.NullInt64 var topServices, samples, evidence, nextChecks, warnings string + var propagationJSON, blastJSON string err := row.Scan( &inc.IncidentID, &inc.Env, &inc.Service, &inc.ErrorFamily.Service, &inc.ErrorFamily.Step, &inc.ErrorFamily.ErrorCode, &status, &cause, &confidence, &inc.Severity, &startedAt, &updatedAt, &lastSeenAt, &recoveringAt, &resolvedAt, &inc.AffectedRequests, &affectedUsers, &inc.AffectedServices, &topServices, &samples, &evidence, &nextChecks, &warnings, &inc.Lift, &inc.BaselineCount, &inc.CurrentCount, + &propagationJSON, &blastJSON, ) if err != nil { return incidents.Incident{}, err @@ -231,6 +245,12 @@ func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, if err := parseJSONText(warnings, &inc.InstrumentationWarnings); err != nil { return incidents.Incident{}, fmt.Errorf("coldstore incident warnings: %w", err) } + if err := parseJSONText(propagationJSON, &inc.Propagation); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident propagation: %w", err) + } + if err := parseJSONText(blastJSON, &inc.Blast); err != nil { + return incidents.Incident{}, fmt.Errorf("coldstore incident blast: %w", err) + } return inc, nil } diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go index 9853f3b..342c1aa 100644 --- a/internal/coldstore/incident_store_test.go +++ b/internal/coldstore/incident_store_test.go @@ -132,6 +132,179 @@ func TestIncidentStoreReplaceNonResolved(t *testing.T) { } } +func TestIncidentStore_PropagationSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + inc := baseEvidenceIncident("inc_test_prop", ts) + inc.Propagation = &incidents.PropagationSnapshot{ + Opening: &incidents.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []incidents.PropagationStep{ + {Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "trace_a", + CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + }, + Latest: &incidents.PropagationEvidence{ + OriginService: "payment-service", + OriginStep: "charge", + Path: []incidents.PropagationStep{ + {Service: "payment-service", Step: "charge", Status: "error", ErrorCode: "DB_TIMEOUT"}, + }, + SampleTraceID: "trace_b", + CapturedAt: ts.Add(30 * time.Second), + CaptureStatus: incidents.CaptureOK, + }, + } + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation == nil { + t.Fatal("Propagation lost") + } + if got.Propagation.Opening == nil || got.Propagation.Latest == nil { + t.Fatal("Opening/Latest lost") + } + if got.Propagation.Opening.SampleTraceID != "trace_a" { + t.Errorf("Opening.SampleTraceID = %q", got.Propagation.Opening.SampleTraceID) + } + if got.Propagation.Latest.SampleTraceID != "trace_b" { + t.Errorf("Latest.SampleTraceID = %q", got.Propagation.Latest.SampleTraceID) + } + if got.Propagation.Opening.CaptureStatus != incidents.CaptureOK { + t.Errorf("Opening.CaptureStatus = %q", got.Propagation.Opening.CaptureStatus) + } +} + +func TestIncidentStore_BlastSnapshotRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + openUsers := 12 + latestUsers := 47 + inc := baseEvidenceIncident("inc_test_blast", ts) + inc.Blast = &incidents.BlastSnapshot{ + Opening: &incidents.BlastEvidence{ + AffectedRequests: 5, + AffectedUsers: &openUsers, + AffectedServices: 1, + TopServices: []string{"checkout"}, + SampledTraces: []string{"trace_a"}, + CapturedAt: ts, + CaptureStatus: incidents.CaptureOK, + }, + Latest: &incidents.BlastEvidence{ + AffectedRequests: 184, + AffectedUsers: &latestUsers, + AffectedServices: 3, + TopServices: []string{"checkout", "api-gateway"}, + SampledTraces: []string{"trace_b", "trace_c"}, + CapturedAt: ts.Add(time.Minute), + CaptureStatus: incidents.CaptureOK, + }, + } + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Blast == nil || got.Blast.Opening == nil || got.Blast.Latest == nil { + t.Fatalf("Blast snapshot lost: %+v", got.Blast) + } + if got.Blast.Opening.AffectedRequests != 5 || got.Blast.Latest.AffectedRequests != 184 { + t.Errorf("AffectedRequests round-trip wrong: opening=%d latest=%d", + got.Blast.Opening.AffectedRequests, got.Blast.Latest.AffectedRequests) + } + if got.Blast.Opening.AffectedServices != 1 || got.Blast.Latest.AffectedServices != 3 { + t.Errorf("AffectedServices round-trip wrong") + } + if got.Blast.Opening.AffectedUsers == nil || *got.Blast.Opening.AffectedUsers != openUsers { + t.Errorf("Opening.AffectedUsers lost: %+v", got.Blast.Opening.AffectedUsers) + } + if got.Blast.Latest.AffectedUsers == nil || *got.Blast.Latest.AffectedUsers != latestUsers { + t.Errorf("Latest.AffectedUsers lost: %+v", got.Blast.Latest.AffectedUsers) + } +} + +func TestIncidentStore_NilSnapshotsRoundTrip(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + inc := baseEvidenceIncident("inc_test_nil", ts) + if err := store.Upsert(ctx, inc); err != nil { + t.Fatalf("upsert: %v", err) + } + got, err := store.Get(ctx, inc.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation != nil { + t.Errorf("Propagation = %+v; want nil", got.Propagation) + } + if got.Blast != nil { + t.Errorf("Blast = %+v; want nil", got.Blast) + } +} + +func TestIncidentStore_DoesNotMergeOpening(t *testing.T) { + managed, err := Open(":memory:") + if err != nil { + t.Fatal(err) + } + defer managed.Close() + store := NewIncidentStore(managed.(*SQLiteStore)) + ctx := context.Background() + ts := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC) + base := baseEvidenceIncident("inc_test_dumb", ts) + base.Propagation = &incidents.PropagationSnapshot{ + Opening: &incidents.PropagationEvidence{SampleTraceID: "trace_a", CapturedAt: ts, CaptureStatus: incidents.CaptureOK}, + Latest: &incidents.PropagationEvidence{SampleTraceID: "trace_a", CapturedAt: ts, CaptureStatus: incidents.CaptureOK}, + } + if err := store.Upsert(ctx, base); err != nil { + t.Fatalf("upsert: %v", err) + } + base.Propagation = &incidents.PropagationSnapshot{ + Latest: &incidents.PropagationEvidence{SampleTraceID: "trace_b", CapturedAt: ts.Add(time.Minute), CaptureStatus: incidents.CaptureOK}, + } + if err := store.Upsert(ctx, base); err != nil { + t.Fatalf("upsert 2: %v", err) + } + got, err := store.Get(ctx, base.IncidentID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.Propagation == nil || got.Propagation.Opening != nil { + t.Errorf("Opening should be nil after explicit overwrite; got %+v", got.Propagation) + } + if got.Propagation == nil || got.Propagation.Latest == nil || got.Propagation.Latest.SampleTraceID != "trace_b" { + t.Errorf("Latest should be trace_b; got %+v", got.Propagation) + } +} + func testColdIncident(id string, status incidents.Status, at time.Time) incidents.Incident { resolvedAt := at.Add(time.Minute) inc := incidents.Incident{ @@ -160,3 +333,22 @@ func testColdIncident(id string, status incidents.Status, at time.Time) incident } return inc } + +// baseEvidenceIncident returns a minimal active incident shared across the +// Propagation/Blast snapshot round-trip tests. Tests set the relevant snapshot +// field after construction. +func baseEvidenceIncident(id string, ts time.Time) incidents.Incident { + return incidents.Incident{ + IncidentID: id, + Env: "demo", + Service: "payment-service", + ErrorFamily: apiv2.ErrorFamily{Service: "payment-service", Step: "charge", ErrorCode: "DB_TIMEOUT"}, + Status: incidents.StatusActive, + Cause: incidents.CauseUnknown, + Confidence: incidents.ConfidenceMedium, + Severity: 2, + StartedAt: ts, + UpdatedAt: ts, + LastSeenAt: ts, + } +} diff --git a/internal/coldstore/migrations/001_initial.sql b/internal/coldstore/migrations/001_initial.sql index ece71a4..5ec15ef 100644 --- a/internal/coldstore/migrations/001_initial.sql +++ b/internal/coldstore/migrations/001_initial.sql @@ -1,9 +1,8 @@ -- 001_initial.sql: events + deployments tables for SQLite cold storage. - -PRAGMA journal_mode = WAL; -PRAGMA busy_timeout = 5000; -PRAGMA foreign_keys = ON; -PRAGMA synchronous = NORMAL; +-- Connection-level PRAGMAs (journal_mode, busy_timeout, foreign_keys, +-- synchronous) are configured by Open() via the writer DSN or, for +-- :memory:, via explicit Exec calls — not here, because PRAGMAs that +-- change safety/journal modes cannot run inside a transaction. CREATE TABLE IF NOT EXISTS events ( id INTEGER PRIMARY KEY AUTOINCREMENT, diff --git a/internal/coldstore/migrations/005_incident_evidence.sql b/internal/coldstore/migrations/005_incident_evidence.sql new file mode 100644 index 0000000..da4aead --- /dev/null +++ b/internal/coldstore/migrations/005_incident_evidence.sql @@ -0,0 +1,6 @@ +-- 005_incident_evidence.sql: add v1.0 incident evidence snapshots +-- (PropagationSnapshot + BlastSnapshot). Both columns hold JSON text or NULL. +-- A NULL column round-trips to a nil *PropagationSnapshot / *BlastSnapshot in Go. + +ALTER TABLE incidents ADD COLUMN propagation_json TEXT; +ALTER TABLE incidents ADD COLUMN blast_json TEXT; diff --git a/internal/dashboard/static/index.html b/internal/dashboard/static/index.html index aed839e..63ac348 100644 --- a/internal/dashboard/static/index.html +++ b/internal/dashboard/static/index.html @@ -1022,6 +1022,8 @@

${esc(formatFamily(incident.error_family))}

${esc(incident.cause || "unknown")} · ${esc(incident.confidence || "low")} confidence
${esc(incident.status || "unknown")} · severity ${nf.format(incident.severity || 0)} · started ${esc(ago(incident.started_at))}
+ ${renderPropagationBlock(incident.propagation)} + ${renderBlastBlock(incident.blast)}
${impact("Affected requests", incident.affected_requests || 0)} ${impact("Affected users", incident.affected_users == null ? "unknown" : incident.affected_users)} @@ -1036,6 +1038,64 @@

${esc(formatFamily(incident.error_family))}

`); } + function renderPropagationBlock(p) { + if (!p || !p.latest) { + return `
+

Where did it start?

+
Propagation evidence unavailable
+
`; + } + const l = p.latest; + if (l.capture_status !== "ok") { + return `
+

Where did it start?

+
Propagation evidence unavailable (${esc(l.capture_status)}) — retrying
+
`; + } + const firstErr = (l.path || []).find(s => s.status === "error"); + const firstFailing = firstErr + ? `
First failing step: ${esc(firstErr.step)} ${esc(firstErr.error_code || "")}
` + : ""; + const arrow = (l.path || []).map(s => `${esc(s.step)}`).join(" → "); + return `
+

Where did it start?

+
Origin: ${esc(l.origin_service)} / ${esc(l.origin_step || "")}
+ ${firstFailing} +
${arrow}
+
sample trace: ${esc(l.sample_trace_id || "")} · captured ${esc(ago(l.captured_at))}
+
`; + } + function blastImpactDiffers(o, l) { + if (!o || !l) return false; + if (o.affected_requests !== l.affected_requests) return true; + if (o.affected_services !== l.affected_services) return true; + if ((o.affected_users || 0) !== (l.affected_users || 0)) return true; + const a = o.top_services || [], b = l.top_services || []; + if (a.length !== b.length) return true; + for (let i = 0; i < a.length; i++) if (a[i] !== b[i]) return true; + if ((o.sampled_traces || []).length !== (l.sampled_traces || []).length) return true; + return false; + } + function renderBlastBlock(b) { + if (!b || !b.latest) { + return `
+

How bad is it?

+
Blast evidence unavailable
+
`; + } + const l = b.latest; + const o = b.opening; + const showDelta = o && blastImpactDiffers(o, l); + const usersFmt = u => (u === null || u === undefined) ? "?" : `${u} users`; + const row = (e, label) => `
${esc(label)}: ${nf.format(e.affected_requests || 0)} req · ${nf.format(e.affected_services || 0)} svc · ${esc(usersFmt(e.affected_users))}
`; + const top = (l.top_services && l.top_services.length) ? `
Top services: ${l.top_services.map(esc).join(", ")}
` : ""; + return `
+

How bad is it?

+ ${showDelta ? row(o, "At open") + row(l, "Now") : row(l, "Now")} + ${top} +
captured ${esc(ago(l.captured_at))}
+
`; + } function renderEvidence(items) { if (!items.length) return `
No incident evidence attached.
`; return `