From 224a9ba98bb0a2174865169b72b79ddc0417388c Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Mon, 4 May 2026 23:12:36 -0400
Subject: [PATCH 01/14] feat(runtime): Add production signals API and SQLite
storage
Introduce the production-context signal foundation. Add the
internal/signals package with the Signal domain type, validation rules,
single-record POST handler, Store interface, unavailable store fallback, and
retention loop. Signal JSON preserves unknown top-level fields while
server-owned signal_id and received_at are generated only after validation.
Add SQLite-backed signal persistence in coldstore via a new signals table
migration and SignalStore implementation. Signals can be inserted, queried by
service/env/source/reason/type/time window, ordered deterministically, and
pruned by retention cutoff. The storage implementation lives in coldstore so it
can reuse the existing SQLite reader/writer handles and migration ownership.
Wire POST /v1/signals into the ingest server behind write-scope auth. When
SQLITE_PATH is unset, the endpoint returns a structured 503 durability error
without affecting existing v2 read APIs. Add WAYLOG_SIGNAL_RETENTION startup
validation and start a retention janitor only when SQLite-backed signal storage
is available.
Add Prometheus counters for accepted signals, rejected signals by reason, and
retention-pruned signals. Document the new endpoint in OpenAPI and add the new
retention env var to docs/env.md.
---
cmd/ingest/main.go | 14 ++
docs/env.md | 1 +
docs/openapi.yaml | 118 +++++++++++
internal/coldstore/migrations/003_signals.sql | 20 ++
internal/coldstore/signal_store.go | 188 +++++++++++++++++
internal/coldstore/signal_store_test.go | 97 +++++++++
internal/metrics/metrics.go | 25 +++
internal/signals/handler.go | 118 +++++++++++
internal/signals/handler_test.go | 143 +++++++++++++
internal/signals/retention.go | 39 ++++
internal/signals/retention_test.go | 51 +++++
internal/signals/store.go | 40 ++++
internal/signals/types.go | 190 ++++++++++++++++++
internal/signals/types_test.go | 78 +++++++
internal/signals/validate.go | 74 +++++++
internal/signals/validate_test.go | 53 +++++
16 files changed, 1249 insertions(+)
create mode 100644 internal/coldstore/migrations/003_signals.sql
create mode 100644 internal/coldstore/signal_store.go
create mode 100644 internal/coldstore/signal_store_test.go
create mode 100644 internal/signals/handler.go
create mode 100644 internal/signals/handler_test.go
create mode 100644 internal/signals/retention.go
create mode 100644 internal/signals/retention_test.go
create mode 100644 internal/signals/store.go
create mode 100644 internal/signals/types.go
create mode 100644 internal/signals/types_test.go
create mode 100644 internal/signals/validate.go
create mode 100644 internal/signals/validate_test.go
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 52c700e..599993e 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -32,6 +32,7 @@ import (
"github.com/sssmaran/WaylogCLI/internal/metrics"
otelhttp "github.com/sssmaran/WaylogCLI/internal/otel"
"github.com/sssmaran/WaylogCLI/internal/persist"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
"github.com/sssmaran/WaylogCLI/internal/tools"
"github.com/sssmaran/WaylogCLI/internal/tracestore"
)
@@ -124,6 +125,11 @@ func main() {
graphUI := config.GetenvBool("GRAPH_UI", false)
otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
+ signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+ if signalRetention <= 0 {
+ slog.Error("WAYLOG_SIGNAL_RETENTION must be positive", "value", signalRetention)
+ os.Exit(1)
+ }
causalEnabled := config.GetenvBool("CAUSAL_ENABLED", false)
causalInterval := config.GetenvDuration("CAUSAL_INTERVAL", 30*time.Second)
@@ -188,6 +194,7 @@ func main() {
// Optional SQLite cold store
var coldDB coldstore.ManagedStore
var coldWriter *coldstore.BatchWriter
+ var signalStore signals.Store = signals.UnavailableStore{}
if sqlitePath != "" {
if eventLogDir == "" {
slog.Warn("SQLITE_PATH set without EVENT_LOG_DIR — cold store is async-only, " +
@@ -207,6 +214,7 @@ func main() {
FlushInterval: config.GetenvDuration("SQLITE_FLUSH_INTERVAL", 500*time.Millisecond),
}, m)
coldWriter.Start()
+ signalStore = coldstore.NewSignalStore(coldDB.(*coldstore.SQLiteStore))
slog.Info("coldstore enabled", "path", sqlitePath)
}
@@ -355,6 +363,8 @@ func main() {
}
mux.Handle("/v1/events", writeAuth(http.HandlerFunc(eventsV2.Events)))
mux.Handle("/v1/events/validate", writeAuth(http.HandlerFunc(eventsV2.Validate)))
+ signalHandler := signals.NewHandler(signalStore, m)
+ mux.Handle("/v1/signals", writeAuth(http.HandlerFunc(signalHandler.Signals)))
// OTLP/HTTP traces reuse the same schema-2.0 WAL and projector as the SDK path.
if otlpEnabled {
@@ -453,6 +463,10 @@ func main() {
)
defer stop()
+ if _, ok := signalStore.(*coldstore.SignalStore); ok {
+ go signals.RunRetention(ctx, signalStore, signalRetention, 5*time.Minute, m, slog.Default())
+ }
+
go func() {
slog.Info("ingest listening", "addr", addr, "graph_hot_window", graphHotWindow)
if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
diff --git a/docs/env.md b/docs/env.md
index f0c77ea..7ace4cb 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -56,6 +56,7 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
| `EVENT_LOG_SYNC` | `true` | Per-write fsync. Set `false` for dev/load testing |
| `EVENT_LOG_MAX_FILE_MB` | `50` | Rotation size. `0` disables rotation |
| `EVENT_LOG_RETENTION` | `72h` | Event log retention. Must be positive |
+| `WAYLOG_SIGNAL_RETENTION` | `72h` | Production-context signal retention. Must be positive. `/v1/signals` requires `SQLITE_PATH` |
| `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
| `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
| `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 9d874ee..151392b 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -19,6 +19,8 @@ tags:
description: Schema-2.0 event ingest and validation.
- name: OTLP
description: OTLP/HTTP trace ingest converted into schema-2.0 events.
+ - name: Signals
+ description: Production-context facts used by incident triage.
- name: Events
description: Direct event lookup and search.
- name: Traces
@@ -136,6 +138,64 @@ paths:
schema:
$ref: '#/components/schemas/IngestEnvelope'
+ /v1/signals:
+ post:
+ tags: [Signals]
+ operationId: ingestSignal
+ summary: Ingest one production-context signal
+ description: |
+ Accepts one low-volume production-context signal, such as a deploy,
+ dependency, runtime, healthcheck, config, or alert fact. Signals require
+ SQLite persistence and are used by the v2.1 incident engine.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Signal'
+ responses:
+ '201':
+ description: Signal accepted and persisted
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/SignalAccepted'
+ '400':
+ description: Invalid signal JSON or fields
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ReadError'
+ '401':
+ description: Unauthorized
+ '405':
+ description: Method Not Allowed
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ReadError'
+ '413':
+ description: Request body too large
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ReadError'
+ '503':
+ description: Signal storage unavailable; set SQLITE_PATH
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ReadError'
+ '500':
+ description: Internal signal storage error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ReadError'
+
/v1/otlp/v1/traces:
post:
tags: [OTLP]
@@ -911,6 +971,64 @@ components:
items:
$ref: '#/components/schemas/ErrorRef'
+ Signal:
+ type: object
+ additionalProperties: true
+ required: [type, source, service, env, severity, reason, timestamp]
+ example:
+ type: deploy
+ source: github-actions
+ service: checkout
+ env: prod
+ severity: info
+ reason: RolloutComplete
+ message: checkout 1.18.2 rolled out
+ metadata:
+ deployment_id: deploy_123
+ version: 1.18.2
+ timestamp: "2026-05-02T18:09:40Z"
+ properties:
+ signal_id:
+ type: string
+ readOnly: true
+ description: Server-generated signal id with sig_ prefix.
+ type:
+ type: string
+ enum: [deploy, runtime, healthcheck, dependency, config, alert]
+ source:
+ type: string
+ service:
+ type: string
+ env:
+ type: string
+ severity:
+ type: string
+ enum: [info, warning, critical]
+ reason:
+ type: string
+ message:
+ type: string
+ resource:
+ type: object
+ additionalProperties: true
+ metadata:
+ type: object
+ additionalProperties: true
+ timestamp:
+ type: string
+ format: date-time
+ received_at:
+ type: string
+ format: date-time
+ readOnly: true
+
+ SignalAccepted:
+ type: object
+ required: [signal]
+ properties:
+ signal:
+ $ref: '#/components/schemas/Signal'
+
Anchor:
type: object
required: [step, error_code]
diff --git a/internal/coldstore/migrations/003_signals.sql b/internal/coldstore/migrations/003_signals.sql
new file mode 100644
index 0000000..e33156b
--- /dev/null
+++ b/internal/coldstore/migrations/003_signals.sql
@@ -0,0 +1,20 @@
+-- 003_signals.sql: production-context signal storage.
+
+CREATE TABLE IF NOT EXISTS signals (
+ signal_id TEXT PRIMARY KEY,
+ type TEXT NOT NULL,
+ source TEXT NOT NULL,
+ service TEXT NOT NULL,
+ env TEXT NOT NULL,
+ severity TEXT NOT NULL,
+ reason TEXT NOT NULL,
+ message TEXT,
+ resource TEXT,
+ metadata TEXT,
+ extra TEXT,
+ timestamp TEXT NOT NULL,
+ received_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_signals_service_env_type_ts ON signals (service, env, type, timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_signals_ts ON signals (timestamp);
diff --git a/internal/coldstore/signal_store.go b/internal/coldstore/signal_store.go
new file mode 100644
index 0000000..f9856be
--- /dev/null
+++ b/internal/coldstore/signal_store.go
@@ -0,0 +1,188 @@
+package coldstore
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+)
+
+// SignalStore persists production-context signals in the signals table using
+// the reader and writer handles of an existing SQLiteStore.
+type SignalStore struct {
+	db *SQLiteStore // supplies the reader/writer handles used by every method
+}
+
+// NewSignalStore wraps db in a SignalStore. db is stored as given; it is not
+// checked for nil.
+func NewSignalStore(db *SQLiteStore) *SignalStore {
+	store := new(SignalStore)
+	store.db = db
+	return store
+}
+
+// Insert writes sig as one new row of the signals table through the writer
+// handle. Resource, Metadata and Extra go through marshalMap (JSON text, or
+// NULL when empty); Timestamp and ReceivedAt are stored in UTC using tsFormat.
+// Marshal and exec failures are returned wrapped with a "coldstore" prefix.
+func (s *SignalStore) Insert(ctx context.Context, sig *signals.Signal) error {
+	resourceJSON, err := marshalMap(sig.Resource)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal resource: %w", err)
+	}
+	metadataJSON, err := marshalMap(sig.Metadata)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal metadata: %w", err)
+	}
+	extraJSON, err := marshalMap(sig.Extra)
+	if err != nil {
+		return fmt.Errorf("coldstore signals marshal extra: %w", err)
+	}
+	eventTime := sig.Timestamp.UTC().Format(tsFormat)
+	receivedTime := sig.ReceivedAt.UTC().Format(tsFormat)
+	if _, err = s.db.writer.ExecContext(ctx, `
+ INSERT INTO signals (
+ signal_id, type, source, service, env, severity, reason, message,
+ resource, metadata, extra, timestamp, received_at
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		sig.SignalID, string(sig.Type), sig.Source, sig.Service, sig.Env, string(sig.Severity), sig.Reason, sig.Message,
+		resourceJSON, metadataJSON, extraJSON, eventTime, receivedTime,
+	); err != nil {
+		return fmt.Errorf("coldstore insert signal: %w", err)
+	}
+	return nil
+}
+
+func (s *SignalStore) Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error) {
+ if f.Limit <= 0 {
+ f.Limit = 200
+ }
+ if f.Limit > 200 {
+ f.Limit = 200
+ }
+ conds := []string{}
+ args := []any{}
+ if f.Service != "" {
+ conds = append(conds, "service = ?")
+ args = append(args, f.Service)
+ }
+ if f.Env != "" {
+ conds = append(conds, "env = ?")
+ args = append(args, f.Env)
+ }
+ if f.Source != "" {
+ conds = append(conds, "source = ?")
+ args = append(args, f.Source)
+ }
+ if f.Reason != "" {
+ conds = append(conds, "reason = ?")
+ args = append(args, f.Reason)
+ }
+ if len(f.Types) > 0 {
+ placeholders := make([]string, 0, len(f.Types))
+ for _, typ := range f.Types {
+ placeholders = append(placeholders, "?")
+ args = append(args, string(typ))
+ }
+ conds = append(conds, "type IN ("+strings.Join(placeholders, ", ")+")")
+ }
+ if !f.Since.IsZero() {
+ conds = append(conds, "timestamp >= ?")
+ args = append(args, f.Since.UTC().Format(tsFormat))
+ }
+ if !f.Until.IsZero() {
+ conds = append(conds, "timestamp <= ?")
+ args = append(args, f.Until.UTC().Format(tsFormat))
+ }
+ where := ""
+ if len(conds) > 0 {
+ where = "WHERE " + strings.Join(conds, " AND ")
+ }
+ query := fmt.Sprintf(`SELECT signal_id, type, source, service, env, severity, reason,
+ COALESCE(message, ''), COALESCE(resource, ''), COALESCE(metadata, ''), COALESCE(extra, ''),
+ timestamp, received_at
+ FROM signals %s ORDER BY timestamp DESC, signal_id DESC LIMIT ?`, where)
+ args = append(args, f.Limit)
+ rows, err := s.db.reader.QueryContext(ctx, query, args...)
+ if err != nil {
+ return nil, fmt.Errorf("coldstore query signals: %w", err)
+ }
+ defer rows.Close()
+ out := make([]signals.Signal, 0, f.Limit)
+ for rows.Next() {
+ sig, err := scanSignal(rows)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, sig)
+ }
+ if err := rows.Err(); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+// PruneOlderThan deletes every signal whose stored timestamp is strictly before
+// cutoff (converted to UTC and formatted with tsFormat) and returns how many
+// rows the writer handle reports as affected.
+func (s *SignalStore) PruneOlderThan(ctx context.Context, cutoff time.Time) (int, error) {
+	before := cutoff.UTC().Format(tsFormat)
+	result, err := s.db.writer.ExecContext(ctx, `DELETE FROM signals WHERE timestamp < ?`, before)
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune signals: %w", err)
+	}
+	removed, err := result.RowsAffected()
+	if err != nil {
+		return 0, fmt.Errorf("coldstore prune signals rows affected: %w", err)
+	}
+	return int(removed), nil
+}
+
+// scanSignal decodes one signals row into a signals.Signal. The destination
+// order matches the column list selected by Query. rows only needs a Scan
+// method.
+//
+// timestamp and received_at are parsed with tsFormat; resource, metadata and
+// extra are decoded with unmarshalMap, so empty column text becomes a nil map.
+// On any failure the zero Signal is returned together with a wrapped error.
+func scanSignal(rows interface {
+	Scan(dest ...any) error
+}) (signals.Signal, error) {
+	var sig signals.Signal
+	var typ, severity, timestamp, receivedAt string
+	var resource, metadata, extra string
+	if err := rows.Scan(
+		&sig.SignalID, &typ, &sig.Source, &sig.Service, &sig.Env, &severity, &sig.Reason,
+		&sig.Message, &resource, &metadata, &extra, &timestamp, &receivedAt,
+	); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore scan signal: %w", err)
+	}
+	sig.Type = signals.Type(typ)
+	sig.Severity = signals.Severity(severity)
+	ts, err := time.Parse(tsFormat, timestamp)
+	if err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal timestamp: %w", err)
+	}
+	sig.Timestamp = ts
+	recv, err := time.Parse(tsFormat, receivedAt)
+	if err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal received_at: %w", err)
+	}
+	sig.ReceivedAt = recv
+	if sig.Resource, err = unmarshalMap(resource); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal resource: %w", err)
+	}
+	if sig.Metadata, err = unmarshalMap(metadata); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal metadata: %w", err)
+	}
+	if sig.Extra, err = unmarshalMap(extra); err != nil {
+		return signals.Signal{}, fmt.Errorf("coldstore signal extra: %w", err)
+	}
+	return sig, nil
+}
+
+func marshalMap(m map[string]any) (sql.NullString, error) {
+ if len(m) == 0 {
+ return sql.NullString{}, nil
+ }
+ b, err := json.Marshal(m)
+ if err != nil {
+ return sql.NullString{}, err
+ }
+ return sql.NullString{String: string(b), Valid: true}, nil
+}
+
+// unmarshalMap decodes JSON column text back into a map. The empty string
+// (what Query's COALESCE(col, '') produces for NULL) decodes to a nil map;
+// anything else is handed to json.Unmarshal and its error is returned as is.
+func unmarshalMap(raw string) (map[string]any, error) {
+	if len(raw) == 0 {
+		return nil, nil
+	}
+	decoded := make(map[string]any)
+	if err := json.Unmarshal([]byte(raw), &decoded); err != nil {
+		return nil, err
+	}
+	return decoded, nil
+}
+
+var _ signals.Store = (*SignalStore)(nil)
diff --git a/internal/coldstore/signal_store_test.go b/internal/coldstore/signal_store_test.go
new file mode 100644
index 0000000..7d85850
--- /dev/null
+++ b/internal/coldstore/signal_store_test.go
@@ -0,0 +1,97 @@
+package coldstore
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+)
+
+// TestSignalStoreInsertQueryAndPrune inserts three signals and checks a fully
+// filtered query, JSON round-tripping of Metadata/Extra, result ordering and
+// pruning by cutoff.
+func TestSignalStoreInsertQueryAndPrune(t *testing.T) {
+	ctx := context.Background()
+	sigStore := NewSignalStore(newSignalTestStore(t))
+	base := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	fixtures := []signals.Signal{
+		testSignal("sig_a", signals.TypeDeploy, "github", "checkout", "prod", "RolloutComplete", base.Add(-time.Minute)),
+		testSignal("sig_b", signals.TypeDependency, "statuspage", "payment", "prod", "Provider5xx", base),
+		testSignal("sig_c", signals.TypeDeploy, "github", "checkout", "staging", "RolloutComplete", base.Add(-2*time.Minute)),
+	}
+	for i := range fixtures {
+		if err := sigStore.Insert(ctx, &fixtures[i]); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Every filter field set: only sig_a matches (sig_b is another service,
+	// sig_c another env).
+	exact := signals.Filter{
+		Service: "checkout",
+		Env:     "prod",
+		Source:  "github",
+		Reason:  "RolloutComplete",
+		Types:   []signals.Type{signals.TypeDeploy},
+		Since:   base.Add(-2 * time.Minute),
+		Until:   base,
+		Limit:   10,
+	}
+	got, err := sigStore.Query(ctx, exact)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 1 || got[0].SignalID != "sig_a" {
+		t.Fatalf("got=%+v", got)
+	}
+	if got[0].Metadata["version"] != "1.2.3" || got[0].Extra["custom_tag"] != "alpha" {
+		t.Fatalf("metadata/extra not round-tripped: %+v", got[0])
+	}
+
+	// Env-only filter: both prod signals, newest timestamp first.
+	got, err = sigStore.Query(ctx, signals.Filter{Env: "prod", Limit: 10})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 2 || got[0].SignalID != "sig_b" || got[1].SignalID != "sig_a" {
+		t.Fatalf("ordering got=%+v", got)
+	}
+
+	// A cutoff 30s before base removes sig_a and sig_c and keeps sig_b.
+	deleted, err := sigStore.PruneOlderThan(ctx, base.Add(-30*time.Second))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if deleted != 2 {
+		t.Fatalf("deleted=%d want 2", deleted)
+	}
+	got, err = sigStore.Query(ctx, signals.Filter{Limit: 10})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(got) != 1 || got[0].SignalID != "sig_b" {
+		t.Fatalf("after prune got=%+v", got)
+	}
+}
+
+// newSignalTestStore opens the ":memory:" store, registers its Close as test
+// cleanup and fails the test unless the result is a *SQLiteStore.
+func newSignalTestStore(t *testing.T) *SQLiteStore {
+	t.Helper()
+	managed, err := Open(":memory:")
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = managed.Close() })
+	if sqlite, isSQLite := managed.(*SQLiteStore); isSQLite {
+		return sqlite
+	}
+	t.Fatalf("store type=%T", managed)
+	return nil // unreachable: Fatalf stops the test goroutine
+}
+
+func testSignal(id string, typ signals.Type, source, service, env, reason string, ts time.Time) signals.Signal {
+ return signals.Signal{
+ SignalID: id,
+ Type: typ,
+ Source: source,
+ Service: service,
+ Env: env,
+ Severity: signals.SeverityInfo,
+ Reason: reason,
+ Metadata: map[string]any{"version": "1.2.3"},
+ Extra: map[string]any{"custom_tag": "alpha"},
+ Timestamp: ts,
+ ReceivedAt: ts.Add(time.Second),
+ }
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index fd59cf2..a654cf8 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -67,6 +67,10 @@ type Metrics struct {
DeployUpsertsTotal prometheus.Counter
DeployUpsertErrors prometheus.Counter
+ SignalsAccepted prometheus.Counter
+ SignalsRejected *prometheus.CounterVec
+ SignalRetentionPruned prometheus.Counter
+
CausalRunsTotal prometheus.Counter
CausalRunDuration prometheus.Histogram
CausalRunFailures prometheus.Counter
@@ -316,6 +320,26 @@ func New(reg *prometheus.Registry) *Metrics {
Help: "Failed deployment upserts (non-env-conflict).",
})
+ m.SignalsAccepted = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_signals_accepted_total",
+ Help: "Production-context signals accepted into durable storage.",
+ })
+ m.SignalsRejected = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Name: "waylog_signals_rejected_total",
+ Help: "Production-context signals rejected by reason.",
+ }, []string{"reason"})
+ for _, reason := range []string{
+ "invalid_field", "unknown_type", "unknown_severity", "timestamp_too_far_in_future",
+ "body_oversize", "invalid_body", "invalid_json", "unsupported_method",
+ "durability_unavailable", "internal_error",
+ } {
+ m.SignalsRejected.WithLabelValues(reason).Add(0)
+ }
+ m.SignalRetentionPruned = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_signal_retention_pruned_total",
+ Help: "Production-context signals pruned by retention.",
+ })
+
m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "waylog_causal_runs_total",
Help: "Total causal inference runs.",
@@ -392,6 +416,7 @@ func New(reg *prometheus.Registry) *Metrics {
m.ToolDirectCallsTotal, m.DedupReplayTotal, m.DedupCacheSize,
m.ColdEventsWritten, m.ColdEventsDropped, m.ColdBatchLatency,
m.DeployUpsertsTotal, m.DeployUpsertErrors,
+ m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,
diff --git a/internal/signals/handler.go b/internal/signals/handler.go
new file mode 100644
index 0000000..a148d3f
--- /dev/null
+++ b/internal/signals/handler.go
@@ -0,0 +1,118 @@
+package signals
+
+import (
+ "encoding/json"
+ "errors"
+ "io"
+ "net/http"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/metrics"
+)
+
+const defaultMaxBodyBytes int64 = 1 << 20
+
+// Handler serves the single-record signal ingest endpoint.
+type Handler struct {
+	store        Store            // destination for accepted signals; never nil after NewHandler
+	metrics      *metrics.Metrics // optional; nil disables the accepted/rejected counters
+	now          func() time.Time // clock used for validation and received_at; tests replace it
+	futureSkew   time.Duration    // passed to Validate together with the current time
+	maxBodyBytes int64            // request body cap enforced by readBody
+}
+
+// NewHandler returns a Handler that uses time.Now, a five-minute future skew
+// and the default body limit. A nil store is replaced by UnavailableStore so
+// inserts fail with ErrUnavailable instead of panicking. m may be nil.
+func NewHandler(store Store, m *metrics.Metrics) *Handler {
+	h := &Handler{
+		store:        UnavailableStore{},
+		metrics:      m,
+		now:          time.Now,
+		futureSkew:   5 * time.Minute,
+		maxBodyBytes: defaultMaxBodyBytes,
+	}
+	if store != nil {
+		h.store = store
+	}
+	return h
+}
+
+// Signals handles POST requests carrying one signal as JSON.
+//
+// Flow: reject non-POST methods (405), read the capped body, decode it,
+// validate it against the current time and futureSkew, assign the
+// server-owned SignalID and ReceivedAt, then insert it into the store.
+// Decode and validation failures answer 400; ErrUnavailable from the store
+// answers 503; any other store error answers 500 with no detail. On success
+// the accepted counter is incremented (when metrics are configured) and the
+// stored signal is returned with status 201 under the "signal" key.
+func (h *Handler) Signals(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		h.reject(w, http.StatusMethodNotAllowed, CodeUnsupportedMethod, "method not allowed", "")
+		return
+	}
+
+	body, ok := h.readBody(w, r)
+	if !ok {
+		return
+	}
+
+	// badRequest answers 400. A *ValidationError in err's chain supplies the
+	// code and detail; otherwise the given fallback code and message are used.
+	badRequest := func(err error, fallbackCode, fallbackMessage string) {
+		var verr *ValidationError
+		if errors.As(err, &verr) {
+			h.reject(w, http.StatusBadRequest, verr.Code, "invalid signal", verr.Error())
+			return
+		}
+		h.reject(w, http.StatusBadRequest, fallbackCode, fallbackMessage, err.Error())
+	}
+
+	var signal Signal
+	if err := json.Unmarshal(body, &signal); err != nil {
+		badRequest(err, CodeInvalidJSON, "invalid json")
+		return
+	}
+	now := h.now().UTC()
+	if err := Validate(&signal, now, h.futureSkew); err != nil {
+		badRequest(err, CodeInvalidField, "invalid signal")
+		return
+	}
+
+	// Server-owned fields are assigned only once the payload is valid.
+	signal.SignalID = NewSignalID()
+	signal.ReceivedAt = now
+	switch err := h.store.Insert(r.Context(), &signal); {
+	case err == nil:
+		// stored; fall through to the success response
+	case errors.Is(err, ErrUnavailable):
+		h.reject(w, http.StatusServiceUnavailable, CodeDurabilityUnavailable, "signals unavailable", "set SQLITE_PATH to enable signals")
+		return
+	default:
+		h.reject(w, http.StatusInternalServerError, CodeInternalError, "internal error", "")
+		return
+	}
+
+	if h.metrics != nil {
+		h.metrics.SignalsAccepted.Inc()
+	}
+	writeJSON(w, http.StatusCreated, map[string]Signal{"signal": signal})
+}
+
+// readBody reads the whole request body, capped at h.maxBodyBytes by
+// http.MaxBytesReader. On failure it writes the error response itself (413 for
+// an oversize body, 400 for any other read error) and returns false.
+// NOTE(review): the 413 detail text says "1 MB" even when maxBodyBytes has
+// been changed from the default.
+func (h *Handler) readBody(w http.ResponseWriter, r *http.Request) ([]byte, bool) {
+	r.Body = http.MaxBytesReader(w, r.Body, h.maxBodyBytes)
+	body, err := io.ReadAll(r.Body)
+	if err == nil {
+		return body, true
+	}
+	var tooLarge *http.MaxBytesError
+	if errors.As(err, &tooLarge) {
+		h.reject(w, http.StatusRequestEntityTooLarge, CodeBodyOversize, "body too large", "request body exceeds 1 MB")
+	} else {
+		h.reject(w, http.StatusBadRequest, CodeInvalidBody, "invalid body", err.Error())
+	}
+	return nil, false
+}
+
+// reject counts the rejection under code (when metrics are configured) and
+// writes the JSON error envelope with the given HTTP status.
+func (h *Handler) reject(w http.ResponseWriter, status int, code, message, detail string) {
+	if m := h.metrics; m != nil {
+		m.SignalsRejected.WithLabelValues(code).Inc()
+	}
+	payload := errorResponse{Error: readError{Code: code, Message: message, Detail: detail}}
+	writeJSON(w, status, payload)
+}
+
+// JSON error envelope written by reject.
+type (
+	// errorResponse is the top-level body: {"error": {...}}.
+	errorResponse struct {
+		Error readError `json:"error"`
+	}
+
+	// readError holds the error code, a short message and an optional detail
+	// that is omitted from the JSON when empty.
+	readError struct {
+		Code    string `json:"code"`
+		Message string `json:"message"`
+		Detail  string `json:"detail,omitempty"`
+	}
+)
+
+// writeJSON sets the JSON content type, sends status and encodes v as the
+// response body.
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	// The status line is already sent at this point, so an encode or write
+	// error cannot be reported to the client and is deliberately dropped.
+	enc := json.NewEncoder(w)
+	_ = enc.Encode(v)
+}
diff --git a/internal/signals/handler_test.go b/internal/signals/handler_test.go
new file mode 100644
index 0000000..f52189a
--- /dev/null
+++ b/internal/signals/handler_test.go
@@ -0,0 +1,143 @@
+package signals
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+)
+
+// TestHandlerSignals posts one valid signal and checks the 201 status, the
+// server-assigned fields, that the unknown top-level field lands in Extra and
+// that the response carries the stored signal id.
+func TestHandlerSignals(t *testing.T) {
+	fixedNow := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	store := &fakeStore{}
+	h := NewHandler(store, nil)
+	h.now = func() time.Time { return fixedNow }
+
+	const body = `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z","custom_tag":"foo"}`
+	rec := httptest.NewRecorder()
+	h.Signals(rec, httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body)))
+	if rec.Code != http.StatusCreated {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if n := len(store.inserted); n != 1 {
+		t.Fatalf("inserted=%d", n)
+	}
+	stored := store.inserted[0]
+	if stored.SignalID == "" || stored.ReceivedAt.IsZero() {
+		t.Fatalf("server fields not set: %+v", stored)
+	}
+	if stored.Extra["custom_tag"] != "foo" {
+		t.Fatalf("extra=%+v", stored.Extra)
+	}
+
+	var resp struct {
+		Signal Signal `json:"signal"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatal(err)
+	}
+	if resp.Signal.SignalID != stored.SignalID {
+		t.Fatalf("response id=%q inserted=%q", resp.Signal.SignalID, stored.SignalID)
+	}
+}
+
+// TestHandlerRejectsInvalidSignals feeds malformed or invalid payloads to the
+// handler and checks the status and error code of each rejection. The clock
+// is pinned so the "future" case is deterministic.
+func TestHandlerRejectsInvalidSignals(t *testing.T) {
+	now := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+	clock := func() time.Time { return now }
+	cases := []struct {
+		name   string
+		body   string
+		status int
+		code   string
+	}{
+		{name: "invalid json", body: `{`, status: http.StatusBadRequest, code: CodeInvalidJSON},
+		{name: "missing service", body: `{"type":"deploy","source":"github","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: http.StatusBadRequest, code: CodeInvalidField},
+		{name: "unknown type", body: `{"type":"wrong","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: http.StatusBadRequest, code: CodeUnknownType},
+		{name: "unknown severity", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"huge","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`, status: http.StatusBadRequest, code: CodeUnknownSeverity},
+		{name: "future", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T20:00:00Z"}`, status: http.StatusBadRequest, code: CodeTimestampTooFarInFuture},
+		{name: "non object resource", body: `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z","resource":"bad"}`, status: http.StatusBadRequest, code: CodeInvalidField},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			h := NewHandler(&fakeStore{}, nil)
+			h.now = clock
+			rec := httptest.NewRecorder()
+			h.Signals(rec, httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(tc.body)))
+			assertError(t, rec, tc.status, tc.code)
+		})
+	}
+}
+
+// TestHandlerRejectsMethod expects a GET to be answered with 405 and the
+// unsupported-method error code.
+func TestHandlerRejectsMethod(t *testing.T) {
+	rec := httptest.NewRecorder()
+	get := httptest.NewRequest(http.MethodGet, "/v1/signals", nil)
+	NewHandler(UnavailableStore{}, nil).Signals(rec, get)
+	assertError(t, rec, http.StatusMethodNotAllowed, CodeUnsupportedMethod)
+}
+
+// TestHandlerRejectsOversizeBody lowers the body limit to 8 bytes and expects
+// a longer payload to be answered with 413 and the oversize error code.
+func TestHandlerRejectsOversizeBody(t *testing.T) {
+	h := NewHandler(UnavailableStore{}, nil)
+	h.maxBodyBytes = 8
+	rec := httptest.NewRecorder()
+	post := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(`{"too":"large"}`))
+	h.Signals(rec, post)
+	assertError(t, rec, http.StatusRequestEntityTooLarge, CodeBodyOversize)
+}
+
+func TestHandlerReportsStoreUnavailable(t *testing.T) {
+ h := NewHandler(UnavailableStore{}, nil)
+ h.now = func() time.Time { return time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC) }
+ body := `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`
+ req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body))
+ rec := httptest.NewRecorder()
+ h.Signals(rec, req)
+ assertError(t, rec, http.StatusServiceUnavailable, CodeDurabilityUnavailable)
+}
+
+func TestHandlerReportsStoreError(t *testing.T) {
+ h := NewHandler(&fakeStore{err: errors.New("boom")}, nil)
+ h.now = func() time.Time { return time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC) }
+ body := `{"type":"deploy","source":"github","service":"checkout","env":"prod","severity":"info","reason":"RolloutComplete","timestamp":"2026-05-02T17:59:00Z"}`
+ req := httptest.NewRequest(http.MethodPost, "/v1/signals", bytes.NewBufferString(body))
+ rec := httptest.NewRecorder()
+ h.Signals(rec, req)
+ assertError(t, rec, http.StatusInternalServerError, CodeInternalError)
+}
+
+// assertError fails the test unless rec holds the wanted HTTP status and a
+// JSON error envelope whose error.code equals code.
+func assertError(t *testing.T, rec *httptest.ResponseRecorder, status int, code string) {
+	t.Helper()
+	raw := rec.Body.String()
+	if got := rec.Code; got != status {
+		t.Fatalf("status=%d want %d body=%s", got, status, raw)
+	}
+	var envelope errorResponse
+	if err := json.Unmarshal(rec.Body.Bytes(), &envelope); err != nil {
+		t.Fatal(err)
+	}
+	if got := envelope.Error.Code; got != code {
+		t.Fatalf("code=%q want %q body=%s", got, code, raw)
+	}
+}
+
+// fakeStore is an in-memory Store for handler tests. Only Insert is usable;
+// Query and PruneOlderThan always return an "unused" error.
+type fakeStore struct {
+	inserted []Signal // copies of every signal accepted by Insert
+	err      error    // when non-nil, Insert returns it and records nothing
+}
+
+// Insert records a copy of sig unless the store is configured to fail.
+func (s *fakeStore) Insert(_ context.Context, sig *Signal) error {
+	if s.err == nil {
+		s.inserted = append(s.inserted, *sig)
+	}
+	return s.err
+}
+
+// Query is not exercised by the handler tests.
+func (s *fakeStore) Query(context.Context, Filter) ([]Signal, error) {
+	var none []Signal
+	return none, errors.New("unused")
+}
+
+// PruneOlderThan is not exercised by the handler tests.
+func (s *fakeStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	const pruned = 0
+	return pruned, errors.New("unused")
+}
diff --git a/internal/signals/retention.go b/internal/signals/retention.go
new file mode 100644
index 0000000..4246c1a
--- /dev/null
+++ b/internal/signals/retention.go
@@ -0,0 +1,39 @@
+package signals
+
+import (
+ "context"
+ "log/slog"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/metrics"
+)
+
+// RunRetention blocks until ctx is cancelled, calling store.PruneOlderThan with
+// a cutoff of now-retention (UTC) on every tick of interval. The first prune
+// therefore happens one full interval after the call. It returns immediately
+// when store is nil or when retention or interval is not positive.
+//
+// A nil log falls back to slog.Default and a nil m disables the pruned
+// counter. Prune errors are logged at warn level and the loop keeps running;
+// successful prunes are counted and logged only when rows were deleted.
+func RunRetention(ctx context.Context, store Store, retention, interval time.Duration, m *metrics.Metrics, log *slog.Logger) {
+	if store == nil || retention <= 0 || interval <= 0 {
+		return
+	}
+	if log == nil {
+		log = slog.Default()
+	}
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-tick.C:
+		}
+		cutoff := time.Now().UTC().Add(-retention)
+		deleted, err := store.PruneOlderThan(ctx, cutoff)
+		switch {
+		case err != nil:
+			log.Warn("signals retention prune failed", "err", err)
+		case deleted > 0:
+			if m != nil {
+				m.SignalRetentionPruned.Add(float64(deleted))
+			}
+			log.Info("signals retention pruned", "deleted", deleted, "cutoff", cutoff)
+		}
+	}
+}
diff --git a/internal/signals/retention_test.go b/internal/signals/retention_test.go
new file mode 100644
index 0000000..c56075b
--- /dev/null
+++ b/internal/signals/retention_test.go
@@ -0,0 +1,51 @@
+package signals
+
+import (
+ "context"
+ "log/slog"
+ "sync/atomic"
+ "testing"
+ "time"
+)
+
+func TestRunRetentionPrunesAndStops(t *testing.T) {
+ store := &retentionStore{}
+ ctx, cancel := context.WithCancel(context.Background())
+ done := make(chan struct{})
+ go func() {
+ RunRetention(ctx, store, time.Minute, time.Millisecond, nil, slog.Default())
+ close(done)
+ }()
+ deadline := time.After(time.Second)
+ for {
+ if store.calls() > 0 {
+ break
+ }
+ select {
+ case <-deadline:
+ t.Fatal("retention did not call prune")
+ default:
+ time.Sleep(time.Millisecond)
+ }
+ }
+ cancel()
+ select {
+ case <-done:
+ case <-time.After(time.Second):
+ t.Fatal("retention did not stop")
+ }
+}
+
+type retentionStore struct {
+ n atomic.Int64
+}
+
+func (s *retentionStore) Insert(context.Context, *Signal) error { return nil }
+func (s *retentionStore) Query(context.Context, Filter) ([]Signal, error) {
+ return nil, nil
+}
+func (s *retentionStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+ s.n.Add(1)
+ return 1, nil
+}
+func (s *retentionStore) calls() int { return int(s.n.Load()) }
diff --git a/internal/signals/store.go b/internal/signals/store.go
new file mode 100644
index 0000000..b6fdc18
--- /dev/null
+++ b/internal/signals/store.go
@@ -0,0 +1,40 @@
+package signals
+
+import (
+ "context"
+ "errors"
+ "time"
+)
+
+var ErrUnavailable = errors.New("signals: store unavailable")
+
+type Store interface {
+ Insert(ctx context.Context, s *Signal) error
+ Query(ctx context.Context, f Filter) ([]Signal, error)
+ PruneOlderThan(ctx context.Context, cutoff time.Time) (int, error)
+}
+
+type Filter struct {
+ Service string
+ Env string
+ Source string
+ Reason string
+ Types []Type
+ Since time.Time
+ Until time.Time
+ Limit int
+}
+
+type UnavailableStore struct{}
+
+func (UnavailableStore) Insert(context.Context, *Signal) error {
+ return ErrUnavailable
+}
+
+func (UnavailableStore) Query(context.Context, Filter) ([]Signal, error) {
+ return nil, ErrUnavailable
+}
+
+func (UnavailableStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+ return 0, nil
+}
diff --git a/internal/signals/types.go b/internal/signals/types.go
new file mode 100644
index 0000000..8a27024
--- /dev/null
+++ b/internal/signals/types.go
@@ -0,0 +1,190 @@
+package signals
+
+import (
+ "encoding/json"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/google/uuid"
+)
+
+type Type string
+
+const (
+ TypeDeploy Type = "deploy"
+ TypeRuntime Type = "runtime"
+ TypeHealthcheck Type = "healthcheck"
+ TypeDependency Type = "dependency"
+ TypeConfig Type = "config"
+ TypeAlert Type = "alert"
+)
+
+func (t Type) Valid() bool {
+ switch t {
+ case TypeDeploy, TypeRuntime, TypeHealthcheck, TypeDependency, TypeConfig, TypeAlert:
+ return true
+ default:
+ return false
+ }
+}
+
+type Severity string
+
+const (
+ SeverityInfo Severity = "info"
+ SeverityWarning Severity = "warning"
+ SeverityCritical Severity = "critical"
+)
+
+func (s Severity) Valid() bool {
+ switch s {
+ case SeverityInfo, SeverityWarning, SeverityCritical:
+ return true
+ default:
+ return false
+ }
+}
+
+type Signal struct {
+ SignalID string `json:"signal_id"`
+ Type Type `json:"type"`
+ Source string `json:"source"`
+ Service string `json:"service"`
+ Env string `json:"env"`
+ Severity Severity `json:"severity"`
+ Reason string `json:"reason"`
+ Message string `json:"message,omitempty"`
+ Resource map[string]any `json:"resource,omitempty"`
+ Metadata map[string]any `json:"metadata,omitempty"`
+ Timestamp time.Time `json:"timestamp"`
+ ReceivedAt time.Time `json:"received_at"`
+ Extra map[string]any `json:"-"`
+}
+
+func NewSignalID() string {
+ return "sig_" + strings.ReplaceAll(uuid.NewString(), "-", "")
+}
+
+func (s *Signal) UnmarshalJSON(b []byte) error {
+ var raw map[string]json.RawMessage
+ if err := json.Unmarshal(b, &raw); err != nil {
+ return err
+ }
+ *s = Signal{}
+ extra := map[string]any{}
+ for key, value := range raw {
+ switch key {
+ case "signal_id":
+ if err := json.Unmarshal(value, &s.SignalID); err != nil {
+ return fmt.Errorf("signal_id: %w", err)
+ }
+ case "type":
+ var v string
+ if err := json.Unmarshal(value, &v); err != nil {
+ return fmt.Errorf("type: %w", err)
+ }
+ s.Type = Type(v)
+ case "source":
+ if err := json.Unmarshal(value, &s.Source); err != nil {
+ return fmt.Errorf("source: %w", err)
+ }
+ case "service":
+ if err := json.Unmarshal(value, &s.Service); err != nil {
+ return fmt.Errorf("service: %w", err)
+ }
+ case "env":
+ if err := json.Unmarshal(value, &s.Env); err != nil {
+ return fmt.Errorf("env: %w", err)
+ }
+ case "severity":
+ var v string
+ if err := json.Unmarshal(value, &v); err != nil {
+ return fmt.Errorf("severity: %w", err)
+ }
+ s.Severity = Severity(v)
+ case "reason":
+ if err := json.Unmarshal(value, &s.Reason); err != nil {
+ return fmt.Errorf("reason: %w", err)
+ }
+ case "message":
+ if err := json.Unmarshal(value, &s.Message); err != nil {
+ return fmt.Errorf("message: %w", err)
+ }
+ case "resource":
+ resource, err := decodeObject(value)
+ if err != nil {
+ return invalidField("resource", "resource must be an object")
+ }
+ s.Resource = resource
+ case "metadata":
+ metadata, err := decodeObject(value)
+ if err != nil {
+ return invalidField("metadata", "metadata must be an object")
+ }
+ s.Metadata = metadata
+ case "timestamp":
+ if err := json.Unmarshal(value, &s.Timestamp); err != nil {
+ return fmt.Errorf("timestamp: %w", err)
+ }
+ case "received_at":
+ if err := json.Unmarshal(value, &s.ReceivedAt); err != nil {
+ return fmt.Errorf("received_at: %w", err)
+ }
+ default:
+ var v any
+ if err := json.Unmarshal(value, &v); err != nil {
+ return fmt.Errorf("%s: %w", key, err)
+ }
+ extra[key] = v
+ }
+ }
+ if len(extra) > 0 {
+ s.Extra = extra
+ }
+ return nil
+}
+
+func (s Signal) MarshalJSON() ([]byte, error) {
+ out := map[string]any{}
+ for key, value := range s.Extra {
+ out[key] = value
+ }
+ out["signal_id"] = s.SignalID
+ out["type"] = s.Type
+ out["source"] = s.Source
+ out["service"] = s.Service
+ out["env"] = s.Env
+ out["severity"] = s.Severity
+ out["reason"] = s.Reason
+ if s.Message != "" {
+ out["message"] = s.Message
+ }
+ if s.Resource != nil {
+ out["resource"] = s.Resource
+ }
+ if s.Metadata != nil {
+ out["metadata"] = s.Metadata
+ }
+ if !s.Timestamp.IsZero() {
+ out["timestamp"] = s.Timestamp
+ }
+ if !s.ReceivedAt.IsZero() {
+ out["received_at"] = s.ReceivedAt
+ }
+ return json.Marshal(out)
+}
+
+func decodeObject(raw json.RawMessage) (map[string]any, error) {
+ if string(raw) == "null" {
+ return nil, nil
+ }
+ var out map[string]any
+ if err := json.Unmarshal(raw, &out); err != nil {
+ return nil, err
+ }
+ if out == nil {
+ return nil, fmt.Errorf("must be an object")
+ }
+ return out, nil
+}
diff --git a/internal/signals/types_test.go b/internal/signals/types_test.go
new file mode 100644
index 0000000..6970d38
--- /dev/null
+++ b/internal/signals/types_test.go
@@ -0,0 +1,78 @@
+package signals
+
+import (
+ "encoding/json"
+ "errors"
+ "testing"
+ "time"
+)
+
+func TestSignalJSONPreservesExtraAndOverridesServerFields(t *testing.T) {
+ raw := []byte(`{
+ "signal_id":"client",
+ "type":"deploy",
+ "source":"github",
+ "service":"checkout",
+ "env":"prod",
+ "severity":"info",
+ "reason":"RolloutComplete",
+ "metadata":{"version":"1.2.3"},
+ "timestamp":"2026-05-02T18:09:40Z",
+ "received_at":"2026-05-02T18:09:41Z",
+ "custom_tag":"foo"
+ }`)
+ var sig Signal
+ if err := json.Unmarshal(raw, &sig); err != nil {
+ t.Fatal(err)
+ }
+ if got := sig.Extra["custom_tag"]; got != "foo" {
+ t.Fatalf("custom_tag=%v", got)
+ }
+ sig.SignalID = "sig_server"
+ sig.ReceivedAt = time.Date(2026, 5, 2, 18, 9, 42, 0, time.UTC)
+ out, err := json.Marshal(sig)
+ if err != nil {
+ t.Fatal(err)
+ }
+ var decoded map[string]any
+ if err := json.Unmarshal(out, &decoded); err != nil {
+ t.Fatal(err)
+ }
+ if decoded["signal_id"] != "sig_server" {
+ t.Fatalf("signal_id=%v", decoded["signal_id"])
+ }
+ if decoded["custom_tag"] != "foo" {
+ t.Fatalf("custom_tag=%v", decoded["custom_tag"])
+ }
+}
+
+func TestTypeAndSeverityValidity(t *testing.T) {
+ for _, typ := range []Type{TypeDeploy, TypeRuntime, TypeHealthcheck, TypeDependency, TypeConfig, TypeAlert} {
+ if !typ.Valid() {
+ t.Fatalf("%q should be valid", typ)
+ }
+ }
+ if Type("bad").Valid() {
+ t.Fatal("bad type should be invalid")
+ }
+ for _, severity := range []Severity{SeverityInfo, SeverityWarning, SeverityCritical} {
+ if !severity.Valid() {
+ t.Fatalf("%q should be valid", severity)
+ }
+ }
+ if Severity("huge").Valid() {
+ t.Fatal("bad severity should be invalid")
+ }
+}
+
+func TestSignalJSONRejectsNonObjectResource(t *testing.T) {
+ var sig Signal
+ err := json.Unmarshal([]byte(`{"resource":"bad"}`), &sig)
+ if err == nil {
+ t.Fatal("expected error")
+ }
+ var validation *ValidationError
+ if !errors.As(err, &validation) || validation.Code != CodeInvalidField {
+ t.Fatalf("err=%T %[1]v", err)
+ }
+}
diff --git a/internal/signals/validate.go b/internal/signals/validate.go
new file mode 100644
index 0000000..b8dcb2f
--- /dev/null
+++ b/internal/signals/validate.go
@@ -0,0 +1,74 @@
+package signals
+
+import (
+ "fmt"
+ "strings"
+ "time"
+)
+
+const (
+ CodeInvalidField = "invalid_field"
+ CodeUnknownType = "unknown_type"
+ CodeUnknownSeverity = "unknown_severity"
+ CodeTimestampTooFarInFuture = "timestamp_too_far_in_future"
+ CodeBodyOversize = "body_oversize"
+ CodeInvalidBody = "invalid_body"
+ CodeInvalidJSON = "invalid_json"
+ CodeUnsupportedMethod = "unsupported_method"
+ CodeDurabilityUnavailable = "durability_unavailable"
+ CodeInternalError = "internal_error"
+)
+
+type ValidationError struct {
+ Code string
+ Field string
+ Detail string
+}
+
+func (e *ValidationError) Error() string {
+ if e.Field == "" {
+ return e.Detail
+ }
+ return e.Field + ": " + e.Detail
+}
+
+func Validate(s *Signal, now time.Time, futureSkew time.Duration) error {
+ if s == nil {
+ return invalidField("signal", "signal is required")
+ }
+ if strings.TrimSpace(string(s.Type)) == "" {
+ return invalidField("type", "type is required")
+ }
+ if !s.Type.Valid() {
+ return &ValidationError{Code: CodeUnknownType, Field: "type", Detail: fmt.Sprintf("unknown type %q", s.Type)}
+ }
+ if strings.TrimSpace(s.Source) == "" {
+ return invalidField("source", "source is required")
+ }
+ if strings.TrimSpace(s.Service) == "" {
+ return invalidField("service", "service is required")
+ }
+ if strings.TrimSpace(s.Env) == "" {
+ return invalidField("env", "env is required")
+ }
+ if strings.TrimSpace(string(s.Severity)) == "" {
+ return invalidField("severity", "severity is required")
+ }
+ if !s.Severity.Valid() {
+ return &ValidationError{Code: CodeUnknownSeverity, Field: "severity", Detail: fmt.Sprintf("unknown severity %q", s.Severity)}
+ }
+ if strings.TrimSpace(s.Reason) == "" {
+ return invalidField("reason", "reason is required")
+ }
+ if s.Timestamp.IsZero() {
+ return invalidField("timestamp", "timestamp is required")
+ }
+ if futureSkew > 0 && s.Timestamp.After(now.UTC().Add(futureSkew)) {
+ return &ValidationError{Code: CodeTimestampTooFarInFuture, Field: "timestamp", Detail: "timestamp is too far in the future"}
+ }
+ return nil
+}
+
+func invalidField(field, detail string) *ValidationError {
+ return &ValidationError{Code: CodeInvalidField, Field: field, Detail: detail}
+}
diff --git a/internal/signals/validate_test.go b/internal/signals/validate_test.go
new file mode 100644
index 0000000..580c5e8
--- /dev/null
+++ b/internal/signals/validate_test.go
@@ -0,0 +1,53 @@
+package signals
+
+import (
+ "errors"
+ "testing"
+ "time"
+)
+
+func TestValidate(t *testing.T) {
+ now := time.Date(2026, 5, 2, 18, 0, 0, 0, time.UTC)
+ valid := Signal{
+ Type: TypeDeploy,
+ Source: "github",
+ Service: "checkout",
+ Env: "prod",
+ Severity: SeverityInfo,
+ Reason: "RolloutComplete",
+ Timestamp: now,
+ }
+ tests := []struct {
+ name string
+ edit func(*Signal)
+ code string
+ }{
+ {name: "valid"},
+ {name: "missing service", edit: func(s *Signal) { s.Service = "" }, code: CodeInvalidField},
+ {name: "unknown type", edit: func(s *Signal) { s.Type = "wrong" }, code: CodeUnknownType},
+ {name: "unknown severity", edit: func(s *Signal) { s.Severity = "huge" }, code: CodeUnknownSeverity},
+ {name: "future timestamp", edit: func(s *Signal) { s.Timestamp = now.Add(2 * time.Hour) }, code: CodeTimestampTooFarInFuture},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ sig := valid
+ if tt.edit != nil {
+ tt.edit(&sig)
+ }
+ err := Validate(&sig, now, 5*time.Minute)
+ if tt.code == "" {
+ if err != nil {
+ t.Fatal(err)
+ }
+ return
+ }
+ var validation *ValidationError
+ if !errors.As(err, &validation) {
+ t.Fatalf("err=%T %[1]v", err)
+ }
+ if validation.Code != tt.code {
+ t.Fatalf("code=%q want %q", validation.Code, tt.code)
+ }
+ })
+ }
+}
From f196c573c1cdce6e524e694a63436a0b556917d6 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Tue, 5 May 2026 02:45:51 -0400
Subject: [PATCH 02/14] feat: Implemented the incident engine over the
schema-2.0 read path.
Adds internal/incidents with the incident domain model, stable incident IDs,
fixed-rule classification, evidence normalization, next-check templates,
snapshot rendering, HTTP handlers, in-memory test store, and engine lifecycle.
The engine derives incidents from v2 error-family spikes, enriches them with
signals and deployment context, persists stable samples, and transitions
active -> recovering -> resolved.
Adds SQLite incident persistence via coldstore migration 004_incidents.sql and
IncidentStore with upsert, get, active listing, and resolved pruning support.
Wires cmd/ingest so incidents start only when SQLITE_PATH is set,
WAYLOG_V2_READS=true, and WAYLOG_INCIDENTS_ENABLED=true. Bootstrap failure is
fatal under those conditions. The legacy detector continues as fallback when
incidents are unavailable or disabled, and is disabled only when the new engine
is running. /v1/insight now projects the top active incident when the v2.1
engine is active.
Adds read-auth incident routes:
- GET /v1/incidents/active
- GET /v1/incidents/{id}
- GET /v1/incidents/{id}/snapshot
Adds incident Prometheus metrics and updates OpenAPI/env docs for the new
incident surface and configuration.
Verification:
- go test ./internal/incidents ./internal/coldstore ./internal/ingest/v2 ./internal/ingest ./cmd/ingest
- go test ./...
- go test -race ./internal/incidents ./internal/coldstore
- go vet ./...
- bash scripts/check-doc-links.sh
- git diff --check
---
cmd/ingest/main.go | 161 +++++-
docs/env.md | 8 +
docs/openapi.yaml | 259 +++++++++
internal/coldstore/incident_store.go | 236 +++++++++
internal/coldstore/incident_store_test.go | 85 +++
.../coldstore/migrations/004_incidents.sql | 34 ++
internal/incidents/classifier.go | 269 ++++++++++
internal/incidents/classifier_test.go | 66 +++
internal/incidents/engine.go | 497 ++++++++++++++++++
internal/incidents/engine_test.go | 105 ++++
internal/incidents/handler.go | 92 ++++
internal/incidents/handler_test.go | 61 +++
internal/incidents/id.go | 19 +
internal/incidents/id_test.go | 25 +
internal/incidents/interfaces.go | 36 ++
internal/incidents/nextchecks.go | 30 ++
internal/incidents/render.go | 48 ++
internal/incidents/store.go | 104 ++++
internal/incidents/test_helpers_test.go | 60 +++
internal/incidents/types.go | 103 ++++
internal/metrics/metrics.go | 45 ++
21 files changed, 2341 insertions(+), 2 deletions(-)
create mode 100644 internal/coldstore/incident_store.go
create mode 100644 internal/coldstore/incident_store_test.go
create mode 100644 internal/coldstore/migrations/004_incidents.sql
create mode 100644 internal/incidents/classifier.go
create mode 100644 internal/incidents/classifier_test.go
create mode 100644 internal/incidents/engine.go
create mode 100644 internal/incidents/engine_test.go
create mode 100644 internal/incidents/handler.go
create mode 100644 internal/incidents/handler_test.go
create mode 100644 internal/incidents/id.go
create mode 100644 internal/incidents/id_test.go
create mode 100644 internal/incidents/interfaces.go
create mode 100644 internal/incidents/nextchecks.go
create mode 100644 internal/incidents/render.go
create mode 100644 internal/incidents/store.go
create mode 100644 internal/incidents/test_helpers_test.go
create mode 100644 internal/incidents/types.go
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 599993e..650f58d 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -26,6 +26,7 @@ import (
"github.com/sssmaran/WaylogCLI/internal/graph/causal"
"github.com/sssmaran/WaylogCLI/internal/graph/core"
graphstore "github.com/sssmaran/WaylogCLI/internal/graph/store"
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
"github.com/sssmaran/WaylogCLI/internal/ingest"
ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
"github.com/sssmaran/WaylogCLI/internal/mcp/stdio"
@@ -35,6 +36,8 @@ import (
"github.com/sssmaran/WaylogCLI/internal/signals"
"github.com/sssmaran/WaylogCLI/internal/tools"
"github.com/sssmaran/WaylogCLI/internal/tracestore"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
)
var graphStore *graphstore.Store
@@ -126,6 +129,16 @@ func main() {
otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+ incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
+ incidentCfg := incidents.Config{
+ TickInterval: config.GetenvDuration("WAYLOG_INCIDENT_TICK_INTERVAL", 30*time.Second),
+ Window: config.GetenvDuration("WAYLOG_INCIDENT_WINDOW", 10*time.Minute),
+ MinCount: config.GetenvInt("WAYLOG_INCIDENT_MIN_COUNT", 5),
+ MinLift: config.GetenvFloat("WAYLOG_INCIDENT_MIN_LIFT", 3.0),
+ ResolveAfter: config.GetenvDuration("WAYLOG_INCIDENT_RESOLVE_AFTER", 2*time.Minute),
+ DeployCorrelationWindow: config.GetenvDuration("WAYLOG_DEPLOY_CORRELATION_WINDOW", 15*time.Minute),
+ SampleLimit: config.GetenvInt("WAYLOG_INCIDENT_SAMPLE_LIMIT", 5),
+ }
if signalRetention <= 0 {
slog.Error("WAYLOG_SIGNAL_RETENTION must be positive", "value", signalRetention)
os.Exit(1)
@@ -380,8 +393,11 @@ func main() {
func(w http.ResponseWriter, r *http.Request) { inner.ServeHTTP(w, r) }))
}
mux.Handle("/v1/overview", readCORS(ingestServer.Overview))
+ var v2Reader *ingestv2.Reader
+ var incidentEngine *incidents.Engine
+ incidentRunning := false
if v2ReadsEnabled {
- v2Reader := ingestv2.NewReader(v2Index)
+ v2Reader = ingestv2.NewReader(v2Index)
v2ReadHandler := ingestv2.NewReadHandler(v2Reader, m, graphHotWindow)
mux.Handle("/v1/events/search", readCORS(v2ReadHandler.EventSearch))
mux.Handle("/v1/errors", readCORS(v2ReadHandler.Errors))
@@ -393,6 +409,32 @@ func main() {
mux.Handle("/v1/events/", readCORS(v2ReadHandler.EventByID))
mux.Handle("/v1/traces/", readCORS(v2ReadHandler.TraceByID))
slog.Info("v2 read endpoints enabled")
+ if incidentsEnabled {
+ if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok {
+ incidentStore := coldstore.NewIncidentStore(sqlite)
+ incidentEngine = incidents.NewEngine(
+ incidentReaderAdapter{reader: v2Reader},
+ signalStore,
+ coldDeployAdapter{store: sqlite},
+ incidentStore,
+ incidentCfg,
+ m,
+ slog.Default(),
+ )
+ if err := incidentEngine.Bootstrap(context.Background()); err != nil {
+ slog.Error("incident engine bootstrap failed", "err", err)
+ os.Exit(1)
+ }
+ incidentHandler := incidents.NewHandler(incidentEngine)
+ mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
+ mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
+ ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine})
+ incidentRunning = true
+ slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
+ } else {
+ slog.Info("incident engine disabled: SQLITE_PATH is not set")
+ }
+ }
} else {
mux.Handle("/v1/traces/story", readCORS(ingestServer.TraceStory))
mux.Handle("/v1/blast_radius", readCORS(ingestServer.BlastRadius))
@@ -466,6 +508,9 @@ func main() {
if _, ok := signalStore.(*coldstore.SignalStore); ok {
go signals.RunRetention(ctx, signalStore, signalRetention, 5*time.Minute, m, slog.Default())
}
+ if incidentRunning {
+ go incidentEngine.Run(ctx)
+ }
go func() {
slog.Info("ingest listening", "addr", addr, "graph_hot_window", graphHotWindow)
@@ -609,7 +654,7 @@ func main() {
// ---------------- Anomaly detection ticker ----------------
detectCfg := detect.ParseConfig()
- if detectCfg.Enabled {
+ if detectCfg.Enabled && !incidentRunning {
var deploySrc detect.DeploySource
if coldDB != nil {
deploySrc = coldDB
@@ -617,6 +662,8 @@ func main() {
detector := detect.NewDetector(detectCfg, graphStore, traceStore, deploySrc)
ingestServer.SetDetector(detector)
go detector.Run(ctx)
+ } else if incidentRunning {
+ slog.Info("legacy anomaly detector disabled because v2.1 incident engine is running")
}
// ---------------- Causal inference ticker ----------------
@@ -794,6 +841,116 @@ func printHelp() {
os.Stdout.WriteString("\n\033[2mnotes: MCP stdio: run with MCP_STDIO=1\033[0m\n")
}
+type coldDeployAdapter struct {
+ store *coldstore.SQLiteStore
+}
+
+func (a coldDeployAdapter) DeploymentsInWindow(ctx context.Context, start, end time.Time, serviceFilter string) ([]incidents.Deployment, error) {
+ rows, err := a.store.DeploymentsInWindow(ctx, start, end, serviceFilter)
+ if err != nil {
+ return nil, err
+ }
+ out := make([]incidents.Deployment, 0, len(rows))
+ for _, row := range rows {
+ out = append(out, incidents.Deployment{
+ ID: row.ID,
+ Service: row.Service,
+ Version: row.Version,
+ Env: row.Env,
+ FirstSeen: row.FirstSeen,
+ LastSeen: row.LastSeen,
+ Metadata: row.Metadata,
+ })
+ }
+ return out, nil
+}
+
+type incidentReaderAdapter struct {
+ reader *ingestv2.Reader
+}
+
+func (a incidentReaderAdapter) Errors(f incidents.SearchFilter, limit int) incidents.ErrorsResult {
+ res := a.reader.Errors(toV2SearchFilter(f), nil, limit)
+ return incidents.ErrorsResult{Rows: res.Rows}
+}
+
+func (a incidentReaderAdapter) BlastRadius(f incidents.SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse {
+ return a.reader.BlastRadius(toV2SearchFilter(f), ingestv2.BlastKeyMode{Key: key})
+}
+
+func (a incidentReaderAdapter) SearchEvents(f incidents.SearchFilter, limit int) []*eventv2.Event {
+ res := a.reader.SearchEvents(toV2SearchFilter(f), nil, limit)
+ return res.Events
+}
+
+func toV2SearchFilter(f incidents.SearchFilter) ingestv2.SearchFilter {
+ return ingestv2.SearchFilter{
+ Service: f.Service,
+ Statuses: f.Statuses,
+ ErrorCode: f.ErrorCode,
+ Since: f.Since,
+ Until: f.Until,
+ }
+}
+
+type incidentInsightAdapter struct {
+ engine *incidents.Engine
+}
+
+func (a incidentInsightAdapter) Current() *detect.Insight {
+ if a.engine == nil {
+ return nil
+ }
+ inc, err := a.engine.TopActive(context.Background())
+ if err != nil || inc == nil {
+ return nil
+ }
+ return projectIncidentInsight(*inc, time.Now().UTC())
+}
+
+func projectIncidentInsight(inc incidents.Incident, detectedAt time.Time) *detect.Insight {
+ affectedUsers := 0
+ if inc.AffectedUsers != nil {
+ affectedUsers = *inc.AffectedUsers
+ }
+ out := &detect.Insight{
+ DetectedAt: detectedAt,
+ TopErrorCode: inc.ErrorFamily.ErrorCode,
+ Lift: inc.Lift,
+ CurrentCount: inc.CurrentCount,
+ BaselineCount: inc.BaselineCount,
+ AffectedRequests: inc.AffectedRequests,
+ AffectedUsers: affectedUsers,
+ Services: append([]string(nil), inc.TopServices...),
+ SeverityScore: float64(inc.Severity),
+ }
+ if len(out.Services) == 0 {
+ out.Services = []string{inc.Service}
+ }
+ for _, ev := range inc.Evidence {
+ if ev.Kind == incidents.EvidenceDeployment && ev.DeployID != "" {
+ out.DeployCorrelation = &detect.DeployCorrelation{
+ DeploymentID: ev.DeployID,
+ Service: ev.Service,
+ Confidence: incidentConfidenceFloat(inc.Confidence),
+ }
+ break
+ }
+ }
+ return out
+}
+
+func incidentConfidenceFloat(c incidents.Confidence) float64 {
+ switch c {
+ case incidents.ConfidenceHigh:
+ return 0.9
+ case incidents.ConfidenceMedium:
+ return 0.65
+ default:
+ return 0.35
+ }
+}
+
func parseSlogLevel(s string) slog.Level {
switch strings.ToLower(s) {
case "debug":
diff --git a/docs/env.md b/docs/env.md
index 7ace4cb..db9afac 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -57,6 +57,14 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
| `EVENT_LOG_MAX_FILE_MB` | `50` | Rotation size. `0` disables rotation |
| `EVENT_LOG_RETENTION` | `72h` | Event log retention. Must be positive |
| `WAYLOG_SIGNAL_RETENTION` | `72h` | Production-context signal retention. Must be positive. `/v1/signals` requires `SQLITE_PATH` |
+| `WAYLOG_INCIDENTS_ENABLED` | `true` | Enable the v2.1 incident engine when `SQLITE_PATH` is set and `WAYLOG_V2_READS=true` |
+| `WAYLOG_INCIDENT_TICK_INTERVAL` | `30s` | Incident engine evaluation interval |
+| `WAYLOG_INCIDENT_WINDOW` | `10m` | Current error-family spike window |
+| `WAYLOG_INCIDENT_MIN_COUNT` | `5` | Minimum current-window failures needed to open an incident |
+| `WAYLOG_INCIDENT_MIN_LIFT` | `3.0` | Minimum current-vs-baseline lift when the family already exists in the baseline window |
+| `WAYLOG_INCIDENT_RESOLVE_AFTER` | `2m` | Time without renewed matching failures before a recovering incident resolves |
+| `WAYLOG_DEPLOY_CORRELATION_WINDOW` | `15m` | Window used to attach deploy signals and deployment records as incident evidence |
+| `WAYLOG_INCIDENT_SAMPLE_LIMIT` | `5` | Maximum persisted sample traces per incident |
| `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
| `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
| `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 151392b..ce503dc 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -435,6 +435,79 @@ paths:
'400':
$ref: '#/components/responses/ReadBadRequest'
+ /v1/incidents/active:
+ get:
+ tags: [Triage]
+ operationId: listActiveIncidents
+ summary: Active v2.1 incidents
+ description: Returns active and recovering incidents derived from v2 error-family spikes, signals, and deployment context.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ responses:
+ '200':
+ description: Active incidents
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/IncidentListResponse'
+ '401':
+ description: Unauthorized
+ '405':
+ description: Method Not Allowed
+
+ /v1/incidents/{id}:
+ get:
+ tags: [Triage]
+ operationId: getIncident
+ summary: Get one incident
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ parameters:
+ - $ref: '#/components/parameters/IncidentID'
+ responses:
+ '200':
+ description: Incident detail
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/IncidentDetailResponse'
+ '401':
+ description: Unauthorized
+ '404':
+ $ref: '#/components/responses/ReadNotFound'
+ '405':
+ description: Method Not Allowed
+
+ /v1/incidents/{id}/snapshot:
+ get:
+ tags: [Triage]
+ operationId: getIncidentSnapshot
+ summary: Render an incident snapshot
+ description: 'Defaults to text/plain. Send Accept: application/json to receive the snapshot text plus the incident object.'
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ parameters:
+ - $ref: '#/components/parameters/IncidentID'
+ responses:
+ '200':
+ description: Incident snapshot
+ content:
+ text/plain:
+ schema:
+ type: string
+ application/json:
+ schema:
+ $ref: '#/components/schemas/IncidentSnapshotResponse'
+ '401':
+ description: Unauthorized
+ '404':
+ $ref: '#/components/responses/ReadNotFound'
+ '405':
+ description: Method Not Allowed
+
/v1/capabilities:
get:
tags: [Capabilities]
@@ -710,6 +783,12 @@ components:
required: true
schema:
type: string
+ IncidentID:
+ name: id
+ in: path
+ required: true
+ schema:
+ type: string
TraceIDQuery:
name: trace_id
in: query
@@ -1430,6 +1509,186 @@ components:
items:
type: string
+ IncidentEvidence:
+ type: object
+ required: [kind, title, occurred_at]
+ properties:
+ kind:
+ type: string
+ enum: [signal, deployment, trace, metric]
+ title:
+ type: string
+ detail:
+ type: string
+ service:
+ type: string
+ signal_id:
+ type: string
+ deployment_id:
+ type: string
+ trace_id:
+ type: string
+ occurred_at:
+ type: string
+ format: date-time
+ fields:
+ type: object
+ additionalProperties: true
+
+ Incident:
+ type: object
+ required:
+ - incident_id
+ - env
+ - service
+ - error_family
+ - status
+ - cause
+ - confidence
+ - severity
+ - started_at
+ - updated_at
+ - last_seen_at
+ - affected_requests
+ - affected_services
+ - top_services
+ - sample_traces
+ - evidence
+ - next_checks
+ - lift
+ - baseline_count
+ - current_count
+ example:
+ incident_id: inc_7d0b0b3d5a52d891
+ env: prod
+ service: checkout
+ error_family:
+ service: checkout
+ step: payment.charge
+ error_code: PMT_502
+ status: active
+ cause: dependency
+ confidence: medium
+ severity: 8
+ started_at: '2026-05-04T16:00:00Z'
+ updated_at: '2026-05-04T16:02:00Z'
+ last_seen_at: '2026-05-04T16:02:00Z'
+ affected_requests: 12
+ affected_users: 8
+ affected_services: 3
+ top_services: [api-gateway, checkout, payment]
+ sample_traces: [7f3a2b9c000000000000000000000001]
+ evidence:
+ - kind: trace
+ title: First failing trace sample
+ detail: payment.charge/PMT_502
+ service: checkout
+ trace_id: 7f3a2b9c000000000000000000000001
+ occurred_at: '2026-05-04T16:00:00Z'
+ next_checks:
+ - Check the downstream service health and recent deploys.
+ lift: 6
+ baseline_count: 2
+ current_count: 12
+ properties:
+ incident_id:
+ type: string
+ env:
+ type: string
+ service:
+ type: string
+ error_family:
+ $ref: '#/components/schemas/ErrorFamily'
+ status:
+ type: string
+ enum: [active, recovering, resolved]
+ cause:
+ type: string
+ enum: [deploy, app, dependency, unknown]
+ confidence:
+ type: string
+ enum: [high, medium, low]
+ severity:
+ type: integer
+ minimum: 1
+ maximum: 10
+ started_at:
+ type: string
+ format: date-time
+ updated_at:
+ type: string
+ format: date-time
+ last_seen_at:
+ type: string
+ format: date-time
+ recovering_at:
+ type: string
+ format: date-time
+ nullable: true
+ resolved_at:
+ type: string
+ format: date-time
+ nullable: true
+ affected_requests:
+ type: integer
+ affected_users:
+ type: integer
+ nullable: true
+ affected_services:
+ type: integer
+ top_services:
+ type: array
+ items:
+ type: string
+ sample_traces:
+ type: array
+ items:
+ type: string
+ evidence:
+ type: array
+ items:
+ $ref: '#/components/schemas/IncidentEvidence'
+ next_checks:
+ type: array
+ items:
+ type: string
+ instrumentation_warnings:
+ type: array
+ items:
+ type: string
+ lift:
+ type: number
+ format: double
+ baseline_count:
+ type: integer
+ current_count:
+ type: integer
+
+ IncidentListResponse:
+ type: object
+ required: [incidents]
+ properties:
+ incidents:
+ type: array
+ items:
+ $ref: '#/components/schemas/Incident'
+
+ IncidentDetailResponse:
+ type: object
+ required: [incident]
+ properties:
+ incident:
+ $ref: '#/components/schemas/Incident'
+
+ IncidentSnapshotResponse:
+ type: object
+ required: [snapshot, incident]
+ properties:
+ snapshot:
+ type: string
+ incident:
+ $ref: '#/components/schemas/Incident'
+
CapabilitiesResponse:
type: object
example:
diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go
new file mode 100644
index 0000000..b6a3160
--- /dev/null
+++ b/internal/coldstore/incident_store.go
@@ -0,0 +1,236 @@
+package coldstore
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+)
+
+// IncidentStore persists incidents through the reader and writer handles of a
+// SQLiteStore.
+type IncidentStore struct {
+	db *SQLiteStore
+}
+
+// NewIncidentStore returns an IncidentStore that reads and writes through db.
+func NewIncidentStore(db *SQLiteStore) *IncidentStore {
+	store := new(IncidentStore)
+	store.db = db
+	return store
+}
+
+// Upsert inserts inc, or, when a row with the same incident_id already exists,
+// overwrites that row's mutable columns with the values from inc.
+//
+// The identity columns (env, service, error_service, error_step, error_code)
+// and started_at are only written on insert; they are not part of the
+// ON CONFLICT update list.
+func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) error {
+	// List-valued fields are stored as JSON text; jsonText produces SQL NULL
+	// when the value marshals to JSON null (for example a nil slice).
+	topServices, err := jsonText(inc.TopServices)
+	if err != nil {
+		return fmt.Errorf("coldstore incident top services: %w", err)
+	}
+	samples, err := jsonText(inc.SampleTraces)
+	if err != nil {
+		return fmt.Errorf("coldstore incident samples: %w", err)
+	}
+	evidence, err := jsonText(inc.Evidence)
+	if err != nil {
+		return fmt.Errorf("coldstore incident evidence: %w", err)
+	}
+	nextChecks, err := jsonText(inc.NextChecks)
+	if err != nil {
+		return fmt.Errorf("coldstore incident next checks: %w", err)
+	}
+	warnings, err := jsonText(inc.InstrumentationWarnings)
+	if err != nil {
+		return fmt.Errorf("coldstore incident warnings: %w", err)
+	}
+	// The 26 placeholders below must stay in the same order as the column list.
+	_, err = s.db.writer.ExecContext(ctx, `
+ INSERT INTO incidents (
+ incident_id, env, service, error_service, error_step, error_code,
+ status, cause, confidence, severity, started_at, updated_at, last_seen_at,
+ recovering_at, resolved_at, affected_requests, affected_users, affected_services,
+ top_services, sample_traces, evidence, next_checks, instrumentation_warnings,
+ lift, baseline_count, current_count
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ ON CONFLICT(incident_id) DO UPDATE SET
+ status = excluded.status,
+ cause = excluded.cause,
+ confidence = excluded.confidence,
+ severity = excluded.severity,
+ updated_at = excluded.updated_at,
+ last_seen_at = excluded.last_seen_at,
+ recovering_at = excluded.recovering_at,
+ resolved_at = excluded.resolved_at,
+ affected_requests = excluded.affected_requests,
+ affected_users = excluded.affected_users,
+ affected_services = excluded.affected_services,
+ top_services = excluded.top_services,
+ sample_traces = excluded.sample_traces,
+ evidence = excluded.evidence,
+ next_checks = excluded.next_checks,
+ instrumentation_warnings = excluded.instrumentation_warnings,
+ lift = excluded.lift,
+ baseline_count = excluded.baseline_count,
+ current_count = excluded.current_count`,
+		inc.IncidentID, inc.Env, inc.Service, inc.ErrorFamily.Service, inc.ErrorFamily.Step, inc.ErrorFamily.ErrorCode,
+		string(inc.Status), string(inc.Cause), string(inc.Confidence), inc.Severity,
+		formatTime(inc.StartedAt), formatTime(inc.UpdatedAt), formatTime(inc.LastSeenAt),
+		nullableTime(inc.RecoveringAt), nullableTime(inc.ResolvedAt),
+		inc.AffectedRequests, nullableInt(inc.AffectedUsers), inc.AffectedServices,
+		topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount,
+	)
+	if err != nil {
+		return fmt.Errorf("coldstore upsert incident: %w", err)
+	}
+	return nil
+}
+
+// Get loads the incident with the given id.
+//
+// It returns incidents.ErrNotFound when no row matches. Any other scan or
+// decode failure is wrapped with context, and the returned Incident is the
+// zero value whenever the error is non-nil.
+func (s *IncidentStore) Get(ctx context.Context, id string) (incidents.Incident, error) {
+	row := s.db.reader.QueryRowContext(ctx, incidentSelectSQL()+` WHERE incident_id = ?`, id)
+	inc, err := scanIncident(row)
+	switch {
+	case err == nil:
+		return inc, nil
+	case errors.Is(err, sql.ErrNoRows):
+		return incidents.Incident{}, incidents.ErrNotFound
+	default:
+		return incidents.Incident{}, fmt.Errorf("coldstore get incident: %w", err)
+	}
+}
+
+// ListActive returns every incident whose status is not resolved, ordered by
+// severity (highest first), then started_at (newest first), then incident_id.
+//
+// Query, scan and iteration errors are all wrapped with the same
+// "coldstore list active incidents" context so callers can tell which store
+// operation failed.
+func (s *IncidentStore) ListActive(ctx context.Context) ([]incidents.Incident, error) {
+	rows, err := s.db.reader.QueryContext(ctx, incidentSelectSQL()+` WHERE status != ? ORDER BY severity DESC, started_at DESC, incident_id ASC`, string(incidents.StatusResolved))
+	if err != nil {
+		return nil, fmt.Errorf("coldstore list active incidents: %w", err)
+	}
+	defer rows.Close()
+	var out []incidents.Incident
+	for rows.Next() {
+		inc, err := scanIncident(rows)
+		if err != nil {
+			return nil, fmt.Errorf("coldstore list active incidents: %w", err)
+		}
+		out = append(out, inc)
+	}
+	// rows.Err reports a failure that ended iteration early.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("coldstore list active incidents: %w", err)
+	}
+	return out, nil
+}
+
+// PruneResolvedOlderThan deletes resolved incidents whose resolved_at is set
+// and earlier than cutoff, and returns the number of rows removed.
+func (s *IncidentStore) PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error) {
+	const query = `DELETE FROM incidents WHERE status = ? AND resolved_at IS NOT NULL AND resolved_at < ?`
+	result, execErr := s.db.writer.ExecContext(ctx, query, string(incidents.StatusResolved), formatTime(cutoff))
+	if execErr != nil {
+		return 0, fmt.Errorf("coldstore prune incidents: %w", execErr)
+	}
+	deleted, countErr := result.RowsAffected()
+	if countErr != nil {
+		return 0, fmt.Errorf("coldstore prune incidents rows affected: %w", countErr)
+	}
+	return int(deleted), nil
+}
+
+// incidentSelectSQL returns the SELECT ... FROM incidents prefix shared by the
+// read queries. The column order must match the destinations in scanIncident;
+// nullable text columns are coalesced to "" so they scan into plain strings.
+func incidentSelectSQL() string {
+	const selectIncidents = `SELECT incident_id, env, service, error_service, error_step, error_code,
+ status, cause, confidence, severity, started_at, updated_at, last_seen_at,
+ COALESCE(recovering_at, ''), COALESCE(resolved_at, ''),
+ affected_requests, affected_users, affected_services,
+ COALESCE(top_services, ''), COALESCE(sample_traces, ''), COALESCE(evidence, ''),
+ COALESCE(next_checks, ''), COALESCE(instrumentation_warnings, ''),
+ lift, baseline_count, current_count
+ FROM incidents`
+	return selectIncidents
+}
+
+// scanIncident decodes one row produced by incidentSelectSQL into an Incident.
+//
+// row may be anything with a Scan method (both a single-row and a multi-row
+// result are passed in by this file). The Scan error, including
+// sql.ErrNoRows, is returned unwrapped so callers can match it; timestamp and
+// JSON decode failures are wrapped with the offending column. On any error
+// the returned Incident is the zero value.
+func scanIncident(row interface{ Scan(dest ...any) error }) (incidents.Incident, error) {
+	var inc incidents.Incident
+	var status, cause, confidence string
+	var startedAt, updatedAt, lastSeenAt, recoveringAt, resolvedAt string
+	var affectedUsers sql.NullInt64
+	var topServices, samples, evidence, nextChecks, warnings string
+	// Destination order mirrors the column order in incidentSelectSQL.
+	err := row.Scan(
+		&inc.IncidentID, &inc.Env, &inc.Service, &inc.ErrorFamily.Service, &inc.ErrorFamily.Step, &inc.ErrorFamily.ErrorCode,
+		&status, &cause, &confidence, &inc.Severity, &startedAt, &updatedAt, &lastSeenAt,
+		&recoveringAt, &resolvedAt, &inc.AffectedRequests, &affectedUsers, &inc.AffectedServices,
+		&topServices, &samples, &evidence, &nextChecks, &warnings, &inc.Lift, &inc.BaselineCount, &inc.CurrentCount,
+	)
+	if err != nil {
+		return incidents.Incident{}, err
+	}
+	inc.Status = incidents.Status(status)
+	inc.Cause = incidents.Cause(cause)
+	inc.Confidence = incidents.Confidence(confidence)
+	var parseErr error
+	if inc.StartedAt, parseErr = time.Parse(tsFormat, startedAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident started_at: %w", parseErr)
+	}
+	if inc.UpdatedAt, parseErr = time.Parse(tsFormat, updatedAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident updated_at: %w", parseErr)
+	}
+	if inc.LastSeenAt, parseErr = time.Parse(tsFormat, lastSeenAt); parseErr != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident last_seen_at: %w", parseErr)
+	}
+	// An empty string means the column was NULL (see COALESCE in the SELECT).
+	if recoveringAt != "" {
+		t, err := time.Parse(tsFormat, recoveringAt)
+		if err != nil {
+			return incidents.Incident{}, fmt.Errorf("coldstore incident recovering_at: %w", err)
+		}
+		inc.RecoveringAt = &t
+	}
+	if resolvedAt != "" {
+		t, err := time.Parse(tsFormat, resolvedAt)
+		if err != nil {
+			return incidents.Incident{}, fmt.Errorf("coldstore incident resolved_at: %w", err)
+		}
+		inc.ResolvedAt = &t
+	}
+	// affected_users stays nil when the column is NULL.
+	if affectedUsers.Valid {
+		v := int(affectedUsers.Int64)
+		inc.AffectedUsers = &v
+	}
+	// JSON columns: parseJSONText leaves the destination untouched for "".
+	if err := parseJSONText(topServices, &inc.TopServices); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident top services: %w", err)
+	}
+	if err := parseJSONText(samples, &inc.SampleTraces); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident samples: %w", err)
+	}
+	if err := parseJSONText(evidence, &inc.Evidence); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident evidence: %w", err)
+	}
+	if err := parseJSONText(nextChecks, &inc.NextChecks); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident next checks: %w", err)
+	}
+	if err := parseJSONText(warnings, &inc.InstrumentationWarnings); err != nil {
+		return incidents.Incident{}, fmt.Errorf("coldstore incident warnings: %w", err)
+	}
+	return inc, nil
+}
+
+// jsonText marshals v to JSON for storage in a nullable TEXT column. A value
+// that encodes to the JSON literal null (such as a nil slice) becomes SQL NULL.
+func jsonText(v any) (sql.NullString, error) {
+	encoded, err := json.Marshal(v)
+	switch {
+	case err != nil:
+		return sql.NullString{}, err
+	case string(encoded) == "null":
+		return sql.NullString{}, nil
+	}
+	return sql.NullString{String: string(encoded), Valid: true}, nil
+}
+
+// parseJSONText unmarshals raw into out. An empty raw string is treated as
+// "no value": out is left untouched and nil is returned.
+func parseJSONText(raw string, out any) error {
+	if len(raw) > 0 {
+		return json.Unmarshal([]byte(raw), out)
+	}
+	return nil
+}
+
+// formatTime renders t in UTC using the package's tsFormat layout.
+func formatTime(t time.Time) string {
+	utc := t.UTC()
+	return utc.Format(tsFormat)
+}
+
+// nullableTime converts an optional timestamp to a nullable string column
+// value: nil becomes SQL NULL, anything else is formatted with formatTime.
+func nullableTime(t *time.Time) sql.NullString {
+	var out sql.NullString
+	if t != nil {
+		out.String, out.Valid = formatTime(*t), true
+	}
+	return out
+}
+
+// nullableInt converts an optional int to a nullable integer column value;
+// nil becomes SQL NULL.
+func nullableInt(v *int) sql.NullInt64 {
+	var out sql.NullInt64
+	if v != nil {
+		out.Int64, out.Valid = int64(*v), true
+	}
+	return out
+}
+
+var _ incidents.Store = (*IncidentStore)(nil)
diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go
new file mode 100644
index 0000000..10cdfe8
--- /dev/null
+++ b/internal/coldstore/incident_store_test.go
@@ -0,0 +1,85 @@
+package coldstore
+
+import (
+ "context"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// TestIncidentStoreRoundtripAndPrune exercises the full store lifecycle against
+// an in-memory database: insert, read back, list as active, resolve via a
+// second upsert, confirm it leaves the active list, prune, and confirm Get
+// reports ErrNotFound.
+func TestIncidentStoreRoundtripAndPrune(t *testing.T) {
+	managed, err := Open(":memory:")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer managed.Close()
+	store := NewIncidentStore(managed.(*SQLiteStore))
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	users := 3
+	inc := incidents.Incident{
+		IncidentID:       incidents.StableID("prod", apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}, now),
+		Env:              "prod",
+		Service:          "checkout",
+		ErrorFamily:      apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+		Status:           incidents.StatusActive,
+		Cause:            incidents.CauseDependency,
+		Confidence:       incidents.ConfidenceHigh,
+		Severity:         8,
+		StartedAt:        now,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: 9,
+		AffectedUsers:    &users,
+		AffectedServices: 2,
+		TopServices:      []string{"checkout", "payment"},
+		SampleTraces:     []string{"trace-a", "trace-b"},
+		Evidence:         []incidents.Evidence{{Kind: incidents.EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: now}},
+		NextChecks:       []string{"check downstream"},
+		Lift:             9,
+		CurrentCount:     9,
+	}
+	// Insert and read back: nullable int and JSON list columns must round-trip.
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	got, err := store.Get(context.Background(), inc.IncidentID)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got.IncidentID != inc.IncidentID || got.AffectedUsers == nil || *got.AffectedUsers != users || len(got.SampleTraces) != 2 {
+		t.Fatalf("roundtrip=%+v", got)
+	}
+	active, err := store.ListActive(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 1 {
+		t.Fatalf("active=%+v", active)
+	}
+	// Resolve through a second upsert of the same ID; it must drop out of ListActive.
+	resolvedAt := now.Add(time.Minute)
+	inc.Status = incidents.StatusResolved
+	inc.ResolvedAt = &resolvedAt
+	if err := store.Upsert(context.Background(), inc); err != nil {
+		t.Fatal(err)
+	}
+	active, err = store.ListActive(context.Background())
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 0 {
+		t.Fatalf("active after resolve=%+v", active)
+	}
+	// A cutoff one second after resolved_at must prune exactly this row.
+	deleted, err := store.PruneResolvedOlderThan(context.Background(), resolvedAt.Add(time.Second))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if deleted != 1 {
+		t.Fatalf("deleted=%d", deleted)
+	}
+	_, err = store.Get(context.Background(), inc.IncidentID)
+	if !errors.Is(err, incidents.ErrNotFound) {
+		t.Fatalf("expected not found, got %v", err)
+	}
+}
diff --git a/internal/coldstore/migrations/004_incidents.sql b/internal/coldstore/migrations/004_incidents.sql
new file mode 100644
index 0000000..076a66d
--- /dev/null
+++ b/internal/coldstore/migrations/004_incidents.sql
@@ -0,0 +1,34 @@
+-- 004_incidents.sql: v2.1 incident engine persistence.
+--
+-- One row per incident, keyed by incident_id. Timestamps are stored as TEXT and
+-- list-valued fields (top_services, sample_traces, evidence, next_checks,
+-- instrumentation_warnings) as JSON text; NULL means "no value".
+
+CREATE TABLE IF NOT EXISTS incidents (
+ incident_id TEXT PRIMARY KEY,
+ env TEXT NOT NULL,
+ service TEXT NOT NULL,
+ error_service TEXT NOT NULL,
+ error_step TEXT NOT NULL,
+ error_code TEXT NOT NULL,
+ status TEXT NOT NULL,
+ cause TEXT NOT NULL,
+ confidence TEXT NOT NULL,
+ severity INTEGER NOT NULL,
+ started_at TEXT NOT NULL,
+ updated_at TEXT NOT NULL,
+ last_seen_at TEXT NOT NULL,
+ recovering_at TEXT,
+ resolved_at TEXT,
+ affected_requests INTEGER NOT NULL,
+ affected_users INTEGER,
+ affected_services INTEGER NOT NULL,
+ top_services TEXT,
+ sample_traces TEXT,
+ evidence TEXT,
+ next_checks TEXT,
+ instrumentation_warnings TEXT,
+ lift REAL NOT NULL DEFAULT 0,
+ baseline_count INTEGER NOT NULL DEFAULT 0,
+ current_count INTEGER NOT NULL DEFAULT 0
+);
+
+-- Status-filtered listings (the active-incident query filters on status).
+CREATE INDEX IF NOT EXISTS idx_incidents_status_started ON incidents (status, started_at DESC);
+-- NOTE(review): presumably for per-family lookups; no query in this migration's
+-- store uses it yet -- confirm it is needed.
+CREATE INDEX IF NOT EXISTS idx_incidents_family_started ON incidents (env, service, error_step, error_code, started_at DESC);
+-- Retention pruning deletes resolved rows by resolved_at cutoff.
+CREATE INDEX IF NOT EXISTS idx_incidents_resolved_at ON incidents (resolved_at);
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
new file mode 100644
index 0000000..8cde790
--- /dev/null
+++ b/internal/incidents/classifier.go
@@ -0,0 +1,269 @@
+package incidents
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// ClassificationInput bundles everything Classify looks at for one incident.
+type ClassificationInput struct {
+	Incident    Incident         // incident being classified; Env, Service, StartedAt and ErrorFamily.Step are read
+	Events      []*eventv2.Event // sampled failing events; nil entries are tolerated
+	Signals     []signals.Signal // candidate production signals
+	Deployments []Deployment     // candidate deployments
+	Now         time.Time        // evaluation time; not read by Classify in this file
+}
+
+// Classification is the result of Classify: the inferred cause, how confident
+// the classifier is, and the supporting material to show alongside it.
+type Classification struct {
+	Cause                   Cause
+	Confidence              Confidence
+	Evidence                []Evidence // sorted, de-duplicated, capped at 8 entries
+	NextChecks              []string   // derived from Cause and Confidence via NextChecks
+	InstrumentationWarnings []string   // trimmed, de-duplicated, sorted
+}
+
+// Classify infers the most likely cause of an incident. Rules are evaluated in
+// priority order and the first match wins:
+//
+//  1. a dependency signal matching the first failing downstream -> dependency/high
+//  2. the first failing step calls a downstream service         -> dependency/medium
+//  3. a matching deployment                                     -> deploy/high
+//  4. a matching deploy signal                                  -> deploy/high
+//  5. events exist and the error family has a step              -> app/medium
+//  6. otherwise                                                 -> unknown/low
+func Classify(input ClassificationInput) Classification {
+	evidence := collectTraceEvidence(input.Events)
+	warnings := instrumentationWarnings(input.Events, input.Signals)
+
+	if sig := matchingDependencySignal(input); sig != nil {
+		evidence = append(evidence, signalEvidence(*sig, "Dependency signal overlaps first failing downstream"))
+		return classification(CauseDependency, ConfidenceHigh, evidence, warnings)
+	}
+
+	downstream := firstFailingDownstream(input.Events)
+	if downstream != "" {
+		traceEvidence := Evidence{
+			Kind:       EvidenceTrace,
+			Title:      "First failing step calls downstream service",
+			Detail:     downstream,
+			Service:    downstream,
+			OccurredAt: input.Incident.StartedAt,
+		}
+		return classification(CauseDependency, ConfidenceMedium, append(evidence, traceEvidence), warnings)
+	}
+
+	if deployment := matchingDeployment(input); deployment != nil {
+		evidence = append(evidence, deploymentEvidence(*deployment))
+		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+	}
+	if deploySig := matchingSignal(input, signals.TypeDeploy); deploySig != nil {
+		evidence = append(evidence, signalEvidence(*deploySig, "Deploy signal overlaps incident window"))
+		return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
+	}
+
+	// No downstream was found above, so failing events with a known step point
+	// at the application itself.
+	cause, confidence := CauseUnknown, ConfidenceLow
+	if len(input.Events) > 0 && input.Incident.ErrorFamily.Step != "" {
+		cause, confidence = CauseApp, ConfidenceMedium
+	}
+	return classification(cause, confidence, evidence, warnings)
+}
+
+// classification assembles a Classification: evidence is normalized and capped
+// at 8 entries, next checks are derived from cause and confidence, and
+// warnings are de-duplicated.
+func classification(cause Cause, confidence Confidence, evidence []Evidence, warnings []string) Classification {
+	var result Classification
+	result.Cause = cause
+	result.Confidence = confidence
+	result.Evidence = normalizeEvidence(evidence, 8)
+	result.NextChecks = NextChecks(cause, confidence)
+	result.InstrumentationWarnings = uniqueStrings(warnings)
+	return result
+}
+
+// matchingDependencySignal returns a pointer to the first dependency-type
+// signal in input.Signals. When a first failing downstream is known, the
+// signal's service must equal it; otherwise any dependency signal matches.
+// It returns nil when nothing matches.
+func matchingDependencySignal(input ClassificationInput) *signals.Signal {
+	downstream := firstFailingDownstream(input.Events)
+	for i := range input.Signals {
+		candidate := &input.Signals[i]
+		isDependency := candidate.Type == signals.TypeDependency
+		serviceMatches := downstream == "" || candidate.Service == downstream
+		if isDependency && serviceMatches {
+			return candidate
+		}
+	}
+	return nil
+}
+
+// matchingDeployment returns a pointer to the first deployment for the
+// incident's service. Env and version only disqualify a deployment when both
+// sides are non-empty and differ. It returns nil when nothing matches.
+func matchingDeployment(input ClassificationInput) *Deployment {
+	version := sampleVersion(input.Events)
+	for i := range input.Deployments {
+		candidate := &input.Deployments[i]
+		envMismatch := candidate.Env != "" && input.Incident.Env != "" && candidate.Env != input.Incident.Env
+		serviceMismatch := candidate.Service != input.Incident.Service
+		versionMismatch := version != "" && candidate.Version != "" && candidate.Version != version
+		if envMismatch || serviceMismatch || versionMismatch {
+			continue
+		}
+		return candidate
+	}
+	return nil
+}
+
+// matchingSignal returns a pointer to the first signal of type typ emitted for
+// the incident's service. When the sampled events carry a version, a signal
+// whose metadata "version" is set must match it; signals without a version
+// always qualify. It returns nil when nothing matches.
+func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal {
+	version := sampleVersion(input.Events)
+	for i := range input.Signals {
+		candidate := &input.Signals[i]
+		if candidate.Type != typ || candidate.Service != input.Incident.Service {
+			continue
+		}
+		if version == "" {
+			return candidate
+		}
+		switch stringField(candidate.Metadata, "version") {
+		case "", version:
+			return candidate
+		}
+	}
+	return nil
+}
+
+// collectTraceEvidence builds a single trace evidence entry from the first
+// event that has an anchor. The result is empty (but non-nil) when no event
+// qualifies.
+func collectTraceEvidence(events []*eventv2.Event) []Evidence {
+	out := make([]Evidence, 0, 2)
+	for _, ev := range events {
+		if ev != nil && ev.Anchor != nil {
+			sample := Evidence{
+				Kind:       EvidenceTrace,
+				Title:      "First failing trace sample",
+				Detail:     fmt.Sprintf("%s/%s", ev.Anchor.Step, ev.Anchor.ErrorCode),
+				Service:    ev.Service,
+				TraceID:    ev.TraceID,
+				OccurredAt: ev.TsStart,
+			}
+			return append(out, sample)
+		}
+	}
+	return out
+}
+
+// deploymentEvidence converts a deployment into an evidence entry dated at the
+// deployment's FirstSeen time.
+func deploymentEvidence(dep Deployment) Evidence {
+	var ev Evidence
+	ev.Kind = EvidenceDeployment
+	ev.Title = "Deployment overlaps incident window"
+	ev.Detail = dep.Version
+	ev.Service = dep.Service
+	ev.DeployID = dep.ID
+	ev.OccurredAt = dep.FirstSeen
+	return ev
+}
+
+// signalEvidence converts a signal into an evidence entry with the given
+// title. The signal's type, severity and source are carried in Fields.
+func signalEvidence(sig signals.Signal, title string) Evidence {
+	fields := map[string]any{
+		"type":     string(sig.Type),
+		"severity": string(sig.Severity),
+		"source":   sig.Source,
+	}
+	var ev Evidence
+	ev.Kind = EvidenceSignal
+	ev.Title = title
+	ev.Detail = sig.Reason
+	ev.Service = sig.Service
+	ev.SignalID = sig.SignalID
+	ev.OccurredAt = sig.Timestamp
+	ev.Fields = fields
+	return ev
+}
+
+// normalizeEvidence orders evidence by time, then kind, then title, drops
+// entries that repeat the same (kind, title, signal, deploy, trace) identity,
+// and stops once limit entries are collected (limit <= 0 means no cap).
+//
+// The input slice is sorted in place.
+func normalizeEvidence(evidence []Evidence, limit int) []Evidence {
+	sort.SliceStable(evidence, func(i, j int) bool {
+		a, b := evidence[i], evidence[j]
+		switch {
+		case !a.OccurredAt.Equal(b.OccurredAt):
+			return a.OccurredAt.Before(b.OccurredAt)
+		case a.Kind != b.Kind:
+			return a.Kind < b.Kind
+		default:
+			return a.Title < b.Title
+		}
+	})
+	seen := map[string]struct{}{}
+	out := make([]Evidence, 0, len(evidence))
+	for _, ev := range evidence {
+		identity := strings.Join([]string{string(ev.Kind), ev.Title, ev.SignalID, ev.DeployID, ev.TraceID}, "|")
+		if _, dup := seen[identity]; dup {
+			continue
+		}
+		seen[identity] = struct{}{}
+		out = append(out, ev)
+		if limit > 0 && len(out) == limit {
+			break
+		}
+	}
+	return out
+}
+
+// instrumentationWarnings reports gaps in the telemetry behind an incident:
+// no event carries a service version, a failing downstream exists without any
+// dependency signal, or at least one event has partial status.
+func instrumentationWarnings(events []*eventv2.Event, sigs []signals.Signal) []string {
+	var warnings []string
+	if sampleVersion(events) == "" {
+		warnings = append(warnings, "missing_service_version")
+	}
+	downstreamFailed := firstFailingDownstream(events) != ""
+	if downstreamFailed && !hasSignalType(sigs, signals.TypeDependency) {
+		warnings = append(warnings, "missing_dependency_signal")
+	}
+	for _, ev := range events {
+		if ev == nil || ev.Status != eventv2.StatusPartial {
+			continue
+		}
+		warnings = append(warnings, "partial_trace")
+		break
+	}
+	return warnings
+}
+
+// firstFailingDownstream returns the downstream service called by the first
+// errored step whose name equals its event's anchor step, or "" when no event
+// has such a step.
+func firstFailingDownstream(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev == nil || ev.Anchor == nil {
+			continue
+		}
+		for _, step := range ev.Steps {
+			if step.Name != ev.Anchor.Step {
+				continue
+			}
+			if step.Status != eventv2.StepStatusError || step.Downstream == nil {
+				continue
+			}
+			return step.Downstream.Service
+		}
+	}
+	return ""
+}
+
+// sampleVersion returns the version of the first event that reports one, or
+// "" when none does.
+func sampleVersion(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev == nil || ev.Version == "" {
+			continue
+		}
+		return ev.Version
+	}
+	return ""
+}
+
+// hasSignalType reports whether any signal in sigs has type typ.
+func hasSignalType(sigs []signals.Signal, typ signals.Type) bool {
+	for i := range sigs {
+		if sigs[i].Type == typ {
+			return true
+		}
+	}
+	return false
+}
+
+// stringField returns m[key] when it holds a string, and "" when the map is
+// nil, the key is missing, or the value has another type.
+func stringField(m map[string]any, key string) string {
+	// Indexing a nil map is safe and yields the zero value, so one checked
+	// type assertion covers every "absent" case.
+	if text, ok := m[key].(string); ok {
+		return text
+	}
+	return ""
+}
+
+// uniqueStrings trims each entry, drops blanks and duplicates, and returns the
+// remainder sorted. An empty input yields nil; an input containing only
+// blanks yields an empty, non-nil slice.
+func uniqueStrings(in []string) []string {
+	if len(in) == 0 {
+		return nil
+	}
+	seen := make(map[string]struct{})
+	out := make([]string, 0, len(in))
+	for _, raw := range in {
+		trimmed := strings.TrimSpace(raw)
+		if trimmed == "" {
+			continue
+		}
+		if _, dup := seen[trimmed]; !dup {
+			seen[trimmed] = struct{}{}
+			out = append(out, trimmed)
+		}
+	}
+	sort.Strings(out)
+	return out
+}
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
new file mode 100644
index 0000000..348787a
--- /dev/null
+++ b/internal/incidents/classifier_test.go
@@ -0,0 +1,66 @@
+package incidents
+
+import (
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// TestClassifierRules checks one representative input per classifier rule and
+// asserts the resulting cause and confidence.
+func TestClassifierRules(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	// Event whose failing step calls the "payment" downstream.
+	paymentEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+	// Downstream failure corroborated by a dependency signal for that service.
+	t.Run("dependency with signal", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{paymentEvent},
+			Signals: []signals.Signal{{
+				SignalID:  "sig_dep",
+				Type:      signals.TypeDependency,
+				Service:   "payment",
+				Env:       "prod",
+				Reason:    "upstream_5xx",
+				Severity:  signals.SeverityCritical,
+				Timestamp: now.Add(-time.Minute),
+			}},
+		})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	// Downstream failure visible in the trace only: confidence drops to medium.
+	t.Run("dependency trace only", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base, Events: []*eventv2.Event{paymentEvent}})
+		if got.Cause != CauseDependency || got.Confidence != ConfidenceMedium {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	// No downstream, but a deployment for the same service and env.
+	t.Run("deploy", func(t *testing.T) {
+		got := Classify(ClassificationInput{
+			Incident:    base,
+			Events:      []*eventv2.Event{testIncidentEvent("e2", "trace-b", now, "checkout", "cart.validate", "CHK_500", "")},
+			Deployments: []Deployment{{ID: "dep_1", Service: "checkout", Version: "v1", Env: "prod", FirstSeen: now.Add(-time.Minute)}},
+		})
+		if got.Cause != CauseDeploy || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	// Failing events with no downstream, deployment, or signal.
+	t.Run("app", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base, Events: []*eventv2.Event{testIncidentEvent("e3", "trace-c", now, "checkout", "cart.validate", "CHK_500", "")}})
+		if got.Cause != CauseApp || got.Confidence != ConfidenceMedium {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	// Nothing to go on at all.
+	t.Run("unknown", func(t *testing.T) {
+		got := Classify(ClassificationInput{Incident: base})
+		if got.Cause != CauseUnknown || got.Confidence != ConfidenceLow {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+}
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
new file mode 100644
index 0000000..6eb6cba
--- /dev/null
+++ b/internal/incidents/engine.go
@@ -0,0 +1,497 @@
+package incidents
+
+import (
+ "context"
+ "errors"
+ "log/slog"
+ "math"
+ "sort"
+ "sync"
+ "time"
+
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/sssmaran/WaylogCLI/internal/metrics"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// Config tunes the incident engine. Non-positive fields are replaced with the
+// DefaultConfig values by withDefaults.
+type Config struct {
+	TickInterval            time.Duration // period between evaluation ticks in Run
+	Window                  time.Duration // length of the current window; the baseline is the window just before it
+	MinCount                int           // error families with fewer occurrences in the window are ignored
+	MinLift                 float64       // minimum current/baseline ratio required when a baseline exists
+	ResolveAfter            time.Duration // a recovering incident resolves once LastSeenAt is at least this old
+	DeployCorrelationWindow time.Duration // look-back used when querying signals and deployments
+	SampleLimit             int           // maximum number of sample trace IDs kept per incident
+}
+
+// DefaultConfig returns the engine's built-in tuning values.
+func DefaultConfig() Config {
+	var cfg Config
+	cfg.TickInterval = 30 * time.Second
+	cfg.Window = 10 * time.Minute
+	cfg.MinCount = 5
+	cfg.MinLift = 3.0
+	cfg.ResolveAfter = 2 * time.Minute
+	cfg.DeployCorrelationWindow = 15 * time.Minute
+	cfg.SampleLimit = 5
+	return cfg
+}
+
+// withDefaults returns a copy of c in which every non-positive field has been
+// replaced by the corresponding DefaultConfig value.
+func (c Config) withDefaults() Config {
+	defaults := DefaultConfig()
+	resolved := c
+	if resolved.TickInterval <= 0 {
+		resolved.TickInterval = defaults.TickInterval
+	}
+	if resolved.Window <= 0 {
+		resolved.Window = defaults.Window
+	}
+	if resolved.MinCount <= 0 {
+		resolved.MinCount = defaults.MinCount
+	}
+	if resolved.MinLift <= 0 {
+		resolved.MinLift = defaults.MinLift
+	}
+	if resolved.ResolveAfter <= 0 {
+		resolved.ResolveAfter = defaults.ResolveAfter
+	}
+	if resolved.DeployCorrelationWindow <= 0 {
+		resolved.DeployCorrelationWindow = defaults.DeployCorrelationWindow
+	}
+	if resolved.SampleLimit <= 0 {
+		resolved.SampleLimit = defaults.SampleLimit
+	}
+	return resolved
+}
+
+// Engine detects incidents from recent error data on a timer, classifies them,
+// persists them through Store, and tracks the unresolved ones in memory.
+type Engine struct {
+	reader  Reader           // source of error rows, blast radius and sample events
+	signals SignalStore      // optional; nil disables signal lookups
+	deploys DeploySource     // optional; nil disables deployment lookups
+	store   Store            // incident persistence
+	cfg     Config           // tuning values with defaults applied
+	metrics *metrics.Metrics // optional; nil disables metric updates
+	log     *slog.Logger
+	now     func() time.Time // clock, replaceable in tests
+
+	mu     sync.RWMutex        // guards active
+	active map[string]Incident // unresolved incidents keyed by incident ID
+}
+
+// NewEngine builds an Engine. cfg is completed with defaults, a nil log falls
+// back to slog.Default(), and signalStore, deploys and m may be nil.
+func NewEngine(reader Reader, signalStore SignalStore, deploys DeploySource, store Store, cfg Config, m *metrics.Metrics, log *slog.Logger) *Engine {
+	logger := log
+	if logger == nil {
+		logger = slog.Default()
+	}
+	engine := &Engine{
+		reader:  reader,
+		signals: signalStore,
+		deploys: deploys,
+		store:   store,
+		cfg:     cfg.withDefaults(),
+		metrics: m,
+		log:     logger,
+		now:     time.Now,
+		active:  map[string]Incident{},
+	}
+	return engine
+}
+
+// Bootstrap replaces the in-memory active set with the unresolved incidents
+// currently in the store and, when metrics are enabled, sets the active gauge
+// to that count.
+func (e *Engine) Bootstrap(ctx context.Context) error {
+	persisted, err := e.store.ListActive(ctx)
+	if err != nil {
+		return err
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	loaded := map[string]Incident{}
+	for i := range persisted {
+		loaded[persisted[i].IncidentID] = persisted[i]
+	}
+	e.active = loaded
+	if e.metrics != nil {
+		e.metrics.IncidentActive.Set(float64(len(persisted)))
+	}
+	return nil
+}
+
+// Run evaluates incidents every TickInterval until ctx is cancelled. A failed
+// tick is logged as a warning and does not stop the loop.
+func (e *Engine) Run(ctx context.Context) {
+	ticker := time.NewTicker(e.cfg.TickInterval)
+	defer ticker.Stop()
+
+	e.log.Info("incident engine started", "interval", e.cfg.TickInterval, "window", e.cfg.Window)
+	for {
+		select {
+		case <-ticker.C:
+			err := e.Tick(ctx)
+			if err == nil {
+				continue
+			}
+			e.log.Warn("incident tick failed", "err", err)
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// Tick runs one detection pass.
+//
+// It compares error counts per family in the current window against the window
+// immediately before it, upserts an incident for every family that clears the
+// MinCount and MinLift thresholds, then moves incidents that were not seen in
+// this pass towards recovering/resolved. The first store or lookup error
+// aborts the pass and is returned.
+func (e *Engine) Tick(ctx context.Context) error {
+	// Latency is measured with the wall clock, not the injectable e.now.
+	start := time.Now()
+	if e.metrics != nil {
+		defer func() { e.metrics.IncidentTickLatency.Observe(time.Since(start).Seconds()) }()
+	}
+	now := e.now().UTC()
+	currentStart := now.Add(-e.cfg.Window)
+	baselineStart := now.Add(-2 * e.cfg.Window)
+	statuses := failedStatuses()
+	current := e.reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
+	baseline := e.reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
+	baselineByFamily := map[string]int{}
+	for _, row := range baseline.Rows {
+		baselineByFamily[familyKey(row.ErrorFamily)] = row.Count
+	}
+
+	// IDs of incidents that are still firing in this pass.
+	seen := map[string]struct{}{}
+	for _, row := range current.Rows {
+		if row.Count < e.cfg.MinCount {
+			continue
+		}
+		baselineCount := baselineByFamily[familyKey(row.ErrorFamily)]
+		lift := computeLift(row.Count, baselineCount)
+		// The lift threshold only applies when there is a baseline to compare to.
+		if baselineCount > 0 && lift < e.cfg.MinLift {
+			continue
+		}
+		inc, err := e.buildIncident(ctx, row, baselineCount, lift, currentStart, now)
+		if err != nil {
+			return err
+		}
+		seen[inc.IncidentID] = struct{}{}
+		if err := e.store.Upsert(ctx, inc); err != nil {
+			return err
+		}
+		e.remember(inc)
+	}
+	if err := e.transitionMissing(ctx, seen, now); err != nil {
+		return err
+	}
+	if e.metrics != nil {
+		e.metrics.IncidentActive.Set(float64(e.activeCount()))
+	}
+	return nil
+}
+
+// Active returns the unresolved incidents from the store, ordered by
+// sortIncidents.
+func (e *Engine) Active(ctx context.Context) ([]Incident, error) {
+	unresolved, err := e.store.ListActive(ctx)
+	if err == nil {
+		sortIncidents(unresolved)
+		return unresolved, nil
+	}
+	return nil, err
+}
+
+// Get returns the incident with the given id straight from the store.
+func (e *Engine) Get(ctx context.Context, id string) (Incident, error) {
+	inc, err := e.store.Get(ctx, id)
+	return inc, err
+}
+
+// TopActive returns the first incident of Active, or nil (with a nil error)
+// when there are no unresolved incidents.
+func (e *Engine) TopActive(ctx context.Context) (*Incident, error) {
+	ranked, err := e.Active(ctx)
+	switch {
+	case err != nil:
+		return nil, err
+	case len(ranked) == 0:
+		return nil, nil
+	}
+	return &ranked[0], nil
+}
+
+// buildIncident assembles the Incident for one error row in the [since, now]
+// window: it resolves whether the row continues a known incident, gathers
+// blast radius, signals and deployments, classifies the cause, and records
+// opened/updated and classification metrics. It does not persist anything.
+//
+// An ErrUnavailable signal store is tolerated (classification proceeds
+// without signals); any other signal or deployment lookup error is returned.
+func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, error) {
+	events := e.sampleEvents(row.ErrorFamily, since, now, 200)
+	startedAt := earliestEventTime(events, now)
+	env := firstEventEnv(events)
+	// Reuse the start time of an unresolved incident for the same env/family
+	// so the ID derived from it below stays the same across ticks.
+	if existing, ok := e.findByFamily(env, row.ErrorFamily); ok {
+		startedAt = existing.StartedAt
+	}
+	id := StableID(env, row.ErrorFamily, startedAt)
+	existing, hadExisting := e.getCached(id)
+	if !hadExisting {
+		// Fall back to the family match when the derived ID is not cached.
+		if prior, ok := e.findByFamily(env, row.ErrorFamily); ok {
+			existing = prior
+			id = prior.IncidentID
+			hadExisting = true
+		}
+	}
+	blast := e.reader.BlastRadius(
+		SearchFilter{Since: since, Until: now},
+		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
+	)
+	sigs, err := e.querySignals(ctx, row.ErrorFamily.Service, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
+		return Incident{}, err
+	}
+	deploys, err := e.queryDeploys(ctx, row.ErrorFamily.Service, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil {
+		return Incident{}, err
+	}
+	inc := Incident{
+		IncidentID:       id,
+		Env:              env,
+		Service:          row.ErrorFamily.Service,
+		ErrorFamily:      row.ErrorFamily,
+		Status:           StatusActive,
+		Severity:         severity(row.Count, blast.AffectedServices, lift),
+		StartedAt:        startedAt,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: blast.AffectedRequests,
+		AffectedUsers:    cloneInt(row.AffectedUsers),
+		AffectedServices: blast.AffectedServices,
+		TopServices:      append([]string(nil), blast.TopServices...),
+		SampleTraces:     stableSamples(existing.SampleTraces, events, e.cfg.SampleLimit),
+		Lift:             lift,
+		BaselineCount:    baselineCount,
+		CurrentCount:     row.Count,
+	}
+	if hadExisting {
+		// A reappearing incident keeps its original start and leaves the
+		// recovering state (Status was set to active above).
+		inc.StartedAt = existing.StartedAt
+		inc.RecoveringAt = nil
+	}
+	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
+	inc.Cause = class.Cause
+	inc.Confidence = class.Confidence
+	inc.Evidence = class.Evidence
+	inc.NextChecks = class.NextChecks
+	inc.InstrumentationWarnings = class.InstrumentationWarnings
+	e.observeClassification(inc.Cause, inc.Confidence)
+	if e.metrics != nil {
+		if hadExisting {
+			e.metrics.IncidentUpdated.Inc()
+		} else {
+			e.metrics.IncidentOpened.Inc()
+		}
+	}
+	return inc, nil
+}
+
+// transitionMissing advances every cached incident that was NOT seen in the
+// current tick:
+//
+//   - active     -> recovering (RecoveringAt = now)
+//   - recovering -> resolved once now-LastSeenAt >= ResolveAfter
+//     (ResolvedAt = now, and the incident is dropped from the cache)
+//
+// Each transition is persisted before the cache is updated; the first store
+// error aborts the pass and is returned.
+func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}, now time.Time) error {
+	// Snapshot under the read lock so store calls below run without holding it.
+	e.mu.RLock()
+	rows := make([]Incident, 0, len(e.active))
+	for _, inc := range e.active {
+		rows = append(rows, cloneIncident(inc))
+	}
+	e.mu.RUnlock()
+	for _, inc := range rows {
+		if _, ok := seen[inc.IncidentID]; ok {
+			continue
+		}
+		switch inc.Status {
+		case StatusActive:
+			inc.Status = StatusRecovering
+			t := now
+			inc.RecoveringAt = &t
+			inc.UpdatedAt = now
+			if err := e.store.Upsert(ctx, inc); err != nil {
+				return err
+			}
+			e.remember(inc)
+			if e.metrics != nil {
+				e.metrics.IncidentRecovered.Inc()
+			}
+		case StatusRecovering:
+			// Quiet period is measured from the last time the incident fired.
+			if now.Sub(inc.LastSeenAt) >= e.cfg.ResolveAfter {
+				inc.Status = StatusResolved
+				t := now
+				inc.ResolvedAt = &t
+				inc.UpdatedAt = now
+				if err := e.store.Upsert(ctx, inc); err != nil {
+					return err
+				}
+				e.forget(inc.IncidentID)
+				if e.metrics != nil {
+					e.metrics.IncidentResolved.Inc()
+				}
+			}
+		}
+	}
+	return nil
+}
+
+// sampleEvents fetches up to limit failed events for the family's service and
+// error code within [since, until], and keeps only those whose anchor step
+// equals the family's step.
+func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
+	filter := SearchFilter{
+		Service:   f.Service,
+		ErrorCode: f.ErrorCode,
+		Since:     since,
+		Until:     until,
+		Statuses:  failedStatuses(),
+	}
+	found := e.reader.SearchEvents(filter, limit)
+	matching := make([]*eventv2.Event, 0, len(found))
+	for _, ev := range found {
+		if ev == nil || ev.Anchor == nil || ev.Anchor.Step != f.Step {
+			continue
+		}
+		matching = append(matching, ev)
+	}
+	return matching
+}
+
+// querySignals returns up to 200 signals for service/env within
+// [since, until], or (nil, nil) when no signal store is configured.
+func (e *Engine) querySignals(ctx context.Context, service, env string, since, until time.Time) ([]signals.Signal, error) {
+	if e.signals != nil {
+		filter := signals.Filter{Service: service, Env: env, Since: since, Until: until, Limit: 200}
+		return e.signals.Query(ctx, filter)
+	}
+	return nil, nil
+}
+
+// queryDeploys returns the deployments of service within [since, until], or
+// (nil, nil) when no deployment source is configured.
+func (e *Engine) queryDeploys(ctx context.Context, service string, since, until time.Time) ([]Deployment, error) {
+	if e.deploys != nil {
+		return e.deploys.DeploymentsInWindow(ctx, since, until, service)
+	}
+	return nil, nil
+}
+
+// remember stores a clone of inc in the active cache under its incident ID.
+func (e *Engine) remember(inc Incident) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	id := inc.IncidentID
+	e.active[id] = cloneIncident(inc)
+}
+
+// forget removes the incident with the given id from the active cache.
+func (e *Engine) forget(id string) {
+	e.mu.Lock()
+	delete(e.active, id)
+	e.mu.Unlock()
+}
+
+// getCached returns a clone of the cached incident with the given id and
+// whether it was present. When absent, the clone of a zero Incident is
+// returned.
+func (e *Engine) getCached(id string) (Incident, bool) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	if cached, found := e.active[id]; found {
+		return cloneIncident(cached), true
+	}
+	return cloneIncident(Incident{}), false
+}
+
+// findByFamily returns a clone of a cached, unresolved incident with the given
+// env and error family, and whether one was found.
+func (e *Engine) findByFamily(env string, family apiv2.ErrorFamily) (Incident, bool) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	for _, candidate := range e.active {
+		if candidate.Env != env || candidate.ErrorFamily != family {
+			continue
+		}
+		if candidate.Status == StatusResolved {
+			continue
+		}
+		return cloneIncident(candidate), true
+	}
+	return Incident{}, false
+}
+
+// activeCount returns the number of incidents in the active cache.
+func (e *Engine) activeCount() int {
+	e.mu.RLock()
+	count := len(e.active)
+	e.mu.RUnlock()
+	return count
+}
+
+// observeClassification increments the classification counter for the given
+// cause/confidence pair. It is a no-op when metrics are disabled.
+func (e *Engine) observeClassification(cause Cause, confidence Confidence) {
+	if e.metrics != nil {
+		labels := prometheus.Labels{
+			"cause":      string(cause),
+			"confidence": string(confidence),
+		}
+		e.metrics.IncidentClassifications.With(labels).Inc()
+	}
+}
+
+// failedStatuses returns a fresh set of the event statuses the engine counts
+// as failures: error, timeout, partial and aborted.
+func failedStatuses() map[eventv2.Status]struct{} {
+	failed := [...]eventv2.Status{
+		eventv2.StatusError,
+		eventv2.StatusTimeout,
+		eventv2.StatusPartial,
+		eventv2.StatusAborted,
+	}
+	set := make(map[eventv2.Status]struct{}, len(failed))
+	for _, status := range failed {
+		set[status] = struct{}{}
+	}
+	return set
+}
+
+// computeLift returns current/baseline. With no positive baseline the current
+// count itself is returned, which avoids dividing by zero.
+func computeLift(current, baseline int) float64 {
+	lift := float64(current)
+	if baseline > 0 {
+		lift /= float64(baseline)
+	}
+	return lift
+}
+
+// severity scores an incident, capped at 10: a base of 1, plus one point per
+// five occurrences, plus one per affected service, plus a lift bonus of 3
+// (lift >= 10) or 2 (lift >= 3).
+func severity(count, services int, lift float64) int {
+	bonus := 0
+	switch {
+	case lift >= 10:
+		bonus = 3
+	case lift >= 3:
+		bonus = 2
+	}
+	score := 1 + count/5 + services + bonus
+	return int(math.Min(10, float64(score)))
+}
+
+// familyKey builds a map key from an error family by joining service, step
+// and error code with NUL separators.
+func familyKey(f apiv2.ErrorFamily) string {
+	const sep = "\x00"
+	return f.Service + sep + f.Step + sep + f.ErrorCode
+}
+
+// earliestEventTime returns, in UTC, the earliest TsStart among events that is
+// before fallback, or fallback itself when no event is earlier. A zero
+// fallback is replaced by the first non-nil event's TsStart.
+func earliestEventTime(events []*eventv2.Event, fallback time.Time) time.Time {
+	earliest := fallback
+	for _, ev := range events {
+		if ev != nil && (earliest.IsZero() || ev.TsStart.Before(earliest)) {
+			earliest = ev.TsStart
+		}
+	}
+	return earliest.UTC()
+}
+
+// firstEventEnv returns the Env of the first event that has one, or "unknown"
+// when none does.
+func firstEventEnv(events []*eventv2.Event) string {
+	for _, ev := range events {
+		if ev == nil || ev.Env == "" {
+			continue
+		}
+		return ev.Env
+	}
+	return "unknown"
+}
+
+// stableSamples returns at most limit trace IDs for an incident, keeping
+// previously chosen samples stable across ticks.
+//
+// existing IDs are kept first, in order. If there are none, the earliest event
+// (ties broken by trace ID) seeds the list. Remaining slots are filled with
+// the most recent events not already present, newest first (ties broken by
+// trace ID). limit <= 0 yields nil.
+//
+// Nil events and events without a trace ID are ignored; they are filtered out
+// before sorting so the comparators never dereference a nil event.
+func stableSamples(existing []string, events []*eventv2.Event, limit int) []string {
+	if limit <= 0 {
+		return nil
+	}
+	out := append([]string(nil), existing...)
+	if len(out) >= limit {
+		// Already full: nothing from events can be added.
+		return out[:limit]
+	}
+
+	candidates := make([]*eventv2.Event, 0, len(events))
+	for _, ev := range events {
+		if ev != nil && ev.TraceID != "" {
+			candidates = append(candidates, ev)
+		}
+	}
+
+	seen := make(map[string]struct{}, len(out)+len(candidates))
+	for _, traceID := range out {
+		seen[traceID] = struct{}{}
+	}
+
+	// With no prior samples, anchor the list on the earliest failing trace.
+	if len(out) == 0 && len(candidates) > 0 {
+		earliest := candidates[0]
+		for _, ev := range candidates[1:] {
+			earlier := ev.TsStart.Before(earliest.TsStart)
+			tieBreak := ev.TsStart.Equal(earliest.TsStart) && ev.TraceID < earliest.TraceID
+			if earlier || tieBreak {
+				earliest = ev
+			}
+		}
+		out = append(out, earliest.TraceID)
+		seen[earliest.TraceID] = struct{}{}
+	}
+
+	// Fill the remaining slots newest-first.
+	sort.SliceStable(candidates, func(i, j int) bool {
+		if !candidates[i].TsStart.Equal(candidates[j].TsStart) {
+			return candidates[i].TsStart.After(candidates[j].TsStart)
+		}
+		return candidates[i].TraceID < candidates[j].TraceID
+	})
+	for _, ev := range candidates {
+		if len(out) >= limit {
+			break
+		}
+		if _, ok := seen[ev.TraceID]; ok {
+			continue
+		}
+		out = append(out, ev.TraceID)
+		seen[ev.TraceID] = struct{}{}
+	}
+	return out
+}
+
+// cloneInt returns a pointer to a copy of *in, or nil when in is nil, so the
+// result never aliases the caller's value.
+func cloneInt(in *int) *int {
+	if in != nil {
+		copied := *in
+		return &copied
+	}
+	return nil
+}
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
new file mode 100644
index 0000000..0b76de0
--- /dev/null
+++ b/internal/incidents/engine_test.go
@@ -0,0 +1,105 @@
+package incidents
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// TestEngineLifecycleAndSampleStability drives one error family through the
+// engine with a controllable clock and checks the active -> recovering ->
+// resolved progression, the ordering of sample traces, and that a freshly
+// bootstrapped engine over the same store reports no active incidents.
+func TestEngineLifecycleAndSampleStability(t *testing.T) {
+ now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+ reader := &fakeReader{
+ current: ErrorsResult{Rows: []apiv2.ErrorRow{{
+ ErrorFamily: testFamily(),
+ Count: 6,
+ AffectedTraces: 6,
+ SampleTraces: []string{"trace-new"},
+ }}},
+ blast: apiv2.BlastRadiusResponse{
+ AffectedRequests: 6,
+ AffectedServices: 2,
+ TopServices: []string{"checkout", "payment"},
+ SampleTraces: []string{"trace-new"},
+ },
+ events: []*eventv2.Event{
+ testIncidentEvent("old", "trace-old", now.Add(-2*time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+ testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment"),
+ },
+ }
+ store := NewMemoryStore()
+ engine := NewEngine(reader, nil, nil, store, Config{MinCount: 5, ResolveAfter: time.Minute, SampleLimit: 2}, nil, nil)
+ // The closure reads the local now, so advancing now below moves the engine clock.
+ engine.now = func() time.Time { return now }
+ if err := engine.Bootstrap(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ if err := engine.Tick(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ rows, err := engine.Active(context.Background())
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(rows) != 1 || rows[0].Status != StatusActive {
+ t.Fatalf("rows=%+v", rows)
+ }
+ // Expect the earliest trace first, followed by the most recent one.
+ if got := rows[0].SampleTraces; len(got) != 2 || got[0] != "trace-old" || got[1] != "trace-new" {
+ t.Fatalf("samples=%+v", got)
+ }
+
+ // The family disappears from the current window 30s later: expect recovering.
+ reader.current.Rows = nil
+ now = now.Add(30 * time.Second)
+ if err := engine.Tick(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ rows, _ = engine.Active(context.Background())
+ if len(rows) != 1 || rows[0].Status != StatusRecovering {
+ t.Fatalf("recovering rows=%+v", rows)
+ }
+
+ // Two more quiet minutes exceed ResolveAfter (1m): the incident leaves the active set.
+ now = now.Add(2 * time.Minute)
+ if err := engine.Tick(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ rows, _ = engine.Active(context.Background())
+ if len(rows) != 0 {
+ t.Fatalf("expected resolved incident removed from active cache, rows=%+v", rows)
+ }
+
+ // A second engine sharing the store must not resurrect the resolved incident.
+ rehydrated := NewEngine(reader, nil, nil, store, Config{}, nil, nil)
+ if err := rehydrated.Bootstrap(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ rows, _ = rehydrated.Active(context.Background())
+ if len(rows) != 0 {
+ t.Fatalf("bootstrap should ignore resolved incidents, rows=%+v", rows)
+ }
+}
+
+// fakeReader is a canned test double with the three Reader query methods.
+// It ignores every filter argument and returns the configured values.
+type fakeReader struct {
+ current ErrorsResult
+ base ErrorsResult
+ blast apiv2.BlastRadiusResponse
+ events []*eventv2.Event
+ calls int
+}
+
+// Errors alternates between the two canned results: odd-numbered calls return
+// current and even-numbered calls return base.
+// NOTE(review): presumably mirrors the engine querying the current window
+// before the baseline window on each tick; confirm against the engine.
+func (r *fakeReader) Errors(_ SearchFilter, _ int) ErrorsResult {
+ r.calls++
+ if r.calls%2 == 1 {
+ return r.current
+ }
+ return r.base
+}
+
+// BlastRadius returns the canned blast response with Key set to the requested key.
+func (r *fakeReader) BlastRadius(_ SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse {
+ out := r.blast
+ out.Key = key
+ return out
+}
+
+// SearchEvents returns the canned events regardless of filter or limit.
+func (r *fakeReader) SearchEvents(_ SearchFilter, _ int) []*eventv2.Event {
+ return r.events
+}
diff --git a/internal/incidents/handler.go b/internal/incidents/handler.go
new file mode 100644
index 0000000..08d57ee
--- /dev/null
+++ b/internal/incidents/handler.go
@@ -0,0 +1,92 @@
+package incidents
+
+import (
+ "encoding/json"
+ "errors"
+ "net/http"
+ "strings"
+)
+
+// Handler serves the incident HTTP endpoints on top of an Engine.
+type Handler struct {
+ engine *Engine
+}
+
+// NewHandler builds a Handler whose endpoints read from engine.
+func NewHandler(engine *Engine) *Handler {
+	h := new(Handler)
+	h.engine = engine
+	return h
+}
+
+// Active answers GET requests with the engine's active incidents as an
+// ActiveResponse JSON document. Other methods get a 405 error body and an
+// engine failure gets a 500 error body carrying the error text as detail.
+func (h *Handler) Active(w http.ResponseWriter, r *http.Request) {
+	if method := r.Method; method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	incidents, err := h.engine.Active(r.Context())
+	if err == nil {
+		writeJSON(w, http.StatusOK, ActiveResponse{Incidents: incidents})
+		return
+	}
+	writeError(w, http.StatusInternalServerError, "internal_error", "query incidents failed", err.Error())
+}
+
+// Incident answers GET requests under /v1/incidents/.
+//
+// The remainder of the path after that prefix is the incident id. A trailing
+// "/snapshot" segment hands the request to snapshot; otherwise the incident is
+// returned as a DetailResponse. A path without the prefix, an empty id, or an
+// id the engine does not know gets a 404 error body. Any other engine error
+// gets a 500 error body.
+func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	rest := strings.TrimPrefix(r.URL.Path, "/v1/incidents/")
+	// rest == r.URL.Path means the prefix was absent from the request path.
+	if rest == "" || rest == r.URL.Path {
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	}
+	// TrimSuffix only changes the string when the suffix is present.
+	if id := strings.TrimSuffix(rest, "/snapshot"); id != rest {
+		h.snapshot(w, r, id)
+		return
+	}
+	inc, err := h.engine.Get(r.Context(), rest)
+	switch {
+	case errors.Is(err, ErrNotFound):
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
+	default:
+		writeJSON(w, http.StatusOK, DetailResponse{Incident: inc})
+	}
+}
+
+// snapshot writes the rendered snapshot of incident id.
+//
+// When the Accept header contains "application/json" the reply is a
+// SnapshotResponse (snapshot text plus the incident). Otherwise the snapshot
+// text is sent as text/plain. Unknown ids get a 404 error body and other
+// engine errors a 500 error body.
+func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
+	inc, err := h.engine.Get(r.Context(), id)
+	switch {
+	case errors.Is(err, ErrNotFound):
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
+		return
+	}
+	text := RenderSnapshot(inc)
+	wantsJSON := strings.Contains(r.Header.Get("Accept"), "application/json")
+	if !wantsJSON {
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		w.WriteHeader(http.StatusOK)
+		// Best-effort write: nothing useful can be done if the client is gone.
+		_, _ = w.Write([]byte(text))
+		return
+	}
+	writeJSON(w, http.StatusOK, SnapshotResponse{Snapshot: text, Incident: inc})
+}
+
+// writeJSON sends v as a JSON body with the given status code and an
+// application/json content type. Encoding errors are deliberately ignored
+// because the status line has already been written by then.
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	headers := w.Header()
+	headers.Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	enc := json.NewEncoder(w)
+	_ = enc.Encode(v)
+}
+
+// writeError sends an error envelope of the form
+// {"error": {"code": ..., "message": ..., "detail": ...}} with the given
+// status. The detail key is always present, even when detail is empty.
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+	inner := map[string]any{
+		"code":    code,
+		"message": message,
+		"detail":  detail,
+	}
+	envelope := map[string]any{"error": inner}
+	writeJSON(w, status, envelope)
+}
diff --git a/internal/incidents/handler_test.go b/internal/incidents/handler_test.go
new file mode 100644
index 0000000..2be9a80
--- /dev/null
+++ b/internal/incidents/handler_test.go
@@ -0,0 +1,61 @@
+package incidents
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+)
+
+// TestHandlerActiveDetailAndSnapshot seeds one active incident in a memory
+// store and checks the four read paths: the active list, the detail view, the
+// plain-text snapshot, and the JSON snapshot selected via the Accept header.
+func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
+ now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+ store := NewMemoryStore()
+ inc := testIncident(now)
+ if err := store.Upsert(context.Background(), inc); err != nil {
+ t.Fatal(err)
+ }
+ engine := NewEngine(&fakeReader{}, nil, nil, store, Config{}, nil, nil)
+ if err := engine.Bootstrap(context.Background()); err != nil {
+ t.Fatal(err)
+ }
+ h := NewHandler(engine)
+
+ // Active list contains exactly the seeded incident.
+ rec := httptest.NewRecorder()
+ h.Active(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/active", nil))
+ if rec.Code != http.StatusOK {
+ t.Fatalf("active status=%d body=%s", rec.Code, rec.Body.String())
+ }
+ var active ActiveResponse
+ if err := json.Unmarshal(rec.Body.Bytes(), &active); err != nil {
+ t.Fatal(err)
+ }
+ if len(active.Incidents) != 1 || active.Incidents[0].IncidentID != inc.IncidentID {
+ t.Fatalf("active=%+v", active)
+ }
+
+ // Detail view by id.
+ rec = httptest.NewRecorder()
+ h.Incident(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID, nil))
+ if rec.Code != http.StatusOK || !strings.Contains(rec.Body.String(), inc.IncidentID) {
+ t.Fatalf("detail status=%d body=%s", rec.Code, rec.Body.String())
+ }
+
+ // Snapshot without an Accept header is served as text/plain.
+ rec = httptest.NewRecorder()
+ h.Incident(rec, httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID+"/snapshot", nil))
+ if rec.Code != http.StatusOK || !strings.Contains(rec.Header().Get("Content-Type"), "text/plain") {
+ t.Fatalf("snapshot status=%d content-type=%s", rec.Code, rec.Header().Get("Content-Type"))
+ }
+ if !strings.Contains(rec.Body.String(), "Incident "+inc.IncidentID) {
+ t.Fatalf("snapshot=%s", rec.Body.String())
+ }
+
+ // Snapshot with Accept: application/json is wrapped in a JSON document.
+ rec = httptest.NewRecorder()
+ req := httptest.NewRequest(http.MethodGet, "/v1/incidents/"+inc.IncidentID+"/snapshot", nil)
+ req.Header.Set("Accept", "application/json")
+ h.Incident(rec, req)
+ if rec.Code != http.StatusOK || !strings.Contains(rec.Body.String(), `"snapshot"`) {
+ t.Fatalf("json snapshot status=%d body=%s", rec.Code, rec.Body.String())
+ }
+}
diff --git a/internal/incidents/id.go b/internal/incidents/id.go
new file mode 100644
index 0000000..8a038ce
--- /dev/null
+++ b/internal/incidents/id.go
@@ -0,0 +1,19 @@
+package incidents
+
+import (
+ "crypto/sha256"
+ "encoding/hex"
+ "strings"
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// idBucket is the width of the time bucket that StableID truncates start
+// times to before hashing.
+const idBucket = 5 * time.Minute
+
+// StableID derives a deterministic incident id from the environment, the
+// error family and the idBucket-wide time bucket containing startedAt (in
+// UTC). Start times in the same bucket therefore yield the same id. The id is
+// "inc_" followed by the first 16 hex characters of a SHA-256 digest.
+func StableID(env string, family apiv2.ErrorFamily, startedAt time.Time) string {
+	window := startedAt.UTC().Truncate(idBucket)
+	key := strings.Join([]string{
+		env,
+		family.Service,
+		apiv2.FormatErrorFamily(family),
+		window.Format(time.RFC3339),
+	}, "|")
+	digest := sha256.Sum256([]byte(key))
+	return "inc_" + hex.EncodeToString(digest[:])[:16]
+}
diff --git a/internal/incidents/id_test.go b/internal/incidents/id_test.go
new file mode 100644
index 0000000..bd96ede
--- /dev/null
+++ b/internal/incidents/id_test.go
@@ -0,0 +1,25 @@
+package incidents
+
+import (
+ "testing"
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// TestStableIDUsesFiveMinuteBucket checks that two start times inside the same
+// five-minute bucket (12:03:00 and 12:04:30) share an id, that a start time in
+// the next bucket (12:06:00) does not, and that ids are "inc_" plus 16 characters.
+func TestStableIDUsesFiveMinuteBucket(t *testing.T) {
+ family := apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}
+ base := time.Date(2026, 5, 4, 12, 3, 0, 0, time.UTC)
+ a := StableID("prod", family, base)
+ b := StableID("prod", family, base.Add(90*time.Second))
+ c := StableID("prod", family, base.Add(3*time.Minute))
+ if a != b {
+ t.Fatalf("same bucket ids differ: %s %s", a, b)
+ }
+ if a == c {
+ t.Fatalf("different bucket id did not change: %s", a)
+ }
+ if len(a) != len("inc_")+16 {
+ t.Fatalf("id length=%d id=%s", len(a), a)
+ }
+}
diff --git a/internal/incidents/interfaces.go b/internal/incidents/interfaces.go
new file mode 100644
index 0000000..d2ce85e
--- /dev/null
+++ b/internal/incidents/interfaces.go
@@ -0,0 +1,36 @@
+package incidents
+
+import (
+ "context"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// Reader is the read-side query surface handed to NewEngine: error-family
+// rows, blast radius for a key, and raw event search.
+// NOTE(review): the meaning of limit (rows vs. events, zero handling) is not
+// visible here; confirm against the implementation.
+type Reader interface {
+ Errors(f SearchFilter, limit int) ErrorsResult
+ BlastRadius(f SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse
+ SearchEvents(f SearchFilter, limit int) []*eventv2.Event
+}
+
+// SearchFilter narrows Reader queries by service, status set, error code and
+// a Since/Until time window.
+// NOTE(review): whether zero values mean "no constraint" and whether the
+// window bounds are inclusive is not shown here; confirm with the Reader
+// implementation.
+type SearchFilter struct {
+ Service string
+ Statuses map[eventv2.Status]struct{}
+ ErrorCode string
+ Since time.Time
+ Until time.Time
+}
+
+// ErrorsResult is the value returned by Reader.Errors: the error rows found.
+type ErrorsResult struct {
+ Rows []apiv2.ErrorRow
+}
+
+// SignalStore is the consumer-side view of signal storage used by this
+// package: it only needs to query signals by filter.
+type SignalStore interface {
+ Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error)
+}
+
+// DeploySource lists deployments seen between start and end.
+// NOTE(review): presumably an empty serviceFilter means "all services";
+// confirm against the implementation.
+type DeploySource interface {
+ DeploymentsInWindow(ctx context.Context, start, end time.Time, serviceFilter string) ([]Deployment, error)
+}
diff --git a/internal/incidents/nextchecks.go b/internal/incidents/nextchecks.go
new file mode 100644
index 0000000..b3a6559
--- /dev/null
+++ b/internal/incidents/nextchecks.go
@@ -0,0 +1,30 @@
+package incidents
+
+// NextChecks returns the three suggested follow-up checks for an incident
+// with the given cause. Any cause other than deploy, dependency or app gets
+// the generic "unknown" checklist. The confidence argument is accepted for
+// the call signature but is not consulted.
+func NextChecks(cause Cause, confidence Confidence) []string {
+	if cause == CauseDeploy {
+		return []string{
+			"Compare error onset with the deployment timestamp.",
+			"Check whether the deployed service version appears on failing traces.",
+			"Roll back or canary-disable the deployment if the affected family is still rising.",
+		}
+	}
+	if cause == CauseDependency {
+		return []string{
+			"Check the downstream service health and recent deploys.",
+			"Inspect retries, timeouts, and circuit-breaker state for the failing step.",
+			"Notify the downstream owner with sample traces and affected service list.",
+		}
+	}
+	if cause == CauseApp {
+		return []string{
+			"Inspect the first failing step and recent application logs.",
+			"Compare failing request fields against recent successful requests.",
+			"Add instrumentation if the step lacks enough context to isolate the bad branch.",
+		}
+	}
+	return []string{
+		"Inspect sample traces for missing downstream or deploy evidence.",
+		"Check whether production signals are being posted to /v1/signals.",
+		"Add service version and dependency health signals to improve classification.",
+	}
+}
diff --git a/internal/incidents/render.go b/internal/incidents/render.go
new file mode 100644
index 0000000..75c2024
--- /dev/null
+++ b/internal/incidents/render.go
@@ -0,0 +1,48 @@
+package incidents
+
+import (
+ "fmt"
+ "strings"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// RenderSnapshot formats inc as a plain-text report.
+//
+// The header lines (id, status, family, cause, start time, impact, lift) are
+// always written. Top services, sample traces, evidence, next checks and
+// instrumentation warnings are each written only when non-empty.
+func RenderSnapshot(inc Incident) string {
+	var out strings.Builder
+
+	// bullets writes a headed "- item" list, or nothing when items is empty.
+	bullets := func(heading string, items []string) {
+		if len(items) == 0 {
+			return
+		}
+		out.WriteString(heading)
+		for _, item := range items {
+			fmt.Fprintf(&out, "- %s\n", item)
+		}
+	}
+
+	fmt.Fprintf(&out, "Incident %s\n", inc.IncidentID)
+	fmt.Fprintf(&out, "Status: %s\n", inc.Status)
+	fmt.Fprintf(&out, "Family: %s\n", apiv2.FormatErrorFamily(inc.ErrorFamily))
+	fmt.Fprintf(&out, "Cause: %s (%s confidence)\n", inc.Cause, inc.Confidence)
+	fmt.Fprintf(&out, "Started: %s\n", inc.StartedAt.Format("2006-01-02T15:04:05Z07:00"))
+	fmt.Fprintf(&out, "Affected: %d requests, %d services\n", inc.AffectedRequests, inc.AffectedServices)
+	fmt.Fprintf(&out, "Lift: %.2fx over baseline %d\n", inc.Lift, inc.BaselineCount)
+
+	if services := inc.TopServices; len(services) > 0 {
+		fmt.Fprintf(&out, "Top services: %s\n", strings.Join(services, ", "))
+	}
+	if traces := inc.SampleTraces; len(traces) > 0 {
+		fmt.Fprintf(&out, "Sample traces: %s\n", strings.Join(traces, ", "))
+	}
+
+	if len(inc.Evidence) > 0 {
+		out.WriteString("\nEvidence:\n")
+		for i := range inc.Evidence {
+			item := &inc.Evidence[i]
+			fmt.Fprintf(&out, "- %s: %s", item.Kind, item.Title)
+			if item.Detail != "" {
+				fmt.Fprintf(&out, " (%s)", item.Detail)
+			}
+			out.WriteByte('\n')
+		}
+	}
+
+	bullets("\nNext checks:\n", inc.NextChecks)
+	bullets("\nInstrumentation warnings:\n", inc.InstrumentationWarnings)
+	return out.String()
+}
diff --git a/internal/incidents/store.go b/internal/incidents/store.go
new file mode 100644
index 0000000..90b7323
--- /dev/null
+++ b/internal/incidents/store.go
@@ -0,0 +1,104 @@
+package incidents
+
+import (
+ "context"
+ "errors"
+ "sort"
+ "sync"
+ "time"
+)
+
+// ErrNotFound is returned by Store.Get when no incident has the requested id.
+// Compare with errors.Is.
+var ErrNotFound = errors.New("incidents: not found")
+
+// Store persists incidents keyed by incident id.
+//
+// As implemented by MemoryStore: Upsert inserts or replaces by IncidentID,
+// Get returns ErrNotFound for an unknown id, ListActive returns every
+// incident whose status is not resolved, and PruneResolvedOlderThan deletes
+// resolved incidents whose ResolvedAt is before cutoff and reports how many
+// were removed.
+type Store interface {
+ Upsert(ctx context.Context, inc Incident) error
+ Get(ctx context.Context, id string) (Incident, error)
+ ListActive(ctx context.Context) ([]Incident, error)
+ PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error)
+}
+
+// MemoryStore is an in-process Store backed by a map and guarded by a mutex.
+// Create it with NewMemoryStore; it must not be copied after first use
+// because it embeds a sync.Mutex.
+type MemoryStore struct {
+ mu sync.Mutex
+ rows map[string]Incident
+}
+
+// NewMemoryStore returns an empty, ready-to-use MemoryStore.
+func NewMemoryStore() *MemoryStore {
+	store := &MemoryStore{}
+	store.rows = make(map[string]Incident)
+	return store
+}
+
+// Upsert stores a private copy of inc under inc.IncidentID, replacing any
+// previous entry with that id. It never fails.
+func (s *MemoryStore) Upsert(_ context.Context, inc Incident) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	stored := cloneIncident(inc)
+	s.rows[stored.IncidentID] = stored
+	return nil
+}
+
+// Get returns a private copy of the incident stored under id, or ErrNotFound
+// with a zero Incident when there is none.
+func (s *MemoryStore) Get(_ context.Context, id string) (Incident, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if stored, found := s.rows[id]; found {
+		return cloneIncident(stored), nil
+	}
+	return Incident{}, ErrNotFound
+}
+
+// ListActive returns private copies of every incident whose status is not
+// resolved, in sortIncidents order. The result is non-nil even when empty.
+func (s *MemoryStore) ListActive(_ context.Context) ([]Incident, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	active := make([]Incident, 0, len(s.rows))
+	for _, row := range s.rows {
+		if row.Status == StatusResolved {
+			continue
+		}
+		active = append(active, cloneIncident(row))
+	}
+	sortIncidents(active)
+	return active, nil
+}
+
+// PruneResolvedOlderThan deletes resolved incidents whose ResolvedAt is set
+// and strictly before cutoff, and returns the number removed. Resolved
+// incidents without a ResolvedAt are kept.
+func (s *MemoryStore) PruneResolvedOlderThan(_ context.Context, cutoff time.Time) (int, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	removed := 0
+	for id, row := range s.rows {
+		if row.Status != StatusResolved || row.ResolvedAt == nil {
+			continue
+		}
+		if !row.ResolvedAt.Before(cutoff) {
+			continue
+		}
+		// Deleting the current key while ranging over a map is safe in Go.
+		delete(s.rows, id)
+		removed++
+	}
+	return removed, nil
+}
+
+// sortIncidents orders rows in place: highest severity first, then most
+// recently started first, then ascending incident id as the final tie-break.
+func sortIncidents(rows []Incident) {
+	less := func(i, j int) bool {
+		a, b := &rows[i], &rows[j]
+		switch {
+		case a.Severity != b.Severity:
+			return a.Severity > b.Severity
+		case !a.StartedAt.Equal(b.StartedAt):
+			return a.StartedAt.After(b.StartedAt)
+		default:
+			return a.IncidentID < b.IncidentID
+		}
+	}
+	sort.SliceStable(rows, less)
+}
+
+// cloneIncident returns a copy of in that shares no slices, pointers or
+// evidence field maps with the original, so callers of the store cannot
+// mutate stored state (or each other's copies) by accident.
+//
+// Slices are copied element-wise, the optional pointer fields are copied to
+// fresh values, and each Evidence.Fields map is copied one level deep. Values
+// stored inside those maps are copied as-is, so nested maps or slices held as
+// values remain shared.
+func cloneIncident(in Incident) Incident {
+	out := in
+	out.TopServices = append([]string(nil), in.TopServices...)
+	out.SampleTraces = append([]string(nil), in.SampleTraces...)
+	out.NextChecks = append([]string(nil), in.NextChecks...)
+	out.InstrumentationWarnings = append([]string(nil), in.InstrumentationWarnings...)
+
+	out.Evidence = append([]Evidence(nil), in.Evidence...)
+	for i := range out.Evidence {
+		fields := out.Evidence[i].Fields
+		if fields == nil {
+			continue
+		}
+		// Copying the Evidence struct above only copied the map header; give
+		// the clone its own map so writes do not leak back into the source.
+		dup := make(map[string]any, len(fields))
+		for key, value := range fields {
+			dup[key] = value
+		}
+		out.Evidence[i].Fields = dup
+	}
+
+	if in.AffectedUsers != nil {
+		v := *in.AffectedUsers
+		out.AffectedUsers = &v
+	}
+	if in.RecoveringAt != nil {
+		v := *in.RecoveringAt
+		out.RecoveringAt = &v
+	}
+	if in.ResolvedAt != nil {
+		v := *in.ResolvedAt
+		out.ResolvedAt = &v
+	}
+	return out
+}
diff --git a/internal/incidents/test_helpers_test.go b/internal/incidents/test_helpers_test.go
new file mode 100644
index 0000000..8ff2e4a
--- /dev/null
+++ b/internal/incidents/test_helpers_test.go
@@ -0,0 +1,60 @@
+package incidents
+
+import (
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+)
+
+// testFamily returns the checkout / payment.charge / PMT_502 error family
+// shared by the tests in this package.
+func testFamily() apiv2.ErrorFamily {
+ return apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"}
+}
+
+// testIncidentEvent builds a 10ms error event in env "prod" starting at ts,
+// anchored on step/code and carrying a single failed step. When downstream is
+// non-empty, the step also records an HTTP downstream call to that service.
+func testIncidentEvent(id, traceID string, ts time.Time, service, step, code, downstream string) *eventv2.Event {
+ ev := &eventv2.Event{
+ SchemaVersion: eventv2.SchemaVersion2,
+ EventID: id,
+ TsStart: ts,
+ TsEnd: ts.Add(10 * time.Millisecond),
+ DurationMS: 10,
+ Kind: "http",
+ Service: service,
+ Env: "prod",
+ Version: "v1",
+ TraceID: traceID,
+ SpanID: id + "-span",
+ Status: eventv2.StatusError,
+ Anchor: &eventv2.Anchor{Step: step, ErrorCode: code},
+ }
+ stepObj := eventv2.Step{Name: step, StartMS: 0, DurationMS: 10, Status: eventv2.StepStatusError, Error: &eventv2.StepError{Code: code, Reason: "failed"}}
+ if downstream != "" {
+ stepObj.Downstream = &eventv2.Downstream{Service: downstream, Endpoint: "/charge", Kind: "http"}
+ }
+ ev.Steps = []eventv2.Step{stepObj}
+ return ev
+}
+
+// testIncident returns an active, dependency-caused incident for testFamily
+// in env "prod" whose start, update and last-seen times are all now. Its id
+// is derived with StableID from the same inputs.
+func testIncident(now time.Time) Incident {
+ return Incident{
+ IncidentID: StableID("prod", testFamily(), now),
+ Env: "prod",
+ Service: "checkout",
+ ErrorFamily: testFamily(),
+ Status: StatusActive,
+ Cause: CauseDependency,
+ Confidence: ConfidenceMedium,
+ Severity: 7,
+ StartedAt: now,
+ UpdatedAt: now,
+ LastSeenAt: now,
+ AffectedRequests: 6,
+ AffectedServices: 2,
+ TopServices: []string{"checkout", "payment"},
+ SampleTraces: []string{"trace-a"},
+ Evidence: []Evidence{{Kind: EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: now}},
+ NextChecks: []string{"check payment"},
+ Lift: 6,
+ CurrentCount: 6,
+ }
+}
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
new file mode 100644
index 0000000..f1d8bdb
--- /dev/null
+++ b/internal/incidents/types.go
@@ -0,0 +1,103 @@
+package incidents
+
+import (
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// Status is the lifecycle state of an incident, serialized as a string.
+type Status string
+
+// Incident lifecycle states. Stores treat every status other than
+// StatusResolved as active.
+const (
+ StatusActive Status = "active"
+ StatusRecovering Status = "recovering"
+ StatusResolved Status = "resolved"
+)
+
+// Cause is the classified origin of an incident, serialized as a string.
+type Cause string
+
+// Known causes. NextChecks treats anything other than deploy, app and
+// dependency like CauseUnknown.
+const (
+ CauseDeploy Cause = "deploy"
+ CauseApp Cause = "app"
+ CauseDependency Cause = "dependency"
+ CauseUnknown Cause = "unknown"
+)
+
+// Confidence grades how certain a cause classification is, serialized as a string.
+type Confidence string
+
+// Confidence levels.
+const (
+ ConfidenceHigh Confidence = "high"
+ ConfidenceMedium Confidence = "medium"
+ ConfidenceLow Confidence = "low"
+)
+
+// EvidenceKind names the source an Evidence item came from, serialized as a string.
+type EvidenceKind string
+
+// Evidence kinds.
+const (
+ EvidenceSignal EvidenceKind = "signal"
+ EvidenceDeployment EvidenceKind = "deployment"
+ EvidenceTrace EvidenceKind = "trace"
+ EvidenceMetric EvidenceKind = "metric"
+)
+
+// Evidence is one supporting observation attached to an incident. Kind, Title
+// and OccurredAt are always serialized; every other field is omitted from
+// JSON when empty.
+// NOTE(review): presumably only the id field matching Kind (SignalID,
+// DeployID or TraceID) is populated; confirm against the engine.
+type Evidence struct {
+ Kind EvidenceKind `json:"kind"`
+ Title string `json:"title"`
+ Detail string `json:"detail,omitempty"`
+ Service string `json:"service,omitempty"`
+ SignalID string `json:"signal_id,omitempty"`
+ DeployID string `json:"deployment_id,omitempty"`
+ TraceID string `json:"trace_id,omitempty"`
+ OccurredAt time.Time `json:"occurred_at"`
+ Fields map[string]any `json:"fields,omitempty"`
+}
+
+// Incident is the engine's record of one error family misbehaving in one
+// environment, together with its classification, impact numbers and
+// supporting material. RecoveringAt, ResolvedAt and AffectedUsers are optional
+// pointers and are omitted from JSON when nil; the slice fields other than
+// InstrumentationWarnings are always serialized.
+type Incident struct {
+ IncidentID string `json:"incident_id"`
+ Env string `json:"env"`
+ Service string `json:"service"`
+ ErrorFamily apiv2.ErrorFamily `json:"error_family"`
+ Status Status `json:"status"`
+ Cause Cause `json:"cause"`
+ Confidence Confidence `json:"confidence"`
+ Severity int `json:"severity"`
+ StartedAt time.Time `json:"started_at"`
+ UpdatedAt time.Time `json:"updated_at"`
+ LastSeenAt time.Time `json:"last_seen_at"`
+ RecoveringAt *time.Time `json:"recovering_at,omitempty"`
+ ResolvedAt *time.Time `json:"resolved_at,omitempty"`
+ AffectedRequests int `json:"affected_requests"`
+ AffectedUsers *int `json:"affected_users,omitempty"`
+ AffectedServices int `json:"affected_services"`
+ TopServices []string `json:"top_services"`
+ SampleTraces []string `json:"sample_traces"`
+ Evidence []Evidence `json:"evidence"`
+ NextChecks []string `json:"next_checks"`
+ InstrumentationWarnings []string `json:"instrumentation_warnings,omitempty"`
+ Lift float64 `json:"lift"`
+ BaselineCount int `json:"baseline_count"`
+ CurrentCount int `json:"current_count"`
+}
+
+// ActiveResponse is the JSON body written by Handler.Active.
+type ActiveResponse struct {
+ Incidents []Incident `json:"incidents"`
+}
+
+// DetailResponse is the JSON body written by Handler.Incident for a single incident.
+type DetailResponse struct {
+ Incident Incident `json:"incident"`
+}
+
+// SnapshotResponse is the JSON form of a snapshot: the rendered text plus the
+// incident it was rendered from.
+type SnapshotResponse struct {
+ Snapshot string `json:"snapshot"`
+ Incident Incident `json:"incident"`
+}
+
+// Deployment describes one deployed service version as returned by a
+// DeploySource.
+// NOTE(review): FirstSeen/LastSeen presumably bound the period in which the
+// version was observed; confirm against the deploy source.
+type Deployment struct {
+ ID string
+ Service string
+ Version string
+ Env string
+ FirstSeen time.Time
+ LastSeen time.Time
+ Metadata map[string]string
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index a654cf8..6412a70 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -71,6 +71,14 @@ type Metrics struct {
SignalsRejected *prometheus.CounterVec
SignalRetentionPruned prometheus.Counter
+ IncidentOpened prometheus.Counter
+ IncidentUpdated prometheus.Counter
+ IncidentRecovered prometheus.Counter
+ IncidentResolved prometheus.Counter
+ IncidentTickLatency prometheus.Histogram
+ IncidentActive prometheus.Gauge
+ IncidentClassifications *prometheus.CounterVec
+
CausalRunsTotal prometheus.Counter
CausalRunDuration prometheus.Histogram
CausalRunFailures prometheus.Counter
@@ -340,6 +348,41 @@ func New(reg *prometheus.Registry) *Metrics {
Help: "Production-context signals pruned by retention.",
})
+ m.IncidentOpened = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incidents_opened_total",
+ Help: "Incidents opened by the v2.1 incident engine.",
+ })
+ m.IncidentUpdated = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incidents_updated_total",
+ Help: "Incidents updated by the v2.1 incident engine.",
+ })
+ m.IncidentRecovered = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incidents_recovered_total",
+ Help: "Incidents moved to recovering by the v2.1 incident engine.",
+ })
+ m.IncidentResolved = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incidents_resolved_total",
+ Help: "Incidents resolved by the v2.1 incident engine.",
+ })
+ m.IncidentTickLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
+ Name: "waylog_incident_tick_latency_seconds",
+ Help: "Incident engine tick duration.",
+ Buckets: defaultBuckets,
+ })
+ m.IncidentActive = prometheus.NewGauge(prometheus.GaugeOpts{
+ Name: "waylog_incidents_active",
+ Help: "Active or recovering incidents currently tracked by the v2.1 incident engine.",
+ })
+ m.IncidentClassifications = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Name: "waylog_incident_classifications_total",
+ Help: "Incident classifications by cause and confidence.",
+ }, []string{"cause", "confidence"})
+ for _, cause := range []string{"deploy", "app", "dependency", "unknown"} {
+ for _, confidence := range []string{"high", "medium", "low"} {
+ m.IncidentClassifications.WithLabelValues(cause, confidence).Add(0)
+ }
+ }
+
m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "waylog_causal_runs_total",
Help: "Total causal inference runs.",
@@ -417,6 +460,8 @@ func New(reg *prometheus.Registry) *Metrics {
m.ColdEventsWritten, m.ColdEventsDropped, m.ColdBatchLatency,
m.DeployUpsertsTotal, m.DeployUpsertErrors,
m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
+ m.IncidentOpened, m.IncidentUpdated, m.IncidentRecovered, m.IncidentResolved,
+ m.IncidentTickLatency, m.IncidentActive, m.IncidentClassifications,
m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,
From 3e1be99e1f9a74bf835038eefa2dec719457de30 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Tue, 5 May 2026 03:12:27 -0400
Subject: [PATCH 03/14] feat: added incident CLI and dashboard surfaces
Expose the incident engine through the operator CLI and embedded
dashboard.
Promote incident HTTP response DTOs into pkg/api/v2 so server handlers,
CLI clients, OpenAPI, and dashboard consumers share one public contract.
Update the internal incidents handler to convert internal engine incidents
into the shared API DTOs.
Added CLI commands:
- waylog incidents [--json]
- waylog incident <incident_id> [--json] [--snapshot]
The new commands reuse the existing v2-read capability gate, read auth,
path escaping, JSON rendering, and error handling. Snapshot mode supports
plain text by default and JSON when --json is supplied.
Added incident client methods and human renderers for active incident tables,
incident detail, evidence, next checks, instrumentation warnings, and sample
traces.
Add an active-incident strip to the dashboard and a #/incident/<incident_id> detail
screen. The dashboard fetches /v1/incidents/active during the normal polling
cycle, renders incident cards above the existing errors panel, and links sample
traces into the existing explain screen. If the incident API is unavailable
(404/503), the dashboard shows an empty incident strip instead of failing the
main v2 triage UI.
Tests cover CLI routing/rendering, snapshot text and JSON behavior, dashboard
static references, and handler DTO shape.
Verification:
- go test ./pkg/api/v2 ./internal/incidents ./internal/cli/v2 ./internal/dashboard
- go test ./...
- go vet ./...
- bash scripts/check-doc-links.sh
- git diff --check
---
internal/cli/v2/client.go | 35 +++++++
internal/cli/v2/cmd.go | 66 ++++++++++++++
internal/cli/v2/cmd_test.go | 102 +++++++++++++++++++++
internal/cli/v2/render.go | 96 ++++++++++++++++++++
internal/cli/v2/render_test.go | 48 ++++++++++
internal/cli/v2/types.go | 5 +
internal/dashboard/static/index.html | 131 ++++++++++++++++++++++++---
internal/dashboard/static_test.go | 7 ++
internal/incidents/handler.go | 63 ++++++++++++-
internal/incidents/handler_test.go | 4 +-
internal/incidents/types.go | 13 ---
pkg/api/v2/types.go | 52 +++++++++++
12 files changed, 591 insertions(+), 31 deletions(-)
diff --git a/internal/cli/v2/client.go b/internal/cli/v2/client.go
index 0ce7100..5430945 100644
--- a/internal/cli/v2/client.go
+++ b/internal/cli/v2/client.go
@@ -145,6 +145,30 @@ func (c *Client) Blast(ctx context.Context, p BlastParams) (BlastRadiusResponse,
return out, err
}
+// Incidents fetches the active incident list from /v1/incidents/active.
+func (c *Client) Incidents(ctx context.Context) (IncidentListResponse, error) {
+	var list IncidentListResponse
+	if err := c.do(ctx, "/v1/incidents/active", nil, &list); err != nil {
+		return list, err
+	}
+	return list, nil
+}
+
+// Incident fetches one incident by id. The id is path-escaped before it is
+// placed in the URL.
+func (c *Client) Incident(ctx context.Context, incidentID string) (IncidentDetailResponse, error) {
+	var detail IncidentDetailResponse
+	endpoint := "/v1/incidents/" + url.PathEscape(incidentID)
+	if err := c.do(ctx, endpoint, nil, &detail); err != nil {
+		return detail, err
+	}
+	return detail, nil
+}
+
+// IncidentSnapshotText fetches the snapshot of an incident, asking for
+// text/plain, and returns the response body verbatim.
+func (c *Client) IncidentSnapshotText(ctx context.Context, incidentID string) (string, error) {
+	var text string
+	endpoint := "/v1/incidents/" + url.PathEscape(incidentID) + "/snapshot"
+	if err := c.doRaw(ctx, endpoint, nil, "text/plain", &text); err != nil {
+		return text, err
+	}
+	return text, nil
+}
+
+// IncidentSnapshotJSON fetches the snapshot of an incident, asking for
+// application/json, and decodes it into an IncidentSnapshotResponse.
+func (c *Client) IncidentSnapshotJSON(ctx context.Context, incidentID string) (IncidentSnapshotResponse, error) {
+	var snap IncidentSnapshotResponse
+	endpoint := "/v1/incidents/" + url.PathEscape(incidentID) + "/snapshot"
+	if err := c.doRaw(ctx, endpoint, nil, "application/json", &snap); err != nil {
+		return snap, err
+	}
+	return snap, nil
+}
+
func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchResponse, error) {
q := url.Values{}
addQuery(q, "error_code", p.ErrorCode)
@@ -160,6 +184,10 @@ func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchRespons
}
+// do performs a request via doRaw with an "application/json" Accept value.
func (c *Client) do(ctx context.Context, path string, q url.Values, out any) error {
+ return c.doRaw(ctx, path, q, "application/json", out)
+}
+
+func (c *Client) doRaw(ctx context.Context, path string, q url.Values, accept string, out any) error {
u, err := url.Parse(c.base + path)
if err != nil {
return &TransportError{Err: err}
@@ -174,6 +202,9 @@ func (c *Client) do(ctx context.Context, path string, q url.Values, out any) err
if c.apiKey != "" {
req.Header.Set("Authorization", "Bearer "+c.apiKey)
}
+ if accept != "" {
+ req.Header.Set("Accept", accept)
+ }
resp, err := c.http.Do(req)
if err != nil {
return &TransportError{Err: err}
@@ -189,6 +220,10 @@ func (c *Client) do(ctx context.Context, path string, q url.Values, out any) err
if out == nil || len(strings.TrimSpace(string(body))) == 0 {
return nil
}
+ if text, ok := out.(*string); ok {
+ *text = string(body)
+ return nil
+ }
if err := json.Unmarshal(body, out); err != nil {
return &TransportError{Err: fmt.Errorf("decode response: %w", err)}
}
diff --git a/internal/cli/v2/cmd.go b/internal/cli/v2/cmd.go
index f6617f1..87c3fae 100644
--- a/internal/cli/v2/cmd.go
+++ b/internal/cli/v2/cmd.go
@@ -49,6 +49,10 @@ func RunCLI(args []string, _ io.Reader, stdout, stderr io.Writer) int {
return runCapabilities(ctx, client, cfg, rest[1:], stdout, stderr)
case "recent":
return runRecent(ctx, client, cfg, rest[1:], stdout, stderr)
+ case "incidents":
+ return runIncidents(ctx, client, cfg, rest[1:], stdout, stderr)
+ case "incident":
+ return runIncident(ctx, client, cfg, rest[1:], stdout, stderr)
case "errors":
return runErrors(ctx, client, cfg, rest[1:], stdout, stderr)
case "event":
@@ -182,6 +186,64 @@ func runRecent(ctx context.Context, client *Client, cfg cliConfig, args []string
return renderOrError(stdout, stderr, cfg.json, resp, err, RenderRecent)
}
+// runIncidents implements "waylog incidents": it takes no positional
+// arguments, requires the v2-read capability gate to pass, then fetches and
+// renders the active incident list. It returns the process exit code.
+func runIncidents(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+	if len(args) > 0 {
+		return usage(stderr, "usage: waylog incidents [--json]")
+	}
+	gate := requireV2Reads(ctx, client, stderr)
+	if gate != 0 {
+		return gate
+	}
+	list, err := client.Incidents(ctx)
+	return renderOrError(stdout, stderr, cfg.json, list, err, RenderIncidents)
+}
+
+// runIncident implements "waylog incident". Without --snapshot it renders the
+// incident detail. With --snapshot it prints the plain-text snapshot, or the
+// JSON snapshot when JSON output is enabled. It returns the process exit code.
+func runIncident(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+	incidentID, snapshot, parseErr := parseIncidentArgs(args)
+	if parseErr != nil {
+		return usage(stderr, parseErr.Error())
+	}
+	if gate := requireV2Reads(ctx, client, stderr); gate != 0 {
+		return gate
+	}
+	if !snapshot {
+		detail, err := client.Incident(ctx, incidentID)
+		return renderOrError(stdout, stderr, cfg.json, detail, err, RenderIncident)
+	}
+	if cfg.json {
+		snap, err := client.IncidentSnapshotJSON(ctx, incidentID)
+		return renderOrError(stdout, stderr, true, snap, err, RenderIncidentSnapshot)
+	}
+	text, err := client.IncidentSnapshotText(ctx, incidentID)
+	if err != nil {
+		fmt.Fprintln(stderr, err)
+		return exitCodeForError(err)
+	}
+	// The server's text is printed as-is, without adding a trailing newline.
+	fmt.Fprint(stdout, text)
+	return 0
+}
+
+func parseIncidentArgs(args []string) (string, bool, error) {
+ incidentID := ""
+ snapshot := false
+ for _, arg := range args {
+ switch {
+ case arg == "--snapshot":
+ snapshot = true
+ case strings.HasPrefix(arg, "-"):
+ return "", false, fmt.Errorf("unknown flag: %s", arg)
+ default:
+ if incidentID != "" {
+ return "", false, errors.New("usage: waylog incident [--snapshot] [--json]")
+ }
+ incidentID = arg
+ }
+ }
+ if incidentID == "" {
+ return "", false, errors.New("usage: waylog incident [--snapshot] [--json]")
+ }
+ return incidentID, snapshot, nil
+}
+
func runEvent(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
if len(args) != 1 {
return usage(stderr, "usage: waylog event [--json]")
@@ -423,6 +485,8 @@ func printUsage(w io.Writer) {
fmt.Fprintln(w, `Usage:
waylog capabilities [--json]
waylog recent [--window ] [--service ] [--status ] [--limit ] [--cursor ] [--include-suppressed] [--json]
+ waylog incidents [--json]
+ waylog incident [--snapshot] [--json]
waylog errors [--window ] [--service ] [--limit ] [--cursor ] [--json]
waylog event [--json]
waylog trace [--json]
@@ -431,6 +495,8 @@ func printUsage(w io.Writer) {
waylog search [--service ] [--status ] [--window ] [--limit ] [--cursor ] [--json]
Recommended loop:
+ waylog incidents
+ waylog incident
waylog recent
waylog errors --window 15m
waylog blast checkout:payment.charge:PMT_502 --window 15m
diff --git a/internal/cli/v2/cmd_test.go b/internal/cli/v2/cmd_test.go
index 3f038f8..0cf256f 100644
--- a/internal/cli/v2/cmd_test.go
+++ b/internal/cli/v2/cmd_test.go
@@ -76,6 +76,108 @@ func TestRunCLIRecentSerializesFilters(t *testing.T) {
}
}
+func TestRunCLIIncidentsListsActive(t *testing.T) {
+ var gotPath string
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path == "/v1/capabilities" {
+ _, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+ return
+ }
+ gotPath = r.URL.Path
+ _, _ = w.Write([]byte(`{"incidents":[{"incident_id":"inc_1234567890abcdef","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}]}`))
+ }))
+ defer srv.Close()
+
+ var stdout, stderr bytes.Buffer
+ code := RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+ if code != 0 {
+ t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+ }
+ if gotPath != "/v1/incidents/active" {
+ t.Fatalf("path=%q", gotPath)
+ }
+ for _, want := range []string{"INCIDENT", "dependency", "checkout:payment.charge:PMT_502"} {
+ if !strings.Contains(stdout.String(), want) {
+ t.Fatalf("stdout missing %q:\n%s", want, stdout.String())
+ }
+ }
+}
+
+func TestRunCLIIncidentsEmptyAndRequiresV2Reads(t *testing.T) {
+ calls := 0
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ calls++
+ _, _ = w.Write([]byte(`{"v2_reads":{"enabled":false}}`))
+ }))
+ defer srv.Close()
+
+ var stdout, stderr bytes.Buffer
+ code := RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+ if code != 3 || calls != 1 || !strings.Contains(stderr.String(), "WAYLOG_V2_READS=true") {
+ t.Fatalf("code=%d calls=%d stdout=%q stderr=%q", code, calls, stdout.String(), stderr.String())
+ }
+
+ srv.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path == "/v1/capabilities" {
+ _, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+ return
+ }
+ _, _ = w.Write([]byte(`{"incidents":[]}`))
+ })
+ stdout.Reset()
+ stderr.Reset()
+ code = RunCLI([]string{"--addr", srv.URL, "incidents"}, nil, &stdout, &stderr)
+ if code != 0 || !strings.Contains(stdout.String(), "No active incidents.") {
+ t.Fatalf("code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+ }
+}
+
+func TestRunCLIIncidentDetailAndSnapshot(t *testing.T) {
+ calls := []string{}
+ accepts := []string{}
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path == "/v1/capabilities" {
+ _, _ = w.Write([]byte(`{"v2_reads":{"enabled":true}}`))
+ return
+ }
+ calls = append(calls, r.URL.String())
+ accepts = append(accepts, r.Header.Get("Accept"))
+ switch {
+ case strings.HasSuffix(r.URL.Path, "/snapshot") && r.Header.Get("Accept") == "application/json":
+ _, _ = w.Write([]byte(`{"snapshot":"Incident inc/1\n","incident":{"incident_id":"inc/1","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}}`))
+ case strings.HasSuffix(r.URL.Path, "/snapshot"):
+ w.Header().Set("Content-Type", "text/plain")
+ _, _ = w.Write([]byte("Incident inc/1\n"))
+ default:
+ _, _ = w.Write([]byte(`{"incident":{"incident_id":"inc/1","env":"prod","service":"checkout","error_family":{"service":"checkout","step":"payment.charge","error_code":"PMT_502"},"status":"active","cause":"dependency","confidence":"medium","severity":8,"started_at":"2026-05-04T12:00:00Z","updated_at":"2026-05-04T12:01:00Z","last_seen_at":"2026-05-04T12:01:00Z","affected_requests":12,"affected_services":3,"top_services":["checkout","payment"],"sample_traces":["trace-a"],"evidence":[{"kind":"trace","title":"sample","trace_id":"trace-a","occurred_at":"2026-05-04T12:00:00Z"}],"next_checks":["check payment"],"lift":6,"baseline_count":2,"current_count":12}}`))
+ }
+ }))
+ defer srv.Close()
+
+ var stdout, stderr bytes.Buffer
+ code := RunCLI([]string{"--addr", srv.URL, "incident", "inc/1"}, nil, &stdout, &stderr)
+ if code != 0 {
+ t.Fatalf("detail code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+ }
+ if calls[0] != "/v1/incidents/inc%2F1" || !strings.Contains(stdout.String(), "incident_id: inc/1") {
+ t.Fatalf("calls=%v stdout=%q", calls, stdout.String())
+ }
+
+ stdout.Reset()
+ stderr.Reset()
+ code = RunCLI([]string{"--addr", srv.URL, "incident", "inc/1", "--snapshot"}, nil, &stdout, &stderr)
+ if code != 0 || stdout.String() != "Incident inc/1\n" {
+ t.Fatalf("snapshot code=%d stdout=%q stderr=%q", code, stdout.String(), stderr.String())
+ }
+
+ stdout.Reset()
+ stderr.Reset()
+ code = RunCLI([]string{"--addr", srv.URL, "--json", "incident", "inc/1", "--snapshot"}, nil, &stdout, &stderr)
+ if code != 0 || !strings.Contains(stdout.String(), `"snapshot"`) || accepts[len(accepts)-1] != "application/json" {
+ t.Fatalf("json snapshot code=%d accepts=%v stdout=%q stderr=%q", code, accepts, stdout.String(), stderr.String())
+ }
+}
+
func TestRunCLIEventEscapesIDAndRequiresV2Reads(t *testing.T) {
calls := []string{}
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index e4051cc..bf9b574 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -68,6 +68,102 @@ func RenderRecent(w io.Writer, resp RecentTracesResponse) {
renderNextCursor(w, resp.NextCursor)
}
+func RenderIncidents(w io.Writer, resp IncidentListResponse) {
+ if len(resp.Incidents) == 0 {
+ fmt.Fprintln(w, "No active incidents.")
+ return
+ }
+ tw := tabwriter.NewWriter(w, 0, 4, 2, ' ', 0)
+ fmt.Fprintln(tw, "INCIDENT\tSTATUS\tCAUSE\tCONF\tSEVERITY\tFAMILY\tAFFECTED\tSTARTED")
+ for _, inc := range resp.Incidents {
+ fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%d\t%s\t%d req / %d svc\t%s\n",
+ truncateID(inc.IncidentID),
+ inc.Status,
+ inc.Cause,
+ inc.Confidence,
+ inc.Severity,
+ apiv2.FormatErrorFamily(inc.ErrorFamily),
+ inc.AffectedRequests,
+ inc.AffectedServices,
+ formatTime(inc.StartedAt),
+ )
+ }
+ _ = tw.Flush()
+}
+
+func RenderIncident(w io.Writer, resp IncidentDetailResponse) {
+ renderIncidentBody(w, resp.Incident)
+}
+
+func RenderIncidentSnapshot(w io.Writer, resp IncidentSnapshotResponse) {
+	if resp.Snapshot != "" {
+		fmt.Fprintln(w, strings.TrimSuffix(resp.Snapshot, "\n")) // snapshot text may already end in "\n"; emit exactly one
+	}
+}
+
+func renderIncidentBody(w io.Writer, inc Incident) {
+ fmt.Fprintf(w, "incident_id: %s\n", inc.IncidentID)
+ fmt.Fprintf(w, "status: %s\n", inc.Status)
+ fmt.Fprintf(w, "family: %s\n", apiv2.FormatErrorFamily(inc.ErrorFamily))
+ fmt.Fprintf(w, "cause: %s (%s confidence)\n", inc.Cause, inc.Confidence)
+ fmt.Fprintf(w, "severity: %d\n", inc.Severity)
+ fmt.Fprintf(w, "started_at: %s\n", formatTime(inc.StartedAt))
+ fmt.Fprintf(w, "updated_at: %s\n", formatTime(inc.UpdatedAt))
+ if inc.ResolvedAt != nil {
+ fmt.Fprintf(w, "resolved_at: %s\n", formatTime(*inc.ResolvedAt))
+ }
+ fmt.Fprintf(w, "affected_requests: %d\n", inc.AffectedRequests)
+ if inc.AffectedUsers == nil {
+ fmt.Fprintln(w, "affected_users: null")
+ } else {
+ fmt.Fprintf(w, "affected_users: %d\n", *inc.AffectedUsers)
+ }
+ fmt.Fprintf(w, "affected_services: %d\n", inc.AffectedServices)
+ fmt.Fprintf(w, "top_services: %s\n", strings.Join(inc.TopServices, ","))
+ fmt.Fprintf(w, "lift: %.2f\n", inc.Lift)
+ fmt.Fprintf(w, "baseline_count: %d\n", inc.BaselineCount)
+ fmt.Fprintf(w, "current_count: %d\n", inc.CurrentCount)
+
+ fmt.Fprintln(w, "\nevidence:")
+ if len(inc.Evidence) == 0 {
+ fmt.Fprintln(w, " none")
+ } else {
+ for _, ev := range inc.Evidence {
+ detail := ev.Detail
+ if detail == "" {
+ detail = ev.Service
+ }
+ fmt.Fprintf(w, " - %s: %s", ev.Kind, ev.Title)
+ if detail != "" {
+ fmt.Fprintf(w, " (%s)", detail)
+ }
+ if ev.TraceID != "" {
+ fmt.Fprintf(w, " trace=%s", truncateID(ev.TraceID))
+ }
+ fmt.Fprintln(w)
+ }
+ }
+
+ fmt.Fprintln(w, "\nnext_checks:")
+ if len(inc.NextChecks) == 0 {
+ fmt.Fprintln(w, " none")
+ } else {
+ for _, check := range inc.NextChecks {
+ fmt.Fprintf(w, " - %s\n", check)
+ }
+ }
+
+ if len(inc.InstrumentationWarnings) > 0 {
+ fmt.Fprintln(w, "\ninstrumentation_warnings:")
+ for _, warning := range inc.InstrumentationWarnings {
+ fmt.Fprintf(w, " - %s\n", warning)
+ }
+ }
+ if len(inc.SampleTraces) > 0 {
+ fmt.Fprintf(w, "\nsample_traces: %s\n", truncateList(inc.SampleTraces))
+ }
+}
+
func RenderEvent(w io.Writer, ev *Event) {
if ev == nil {
fmt.Fprintln(w, "No event found.")
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 598c54c..0d4a755 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -84,6 +84,54 @@ func TestRenderEventPrintsSummaryCounts(t *testing.T) {
}
}
+func TestRenderIncidentsAndDetail(t *testing.T) {
+ start := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+ inc := Incident{
+ IncidentID: "inc_1234567890abcdef",
+ Env: "prod",
+ Service: "checkout",
+ ErrorFamily: ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+ Status: "active",
+ Cause: "dependency",
+ Confidence: "medium",
+ Severity: 8,
+ StartedAt: start,
+ UpdatedAt: start.Add(time.Minute),
+ LastSeenAt: start.Add(time.Minute),
+ AffectedRequests: 12,
+ AffectedServices: 3,
+ TopServices: []string{"checkout", "payment"},
+ SampleTraces: []string{"trace-1234567890"},
+ Evidence: []IncidentEvidence{{Kind: "trace", Title: "First failing trace sample", Detail: "payment.charge/PMT_502", TraceID: "trace-1234567890", OccurredAt: start}},
+ NextChecks: []string{"Check payment health."},
+ Lift: 6,
+ BaselineCount: 2,
+ CurrentCount: 12,
+ }
+
+ var out bytes.Buffer
+ RenderIncidents(&out, IncidentListResponse{Incidents: []Incident{inc}})
+ for _, want := range []string{"INCIDENT", "dependency", "medium", "checkout:payment.charge:PMT_502", "12 req / 3 svc"} {
+ if !strings.Contains(out.String(), want) {
+ t.Fatalf("list output missing %q:\n%s", want, out.String())
+ }
+ }
+
+ out.Reset()
+ RenderIncident(&out, IncidentDetailResponse{Incident: inc})
+ for _, want := range []string{"incident_id: inc_1234567890abcdef", "cause: dependency (medium confidence)", "evidence:", "next_checks:", "sample_traces:"} {
+ if !strings.Contains(out.String(), want) {
+ t.Fatalf("detail output missing %q:\n%s", want, out.String())
+ }
+ }
+
+ out.Reset()
+ RenderIncidents(&out, IncidentListResponse{})
+ if !strings.Contains(out.String(), "No active incidents.") {
+ t.Fatalf("empty output=%q", out.String())
+ }
+}
+
func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
var out bytes.Buffer
resp := CapabilitiesResponse{}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 6672d99..1a940de 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -31,6 +31,11 @@ type ErrorRow = apiv2.ErrorRow
type ErrorsResponse = apiv2.ErrorsResponse
type BlastKey = apiv2.BlastKey
type BlastRadiusResponse = apiv2.BlastRadiusResponse
+type Incident = apiv2.Incident
+type IncidentEvidence = apiv2.IncidentEvidence
+type IncidentListResponse = apiv2.IncidentListResponse
+type IncidentDetailResponse = apiv2.IncidentDetailResponse
+type IncidentSnapshotResponse = apiv2.IncidentSnapshotResponse
type eventGetResponse struct {
Event *Event `json:"event"`
diff --git a/internal/dashboard/static/index.html b/internal/dashboard/static/index.html
index 6ee9ac6..db935b6 100644
--- a/internal/dashboard/static/index.html
+++ b/internal/dashboard/static/index.html
@@ -580,6 +580,36 @@
}
.empty strong { color: var(--text); display: block; margin-bottom: 6px; }
.error-box { color: var(--danger); border-color: var(--danger-soft); background: var(--danger-soft); }
+ .incident-strip {
+ display: grid;
+ gap: 10px;
+ margin-bottom: 14px;
+ }
+ .incident-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
+ gap: 8px;
+ }
+ .incident-card {
+ display: grid;
+ gap: 8px;
+ min-height: 44px;
+ border: 1px solid var(--line);
+ border-radius: var(--radius-md);
+ background: linear-gradient(135deg, var(--item-bg), transparent);
+ padding: 12px;
+ transition: border-color 0.15s ease, background 0.15s ease;
+ }
+ .incident-card:hover { border-color: var(--line-strong); background: var(--row-hover); }
+ .incident-title { display: flex; justify-content: space-between; gap: 8px; align-items: center; }
+ .incident-detail { display: grid; gap: 12px; }
+ .evidence-list, .check-list { display: grid; gap: 8px; margin: 0; padding: 0; list-style: none; }
+ .evidence-list li, .check-list li {
+ border: 1px solid var(--line);
+ border-radius: var(--radius-md);
+ background: var(--item-bg);
+ padding: 10px 12px;
+ }
.disabled {
display: grid;
place-items: center;
@@ -619,6 +649,7 @@
capabilities: null,
errors: null,
recent: null,
+ incidents: null,
errorTrend: [],
latencyTrend: [],
timers: [],
@@ -736,13 +767,23 @@
async function loadCapabilities() {
state.capabilities = await fetchJSON("/v1/capabilities");
}
+ async function loadIncidents() {
+ try {
+ return await fetchJSON("/v1/incidents/active");
+ } catch (err) {
+ if (err.status === 404 || err.status === 503) return { incidents: [] };
+ throw err;
+ }
+ }
async function loadDashboardData() {
- const [errors, recent] = await Promise.all([
+ const [errors, recent, incidents] = await Promise.all([
fetchJSON("/v1/errors?" + params({ window: state.window, limit: 50 })),
fetchJSON("/v1/traces/recent?" + params({ window: state.window, limit: 50, include_suppressed: true })),
+ loadIncidents(),
]);
state.errors = errors;
state.recent = recent;
+ state.incidents = incidents;
const errorEvents = (errors.rows || []).reduce((sum, row) => sum + Number(row.count || 0), 0);
const durations = (recent.traces || []).map(t => Number(t.duration_ms || 0));
pushTrend(state.errorTrend, errorEvents);
@@ -791,7 +832,10 @@
${tab("#/blast/" + encodeURIComponent(firstFamily()), "Blast", route.screen === "blast", !firstFamily())}
-
+
+ ${renderIncidentStrip()}
+ ${content}
+
`;
@@ -811,6 +855,27 @@
return state.recent?.traces?.[0]?.trace_id || state.errors?.rows?.[0]?.sample_traces?.[0] || "";
}
+ function renderIncidentStrip() {
+ const incidents = state.incidents?.incidents || [];
+ if (!incidents.length) {
+ return `
+ Active incidents
No active incidents.
+ `;
+ }
+ return `
+ Active incidents
${nf.format(incidents.length)} incident${incidents.length === 1 ? "" : "s"} detected
+
+ `;
+ }
+
function renderRecent() {
return `
Recent requests
Polls every 5s
@@ -927,6 +992,51 @@ First failing step
${storyCard("Sample traces", `${(blast.sample_traces || []).map(id => `
${esc(shortID(id))}`).join("") || "
No sample traces.
"}
`)}
`);
}
+
+ async function renderIncident(id) {
+ if (!id) return shell(`Choose an incident to inspect.
`);
+ const resp = await fetchJSON("/v1/incidents/" + encodeURIComponent(id));
+ const incident = resp.incident || {};
+ return shell(`
+
+
incident summary
+
${esc(formatFamily(incident.error_family))}
+
${esc(incident.cause || "unknown")} · ${esc(incident.confidence || "low")} confidence
+
${esc(incident.status || "unknown")} · severity ${nf.format(incident.severity || 0)} · started ${esc(ago(incident.started_at))}
+
+
+ ${impact("Affected requests", incident.affected_requests || 0)}
+ ${impact("Affected users", incident.affected_users == null ? "unknown" : incident.affected_users)}
+ ${impact("Affected services", incident.affected_services || 0)}
+ ${impact("Lift", Number(incident.lift || 0).toFixed(2) + "x")}
+
+
+ ${storyCard("Evidence", renderEvidence(incident.evidence || []))}
+ ${storyCard("Next checks", renderChecks(incident.next_checks || []))}
+ ${storyCard("Sample traces", renderIncidentSamples(incident.sample_traces || []))}
+ ${storyCard("Instrumentation warnings", renderWarnings(incident.instrumentation_warnings || []))}
+
+
`);
+ }
+ function renderEvidence(items) {
+ if (!items.length) return `No incident evidence attached.
`;
+ return ``;
+ }
+ function renderChecks(items) {
+ if (!items.length) return `No next checks generated.
`;
+ return `${items.map(check => `- ${esc(check)}
`).join("")}
`;
+ }
+ function renderIncidentSamples(ids) {
+ if (!ids.length) return `No sample traces attached.
`;
+ return ``;
+ }
+ function renderWarnings(items) {
+ if (!items.length) return `No instrumentation warnings.
`;
+ return `${items.map(warning => `- ${esc(warning)}
`).join("")}
`;
+ }
function impact(label, value) {
return `${esc(label)}
${esc(typeof value === "number" ? nf.format(value) : value)}
`;
}
@@ -943,6 +1053,8 @@ First failing step
document.getElementById("app").innerHTML = await renderExplain(route.id);
} else if (route.screen === "blast") {
document.getElementById("app").innerHTML = await renderBlast(route.id);
+ } else if (route.screen === "incident") {
+ document.getElementById("app").innerHTML = await renderIncident(route.id);
} else {
document.getElementById("app").innerHTML = shell(renderErrors());
}
@@ -985,18 +1097,9 @@ This dashboard requires WAYLOG_V2_READS=true.
const recent = document.getElementById("recent-list");
const scrollTop = recent ? recent.scrollTop : 0;
await loadDashboardData();
- if (parseHash().screen === "errors") {
- await renderCurrentScreen();
- const nextRecent = document.getElementById("recent-list");
- if (nextRecent) nextRecent.scrollTop = scrollTop;
- } else {
- // /explain and /blast own their own DOM; just refresh the recent panel in place.
- const list = document.getElementById("recent-list");
- if (list) {
- list.innerHTML = recentItemsHTML();
- list.scrollTop = scrollTop;
- }
- }
+ await renderCurrentScreen();
+ const nextRecent = document.getElementById("recent-list");
+ if (nextRecent) nextRecent.scrollTop = scrollTop;
}
function reportPollError(err) {
const live = document.getElementById("live-region");
diff --git a/internal/dashboard/static_test.go b/internal/dashboard/static_test.go
index d287abc..ce2587f 100644
--- a/internal/dashboard/static_test.go
+++ b/internal/dashboard/static_test.go
@@ -43,6 +43,13 @@ func TestStaticDashboardHTML(t *testing.T) {
"#/errors",
"#/explain",
"#/blast",
+ "#/incident",
+ "/v1/incidents/active",
+ "Active incidents",
+ "No active incidents.",
+ "Next checks",
+ "Instrumentation warnings",
+ "sample_traces",
"renderSparkline",
"This dashboard requires WAYLOG_V2_READS=true",
"first observable failing step",
diff --git a/internal/incidents/handler.go b/internal/incidents/handler.go
index 08d57ee..d6058c2 100644
--- a/internal/incidents/handler.go
+++ b/internal/incidents/handler.go
@@ -5,6 +5,8 @@ import (
"errors"
"net/http"
"strings"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
)
type Handler struct {
@@ -25,7 +27,7 @@ func (h *Handler) Active(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusInternalServerError, "internal_error", "query incidents failed", err.Error())
return
}
- writeJSON(w, http.StatusOK, ActiveResponse{Incidents: rows})
+ writeJSON(w, http.StatusOK, apiv2.IncidentListResponse{Incidents: toAPIIncidents(rows)})
}
func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
@@ -52,7 +54,7 @@ func (h *Handler) Incident(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusInternalServerError, "internal_error", "query incident failed", err.Error())
return
}
- writeJSON(w, http.StatusOK, DetailResponse{Incident: inc})
+ writeJSON(w, http.StatusOK, apiv2.IncidentDetailResponse{Incident: toAPIIncident(inc)})
}
func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
@@ -67,7 +69,7 @@ func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
}
snapshot := RenderSnapshot(inc)
if strings.Contains(r.Header.Get("Accept"), "application/json") {
- writeJSON(w, http.StatusOK, SnapshotResponse{Snapshot: snapshot, Incident: inc})
+ writeJSON(w, http.StatusOK, apiv2.IncidentSnapshotResponse{Snapshot: snapshot, Incident: toAPIIncident(inc)})
return
}
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
@@ -75,6 +77,61 @@ func (h *Handler) snapshot(w http.ResponseWriter, r *http.Request, id string) {
_, _ = w.Write([]byte(snapshot))
}
+func toAPIIncidents(rows []Incident) []apiv2.Incident {
+ out := make([]apiv2.Incident, 0, len(rows))
+ for _, inc := range rows {
+ out = append(out, toAPIIncident(inc))
+ }
+ return out
+}
+
+func toAPIIncident(inc Incident) apiv2.Incident {
+ return apiv2.Incident{
+ IncidentID: inc.IncidentID,
+ Env: inc.Env,
+ Service: inc.Service,
+ ErrorFamily: inc.ErrorFamily,
+ Status: string(inc.Status),
+ Cause: string(inc.Cause),
+ Confidence: string(inc.Confidence),
+ Severity: inc.Severity,
+ StartedAt: inc.StartedAt,
+ UpdatedAt: inc.UpdatedAt,
+ LastSeenAt: inc.LastSeenAt,
+ RecoveringAt: inc.RecoveringAt,
+ ResolvedAt: inc.ResolvedAt,
+ AffectedRequests: inc.AffectedRequests,
+ AffectedUsers: inc.AffectedUsers,
+ AffectedServices: inc.AffectedServices,
+ TopServices: inc.TopServices,
+ SampleTraces: inc.SampleTraces,
+ Evidence: toAPIEvidence(inc.Evidence),
+ NextChecks: inc.NextChecks,
+ InstrumentationWarnings: inc.InstrumentationWarnings,
+ Lift: inc.Lift,
+ BaselineCount: inc.BaselineCount,
+ CurrentCount: inc.CurrentCount,
+ }
+}
+
+func toAPIEvidence(rows []Evidence) []apiv2.IncidentEvidence {
+ out := make([]apiv2.IncidentEvidence, 0, len(rows))
+ for _, ev := range rows {
+ out = append(out, apiv2.IncidentEvidence{
+ Kind: string(ev.Kind),
+ Title: ev.Title,
+ Detail: ev.Detail,
+ Service: ev.Service,
+ SignalID: ev.SignalID,
+ DeployID: ev.DeployID,
+ TraceID: ev.TraceID,
+ OccurredAt: ev.OccurredAt,
+ Fields: ev.Fields,
+ })
+ }
+ return out
+}
+
func writeJSON(w http.ResponseWriter, status int, v any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
diff --git a/internal/incidents/handler_test.go b/internal/incidents/handler_test.go
index 2be9a80..670c19c 100644
--- a/internal/incidents/handler_test.go
+++ b/internal/incidents/handler_test.go
@@ -8,6 +8,8 @@ import (
"strings"
"testing"
"time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
)
func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
@@ -28,7 +30,7 @@ func TestHandlerActiveDetailAndSnapshot(t *testing.T) {
if rec.Code != http.StatusOK {
t.Fatalf("active status=%d body=%s", rec.Code, rec.Body.String())
}
- var active ActiveResponse
+ var active apiv2.IncidentListResponse
if err := json.Unmarshal(rec.Body.Bytes(), &active); err != nil {
t.Fatal(err)
}
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
index f1d8bdb..cf3b59a 100644
--- a/internal/incidents/types.go
+++ b/internal/incidents/types.go
@@ -79,19 +79,6 @@ type Incident struct {
CurrentCount int `json:"current_count"`
}
-type ActiveResponse struct {
- Incidents []Incident `json:"incidents"`
-}
-
-type DetailResponse struct {
- Incident Incident `json:"incident"`
-}
-
-type SnapshotResponse struct {
- Snapshot string `json:"snapshot"`
- Incident Incident `json:"incident"`
-}
-
type Deployment struct {
ID string
Service string
diff --git a/pkg/api/v2/types.go b/pkg/api/v2/types.go
index 9fd4a79..e14af86 100644
--- a/pkg/api/v2/types.go
+++ b/pkg/api/v2/types.go
@@ -119,6 +119,58 @@ type BlastRadiusResponse struct {
SampleTraces []string `json:"sample_traces"`
}
+type IncidentEvidence struct {
+ Kind string `json:"kind"`
+ Title string `json:"title"`
+ Detail string `json:"detail,omitempty"`
+ Service string `json:"service,omitempty"`
+ SignalID string `json:"signal_id,omitempty"`
+ DeployID string `json:"deployment_id,omitempty"`
+ TraceID string `json:"trace_id,omitempty"`
+ OccurredAt time.Time `json:"occurred_at"`
+ Fields map[string]any `json:"fields,omitempty"`
+}
+
+type Incident struct {
+ IncidentID string `json:"incident_id"`
+ Env string `json:"env"`
+ Service string `json:"service"`
+ ErrorFamily ErrorFamily `json:"error_family"`
+ Status string `json:"status"`
+ Cause string `json:"cause"`
+ Confidence string `json:"confidence"`
+ Severity int `json:"severity"`
+ StartedAt time.Time `json:"started_at"`
+ UpdatedAt time.Time `json:"updated_at"`
+ LastSeenAt time.Time `json:"last_seen_at"`
+ RecoveringAt *time.Time `json:"recovering_at,omitempty"`
+ ResolvedAt *time.Time `json:"resolved_at,omitempty"`
+ AffectedRequests int `json:"affected_requests"`
+ AffectedUsers *int `json:"affected_users,omitempty"`
+ AffectedServices int `json:"affected_services"`
+ TopServices []string `json:"top_services"`
+ SampleTraces []string `json:"sample_traces"`
+ Evidence []IncidentEvidence `json:"evidence"`
+ NextChecks []string `json:"next_checks"`
+ InstrumentationWarnings []string `json:"instrumentation_warnings,omitempty"`
+ Lift float64 `json:"lift"`
+ BaselineCount int `json:"baseline_count"`
+ CurrentCount int `json:"current_count"`
+}
+
+type IncidentListResponse struct {
+ Incidents []Incident `json:"incidents"`
+}
+
+type IncidentDetailResponse struct {
+ Incident Incident `json:"incident"`
+}
+
+type IncidentSnapshotResponse struct {
+ Snapshot string `json:"snapshot"`
+ Incident Incident `json:"incident"`
+}
+
func FormatErrorFamily(f ErrorFamily) string {
return escapeErrorFamilyPart(f.Service) + ":" + escapeErrorFamilyPart(f.Step) + ":" + escapeErrorFamilyPart(f.ErrorCode)
}
From b8e9636d9e004a9ea60e76e6d7de1fe26ee095d2 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Tue, 5 May 2026 03:35:19 -0400
Subject: [PATCH 04/14] feat: auto-post demo signals for incident acceptance
Make the demo produce the full production-triage path from a single
"Run traffic burst" action.
Add a demo signal poster to the api-gateway burst path. The burst now posts
a checkout deploy signal and a payment dependency signal to /v1/signals using
INGEST_URL and WAYLOG_WRITE_KEY, then runs traffic as before. Signal failures
are reported in the burst summary but do not block traffic, so no-SQLite and
micro-demo style setups remain usable.
Seed each burst with up to six payment_502 requests before falling back to the
existing weighted traffic mix. This keeps the burst bounded and user-triggered
while making incident creation deterministic enough for demo acceptance.
Fix incident signal enrichment by querying signals across the incident env and
time window instead of filtering only to the primary service. This lets a
downstream payment dependency signal enrich checkout:payment.charge:PMT_502
incidents to high-confidence dependency classification.
Update the demo UI, README, and demo script copy to point evaluators at the
active incident flow. Extend demo acceptance to verify accepted signals,
active dependency incidents, incident detail, and text snapshots.
Tests cover signal posting, signal failure reporting, deterministic burst
seeding, downstream signal classification, UI copy, and acceptance JSON helpers.
Verification:
- go test ./examples/microdemo
- go test ./internal/incidents
- go test ./scripts/demo-acceptance-json
- go test ./examples/microdemo ./internal/incidents ./internal/cli/v2 ./internal/dashboard
- go test ./...
- go vet ./...
- bash -n scripts/demo.sh scripts/demo-acceptance.sh
- bash scripts/check-doc-links.sh
- git diff --check
---
README.md | 8 +-
examples/cmd/api-gateway/main.go | 4 +
examples/microdemo/burst.go | 22 +++-
examples/microdemo/burst_test.go | 63 ++++++++-
examples/microdemo/gateway.go | 11 ++
examples/microdemo/signals.go | 153 ++++++++++++++++++++++
examples/microdemo/signals_test.go | 84 ++++++++++++
examples/microdemo/ui.html | 13 +-
examples/microdemo/ui_test.go | 2 +
internal/incidents/engine.go | 6 +-
internal/incidents/engine_test.go | 57 ++++++++
scripts/demo-acceptance-json/main.go | 85 +++++++++++-
scripts/demo-acceptance-json/main_test.go | 23 ++++
scripts/demo-acceptance.sh | 36 +++++
scripts/demo.sh | 5 +-
15 files changed, 553 insertions(+), 19 deletions(-)
create mode 100644 examples/microdemo/signals.go
create mode 100644 examples/microdemo/signals_test.go
create mode 100644 scripts/demo-acceptance-json/main_test.go
diff --git a/README.md b/README.md
index 439170b..db85a6f 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,12 @@ Run `make demo` and see it yourself.
make demo
```
-This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, and does not require Docker, Kafka, or the bridge process.
+This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, stores demo signals/incidents in local SQLite, and does not require Docker, Kafka, or the bridge process.
Once the stack is up:
1. Open demo controls at , or open the dashboard at . The local demo disables dashboard login.
-2. Click **Run traffic burst** to fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
+2. Click **Run traffic burst** to post demo deploy/dependency signals and fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
```bash
curl -s -X POST http://localhost:9081/purchase \
-H 'Content-Type: application/json' \
@@ -54,6 +54,8 @@ Once the stack is up:
```
3. Investigate with the v2 CLI:
```bash
+ ./waylog incidents
+ ./waylog incident --snapshot
./waylog errors --window 15m
./waylog explain
./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
@@ -315,4 +317,4 @@ Public alpha. APIs may break before 1.0.
- No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
- No multi-tenancy. One instance = one trust boundary.
-**Fastest walkthrough:** `make demo`, open , click **Run traffic burst**, then use the dashboard or `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
+**Fastest walkthrough:** `make demo`, open , click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
diff --git a/examples/cmd/api-gateway/main.go b/examples/cmd/api-gateway/main.go
index 12ca98e..24aa63d 100644
--- a/examples/cmd/api-gateway/main.go
+++ b/examples/cmd/api-gateway/main.go
@@ -17,6 +17,10 @@ func main() {
checkoutURL := config.Getenv("CHECKOUT_URL", "http://localhost:9082")
gateway := microdemo.NewGatewayHandler(checkoutURL)
+ gateway.SetSignalPoster(microdemo.NewDemoSignalPoster(
+ config.Getenv("INGEST_URL", "http://localhost:8080"),
+ config.Getenv("WAYLOG_WRITE_KEY", ""),
+ ))
mux := http.NewServeMux()
mux.Handle("/purchase", gateway.PurchaseHandler())
diff --git a/examples/microdemo/burst.go b/examples/microdemo/burst.go
index 3c76258..cf7ccc4 100644
--- a/examples/microdemo/burst.go
+++ b/examples/microdemo/burst.go
@@ -14,6 +14,7 @@ import (
const (
defaultBurstRequests = 50
defaultBurstConcurrency = 10
+ incidentSeedPayments = 6
maxBurstRequests = 250
maxBurstConcurrency = 50
maxBurstSamples = 5
@@ -27,6 +28,7 @@ type BurstRequest struct {
type BurstSummary struct {
Requested BurstRequest `json:"requested"`
Accepted BurstRequest `json:"accepted"`
+ Signals []SignalResult `json:"signals,omitempty"`
DurationMs int64 `json:"duration_ms"`
ByScenario map[string]int `json:"by_scenario"`
OK int `json:"ok"`
@@ -87,6 +89,20 @@ func normalizeBurstRequest(raw BurstRequest) (requested, accepted BurstRequest)
return requested, accepted
}
+func pickBurstScenarioForIndex(i, requests int) string {
+ if i < incidentSeedPaymentCount(requests) {
+ return ScenarioPayment502
+ }
+ return pickBurstScenario()
+}
+
+func incidentSeedPaymentCount(requests int) int {
+ if requests < incidentSeedPayments {
+ return requests
+ }
+ return incidentSeedPayments
+}
+
func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) BurstSummary {
requested, accepted := normalizeBurstRequest(raw)
summary := BurstSummary{
@@ -112,11 +128,11 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs
// concurrency instead of stacking up `requests` blocked goroutines.
sem <- struct{}{}
wg.Add(1)
- go func() {
+ scenario := pickBurstScenarioForIndex(i, accepted.Requests)
+ go func(scenario string) {
defer wg.Done()
defer func() { <-sem }()
- scenario := pickBurstScenario()
payload, _ := json.Marshal(PurchaseRequest{
SKU: "X1",
Scenario: scenario,
@@ -155,7 +171,7 @@ func runBurst(ctx context.Context, dispatch http.Handler, raw BurstRequest) Burs
summary.SampleTraceIDs = append(summary.SampleTraceIDs, resp.TraceID)
}
}
- }()
+ }(scenario)
}
wg.Wait()
summary.DurationMs = time.Since(start).Milliseconds()
diff --git a/examples/microdemo/burst_test.go b/examples/microdemo/burst_test.go
index eaa914b..8c49b87 100644
--- a/examples/microdemo/burst_test.go
+++ b/examples/microdemo/burst_test.go
@@ -1,6 +1,7 @@
package microdemo
import (
+ "context"
"encoding/json"
"net/http"
"net/http/httptest"
@@ -61,8 +62,16 @@ func TestRunBurstDispatchesEveryRequestThroughHandler(t *testing.T) {
if got := r.Header.Get("Content-Type"); got != "application/json" {
t.Fatalf("content-type = %q, want application/json", got)
}
+ var purchase PurchaseRequest
+ if err := json.NewDecoder(r.Body).Decode(&purchase); err != nil {
+ t.Fatalf("decode purchase: %v", err)
+ }
w.Header().Set("Content-Type", "application/json")
- _, _ = w.Write([]byte(`{"success":true,"trace_id":"t","scenario":"happy"}`))
+ _ = json.NewEncoder(w).Encode(map[string]any{
+ "success": purchase.Scenario == ScenarioHappy,
+ "trace_id": "trace-" + purchase.Scenario,
+ "scenario": purchase.Scenario,
+ })
})
summary := runBurst(t.Context(), dispatch, BurstRequest{Requests: 20, Concurrency: 4})
@@ -72,11 +81,25 @@ func TestRunBurstDispatchesEveryRequestThroughHandler(t *testing.T) {
if summary.Accepted.Requests != 20 || summary.Accepted.Concurrency != 4 {
t.Fatalf("accepted = %#v, want 20/4", summary.Accepted)
}
- if summary.OK != 20 || summary.Errors != 0 || summary.Suppressed != 0 {
- t.Fatalf("summary counts = ok:%d errors:%d suppressed:%d", summary.OK, summary.Errors, summary.Suppressed)
+ if summary.Errors < incidentSeedPayments {
+ t.Fatalf("errors = %d, want at least seeded payment failures %d", summary.Errors, incidentSeedPayments)
+ }
+ if summary.ByScenario[ScenarioPayment502] < incidentSeedPayments {
+ t.Fatalf("payment_502 count = %d, want at least %d", summary.ByScenario[ScenarioPayment502], incidentSeedPayments)
+ }
+ if summary.OK+summary.Errors+summary.Suppressed != 20 {
+ t.Fatalf("summary total = %d, want 20", summary.OK+summary.Errors+summary.Suppressed)
+ }
+}
+
+func TestPickBurstScenarioForIndexSeedsPaymentFailures(t *testing.T) {
+ for i := 0; i < incidentSeedPayments; i++ {
+ if got := pickBurstScenarioForIndex(i, 20); got != ScenarioPayment502 {
+ t.Fatalf("seed scenario[%d] = %q, want payment_502", i, got)
+ }
}
- if summary.ByScenario[ScenarioHappy] != 20 {
- t.Fatalf("happy count = %d, want 20", summary.ByScenario[ScenarioHappy])
+ if got := incidentSeedPaymentCount(3); got != 3 {
+ t.Fatalf("seed count = %d, want capped to request count 3", got)
}
}
@@ -139,6 +162,28 @@ func TestServeBurstAppliesDefaultsWhenZero(t *testing.T) {
}
}
+func TestServeBurstPostsDemoSignals(t *testing.T) {
+ gateway := NewGatewayHandler("http://checkout.example")
+ gateway.SetPurchaseHandler(okBurstDispatch())
+ gateway.SetSignalPoster(staticSignalPoster{results: []SignalResult{{
+ Type: "dependency", Service: "payment", Reason: "payment_gateway_5xx", Accepted: true, Status: http.StatusCreated, SignalID: "sig_demo",
+ }}})
+ rec := httptest.NewRecorder()
+ req := httptest.NewRequest(http.MethodPost, "/demo/burst", strings.NewReader(`{"requests":1,"concurrency":1}`))
+ req.Header.Set("Content-Type", "application/json")
+ gateway.ServeBurst(rec, req)
+ if rec.Code != http.StatusOK {
+ t.Fatalf("status = %d, want 200: %s", rec.Code, rec.Body.String())
+ }
+ var summary BurstSummary
+ if err := json.Unmarshal(rec.Body.Bytes(), &summary); err != nil {
+ t.Fatalf("unmarshal summary: %v", err)
+ }
+ if len(summary.Signals) != 1 || !summary.Signals[0].Accepted || summary.Signals[0].SignalID != "sig_demo" {
+ t.Fatalf("signals = %+v", summary.Signals)
+ }
+}
+
func serveBurstForTest(t *testing.T, body string) *httptest.ResponseRecorder {
t.Helper()
gateway := NewGatewayHandler("http://checkout.example")
@@ -150,6 +195,14 @@ func serveBurstForTest(t *testing.T, body string) *httptest.ResponseRecorder {
return rec
}
+type staticSignalPoster struct {
+ results []SignalResult
+}
+
+func (p staticSignalPoster) PostDemoSignals(context.Context) []SignalResult {
+ return p.results
+}
+
func okBurstDispatch() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var req PurchaseRequest
diff --git a/examples/microdemo/gateway.go b/examples/microdemo/gateway.go
index a6a7fac..f065440 100644
--- a/examples/microdemo/gateway.go
+++ b/examples/microdemo/gateway.go
@@ -30,6 +30,7 @@ type GatewayHandler struct {
checkoutURL string
client *http.Client
purchase http.Handler
+ signals SignalPoster
}
type PurchaseRequest struct {
@@ -61,6 +62,11 @@ func (h *GatewayHandler) SetPurchaseHandler(handler http.Handler) {
h.purchase = handler
}
+// SetSignalPoster sets the signal poster used by /demo/burst; when none is set, the burst posts no demo signals.
+func (h *GatewayHandler) SetSignalPoster(poster SignalPoster) {
+ h.signals = poster
+}
+
func (h *GatewayHandler) ServeDemo(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write(uiHTML)
@@ -136,7 +142,12 @@ func (h *GatewayHandler) ServeBurst(w http.ResponseWriter, r *http.Request) {
}
}
+ var signalResults []SignalResult
+ if h.signals != nil {
+ signalResults = h.signals.PostDemoSignals(r.Context())
+ }
summary := runBurst(r.Context(), h.purchase, req)
+ summary.Signals = signalResults
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(summary)
}
diff --git a/examples/microdemo/signals.go b/examples/microdemo/signals.go
new file mode 100644
index 0000000..b7b6611
--- /dev/null
+++ b/examples/microdemo/signals.go
@@ -0,0 +1,153 @@
+package microdemo
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+)
+
+const demoSignalTimeout = 2 * time.Second
+
+type SignalResult struct {
+ Type string `json:"type"`
+ Service string `json:"service"`
+ Reason string `json:"reason"`
+ Accepted bool `json:"accepted"`
+ Status int `json:"status,omitempty"`
+ SignalID string `json:"signal_id,omitempty"`
+ Error string `json:"error,omitempty"`
+}
+
+type SignalPoster interface {
+ PostDemoSignals(ctx context.Context) []SignalResult
+}
+
+type DemoSignalPoster struct {
+ ingestURL string
+ apiKey string
+ client *http.Client
+ now func() time.Time
+}
+
+func NewDemoSignalPoster(ingestURL, apiKey string) *DemoSignalPoster {
+ return &DemoSignalPoster{
+ ingestURL: strings.TrimRight(strings.TrimSpace(ingestURL), "/"),
+ apiKey: strings.TrimSpace(apiKey),
+ client: &http.Client{Timeout: demoSignalTimeout},
+ now: func() time.Time { return time.Now().UTC() },
+ }
+}
+
+func (p *DemoSignalPoster) PostDemoSignals(ctx context.Context) []SignalResult {
+ specs := []demoSignalSpec{
+ {
+ Type: "deploy",
+ Service: "checkout",
+ Severity: "info",
+ Reason: "demo_checkout_rollout",
+ Message: "Demo checkout rollout before the payment dependency incident.",
+ Resource: map[string]any{"service": "checkout"},
+ Metadata: map[string]any{"version": "demo-v2.1", "demo": "traffic_burst"},
+ },
+ {
+ Type: "dependency",
+ Service: "payment",
+ Severity: "critical",
+ Reason: "payment_gateway_5xx",
+ Message: "Demo payment provider is returning intermittent 5xx responses.",
+ Resource: map[string]any{"service": "payment", "endpoint": "POST /charge"},
+ Metadata: map[string]any{"error_code": "PMT_502", "downstream": "payment", "demo": "traffic_burst"},
+ },
+ }
+
+ results := make([]SignalResult, 0, len(specs))
+ for _, spec := range specs {
+ results = append(results, p.postSignal(ctx, spec))
+ }
+ return results
+}
+
+func (p *DemoSignalPoster) postSignal(ctx context.Context, spec demoSignalSpec) SignalResult {
+ result := SignalResult{Type: spec.Type, Service: spec.Service, Reason: spec.Reason}
+ if p == nil || p.ingestURL == "" {
+ result.Error = "INGEST_URL is not configured"
+ return result
+ }
+
+ body, err := json.Marshal(spec.body(p.now()))
+ if err != nil {
+ result.Error = err.Error()
+ return result
+ }
+ reqCtx, cancel := context.WithTimeout(ctx, demoSignalTimeout)
+ defer cancel()
+ req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, p.ingestURL+"/v1/signals", bytes.NewReader(body))
+ if err != nil {
+ result.Error = err.Error()
+ return result
+ }
+ req.Header.Set("Content-Type", "application/json")
+ if p.apiKey != "" {
+ req.Header.Set("X-API-Key", p.apiKey)
+ }
+
+ client := p.client
+ if client == nil {
+ client = &http.Client{Timeout: demoSignalTimeout}
+ }
+ resp, err := client.Do(req)
+ if err != nil {
+ result.Error = err.Error()
+ return result
+ }
+ defer resp.Body.Close()
+ result.Status = resp.StatusCode
+
+ raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
+ if resp.StatusCode != http.StatusCreated {
+ result.Error = fmt.Sprintf("signal POST returned HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(raw)))
+ return result
+ }
+ var accepted struct {
+ Signal struct {
+ SignalID string `json:"signal_id"`
+ } `json:"signal"`
+ }
+ if err := json.Unmarshal(raw, &accepted); err != nil {
+ result.Error = "accepted signal response was not valid JSON: " + err.Error()
+ return result
+ }
+ result.Accepted = true
+ result.SignalID = accepted.Signal.SignalID
+ return result
+}
+
+type demoSignalSpec struct {
+ Type string
+ Service string
+ Severity string
+ Reason string
+ Message string
+ Resource map[string]any
+ Metadata map[string]any
+}
+
+func (s demoSignalSpec) body(ts time.Time) map[string]any {
+ return map[string]any{
+ "type": s.Type,
+ "source": "waylog-demo",
+ "service": s.Service,
+ "env": "demo",
+ "severity": s.Severity,
+ "reason": s.Reason,
+ "message": s.Message,
+ "resource": s.Resource,
+ "metadata": s.Metadata,
+ "timestamp": ts.UTC(),
+ }
+}
diff --git a/examples/microdemo/signals_test.go b/examples/microdemo/signals_test.go
new file mode 100644
index 0000000..d1d530b
--- /dev/null
+++ b/examples/microdemo/signals_test.go
@@ -0,0 +1,84 @@
+package microdemo
+
+import (
+ "bytes"
+ "encoding/json"
+ "io"
+ "net/http"
+ "testing"
+ "time"
+)
+
+func TestDemoSignalPosterPostsDeployAndDependencySignals(t *testing.T) {
+ var posted []map[string]any
+ poster := NewDemoSignalPoster("http://ingest.example", "demo-write")
+ poster.client = &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
+ if r.URL.Path != "/v1/signals" {
+ t.Fatalf("path = %s, want /v1/signals", r.URL.Path)
+ }
+ if got := r.Header.Get("X-API-Key"); got != "demo-write" {
+ t.Fatalf("api key = %q, want demo-write", got)
+ }
+ var body map[string]any
+ if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+ t.Fatalf("decode signal: %v", err)
+ }
+ posted = append(posted, body)
+ raw, _ := json.Marshal(map[string]any{
+ "signal": map[string]any{"signal_id": "sig_" + body["type"].(string)},
+ })
+ return &http.Response{
+ StatusCode: http.StatusCreated,
+ Header: http.Header{"Content-Type": []string{"application/json"}},
+ Body: io.NopCloser(bytes.NewReader(raw)),
+ }, nil
+ })}
+ poster.now = func() time.Time { return time.Date(2026, 5, 5, 12, 0, 0, 0, time.UTC) }
+ results := poster.PostDemoSignals(t.Context())
+ if len(results) != 2 {
+ t.Fatalf("results len = %d, want 2", len(results))
+ }
+ for _, result := range results {
+ if !result.Accepted || result.SignalID == "" || result.Status != http.StatusCreated {
+ t.Fatalf("result = %+v", result)
+ }
+ }
+ if len(posted) != 2 {
+ t.Fatalf("posted len = %d, want 2", len(posted))
+ }
+ if posted[0]["type"] != "deploy" || posted[0]["service"] != "checkout" || posted[0]["env"] != "demo" {
+ t.Fatalf("deploy signal = %+v", posted[0])
+ }
+ if posted[1]["type"] != "dependency" || posted[1]["service"] != "payment" || posted[1]["reason"] != "payment_gateway_5xx" {
+ t.Fatalf("dependency signal = %+v", posted[1])
+ }
+ metadata, ok := posted[1]["metadata"].(map[string]any)
+ if !ok || metadata["error_code"] != "PMT_502" {
+ t.Fatalf("dependency metadata = %+v", posted[1]["metadata"])
+ }
+}
+
+func TestDemoSignalPosterReportsNonCreatedResponse(t *testing.T) {
+ poster := NewDemoSignalPoster("http://ingest.example", "")
+ poster.client = &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) {
+ return &http.Response{
+ StatusCode: http.StatusServiceUnavailable,
+ Body: io.NopCloser(bytes.NewBufferString("set SQLITE_PATH to enable signals")),
+ }, nil
+ })}
+ results := poster.PostDemoSignals(t.Context())
+ if len(results) != 2 {
+ t.Fatalf("results len = %d, want 2", len(results))
+ }
+ for _, result := range results {
+ if result.Accepted || result.Status != http.StatusServiceUnavailable || result.Error == "" {
+ t.Fatalf("result = %+v", result)
+ }
+ }
+}
+
+type roundTripFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) {
+ return f(r)
+}
diff --git a/examples/microdemo/ui.html b/examples/microdemo/ui.html
index 30ede80..b5ac343 100644
--- a/examples/microdemo/ui.html
+++ b/examples/microdemo/ui.html
@@ -542,7 +542,7 @@ Scenarios
- Public alpha — an impact-analysis engine for backend systems built on WideEvents.
+ Public alpha — request triage plus signal-driven incident triage for backend systems.
---
## What Waylog does
-A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened:
+A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened in the request, then groups repeated failures into an incident with signal-backed cause evidence:
```text
trace 7f3a2b9c… flow=purchase user=standard region=us-east-1
@@ -31,7 +31,7 @@ A request hits your API gateway, fans out to three services, and one of them fai
blast radius: 12 requests · 8 users · 4 services
```
-This is not log search. Waylog builds a live in-memory graph from every request flowing through your services. When you ask a question — "why did this trace fail?", "who is affected by `PMT_502`?", "what changed in the last 10 minutes?" — it walks the graph and returns a precomputed, structured answer. Root-cause rollups count the originating failure once, not once per propagated hop.
+This is not log search, metrics storage, or incident management. Waylog builds request-triage views from WideEvents, accepts production-context signals such as deploys and dependency health, and returns deterministic answers for "why did this trace fail?", "what incident is active?", and "who is affected by `PMT_502`?". Root-cause rollups count the originating failure once, not once per propagated hop.
Run `make demo` and see it yourself.
@@ -60,9 +60,10 @@ Once the stack is up:
./waylog explain
./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
./waylog blast --code PMT_502 --window 15m
+ ./waylog triage <incident_id>
```
-The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
+The traffic burst posts fresh demo deploy/dependency signals on each run so the incident panel has evidence to attach. The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
Stop with `make demo-stop`.
@@ -72,8 +73,9 @@ Prefer Docker? Use `make docker-dev` / `make docker-down`. Prefer foreground ser
## How it works
1. **Capture** — services emit [WideEvents](docs/waylog-sdk-contract.md) via the Go or TypeScript SDK, or push OpenTelemetry spans to `/v1/otlp/v1/traces`. Every event is durably logged (WAL + fsync) before it enters the derived read models.
-2. **Analyze** — the ingest server projects completed execution segments into request, service, error, user, and trace views. Deterministic tools answer specific questions: propagation chain, blast radius, what-changed, deploy correlation.
-3. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views through the same tool registry. Every answer is also callable by agents as a structured tool with idempotency keys.
+2. **Signal** — deploy systems, dependency monitors, or operators post small production-context facts to `/v1/signals`.
+3. **Triage** — the ingest server projects request views (`recent`, `errors`, `explain`, `blast`) and opens incidents when error families spike against overlapping signals.
+4. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views. Primary incident surfaces are `waylog incidents`, `waylog incident <id>`, `/v1/incidents/*`, and the dashboard incident cards.
## Get traces in
@@ -206,7 +208,7 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
### Analysis tools
-All ten tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
| Tool | Answers |
| ------------------ | ------------------------------------------------------------- |
@@ -220,6 +222,7 @@ All ten tools are deterministic, idempotent, and available via CLI, REST `/v1/to
| `graph_query` | DSL query over the graph (`expr` + `window`) |
| `compare_windows` | Diff error rates between two windows |
| `graph_insights` | Windowed rollup of top errors and patterns |
+| `triage_incident` | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
@@ -231,6 +234,7 @@ The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs a
- `#/errors` — top error families over `/v1/errors`
- `#/explain/` — first observable failing step over `/v1/traces/story`
- `#/blast/` — impact panel over `/v1/blast_radius`
+- `#/incident/` — incident evidence and next checks over `/v1/incidents/{id}`
- recent-request stream from `/v1/traces/recent`, polled every 5s
- no Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts
@@ -243,8 +247,8 @@ Go / TS services (SDK) · OTLP/HTTP collectors
▼
ingest server
├─ event log (append-only WAL, source of truth)
- ├─ derived read models (errors · explain · blast · recent traces)
- ├─ SQLite cold store (events · deployments · causal claims)
+ ├─ derived read models (errors · explain · blast · recent traces · incidents)
+ ├─ SQLite cold store (events · deployments · signals · incidents · causal claims)
├─ tool registry · Ask · plan execution
└─ v2 dashboard · health · metrics · OpenAPI
│
@@ -275,7 +279,7 @@ Waylog uses three scoped keys. They are independent — the dashboard never hold
| Key | Protects |
| ------------------ | ----------------------------------------------------- |
-| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces` (SDKs, collectors) |
+| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces`, `/v1/signals` (SDKs, collectors, production signals) |
| `WAYLOG_READ_KEY` | Read APIs, dashboard session |
| `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*` |
@@ -292,12 +296,13 @@ Public alpha. APIs may break before 1.0.
- durable ingest with WAL + replay
- hot graph with flattened 3-node model + dedicated trace store
- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
-- SQLite cold store (events, deployments, causal claims)
+- SQLite cold store (events, deployments, signals, incidents, causal claims)
+- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, and dashboard incident cards
- 10 deterministic analysis tools, rollup-correct root-cause attribution
- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
- `/v1/traces/story` and indented failure-path rendering in the dashboard
- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
+- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
- live TUI (`waylog-live --dev` streams via SSE), MCP stdio
- scoped auth (write/read/agent) with startup validation
@@ -314,7 +319,10 @@ Public alpha. APIs may break before 1.0.
- OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
- Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
+- Signal and incident records are SQLite-backed; they do not use the event WAL/replay path.
+- Incident cause classification is deterministic and heuristic. `runtime` signals are accepted but do not produce a `runtime` cause label yet.
- No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
- No multi-tenancy. One instance = one trust boundary.
+- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
**Fastest walkthrough:** `make demo`, open , click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 650f58d..7d76092 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -36,6 +36,8 @@ import (
"github.com/sssmaran/WaylogCLI/internal/signals"
"github.com/sssmaran/WaylogCLI/internal/tools"
"github.com/sssmaran/WaylogCLI/internal/tracestore"
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+ "github.com/sssmaran/WaylogCLI/internal/triagehttp"
apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
)
@@ -412,8 +414,9 @@ func main() {
if incidentsEnabled {
if sqlite, ok := coldDB.(*coldstore.SQLiteStore); ok {
incidentStore := coldstore.NewIncidentStore(sqlite)
+ incReader := incidentReaderAdapter{reader: v2Reader}
incidentEngine = incidents.NewEngine(
- incidentReaderAdapter{reader: v2Reader},
+ incReader,
signalStore,
coldDeployAdapter{store: sqlite},
incidentStore,
@@ -429,6 +432,34 @@ func main() {
mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
ingestServer.SetDetector(incidentInsightAdapter{engine: incidentEngine})
+
+ // Triage engine: deterministic TriageReport build for a given
+ // incident. Reuses the same v2Reader-backed adapter for blast
+ // queries, the live graph + trace store for first-failure
+ // stories, and the configured signal store. Read-scope auth.
+ triageEng, err := triage.NewEngine(triage.Deps{
+ Incidents: triage.NewIncidentLookupAdapter(incidentEngine),
+ Blast: triage.NewBlastQueryAdapter(incReader),
+ Story: triage.NewStoryBuilderAdapter(
+ incidentEngine,
+ func(traceID string) (apiv2.StoryResponse, bool) {
+ return v2Reader.TraceStoryByTraceID(traceID)
+ },
+ ),
+ Signals: triage.NewSignalQueryAdapter(signalStore),
+ NextChecks: triage.NewNextChecksAdapter(),
+ })
+ if err != nil {
+ slog.Error("triage engine init failed", "err", err)
+ os.Exit(1)
+ }
+ if err := tools.RegisterTriageTool(reg, triageEng); err != nil {
+ slog.Error("triage tool register failed", "err", err)
+ os.Exit(1)
+ }
+ triageHandler := triagehttp.NewHandler(triageEng)
+ mux.Handle("/v1/triage/", readCORS(triageHandler.Triage))
+
incidentRunning = true
slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
} else {
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index ce503dc..d60eb4d 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -7,7 +7,9 @@ info:
The primary product path is schema-2.0 ingest plus the v2 read APIs used by
the operator CLI and embedded dashboard: recent traces, error families,
- trace story, blast radius, event search, and direct trace/event lookup.
+ trace story, blast radius, incidents, event search, and direct trace/event
+ lookup. `/v1/insight` is a compatibility endpoint; new clients should use
+ `/v1/incidents/*`.
Write endpoints require write-scope auth when auth is configured. Read
endpoints require read-scope auth when read keys are configured.
@@ -485,7 +487,9 @@ paths:
tags: [Triage]
operationId: getIncidentSnapshot
summary: Render an incident snapshot
- description: Defaults to text/plain. Send Accept: application/json to receive the snapshot text plus the incident object.
+ description: |
+ Defaults to text/plain. Send Accept: application/json to receive the
+ snapshot text plus the incident object.
security:
- ApiKeyHeader: []
- BearerAuth: []
@@ -508,6 +512,81 @@ paths:
'405':
description: Method Not Allowed
+ /v1/triage/{incident_id}:
+ get:
+ tags: [Triage]
+ operationId: getTriageReport
+ summary: Build a deterministic TriageReport for an open incident
+ description: |
+ Returns a structured TriageReport v1 (incident_ref, blast_snapshot,
+ first_failure, sample_traces, signals, next_checks, confidence,
+ report_hash). The same builder backs `POST /v1/tools/triage_incident`;
+ both surfaces produce an identical `report_hash` for the same input.
+ Read-scope auth.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ parameters:
+ - name: incident_id
+ in: path
+ required: true
+ schema: {type: string}
+ - name: window
+ in: query
+ required: false
+ schema: {type: string, default: "15m"}
+ description: Go duration string (e.g. 15m, 1h). Default 15m.
+ - name: snapshot
+ in: query
+ required: false
+ schema: {type: boolean, default: false}
+ description: |
+ When true, freeze evaluation bounds to the incident's started_at
+ and updated_at instead of using wall-clock now.
+ responses:
+ '200':
+ description: TriageReport v1
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/TriageReport'
+ '400':
+ description: Missing or invalid parameters
+ '401':
+ description: Unauthorized
+ '500':
+ description: Triage build failed
+
+ /v1/insight:
+ get:
+ tags: [Operational]
+ operationId: getCompatibilityInsight
+ summary: Compatibility anomaly insight
+ deprecated: true
+ description: |
+ Compatibility endpoint for older dashboard/tool consumers. When the
+ v2.1 incident engine is running, this projects the top active incident
+ into the legacy insight shape. Otherwise it falls back to the legacy
+ detector. New clients should use `/v1/incidents/active` and
+ `/v1/incidents/{id}`.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ responses:
+ '200':
+ description: Compatibility insight object
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ '204':
+ description: No active insight
+ '401':
+ description: Unauthorized
+ '405':
+ description: Method Not Allowed
+
/v1/capabilities:
get:
tags: [Capabilities]
@@ -1689,6 +1768,75 @@ components:
incident:
$ref: '#/components/schemas/Incident'
+ TriageReport:
+ type: object
+ required: [schema_version, incident_ref, blast_snapshot, confidence, generated_at, report_hash]
+ description: |
+ Deterministic triage artifact for an open incident. Versioned via
+ schema_version. report_hash is sha256 over the canonical JSON
+ excluding generated_at, plan_run_id, and report_hash itself.
+ properties:
+ schema_version:
+ type: string
+ enum: ["triage.v1"]
+ incident_ref:
+ type: object
+ required: [id]
+ properties:
+ id: {type: string}
+ window: {type: string, description: "Go duration string e.g. 15m0s"}
+ blast_snapshot:
+ type: object
+ properties:
+ requests: {type: integer}
+ users: {type: integer}
+ services: {type: integer}
+ top_error_families:
+ type: array
+ items:
+ type: object
+ properties:
+ service: {type: string}
+ step: {type: string}
+ error_code: {type: string}
+ count: {type: integer}
+ first_failure:
+ type: object
+ additionalProperties: true
+ description: Full /v1/traces/story payload for the first observed failing step.
+ sample_traces:
+ type: array
+ items:
+ type: object
+ properties:
+ trace_id: {type: string}
+ summary: {type: string}
+ signals:
+ type: array
+ items:
+ type: object
+ properties:
+ id: {type: string}
+ type: {type: string}
+ evidence_ids: {type: array, items: {type: string}}
+ next_checks:
+ type: array
+ items:
+ type: object
+ properties:
+ id: {type: string}
+ prompt: {type: string}
+ confidence:
+ type: string
+ enum: [low, medium, high]
+ generated_at: {type: string}
+ plan_run_id:
+ type: string
+ description: Set only when produced via /v1/plans/execute.
+ report_hash:
+ type: string
+ description: "sha256:<hex digest>"
+
CapabilitiesResponse:
type: object
example:
diff --git a/internal/cli/v2/client.go b/internal/cli/v2/client.go
index 5430945..d44f3dd 100644
--- a/internal/cli/v2/client.go
+++ b/internal/cli/v2/client.go
@@ -169,6 +169,19 @@ func (c *Client) IncidentSnapshotJSON(ctx context.Context, incidentID string) (I
return out, err
}
+func (c *Client) Triage(ctx context.Context, id string, p TriageParams) (*TriageReport, error) {
+ q := url.Values{}
+ addQuery(q, "window", p.Window)
+ if p.Snapshot {
+ q.Set("snapshot", "true")
+ }
+ var rep TriageReport
+ if err := c.do(ctx, "/v1/triage/"+url.PathEscape(id), q, &rep); err != nil {
+ return nil, err
+ }
+ return &rep, nil
+}
+
func (c *Client) Search(ctx context.Context, p SearchParams) (EventSearchResponse, error) {
q := url.Values{}
addQuery(q, "error_code", p.ErrorCode)
diff --git a/internal/cli/v2/client_test.go b/internal/cli/v2/client_test.go
index 8d7d621..bbe73ad 100644
--- a/internal/cli/v2/client_test.go
+++ b/internal/cli/v2/client_test.go
@@ -71,3 +71,32 @@ func containsQuery(raw, want string) bool {
}
return false
}
+
+// TestClientTriageBuildsExpectedURL checks that Client.Triage targets
+// /v1/triage/{id}, forwards snapshot=true, and decodes the JSON response.
+func TestClientTriageBuildsExpectedURL(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// The handler runs on a server goroutine, so it must not call
+		// t.Fatalf (FailNow is only valid on the test goroutine); t.Errorf
+		// still marks the test as failed.
+		if r.URL.Path != "/v1/triage/inc_abc" {
+			t.Errorf("path = %q", r.URL.Path)
+		}
+		if r.URL.Query().Get("snapshot") != "true" {
+			t.Errorf("snapshot query missing")
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{
+ "schema_version":"triage.v1",
+ "incident_ref":{"id":"inc_abc","window":"15m"},
+ "confidence":"medium",
+ "generated_at":"t",
+ "report_hash":"sha256:x"
+ }`))
+	}))
+	defer srv.Close()
+
+	c := NewClient(ClientConfig{BaseURL: srv.URL, APIKey: "test-key", Timeout: 5 * time.Second})
+	rep, err := c.Triage(context.Background(), "inc_abc", TriageParams{Snapshot: true})
+	if err != nil {
+		t.Fatalf("triage: %v", err)
+	}
+	if rep.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("got id %q", rep.IncidentRef.ID)
+	}
+}
diff --git a/internal/cli/v2/cmd.go b/internal/cli/v2/cmd.go
index 87c3fae..9c2ba9e 100644
--- a/internal/cli/v2/cmd.go
+++ b/internal/cli/v2/cmd.go
@@ -12,7 +12,7 @@ import (
"time"
)
-const version = "v2-phase-2"
+const version = "v2.1-triage"
type cliConfig struct {
addr string
@@ -65,6 +65,8 @@ func RunCLI(args []string, _ io.Reader, stdout, stderr io.Writer) int {
return runBlast(ctx, client, cfg, rest[1:], stdout, stderr)
case "search":
return runSearch(ctx, client, cfg, rest[1:], stdout, stderr)
+ case "triage":
+ return runTriage(ctx, client, cfg, rest[1:], stdout, stderr)
default:
fmt.Fprintf(stderr, "unknown command: %s\n", rest[0])
printUsage(stderr)
@@ -244,6 +246,57 @@ func parseIncidentArgs(args []string) (string, bool, error) {
return incidentID, snapshot, nil
}
+// runTriage implements the "triage" subcommand. It parses args with
+// parseTriageArgs, checks the requireV2Reads gate, fetches the report through
+// client.Triage and writes it to stdout: as JSON when cfg.json is set,
+// otherwise through RenderTriage. The returned int is the exit code.
+func runTriage(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
+ id, window, snapshot, err := parseTriageArgs(args)
+ if err != nil {
+ return usage(stderr, err.Error())
+ }
+ // A non-zero gate result is returned unchanged as the exit code.
+ if gate := requireV2Reads(ctx, client, stderr); gate != 0 {
+ return gate
+ }
+ rep, err := client.Triage(ctx, id, TriageParams{Window: window, Snapshot: snapshot})
+ if err != nil {
+ fmt.Fprintln(stderr, err)
+ return exitCodeForError(err)
+ }
+ if cfg.json {
+ if err := renderJSON(stdout, rep); err != nil {
+ fmt.Fprintln(stderr, err)
+ return 2
+ }
+ return 0
+ }
+ return RenderTriage(stdout, rep)
+}
+
+// parseTriageArgs extracts the incident id and the optional --window and
+// --snapshot flags from the arguments of the "triage" subcommand.
+//
+// The window may be given as "--window VALUE" or "--window=VALUE". Exactly one
+// positional argument (the incident id) is accepted, in any position relative
+// to the flags. An error is returned for an unknown flag, a --window without a
+// value, a second positional argument, or a missing incident id.
+func parseTriageArgs(args []string) (id, window string, snapshot bool, err error) {
+	for i := 0; i < len(args); i++ {
+		arg := args[i]
+		switch {
+		case arg == "--snapshot":
+			snapshot = true
+		case arg == "--window":
+			if i+1 >= len(args) {
+				return "", "", false, fmt.Errorf("--window requires a value")
+			}
+			window = args[i+1]
+			i++ // skip the value that was just consumed
+		case strings.HasPrefix(arg, "--window="):
+			window = strings.TrimPrefix(arg, "--window=")
+		case strings.HasPrefix(arg, "-"):
+			return "", "", false, fmt.Errorf("unknown flag: %s", arg)
+		case id == "":
+			id = arg
+		default:
+			return "", "", false, fmt.Errorf("unexpected argument: %s", arg)
+		}
+	}
+	if id == "" {
+		// Name the required positional argument so the message is actionable.
+		return "", "", false, fmt.Errorf("usage: waylog triage <incident_id> [--window 15m] [--snapshot]")
+	}
+	return id, window, snapshot, nil
+}
+
func runEvent(ctx context.Context, client *Client, cfg cliConfig, args []string, stdout, stderr io.Writer) int {
if len(args) != 1 {
return usage(stderr, "usage: waylog event [--json]")
diff --git a/internal/cli/v2/cmd_test.go b/internal/cli/v2/cmd_test.go
index 0cf256f..51c39fb 100644
--- a/internal/cli/v2/cmd_test.go
+++ b/internal/cli/v2/cmd_test.go
@@ -309,3 +309,34 @@ func TestRunCLIUsage(t *testing.T) {
t.Fatalf("code=%d stderr=%q", code, stderr.String())
}
}
+
+// TestParseTriageArgs covers the accepted argument shapes of parseTriageArgs
+// and each of its error paths.
+func TestParseTriageArgs(t *testing.T) {
+	cases := []struct {
+		name     string
+		in       []string
+		wantID   string
+		wantSnap bool
+		wantWin  string
+		wantErr  bool
+	}{
+		{"id only", []string{"inc_abc"}, "inc_abc", false, "", false},
+		{"id + snapshot", []string{"inc_abc", "--snapshot"}, "inc_abc", true, "", false},
+		{"id + window", []string{"inc_abc", "--window", "30m"}, "inc_abc", false, "30m", false},
+		{"id + window=30m", []string{"inc_abc", "--window=30m"}, "inc_abc", false, "30m", false},
+		{"flags before id", []string{"--snapshot", "--window", "5m", "inc_abc"}, "inc_abc", true, "5m", false},
+		{"missing id", []string{}, "", false, "", true},
+		{"window missing value", []string{"inc_abc", "--window"}, "", false, "", true},
+		{"unknown flag", []string{"inc_abc", "--bogus"}, "", false, "", true},
+		{"extra positional", []string{"inc_abc", "inc_def"}, "", false, "", true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			id, win, snap, err := parseTriageArgs(tc.in)
+			if (err != nil) != tc.wantErr {
+				t.Fatalf("err=%v wantErr=%v", err, tc.wantErr)
+			}
+			if err != nil {
+				return
+			}
+			if id != tc.wantID || snap != tc.wantSnap || win != tc.wantWin {
+				t.Fatalf("got id=%q win=%q snap=%v want %q %q %v", id, win, snap, tc.wantID, tc.wantWin, tc.wantSnap)
+			}
+		})
+	}
+}
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index bf9b574..e0bed00 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -357,3 +357,41 @@ func formatTime(t time.Time) string {
}
return t.Format(time.RFC3339)
}
+
+// RenderTriage writes a human-readable rendering of rep to w: a header with
+// the incident id, window, confidence and report hash, then the blast
+// counters and top error families. The "Sample traces", "Signals" and
+// "Next checks" sections are printed only when they are non-empty.
+// Write errors are not checked; the function always returns 0.
+func RenderTriage(w io.Writer, rep *TriageReport) int {
+ fmt.Fprintf(w, "Triage report incident=%s window=%s confidence=%s\n",
+ rep.IncidentRef.ID, rep.IncidentRef.Window, rep.Confidence)
+ fmt.Fprintf(w, " hash: %s\n\n", rep.ReportHash)
+
+ fmt.Fprintln(w, "Blast")
+ fmt.Fprintf(w, " requests=%d users=%d services=%d\n",
+ rep.BlastSnapshot.Requests, rep.BlastSnapshot.Users, rep.BlastSnapshot.Services)
+ for _, f := range rep.BlastSnapshot.TopErrorFamilies {
+ fmt.Fprintf(w, " %s/%s/%s count=%d\n", f.Service, f.Step, f.ErrorCode, f.Count)
+ }
+ fmt.Fprintln(w)
+
+ if len(rep.SampleTraces) > 0 {
+ fmt.Fprintln(w, "Sample traces")
+ for _, s := range rep.SampleTraces {
+ fmt.Fprintf(w, " %s %s\n", s.TraceID, s.Summary)
+ }
+ fmt.Fprintln(w)
+ }
+
+ if len(rep.Signals) > 0 {
+ fmt.Fprintln(w, "Signals")
+ for _, s := range rep.Signals {
+ fmt.Fprintf(w, " %s type=%s evidence=%v\n", s.ID, s.Type, s.EvidenceIDs)
+ }
+ fmt.Fprintln(w)
+ }
+
+ if len(rep.NextChecks) > 0 {
+ fmt.Fprintln(w, "Next checks")
+ for _, c := range rep.NextChecks {
+ fmt.Fprintf(w, " - %s\n", c.Prompt)
+ }
+ }
+ return 0
+}
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 0d4a755..a60e4cc 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -9,6 +9,7 @@ import (
apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+ triage "github.com/sssmaran/WaylogCLI/pkg/triage"
)
func TestRenderStoryPinsObservableLanguage(t *testing.T) {
@@ -160,3 +161,30 @@ func TestRenderNextCursor(t *testing.T) {
t.Fatalf("output=%s", out.String())
}
}
+
+// TestRenderTriageHeaderAndSections renders a populated report and checks
+// that the incident id, an error code, a signal type, a next-check prompt and
+// the report hash all appear in the output.
+func TestRenderTriageHeaderAndSections(t *testing.T) {
+ rep := &TriageReport{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_abc", Window: "15m"},
+ BlastSnapshot: triage.BlastSnapshot{
+ Requests: 12, Users: 8, Services: 4,
+ TopErrorFamilies: []triage.ErrorFamily{
+ {Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+ },
+ },
+ Signals: []triage.SignalRef{{ID: "sig_1", Type: "deploy"}},
+ NextChecks: []triage.NextCheck{{ID: "check_payment_health", Prompt: "Verify payment-service health"}},
+ Confidence: triage.ConfidenceMedium,
+ ReportHash: "sha256:abc",
+ }
+ var buf bytes.Buffer
+ if rc := RenderTriage(&buf, rep); rc != 0 {
+ t.Fatalf("render returned %d", rc)
+ }
+ out := buf.String()
+ for _, want := range []string{"inc_abc", "PMT_502", "deploy", "Verify payment-service health", "sha256:abc"} {
+ if !strings.Contains(out, want) {
+ t.Fatalf("output missing %q\noutput:\n%s", want, out)
+ }
+ }
+}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 1a940de..58fcde9 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -5,6 +5,7 @@ import (
apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
)
type CapabilitiesResponse struct {
@@ -85,3 +86,10 @@ type ClientConfig struct {
APIKey string
Timeout time.Duration
}
+
+// TriageParams holds the optional query parameters of Client.Triage.
+type TriageParams struct {
+ // Window is sent as the "window" query parameter.
+ Window string
+ // Snapshot, when true, adds "snapshot=true" to the request.
+ Snapshot bool
+}
+
+// TriageReport is an alias for pkgtriage.Report, the payload decoded by
+// Client.Triage and rendered by RenderTriage.
+type TriageReport = pkgtriage.Report
diff --git a/internal/ingest/triage_route_test.go b/internal/ingest/triage_route_test.go
new file mode 100644
index 0000000..de1f4a7
--- /dev/null
+++ b/internal/ingest/triage_route_test.go
@@ -0,0 +1,92 @@
+package ingest_test
+
+// Integration test for Task 11: verifies that the /v1/triage/{id} route is
+// dispatched to the triage handler when wired into a ServeMux the same way
+// cmd/ingest/main.go wires it. The Server type does not own this route
+// (cmd/ingest/main.go composes the mux directly), so this test reproduces
+// the exact mount pattern with stubbed triage dependencies.
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+ "github.com/sssmaran/WaylogCLI/internal/triagehttp"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// TestTriageRouteDispatchesToHandler mounts the triage handler on a ServeMux
+// under "/v1/triage/" with stubbed engine dependencies and checks that a GET
+// for /v1/triage/inc_abc returns 200 with a JSON content type.
+func TestTriageRouteDispatchesToHandler(t *testing.T) {
+ eng, err := triage.NewEngine(triage.Deps{
+ Incidents: stubTriageIncidents{},
+ Blast: stubTriageBlast{},
+ Story: stubTriageStory{},
+ Signals: stubTriageSignals{},
+ NextChecks: stubTriageNextChecks{},
+ Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+ })
+ if err != nil {
+ t.Fatalf("NewEngine: %v", err)
+ }
+ h := triagehttp.NewHandler(eng)
+
+ // Mirror cmd/ingest/main.go: mux.Handle("/v1/triage/", readCORS(h.Triage)).
+ // We omit auth here because the auth wrapper is exercised elsewhere; this
+ // test verifies the dispatch wiring (path → handler).
+ mux := http.NewServeMux()
+ mux.Handle("/v1/triage/", http.HandlerFunc(h.Triage))
+
+ srv := httptest.NewServer(mux)
+ t.Cleanup(srv.Close)
+
+ resp, err := http.Get(srv.URL + "/v1/triage/inc_abc")
+ if err != nil {
+ t.Fatalf("GET: %v", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode == http.StatusNotFound {
+ t.Fatalf("route not registered (404)")
+ }
+ if resp.StatusCode != http.StatusOK {
+ t.Fatalf("status = %d, want 200", resp.StatusCode)
+ }
+ if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "json") {
+ t.Fatalf("Content-Type = %q, want json", ct)
+ }
+}
+
+// --- stub dependencies ---
+
+// stubTriageIncidents is an incident lookup stub that echoes the requested id
+// with a fixed "15m" window and medium confidence.
+type stubTriageIncidents struct{}
+
+func (stubTriageIncidents) GetIncident(_ context.Context, id string) (triage.IncidentSummary, error) {
+ return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+// stubTriageBlast is a blast query stub that always returns a zero-value
+// result and no error.
+type stubTriageBlast struct{}
+
+func (stubTriageBlast) BlastSnapshot(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+ return triage.BlastSnapshotResult{}, nil
+}
+
+// stubTriageStory is a story builder stub that always returns a zero-value
+// result and no error.
+type stubTriageStory struct{}
+
+func (stubTriageStory) FirstFailureStory(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) (triage.FirstFailureResult, error) {
+ return triage.FirstFailureResult{}, nil
+}
+
+// stubTriageSignals is a signal query stub that reports no signals.
+type stubTriageSignals struct{}
+
+func (stubTriageSignals) SignalsFor(_ context.Context, _ triage.IncidentSummary, _ triage.BuildOptions) ([]triage.SignalEvidence, error) {
+ return nil, nil
+}
+
+// stubTriageNextChecks is a next-checks stub that reports no checks.
+type stubTriageNextChecks struct{}
+
+func (stubTriageNextChecks) NextChecks(_ context.Context, _ triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+ return nil, nil
+}
diff --git a/internal/tools/triage.go b/internal/tools/triage.go
new file mode 100644
index 0000000..7ad969d
--- /dev/null
+++ b/internal/tools/triage.go
@@ -0,0 +1,57 @@
+package tools
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+// triageInputSchema is the JSON Schema advertised as the input of the
+// triage_incident tool: a required incident_id plus optional window and
+// snapshot fields.
+const triageInputSchema = `{
+ "type": "object",
+ "required": ["incident_id"],
+ "properties": {
+ "incident_id": {"type": "string"},
+ "window": {"type": "string", "description": "Go duration string, default 15m"},
+ "snapshot": {"type": "boolean", "description": "Freeze evaluation bounds to incident.started_at..updated_at"}
+ }
+}`
+
+// triageOutputSchema is the JSON Schema advertised as the output of the
+// triage_incident tool. It only declares an object and points at
+// pkg/triage.Report for the field-level shape.
+const triageOutputSchema = `{
+ "type": "object",
+ "description": "TriageReport v1; see pkg/triage.Report for the full Go struct."
+}`
+
+// RegisterTriageTool registers the "triage_incident" tool on reg and returns
+// the error from reg.Register.
+//
+// The tool handler decodes incident_id, window and snapshot from the JSON
+// params, derives build options with triage.ParseBuildOptions (using
+// time.Now() as the reference time) and returns the result of engine.Build.
+// The Store argument passed to the handler is not used. Errors produced while
+// validating the input are prefixed with "triage_incident:".
+func RegisterTriageTool(reg *Registry, engine *triage.Engine) error {
+	return reg.Register(Tool{
+		Name:         "triage_incident",
+		Description:  "Build a deterministic TriageReport for an open incident.",
+		Version:      "triage.v1",
+		InputSchema:  json.RawMessage(triageInputSchema),
+		OutputSchema: json.RawMessage(triageOutputSchema),
+		Examples: []string{
+			`{"incident_id":"inc_01HX...","window":"15m"}`,
+			`{"incident_id":"inc_01HX...","snapshot":true}`,
+		},
+		Handler: func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+			var p struct {
+				IncidentID string `json:"incident_id"`
+				Window     string `json:"window"`
+				Snapshot   bool   `json:"snapshot"`
+			}
+			if err := json.Unmarshal(params, &p); err != nil {
+				return nil, fmt.Errorf("triage_incident: bad params: %w", err)
+			}
+			if p.IncidentID == "" {
+				return nil, fmt.Errorf("triage_incident: incident_id required")
+			}
+			opts, err := triage.ParseBuildOptions(p.Window, p.Snapshot, time.Now())
+			if err != nil {
+				// Same prefix as the other input errors; %w keeps the cause
+				// matchable with errors.Is / errors.As.
+				return nil, fmt.Errorf("triage_incident: %w", err)
+			}
+			return engine.Build(ctx, p.IncidentID, opts)
+		},
+	})
+}
diff --git a/internal/tools/triage_test.go b/internal/tools/triage_test.go
new file mode 100644
index 0000000..560e802
--- /dev/null
+++ b/internal/tools/triage_test.go
@@ -0,0 +1,92 @@
+package tools_test
+
+import (
+ "context"
+ "encoding/json"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/tools"
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// TestRegisterTriageToolListsTool checks that RegisterTriageTool makes the
+// "triage_incident" tool discoverable through the registry.
+func TestRegisterTriageToolListsTool(t *testing.T) {
+ reg := tools.NewRegistry()
+ eng := newStubEngine(t)
+ if err := tools.RegisterTriageTool(reg, eng); err != nil {
+ t.Fatalf("register: %v", err)
+ }
+ if _, ok := reg.Tool("triage_incident"); !ok {
+ t.Fatalf("triage_incident not registered")
+ }
+}
+
+// TestTriageToolHandlerReturnsReport calls the registered tool through the
+// registry and checks that it yields a *pkgtriage.Report for the requested
+// incident id.
+func TestTriageToolHandlerReturnsReport(t *testing.T) {
+ reg := tools.NewRegistry()
+ eng := newStubEngine(t)
+ if err := tools.RegisterTriageTool(reg, eng); err != nil {
+ t.Fatalf("register: %v", err)
+ }
+ params := json.RawMessage(`{"incident_id":"inc_abc","window":"15m","snapshot":false}`)
+ out, err := reg.Call(context.Background(), nil /* graph store unused by triage */, "triage_incident", params)
+ if err != nil {
+ t.Fatalf("call: %v", err)
+ }
+ rep, ok := out.(*pkgtriage.Report)
+ if !ok {
+ t.Fatalf("expected *pkgtriage.Report, got %T", out)
+ }
+ if rep.IncidentRef.ID != "inc_abc" {
+ t.Fatalf("wrong incident id: %q", rep.IncidentRef.ID)
+ }
+}
+
+// newStubEngine wires a triage.Engine with stub deps that always succeed.
+// We duplicate the stubs inline to avoid creating a separate `triagetest` helper package for M1.
+func newStubEngine(t *testing.T) *triage.Engine {
+ t.Helper()
+ deps := triage.Deps{
+ Incidents: triageStubIncidents{},
+ Blast: triageStubBlast{},
+ Story: triageStubStory{},
+ Signals: triageStubSignals{},
+ NextChecks: triageStubNextChecks{},
+ Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+ }
+ eng, err := triage.NewEngine(deps)
+ if err != nil {
+ t.Fatalf("new engine: %v", err)
+ }
+ return eng
+}
+
+// triageStubIncidents is an incident lookup stub that echoes the requested id
+// with a fixed "15m" window and medium confidence.
+type triageStubIncidents struct{}
+
+func (triageStubIncidents) GetIncident(ctx context.Context, id string) (triage.IncidentSummary, error) {
+ return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+// triageStubBlast is a blast query stub that always returns a zero-value
+// result and no error.
+type triageStubBlast struct{}
+
+func (triageStubBlast) BlastSnapshot(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+ return triage.BlastSnapshotResult{}, nil
+}
+
+// triageStubStory is a story builder stub that always returns a zero-value
+// result and no error.
+type triageStubStory struct{}
+
+func (triageStubStory) FirstFailureStory(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) (triage.FirstFailureResult, error) {
+ return triage.FirstFailureResult{}, nil
+}
+
+// triageStubSignals is a signal query stub that reports no signals.
+type triageStubSignals struct{}
+
+func (triageStubSignals) SignalsFor(ctx context.Context, inc triage.IncidentSummary, opts triage.BuildOptions) ([]triage.SignalEvidence, error) {
+ return nil, nil
+}
+
+// triageStubNextChecks is a next-checks stub that reports no checks.
+type triageStubNextChecks struct{}
+
+func (triageStubNextChecks) NextChecks(ctx context.Context, inc triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+ return nil, nil
+}
diff --git a/internal/triage/adapter.go b/internal/triage/adapter.go
new file mode 100644
index 0000000..8188bd9
--- /dev/null
+++ b/internal/triage/adapter.go
@@ -0,0 +1,272 @@
+package triage
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "strconv"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// Upstream collaborator interfaces. Defined narrowly so adapters are testable
+// without instantiating real engines/stores. Production wiring (Task 11)
+// satisfies these with *incidents.Engine (Get / BlastRadius+Errors), the
+// signal store, and a closure over (*core.Graph, *tracestore.Store).
+
+// IncidentReader returns a single incident by ID. *incidents.Engine satisfies
+// this via its Get method.
+type IncidentReader interface {
+ Get(ctx context.Context, id string) (incidents.Incident, error)
+}
+
+// BlastReader exposes the read-side queries the blast adapter needs. The
+// production reader passed to *incidents.Engine (incidents.Reader) satisfies
+// this directly because the method signatures are identical.
+type BlastReader interface {
+ BlastRadius(f incidents.SearchFilter, key apiv2.BlastKey) apiv2.BlastRadiusResponse
+ Errors(f incidents.SearchFilter, limit int) incidents.ErrorsResult
+}
+
+// SignalStore is the read surface of internal/signals.Store the adapter calls.
+type SignalStore interface {
+ Query(ctx context.Context, f signals.Filter) ([]signals.Signal, error)
+}
+
+// StoryBuildFunc renders the public-shape trace story for a given trace ID.
+// Production wiring closes over *ingestv2.Reader and calls
+// Reader.TraceStoryByTraceID. Tests inject a stub directly. The bool return
+// is the "found" indicator: when false, the adapter returns an empty result
+// without erroring.
+type StoryBuildFunc func(traceID string) (apiv2.StoryResponse, bool)
+
+// ----- adapter implementations -----
+
+// defaultWindowLabel is the Window value GetIncident sets on every
+// IncidentSummary it returns.
+const defaultWindowLabel = "15m"
+
+// incidentLookupAdapter implements IncidentLookup on top of an IncidentReader.
+type incidentLookupAdapter struct{ r IncidentReader }
+
+// NewIncidentLookupAdapter wraps r so it can be used as an IncidentLookup.
+func NewIncidentLookupAdapter(r IncidentReader) IncidentLookup {
+ return incidentLookupAdapter{r: r}
+}
+
+// GetIncident loads the incident with the given id from the wrapped reader
+// and converts it to an IncidentSummary. incidents.ErrNotFound is reported as
+// ErrUnknownIncident (wrapped together with the id); any other reader error
+// is returned unchanged. Service, Step and ErrorCode are taken from the
+// incident's ErrorFamily, Window is always defaultWindowLabel, and
+// NextChecks is copied so the summary does not share the reader's slice.
+func (a incidentLookupAdapter) GetIncident(ctx context.Context, id string) (IncidentSummary, error) {
+ inc, err := a.r.Get(ctx, id)
+ if err != nil {
+ if errors.Is(err, incidents.ErrNotFound) {
+ return IncidentSummary{}, fmt.Errorf("%w: %s", ErrUnknownIncident, id)
+ }
+ return IncidentSummary{}, err
+ }
+ return IncidentSummary{
+ ID: inc.IncidentID,
+ Window: defaultWindowLabel,
+ Env: inc.Env,
+ StartedAt: inc.StartedAt,
+ UpdatedAt: inc.UpdatedAt,
+ Service: inc.ErrorFamily.Service,
+ Step: inc.ErrorFamily.Step,
+ ErrorCode: inc.ErrorFamily.ErrorCode,
+ Confidence: mapConfidence(inc.Confidence),
+ NextChecks: append([]string(nil), inc.NextChecks...),
+ }, nil
+}
+
+// mapConfidence translates an incidents.Confidence into its pkg/triage
+// equivalent. High and low map one-to-one; medium and every unrecognized
+// value both yield medium, so the produced report always passes Validate.
+func mapConfidence(c incidents.Confidence) pkgtriage.Confidence {
+	if c == incidents.ConfidenceHigh {
+		return pkgtriage.ConfidenceHigh
+	}
+	if c == incidents.ConfidenceLow {
+		return pkgtriage.ConfidenceLow
+	}
+	// incidents.ConfidenceMedium and any unknown value end up here.
+	return pkgtriage.ConfidenceMedium
+}
+
+// blastQueryAdapter implements BlastQuery on top of a BlastReader.
+type blastQueryAdapter struct{ r BlastReader }
+
+// NewBlastQueryAdapter wraps r so it can be used as a BlastQuery.
+func NewBlastQueryAdapter(r BlastReader) BlastQuery {
+ return blastQueryAdapter{r: r}
+}
+
+// BlastSnapshot computes the blast counters and top error families for inc.
+//
+// The evaluation range ends at opts.Now (or inc.UpdatedAt when opts.Now is
+// zero) and spans opts.Window (or defaultWindow when opts.Window is not
+// positive). The same service/error-code/time filter is used for the blast
+// radius query and for the error listing, which is capped at 5 rows.
+// ctx is not used, and the returned error is always nil.
+func (a blastQueryAdapter) BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error) {
+ end := opts.Now
+ if end.IsZero() {
+ end = inc.UpdatedAt
+ }
+ window := opts.Window
+ if window <= 0 {
+ window = defaultWindow
+ }
+ filter := incidents.SearchFilter{
+ Service: inc.Service,
+ ErrorCode: inc.ErrorCode,
+ Since: end.Add(-window),
+ Until: end,
+ }
+ br := a.r.BlastRadius(filter, apiv2.BlastKey{
+ Service: inc.Service,
+ Step: inc.Step,
+ ErrorCode: inc.ErrorCode,
+ })
+ // AffectedUsers is optional upstream; a nil pointer is reported as 0.
+ users := 0
+ if br.AffectedUsers != nil {
+ users = *br.AffectedUsers
+ }
+ rows := a.r.Errors(filter, 5).Rows
+ families := make([]pkgtriage.ErrorFamily, 0, len(rows))
+ for _, row := range rows {
+ families = append(families, pkgtriage.ErrorFamily{
+ Service: row.ErrorFamily.Service,
+ Step: row.ErrorFamily.Step,
+ ErrorCode: row.ErrorFamily.ErrorCode,
+ Count: row.Count,
+ })
+ }
+ return BlastSnapshotResult{
+ Requests: br.AffectedRequests,
+ Users: users,
+ Services: br.AffectedServices,
+ TopErrorFamilies: families,
+ }, nil
+}
+
+// storyBuilderAdapter implements StoryBuilder by combining an incident
+// reader with a story-build function.
+type storyBuilderAdapter struct {
+ // r is used to look up the incident's SampleTraces.
+ r IncidentReader
+ // build renders the story for the selected trace ID.
+ build StoryBuildFunc
+}
+
+// NewStoryBuilderAdapter wraps an upstream incident reader (to discover the
+// first-failure trace ID) and a story-build function (production: closure
+// over tracestory.BuildWithTraceStore). The trace selected is the first
+// SampleTraces entry on the underlying incident; if none exists, returns an
+// empty result rather than erroring (M1).
+func NewStoryBuilderAdapter(r IncidentReader, build StoryBuildFunc) StoryBuilder {
+ return storyBuilderAdapter{r: r, build: build}
+}
+
+// FirstFailureStory builds the first-failure story for inc.
+//
+// It re-reads the incident by inc.ID, takes the first entry of its
+// SampleTraces and passes it to the build function. The story is returned
+// JSON-encoded in Payload together with a single TraceSample whose TraceID
+// comes from the story response. An empty result with a nil error is
+// returned when the incident is not found, has no sample traces, or the
+// build function reports the trace as not found. Other reader errors and
+// JSON marshalling failures are returned as errors.
+func (a storyBuilderAdapter) FirstFailureStory(ctx context.Context, inc IncidentSummary, _ BuildOptions) (FirstFailureResult, error) {
+ upstream, err := a.r.Get(ctx, inc.ID)
+ if err != nil {
+ if errors.Is(err, incidents.ErrNotFound) {
+ return FirstFailureResult{}, nil
+ }
+ return FirstFailureResult{}, err
+ }
+ if len(upstream.SampleTraces) == 0 {
+ return FirstFailureResult{}, nil
+ }
+ traceID := upstream.SampleTraces[0]
+ resp, ok := a.build(traceID)
+ if !ok {
+ return FirstFailureResult{}, nil
+ }
+ payload, err := json.Marshal(resp)
+ if err != nil {
+ return FirstFailureResult{}, fmt.Errorf("triage: marshal story: %w", err)
+ }
+ summary := storySummary(resp, inc)
+ return FirstFailureResult{
+ Payload: payload,
+ SampleTraces: []pkgtriage.TraceSample{{TraceID: resp.TraceID, Summary: summary}},
+ }, nil
+}
+
+// storySummary produces the one-line summary attached to a sample trace.
+//
+// It prefers fields from the story itself: "service/step/code" when all three
+// are present, then "service code", then "service failure", then the bare
+// error code (step and code are read from s.Anchor when it is non-nil). If
+// the story supplies neither a service nor an error code, the same
+// "service/step/code" and "service code" forms are tried with the incident's
+// fields, and "first failure" is the final fallback.
+func storySummary(s apiv2.StoryResponse, inc IncidentSummary) string {
+ svc := s.Service
+ step := ""
+ code := ""
+ if s.Anchor != nil {
+ step = s.Anchor.Step
+ code = s.Anchor.ErrorCode
+ }
+ switch {
+ case svc != "" && step != "" && code != "":
+ return svc + "/" + step + "/" + code
+ case svc != "" && code != "":
+ return svc + " " + code
+ case svc != "":
+ return svc + " failure"
+ case code != "":
+ return code
+ }
+ if inc.Service != "" && inc.Step != "" && inc.ErrorCode != "" {
+ return inc.Service + "/" + inc.Step + "/" + inc.ErrorCode
+ }
+ if inc.Service != "" && inc.ErrorCode != "" {
+ return inc.Service + " " + inc.ErrorCode
+ }
+ return "first failure"
+}
+
+// signalQueryAdapter implements SignalQuery on top of a SignalStore.
+type signalQueryAdapter struct{ s SignalStore }
+
+// NewSignalQueryAdapter wraps s so it can be used as a SignalQuery.
+func NewSignalQueryAdapter(s SignalStore) SignalQuery {
+ return signalQueryAdapter{s: s}
+}
+
+// SignalsFor returns the signals recorded in the incident's environment
+// during the evaluation range, at most 200 of them.
+//
+// The range ends at opts.Now (or inc.UpdatedAt when opts.Now is zero) and
+// spans opts.Window (or defaultWindow when opts.Window is not positive).
+// Each signal becomes a SignalEvidence whose EvidenceIDs holds just the
+// signal's own ID. signals.ErrUnavailable is treated as "no signals" and
+// yields (nil, nil); any other store error is returned.
+func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
+ end := opts.Now
+ if end.IsZero() {
+ end = inc.UpdatedAt
+ }
+ window := opts.Window
+ if window <= 0 {
+ window = defaultWindow
+ }
+ // Mirror incidents.Engine.querySignals: filter by env+window only. A
+ // service filter would drop cross-service evidence (e.g. a payment
+ // dependency signal on a checkout incident).
+ rows, err := a.s.Query(ctx, signals.Filter{
+ Env: inc.Env,
+ Since: end.Add(-window),
+ Until: end,
+ Limit: 200,
+ })
+ if err != nil {
+ if errors.Is(err, signals.ErrUnavailable) {
+ return nil, nil
+ }
+ return nil, err
+ }
+ out := make([]SignalEvidence, 0, len(rows))
+ for _, sig := range rows {
+ out = append(out, SignalEvidence{
+ ID: sig.SignalID,
+ Type: string(sig.Type),
+ EvidenceIDs: []string{sig.SignalID},
+ })
+ }
+ return out, nil
+}
+
+// nextChecksAdapter implements NextChecksProvider without any state; it only
+// reshapes the NextChecks already present on the IncidentSummary.
+type nextChecksAdapter struct{}
+
+// NewNextChecksAdapter returns a passthrough that converts the incident's
+// own NextChecks list (already populated by the incidents engine via
+// internal/incidents.NextChecks(cause, confidence)) into the typed
+// NextCheckSpec entries the report consumes. Stable IDs of the form
+// check_<index>, where index is the zero-based position of the prompt,
+// keep the report deterministic across runs.
+func NewNextChecksAdapter() NextChecksProvider {
+ return nextChecksAdapter{}
+}
+
+// NextChecks converts inc.NextChecks into NextCheckSpec values, giving each
+// prompt the ID "check_" followed by its zero-based position in the list.
+// An empty list yields a nil slice. The returned error is always nil.
+func (nextChecksAdapter) NextChecks(_ context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	prompts := inc.NextChecks
+	if len(prompts) == 0 {
+		return nil, nil
+	}
+	specs := make([]NextCheckSpec, len(prompts))
+	for idx := range prompts {
+		specs[idx] = NextCheckSpec{ID: "check_" + strconv.Itoa(idx), Prompt: prompts[idx]}
+	}
+	return specs, nil
+}
diff --git a/internal/triage/adapter_test.go b/internal/triage/adapter_test.go
new file mode 100644
index 0000000..2e678d6
--- /dev/null
+++ b/internal/triage/adapter_test.go
@@ -0,0 +1,454 @@
+package triage_test
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// ----- IncidentLookupAdapter -----
+
+// fakeIncidentReader is a canned incident reader: Get returns err when it is
+// set, otherwise inc, regardless of the requested id.
+type fakeIncidentReader struct {
+ inc incidents.Incident
+ err error
+}
+
+func (f fakeIncidentReader) Get(_ context.Context, _ string) (incidents.Incident, error) {
+ if f.err != nil {
+ return incidents.Incident{}, f.err
+ }
+ return f.inc, nil
+}
+
+func TestIncidentLookupAdapter_MapsFamilyFields(t *testing.T) {
+ started := time.Date(2026, 5, 6, 11, 0, 0, 0, time.UTC)
+ updated := time.Date(2026, 5, 6, 11, 5, 0, 0, time.UTC)
+ reader := fakeIncidentReader{inc: incidents.Incident{
+ IncidentID: "inc_abc",
+ Env: "demo",
+ StartedAt: started,
+ UpdatedAt: updated,
+ Service: "payment",
+ ErrorFamily: apiv2.ErrorFamily{
+ Service: "payment",
+ Step: "payment.charge",
+ ErrorCode: "PMT_502",
+ },
+ Confidence: incidents.ConfidenceHigh,
+ NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+ }}
+ a := triage.NewIncidentLookupAdapter(reader)
+ got, err := a.GetIncident(context.Background(), "inc_abc")
+ if err != nil {
+ t.Fatalf("GetIncident: %v", err)
+ }
+ if got.ID != "inc_abc" {
+ t.Fatalf("ID = %q, want inc_abc", got.ID)
+ }
+ if got.Env != "demo" {
+ t.Fatalf("Env = %q, want demo", got.Env)
+ }
+ if !got.StartedAt.Equal(started) {
+ t.Fatalf("StartedAt = %v, want %v", got.StartedAt, started)
+ }
+ if !got.UpdatedAt.Equal(updated) {
+ t.Fatalf("UpdatedAt = %v, want %v", got.UpdatedAt, updated)
+ }
+ if got.Service != "payment" || got.Step != "payment.charge" || got.ErrorCode != "PMT_502" {
+ t.Fatalf("family fields = %+v", got)
+ }
+ if got.Window != "15m" {
+ t.Fatalf("Window default = %q, want 15m", got.Window)
+ }
+ if got.Confidence != pkgtriage.ConfidenceHigh {
+ t.Fatalf("Confidence = %q, want high", got.Confidence)
+ }
+ wantChecks := []string{"Verify payment-service health", "Check recent deploys"}
+ if len(got.NextChecks) != len(wantChecks) {
+ t.Fatalf("NextChecks len = %d, want %d (%+v)", len(got.NextChecks), len(wantChecks), got.NextChecks)
+ }
+ for i := range wantChecks {
+ if got.NextChecks[i] != wantChecks[i] {
+ t.Fatalf("NextChecks[%d] = %q, want %q", i, got.NextChecks[i], wantChecks[i])
+ }
+ }
+}
+
+// TestIncidentLookupAdapter_ConfidenceMapping checks that each incidents
+// confidence level maps to its pkg/triage counterpart and that an unknown
+// value falls back to medium.
+func TestIncidentLookupAdapter_ConfidenceMapping(t *testing.T) {
+ cases := []struct {
+ in incidents.Confidence
+ want pkgtriage.Confidence
+ }{
+ {incidents.ConfidenceHigh, pkgtriage.ConfidenceHigh},
+ {incidents.ConfidenceMedium, pkgtriage.ConfidenceMedium},
+ {incidents.ConfidenceLow, pkgtriage.ConfidenceLow},
+ {incidents.Confidence("nonsense"), pkgtriage.ConfidenceMedium},
+ }
+ for _, tc := range cases {
+ reader := fakeIncidentReader{inc: incidents.Incident{
+ IncidentID: "inc_abc",
+ Confidence: tc.in,
+ }}
+ a := triage.NewIncidentLookupAdapter(reader)
+ got, err := a.GetIncident(context.Background(), "inc_abc")
+ if err != nil {
+ t.Fatalf("GetIncident(%q): %v", tc.in, err)
+ }
+ if got.Confidence != tc.want {
+ t.Fatalf("Confidence(%q) = %q, want %q", tc.in, got.Confidence, tc.want)
+ }
+ }
+}
+
+// TestIncidentLookupAdapter_NextChecksDefensiveCopy checks that the summary's
+// NextChecks slice does not alias the slice held by the source incident.
+func TestIncidentLookupAdapter_NextChecksDefensiveCopy(t *testing.T) {
+ original := []string{"a", "b"}
+ reader := fakeIncidentReader{inc: incidents.Incident{
+ IncidentID: "inc_abc",
+ NextChecks: original,
+ }}
+ a := triage.NewIncidentLookupAdapter(reader)
+ got, err := a.GetIncident(context.Background(), "inc_abc")
+ if err != nil {
+ t.Fatalf("GetIncident: %v", err)
+ }
+ // Mutating the original slice must not affect the summary's copy.
+ original[0] = "MUTATED"
+ if got.NextChecks[0] != "a" {
+ t.Fatalf("NextChecks copy must be defensive, got %q after mutation", got.NextChecks[0])
+ }
+}
+
+// TestIncidentLookupAdapter_NotFoundIsErrUnknown checks that
+// incidents.ErrNotFound from the reader surfaces as triage.ErrUnknownIncident.
+func TestIncidentLookupAdapter_NotFoundIsErrUnknown(t *testing.T) {
+ a := triage.NewIncidentLookupAdapter(fakeIncidentReader{err: incidents.ErrNotFound})
+ if _, err := a.GetIncident(context.Background(), "missing"); !errors.Is(err, triage.ErrUnknownIncident) {
+ t.Fatalf("err = %v, want ErrUnknownIncident", err)
+ }
+}
+
+// ----- BlastQueryAdapter -----
+
+// fakeBlastReader is a canned blast reader: BlastRadius returns br and Errors
+// returns rows, both ignoring the filter arguments.
+type fakeBlastReader struct {
+ br apiv2.BlastRadiusResponse
+ rows []apiv2.ErrorRow
+}
+
+func (f fakeBlastReader) BlastRadius(_ incidents.SearchFilter, _ apiv2.BlastKey) apiv2.BlastRadiusResponse {
+ return f.br
+}
+
+func (f fakeBlastReader) Errors(_ incidents.SearchFilter, _ int) incidents.ErrorsResult {
+ return incidents.ErrorsResult{Rows: f.rows}
+}
+
+// TestBlastQueryAdapter_MapsCountsAndTopFamilies checks that the adapter
+// copies the request/user/service counts from the blast radius response and
+// converts the error rows into top error families in the same order.
+func TestBlastQueryAdapter_MapsCountsAndTopFamilies(t *testing.T) {
+	users := 8
+	reader := fakeBlastReader{
+		br: apiv2.BlastRadiusResponse{
+			AffectedRequests: 12,
+			AffectedUsers:    &users,
+			AffectedServices: 4,
+		},
+		rows: []apiv2.ErrorRow{
+			{ErrorFamily: apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502"}, Count: 11},
+			{ErrorFamily: apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_503"}, Count: 3},
+		},
+	}
+	a := triage.NewBlastQueryAdapter(reader)
+	inc := triage.IncidentSummary{
+		ID: "inc_abc", Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502",
+		UpdatedAt: time.Date(2026, 5, 6, 11, 5, 0, 0, time.UTC),
+	}
+	// Fail loudly if the options cannot be parsed instead of running the
+	// adapter with zero-value options.
+	opts, err := triage.ParseBuildOptions("15m", true, time.Now())
+	if err != nil {
+		t.Fatalf("ParseBuildOptions: %v", err)
+	}
+	opts.Now = inc.UpdatedAt
+
+	got, err := a.BlastSnapshot(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("BlastSnapshot: %v", err)
+	}
+	if got.Requests != 12 || got.Users != 8 || got.Services != 4 {
+		t.Fatalf("counts = %+v", got)
+	}
+	if len(got.TopErrorFamilies) != 2 {
+		t.Fatalf("top families = %d, want 2", len(got.TopErrorFamilies))
+	}
+	if got.TopErrorFamilies[0].ErrorCode != "PMT_502" || got.TopErrorFamilies[0].Count != 11 {
+		t.Fatalf("first family = %+v", got.TopErrorFamilies[0])
+	}
+}
+
+// TestBlastQueryAdapter_NilUsersBecomesZero checks that a nil AffectedUsers
+// pointer in the blast radius response is reported as zero users.
+func TestBlastQueryAdapter_NilUsersBecomesZero(t *testing.T) {
+	reader := fakeBlastReader{br: apiv2.BlastRadiusResponse{AffectedRequests: 1, AffectedUsers: nil}}
+	a := triage.NewBlastQueryAdapter(reader)
+	inc := triage.IncidentSummary{Service: "x", UpdatedAt: time.Now()}
+	// Fail loudly if the options cannot be parsed instead of running the
+	// adapter with zero-value options.
+	opts, err := triage.ParseBuildOptions("15m", false, time.Now())
+	if err != nil {
+		t.Fatalf("ParseBuildOptions: %v", err)
+	}
+	got, err := a.BlastSnapshot(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("BlastSnapshot: %v", err)
+	}
+	if got.Users != 0 {
+		t.Fatalf("Users = %d, want 0 when AffectedUsers is nil", got.Users)
+	}
+}
+
+// ----- StoryBuilderAdapter -----
+
+// fakeIncForStory is a canned incident reader for the story adapter tests:
+// Get always returns inc with a nil error.
+type fakeIncForStory struct{ inc incidents.Incident }
+
+func (f fakeIncForStory) Get(_ context.Context, _ string) (incidents.Incident, error) {
+ return f.inc, nil
+}
+
+// TestStoryBuilderAdapter_UsesFirstSampleTrace checks that the adapter builds
+// the story for the incident's first sample trace, reports that trace as the
+// single sample, and returns the story as a JSON object payload.
+func TestStoryBuilderAdapter_UsesFirstSampleTrace(t *testing.T) {
+	traceID := "abc123"
+	wantStory := apiv2.StoryResponse{
+		TraceID: traceID,
+		Service: "payment",
+		Anchor:  &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"},
+		Linkage: "trace_id",
+	}
+
+	called := false
+	build := func(tid string) (apiv2.StoryResponse, bool) {
+		called = true
+		if tid != traceID {
+			t.Fatalf("build called with %q, want %q", tid, traceID)
+		}
+		return wantStory, true
+	}
+
+	incReader := fakeIncForStory{inc: incidents.Incident{
+		IncidentID:   "inc_abc",
+		SampleTraces: []string{traceID, "other"},
+		Service:      "payment",
+		ErrorFamily:  apiv2.ErrorFamily{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502"},
+	}}
+	a := triage.NewStoryBuilderAdapter(incReader, build)
+	inc := triage.IncidentSummary{ID: "inc_abc"}
+	// Fail loudly if the options cannot be parsed instead of running the
+	// adapter with zero-value options.
+	opts, err := triage.ParseBuildOptions("15m", false, time.Now())
+	if err != nil {
+		t.Fatalf("ParseBuildOptions: %v", err)
+	}
+
+	got, err := a.FirstFailureStory(context.Background(), inc, opts)
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if !called {
+		t.Fatalf("build func was not called")
+	}
+	if len(got.SampleTraces) != 1 || got.SampleTraces[0].TraceID != traceID {
+		t.Fatalf("sample traces = %+v", got.SampleTraces)
+	}
+	// Payload should be a non-empty JSON object that decodes to the public
+	// StoryResponse shape.
+	if len(got.Payload) == 0 || got.Payload[0] != '{' {
+		t.Fatalf("payload not JSON object: %s", string(got.Payload))
+	}
+	var decoded map[string]any
+	if err := json.Unmarshal(got.Payload, &decoded); err != nil {
+		t.Fatalf("payload unmarshal: %v", err)
+	}
+	if decoded["trace_id"] != traceID {
+		t.Fatalf("payload.trace_id = %v, want %q", decoded["trace_id"], traceID)
+	}
+}
+
+// TestStoryBuilderAdapter_NoSampleTraceReturnsEmptyResult: an incident with no sample traces must never reach the build func.
+func TestStoryBuilderAdapter_NoSampleTraceReturnsEmptyResult(t *testing.T) {
+	mustNotBuild := func(string) (apiv2.StoryResponse, bool) {
+		t.Fatalf("build should not be called when no sample trace")
+		return apiv2.StoryResponse{}, false
+	}
+	adapter := triage.NewStoryBuilderAdapter(fakeIncForStory{inc: incidents.Incident{IncidentID: "inc_abc"}}, mustNotBuild)
+	res, err := adapter.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if len(res.SampleTraces) != 0 {
+		t.Fatalf("expected no sample traces, got %+v", res.SampleTraces)
+	}
+}
+
+// TestStoryBuilderAdapter_StoryNotFoundReturnsEmpty covers a build func that
+// reports ok=false (no matching trace): the adapter must hand back an empty
+// result without erroring.
+func TestStoryBuilderAdapter_StoryNotFoundReturnsEmpty(t *testing.T) {
+	notFound := func(string) (apiv2.StoryResponse, bool) {
+		return apiv2.StoryResponse{}, false
+	}
+
+	incident := incidents.Incident{IncidentID: "inc_abc", SampleTraces: []string{"missing"}}
+	adapter := triage.NewStoryBuilderAdapter(fakeIncForStory{inc: incident}, notFound)
+
+	res, err := adapter.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+	if len(res.Payload) != 0 || len(res.SampleTraces) != 0 {
+		t.Fatalf("expected empty result for not-found story, got %+v", res)
+	}
+}
+
+// TestStoryBuilderAdapterPayloadHasReadAPIFields verifies the FirstFailure
+// payload uses the public StoryResponse shape: the keys consumers see at
+// /v1/traces/story.
+func TestStoryBuilderAdapterPayloadHasReadAPIFields(t *testing.T) {
+	const traceID = "trace_demo"
+	story := apiv2.StoryResponse{
+		TraceID:    traceID,
+		Anchor:     &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"},
+		Path:       []apiv2.StoryStep{{Name: "payment.charge", StartMS: 0, DurationMS: 12}},
+		Logs:       []apiv2.StoryLog{{TsOffsetMS: 5, Msg: "boom"}},
+		Downstream: []apiv2.StoryDownstream{{Step: "payment.charge", Service: "payment", Endpoint: "/charge"}},
+		Linkage:    "trace_id",
+	}
+
+	incident := incidents.Incident{IncidentID: "inc_abc", SampleTraces: []string{traceID}}
+	adapter := triage.NewStoryBuilderAdapter(
+		fakeIncForStory{inc: incident},
+		func(string) (apiv2.StoryResponse, bool) { return story, true },
+	)
+
+	res, err := adapter.FirstFailureStory(context.Background(), triage.IncidentSummary{ID: "inc_abc"}, triage.BuildOptions{})
+	if err != nil {
+		t.Fatalf("FirstFailureStory: %v", err)
+	}
+
+	var payload map[string]any
+	if err := json.Unmarshal(res.Payload, &payload); err != nil {
+		t.Fatalf("payload unmarshal: %v", err)
+	}
+	wantKeys := []string{"trace_id", "anchor", "path", "logs", "downstream", "linkage"}
+	for _, key := range wantKeys {
+		if _, present := payload[key]; !present {
+			t.Fatalf("payload missing read-API key %q: %v", key, payload)
+		}
+	}
+}
+
+// ----- SignalQueryAdapter -----
+
+// fakeSignalStore records the filter it is queried with and replies with a canned result or error.
+type fakeSignalStore struct {
+	out []signals.Signal // returned when err is nil
+	err error            // returned, with a nil slice, when set
+	got signals.Filter   // filter from the most recent Query call
+}
+
+func (s *fakeSignalStore) Query(_ context.Context, filter signals.Filter) ([]signals.Signal, error) {
+	if s.got = filter; s.err != nil {
+		return nil, s.err
+	}
+	return s.out, nil
+}
+
+func TestSignalQueryAdapter_QueriesBroadByEnvWindowNotService(t *testing.T) {
+	// The adapter must mirror incidents.Engine.querySignals and filter by Env
+	// plus the time window only. Service is deliberately left unset so that
+	// cross-service dependency signals (e.g. a payment-service signal
+	// evidencing a checkout incident) are surfaced.
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	store := &fakeSignalStore{out: []signals.Signal{
+		{SignalID: "sig_1", Type: signals.TypeDeploy, Service: "payment"},
+		{SignalID: "sig_2", Type: signals.TypeDependency, Service: "payment"},
+	}}
+	incident := triage.IncidentSummary{
+		Service:   "checkout",
+		Env:       "demo",
+		UpdatedAt: now,
+	}
+	opts, _ := triage.ParseBuildOptions("15m", false, now)
+
+	sigs, err := triage.NewSignalQueryAdapter(store).SignalsFor(context.Background(), incident, opts)
+	if err != nil {
+		t.Fatalf("SignalsFor: %v", err)
+	}
+
+	// Inspect the filter the store was actually queried with.
+	filter := store.got
+	if filter.Service != "" {
+		t.Fatalf("filter.Service = %q, want empty (broad query)", filter.Service)
+	}
+	if filter.Env != "demo" {
+		t.Fatalf("filter.Env = %q, want demo", filter.Env)
+	}
+	if wantSince := now.Add(-15 * time.Minute); !filter.Since.Equal(wantSince) {
+		t.Fatalf("filter.Since = %v, want %v", filter.Since, wantSince)
+	}
+	if !filter.Until.Equal(now) {
+		t.Fatalf("filter.Until = %v, want %v", filter.Until, now)
+	}
+	if filter.Limit != 200 {
+		t.Fatalf("filter.Limit = %d, want 200", filter.Limit)
+	}
+
+	if len(sigs) != 2 {
+		t.Fatalf("got %d signals, want 2 (cross-service signals must be returned)", len(sigs))
+	}
+	if first := sigs[0]; first.ID != "sig_1" || first.Type != "deploy" {
+		t.Fatalf("first signal = %+v", first)
+	}
+	// Critical assertion for Fix 1: the payment-service dependency signal must
+	// be in the result even though inc.Service = checkout.
+	foundPaymentDep := false
+	for i := range sigs {
+		if sigs[i].ID == "sig_2" && sigs[i].Type == "dependency" {
+			foundPaymentDep = true
+		}
+	}
+	if !foundPaymentDep {
+		t.Fatalf("payment-service dependency signal dropped: got %+v", sigs)
+	}
+}
+
+func TestSignalQueryAdapter_UnavailableReturnsEmpty(t *testing.T) {
+	adapter := triage.NewSignalQueryAdapter(&fakeSignalStore{err: signals.ErrUnavailable}) // store reports it is unavailable
+	sigs, err := adapter.SignalsFor(context.Background(), triage.IncidentSummary{UpdatedAt: time.Now()}, triage.BuildOptions{Window: time.Minute})
+	if err != nil {
+		t.Fatalf("SignalsFor: %v", err)
+	}
+	if len(sigs) != 0 {
+		t.Fatalf("want empty when unavailable, got %+v", sigs)
+	}
+}
+
+// ----- NextChecksAdapter -----
+
+// TestNextChecksAdapter_ConsumesIncidentNextChecks checks that the incident's own prompts become check_<index> specs, in order.
+func TestNextChecksAdapter_ConsumesIncidentNextChecks(t *testing.T) {
+	incident := triage.IncidentSummary{
+		Service: "checkout", ErrorCode: "PMT_502",
+		NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+	}
+	checks, err := triage.NewNextChecksAdapter().NextChecks(context.Background(), incident)
+	if err != nil {
+		t.Fatalf("NextChecks: %v", err)
+	}
+	if len(checks) != 2 {
+		t.Fatalf("got %d checks, want 2: %+v", len(checks), checks)
+	}
+	if first := checks[0]; first.ID != "check_0" || first.Prompt != "Verify payment-service health" {
+		t.Fatalf("got[0] = %+v, want {check_0, Verify payment-service health}", first)
+	}
+	if second := checks[1]; second.ID != "check_1" || second.Prompt != "Check recent deploys" {
+		t.Fatalf("got[1] = %+v, want {check_1, Check recent deploys}", second)
+	}
+}
+
+// TestNextChecksAdapter_EmptyIncidentReturnsEmpty checks that an incident
+// carrying no NextChecks yields no specs, whatever its service or error code.
+func TestNextChecksAdapter_EmptyIncidentReturnsEmpty(t *testing.T) {
+	incident := triage.IncidentSummary{Service: "anything", ErrorCode: "XYZ_123"}
+	checks, err := triage.NewNextChecksAdapter().NextChecks(context.Background(), incident)
+	if err != nil {
+		t.Fatalf("NextChecks: %v", err)
+	}
+	if len(checks) != 0 {
+		t.Fatalf("expected no checks for empty NextChecks, got %+v", checks)
+	}
+}
diff --git a/internal/triage/engine.go b/internal/triage/engine.go
new file mode 100644
index 0000000..07f5a04
--- /dev/null
+++ b/internal/triage/engine.go
@@ -0,0 +1,139 @@
+package triage
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "time"
+
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+var ErrUnknownIncident = errors.New("triage: unknown incident") // sentinel for an incident ID with no match; test with errors.Is
+
+// IncidentSummary is the minimal incident shape this package needs.
+// Adapter types in the wiring layer convert from internal/incidents.Incident.
+type IncidentSummary struct {
+ ID string // copied into the report's incident_ref.id
+ Window string
+ Env string
+ StartedAt time.Time
+ UpdatedAt time.Time // becomes BuildOptions.Now when a snapshot build is requested
+ Service string
+ Step string
+ ErrorCode string
+ Confidence pkgtriage.Confidence // copied verbatim into the report
+ NextChecks []string
+}
+
+type BlastSnapshotResult struct { // what a BlastQuery returns; Build copies each field into the report's blast_snapshot
+ Requests int
+ Users int
+ Services int
+ TopErrorFamilies []pkgtriage.ErrorFamily
+}
+
+type FirstFailureResult struct { // what a StoryBuilder returns
+ Payload json.RawMessage // emitted as the report's first_failure; Build does not inspect it
+ SampleTraces []pkgtriage.TraceSample // emitted as the report's sample_traces
+}
+
+type SignalEvidence = pkgtriage.SignalRef // alias: signal entries use the public report type directly
+
+type NextCheckSpec = pkgtriage.NextCheck // alias: next-check entries use the public report type directly
+
+type IncidentLookup interface { // resolves an incident ID; Build returns its error to the caller unwrapped
+ GetIncident(ctx context.Context, id string) (IncidentSummary, error)
+}
+
+type BlastQuery interface { // supplies the request/user/service counts and top error families for the report
+ BlastSnapshot(ctx context.Context, inc IncidentSummary, opts BuildOptions) (BlastSnapshotResult, error)
+}
+
+type StoryBuilder interface { // supplies the first_failure payload and the sample traces
+ FirstFailureStory(ctx context.Context, inc IncidentSummary, opts BuildOptions) (FirstFailureResult, error)
+}
+
+type SignalQuery interface { // supplies the signals listed in the report
+ SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error)
+}
+
+type NextChecksProvider interface { // supplies the next checks; it receives the incident but not the build options
+ NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error)
+}
+
+type Deps struct { // collaborators for Engine; NewEngine rejects the set if any interface field is nil
+ Incidents IncidentLookup
+ Blast BlastQuery
+ Story StoryBuilder
+ Signals SignalQuery
+ NextChecks NextChecksProvider
+ Now func() time.Time // clock used to stamp generated_at; NewEngine defaults it to time.Now
+}
+
+// Engine assembles triage reports from the collaborators supplied in Deps.
+type Engine struct{ deps Deps }
+
+// NewEngine returns an Engine over d; every interface field must be set, and a nil Now falls back to time.Now.
+func NewEngine(d Deps) (*Engine, error) {
+	if d.Incidents == nil || d.Blast == nil || d.Story == nil || d.Signals == nil || d.NextChecks == nil {
+		return nil, errors.New("triage: NewEngine requires all dependencies")
+	}
+	if d.Now == nil {
+		d.Now = time.Now
+	}
+	return &Engine{deps: d}, nil
+}
+
+// Build assembles the triage report for incidentID from the blast, story,
+// signal and next-check collaborators, then hashes and validates it. A lookup
+// failure is returned unwrapped so callers can errors.Is on ErrUnknownIncident.
+func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions) (*pkgtriage.Report, error) {
+	inc, err := e.deps.Incidents.GetIncident(ctx, incidentID)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Snapshot {
+		opts.Now = inc.UpdatedAt // pin the build to the incident's last update
+	} else if opts.Now.IsZero() {
+		opts.Now = e.deps.Now() // zero-value options would otherwise anchor the window at year 1
+	}
+
+	blast, err := e.deps.Blast.BlastSnapshot(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: blast: %w", err)
+	}
+	story, err := e.deps.Story.FirstFailureStory(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: story: %w", err)
+	}
+	sigs, err := e.deps.Signals.SignalsFor(ctx, inc, opts)
+	if err != nil {
+		return nil, fmt.Errorf("triage: signals: %w", err)
+	}
+	checks, err := e.deps.NextChecks.NextChecks(ctx, inc)
+	if err != nil {
+		return nil, fmt.Errorf("triage: next_checks: %w", err)
+	}
+
+	r := &pkgtriage.Report{
+		SchemaVersion: pkgtriage.SchemaVersionV1,
+		IncidentRef:   pkgtriage.IncidentRef{ID: inc.ID, Window: opts.Window.String()},
+		BlastSnapshot: pkgtriage.BlastSnapshot{Requests: blast.Requests, Users: blast.Users, Services: blast.Services, TopErrorFamilies: blast.TopErrorFamilies},
+		FirstFailure:  story.Payload,
+		SampleTraces:  story.SampleTraces,
+		Signals:       sigs,
+		NextChecks:    checks,
+		Confidence:    inc.Confidence,
+		GeneratedAt:   e.deps.Now().UTC().Format(time.RFC3339Nano),
+	}
+
+	if r.ReportHash, err = r.CanonicalHash(); err != nil {
+		return nil, fmt.Errorf("triage: hash: %w", err)
+	}
+	if err := r.Validate(); err != nil {
+		return nil, fmt.Errorf("triage: produced invalid report: %w", err)
+	}
+	return r, nil
+}
diff --git a/internal/triage/engine_test.go b/internal/triage/engine_test.go
new file mode 100644
index 0000000..b316830
--- /dev/null
+++ b/internal/triage/engine_test.go
@@ -0,0 +1,317 @@
+package triage
+
+import (
+ "context"
+ "encoding/json"
+ "strings"
+ "testing"
+ "time"
+
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestNewEngineRequiresAllDeps(t *testing.T) { // a zero Deps leaves every collaborator nil, so NewEngine must refuse it
+ if _, err := NewEngine(Deps{}); err == nil {
+ t.Fatalf("expected error when deps are zero, got nil")
+ }
+}
+
+func TestEngineBuildReturnsErrorForUnknownIncident(t *testing.T) {
+	cfg := stubDeps()
+	cfg.Incidents = stubIncidentLookup{err: ErrUnknownIncident} // every lookup fails
+	engine, err := NewEngine(cfg)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	buildOpts, _ := ParseBuildOptions("", false, time.Now())
+	if _, buildErr := engine.Build(context.Background(), "inc_missing", buildOpts); buildErr == nil {
+		t.Fatalf("expected error for unknown incident")
+	}
+}
+
+// --- test helpers ---
+
+// stubIncidentLookup is an IncidentLookup that returns a zero summary together
+// with the configured error (nil by default).
+type stubIncidentLookup struct {
+	err error
+}
+
+func (s stubIncidentLookup) GetIncident(context.Context, string) (IncidentSummary, error) { return IncidentSummary{}, s.err }
+
+// stubBlastQuery is a BlastQuery that always reports an empty snapshot.
+type stubBlastQuery struct{}
+
+// BlastSnapshot ignores its arguments.
+func (stubBlastQuery) BlastSnapshot(context.Context, IncidentSummary, BuildOptions) (BlastSnapshotResult, error) { return BlastSnapshotResult{}, nil }
+
+// stubStoryBuilder is a StoryBuilder that always reports an empty story.
+type stubStoryBuilder struct{}
+
+// FirstFailureStory ignores its arguments.
+func (stubStoryBuilder) FirstFailureStory(context.Context, IncidentSummary, BuildOptions) (FirstFailureResult, error) { return FirstFailureResult{}, nil }
+
+// stubSignalQuery is a SignalQuery that never finds a signal.
+type stubSignalQuery struct{}
+
+// SignalsFor ignores its arguments.
+func (stubSignalQuery) SignalsFor(context.Context, IncidentSummary, BuildOptions) ([]SignalEvidence, error) { return nil, nil }
+
+// stubNextChecks is a NextChecksProvider that never suggests a check.
+type stubNextChecks struct{}
+
+// NextChecks ignores its arguments.
+func (stubNextChecks) NextChecks(context.Context, IncidentSummary) ([]NextCheckSpec, error) { return nil, nil }
+
+// stubDeps returns a complete Deps made of the no-op stubs and a fixed clock.
+func stubDeps() Deps {
+	fixedNow := time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC)
+	return Deps{
+		Incidents: stubIncidentLookup{}, Blast: stubBlastQuery{}, Story: stubStoryBuilder{},
+		Signals: stubSignalQuery{}, NextChecks: stubNextChecks{},
+		Now: func() time.Time { return fixedNow },
+	}
+}
+
+// richBlast is a BlastQuery returning a fixed, fully populated snapshot:
+// 12 requests, 8 users, 4 services and one PMT_502 error family.
+type richBlast struct{}
+
+func (richBlast) BlastSnapshot(context.Context, IncidentSummary, BuildOptions) (BlastSnapshotResult, error) {
+	families := []pkgtriage.ErrorFamily{
+		{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+	}
+	return BlastSnapshotResult{Requests: 12, Users: 8, Services: 4, TopErrorFamilies: families}, nil
+}
+
+// richStory is a StoryBuilder returning a fixed payload and one sample trace.
+type richStory struct{}
+
+func (richStory) FirstFailureStory(context.Context, IncidentSummary, BuildOptions) (FirstFailureResult, error) {
+	payload := json.RawMessage(`{"trace_id":"abc","first_failure":"payment.charge"}`)
+	samples := []pkgtriage.TraceSample{{TraceID: "abc", Summary: "payment 502"}}
+	return FirstFailureResult{Payload: payload, SampleTraces: samples}, nil
+}
+
+type richSignals struct{} // SignalQuery stub: one deploy signal with a single evidence ID
+
+func (richSignals) SignalsFor(context.Context, IncidentSummary, BuildOptions) ([]SignalEvidence, error) {
+	return []SignalEvidence{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}}, nil
+}
+
+type richNextChecks struct{} // NextChecksProvider stub: one fixed check, independent of the incident
+
+func (richNextChecks) NextChecks(context.Context, IncidentSummary) ([]NextCheckSpec, error) {
+	return []NextCheckSpec{{ID: "check_payment_health", Prompt: "Verify payment-service health"}}, nil
+}
+
+// richIncidents is an IncidentLookup that answers every ID with the same
+// high-confidence payment incident, echoing the requested ID back.
+type richIncidents struct{}
+
+func (richIncidents) GetIncident(_ context.Context, id string) (IncidentSummary, error) {
+	started := time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC)
+	return IncidentSummary{
+		ID: id, Window: "15m", Env: "demo", Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502",
+		StartedAt: started, UpdatedAt: started.Add(5 * time.Minute),
+		Confidence: pkgtriage.ConfidenceHigh, NextChecks: []string{"Verify payment-service health"},
+	}, nil
+}
+
+// TestEngineBuildAssemblesAllSections builds a report from the rich stubs and
+// checks that every section carries its collaborator's output.
+func TestEngineBuildAssemblesAllSections(t *testing.T) {
+	clock := func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) }
+	engine, err := NewEngine(Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{}, Now: clock,
+	})
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, clock())
+	report, err := engine.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+
+	// Section-by-section assertions on the assembled report.
+	if report.IncidentRef.Window != "15m0s" {
+		t.Fatalf("incident_ref.window should reflect opts.Window, got %q", report.IncidentRef.Window)
+	}
+	if report.BlastSnapshot.Requests != 12 {
+		t.Fatalf("blast.requests = %d, want 12", report.BlastSnapshot.Requests)
+	}
+	if len(report.SampleTraces) != 1 || report.SampleTraces[0].TraceID != "abc" {
+		t.Fatalf("sample_traces wrong: %+v", report.SampleTraces)
+	}
+	if len(report.Signals) != 1 || report.Signals[0].Type != "deploy" {
+		t.Fatalf("signals wrong: %+v", report.Signals)
+	}
+	if len(report.NextChecks) != 1 {
+		t.Fatalf("next_checks missing")
+	}
+	if report.Confidence != pkgtriage.ConfidenceHigh {
+		t.Fatalf("Confidence = %q, want high (must come from incident, not hard-coded medium)", report.Confidence)
+	}
+	if report.ReportHash == "" || !strings.HasPrefix(report.ReportHash, "sha256:") {
+		t.Fatalf("report_hash missing/invalid: %q", report.ReportHash)
+	}
+	if err := report.Validate(); err != nil {
+		t.Fatalf("produced report failed validation: %v", err)
+	}
+}
+
+// TestTriageReportFromDemoShape drives the engine end-to-end with the demo's
+// actual shape: a cross-service signal (a payment dependency on a checkout
+// incident), high confidence, and next checks supplied by the incident.
+// It is the regression gate for the four M1 fixes.
+func TestTriageReportFromDemoShape(t *testing.T) {
+	incident := stubGetIncident(IncidentSummary{
+		ID:      "inc_demo",
+		Window:  "15m",
+		Env:     "demo",
+		Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502",
+		StartedAt:  time.Date(2026, 5, 6, 0, 0, 0, 0, time.UTC),
+		UpdatedAt:  time.Date(2026, 5, 6, 0, 5, 0, 0, time.UTC),
+		Confidence: pkgtriage.ConfidenceHigh,
+		NextChecks: []string{"Verify payment-service health", "Check recent deploys"},
+	})
+	blast := stubBlastSnapshot{
+		out: BlastSnapshotResult{
+			Requests: 7, Users: 3, Services: 2,
+			TopErrorFamilies: []pkgtriage.ErrorFamily{
+				{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502", Count: 6},
+			},
+		},
+	}
+	// The story payload mirrors the apiv2.StoryResponse shape; the engine
+	// treats it as an opaque RawMessage.
+	story := stubStoryResult{
+		out: FirstFailureResult{
+			Payload: json.RawMessage(`{"trace_id":"t_demo","anchor":{"step":"payment.charge","error_code":"PMT_502"},"path":[],"logs":[],"downstream":[],"linkage":"trace_id"}`),
+			SampleTraces: []pkgtriage.TraceSample{
+				{TraceID: "t_demo", Summary: "checkout PMT_502"},
+			},
+		},
+	}
+	// Cross-service signal: the incident is on `checkout`, but the dependency
+	// signal comes from `payment`. Fix 1 ensures the broad query surfaces it.
+	signalStub := stubSignalsResult{
+		out: []SignalEvidence{
+			{ID: "sig_payment_dep", Type: "dependency", EvidenceIDs: []string{"sig_payment_dep"}},
+		},
+	}
+	// Fix 3: NextChecks must come from the incident, not from a static map
+	// keyed by service+code.
+	checkStub := stubNextChecksResult{}
+
+	deps := Deps{
+		Incidents:  incident,
+		Blast:      blast,
+		Story:      story,
+		Signals:    signalStub,
+		NextChecks: checkStub,
+		Now:        func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	engine, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+
+	report, err := engine.Build(context.Background(), "inc_demo", opts)
+	if err != nil {
+		t.Fatalf("Build: %v", err)
+	}
+	if report.IncidentRef.ID != "inc_demo" {
+		t.Fatalf("IncidentRef.ID = %q, want inc_demo", report.IncidentRef.ID)
+	}
+	foundFamily := false
+	for i := range report.BlastSnapshot.TopErrorFamilies {
+		if report.BlastSnapshot.TopErrorFamilies[i].ErrorCode == "PMT_502" {
+			foundFamily = true
+		}
+	}
+	if !foundFamily {
+		t.Fatalf("BlastSnapshot.TopErrorFamilies missing PMT_502: %+v", report.BlastSnapshot.TopErrorFamilies)
+	}
+	foundPaymentSig := false
+	for i := range report.Signals {
+		if report.Signals[i].ID == "sig_payment_dep" {
+			foundPaymentSig = true
+		}
+	}
+	if !foundPaymentSig {
+		t.Fatalf("Signals missing payment dependency signal: %+v", report.Signals)
+	}
+	if report.Confidence != pkgtriage.ConfidenceHigh {
+		t.Fatalf("Confidence = %q, want high", report.Confidence)
+	}
+	if len(report.NextChecks) != 2 {
+		t.Fatalf("NextChecks len = %d, want 2: %+v", len(report.NextChecks), report.NextChecks)
+	}
+	if first := report.NextChecks[0]; first.ID != "check_0" || first.Prompt != "Verify payment-service health" {
+		t.Fatalf("NextChecks[0] = %+v, want {check_0, Verify payment-service health}", first)
+	}
+	if second := report.NextChecks[1]; second.ID != "check_1" || second.Prompt != "Check recent deploys" {
+		t.Fatalf("NextChecks[1] = %+v, want {check_1, Check recent deploys}", second)
+	}
+	if report.ReportHash == "" {
+		t.Fatalf("ReportHash empty")
+	}
+}
+
+// --- additional stubs used by the demo-shape regression test ---
+
+// stubGetIncident is an IncidentLookup that answers every ID with itself,
+// converted back to an IncidentSummary.
+type stubGetIncident IncidentSummary
+
+func (s stubGetIncident) GetIncident(context.Context, string) (IncidentSummary, error) { return IncidentSummary(s), nil }
+
+// stubBlastSnapshot is a BlastQuery that returns its canned result.
+type stubBlastSnapshot struct{ out BlastSnapshotResult }
+
+// BlastSnapshot ignores its arguments.
+func (s stubBlastSnapshot) BlastSnapshot(context.Context, IncidentSummary, BuildOptions) (BlastSnapshotResult, error) { return s.out, nil }
+
+// stubStoryResult is a StoryBuilder that returns its canned result.
+type stubStoryResult struct{ out FirstFailureResult }
+
+// FirstFailureStory ignores its arguments.
+func (s stubStoryResult) FirstFailureStory(context.Context, IncidentSummary, BuildOptions) (FirstFailureResult, error) { return s.out, nil }
+
+// stubSignalsResult is a SignalQuery that returns its canned signals.
+type stubSignalsResult struct{ out []SignalEvidence }
+
+// SignalsFor ignores its arguments.
+func (s stubSignalsResult) SignalsFor(context.Context, IncidentSummary, BuildOptions) ([]SignalEvidence, error) { return s.out, nil }
+
+// stubNextChecksResult mirrors the production adapter: it turns
+// inc.NextChecks into NextCheckSpec entries with stable, index-based IDs.
+type stubNextChecksResult struct{}
+
+func (stubNextChecksResult) NextChecks(_ context.Context, inc IncidentSummary) ([]NextCheckSpec, error) {
+	specs := make([]NextCheckSpec, len(inc.NextChecks))
+	for idx := range inc.NextChecks {
+		specs[idx] = NextCheckSpec{ID: nextCheckID(idx), Prompt: inc.NextChecks[idx]}
+	}
+	return specs, nil
+}
+
+// nextCheckID returns the stable identifier for the i-th next check, e.g.
+// "check_0".
+func nextCheckID(i int) string {
+	return "check_" + itoa(i)
+}
+
+// itoa formats a non-negative index in base 10. It is hand-rolled because
+// this test file does not import strconv.
+func itoa(i int) string {
+	if i < 10 {
+		return string(rune('0' + i))
+	}
+	return itoa(i/10) + string(rune('0'+i%10))
+}
diff --git a/internal/triage/idempotency_test.go b/internal/triage/idempotency_test.go
new file mode 100644
index 0000000..9327295
--- /dev/null
+++ b/internal/triage/idempotency_test.go
@@ -0,0 +1,62 @@
+package triage
+
+import (
+ "context"
+ "testing"
+ "time"
+)
+
+func TestBuildIsIdempotentForSameInput(t *testing.T) {
+	// The clock advances on every call, so equal hashes prove generated_at is not hashed.
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		Now: func() time.Time { now = now.Add(time.Second); return now },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	opts, _ := ParseBuildOptions("15m", false, deps.Now())
+	r1, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build 1: %v", err)
+	}
+	r2, err := eng.Build(context.Background(), "inc_abc", opts)
+	if err != nil {
+		t.Fatalf("build 2: %v", err)
+	}
+	if r1.ReportHash != r2.ReportHash {
+		t.Fatalf("two builds should have identical report_hash, got %q vs %q", r1.ReportHash, r2.ReportHash)
+	}
+}
+
+func TestSnapshotModeUsesIncidentUpdatedAt(t *testing.T) {
+	deps := Deps{
+		Incidents: richIncidents{}, Blast: richBlast{}, Story: richStory{},
+		Signals: richSignals{}, NextChecks: richNextChecks{},
+		Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	}
+	eng, err := NewEngine(deps)
+	if err != nil {
+		t.Fatalf("new engine: %v", err)
+	}
+	wallClockOpts, _ := ParseBuildOptions("15m", false, deps.Now())
+	snapshotOpts, _ := ParseBuildOptions("15m", true, deps.Now())
+	wall, err := eng.Build(context.Background(), "inc_abc", wallClockOpts)
+	if err != nil {
+		t.Fatalf("wall build: %v", err)
+	}
+	snap, err := eng.Build(context.Background(), "inc_abc", snapshotOpts)
+	if err != nil {
+		t.Fatalf("snap build: %v", err)
+	}
+	// The point of this test: snapshot mode does not crash and yields a valid, hashed report.
+	if snap.ReportHash == "" || wall.ReportHash == "" {
+		t.Fatalf("hashes must be non-empty (snap=%q wall=%q)", snap.ReportHash, wall.ReportHash)
+	}
+	if snap.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("snap report missing incident ref")
+	}
+}
diff --git a/internal/triage/options.go b/internal/triage/options.go
new file mode 100644
index 0000000..1b30059
--- /dev/null
+++ b/internal/triage/options.go
@@ -0,0 +1,28 @@
+// Package triage builds the TriageReport for an incident.
+// The Report type is the public artifact (pkg/triage); this package is the orchestrator.
+package triage
+
+import (
+ "fmt"
+ "time"
+)
+
+const defaultWindow = 15 * time.Minute // look-back applied when no window is requested
+
+type BuildOptions struct {
+	Window   time.Duration // look-back span; reported as incident_ref.window
+	Snapshot bool          // when true, Engine.Build replaces Now with the incident's UpdatedAt
+	Now      time.Time     // reference instant for the build
+}
+
+// ParseBuildOptions builds BuildOptions from raw request values; an empty window selects defaultWindow.
+func ParseBuildOptions(window string, snapshot bool, now time.Time) (BuildOptions, error) {
+	if window == "" {
+		return BuildOptions{Window: defaultWindow, Snapshot: snapshot, Now: now}, nil
+	}
+	span, err := time.ParseDuration(window) // NOTE(review): zero and negative spans parse fine and pass through unchecked
+	if err != nil {
+		return BuildOptions{}, fmt.Errorf("triage: invalid window %q: %w", window, err)
+	}
+	return BuildOptions{Window: span, Snapshot: snapshot, Now: now}, nil
+}
diff --git a/internal/triage/options_test.go b/internal/triage/options_test.go
new file mode 100644
index 0000000..179dd49
--- /dev/null
+++ b/internal/triage/options_test.go
@@ -0,0 +1,50 @@
+package triage
+
+import (
+ "testing"
+ "time"
+)
+
+// TestBuildOptionsDefaults: an empty window selects 15m, snapshot stays off and Now passes through.
+func TestBuildOptionsDefaults(t *testing.T) {
+	now := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	opts, err := ParseBuildOptions("", false, now)
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	switch {
+	case opts.Window != 15*time.Minute:
+		t.Fatalf("default window should be 15m, got %s", opts.Window)
+	case opts.Snapshot:
+		t.Fatalf("default snapshot should be false")
+	case !opts.Now.Equal(now):
+		t.Fatalf("Now should be passed through")
+	}
+}
+
+// TestBuildOptionsWindowParse checks that an explicit window string is parsed as a duration.
+func TestBuildOptionsWindowParse(t *testing.T) {
+	opts, err := ParseBuildOptions("30m", false, time.Now())
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if want := 30 * time.Minute; opts.Window != want {
+		t.Fatalf("got %s want 30m", opts.Window)
+	}
+}
+
+func TestBuildOptionsBadWindow(t *testing.T) { // "forever" is not a Go duration string, so parsing must fail
+ if _, err := ParseBuildOptions("forever", false, time.Now()); err == nil {
+ t.Fatalf("expected error for bad window")
+ }
+}
+
+// TestBuildOptionsSnapshotFlag checks that the snapshot argument is carried into the options.
+func TestBuildOptionsSnapshotFlag(t *testing.T) {
+	switch opts, err := ParseBuildOptions("15m", true, time.Now()); {
+	case err != nil:
+		t.Fatalf("parse: %v", err)
+	case !opts.Snapshot:
+		t.Fatalf("snapshot flag not honored")
+	}
+}
diff --git a/internal/triagehttp/handler.go b/internal/triagehttp/handler.go
new file mode 100644
index 0000000..0adc3c2
--- /dev/null
+++ b/internal/triagehttp/handler.go
@@ -0,0 +1,66 @@
+package triagehttp
+
+import (
+ "encoding/json"
+ "errors"
+ "net/http"
+ "strings"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+// Handler serves the triage report endpoint on top of a triage.Engine.
+type Handler struct {
+	engine *triage.Engine
+}
+
+// NewHandler wraps engine in a Handler.
+func NewHandler(engine *triage.Engine) *Handler { return &Handler{engine: engine} }
+
+// Triage serves GET /v1/triage/{incident_id} (optional window and snapshot query parameters) as JSON.
+func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		w.Header().Set("Allow", http.MethodGet) // a 405 response must advertise the supported methods
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	id := strings.Trim(strings.TrimPrefix(r.URL.Path, "/v1/triage/"), "/")
+	if id == "" {
+		writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
+		return
+	}
+	q := r.URL.Query()
+	opts, err := triage.ParseBuildOptions(q.Get("window"), q.Get("snapshot") == "true", time.Now())
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_options", err.Error(), "")
+		return
+	}
+	rep, err := h.engine.Build(r.Context(), id, opts)
+	switch {
+	case errors.Is(err, triage.ErrUnknownIncident):
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, "triage_build_failed", err.Error(), "")
+	default:
+		writeJSON(w, http.StatusOK, rep)
+	}
+}
+
+// writeJSON sends v as a JSON body with the given status code.
+func writeJSON(w http.ResponseWriter, status int, v any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	// The status line has already been written, so an encode failure cannot
+	// be reported to the client; it is deliberately dropped.
+	_ = json.NewEncoder(w).Encode(v)
+}
+
+// writeError sends the error envelope
+//
+//	{"error": {"code": ..., "message": ..., "detail": ...}}
+//
+// with the given status code. detail is emitted even when it is empty.
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+	writeJSON(w, status, map[string]any{"error": map[string]string{"code": code, "message": message, "detail": detail}})
+}
diff --git a/internal/triagehttp/handler_test.go b/internal/triagehttp/handler_test.go
new file mode 100644
index 0000000..fba414e
--- /dev/null
+++ b/internal/triagehttp/handler_test.go
@@ -0,0 +1,141 @@
+package triagehttp_test
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+ "github.com/sssmaran/WaylogCLI/internal/triagehttp"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// TestTriageHandlerReturnsReport checks that a GET for a known incident answers 200
+// with a report whose incident_ref.id echoes the path ID.
+func TestTriageHandlerReturnsReport(t *testing.T) {
+	handler := triagehttp.NewHandler(newTriageEngineForHandler(t))
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc", nil)
+	rec := httptest.NewRecorder()
+	handler.Triage(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200; body=%s", rec.Code, rec.Body.String())
+	}
+	var report pkgtriage.Report
+	if err := json.Unmarshal(rec.Body.Bytes(), &report); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if report.IncidentRef.ID != "inc_abc" {
+		t.Fatalf("got id %q want inc_abc", report.IncidentRef.ID)
+	}
+}
+
+// TestTriageHandlerHonorsSnapshotQuery checks that the snapshot and window
+// query parameters are accepted and the request still succeeds.
+func TestTriageHandlerHonorsSnapshotQuery(t *testing.T) {
+	handler := triagehttp.NewHandler(newTriageEngineForHandler(t))
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc?snapshot=true&window=30m", nil)
+	rec := httptest.NewRecorder()
+	handler.Triage(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d body=%s", rec.Code, rec.Body.String())
+	}
+}
+
+// TestTriageHandlerRejectsMissingID checks that a path with no incident ID is answered with 400.
+func TestTriageHandlerRejectsMissingID(t *testing.T) {
+	handler := triagehttp.NewHandler(newTriageEngineForHandler(t))
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/", nil)
+	rec := httptest.NewRecorder()
+	handler.Triage(rec, req)
+
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("expected 400 for missing id, got %d", rec.Code)
+	}
+}
+
+// TestTriageHandlerRejectsNonGET checks that a POST is answered with 405.
+func TestTriageHandlerRejectsNonGET(t *testing.T) {
+	handler := triagehttp.NewHandler(newTriageEngineForHandler(t))
+	req := httptest.NewRequest(http.MethodPost, "/v1/triage/inc_abc", nil)
+	rec := httptest.NewRecorder()
+	handler.Triage(rec, req)
+
+	if rec.Code != http.StatusMethodNotAllowed {
+		t.Fatalf("expected 405 for POST, got %d", rec.Code)
+	}
+}
+
+// TestTriageHandlerUnknownIncidentIsNotFound checks that ErrUnknownIncident from the lookup becomes a 404.
+func TestTriageHandlerUnknownIncidentIsNotFound(t *testing.T) {
+	handler := triagehttp.NewHandler(newTriageEngineForHandlerWithIncidents(t, handlerUnknownIncidents{}))
+	req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_missing", nil)
+	rec := httptest.NewRecorder()
+	handler.Triage(rec, req)
+
+	if rec.Code != http.StatusNotFound {
+		t.Fatalf("expected 404 for unknown incident, got %d; body=%s", rec.Code, rec.Body.String())
+	}
+}
+
+// newTriageEngineForHandler builds an engine whose collaborators are all
+// stubs and whose incident lookup always succeeds.
+func newTriageEngineForHandler(t *testing.T) *triage.Engine {
+	return newTriageEngineForHandlerWithIncidents(t, handlerStubIncidents{})
+}
+
+// newTriageEngineForHandlerWithIncidents builds the same stubbed engine
+// around a caller-chosen incident lookup and a fixed clock; it fails the test
+// if the engine cannot be constructed.
+func newTriageEngineForHandlerWithIncidents(t *testing.T, incidents triage.IncidentLookup) *triage.Engine {
+	t.Helper()
+	engine, err := triage.NewEngine(triage.Deps{
+		Incidents: incidents, Blast: handlerStubBlast{}, Story: handlerStubStory{},
+		Signals: handlerStubSignals{}, NextChecks: handlerStubNextChecks{},
+		Now: func() time.Time { return time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC) },
+	})
+	if err != nil {
+		t.Fatalf("engine: %v", err)
+	}
+	return engine
+}
+
+type handlerStubIncidents struct{} // resolves every ID to a medium-confidence incident with a 15m window
+
+func (handlerStubIncidents) GetIncident(_ context.Context, id string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{ID: id, Window: "15m", Confidence: pkgtriage.ConfidenceMedium}, nil
+}
+
+type handlerUnknownIncidents struct{} // fails every lookup with triage.ErrUnknownIncident
+
+func (handlerUnknownIncidents) GetIncident(context.Context, string) (triage.IncidentSummary, error) {
+	return triage.IncidentSummary{}, triage.ErrUnknownIncident
+}
+
+type handlerStubBlast struct{} // always reports an empty blast snapshot
+
+func (handlerStubBlast) BlastSnapshot(context.Context, triage.IncidentSummary, triage.BuildOptions) (triage.BlastSnapshotResult, error) {
+	return triage.BlastSnapshotResult{}, nil
+}
+
+type handlerStubStory struct{} // always reports an empty first-failure story
+
+func (handlerStubStory) FirstFailureStory(context.Context, triage.IncidentSummary, triage.BuildOptions) (triage.FirstFailureResult, error) {
+	return triage.FirstFailureResult{}, nil
+}
+
+type handlerStubSignals struct{} // never finds a signal
+
+func (handlerStubSignals) SignalsFor(context.Context, triage.IncidentSummary, triage.BuildOptions) ([]triage.SignalEvidence, error) {
+	return nil, nil
+}
+
+type handlerStubNextChecks struct{} // never suggests a check
+
+func (handlerStubNextChecks) NextChecks(context.Context, triage.IncidentSummary) ([]triage.NextCheckSpec, error) {
+	return nil, nil
+}
diff --git a/pkg/triage/report.go b/pkg/triage/report.go
new file mode 100644
index 0000000..388933d
--- /dev/null
+++ b/pkg/triage/report.go
@@ -0,0 +1,102 @@
+// Package triage exposes the TriageReport schema. Experimental: report shape may change until triage.v2.
+package triage
+
+import (
+ "crypto/sha256"
+ "encoding/hex"
+ "encoding/json"
+ "fmt"
+)
+
+// Confidence is the confidence label carried in a Report's "confidence" field.
+type Confidence string
+
+// The Confidence values that Report.Validate accepts.
+const (
+ ConfidenceLow Confidence = "low"
+ ConfidenceMedium Confidence = "medium"
+ ConfidenceHigh Confidence = "high"
+)
+
+// Report is the triage report document. Wire names are given by the json
+// tags; fields tagged omitempty are left out of the encoded JSON when empty.
+// GeneratedAt, PlanRunID, and ReportHash are the fields CanonicalHash blanks
+// before hashing.
+// NOTE(review): ReportHash presumably stores the CanonicalHash result; confirm
+// against the code that populates it.
+type Report struct {
+ SchemaVersion string `json:"schema_version"`
+ IncidentRef IncidentRef `json:"incident_ref"`
+ BlastSnapshot BlastSnapshot `json:"blast_snapshot"`
+ FirstFailure json.RawMessage `json:"first_failure,omitempty"`
+ SampleTraces []TraceSample `json:"sample_traces,omitempty"`
+ Signals []SignalRef `json:"signals,omitempty"`
+ NextChecks []NextCheck `json:"next_checks,omitempty"`
+ Confidence Confidence `json:"confidence"`
+ GeneratedAt string `json:"generated_at"`
+ PlanRunID string `json:"plan_run_id,omitempty"`
+ ReportHash string `json:"report_hash"`
+}
+
+// IncidentRef carries the incident ID and window string a Report refers to.
+// Report.Validate requires ID to be non-empty; Window is not validated.
+type IncidentRef struct {
+ ID string `json:"id"`
+ Window string `json:"window"`
+}
+
+// BlastSnapshot holds the request, user, and service counts plus the top error
+// families encoded under "blast_snapshot". TopErrorFamilies has no omitempty
+// tag, so a nil slice is encoded as JSON null rather than being dropped.
+type BlastSnapshot struct {
+ Requests int `json:"requests"`
+ Users int `json:"users"`
+ Services int `json:"services"`
+ TopErrorFamilies []ErrorFamily `json:"top_error_families"`
+}
+
+// ErrorFamily is one (service, step, error_code) group together with its count.
+type ErrorFamily struct {
+ Service string `json:"service"`
+ Step string `json:"step"`
+ ErrorCode string `json:"error_code"`
+ Count int `json:"count"`
+}
+
+// TraceSample pairs a trace ID with a summary string for that trace.
+type TraceSample struct {
+ TraceID string `json:"trace_id"`
+ Summary string `json:"summary"`
+}
+
+// SignalRef references a signal by ID and type and lists the evidence IDs
+// attached to it. EvidenceIDs has no omitempty tag, so a nil slice is encoded
+// as JSON null.
+type SignalRef struct {
+ ID string `json:"id"`
+ Type string `json:"type"`
+ EvidenceIDs []string `json:"evidence_ids"`
+}
+
+// NextCheck is one suggested follow-up, identified by ID with a prompt string.
+type NextCheck struct {
+ ID string `json:"id"`
+ Prompt string `json:"prompt"`
+}
+
+// SchemaVersionV1 is the only schema_version value Report.Validate accepts.
+const SchemaVersionV1 = "triage.v1"
+
+// Validate checks the fields a triage.v1 report must carry and returns an
+// error describing the first check that fails, or nil when all pass. In order
+// it requires: schema_version equal to SchemaVersionV1, a non-empty
+// incident_ref.id, a confidence of low, medium, or high, and a non-empty
+// generated_at. No other field, including report_hash, is inspected.
+func (r *Report) Validate() error {
+ if got := r.SchemaVersion; got != SchemaVersionV1 {
+ return fmt.Errorf("triage: schema_version must be %q, got %q", SchemaVersionV1, got)
+ }
+ if len(r.IncidentRef.ID) == 0 {
+ return fmt.Errorf("triage: incident_ref.id required")
+ }
+ knownConfidence := r.Confidence == ConfidenceLow ||
+ r.Confidence == ConfidenceMedium ||
+ r.Confidence == ConfidenceHigh
+ if !knownConfidence {
+ return fmt.Errorf("triage: confidence must be low|medium|high, got %q", r.Confidence)
+ }
+ if len(r.GeneratedAt) == 0 {
+ return fmt.Errorf("triage: generated_at required")
+ }
+ return nil
+}
+
+// CanonicalHash returns "sha256:" followed by the lowercase hex SHA-256 digest
+// of the report's JSON encoding. The digest is computed over a copy of the
+// report in which generated_at, plan_run_id, and report_hash are set to the
+// empty string, so those three fields never influence the result.
+// Two reports built from the same upstream state produce the same hash.
+// An error is returned only when JSON marshalling fails.
+func (r *Report) CanonicalHash() (string, error) {
+ // Work on a struct copy; only string fields are reassigned, so the
+ // receiver is left unmodified.
+ clone := *r
+ clone.GeneratedAt = ""
+ clone.PlanRunID = ""
+ clone.ReportHash = ""
+ raw, err := json.Marshal(&clone)
+ if err != nil {
+ return "", fmt.Errorf("triage: canonical marshal: %w", err)
+ }
+ sum := sha256.Sum256(raw)
+ return "sha256:" + hex.EncodeToString(sum[:]), nil
+}
diff --git a/pkg/triage/report_test.go b/pkg/triage/report_test.go
new file mode 100644
index 0000000..f4f575f
--- /dev/null
+++ b/pkg/triage/report_test.go
@@ -0,0 +1,137 @@
+package triage_test
+
+import (
+ "encoding/json"
+ "strings"
+ "testing"
+
+ "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// TestReportJSONRoundTrip marshals a populated Report, unmarshals it again,
+// and spot-checks that schema_version, the first top error family's
+// error_code, and confidence survive the round trip.
+func TestReportJSONRoundTrip(t *testing.T) {
+ in := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_test", Window: "15m"},
+ BlastSnapshot: triage.BlastSnapshot{
+ Requests: 12, Users: 8, Services: 4,
+ TopErrorFamilies: []triage.ErrorFamily{
+ {Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
+ },
+ },
+ Signals: []triage.SignalRef{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}},
+ NextChecks: []triage.NextCheck{{ID: "check_1", Prompt: "verify x"}},
+ Confidence: triage.ConfidenceMedium,
+ GeneratedAt: "2026-05-06T00:00:00Z",
+ ReportHash: "sha256:abc",
+ }
+ raw, err := json.Marshal(&in)
+ if err != nil {
+ t.Fatalf("marshal: %v", err)
+ }
+ var out triage.Report
+ if err := json.Unmarshal(raw, &out); err != nil {
+ t.Fatalf("unmarshal: %v", err)
+ }
+ if out.SchemaVersion != in.SchemaVersion {
+ t.Fatalf("schema_version mismatch: got %q want %q", out.SchemaVersion, in.SchemaVersion)
+ }
+ if out.BlastSnapshot.TopErrorFamilies[0].ErrorCode != "PMT_502" {
+ t.Fatalf("top_error_families round-trip lost data: %+v", out.BlastSnapshot.TopErrorFamilies)
+ }
+ if out.Confidence != triage.ConfidenceMedium {
+ t.Fatalf("confidence mismatch: got %q", out.Confidence)
+ }
+}
+
+// TestReportValidate checks that a minimal well-formed report passes Validate
+// and that each table case, which breaks exactly one validated field, is
+// rejected.
+func TestReportValidate(t *testing.T) {
+ good := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_x"},
+ Confidence: triage.ConfidenceMedium,
+ GeneratedAt: "2026-05-06T00:00:00Z",
+ ReportHash: "sha256:x",
+ }
+ if err := good.Validate(); err != nil {
+ t.Fatalf("good report failed validation: %v", err)
+ }
+
+ cases := map[string]triage.Report{
+ "missing schema_version": {IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+ "wrong schema_version": {SchemaVersion: "triage.v2", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+ "missing incident id": {SchemaVersion: "triage.v1", Confidence: triage.ConfidenceLow, GeneratedAt: "t", ReportHash: "h"},
+ "bad confidence": {SchemaVersion: "triage.v1", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: "extreme", GeneratedAt: "t", ReportHash: "h"},
+ "missing generated_at": {SchemaVersion: "triage.v1", IncidentRef: triage.IncidentRef{ID: "inc_x"}, Confidence: triage.ConfidenceLow, ReportHash: "h"},
+ }
+ for name, r := range cases {
+ t.Run(name, func(t *testing.T) {
+ if err := r.Validate(); err == nil {
+ t.Fatalf("%s: expected validation error, got nil", name)
+ }
+ })
+ }
+}
+
+// TestCanonicalHashExcludesGeneratedAtPlanRunIDAndReportHash verifies that two
+// reports differing only in generated_at, plan_run_id, and report_hash yield
+// the same CanonicalHash.
+func TestCanonicalHashExcludesGeneratedAtPlanRunIDAndReportHash(t *testing.T) {
+ a := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_1"},
+ Confidence: triage.ConfidenceMedium,
+ GeneratedAt: "2026-05-06T00:00:00Z",
+ ReportHash: "sha256:placeholder",
+ }
+ hashA, err := a.CanonicalHash()
+ if err != nil {
+ t.Fatalf("hash a: %v", err)
+ }
+
+ // b is a copy of a with only the three excluded fields changed.
+ b := a
+ b.GeneratedAt = "2099-01-01T00:00:00Z"
+ b.PlanRunID = "plan_other"
+ b.ReportHash = "sha256:something_else"
+ hashB, err := b.CanonicalHash()
+ if err != nil {
+ t.Fatalf("hash b: %v", err)
+ }
+
+ if hashA != hashB {
+ t.Fatalf("CanonicalHash must exclude generated_at, plan_run_id, report_hash. got %q vs %q", hashA, hashB)
+ }
+}
+
+// TestCanonicalHashChangesWhenContentChanges verifies that changing
+// incident_ref.id, a field covered by the hash, produces a different digest.
+// Hash errors are checked explicitly so a marshal failure is reported as such
+// instead of showing up as two equal empty hashes.
+func TestCanonicalHashChangesWhenContentChanges(t *testing.T) {
+ base := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_1"},
+ Confidence: triage.ConfidenceMedium,
+ GeneratedAt: "t",
+ ReportHash: "h",
+ }
+ h1, err := base.CanonicalHash()
+ if err != nil {
+ t.Fatalf("hash base: %v", err)
+ }
+
+ mutated := base
+ mutated.IncidentRef.ID = "inc_2"
+ h2, err := mutated.CanonicalHash()
+ if err != nil {
+ t.Fatalf("hash mutated: %v", err)
+ }
+ if h1 == h2 {
+ t.Fatalf("hash must change when incident_ref.id changes")
+ }
+}
+
+// TestCanonicalHashFormat verifies the hash string shape: a "sha256:" prefix
+// followed by 64 characters (the hex form of a 32-byte digest).
+func TestCanonicalHashFormat(t *testing.T) {
+ r := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_1"},
+ Confidence: triage.ConfidenceLow,
+ GeneratedAt: "t",
+ ReportHash: "h",
+ }
+ h, err := r.CanonicalHash()
+ if err != nil {
+ t.Fatalf("hash: %v", err)
+ }
+ if !strings.HasPrefix(h, "sha256:") {
+ t.Fatalf("hash should be prefixed with sha256:, got %q", h)
+ }
+ if len(h) != len("sha256:")+64 {
+ t.Fatalf("hash length wrong: got %d (%q)", len(h), h)
+ }
+}
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index c8dd8f2..1629b72 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -54,9 +54,13 @@ type incident struct {
Status string `json:"status"`
}
+// triageReport decodes only the report_hash field of a triage report; all
+// other JSON fields are ignored.
+type triageReport struct {
+ ReportHash string `json:"report_hash"`
+}
+
func main() {
if len(os.Args) != 2 {
- fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json ")
+ fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json ")
os.Exit(2)
}
@@ -85,6 +89,8 @@ func main() {
}
case "first-incident-id":
fmt.Println(firstIncidentID(body))
+ case "triage-report-hash":
+ fmt.Println(triageReportHash(body))
default:
fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
os.Exit(2)
@@ -183,3 +189,11 @@ func isPaymentFamily(f errorFamily) bool {
f.Step == "payment.charge" &&
f.ErrorCode == "PMT_502"
}
+
+func triageReportHash(body []byte) string {
+ var rep triageReport
+ if err := json.Unmarshal(body, &rep); err != nil {
+ return ""
+ }
+ return rep.ReportHash
+}
diff --git a/scripts/demo-acceptance-json/main_test.go b/scripts/demo-acceptance-json/main_test.go
index 7b3dfd6..8619a77 100644
--- a/scripts/demo-acceptance-json/main_test.go
+++ b/scripts/demo-acceptance-json/main_test.go
@@ -21,3 +21,13 @@ func TestDependencyIncidentHelpers(t *testing.T) {
t.Fatalf("firstIncidentID = %q, want inc_123", got)
}
}
+
+// TestTriageReportHash checks that triageReportHash returns the report_hash of
+// a well-formed report and the empty string for malformed JSON.
+func TestTriageReportHash(t *testing.T) {
+ body := []byte(`{"schema_version":"triage.v1","incident_ref":{"id":"inc_x"},"confidence":"medium","generated_at":"t","report_hash":"sha256:deadbeef"}`)
+ if got := triageReportHash(body); got != "sha256:deadbeef" {
+ t.Fatalf("triageReportHash = %q, want sha256:deadbeef", got)
+ }
+ if got := triageReportHash([]byte(`{not-json`)); got != "" {
+ t.Fatalf("malformed input should return empty, got %q", got)
+ }
+}
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index 590b315..b40edb1 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -43,6 +43,10 @@ json_first_incident_id() {
"$JSON_BIN" first-incident-id
}
+# Runs the JSON helper's triage-report-hash command; callers feed the triage
+# report JSON on stdin and capture the printed report_hash.
+json_triage_report_hash() {
+ "$JSON_BIN" triage-report-hash
+}
+
if [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
fail "demo stack is not running. Start it with: make demo"
fi
@@ -131,4 +135,15 @@ snapshot="$("${CLI[@]}" incident "$incident_id" --snapshot)" || fail "waylog inc
[[ "$snapshot" == *"payment.charge"* ]] || fail "incident snapshot did not mention payment.charge"
echo "PASS: waylog incident snapshot"
+triage_a="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage failed for incident $incident_id"
+hash_a="$(json_triage_report_hash <<<"$triage_a")"
+[[ -n "$hash_a" ]] || fail "triage report_hash A is empty"
+
+triage_b="$("${CLI[@]}" --json triage "$incident_id" --snapshot)" || fail "waylog triage second run failed for incident $incident_id"
+hash_b="$(json_triage_report_hash <<<"$triage_b")"
+[[ -n "$hash_b" ]] || fail "triage report_hash B is empty"
+
+[[ "$hash_a" == "$hash_b" ]] || fail "triage report_hash unstable across runs: A=$hash_a B=$hash_b"
+echo "PASS: waylog triage stable report_hash=$hash_a"
+
echo "Demo acceptance passed."
From a605ec85d109986d4907b540294cbf33be61cb79 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Thu, 7 May 2026 19:29:06 -0400
Subject: [PATCH 06/14] feat: added incident rebuild and provider credibility
layer
Shipped the M2 credibility layer for incident triage.
- refactored incident ticking into derive/apply paths
- added startup-only hot-window incident rebuild from schema-2.0 WAL
- added atomic ReplaceNonResolved for incident stores
- preserve live tick per-row behavior while rebuild uses atomic replacement
- added rebuild metrics and max-event safety cap
- added runtime incident cause classification and next checks
- added provider-neutral LLM selection with explicit none mode
- make Ask missing-provider errors provider-agnostic
- expose llm and incidents rebuild state in /v1/capabilities
- updated README, env docs, and OpenAPI for M2 provider/rebuild fields
- add regression coverage for rebuild, runtime cause, provider selection, and capabilities
---
README.md | 9 +-
cmd/ingest/main.go | 114 ++++++++---
docs/env.md | 16 +-
docs/openapi.yaml | 35 ++++
internal/cli/root.go | 28 +--
internal/coldstore/incident_store.go | 41 +++-
internal/coldstore/incident_store_test.go | 77 ++++++++
internal/incidents/classifier.go | 24 +++
internal/incidents/classifier_test.go | 138 ++++++++++++++
internal/incidents/engine.go | 219 +++++++++++++++++++++-
internal/incidents/engine_test.go | 130 +++++++++++++
internal/incidents/nextchecks.go | 7 +
internal/incidents/rebuild.go | 40 ++++
internal/incidents/store.go | 15 ++
internal/incidents/types.go | 1 +
internal/ingest/handler.go | 198 ++++++++++---------
internal/ingest/handler_test.go | 175 +++++++++++++++++
internal/llm/provider.go | 75 ++++++++
internal/llm/provider_test.go | 160 ++++++++++++++++
internal/metrics/metrics.go | 24 ++-
20 files changed, 1381 insertions(+), 145 deletions(-)
create mode 100644 internal/incidents/rebuild.go
create mode 100644 internal/llm/provider.go
create mode 100644 internal/llm/provider_test.go
diff --git a/README.md b/README.md
index e5b7712..66893ff 100644
--- a/README.md
+++ b/README.md
@@ -297,8 +297,9 @@ Public alpha. APIs may break before 1.0.
- hot graph with flattened 3-node model + dedicated trace store
- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
- SQLite cold store (events, deployments, signals, incidents, causal claims)
-- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, and dashboard incident cards
-- 10 deterministic analysis tools, rollup-correct root-cause attribution
+- signal-driven incident engine with `waylog incidents`, `waylog incident <id>`, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
+- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER`; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- 11 deterministic analysis tools, rollup-correct root-cause attribution
- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
- `/v1/traces/story` and indented failure-path rendering in the dashboard
- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
@@ -319,8 +320,8 @@ Public alpha. APIs may break before 1.0.
- OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
- Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
-- Signal and incident records are SQLite-backed; they do not use the event WAL/replay path.
-- Incident cause classification is deterministic and heuristic. `runtime` signals are accepted but do not produce a `runtime` cause label yet.
+- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
+- Incident cause classification is deterministic and heuristic.
- No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
- No multi-tenancy. One instance = one trust boundary.
- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 7d76092..579e3a6 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -29,6 +29,7 @@ import (
"github.com/sssmaran/WaylogCLI/internal/incidents"
"github.com/sssmaran/WaylogCLI/internal/ingest"
ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
+ "github.com/sssmaran/WaylogCLI/internal/llm"
"github.com/sssmaran/WaylogCLI/internal/mcp/stdio"
"github.com/sssmaran/WaylogCLI/internal/metrics"
otelhttp "github.com/sssmaran/WaylogCLI/internal/otel"
@@ -150,6 +151,10 @@ func main() {
causalInterval := config.GetenvDuration("CAUSAL_INTERVAL", 30*time.Second)
trustProxy := config.GetenvBool("WAYLOG_TRUST_PROXY", false)
+ if _, err := llm.SelectFromEnv(); err != nil {
+ slog.Error("LLM provider config error", "err", err)
+ os.Exit(1)
+ }
dedupCache := ingest.NewDedupCache()
planStore := ingest.NewPlanStore()
@@ -236,28 +241,31 @@ func main() {
// Create ingest server with the store
ingestServer := ingest.NewServer(ingest.ServerConfig{
- Store: graphStore,
- TraceStore: traceStore,
- MaxBodyBytes: maxBody,
- EventLogDir: eventLogDir,
- Metrics: m,
- StartTime: time.Now(),
- AskRegistry: reg,
- AskMaxStepsDefault: askMaxStepsDefault,
- AskMaxStepsMax: askMaxStepsMax,
- DashboardRefreshSec: dashboardRefreshSec,
- PrometheusURL: prometheusURL,
- GrafanaURL: grafanaURL,
- GraphUI: graphUI,
- DedupCache: dedupCache,
- AgentKey: agentKey,
- TrustProxy: trustProxy,
- ColdWriter: coldWriter,
- ColdStore: coldDB,
- PlanStore: planStore,
- GraphHotWindow: graphHotWindow,
- OTLPEnabled: otlpEnabled,
- V2ReadsEnabled: v2ReadsEnabled,
+ Store: graphStore,
+ TraceStore: traceStore,
+ MaxBodyBytes: maxBody,
+ EventLogDir: eventLogDir,
+ Metrics: m,
+ StartTime: time.Now(),
+ AskRegistry: reg,
+ AskMaxStepsDefault: askMaxStepsDefault,
+ AskMaxStepsMax: askMaxStepsMax,
+ DashboardRefreshSec: dashboardRefreshSec,
+ PrometheusURL: prometheusURL,
+ GrafanaURL: grafanaURL,
+ GraphUI: graphUI,
+ DedupCache: dedupCache,
+ AgentKey: agentKey,
+ TrustProxy: trustProxy,
+ ColdWriter: coldWriter,
+ ColdStore: coldDB,
+ PlanStore: planStore,
+ GraphHotWindow: graphHotWindow,
+ OTLPEnabled: otlpEnabled,
+ V2ReadsEnabled: v2ReadsEnabled,
+ IncidentsEnabled: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+ IncidentsPersistent: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+ IncidentRebuildSupported: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
})
// SSE hub for real-time dashboard updates
@@ -428,6 +436,66 @@ func main() {
slog.Error("incident engine bootstrap failed", "err", err)
os.Exit(1)
}
+ if config.GetenvBool("WAYLOG_REBUILD_INCIDENTS_ON_START", false) {
+ rebuildMaxEvents := config.GetenvInt("WAYLOG_INCIDENT_REBUILD_MAX_EVENTS", 250000)
+ if rebuildMaxEvents <= 0 {
+ rebuildMaxEvents = 250000
+ }
+ replayWindow := graphHotWindow
+ if minWindow := 2 * incidentCfg.Window; minWindow > replayWindow {
+ replayWindow = minWindow
+ }
+ replaySince := time.Now().UTC().Add(-replayWindow)
+ seed := incidentEngine.SnapshotActive()
+ for _, inc := range seed {
+ if inc.StartedAt.Before(replaySince) {
+ slog.Info("incident continuity broken: started_at older than WAL retention",
+ "incident_id", inc.IncidentID,
+ "started_at", inc.StartedAt,
+ "replay_since", replaySince,
+ )
+ break
+ }
+ }
+ tempIndex := ingestv2.NewRecentIndex(nil)
+ tempDedup := ingestv2.NewDedup(dedupCapacity, nil)
+ tempProjector := ingestv2.NewProjector(tempIndex)
+ replay, err := ingestv2.ReplayWAL(eventLogV2Dir, tempDedup, tempProjector, replaySince, m)
+ if err != nil {
+ m.IncidentRebuildFailures.Inc()
+ slog.Error("incident rebuild WAL replay failed", "err", err)
+ os.Exit(1)
+ }
+ m.IncidentRebuildReplayed.Add(float64(replay.Projected))
+ if replay.Projected > rebuildMaxEvents {
+ m.IncidentRebuildFailures.Inc()
+ slog.Error("incident rebuild replay exceeded max events", "projected", replay.Projected, "max_events", rebuildMaxEvents)
+ os.Exit(1)
+ }
+ if replay.Projected == 0 {
+ if len(seed) > 0 {
+ slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+ }
+ } else {
+ result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{
+ Engine: incidentEngine,
+ Reader: incidentReaderAdapter{reader: ingestv2.NewReader(tempIndex)},
+ Now: time.Now,
+ })
+ if err != nil {
+ m.IncidentRebuildFailures.Inc()
+ slog.Error("incident rebuild failed", "err", err)
+ os.Exit(1)
+ }
+ m.IncidentRebuildDuration.Observe(result.Duration.Seconds())
+ m.IncidentRebuildRows.Add(float64(result.RowsReplaced))
+ slog.Info("incident rebuild complete",
+ "replayed_events", replay.Projected,
+ "rows_replaced", result.RowsReplaced,
+ "duration", result.Duration,
+ )
+ }
+ }
incidentHandler := incidents.NewHandler(incidentEngine)
mux.Handle("/v1/incidents/active", readCORS(incidentHandler.Active))
mux.Handle("/v1/incidents/", readCORS(incidentHandler.Incident))
@@ -463,7 +531,7 @@ func main() {
incidentRunning = true
slog.Info("incident engine enabled", "interval", incidentCfg.TickInterval, "window", incidentCfg.Window)
} else {
- slog.Info("incident engine disabled: SQLITE_PATH is not set")
+ slog.Warn("incidents requested but SQLite not configured; running without incidents")
}
}
} else {
diff --git a/docs/env.md b/docs/env.md
index db9afac..58953f6 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -6,7 +6,19 @@ Reference for configuring the Waylog ingest server and SDK. All variables are re
| Variable | Purpose |
|---|---|
-| `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Required when server-side Ask/tool flows use Gemini |
+| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. For the current Gemini provider, set `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+
+## LLM provider
+
+Deterministic tools, plans, triage, MCP, and read APIs do not require an LLM provider. The provider is only used by natural-language Ask flows. If Ask cannot construct the selected provider, it returns a provider-agnostic "LLM provider not configured" error.
+
+| Variable | Default | Purpose |
+|---|---|---|
+| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values in M2: `none`, `gemini` |
+| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. For Gemini, this takes precedence over `GEMINI_MODEL` |
+| `GEMINI_MODEL` | `gemini-2.5-flash` | Gemini-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `GEMINI_API_BASE` | Gemini API default | Gemini-specific API base URL override |
+| `GEMINI_TOOL_MODE` | `text` | Gemini-specific tool-calling mode |
## Auth
@@ -65,6 +77,8 @@ The `waylog` CLI calls the running ingest server's v2 read APIs. The server must
| `WAYLOG_INCIDENT_RESOLVE_AFTER` | `2m` | Time without renewed matching failures before a recovering incident resolves |
| `WAYLOG_DEPLOY_CORRELATION_WINDOW` | `15m` | Window used to attach deploy signals and deployment records as incident evidence |
| `WAYLOG_INCIDENT_SAMPLE_LIMIT` | `5` | Maximum persisted sample traces per incident |
+| `WAYLOG_REBUILD_INCIDENTS_ON_START` | `false` | Rebuild non-resolved incident rows at startup from the schema-2.0 WAL hot window plus signals |
+| `WAYLOG_INCIDENT_REBUILD_MAX_EVENTS` | `250000` | Safety cap for startup incident rebuild replay |
| `WAYLOG_V2_DEDUP_CAPACITY` | `65536` | Recent schema-2.0 `event_id` dedupe cache capacity |
| `GRAPH_HOT_WINDOW` | `GRAPH_RETENTION` or `24h` | Recent in-memory graph/index retention window and max v2 read window |
| `GRAPH_RETENTION` | `24h` | Hot graph retention. Nodes older than this are pruned every snapshot tick |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index d60eb4d..23f1e8c 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -1849,6 +1849,26 @@ components:
ask:
type: object
additionalProperties: true
+ llm:
+ type: object
+ description: Provider-neutral Ask configuration and runtime state.
+ properties:
+ provider:
+ type: string
+ description: Resolved Ask provider. `custom` is used for injected providers.
+ enum: [none, gemini, custom]
+ model:
+ type: string
+ description: Resolved model for the selected provider, or empty when Ask is disabled.
+ tool_mode:
+ type: string
+ description: Resolved tool-calling mode for the selected provider, or empty when Ask is disabled.
+ configured:
+ type: boolean
+ description: True when a provider was explicitly selected or inferred from credentials.
+ ask_enabled:
+ type: boolean
+ description: True when Ask has a provider implementation available.
dashboard:
type: object
additionalProperties: true
@@ -1867,6 +1887,21 @@ components:
properties:
enabled:
type: boolean
+ incidents:
+ type: object
+ properties:
+ enabled:
+ type: boolean
+ persistent:
+ type: boolean
+ rebuild:
+ type: object
+ properties:
+ supported:
+ type: boolean
+ scope:
+ type: string
+ enum: ["", hot-window]
architecture:
type: object
additionalProperties: true
diff --git a/internal/cli/root.go b/internal/cli/root.go
index 42128b2..069b1b7 100644
--- a/internal/cli/root.go
+++ b/internal/cli/root.go
@@ -130,19 +130,16 @@ func handleAsk(store tools.Store, args []string) {
return
}
- apiKey := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
- if apiKey == "" {
- apiKey = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+ sel, err := llm.SelectFromEnv()
+ if err != nil {
+ fmt.Println(err)
+ return
}
- if apiKey == "" {
- fmt.Println("GEMINI_API_KEY (or GOOGLE_API_KEY) is required")
+ if !sel.AskEnabled {
+ fmt.Println(llm.ErrProviderNotConfigured)
return
}
- model := strings.TrimSpace(os.Getenv("GEMINI_MODEL"))
- baseURL := strings.TrimSpace(os.Getenv("GEMINI_API_BASE"))
- toolMode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE"))
-
reg := tools.NewRegistry()
if err := tools.RegisterGraphTools(reg); err != nil {
fmt.Println("tool registry error:", err)
@@ -158,18 +155,7 @@ func handleAsk(store tools.Store, args []string) {
})
}
- client := llm.NewGeminiClient(apiKey)
- if model != "" {
- client.Model = model
- }
- if baseURL != "" {
- client.BaseURL = baseURL
- }
- if toolMode != "" {
- client.ToolMode = toolMode
- }
-
- answer, _, err := llm.Ask(context.Background(), client, toolDefs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) {
+ answer, _, err := llm.Ask(context.Background(), sel.Impl, toolDefs, llm.ToolExecutorFunc(func(ctx context.Context, name string, params json.RawMessage) (any, error) {
return reg.Call(ctx, store, name, params)
}), prompt, llm.AskOptions{MaxSteps: 5})
if err != nil {
diff --git a/internal/coldstore/incident_store.go b/internal/coldstore/incident_store.go
index b6a3160..9a05201 100644
--- a/internal/coldstore/incident_store.go
+++ b/internal/coldstore/incident_store.go
@@ -20,6 +20,43 @@ func NewIncidentStore(db *SQLiteStore) *IncidentStore {
}
func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) error {
+ if err := upsertIncident(ctx, s.db.writer, inc); err != nil {
+ return fmt.Errorf("coldstore upsert incident: %w", err)
+ }
+ return nil
+}
+
+// ReplaceNonResolved atomically swaps the stored non-resolved incidents for
+// rows. Inside a single serializable transaction it deletes every incident
+// whose status is not resolved and then upserts each element of rows. If any
+// step fails the transaction is rolled back and an error wrapping the cause is
+// returned, so the table is never left half-replaced. Stored resolved
+// incidents are not deleted; one whose incident_id also appears in rows is
+// handled by the upsert (TestIncidentStoreReplaceNonResolved expects it to
+// take the new row's status).
+func (s *IncidentStore) ReplaceNonResolved(ctx context.Context, rows []incidents.Incident) error {
+ tx, err := s.db.writer.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelSerializable})
+ if err != nil {
+ return fmt.Errorf("coldstore replace incidents begin: %w", err)
+ }
+ // Rollback after a successful Commit is a no-op that returns sql.ErrTxDone,
+ // so an unconditional deferred rollback covers every early return without
+ // needing a separate "committed" flag.
+ defer func() { _ = tx.Rollback() }()
+ if _, err := tx.ExecContext(ctx, `DELETE FROM incidents WHERE status != ?`, string(incidents.StatusResolved)); err != nil {
+ return fmt.Errorf("coldstore replace incidents delete: %w", err)
+ }
+ for _, inc := range rows {
+ if err := upsertIncident(ctx, tx, inc); err != nil {
+ return fmt.Errorf("coldstore replace incident %s: %w", inc.IncidentID, err)
+ }
+ }
+ if err := tx.Commit(); err != nil {
+ return fmt.Errorf("coldstore replace incidents commit: %w", err)
+ }
+ return nil
+}
+
+// incidentExecer is the single ExecContext method upsertIncident needs. It
+// lets the same statement run against the store's writer handle (Upsert) or
+// inside a *sql.Tx (ReplaceNonResolved).
+type incidentExecer interface {
+ ExecContext(ctx context.Context, query string, args ...any) (sql.Result, error)
+}
+
+func upsertIncident(ctx context.Context, execer incidentExecer, inc incidents.Incident) error {
topServices, err := jsonText(inc.TopServices)
if err != nil {
return fmt.Errorf("coldstore incident top services: %w", err)
@@ -40,7 +77,7 @@ func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) erro
if err != nil {
return fmt.Errorf("coldstore incident warnings: %w", err)
}
- _, err = s.db.writer.ExecContext(ctx, `
+ _, err = execer.ExecContext(ctx, `
INSERT INTO incidents (
incident_id, env, service, error_service, error_step, error_code,
status, cause, confidence, severity, started_at, updated_at, last_seen_at,
@@ -76,7 +113,7 @@ func (s *IncidentStore) Upsert(ctx context.Context, inc incidents.Incident) erro
topServices, samples, evidence, nextChecks, warnings, inc.Lift, inc.BaselineCount, inc.CurrentCount,
)
if err != nil {
- return fmt.Errorf("coldstore upsert incident: %w", err)
+ return err
}
return nil
}
diff --git a/internal/coldstore/incident_store_test.go b/internal/coldstore/incident_store_test.go
index 10cdfe8..9853f3b 100644
--- a/internal/coldstore/incident_store_test.go
+++ b/internal/coldstore/incident_store_test.go
@@ -83,3 +83,80 @@ func TestIncidentStoreRoundtripAndPrune(t *testing.T) {
t.Fatalf("expected not found, got %v", err)
}
}
+
+// TestIncidentStoreReplaceNonResolved seeds an active, a recovering, and two
+// resolved incidents, then calls ReplaceNonResolved with one brand-new active
+// incident and one active row that reuses a resolved incident's ID. It expects
+// the old active and recovering rows to be gone, both replacement rows to be
+// listed as active, and the untouched resolved row to still be readable.
+func TestIncidentStoreReplaceNonResolved(t *testing.T) {
+ ctx := context.Background()
+ managed, err := Open(":memory:")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer managed.Close()
+ store := NewIncidentStore(managed.(*SQLiteStore))
+ now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+
+ oldActive := testColdIncident("inc_old_active", incidents.StatusActive, now.Add(-20*time.Minute))
+ oldRecovering := testColdIncident("inc_old_recovering", incidents.StatusRecovering, now.Add(-15*time.Minute))
+ preservedResolved := testColdIncident("inc_preserved_resolved", incidents.StatusResolved, now.Add(-30*time.Minute))
+ overwrittenResolved := testColdIncident("inc_overwritten_resolved", incidents.StatusResolved, now.Add(-25*time.Minute))
+ for _, inc := range []incidents.Incident{oldActive, oldRecovering, preservedResolved, overwrittenResolved} {
+ if err := store.Upsert(ctx, inc); err != nil {
+ t.Fatal(err)
+ }
+ }
+
+ newActive := testColdIncident("inc_new_active", incidents.StatusActive, now)
+ // Same ID as a stored resolved row, but now active.
+ replacement := testColdIncident("inc_overwritten_resolved", incidents.StatusActive, now)
+ if err := store.ReplaceNonResolved(ctx, []incidents.Incident{newActive, replacement}); err != nil {
+ t.Fatal(err)
+ }
+
+ active, err := store.ListActive(ctx)
+ if err != nil {
+ t.Fatal(err)
+ }
+ gotActive := map[string]incidents.Status{}
+ for _, inc := range active {
+ gotActive[inc.IncidentID] = inc.Status
+ }
+ if _, ok := gotActive["inc_old_active"]; ok {
+ t.Fatalf("old active row preserved unexpectedly: %+v", gotActive)
+ }
+ if _, ok := gotActive["inc_old_recovering"]; ok {
+ t.Fatalf("old recovering row preserved unexpectedly: %+v", gotActive)
+ }
+ if gotActive["inc_new_active"] != incidents.StatusActive || gotActive["inc_overwritten_resolved"] != incidents.StatusActive {
+ t.Fatalf("active rows after replace=%+v", gotActive)
+ }
+ if got, err := store.Get(ctx, "inc_preserved_resolved"); err != nil || got.Status != incidents.StatusResolved {
+ t.Fatalf("preserved resolved row got=%+v err=%v", got, err)
+ }
+}
+
+// testColdIncident builds a fully populated checkout/payment incident fixture
+// with the given ID and status. StartedAt, UpdatedAt, and LastSeenAt are all
+// set to at; ResolvedAt is set to at plus one minute only when status is
+// resolved and is left nil otherwise.
+func testColdIncident(id string, status incidents.Status, at time.Time) incidents.Incident {
+ resolvedAt := at.Add(time.Minute)
+ inc := incidents.Incident{
+ IncidentID: id,
+ Env: "prod",
+ Service: "checkout",
+ ErrorFamily: apiv2.ErrorFamily{Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502"},
+ Status: status,
+ Cause: incidents.CauseDependency,
+ Confidence: incidents.ConfidenceHigh,
+ Severity: 8,
+ StartedAt: at,
+ UpdatedAt: at,
+ LastSeenAt: at,
+ AffectedRequests: 9,
+ AffectedServices: 2,
+ TopServices: []string{"checkout", "payment"},
+ SampleTraces: []string{"trace-a"},
+ Evidence: []incidents.Evidence{{Kind: incidents.EvidenceTrace, Title: "trace", TraceID: "trace-a", OccurredAt: at}},
+ NextChecks: []string{"check downstream"},
+ Lift: 9,
+ CurrentCount: 9,
+ }
+ if status == incidents.StatusResolved {
+ inc.ResolvedAt = &resolvedAt
+ }
+ return inc
+}
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
index 8cde790..5f10c66 100644
--- a/internal/incidents/classifier.go
+++ b/internal/incidents/classifier.go
@@ -52,6 +52,10 @@ func Classify(input ClassificationInput) Classification {
evidence = append(evidence, signalEvidence(*sig, "Deploy signal overlaps incident window"))
return classification(CauseDeploy, ConfidenceHigh, evidence, warnings)
}
+ if sig := matchingRuntimeSignal(input); sig != nil {
+ evidence = append(evidence, signalEvidence(*sig, "Runtime signal overlaps incident window"))
+ return classification(CauseRuntime, ConfidenceHigh, evidence, warnings)
+ }
if len(input.Events) > 0 && input.Incident.ErrorFamily.Step != "" && firstFailingDownstream(input.Events) == "" {
return classification(CauseApp, ConfidenceMedium, evidence, warnings)
}
@@ -101,6 +105,26 @@ func matchingDeployment(input ClassificationInput) *Deployment {
return nil
}
+// matchingRuntimeSignal returns a pointer to the first runtime or healthcheck
+// signal in input.Signals that belongs to the incident's service and whose
+// timestamp lies between five minutes before and one minute after the incident
+// start (both bounds inclusive). It returns nil when no signal qualifies.
+func matchingRuntimeSignal(input ClassificationInput) *signals.Signal {
+	windowStart := input.Incident.StartedAt.Add(-5 * time.Minute)
+	windowEnd := input.Incident.StartedAt.Add(time.Minute)
+	for idx := range input.Signals {
+		candidate := &input.Signals[idx]
+		switch candidate.Type {
+		case signals.TypeRuntime, signals.TypeHealthcheck:
+			// Eligible signal type; keep checking.
+		default:
+			continue
+		}
+		sameService := candidate.Service == input.Incident.Service
+		inWindow := !candidate.Timestamp.Before(windowStart) && !candidate.Timestamp.After(windowEnd)
+		if sameService && inWindow {
+			return candidate
+		}
+	}
+	return nil
+}
+
func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal {
version := sampleVersion(input.Events)
for i := range input.Signals {
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
index 348787a..5c5a16d 100644
--- a/internal/incidents/classifier_test.go
+++ b/internal/incidents/classifier_test.go
@@ -64,3 +64,141 @@ func TestClassifierRules(t *testing.T) {
}
})
}
+
+// TestClassifierRuntime exercises the runtime-cause rule of Classify: runtime
+// and healthcheck signals inside the window classify as runtime with high
+// confidence, while alert-typed, out-of-window and foreign-service signals do
+// not, and deploy or dependency evidence takes precedence over runtime.
+func TestClassifierRuntime(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+	checkoutEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "cart.validate", "CHK_500", "")
+	paymentEvent := testIncidentEvent("e2", "trace-b", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+	runtimeSig := signals.Signal{
+		SignalID:  "sig_rt",
+		Type:      signals.TypeRuntime,
+		Service:   "checkout",
+		Env:       "prod",
+		Reason:    "container restarted",
+		Severity:  signals.SeverityWarning,
+		Timestamp: now.Add(-time.Minute),
+	}
+
+	// classify runs the classifier for the shared incident with a single
+	// event and the supplied signals.
+	classify := func(ev *eventv2.Event, sigs ...signals.Signal) Classification {
+		return Classify(ClassificationInput{
+			Incident: base,
+			Events:   []*eventv2.Event{ev},
+			Signals:  sigs,
+		})
+	}
+
+	t.Run("runtime signal in window", func(t *testing.T) {
+		got := classify(checkoutEvent, runtimeSig)
+		if got.Cause != CauseRuntime || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+		for _, ev := range got.Evidence {
+			if ev.SignalID == "sig_rt" {
+				return
+			}
+		}
+		t.Fatalf("runtime signal evidence missing: %+v", got.Evidence)
+	})
+
+	t.Run("healthcheck signal in window", func(t *testing.T) {
+		healthcheck := runtimeSig
+		healthcheck.SignalID = "sig_hc"
+		healthcheck.Type = signals.TypeHealthcheck
+		got := classify(checkoutEvent, healthcheck)
+		if got.Cause != CauseRuntime || got.Confidence != ConfidenceHigh {
+			t.Fatalf("classification=%+v", got)
+		}
+	})
+
+	t.Run("alert with OOM reason does not classify runtime", func(t *testing.T) {
+		alert := runtimeSig
+		alert.Type = signals.TypeAlert
+		alert.Reason = "OOM kill"
+		if got := classify(checkoutEvent, alert); got.Cause == CauseRuntime {
+			t.Fatalf("alert signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("runtime signal outside window", func(t *testing.T) {
+		stale := runtimeSig
+		stale.Timestamp = now.Add(-6 * time.Minute)
+		if got := classify(checkoutEvent, stale); got.Cause == CauseRuntime {
+			t.Fatalf("out-of-window signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("runtime signal for different service", func(t *testing.T) {
+		foreign := runtimeSig
+		foreign.Service = "payment"
+		if got := classify(checkoutEvent, foreign); got.Cause == CauseRuntime {
+			t.Fatalf("foreign-service signal classified as runtime: %+v", got)
+		}
+	})
+
+	t.Run("deploy beats runtime", func(t *testing.T) {
+		deploySig := signals.Signal{
+			SignalID:  "sig_dep",
+			Type:      signals.TypeDeploy,
+			Service:   "checkout",
+			Env:       "prod",
+			Severity:  signals.SeverityWarning,
+			Timestamp: now.Add(-time.Minute),
+		}
+		if got := classify(checkoutEvent, runtimeSig, deploySig); got.Cause != CauseDeploy {
+			t.Fatalf("expected deploy, got %+v", got)
+		}
+	})
+
+	t.Run("dependency beats runtime", func(t *testing.T) {
+		depSig := signals.Signal{
+			SignalID:  "sig_depy",
+			Type:      signals.TypeDependency,
+			Service:   "payment",
+			Env:       "prod",
+			Reason:    "upstream_5xx",
+			Severity:  signals.SeverityCritical,
+			Timestamp: now.Add(-time.Minute),
+		}
+		if got := classify(paymentEvent, runtimeSig, depSig); got.Cause != CauseDependency {
+			t.Fatalf("expected dependency, got %+v", got)
+		}
+	})
+}
+
+// TestNextChecksRuntime ensures the runtime cause yields at least one
+// suggested next check.
+func TestNextChecksRuntime(t *testing.T) {
+	if checks := NextChecks(CauseRuntime, ConfidenceHigh); len(checks) == 0 {
+		t.Fatalf("expected non-empty next checks for runtime cause")
+	}
+}
diff --git a/internal/incidents/engine.go b/internal/incidents/engine.go
index 11162c9..4dd9f6c 100644
--- a/internal/incidents/engine.go
+++ b/internal/incidents/engine.go
@@ -134,17 +134,36 @@ func (e *Engine) Tick(ctx context.Context) error {
defer func() { e.metrics.IncidentTickLatency.Observe(time.Since(start).Seconds()) }()
}
now := e.now().UTC()
+ rows, err := e.derive(ctx, now, e.SnapshotActive(), e.reader)
+ if err != nil {
+ return err
+ }
+ return e.ApplyLive(ctx, rows)
+}
+
+// derivedRow carries the derivation output plus whether the row was already
+// in the seed (used by ApplyLive to distinguish Opened vs Updated metrics).
+type derivedRow struct {
+ // Incident is the derived incident state to persist for this cycle.
+ Incident Incident
+ // Existed reports whether the incident matched a row of the seed map
+ // handed to derive; false means it was first observed in this cycle.
+ Existed bool
+}
+
+// derive computes the full set of incident rows for the cycle from the seed +
+// reader without touching e.active or the store. Used by both live Tick and
+// startup Rebuild.
+func (e *Engine) derive(ctx context.Context, now time.Time, seed map[string]Incident, reader Reader) ([]derivedRow, error) {
currentStart := now.Add(-e.cfg.Window)
baselineStart := now.Add(-2 * e.cfg.Window)
statuses := failedStatuses()
- current := e.reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
- baseline := e.reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
+ current := reader.Errors(SearchFilter{Since: currentStart, Until: now, Statuses: statuses}, 200)
+ baseline := reader.Errors(SearchFilter{Since: baselineStart, Until: currentStart, Statuses: statuses}, 200)
baselineByFamily := map[string]int{}
for _, row := range baseline.Rows {
baselineByFamily[familyKey(row.ErrorFamily)] = row.Count
}
seen := map[string]struct{}{}
+ out := make([]derivedRow, 0, len(current.Rows))
for _, row := range current.Rows {
if row.Count < e.cfg.MinCount {
continue
@@ -154,25 +173,126 @@ func (e *Engine) Tick(ctx context.Context) error {
if baselineCount > 0 && lift < e.cfg.MinLift {
continue
}
- inc, err := e.buildIncident(ctx, row, baselineCount, lift, currentStart, now)
+ inc, existed, err := e.buildIncidentFromSeed(ctx, seed, reader, row, baselineCount, lift, currentStart, now)
if err != nil {
- return err
+ return nil, err
}
seen[inc.IncidentID] = struct{}{}
- if err := e.store.Upsert(ctx, inc); err != nil {
+ out = append(out, derivedRow{Incident: inc, Existed: existed})
+ }
+ out = append(out, e.deriveMissing(seed, seen, now)...)
+ return out, nil
+}
+
+// deriveMissing produces the follow-up transitions for seed incidents that the
+// current cycle did not observe (their IDs are absent from seen):
+//   - active incidents become recovering, with RecoveringAt and UpdatedAt
+//     set to now;
+//   - recovering incidents become resolved, with ResolvedAt and UpdatedAt set
+//     to now, once now is at least cfg.ResolveAfter past their LastSeenAt.
+//
+// Seed incidents in any other state, and recovering ones still inside the
+// grace period, yield no row. Every emitted row is flagged Existed.
+func (e *Engine) deriveMissing(seed map[string]Incident, seen map[string]struct{}, now time.Time) []derivedRow {
+	transitions := make([]derivedRow, 0)
+	for _, prev := range seed {
+		if _, observed := seen[prev.IncidentID]; observed {
+			continue
+		}
+		switch {
+		case prev.Status == StatusActive:
+			next := cloneIncident(prev)
+			stamp := now
+			next.Status = StatusRecovering
+			next.RecoveringAt = &stamp
+			next.UpdatedAt = now
+			transitions = append(transitions, derivedRow{Incident: next, Existed: true})
+		case prev.Status == StatusRecovering && now.Sub(prev.LastSeenAt) >= e.cfg.ResolveAfter:
+			next := cloneIncident(prev)
+			stamp := now
+			next.Status = StatusResolved
+			next.ResolvedAt = &stamp
+			next.UpdatedAt = now
+			transitions = append(transitions, derivedRow{Incident: next, Existed: true})
+		}
+	}
+	return transitions
+}
+
+// ApplyLive persists derived rows for a live tick: per-row Upsert, in-memory
+// cache update, and per-transition metric increments matching pre-refactor
+// Tick behavior.
+func (e *Engine) ApplyLive(ctx context.Context, rows []derivedRow) error {
+ for _, dr := range rows {
+ if err := e.store.Upsert(ctx, dr.Incident); err != nil {
return err
}
- e.remember(inc)
+ switch dr.Incident.Status {
+ case StatusResolved:
+ e.forget(dr.Incident.IncidentID)
+ if e.metrics != nil {
+ e.metrics.IncidentResolved.Inc()
+ }
+ case StatusRecovering:
+ e.remember(dr.Incident)
+ if dr.Existed {
+ if e.metrics != nil {
+ e.metrics.IncidentRecovered.Inc()
+ }
+ }
+ default:
+ e.remember(dr.Incident)
+ if e.metrics != nil {
+ if dr.Existed {
+ e.metrics.IncidentUpdated.Inc()
+ } else {
+ e.metrics.IncidentOpened.Inc()
+ }
+ }
+ }
}
- if err := e.transitionMissing(ctx, seen, now); err != nil {
+ if e.metrics != nil {
+ e.metrics.IncidentActive.Set(float64(e.activeCount()))
+ }
+ return nil
+}
+
+// ApplyRebuild atomically replaces non-resolved store rows with the derived
+// set, then reloads the in-memory cache from the store. ApplyRebuild owns
+// cache reload; do NOT call Bootstrap after it. Per-row Opened/Updated/
+// Recovered/Resolved counters are intentionally not incremented here —
+// rebuild metrics live in main.go.
+func (e *Engine) ApplyRebuild(ctx context.Context, rows []derivedRow) error {
+ incs := make([]Incident, 0, len(rows))
+ for _, dr := range rows {
+ incs = append(incs, dr.Incident)
+ }
+ if err := e.store.ReplaceNonResolved(ctx, incs); err != nil {
+ return err
+ }
+ active, err := e.store.ListActive(ctx)
+ if err != nil {
return err
}
+ e.mu.Lock()
+ e.active = make(map[string]Incident, len(active))
+ for _, inc := range active {
+ e.active[inc.IncidentID] = cloneIncident(inc)
+ }
+ e.mu.Unlock()
if e.metrics != nil {
- e.metrics.IncidentActive.Set(float64(e.activeCount()))
+ e.metrics.IncidentActive.Set(float64(len(active)))
}
return nil
}
+// SnapshotActive copies the in-memory active set while holding the read lock.
+// Each incident is passed through cloneIncident before being stored in the
+// returned map, which is keyed exactly like the engine's own map.
+func (e *Engine) SnapshotActive() map[string]Incident {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	snapshot := make(map[string]Incident, len(e.active))
+	for key := range e.active {
+		snapshot[key] = cloneIncident(e.active[key])
+	}
+	return snapshot
+}
+
func (e *Engine) Active(ctx context.Context) ([]Incident, error) {
rows, err := e.store.ListActive(ctx)
if err != nil {
@@ -265,6 +385,69 @@ func (e *Engine) buildIncident(ctx context.Context, row apiv2.ErrorRow, baseline
return inc, nil
}
+// buildIncidentFromSeed assembles the active incident for one error row of the
+// current window, taking prior state from seed and event data from reader
+// rather than from the engine's own cache and reader.
+//
+// Continuity with the seed is resolved in two steps: an open (non-resolved)
+// seed incident with the same env and error family donates its StartedAt, and
+// if the resulting stable ID is not itself in the seed, that same incident also
+// donates its ID and sample traces. The family lookup is performed once so
+// both steps agree on which seed incident they refer to.
+//
+// It returns the incident, whether it continues a seed incident, and an error
+// when querying signals (other than signals.ErrUnavailable) or deployments
+// fails. When metrics are configured, the classification is also observed.
+func (e *Engine) buildIncidentFromSeed(ctx context.Context, seed map[string]Incident, reader Reader, row apiv2.ErrorRow, baselineCount int, lift float64, since, now time.Time) (Incident, bool, error) {
+	events := sampleEventsFromReader(reader, row.ErrorFamily, since, now, 200)
+	startedAt := earliestEventTime(events, now)
+	env := firstEventEnv(events)
+
+	// Scan the seed for this family exactly once. The seed is a map, so two
+	// independent scans could return different incidents when several open
+	// rows share env and family, mixing one row's StartedAt with another's ID.
+	prior, hasPrior := findByFamilyIn(seed, env, row.ErrorFamily)
+	if hasPrior {
+		startedAt = prior.StartedAt
+	}
+	id := StableID(env, row.ErrorFamily, startedAt)
+	existing, hadExisting := getCachedIn(seed, id)
+	if !hadExisting && hasPrior {
+		// The stable ID is not in the seed; continue the open incident
+		// of the same family under its original ID.
+		existing = prior
+		id = prior.IncidentID
+		hadExisting = true
+	}
+
+	blast := reader.BlastRadius(
+		SearchFilter{Since: since, Until: now},
+		apiv2.BlastKey{Service: row.ErrorFamily.Service, Step: row.ErrorFamily.Step, ErrorCode: row.ErrorFamily.ErrorCode},
+	)
+	// An unavailable signal store is tolerated: classification then proceeds
+	// with whatever querySignals returned alongside that error.
+	sigs, err := e.querySignals(ctx, env, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil && !errors.Is(err, signals.ErrUnavailable) {
+		return Incident{}, false, err
+	}
+	deploys, err := e.queryDeploys(ctx, row.ErrorFamily.Service, now.Add(-e.cfg.DeployCorrelationWindow), now)
+	if err != nil {
+		return Incident{}, false, err
+	}
+	inc := Incident{
+		IncidentID:       id,
+		Env:              env,
+		Service:          row.ErrorFamily.Service,
+		ErrorFamily:      row.ErrorFamily,
+		Status:           StatusActive,
+		Severity:         severity(row.Count, blast.AffectedServices, lift),
+		StartedAt:        startedAt,
+		UpdatedAt:        now,
+		LastSeenAt:       now,
+		AffectedRequests: blast.AffectedRequests,
+		AffectedUsers:    cloneInt(row.AffectedUsers),
+		AffectedServices: blast.AffectedServices,
+		TopServices:      append([]string(nil), blast.TopServices...),
+		SampleTraces:     stableSamples(existing.SampleTraces, events, e.cfg.SampleLimit),
+		Lift:             lift,
+		BaselineCount:    baselineCount,
+		CurrentCount:     row.Count,
+	}
+	if hadExisting {
+		// A re-observed incident keeps its original start and is no longer
+		// recovering.
+		inc.StartedAt = existing.StartedAt
+		inc.RecoveringAt = nil
+	}
+	class := Classify(ClassificationInput{Incident: inc, Events: events, Signals: sigs, Deployments: deploys, Now: now})
+	inc.Cause = class.Cause
+	inc.Confidence = class.Confidence
+	inc.Evidence = class.Evidence
+	inc.NextChecks = class.NextChecks
+	inc.InstrumentationWarnings = class.InstrumentationWarnings
+	if e.metrics != nil {
+		e.observeClassification(inc.Cause, inc.Confidence)
+	}
+	return inc, hadExisting, nil
+}
+
func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}, now time.Time) error {
e.mu.RLock()
rows := make([]Incident, 0, len(e.active))
@@ -309,7 +492,11 @@ func (e *Engine) transitionMissing(ctx context.Context, seen map[string]struct{}
}
func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
- events := e.reader.SearchEvents(SearchFilter{
+ return sampleEventsFromReader(e.reader, f, since, until, limit)
+}
+
+func sampleEventsFromReader(reader Reader, f apiv2.ErrorFamily, since, until time.Time, limit int) []*eventv2.Event {
+ events := reader.SearchEvents(SearchFilter{
Service: f.Service,
ErrorCode: f.ErrorCode,
Since: since,
@@ -325,6 +512,20 @@ func (e *Engine) sampleEvents(f apiv2.ErrorFamily, since, until time.Time, limit
return out
}
+// getCachedIn looks id up in seed and returns the entry passed through
+// cloneIncident, together with whether the ID was present. On a miss the
+// zero-value Incident is what gets cloned.
+func getCachedIn(seed map[string]Incident, id string) (Incident, bool) {
+	cached, found := seed[id]
+	return cloneIncident(cached), found
+}
+
+// findByFamilyIn returns a clone of a non-resolved seed incident whose env and
+// error family both match, and true. When several incidents match, whichever
+// the map iteration reaches first is returned. It returns the zero Incident
+// and false when none match.
+func findByFamilyIn(seed map[string]Incident, env string, family apiv2.ErrorFamily) (Incident, bool) {
+	for _, candidate := range seed {
+		if candidate.Status == StatusResolved {
+			continue
+		}
+		if candidate.Env != env || candidate.ErrorFamily != family {
+			continue
+		}
+		return cloneIncident(candidate), true
+	}
+	return Incident{}, false
+}
+
func (e *Engine) querySignals(ctx context.Context, env string, since, until time.Time) ([]signals.Signal, error) {
if e.signals == nil {
return nil, nil
diff --git a/internal/incidents/engine_test.go b/internal/incidents/engine_test.go
index 41403bc..6aea07f 100644
--- a/internal/incidents/engine_test.go
+++ b/internal/incidents/engine_test.go
@@ -125,6 +125,136 @@ func TestEngineUsesDownstreamDependencySignal(t *testing.T) {
}
}
+// TestDerivePreservesSeedContinuity checks that derive, given a seed incident
+// and a current error row, yields a single row that keeps the seed's
+// StartedAt, lists the seeded sample trace ahead of the newly observed one,
+// and is flagged as pre-existing.
+func TestDerivePreservesSeedContinuity(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	started := now.Add(-20 * time.Minute)
+	seeded := testIncident(started)
+	seeded.SampleTraces = []string{"trace-seeded"}
+	seed := map[string]Incident{seeded.IncidentID: seeded}
+
+	currentRow := apiv2.ErrorRow{ErrorFamily: testFamily(), Count: 6}
+	freshEvent := testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment")
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{currentRow}},
+		blast:   apiv2.BlastRadiusResponse{AffectedRequests: 6, AffectedServices: 2, TopServices: []string{"checkout", "payment"}},
+		events:  []*eventv2.Event{freshEvent},
+	}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{MinCount: 5, SampleLimit: 2}, nil, nil)
+
+	rows, err := engine.derive(context.Background(), now, seed, reader)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("rows=%+v", rows)
+	}
+	first := rows[0]
+	if !first.Incident.StartedAt.Equal(started) {
+		t.Fatalf("started_at=%s want %s", first.Incident.StartedAt, started)
+	}
+	traces := first.Incident.SampleTraces
+	if len(traces) != 2 || traces[0] != "trace-seeded" || traces[1] != "trace-new" {
+		t.Fatalf("sample_traces=%+v", traces)
+	}
+	if !first.Existed {
+		t.Fatalf("seeded row should be marked existed")
+	}
+}
+
+// TestDeriveMissingTransitions checks the transitions derive emits for seed
+// incidents the reader no longer reports: the first fixture is expected to
+// become recovering, and a recovering fixture last seen two minutes ago
+// (ResolveAfter is one minute) is expected to become resolved.
+func TestDeriveMissingTransitions(t *testing.T) {
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	reader := &fakeReader{}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{ResolveAfter: time.Minute}, nil, nil)
+
+	// deriveOne runs one derive cycle seeded with a single incident.
+	deriveOne := func(seeded Incident) []derivedRow {
+		t.Helper()
+		rows, err := engine.derive(context.Background(), now, map[string]Incident{seeded.IncidentID: seeded}, reader)
+		if err != nil {
+			t.Fatal(err)
+		}
+		return rows
+	}
+
+	active := testIncident(now.Add(-5 * time.Minute))
+	if rows := deriveOne(active); len(rows) != 1 || rows[0].Incident.Status != StatusRecovering {
+		t.Fatalf("active missing rows=%+v", rows)
+	}
+
+	recovering := testIncident(now.Add(-5 * time.Minute))
+	recovering.Status = StatusRecovering
+	recovering.LastSeenAt = now.Add(-2 * time.Minute)
+	if rows := deriveOne(recovering); len(rows) != 1 || rows[0].Incident.Status != StatusResolved {
+		t.Fatalf("recovering missing rows=%+v", rows)
+	}
+}
+
+// TestApplyRebuildReplacesStoreAndReloadsCache seeds a store with one
+// non-resolved and one resolved incident, applies a rebuild containing a
+// single new incident, and verifies that the new incident is the only active
+// one (in both store and engine cache), that the resolved row is still
+// retrievable, and that the old non-resolved row is gone.
+func TestApplyRebuildReplacesStoreAndReloadsCache(t *testing.T) {
+	ctx := context.Background()
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	store := NewMemoryStore()
+
+	oldActive := testIncident(now.Add(-10 * time.Minute))
+	resolvedAt := now.Add(-5 * time.Minute)
+	resolved := testIncident(now.Add(-20 * time.Minute))
+	resolved.IncidentID = "inc_resolved"
+	resolved.Status = StatusResolved
+	resolved.ResolvedAt = &resolvedAt
+	for _, existing := range []Incident{oldActive, resolved} {
+		if err := store.Upsert(ctx, existing); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	engine := NewEngine(&fakeReader{}, nil, nil, store, Config{}, nil, nil)
+	if err := engine.Bootstrap(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	replacement := testIncident(now)
+	replacement.IncidentID = "inc_new"
+	if err := engine.ApplyRebuild(ctx, []derivedRow{{Incident: replacement}}); err != nil {
+		t.Fatal(err)
+	}
+
+	active, err := engine.Active(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(active) != 1 || active[0].IncidentID != "inc_new" {
+		t.Fatalf("active after rebuild=%+v", active)
+	}
+	if _, cached := engine.SnapshotActive()["inc_new"]; !cached {
+		t.Fatalf("cache was not reloaded from rebuilt rows")
+	}
+	if _, err := store.Get(ctx, "inc_resolved"); err != nil {
+		t.Fatalf("resolved row should be preserved: %v", err)
+	}
+	if _, err := store.Get(ctx, oldActive.IncidentID); err == nil {
+		t.Fatalf("old non-resolved row should be replaced")
+	}
+}
+
+// TestRebuildOrchestratorUsesRebuildApply runs Rebuild against an empty store
+// with one qualifying error row and verifies that exactly one row is reported
+// as replaced and that the engine cache ends up holding one incident.
+func TestRebuildOrchestratorUsesRebuildApply(t *testing.T) {
+	ctx := context.Background()
+	now := time.Date(2026, 5, 4, 12, 0, 0, 0, time.UTC)
+	fixedClock := func() time.Time { return now }
+
+	currentRow := apiv2.ErrorRow{ErrorFamily: testFamily(), Count: 6}
+	freshEvent := testIncidentEvent("new", "trace-new", now.Add(-time.Minute), "checkout", "payment.charge", "PMT_502", "payment")
+	reader := &fakeReader{
+		current: ErrorsResult{Rows: []apiv2.ErrorRow{currentRow}},
+		blast:   apiv2.BlastRadiusResponse{AffectedRequests: 6, AffectedServices: 2},
+		events:  []*eventv2.Event{freshEvent},
+	}
+	engine := NewEngine(reader, nil, nil, NewMemoryStore(), Config{MinCount: 5, SampleLimit: 2}, nil, nil)
+
+	deps := RebuildDeps{Engine: engine, Reader: reader, Now: fixedClock}
+	result, err := Rebuild(ctx, deps)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if result.RowsReplaced != 1 {
+		t.Fatalf("rows_replaced=%d", result.RowsReplaced)
+	}
+	if cached := engine.SnapshotActive(); len(cached) != 1 {
+		t.Fatalf("cache should reflect rebuilt active rows")
+	}
+}
+
type fakeReader struct {
current ErrorsResult
base ErrorsResult
diff --git a/internal/incidents/nextchecks.go b/internal/incidents/nextchecks.go
index b3a6559..9f0d696 100644
--- a/internal/incidents/nextchecks.go
+++ b/internal/incidents/nextchecks.go
@@ -14,6 +14,13 @@ func NextChecks(cause Cause, confidence Confidence) []string {
"Inspect retries, timeouts, and circuit-breaker state for the failing step.",
"Notify the downstream owner with sample traces and affected service list.",
}
+ case CauseRuntime:
+ return []string{
+ "Check the service for recent restarts or crashloops.",
+ "Inspect memory and CPU usage for OOM kills or resource pressure.",
+ "Review readiness and liveness probe results around the incident start.",
+ "Verify node and task health for the affected service instances.",
+ }
case CauseApp:
return []string{
"Inspect the first failing step and recent application logs.",
diff --git a/internal/incidents/rebuild.go b/internal/incidents/rebuild.go
new file mode 100644
index 0000000..1abd0cc
--- /dev/null
+++ b/internal/incidents/rebuild.go
@@ -0,0 +1,40 @@
+package incidents
+
+import (
+ "context"
+ "fmt"
+ "time"
+)
+
+// RebuildDeps bundles the collaborators Rebuild operates on.
+type RebuildDeps struct {
+ // Engine derives the incident rows and applies them; Rebuild returns an
+ // error when it is nil.
+ Engine *Engine
+ // Reader is the event source handed to the engine's derive step; Rebuild
+ // returns an error when it is nil.
+ Reader Reader
+ // Now supplies the derivation timestamp, which Rebuild converts to UTC.
+ // A nil Now falls back to time.Now.
+ Now func() time.Time
+}
+
+// RebuildResult summarizes a successful Rebuild call.
+type RebuildResult struct {
+ // RowsReplaced is the number of derived rows handed to ApplyRebuild.
+ RowsReplaced int
+ // Duration is the wall-clock time spent deriving and applying the rows.
+ Duration time.Duration
+}
+
+// Rebuild recomputes the incident set from deps.Reader and swaps it into the
+// engine's store and cache. Rows are derived at deps.Now (time.Now when nil,
+// always converted to UTC) with the engine's current in-memory snapshot as the
+// seed, and are then handed to Engine.ApplyRebuild.
+//
+// It returns an error when deps.Engine or deps.Reader is nil, or when deriving
+// or applying fails. Failures from those two steps are wrapped with the step
+// name, so the underlying error stays matchable through errors.Is and
+// errors.As. On success the result reports how many derived rows were applied
+// and how long the rebuild took.
+func Rebuild(ctx context.Context, deps RebuildDeps) (RebuildResult, error) {
+	if deps.Engine == nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: engine required")
+	}
+	if deps.Reader == nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: reader required")
+	}
+	nowFn := deps.Now
+	if nowFn == nil {
+		nowFn = time.Now
+	}
+	// Duration is wall-clock time, so it is measured with the real clock even
+	// when deps.Now is a fixed clock.
+	start := time.Now()
+	rows, err := deps.Engine.derive(ctx, nowFn().UTC(), deps.Engine.SnapshotActive(), deps.Reader)
+	if err != nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: derive: %w", err)
+	}
+	if err := deps.Engine.ApplyRebuild(ctx, rows); err != nil {
+		return RebuildResult{}, fmt.Errorf("incidents rebuild: apply: %w", err)
+	}
+	return RebuildResult{RowsReplaced: len(rows), Duration: time.Since(start)}, nil
+}
diff --git a/internal/incidents/store.go b/internal/incidents/store.go
index 90b7323..bd3fb5b 100644
--- a/internal/incidents/store.go
+++ b/internal/incidents/store.go
@@ -12,6 +12,7 @@ var ErrNotFound = errors.New("incidents: not found")
type Store interface {
Upsert(ctx context.Context, inc Incident) error
+ ReplaceNonResolved(ctx context.Context, rows []Incident) error
Get(ctx context.Context, id string) (Incident, error)
ListActive(ctx context.Context) ([]Incident, error)
PruneResolvedOlderThan(ctx context.Context, cutoff time.Time) (int, error)
@@ -33,6 +34,20 @@ func (s *MemoryStore) Upsert(_ context.Context, inc Incident) error {
return nil
}
+// ReplaceNonResolved removes every stored incident whose status is not
+// resolved and then stores a clone of each incident in rows under its
+// IncidentID, all while holding the store mutex. Resolved rows are kept unless
+// rows contains an incident with the same ID. It always returns nil.
+func (s *MemoryStore) ReplaceNonResolved(_ context.Context, rows []Incident) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	for key, stored := range s.rows {
+		if stored.Status == StatusResolved {
+			continue
+		}
+		delete(s.rows, key)
+	}
+	for i := range rows {
+		s.rows[rows[i].IncidentID] = cloneIncident(rows[i])
+	}
+	return nil
+}
+
func (s *MemoryStore) Get(_ context.Context, id string) (Incident, error) {
s.mu.Lock()
defer s.mu.Unlock()
diff --git a/internal/incidents/types.go b/internal/incidents/types.go
index cf3b59a..fed7350 100644
--- a/internal/incidents/types.go
+++ b/internal/incidents/types.go
@@ -20,6 +20,7 @@ const (
CauseDeploy Cause = "deploy"
CauseApp Cause = "app"
CauseDependency Cause = "dependency"
+ CauseRuntime Cause = "runtime"
CauseUnknown Cause = "unknown"
)
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index bd1a4e8..e6150a1 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -14,7 +14,6 @@ import (
"net"
"net/http"
"net/url"
- "os"
"sort"
"strconv"
"strings"
@@ -136,8 +135,11 @@ type Server struct {
// OTLP capability flag — reported by /v1/capabilities. Set via
// ServerConfig when the OTLP handler is mounted in main.go.
- otlpEnabled bool
- v2ReadsEnabled bool
+ otlpEnabled bool
+ v2ReadsEnabled bool
+ incidentsEnabled bool
+ incidentsPersistent bool
+ incidentsRebuildSupported bool
// SSE
sseHub *SSEHub
@@ -178,31 +180,34 @@ func (s *Server) SetCausalRunResult(err error) {
// ServerConfig holds configuration for creating a new Server.
type ServerConfig struct {
- Store *store.Store
- TraceStore *tracestore.Store
- Sampler *sampler.Sampler
- Metrics *metrics.Metrics
- MaxBodyBytes int64
- EventLogDir string
- StartTime time.Time
- SampleRatePct int // 0 means use sampler's default from env
- AskProvider llm.Provider
- AskRegistry *tools.Registry
- AskMaxStepsDefault int
- AskMaxStepsMax int
- DashboardRefreshSec int
- PrometheusURL string
- GrafanaURL string
- GraphUI bool
- DedupCache *DedupCache
- AgentKey string
- TrustProxy bool
- ColdWriter *coldstore.BatchWriter
- ColdStore coldstore.Store
- PlanStore *PlanStore
- GraphHotWindow time.Duration
- OTLPEnabled bool
- V2ReadsEnabled bool
+ Store *store.Store
+ TraceStore *tracestore.Store
+ Sampler *sampler.Sampler
+ Metrics *metrics.Metrics
+ MaxBodyBytes int64
+ EventLogDir string
+ StartTime time.Time
+ SampleRatePct int // 0 means use sampler's default from env
+ AskProvider llm.Provider
+ AskRegistry *tools.Registry
+ AskMaxStepsDefault int
+ AskMaxStepsMax int
+ DashboardRefreshSec int
+ PrometheusURL string
+ GrafanaURL string
+ GraphUI bool
+ DedupCache *DedupCache
+ AgentKey string
+ TrustProxy bool
+ ColdWriter *coldstore.BatchWriter
+ ColdStore coldstore.Store
+ PlanStore *PlanStore
+ GraphHotWindow time.Duration
+ OTLPEnabled bool
+ V2ReadsEnabled bool
+ IncidentsEnabled bool
+ IncidentsPersistent bool
+ IncidentRebuildSupported bool
}
// NewServer creates a new ingest server with the given configuration.
@@ -216,33 +221,36 @@ func NewServer(cfg ServerConfig) *Server {
startTime = time.Now()
}
s := &Server{
- store: cfg.Store,
- traceStore: cfg.TraceStore,
- builder: build.NewBuilder(),
- sampler: cfg.Sampler,
- metrics: cfg.Metrics,
- maxBodyBytes: maxBody,
- startTime: startTime,
- EventLogDir: cfg.EventLogDir,
- sampleRatePct: cfg.SampleRatePct,
- askProvider: cfg.AskProvider,
- askRegistry: cfg.AskRegistry,
- askMaxStepsDefault: cfg.AskMaxStepsDefault,
- askMaxStepsMax: cfg.AskMaxStepsMax,
- dashboardRefreshSec: cfg.DashboardRefreshSec,
- prometheusURL: cfg.PrometheusURL,
- grafanaURL: cfg.GrafanaURL,
- graphUI: cfg.GraphUI,
- dedupCache: cfg.DedupCache,
- agentKey: cfg.AgentKey,
- trustProxy: cfg.TrustProxy,
- coldWriter: cfg.ColdWriter,
- coldStore: cfg.ColdStore,
- planStore: cfg.PlanStore,
- graphHotWindow: cfg.GraphHotWindow,
- otlpEnabled: cfg.OTLPEnabled,
- v2ReadsEnabled: cfg.V2ReadsEnabled,
- replayStatus: "none",
+ store: cfg.Store,
+ traceStore: cfg.TraceStore,
+ builder: build.NewBuilder(),
+ sampler: cfg.Sampler,
+ metrics: cfg.Metrics,
+ maxBodyBytes: maxBody,
+ startTime: startTime,
+ EventLogDir: cfg.EventLogDir,
+ sampleRatePct: cfg.SampleRatePct,
+ askProvider: cfg.AskProvider,
+ askRegistry: cfg.AskRegistry,
+ askMaxStepsDefault: cfg.AskMaxStepsDefault,
+ askMaxStepsMax: cfg.AskMaxStepsMax,
+ dashboardRefreshSec: cfg.DashboardRefreshSec,
+ prometheusURL: cfg.PrometheusURL,
+ grafanaURL: cfg.GrafanaURL,
+ graphUI: cfg.GraphUI,
+ dedupCache: cfg.DedupCache,
+ agentKey: cfg.AgentKey,
+ trustProxy: cfg.TrustProxy,
+ coldWriter: cfg.ColdWriter,
+ coldStore: cfg.ColdStore,
+ planStore: cfg.PlanStore,
+ graphHotWindow: cfg.GraphHotWindow,
+ otlpEnabled: cfg.OTLPEnabled,
+ v2ReadsEnabled: cfg.V2ReadsEnabled,
+ incidentsEnabled: cfg.IncidentsEnabled,
+ incidentsPersistent: cfg.IncidentsPersistent,
+ incidentsRebuildSupported: cfg.IncidentRebuildSupported,
+ replayStatus: "none",
}
if s.sampler == nil {
s.sampler = sampler.New(sampler.LoadConfigFromEnv())
@@ -559,19 +567,26 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
return
}
- askEnabled, model, toolMode := s.askCapabilityState()
+ askState := s.askCapabilityState()
hotWindow := s.effectiveGraphHotWindow()
_, hotWindowSource := runtimeGraphHotWindow()
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]any{
"ask": map[string]any{
- "enabled": askEnabled,
- "model": model,
- "tool_mode": toolMode,
+ "enabled": askState.AskEnabled,
+ "model": askState.Model,
+ "tool_mode": askState.ToolMode,
"max_steps_default": s.askMaxStepsDefault,
"max_steps_max": s.askMaxStepsMax,
},
+ "llm": map[string]any{
+ "provider": askState.Provider,
+ "model": askState.Model,
+ "tool_mode": askState.ToolMode,
+ "configured": askState.Configured,
+ "ask_enabled": askState.AskEnabled,
+ },
"dashboard": map[string]any{
"refresh_interval_sec": s.dashboardRefreshSec,
},
@@ -586,6 +601,19 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
"v2_reads": map[string]any{
"enabled": s.v2ReadsEnabled,
},
+ "incidents": map[string]any{
+ "enabled": s.incidentsEnabled,
+ "persistent": s.incidentsPersistent,
+ "rebuild": map[string]any{
+ "supported": s.incidentsRebuildSupported,
+ "scope": func() string {
+ if s.incidentsRebuildSupported {
+ return "hot-window"
+ }
+ return ""
+ }(),
+ },
+ },
"architecture": map[string]any{
"flattened": true,
"graph": map[string]any{
@@ -1785,40 +1813,42 @@ func normalizeJSONValue(v any) any {
return out
}
-func (s *Server) askProviderFromEnv() (llm.Provider, string, string, error) {
- key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
- if key == "" {
- key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
- }
- if key == "" {
- return nil, "", "", errors.New("gemini api key is not configured")
- }
+// askCapability is the LLM provider state that the capabilities handler
+// reports in its "ask" and "llm" JSON blocks.
+type askCapability struct {
+ // Provider is the provider name reported as "llm.provider".
+ Provider string
+ // Model is reported as both "ask.model" and "llm.model".
+ Model string
+ // ToolMode is reported as both "ask.tool_mode" and "llm.tool_mode".
+ ToolMode string
+ // Configured is reported as "llm.configured".
+ Configured bool
+ // AskEnabled is reported as both "ask.enabled" and "llm.ask_enabled".
+ AskEnabled bool
+}
- client := llm.NewGeminiClient(key)
- model := strings.TrimSpace(os.Getenv("GEMINI_MODEL"))
- base := strings.TrimSpace(os.Getenv("GEMINI_API_BASE"))
- mode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE"))
- if model != "" {
- client.Model = model
- }
- if base != "" {
- client.BaseURL = base
+func (s *Server) askProviderFromEnv() (llm.Provider, string, string, error) {
+ sel, err := llm.SelectFromEnv()
+ if err != nil {
+ return nil, "", "", err
}
- if mode != "" {
- client.ToolMode = mode
+ if !sel.AskEnabled {
+ return nil, "", "", llm.ErrProviderNotConfigured
}
- return client, client.Model, client.ToolMode, nil
+ return sel.Impl, sel.Model, sel.ToolMode, nil
}
-func (s *Server) askCapabilityState() (bool, string, string) {
+// askCapabilityState reports current LLM provider state for /v1/capabilities.
+// When s.askProvider != nil (test injection), provider is reported as "custom".
+func (s *Server) askCapabilityState() askCapability {
if s.askProvider != nil {
- return true, "", ""
+ return askCapability{Provider: "custom", Configured: true, AskEnabled: true}
}
- _, model, toolMode, err := s.askProviderFromEnv()
+ sel, err := llm.SelectFromEnv()
if err != nil {
- return false, "", ""
+ return askCapability{Provider: "none"}
+ }
+ return askCapability{
+ Provider: sel.Provider,
+ Model: sel.Model,
+ ToolMode: sel.ToolMode,
+ Configured: sel.Configured,
+ AskEnabled: sel.AskEnabled,
}
- return true, model, toolMode
}
func attrToInt64(v any) int64 {
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 0c99c6c..0ed3768 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -349,6 +349,65 @@ func TestCapabilities_V2ReadsEnabled(t *testing.T) {
}
}
+// TestCapabilities_IncidentsBlock verifies that the "incidents" block of the
+// capabilities response mirrors the incident flags passed in ServerConfig,
+// including the rebuild scope ("hot-window" when rebuild is supported, empty
+// otherwise).
+func TestCapabilities_IncidentsBlock(t *testing.T) {
+	type rebuildBlock struct {
+		Supported bool   `json:"supported"`
+		Scope     string `json:"scope"`
+	}
+	type incidentsBlock struct {
+		Enabled    bool         `json:"enabled"`
+		Persistent bool         `json:"persistent"`
+		Rebuild    rebuildBlock `json:"rebuild"`
+	}
+
+	cases := []struct {
+		name             string
+		cfg              ServerConfig
+		wantEnabled      bool
+		wantPersistent   bool
+		wantRebuild      bool
+		wantRebuildScope string
+	}{
+		{name: "disabled"},
+		{
+			name: "sqlite enabled",
+			cfg: ServerConfig{
+				IncidentsEnabled:         true,
+				IncidentsPersistent:      true,
+				IncidentRebuildSupported: true,
+			},
+			wantEnabled:      true,
+			wantPersistent:   true,
+			wantRebuild:      true,
+			wantRebuildScope: "hot-window",
+		},
+		// NOTE(review): this case uses the same zero-value config as
+		// "disabled"; confirm whether it should set distinct flags.
+		{name: "requested but sqlite missing"},
+	}
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			srv := NewServer(tt.cfg)
+			req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+			rec := httptest.NewRecorder()
+			srv.Capabilities(rec, req)
+
+			var body struct {
+				Incidents incidentsBlock `json:"incidents"`
+			}
+			if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+				t.Fatalf("invalid json: %v", err)
+			}
+			got := body.Incidents
+			if got.Enabled != tt.wantEnabled {
+				t.Fatalf("enabled=%v want %v", got.Enabled, tt.wantEnabled)
+			}
+			if got.Persistent != tt.wantPersistent {
+				t.Fatalf("persistent=%v want %v", got.Persistent, tt.wantPersistent)
+			}
+			if got.Rebuild.Supported != tt.wantRebuild {
+				t.Fatalf("rebuild.supported=%v want %v", got.Rebuild.Supported, tt.wantRebuild)
+			}
+			if got.Rebuild.Scope != tt.wantRebuildScope {
+				t.Fatalf("rebuild.scope=%q want %q", got.Rebuild.Scope, tt.wantRebuildScope)
+			}
+		})
+	}
+}
+
const successTrace = "bbbb0000cccc1111dddd2222eeee3333"
func makeTestServerMixed() *Server {
@@ -1686,6 +1745,37 @@ func TestAsk_DedupSafetyNet_PreservesActualStatus(t *testing.T) {
}
}
+func TestAsk_MissingProviderMessageIsProviderAgnostic(t *testing.T) {
+ t.Setenv("WAYLOG_LLM_PROVIDER", "")
+ t.Setenv("GEMINI_API_KEY", "")
+ t.Setenv("GOOGLE_API_KEY", "")
+ srv := &Server{
+ store: graphstore.NewStore(),
+ maxBodyBytes: 1 << 20,
+ dedupCache: NewDedupCache(),
+ }
+ r := httptest.NewRequest("POST", "/v1/ask?envelope=v2", strings.NewReader(`{"prompt":"test"}`))
+ w := httptest.NewRecorder()
+ srv.Ask(w, r)
+
+ if w.Code != http.StatusServiceUnavailable {
+ t.Fatalf("status = %d, want 503", w.Code)
+ }
+ var resp APIResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("decode: %v", err)
+ }
+ if resp.Error == nil {
+ t.Fatalf("expected error response")
+ }
+ if got := resp.Error.Message; got != llm.ErrProviderNotConfigured.Error() {
+ t.Fatalf("message = %q, want %q", got, llm.ErrProviderNotConfigured.Error())
+ }
+ if strings.Contains(strings.ToLower(resp.Error.Message), "gemini") {
+ t.Fatalf("message should not pin Gemini: %q", resp.Error.Message)
+ }
+}
+
func TestToolCall_DedupSafetyNet_Exists(t *testing.T) {
dc := NewDedupCache()
reg := tools.NewRegistry()
@@ -1913,3 +2003,88 @@ func TestOverview_IncludesLatestFailedTraceID(t *testing.T) {
t.Fatal("overview response missing latest_failed_trace_id field")
}
}
+
+type stubAskProvider struct{}
+
+func (stubAskProvider) Generate(ctx context.Context, prompt string, tools []llm.ToolDefinition, history []llm.Turn) (llm.Result, error) {
+ return llm.Result{}, nil
+}
+
+func TestCapabilities_LLMBlock(t *testing.T) {
+ tests := []struct {
+ name string
+ env map[string]string
+ askProvider llm.Provider
+ wantProvider string
+ wantConfigured bool
+ wantAskEnabled bool
+ }{
+ {
+ name: "no env",
+ env: map[string]string{},
+ wantProvider: "none",
+ wantConfigured: false,
+ wantAskEnabled: false,
+ },
+ {
+ name: "gemini key set",
+ env: map[string]string{"WAYLOG_LLM_PROVIDER": "gemini", "GEMINI_API_KEY": "test-key"},
+ wantProvider: "gemini",
+ wantConfigured: true,
+ wantAskEnabled: true,
+ },
+ {
+ name: "custom injected provider",
+ env: map[string]string{},
+ askProvider: stubAskProvider{},
+ wantProvider: "custom",
+ wantConfigured: true,
+ wantAskEnabled: true,
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Setenv("WAYLOG_LLM_PROVIDER", "")
+ t.Setenv("WAYLOG_LLM_MODEL", "")
+ t.Setenv("GEMINI_API_KEY", "")
+ t.Setenv("GOOGLE_API_KEY", "")
+ t.Setenv("GEMINI_MODEL", "")
+ t.Setenv("GEMINI_API_BASE", "")
+ t.Setenv("GEMINI_TOOL_MODE", "")
+ for k, v := range tc.env {
+ t.Setenv(k, v)
+ }
+
+ srv := NewServer(ServerConfig{AskProvider: tc.askProvider})
+ req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+ w := httptest.NewRecorder()
+ srv.Capabilities(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want 200: %s", w.Code, w.Body.String())
+ }
+ var resp struct {
+ LLM struct {
+ Provider string `json:"provider"`
+ Model string `json:"model"`
+ ToolMode string `json:"tool_mode"`
+ Configured bool `json:"configured"`
+ AskEnabled bool `json:"ask_enabled"`
+ } `json:"llm"`
+ }
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("invalid json: %v", err)
+ }
+ if resp.LLM.Provider != tc.wantProvider {
+ t.Errorf("provider = %q, want %q", resp.LLM.Provider, tc.wantProvider)
+ }
+ if resp.LLM.Configured != tc.wantConfigured {
+ t.Errorf("configured = %v, want %v", resp.LLM.Configured, tc.wantConfigured)
+ }
+ if resp.LLM.AskEnabled != tc.wantAskEnabled {
+ t.Errorf("ask_enabled = %v, want %v", resp.LLM.AskEnabled, tc.wantAskEnabled)
+ }
+ })
+ }
+}
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
new file mode 100644
index 0000000..c75ea4e
--- /dev/null
+++ b/internal/llm/provider.go
@@ -0,0 +1,75 @@
+package llm
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "strings"
+)
+
+// ErrProviderNotConfigured is returned when Ask cannot construct a provider
+// from the current environment.
+var ErrProviderNotConfigured = errors.New("LLM provider not configured; set WAYLOG_LLM_PROVIDER and provider credentials")
+
+// Selection describes the resolved LLM provider state.
+type Selection struct {
+ Provider string
+ Model string
+ ToolMode string
+ Configured bool
+ AskEnabled bool
+ Impl Provider
+}
+
+// SelectFromEnv resolves the LLM provider from environment variables.
+//
+// WAYLOG_LLM_PROVIDER may be "none" or "gemini". When unset, a Gemini key
+// (GEMINI_API_KEY or GOOGLE_API_KEY) infers gemini; otherwise none.
+// Model precedence: WAYLOG_LLM_MODEL > GEMINI_MODEL > built-in default.
+func SelectFromEnv() (Selection, error) {
+ raw := strings.ToLower(strings.TrimSpace(os.Getenv("WAYLOG_LLM_PROVIDER")))
+ key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
+ if key == "" {
+ key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+ }
+
+ switch raw {
+ case "":
+ if key == "" {
+ return Selection{Provider: "none"}, nil
+ }
+ return buildGemini(key, true), nil
+ case "none":
+ return Selection{Provider: "none", Configured: true}, nil
+ case "gemini":
+ if key == "" {
+ return Selection{Provider: "gemini", Configured: false}, nil
+ }
+ return buildGemini(key, true), nil
+ default:
+ return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini", raw)
+ }
+}
+
+func buildGemini(key string, configured bool) Selection {
+ client := NewGeminiClient(key)
+ if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
+ client.Model = model
+ } else if model := strings.TrimSpace(os.Getenv("GEMINI_MODEL")); model != "" {
+ client.Model = model
+ }
+ if base := strings.TrimSpace(os.Getenv("GEMINI_API_BASE")); base != "" {
+ client.BaseURL = base
+ }
+ if mode := strings.TrimSpace(os.Getenv("GEMINI_TOOL_MODE")); mode != "" {
+ client.ToolMode = mode
+ }
+ return Selection{
+ Provider: "gemini",
+ Model: client.Model,
+ ToolMode: client.ToolMode,
+ Configured: configured,
+ AskEnabled: true,
+ Impl: client,
+ }
+}
diff --git a/internal/llm/provider_test.go b/internal/llm/provider_test.go
new file mode 100644
index 0000000..b73e466
--- /dev/null
+++ b/internal/llm/provider_test.go
@@ -0,0 +1,160 @@
+package llm
+
+import (
+ "strings"
+ "testing"
+)
+
+func clearProviderEnv(t *testing.T) {
+ t.Helper()
+ t.Setenv("WAYLOG_LLM_PROVIDER", "")
+ t.Setenv("WAYLOG_LLM_MODEL", "")
+ t.Setenv("GEMINI_API_KEY", "")
+ t.Setenv("GOOGLE_API_KEY", "")
+ t.Setenv("GEMINI_MODEL", "")
+ t.Setenv("GEMINI_API_BASE", "")
+ t.Setenv("GEMINI_TOOL_MODE", "")
+}
+
+func TestSelectFromEnv_NoEnv(t *testing.T) {
+ clearProviderEnv(t)
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Provider != "none" {
+ t.Errorf("Provider = %q, want %q", sel.Provider, "none")
+ }
+ if sel.Configured {
+ t.Error("Configured = true, want false")
+ }
+ if sel.AskEnabled {
+ t.Error("AskEnabled = true, want false")
+ }
+ if sel.Impl != nil {
+ t.Error("Impl != nil, want nil")
+ }
+}
+
+func TestSelectFromEnv_NoneExplicit(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("WAYLOG_LLM_PROVIDER", "none")
+ t.Setenv("GEMINI_API_KEY", "ignored")
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Provider != "none" {
+ t.Errorf("Provider = %q, want %q", sel.Provider, "none")
+ }
+ if !sel.Configured {
+ t.Error("Configured = false, want true")
+ }
+ if sel.AskEnabled {
+ t.Error("AskEnabled = true, want false")
+ }
+ if sel.Model != "" || sel.ToolMode != "" {
+ t.Errorf("model/tool mode should be empty for none, got %q/%q", sel.Model, sel.ToolMode)
+ }
+ if sel.Impl != nil {
+ t.Error("Impl != nil, want nil")
+ }
+}
+
+func TestSelectFromEnv_GeminiWithKey(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+ t.Setenv("GEMINI_API_KEY", "test-key")
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Provider != "gemini" {
+ t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+ }
+ if !sel.Configured {
+ t.Error("Configured = false, want true")
+ }
+ if !sel.AskEnabled {
+ t.Error("AskEnabled = false, want true")
+ }
+ if sel.Impl == nil {
+ t.Error("Impl = nil, want non-nil")
+ }
+ if sel.Model == "" {
+ t.Error("Model is empty, want default")
+ }
+}
+
+func TestSelectFromEnv_GeminiInferredFromKey(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("GOOGLE_API_KEY", "test-key")
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Provider != "gemini" {
+ t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+ }
+ if !sel.AskEnabled {
+ t.Error("AskEnabled = false, want true")
+ }
+}
+
+func TestSelectFromEnv_GeminiMissingKey(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Provider != "gemini" {
+ t.Errorf("Provider = %q, want %q", sel.Provider, "gemini")
+ }
+ if sel.Configured {
+ t.Error("Configured = true, want false")
+ }
+ if sel.AskEnabled {
+ t.Error("AskEnabled = true, want false")
+ }
+ if sel.Impl != nil {
+ t.Error("Impl != nil, want nil")
+ }
+}
+
+func TestSelectFromEnv_UnknownProvider(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+
+ _, err := SelectFromEnv()
+ if err == nil {
+ t.Fatal("expected error, got nil")
+ }
+ if !strings.Contains(err.Error(), "anthropic") {
+ t.Errorf("error %q should mention provider name", err.Error())
+ }
+ if !strings.Contains(err.Error(), "none, gemini") {
+ t.Errorf("error %q should list supported providers", err.Error())
+ }
+}
+
+func TestSelectFromEnv_WaylogModelOverridesGeminiModel(t *testing.T) {
+ clearProviderEnv(t)
+ t.Setenv("WAYLOG_LLM_PROVIDER", "gemini")
+ t.Setenv("GEMINI_API_KEY", "test-key")
+ t.Setenv("WAYLOG_LLM_MODEL", "foo")
+ t.Setenv("GEMINI_MODEL", "bar")
+
+ sel, err := SelectFromEnv()
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if sel.Model != "foo" {
+ t.Errorf("Model = %q, want %q", sel.Model, "foo")
+ }
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 6412a70..21d7680 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -78,6 +78,10 @@ type Metrics struct {
IncidentTickLatency prometheus.Histogram
IncidentActive prometheus.Gauge
IncidentClassifications *prometheus.CounterVec
+ IncidentRebuildDuration prometheus.Histogram
+ IncidentRebuildRows prometheus.Counter
+ IncidentRebuildFailures prometheus.Counter
+ IncidentRebuildReplayed prometheus.Counter
CausalRunsTotal prometheus.Counter
CausalRunDuration prometheus.Histogram
@@ -377,11 +381,28 @@ func New(reg *prometheus.Registry) *Metrics {
Name: "waylog_incident_classifications_total",
Help: "Incident classifications by cause and confidence.",
}, []string{"cause", "confidence"})
- for _, cause := range []string{"deploy", "app", "dependency", "unknown"} {
+ for _, cause := range []string{"deploy", "app", "dependency", "runtime", "unknown"} {
for _, confidence := range []string{"high", "medium", "low"} {
m.IncidentClassifications.WithLabelValues(cause, confidence).Add(0)
}
}
+ m.IncidentRebuildDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+ Name: "waylog_incident_rebuild_duration_seconds",
+ Help: "Startup hot-window incident rebuild duration.",
+ Buckets: defaultBuckets,
+ })
+ m.IncidentRebuildRows = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incident_rebuild_rows_replaced",
+ Help: "Incident rows replaced by startup hot-window rebuild.",
+ })
+ m.IncidentRebuildFailures = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incident_rebuild_failures_total",
+ Help: "Failed startup hot-window incident rebuild attempts.",
+ })
+ m.IncidentRebuildReplayed = prometheus.NewCounter(prometheus.CounterOpts{
+ Name: "waylog_incident_rebuild_replayed_events_total",
+ Help: "Schema-2.0 events replayed for startup hot-window incident rebuild.",
+ })
m.CausalRunsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "waylog_causal_runs_total",
@@ -462,6 +483,7 @@ func New(reg *prometheus.Registry) *Metrics {
m.SignalsAccepted, m.SignalsRejected, m.SignalRetentionPruned,
m.IncidentOpened, m.IncidentUpdated, m.IncidentRecovered, m.IncidentResolved,
m.IncidentTickLatency, m.IncidentActive, m.IncidentClassifications,
+ m.IncidentRebuildDuration, m.IncidentRebuildRows, m.IncidentRebuildFailures, m.IncidentRebuildReplayed,
m.CausalRunsTotal, m.CausalRunDuration, m.CausalRunFailures, m.CausalClaimsTotal,
m.OTLPRequestsTotal, m.OTLPSpansReceived, m.OTLPSpansConverted,
m.OTLPSpansDropped, m.OTLPValidationRejects, m.OTLPDecodeFailures,
From 23f92691c3cc51a3465664fc4263091c79c5fb28 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Fri, 8 May 2026 03:24:07 -0400
Subject: [PATCH 07/14] feat: added triage plan template and multi-provider ask
Shipped the M1.5 triage plan shorthand and M2.5 provider layer.
M1.5:
- add `template` + `params` request support to `/v1/plans/execute`
- add built-in `template: "triage"` expansion to a single `triage_incident` step
- preserve existing plan validation, idempotency, `X-Plan-ID`, and SSE progress
- reject unknown templates, missing `params.incident_id`, and mixed `steps`/`template` bodies
- document the triage template response shape at `steps[0].result`
M2.5:
- add Anthropic provider via the Messages API
- add OpenAI provider via the Responses API
- extend `WAYLOG_LLM_PROVIDER` to `none`, `gemini`, `anthropic`, and `openai`
- add Anthropic/OpenAI env selection, model overrides, API base overrides, and missing-key behavior
- preserve OpenAI `call_id` and raw response output items through tool-call follow-up requests
- update OpenAI default model to `gpt-5.4-mini`
- prove triage report hashes do not depend on selected LLM provider
Docs:
- update README with triage plan shorthand and provider list
- update env docs for Anthropic/OpenAI keys and model variables
- update OpenAPI for `/v1/plans/execute`, `PlanResult`, and provider enum
Validation:
- go test ./internal/llm/... ./internal/ingest/... ./internal/triage/...
- go test ./...
- make ci
- make demo + make demo-acceptance
---
README.md | 6 +-
docs/env.md | 12 +-
docs/openapi.yaml | 111 ++++++++++++-
internal/ingest/handler.go | 20 ++-
internal/ingest/handler_test.go | 118 ++++++++++++++
internal/ingest/plan.go | 51 ++++++
internal/ingest/plan_test.go | 59 +++++++
internal/llm/anthropic.go | 198 +++++++++++++++++++++++
internal/llm/anthropic_test.go | 117 ++++++++++++++
internal/llm/openai.go | 239 ++++++++++++++++++++++++++++
internal/llm/openai_test.go | 203 +++++++++++++++++++++++
internal/llm/provider.go | 83 ++++++++--
internal/llm/provider_test.go | 138 +++++++++++++++-
internal/llm/types.go | 7 +-
internal/triage/idempotency_test.go | 34 ++++
15 files changed, 1368 insertions(+), 28 deletions(-)
create mode 100644 internal/llm/anthropic.go
create mode 100644 internal/llm/anthropic_test.go
create mode 100644 internal/llm/openai.go
create mode 100644 internal/llm/openai_test.go
diff --git a/README.md b/README.md
index 66893ff..6871fc1 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,8 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
+
| Tool | Answers |
| ------------------ | ------------------------------------------------------------- |
| `graph_stats` | Overall shape of the graph right now |
@@ -298,12 +300,12 @@ Public alpha. APIs may break before 1.0.
- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
- SQLite cold store (events, deployments, signals, incidents, causal claims)
- signal-driven incident engine with `waylog incidents`, `waylog incident `, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
-- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER`; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
- 11 deterministic analysis tools, rollup-correct root-cause attribution
- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
- `/v1/traces/story` and indented failure-path rendering in the dashboard
- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
+- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `triage`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
- live TUI (`waylog-live --dev` streams via SSE), MCP stdio
- scoped auth (write/read/agent) with startup validation
diff --git a/docs/env.md b/docs/env.md
index 58953f6..65fa480 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -6,7 +6,7 @@ Reference for configuring the Waylog ingest server and SDK. All variables are re
| Variable | Purpose |
|---|---|
-| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. For the current Gemini provider, set `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
+| Provider credentials | Required only when natural-language Ask should call a configured LLM provider. Set the matching key for the selected provider: `GEMINI_API_KEY` or `GOOGLE_API_KEY`, `ANTHROPIC_API_KEY`, or `OPENAI_API_KEY` |
## LLM provider
@@ -14,11 +14,17 @@ Deterministic tools, plans, triage, MCP, and read APIs do not require an LLM pro
| Variable | Default | Purpose |
|---|---|---|
-| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values in M2: `none`, `gemini` |
-| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. For Gemini, this takes precedence over `GEMINI_MODEL` |
+| `WAYLOG_LLM_PROVIDER` | `none` unless a supported provider key is present | LLM provider for Ask. Supported values: `none`, `gemini`, `anthropic`, `openai` |
+| `WAYLOG_LLM_MODEL` | provider default | Provider-neutral model override. Takes precedence over provider-specific model variables |
| `GEMINI_MODEL` | `gemini-2.5-flash` | Gemini-specific model override when `WAYLOG_LLM_MODEL` is unset |
| `GEMINI_API_BASE` | Gemini API default | Gemini-specific API base URL override |
| `GEMINI_TOOL_MODE` | `text` | Gemini-specific tool-calling mode |
+| `ANTHROPIC_API_KEY` | — | Anthropic API key for `WAYLOG_LLM_PROVIDER=anthropic` |
+| `ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `ANTHROPIC_API_BASE` | Anthropic API default | Anthropic-specific API base URL override |
+| `OPENAI_API_KEY` | — | OpenAI API key for `WAYLOG_LLM_PROVIDER=openai` |
+| `OPENAI_MODEL` | `gpt-5.4-mini` | OpenAI-specific model override when `WAYLOG_LLM_MODEL` is unset |
+| `OPENAI_API_BASE` | OpenAI API default | OpenAI-specific API base URL override |
## Auth
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 23f1e8c..7162edb 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -741,6 +741,69 @@ paths:
'503':
description: LLM provider unavailable
+ /v1/plans/execute:
+ post:
+ tags: [Operational]
+ operationId: executePlan
+ summary: Execute a deterministic tool plan
+ description: |
+ Executes explicit plan steps or a built-in template shorthand. The
+ `triage` template expands to one `triage_incident` tool step; the
+ resulting TriageReport is returned at `steps[0].result`.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - type: object
+ required: [steps]
+ properties:
+ steps:
+ type: array
+ minItems: 1
+ maxItems: 10
+ items:
+ type: object
+ required: [id, tool]
+ properties:
+ id: {type: string}
+ tool: {type: string}
+ params:
+ type: object
+ additionalProperties: true
+ - type: object
+ required: [template, params]
+ properties:
+ template:
+ type: string
+ enum: [triage]
+ params:
+ type: object
+ required: [incident_id]
+ properties:
+ incident_id: {type: string}
+ window: {type: string, default: 15m}
+ snapshot: {type: boolean, default: false}
+ responses:
+ '200':
+ description: Plan execution result
+ headers:
+ X-Plan-ID:
+ schema: {type: string}
+ description: Plan ID for `/v1/stream/plans/{id}`.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/PlanResult'
+ '400':
+ description: Invalid plan or template parameters
+ '503':
+ description: Tool registry unavailable
+
/v1/tools/{name}:
post:
tags: [Operational]
@@ -1832,11 +1895,55 @@ components:
generated_at: {type: string}
plan_run_id:
type: string
- description: Set only when produced via /v1/plans/execute.
+ description: Reserved for future plan-produced reports; omitted by the M1.5 triage template.
report_hash:
type: string
description: "sha256:"
+ PlanResult:
+ type: object
+ required: [plan_id, steps, completed, total, status]
+ properties:
+ plan_id: {type: string}
+ steps:
+ type: array
+ items:
+ type: object
+ required: [id, index, tool, duration_ms]
+ properties:
+ id: {type: string}
+ index: {type: integer}
+ tool: {type: string}
+ result:
+ description: Tool output. For the triage template this is a TriageReport.
+ oneOf:
+ - $ref: '#/components/schemas/TriageReport'
+ - type: object
+ additionalProperties: true
+ error:
+ type: object
+ nullable: true
+ properties:
+ code: {type: string}
+ message: {type: string}
+ retryable: {type: boolean}
+ duration_ms: {type: integer}
+ completed: {type: integer}
+ total: {type: integer}
+ status:
+ type: string
+ enum: [complete, partial, failed]
+ halted_at:
+ type: integer
+ nullable: true
+ error:
+ type: object
+ nullable: true
+ properties:
+ code: {type: string}
+ message: {type: string}
+ retryable: {type: boolean}
+
CapabilitiesResponse:
type: object
example:
@@ -1856,7 +1963,7 @@ components:
provider:
type: string
description: Resolved Ask provider. `custom` is used for injected providers.
- enum: [none, gemini, custom]
+ enum: [none, gemini, anthropic, openai, custom]
model:
type: string
description: Resolved model for the selected provider, or empty when Ask is disabled.
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index e6150a1..6bf181d 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -2285,9 +2285,7 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
}()
}
- var req struct {
- Steps []PlanStep `json:"steps"`
- }
+ var req PlanExecuteRequest
if err := json.Unmarshal(body, &req); err != nil {
if dedupIsExecutor {
dedupCompleted = true
@@ -2298,6 +2296,8 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
return
}
+ steps, expandErr := ExpandPlanRequest(req)
+
registry := s.askRegistry
if registry == nil {
if dedupIsExecutor {
@@ -2309,7 +2309,17 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
return
}
- if errs := ValidatePlan(req.Steps, registry); len(errs) > 0 {
+ if expandErr != nil {
+ if dedupIsExecutor {
+ dedupCompleted = true
+ s.dedupCache.Complete(r.Method, r.URL.Path, s.dedupPrincipal(r), idempKey, body,
+ http.StatusBadRequest, nil, &APIError{Code: "INVALID_PLAN", Message: expandErr.Error()}, time.Since(start).Milliseconds())
+ }
+ respondError(w, r, http.StatusBadRequest, "INVALID_PLAN", expandErr.Error(), false, APIMeta{RequestID: reqID})
+ return
+ }
+
+ if errs := ValidatePlan(steps, registry); len(errs) > 0 {
msg := strings.Join(errs, "; ")
if dedupIsExecutor {
dedupCompleted = true
@@ -2325,7 +2335,7 @@ func (s *Server) PlanExecute(w http.ResponseWriter, r *http.Request) {
planID = s.planStore.Create()
}
- result := s.executePlanWithProgress(r.Context(), req.Steps, registry, planID)
+ result := s.executePlanWithProgress(r.Context(), steps, registry, planID)
if result.PlanID != "" {
w.Header().Set("X-Plan-ID", result.PlanID)
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 0ed3768..28099a7 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -1714,6 +1714,124 @@ func TestToolCall_InvalidJSON_EnvelopeError(t *testing.T) {
}
}
+func TestPlanExecute_TriageTemplateExecutesAsPlan(t *testing.T) {
+ reg := tools.NewRegistry()
+ if err := reg.Register(tools.Tool{
+ Name: "triage_incident",
+ Description: "test triage",
+ InputSchema: json.RawMessage(`{
+ "type":"object",
+ "required":["incident_id"],
+ "properties":{
+ "incident_id":{"type":"string"},
+ "window":{"type":"string"},
+ "snapshot":{"type":"boolean"}
+ }
+ }`),
+ Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+ var got struct {
+ IncidentID string `json:"incident_id"`
+ Window string `json:"window"`
+ Snapshot bool `json:"snapshot"`
+ }
+ if err := json.Unmarshal(params, &got); err != nil {
+ return nil, err
+ }
+ return map[string]any{
+ "schema_version": "triage.v1",
+ "incident_ref": map[string]string{"id": got.IncidentID, "window": got.Window},
+ "report_hash": "sha256:test",
+ "snapshot": got.Snapshot,
+ }, nil
+ },
+ }); err != nil {
+ t.Fatalf("register: %v", err)
+ }
+ ps := NewPlanStore()
+ srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg, planStore: ps}
+ body := `{"template":"triage","params":{"incident_id":"inc_abc","window":"15m","snapshot":true}}`
+ r := httptest.NewRequest(http.MethodPost, "/v1/plans/execute", strings.NewReader(body))
+ r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
+ w := httptest.NewRecorder()
+ srv.PlanExecute(w, r)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status = %d, want 200; body=%s", w.Code, w.Body.String())
+ }
+ if w.Header().Get("X-Plan-ID") == "" {
+ t.Fatalf("missing X-Plan-ID")
+ }
+ var result PlanResult
+ if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
+ t.Fatalf("decode: %v", err)
+ }
+ if result.Status != "complete" || result.Completed != 1 || result.Total != 1 {
+ t.Fatalf("result status = %+v", result)
+ }
+ if result.Steps[0].ID != "triage" || result.Steps[0].Tool != "triage_incident" {
+ t.Fatalf("step = %+v", result.Steps[0])
+ }
+ raw, err := json.Marshal(result.Steps[0].Result)
+ if err != nil {
+ t.Fatalf("marshal result: %v", err)
+ }
+ var rep struct {
+ ReportHash string `json:"report_hash"`
+ IncidentRef struct {
+ ID string `json:"id"`
+ } `json:"incident_ref"`
+ Snapshot bool `json:"snapshot"`
+ }
+ if err := json.Unmarshal(raw, &rep); err != nil {
+ t.Fatalf("decode report: %v", err)
+ }
+ if rep.ReportHash != "sha256:test" || rep.IncidentRef.ID != "inc_abc" || !rep.Snapshot {
+ t.Fatalf("report = %+v", rep)
+ }
+ entry, ok := ps.Get(result.PlanID)
+ if !ok || len(entry.Events) < 3 {
+ t.Fatalf("expected SSE event log with start/complete/done, got ok=%v entry=%+v", ok, entry)
+ }
+}
+
+func TestPlanExecute_TemplateValidationErrors(t *testing.T) {
+ reg := tools.NewRegistry()
+ if err := reg.Register(tools.Tool{
+ Name: "triage_incident",
+ Description: "test triage",
+ InputSchema: json.RawMessage(`{"type":"object","required":["incident_id"],"properties":{"incident_id":{"type":"string"}}}`),
+ Handler: func(ctx context.Context, store tools.Store, params json.RawMessage) (any, error) {
+ return map[string]string{"ok": "true"}, nil
+ },
+ }); err != nil {
+ t.Fatalf("register: %v", err)
+ }
+ srv := &Server{store: graphstore.NewStore(), maxBodyBytes: 1 << 20, askRegistry: reg}
+ cases := map[string]string{
+ "unknown template": `{"template":"bogus","params":{"incident_id":"inc_abc"}}`,
+ "missing incident id": `{"template":"triage","params":{"snapshot":true}}`,
+ "steps and template both": `{"steps":[{"id":"x","tool":"triage_incident","params":{"incident_id":"inc_abc"}}],"template":"triage","params":{"incident_id":"inc_abc"}}`,
+ }
+ for name, body := range cases {
+ t.Run(name, func(t *testing.T) {
+ r := httptest.NewRequest(http.MethodPost, "/v1/plans/execute?envelope=v2", strings.NewReader(body))
+ r = r.WithContext(ContextWithRequestID(r.Context(), "req_test"))
+ w := httptest.NewRecorder()
+ srv.PlanExecute(w, r)
+ if w.Code != http.StatusBadRequest {
+ t.Fatalf("status = %d, want 400; body=%s", w.Code, w.Body.String())
+ }
+ var resp APIResponse
+ if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+ t.Fatalf("decode: %v", err)
+ }
+ if resp.Error == nil || resp.Error.Code != "INVALID_PLAN" {
+ t.Fatalf("error = %+v, want INVALID_PLAN", resp.Error)
+ }
+ })
+ }
+}
+
func TestAsk_DedupSafetyNet_PreservesActualStatus(t *testing.T) {
dc := NewDedupCache()
srv := &Server{
diff --git a/internal/ingest/plan.go b/internal/ingest/plan.go
index 278d372..b3d6237 100644
--- a/internal/ingest/plan.go
+++ b/internal/ingest/plan.go
@@ -15,6 +15,14 @@ type PlanStep struct {
Params json.RawMessage `json:"params"`
}
+// PlanExecuteRequest is the accepted body for POST /v1/plans/execute.
+// Callers may provide explicit steps or a built-in template shorthand.
+type PlanExecuteRequest struct {
+ Steps []PlanStep `json:"steps"`
+ Template string `json:"template"`
+ Params *json.RawMessage `json:"params"`
+}
+
// PlanStepError is a structured error for a plan step.
type PlanStepError struct {
Code string `json:"code"`
@@ -43,6 +51,49 @@ type PlanResult struct {
Error *PlanStepError `json:"error,omitempty"`
}
+// ExpandPlanRequest turns a supported template shorthand into ordinary plan steps.
+func ExpandPlanRequest(req PlanExecuteRequest) ([]PlanStep, error) {
+ if len(req.Steps) > 0 && strings.TrimSpace(req.Template) != "" {
+ return nil, fmt.Errorf("provide either steps or template, not both")
+ }
+ if strings.TrimSpace(req.Template) == "" {
+ return req.Steps, nil
+ }
+
+ switch strings.TrimSpace(req.Template) {
+ case "triage":
+ return expandTriagePlan(req.Params)
+ default:
+ return nil, fmt.Errorf("unknown plan template %q", req.Template)
+ }
+}
+
+func expandTriagePlan(raw *json.RawMessage) ([]PlanStep, error) {
+ if raw == nil || len(*raw) == 0 {
+ return nil, fmt.Errorf("triage template requires params.incident_id")
+ }
+ var params struct {
+ IncidentID string `json:"incident_id"`
+ Window string `json:"window,omitempty"`
+ Snapshot bool `json:"snapshot"`
+ }
+ if err := json.Unmarshal(*raw, &params); err != nil {
+ return nil, fmt.Errorf("triage template params: %w", err)
+ }
+ if strings.TrimSpace(params.IncidentID) == "" {
+ return nil, fmt.Errorf("triage template requires params.incident_id")
+ }
+ stepParams, err := json.Marshal(params)
+ if err != nil {
+ return nil, fmt.Errorf("triage template params: %w", err)
+ }
+ return []PlanStep{{
+ ID: "triage",
+ Tool: "triage_incident",
+ Params: stepParams,
+ }}, nil
+}
+
// statusForHalt returns the appropriate status string for a halted plan.
func statusForHalt(haltedIdx int) string {
if haltedIdx == 0 {
diff --git a/internal/ingest/plan_test.go b/internal/ingest/plan_test.go
index 893d046..b1effe9 100644
--- a/internal/ingest/plan_test.go
+++ b/internal/ingest/plan_test.go
@@ -289,3 +289,62 @@ func TestValidatePlan_Valid(t *testing.T) {
t.Fatalf("expected valid plan to have no errors, got: %v", errs)
}
}
+
+// TestExpandPlanRequest_TriageTemplate checks that the "triage" template
+// expands to a single triage_incident step whose params round-trip the
+// incident id, window and snapshot flag.
+func TestExpandPlanRequest_TriageTemplate(t *testing.T) {
+	params := json.RawMessage(`{"incident_id":"inc_abc","window":"30m","snapshot":true}`)
+	steps, err := ExpandPlanRequest(PlanExecuteRequest{
+		Template: "triage",
+		Params:   &params, // was garbled to "¶ms"
+	})
+	if err != nil {
+		t.Fatalf("expand: %v", err)
+	}
+	if len(steps) != 1 {
+		t.Fatalf("len(steps) = %d, want 1", len(steps))
+	}
+	step := steps[0]
+	if step.ID != "triage" || step.Tool != "triage_incident" {
+		t.Fatalf("step = %+v, want triage/triage_incident", step)
+	}
+	var got struct {
+		IncidentID string `json:"incident_id"`
+		Window     string `json:"window"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(step.Params, &got); err != nil {
+		t.Fatalf("decode params: %v", err)
+	}
+	if got.IncidentID != "inc_abc" || got.Window != "30m" || !got.Snapshot {
+		t.Fatalf("params = %+v", got)
+	}
+}
+
+// TestExpandPlanRequest_RejectsInvalidTemplateInput checks that
+// ExpandPlanRequest errors when steps and a template are combined, when the
+// template is unknown, and when the triage params lack an incident id.
+func TestExpandPlanRequest_RejectsInvalidTemplateInput(t *testing.T) {
+	params := json.RawMessage(`{"incident_id":"inc_abc"}`)
+	// "&params" was garbled to "¶ms" in the two cases below.
+	cases := map[string]PlanExecuteRequest{
+		"both steps and template": {
+			Steps:    []PlanStep{{ID: "x", Tool: "graph_stats"}},
+			Template: "triage",
+			Params:   &params,
+		},
+		"unknown template": {
+			Template: "unknown",
+			Params:   &params,
+		},
+		"missing incident id": {
+			Template: "triage",
+			Params:   ptrRawMessage(json.RawMessage(`{"snapshot":true}`)),
+		},
+	}
+	for name, req := range cases {
+		t.Run(name, func(t *testing.T) {
+			if _, err := ExpandPlanRequest(req); err == nil {
+				t.Fatalf("expected error")
+			}
+		})
+	}
+}
+
+// ptrRawMessage returns a pointer to a copy of raw's slice header, for
+// building *json.RawMessage fields from literals in tests.
+func ptrRawMessage(raw json.RawMessage) *json.RawMessage {
+	p := new(json.RawMessage)
+	*p = raw
+	return p
+}
diff --git a/internal/llm/anthropic.go b/internal/llm/anthropic.go
new file mode 100644
index 0000000..94cd300
--- /dev/null
+++ b/internal/llm/anthropic.go
@@ -0,0 +1,198 @@
+package llm
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+)
+
+// Built-in Anthropic settings: the model and API root used when the client
+// leaves them blank, and the value sent in the anthropic-version header.
+const (
+	defaultAnthropicModel   = "claude-sonnet-4-6"
+	defaultAnthropicBaseURL = "https://api.anthropic.com/v1"
+	anthropicVersion        = "2023-06-01"
+)
+
+// AnthropicClient calls the Anthropic Messages endpoint over HTTP.
+//
+// Generate and buildRequest fall back to built-in defaults when Model,
+// BaseURL or HTTPClient are left at their zero values.
+type AnthropicClient struct {
+ // APIKey is sent in the x-api-key header; Generate fails when it is empty.
+ APIKey string
+ // Model is the model name placed in the request body.
+ Model string
+ // BaseURL is the API root; "/messages" is appended after trimming trailing slashes.
+ BaseURL string
+ // HTTPClient performs the request; nil means a client with a 30-second timeout.
+ HTTPClient *http.Client
+}
+
+// NewAnthropicClient returns a client for apiKey preset with the default
+// model and base URL. HTTPClient is left nil.
+func NewAnthropicClient(apiKey string) *AnthropicClient {
+	c := new(AnthropicClient)
+	c.APIKey = apiKey
+	c.Model = defaultAnthropicModel
+	c.BaseURL = defaultAnthropicBaseURL
+	return c
+}
+
+// Generate sends prompt, tools and history to the Anthropic Messages endpoint
+// and returns the parsed text and tool calls.
+//
+// An empty APIKey or a request that cannot be built yields a plain error.
+// Transport, body-read and non-2xx failures are returned as *ProviderError.
+// Transport and read failures are retryable unless ctx is already done,
+// because a retry with the same cancelled or expired context cannot succeed.
+// HTTP statuses are retryable only for 429 and 5xx.
+func (c *AnthropicClient) Generate(ctx context.Context, prompt string, tools []ToolDefinition, history []Turn) (Result, error) {
+	if c.APIKey == "" {
+		return Result{}, fmt.Errorf("anthropic api key required")
+	}
+	reqBody, err := c.buildRequest(prompt, tools, history)
+	if err != nil {
+		return Result{}, err
+	}
+
+	baseURL := c.BaseURL
+	if baseURL == "" {
+		baseURL = defaultAnthropicBaseURL
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/messages", bytes.NewReader(reqBody))
+	if err != nil {
+		return Result{}, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("x-api-key", c.APIKey)
+	req.Header.Set("anthropic-version", anthropicVersion)
+
+	client := c.HTTPClient
+	if client == nil {
+		client = &http.Client{Timeout: 30 * time.Second}
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		// ctx.Err() is non-nil only when the caller cancelled or the caller's
+		// deadline passed; the client's own 30s timeout still counts as retryable.
+		return Result{}, &ProviderError{Provider: "anthropic", Retryable: ctx.Err() == nil, Message: err.Error(), Cause: err}
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "anthropic", Retryable: ctx.Err() == nil, Message: err.Error(), Cause: err}
+	}
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return Result{}, &ProviderError{
+			Provider:   "anthropic",
+			StatusCode: resp.StatusCode,
+			Retryable:  resp.StatusCode == 429 || resp.StatusCode >= 500,
+			Message:    string(body),
+		}
+	}
+	return parseAnthropicResponse(body)
+}
+
+// anthropicRequest is the JSON body that buildRequest marshals and Generate
+// posts to the /messages path.
+type anthropicRequest struct {
+ Model string `json:"model"`
+ // MaxTokens is always set to 1024 by buildRequest.
+ MaxTokens int `json:"max_tokens"`
+ // Messages starts with the user prompt, followed by the replayed history.
+ Messages []anthropicMessage `json:"messages"`
+ // Tools is omitted from the JSON when no tool definitions were supplied.
+ Tools []anthropicTool `json:"tools,omitempty"`
+}
+
+// anthropicMessage is one entry of the request's messages array.
+type anthropicMessage struct {
+ // Role is "user" or "assistant" in the messages buildRequest produces.
+ Role string `json:"role"`
+ // Content is either a plain string (prompt, assistant text) or a
+ // []anthropicContentBlock (tool_use / tool_result), hence the any type.
+ Content any `json:"content"`
+}
+
+// anthropicTool is one tool definition in the request's tools array.
+type anthropicTool struct {
+ Name string `json:"name"`
+ Description string `json:"description,omitempty"`
+ // InputSchema is the tool's schema as raw JSON; buildRequest substitutes
+ // {"type":"object"} when the definition has none.
+ InputSchema json.RawMessage `json:"input_schema"`
+}
+
+// anthropicContentBlock is a typed content block, used both in requests built
+// by buildRequest and in responses read by parseAnthropicResponse. Which
+// fields are populated depends on Type; empty fields are omitted from JSON.
+type anthropicContentBlock struct {
+ // Type is "text", "tool_use" or "tool_result" in the code that uses this struct.
+ Type string `json:"type"`
+ // Text is the body of a "text" block.
+ Text string `json:"text,omitempty"`
+ // ID, Name and Input describe a "tool_use" block.
+ ID string `json:"id,omitempty"`
+ Name string `json:"name,omitempty"`
+ Input json.RawMessage `json:"input,omitempty"`
+ // ToolUseID and Content describe a "tool_result" block; Content is the
+ // JSON-encoded tool result carried as a string.
+ ToolUseID string `json:"tool_use_id,omitempty"`
+ Content string `json:"content,omitempty"`
+}
+
+// buildRequest encodes prompt, tools and history as a Messages request body.
+//
+// The prompt becomes the first user message. Each history turn is appended in
+// order: a tool call as an assistant tool_use block with a synthesized id, a
+// tool result as a user tool_result block, and plain text as an assistant
+// message. Turns with none of the three are skipped.
+//
+// Results are paired with outstanding calls first-in first-out, so both
+// "call, result, call, result" and "call, call, result, result" histories give
+// every tool_use its own tool_result. A result with no outstanding call
+// reuses the most recent id, or "toolu_waylog_0" when no call came before it.
+//
+// It returns an error only when a tool result or the request cannot be
+// marshaled.
+func (c *AnthropicClient) buildRequest(prompt string, tools []ToolDefinition, history []Turn) ([]byte, error) {
+	model := c.Model
+	if model == "" {
+		model = defaultAnthropicModel
+	}
+	req := anthropicRequest{
+		Model:     model,
+		MaxTokens: 1024,
+		Messages:  []anthropicMessage{{Role: "user", Content: prompt}},
+	}
+	for _, t := range tools {
+		schema := t.InputSchema
+		if len(schema) == 0 {
+			schema = json.RawMessage(`{"type":"object"}`)
+		}
+		req.Tools = append(req.Tools, anthropicTool{
+			Name:        t.Name,
+			Description: t.Description,
+			InputSchema: schema,
+		})
+	}
+
+	nextToolID := 1
+	lastToolID := ""
+	var pending []string // ids of tool calls that have not received a result yet
+	for _, turn := range history {
+		switch {
+		case turn.ToolCall != nil:
+			toolID := fmt.Sprintf("toolu_waylog_%d", nextToolID)
+			nextToolID++
+			lastToolID = toolID
+			pending = append(pending, toolID)
+			input := turn.ToolCall.Arguments
+			if len(input) == 0 {
+				input = json.RawMessage(`{}`)
+			}
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role: "assistant",
+				Content: []anthropicContentBlock{{
+					Type:  "tool_use",
+					ID:    toolID,
+					Name:  turn.ToolCall.Name,
+					Input: input,
+				}},
+			})
+		case turn.ToolResult != nil:
+			toolID := lastToolID
+			switch {
+			case len(pending) > 0:
+				// Answer the oldest unanswered call.
+				toolID, pending = pending[0], pending[1:]
+			case toolID == "":
+				toolID = "toolu_waylog_0"
+			}
+			payload, err := json.Marshal(turn.ToolResult.Result)
+			if err != nil {
+				return nil, fmt.Errorf("anthropic: marshal tool result: %w", err)
+			}
+			req.Messages = append(req.Messages, anthropicMessage{
+				Role: "user",
+				Content: []anthropicContentBlock{{
+					Type:      "tool_result",
+					ToolUseID: toolID,
+					Content:   string(payload),
+				}},
+			})
+		case turn.Text != "":
+			req.Messages = append(req.Messages, anthropicMessage{Role: "assistant", Content: turn.Text})
+		}
+	}
+
+	return json.Marshal(req)
+}
+
+// anthropicResponse is the part of a Messages response body that
+// parseAnthropicResponse reads: the list of content blocks.
+type anthropicResponse struct {
+ Content []anthropicContentBlock `json:"content"`
+}
+
+// parseAnthropicResponse decodes a Messages response body into a Result.
+//
+// Text blocks are concatenated in order into Result.Text. Every tool_use block
+// becomes a ToolCall, with empty input replaced by "{}". Blocks of any other
+// type are ignored. The only error is a JSON decoding failure.
+func parseAnthropicResponse(body []byte) (Result, error) {
+	var decoded anthropicResponse
+	if err := json.Unmarshal(body, &decoded); err != nil {
+		return Result{}, err
+	}
+	var (
+		text  strings.Builder
+		calls []ToolCall
+	)
+	for i := range decoded.Content {
+		block := &decoded.Content[i]
+		if block.Type == "text" {
+			text.WriteString(block.Text)
+			continue
+		}
+		if block.Type != "tool_use" {
+			continue
+		}
+		input := block.Input
+		if len(input) == 0 {
+			input = json.RawMessage(`{}`)
+		}
+		calls = append(calls, ToolCall{Name: block.Name, Arguments: input})
+	}
+	return Result{Text: text.String(), ToolCalls: calls}, nil
+}
diff --git a/internal/llm/anthropic_test.go b/internal/llm/anthropic_test.go
new file mode 100644
index 0000000..b535744
--- /dev/null
+++ b/internal/llm/anthropic_test.go
@@ -0,0 +1,117 @@
+package llm
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+)
+
+// TestAnthropicGenerateSendsMessagesAndTools checks the request Generate posts
+// (path, auth and version headers, model, messages, tools) and that a text
+// reply is parsed into Result.Text.
+func TestAnthropicGenerateSendsMessagesAndTools(t *testing.T) {
+	var captured struct {
+		Model    string `json:"model"`
+		Messages []struct {
+			Role    string          `json:"role"`
+			Content json.RawMessage `json:"content"`
+		} `json:"messages"`
+		Tools []struct {
+			Name        string          `json:"name"`
+			InputSchema json.RawMessage `json:"input_schema"`
+		} `json:"tools"`
+	}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// This handler runs on the server's goroutine, where t.Fatalf (FailNow)
+		// must not be called; record failures with t.Errorf instead.
+		if r.URL.Path != "/messages" {
+			t.Errorf("path = %q, want /messages", r.URL.Path)
+		}
+		if got := r.Header.Get("x-api-key"); got != "test-key" {
+			t.Errorf("x-api-key = %q", got)
+		}
+		if got := r.Header.Get("anthropic-version"); got != anthropicVersion {
+			t.Errorf("anthropic-version = %q", got)
+		}
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Errorf("decode request: %v", err)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"content":[{"type":"text","text":"done"}]}`))
+	}))
+	defer srv.Close()
+
+	client := NewAnthropicClient("test-key")
+	client.BaseURL = srv.URL
+	client.Model = "claude-test"
+	res, err := client.Generate(context.Background(), "hello", []ToolDefinition{{
+		Name:        "triage_incident",
+		Description: "triage",
+		InputSchema: json.RawMessage(`{"type":"object","properties":{"incident_id":{"type":"string"}}}`),
+	}}, nil)
+	if err != nil {
+		t.Fatalf("generate: %v", err)
+	}
+	if res.Text != "done" {
+		t.Fatalf("Text = %q, want done", res.Text)
+	}
+	if captured.Model != "claude-test" {
+		t.Fatalf("model = %q", captured.Model)
+	}
+	if len(captured.Messages) != 1 || captured.Messages[0].Role != "user" {
+		t.Fatalf("messages = %+v", captured.Messages)
+	}
+	if len(captured.Tools) != 1 || captured.Tools[0].Name != "triage_incident" {
+		t.Fatalf("tools = %+v", captured.Tools)
+	}
+}
+
+// TestParseAnthropicResponseToolUse feeds a response holding a text block and
+// a tool_use block to parseAnthropicResponse and checks the text, the tool
+// name and the decoded arguments.
+func TestParseAnthropicResponseToolUse(t *testing.T) {
+	res, err := parseAnthropicResponse([]byte(`{
+ "content": [
+ {"type":"text","text":"checking"},
+ {"type":"tool_use","id":"toolu_1","name":"triage_incident","input":{"incident_id":"inc_abc","snapshot":true}}
+ ]
+ }`))
+	switch {
+	case err != nil:
+		t.Fatalf("parse: %v", err)
+	case res.Text != "checking":
+		t.Fatalf("Text = %q", res.Text)
+	case len(res.ToolCalls) != 1:
+		t.Fatalf("len(ToolCalls) = %d", len(res.ToolCalls))
+	}
+	call := res.ToolCalls[0]
+	if call.Name != "triage_incident" {
+		t.Fatalf("tool name = %q", call.Name)
+	}
+	var args struct {
+		IncidentID string `json:"incident_id"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(call.Arguments, &args); err != nil {
+		t.Fatalf("args: %v", err)
+	}
+	if !(args.IncidentID == "inc_abc" && args.Snapshot) {
+		t.Fatalf("args = %+v", args)
+	}
+}
+
+// TestAnthropicGenerateAPIError checks that an HTTP 429 reply surfaces as a
+// retryable *ProviderError carrying the provider name and status code.
+func TestAnthropicGenerateAPIError(t *testing.T) {
+	rateLimited := func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "rate limited", http.StatusTooManyRequests)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(rateLimited))
+	defer srv.Close()
+
+	client := NewAnthropicClient("test-key")
+	client.BaseURL = srv.URL
+	_, err := client.Generate(context.Background(), "hello", nil, nil)
+	if err == nil {
+		t.Fatalf("expected error")
+	}
+	switch pe := err.(type) {
+	case *ProviderError:
+		if pe.Provider != "anthropic" || !pe.Retryable || pe.StatusCode != http.StatusTooManyRequests {
+			t.Fatalf("provider error = %+v", pe)
+		}
+	default:
+		t.Fatalf("err = %T, want *ProviderError", err)
+	}
+}
diff --git a/internal/llm/openai.go b/internal/llm/openai.go
new file mode 100644
index 0000000..3dc9019
--- /dev/null
+++ b/internal/llm/openai.go
@@ -0,0 +1,239 @@
+package llm
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+)
+
+// Built-in OpenAI settings used when the client leaves the model or API root
+// blank.
+const (
+	defaultOpenAIModel   = "gpt-5.4-mini"
+	defaultOpenAIBaseURL = "https://api.openai.com/v1"
+)
+
+// OpenAIClient calls the OpenAI Responses endpoint over HTTP.
+//
+// Generate and buildRequest fall back to built-in defaults when Model,
+// BaseURL or HTTPClient are left at their zero values.
+type OpenAIClient struct {
+ // APIKey is sent as a Bearer token; Generate fails when it is empty.
+ APIKey string
+ // Model is the model name placed in the request body.
+ Model string
+ // BaseURL is the API root; "/responses" is appended after trimming trailing slashes.
+ BaseURL string
+ // HTTPClient performs the request; nil means a client with a 30-second timeout.
+ HTTPClient *http.Client
+}
+
+// NewOpenAIClient returns a client for apiKey preset with the default model
+// and base URL. HTTPClient is left nil.
+func NewOpenAIClient(apiKey string) *OpenAIClient {
+	c := new(OpenAIClient)
+	c.APIKey = apiKey
+	c.Model = defaultOpenAIModel
+	c.BaseURL = defaultOpenAIBaseURL
+	return c
+}
+
+// Generate sends prompt, tools and history to the OpenAI Responses endpoint
+// and returns the parsed text and tool calls.
+//
+// An empty APIKey or a request that cannot be built yields a plain error.
+// Transport, body-read and non-2xx failures are returned as *ProviderError.
+// Transport and read failures are retryable unless ctx is already done,
+// because a retry with the same cancelled or expired context cannot succeed.
+// HTTP statuses are retryable only for 429 and 5xx.
+func (c *OpenAIClient) Generate(ctx context.Context, prompt string, tools []ToolDefinition, history []Turn) (Result, error) {
+	if c.APIKey == "" {
+		return Result{}, fmt.Errorf("openai api key required")
+	}
+	reqBody, err := c.buildRequest(prompt, tools, history)
+	if err != nil {
+		return Result{}, err
+	}
+
+	baseURL := c.BaseURL
+	if baseURL == "" {
+		baseURL = defaultOpenAIBaseURL
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/responses", bytes.NewReader(reqBody))
+	if err != nil {
+		return Result{}, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+c.APIKey)
+
+	client := c.HTTPClient
+	if client == nil {
+		client = &http.Client{Timeout: 30 * time.Second}
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		// ctx.Err() is non-nil only when the caller cancelled or the caller's
+		// deadline passed; the client's own 30s timeout still counts as retryable.
+		return Result{}, &ProviderError{Provider: "openai", Retryable: ctx.Err() == nil, Message: err.Error(), Cause: err}
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return Result{}, &ProviderError{Provider: "openai", Retryable: ctx.Err() == nil, Message: err.Error(), Cause: err}
+	}
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return Result{}, &ProviderError{
+			Provider:   "openai",
+			StatusCode: resp.StatusCode,
+			Retryable:  resp.StatusCode == 429 || resp.StatusCode >= 500,
+			Message:    string(body),
+		}
+	}
+	return parseOpenAIResponse(body)
+}
+
+// openAIRequest is the JSON body that buildRequest marshals and Generate
+// posts to the /responses path.
+type openAIRequest struct {
+ Model string `json:"model"`
+ // Input holds pre-encoded items (messages, function calls, function
+ // outputs, or raw provider items replayed verbatim).
+ Input []json.RawMessage `json:"input"`
+ // Tools is omitted from the JSON when no tool definitions were supplied.
+ Tools []openAITool `json:"tools,omitempty"`
+}
+
+// openAIMessage is a plain text input item; buildRequest uses it for the
+// user prompt and for assistant text turns.
+type openAIMessage struct {
+ Role string `json:"role"`
+ Content string `json:"content"`
+}
+
+// openAIFunctionCallInput is the input item buildRequest emits (with Type
+// "function_call") to replay a tool call that has no raw provider items.
+type openAIFunctionCallInput struct {
+ Type string `json:"type"`
+ CallID string `json:"call_id"`
+ Name string `json:"name"`
+ // Arguments is the call's JSON arguments carried as a string.
+ Arguments string `json:"arguments"`
+}
+
+// openAIFunctionOutputInput is the input item buildRequest emits (with Type
+// "function_call_output") to report a tool result for CallID.
+type openAIFunctionOutputInput struct {
+ Type string `json:"type"`
+ CallID string `json:"call_id"`
+ // Output is the JSON-encoded tool result carried as a string.
+ Output string `json:"output"`
+}
+
+// openAITool is one tool definition in the request's tools array;
+// buildRequest always sets Type to "function".
+type openAITool struct {
+ Type string `json:"type"`
+ Name string `json:"name"`
+ Description string `json:"description,omitempty"`
+ // Parameters is the tool's schema as raw JSON; buildRequest substitutes
+ // {"type":"object"} when the definition has none.
+ Parameters json.RawMessage `json:"parameters"`
+}
+
+// buildRequest encodes prompt, tools and history as a Responses request body.
+//
+// The prompt becomes the first input item. Each history turn is appended in
+// order:
+//   - a tool call replays its ProviderRawItems verbatim when it has them, is
+//     skipped when ProviderRawIncluded says an earlier call already carried
+//     it, and is otherwise emitted as a function_call item;
+//   - a tool result becomes a function_call_output item;
+//   - plain text becomes an assistant message.
+//
+// A call's id is its ProviderID when set, else a synthesized "call_waylog_N".
+// Results are paired with outstanding calls first-in first-out, so both
+// "call, result, call, result" and "call, call, result, result" histories give
+// every call its own output. A result with no outstanding call reuses the
+// most recent id, or "call_waylog_0" when no call came before it.
+//
+// It returns an error only when a tool result or the request cannot be
+// marshaled.
+func (c *OpenAIClient) buildRequest(prompt string, tools []ToolDefinition, history []Turn) ([]byte, error) {
+	model := c.Model
+	if model == "" {
+		model = defaultOpenAIModel
+	}
+	req := openAIRequest{
+		Model: model,
+		Input: []json.RawMessage{mustMarshalOpenAI(openAIMessage{Role: "user", Content: prompt})},
+	}
+	for _, t := range tools {
+		schema := t.InputSchema
+		if len(schema) == 0 {
+			schema = json.RawMessage(`{"type":"object"}`)
+		}
+		req.Tools = append(req.Tools, openAITool{
+			Type:        "function",
+			Name:        t.Name,
+			Description: t.Description,
+			Parameters:  schema,
+		})
+	}
+
+	nextCallID := 1
+	lastCallID := ""
+	var pending []string // ids of tool calls that have not received an output yet
+	for _, turn := range history {
+		switch {
+		case turn.ToolCall != nil:
+			callID := fmt.Sprintf("call_waylog_%d", nextCallID)
+			nextCallID++
+			if turn.ToolCall.ProviderID != "" {
+				callID = turn.ToolCall.ProviderID
+			}
+			lastCallID = callID
+			pending = append(pending, callID)
+			if len(turn.ToolCall.ProviderRawItems) > 0 {
+				req.Input = append(req.Input, turn.ToolCall.ProviderRawItems...)
+				continue
+			}
+			if !turn.ToolCall.ProviderRawIncluded {
+				args := string(turn.ToolCall.Arguments)
+				if args == "" {
+					args = "{}"
+				}
+				req.Input = append(req.Input, mustMarshalOpenAI(openAIFunctionCallInput{
+					Type:      "function_call",
+					CallID:    callID,
+					Name:      turn.ToolCall.Name,
+					Arguments: args,
+				}))
+			}
+		case turn.ToolResult != nil:
+			callID := lastCallID
+			switch {
+			case len(pending) > 0:
+				// Answer the oldest unanswered call.
+				callID, pending = pending[0], pending[1:]
+			case callID == "":
+				callID = "call_waylog_0"
+			}
+			payload, err := json.Marshal(turn.ToolResult.Result)
+			if err != nil {
+				return nil, fmt.Errorf("openai: marshal tool result: %w", err)
+			}
+			req.Input = append(req.Input, mustMarshalOpenAI(openAIFunctionOutputInput{
+				Type:   "function_call_output",
+				CallID: callID,
+				Output: string(payload),
+			}))
+		case turn.Text != "":
+			req.Input = append(req.Input, mustMarshalOpenAI(openAIMessage{Role: "assistant", Content: turn.Text}))
+		}
+	}
+
+	return json.Marshal(req)
+}
+
+// mustMarshalOpenAI JSON-encodes v for use as a request input item.
+//
+// Despite the name it never panics: the marshal error is discarded and the
+// result is empty when encoding fails.
+func mustMarshalOpenAI(v any) json.RawMessage {
+	encoded, _ := json.Marshal(v) // error deliberately ignored, see doc comment
+	return json.RawMessage(encoded)
+}
+
+// openAIResponse is the part of a Responses body that parseOpenAIResponse
+// reads.
+type openAIResponse struct {
+ // OutputText seeds Result.Text when present.
+ OutputText string `json:"output_text"`
+ // Output is kept as raw JSON so items can be replayed verbatim later.
+ Output []json.RawMessage `json:"output"`
+}
+
+// openAIOutputItem is the typed view of one output item that
+// parseOpenAIResponse decodes to decide how to handle it.
+type openAIOutputItem struct {
+ // Type is matched against "function_call" and "message"; other types are ignored.
+ Type string `json:"type"`
+ // CallID, Name and Arguments are read for "function_call" items.
+ CallID string `json:"call_id,omitempty"`
+ Name string `json:"name,omitempty"`
+ Arguments string `json:"arguments,omitempty"`
+ // Content is read for "message" items; parts typed "output_text" or
+ // "text" contribute their Text.
+ Content []struct {
+ Type string `json:"type"`
+ Text string `json:"text"`
+ } `json:"content,omitempty"`
+}
+
+// parseOpenAIResponse decodes a Responses body into a Result.
+//
+// Result.Text starts as the top-level output_text. Message items only
+// contribute text while Text is still empty, so once output_text or an
+// earlier message has filled it, later message items are skipped.
+//
+// Every function_call item becomes a ToolCall carrying the provider's call
+// id. The first such call also receives a copy of all raw output items in
+// ProviderRawItems; later calls are flagged ProviderRawIncluded instead, so a
+// request builder that replays the first call's raw items does not emit the
+// others again.
+//
+// The only errors are JSON decoding failures of the body or of an item.
+func parseOpenAIResponse(body []byte) (Result, error) {
+ var resp openAIResponse
+ if err := json.Unmarshal(body, &resp); err != nil {
+ return Result{}, err
+ }
+ out := Result{Text: resp.OutputText}
+ // Copy the raw items so the ToolCall does not alias resp.Output.
+ rawItems := append([]json.RawMessage(nil), resp.Output...)
+ rawAttached := false
+ for _, raw := range resp.Output {
+ var item openAIOutputItem
+ if err := json.Unmarshal(raw, &item); err != nil {
+ return Result{}, err
+ }
+ switch item.Type {
+ case "function_call":
+ // Arguments arrive as a JSON string; reuse its bytes as raw JSON.
+ args := json.RawMessage(item.Arguments)
+ if len(args) == 0 {
+ args = json.RawMessage(`{}`)
+ }
+ call := ToolCall{Name: item.Name, Arguments: args, ProviderID: item.CallID}
+ if !rawAttached {
+ call.ProviderRawItems = rawItems
+ rawAttached = true
+ } else {
+ call.ProviderRawIncluded = true
+ }
+ out.ToolCalls = append(out.ToolCalls, call)
+ case "message":
+ // Text already set (output_text or an earlier message): keep it.
+ if out.Text != "" {
+ continue
+ }
+ for _, part := range item.Content {
+ if part.Type == "output_text" || part.Type == "text" {
+ out.Text += part.Text
+ }
+ }
+ }
+ }
+ return out, nil
+}
diff --git a/internal/llm/openai_test.go b/internal/llm/openai_test.go
new file mode 100644
index 0000000..fa363a0
--- /dev/null
+++ b/internal/llm/openai_test.go
@@ -0,0 +1,203 @@
+package llm
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+)
+
+// TestOpenAIGenerateSendsResponsesRequest checks the request Generate posts
+// (path, bearer token, model, input, tools) and that output_text is returned
+// as Result.Text.
+func TestOpenAIGenerateSendsResponsesRequest(t *testing.T) {
+	var captured struct {
+		Model string `json:"model"`
+		Input []struct {
+			Role    string `json:"role,omitempty"`
+			Content string `json:"content,omitempty"`
+		} `json:"input"`
+		Tools []struct {
+			Type       string          `json:"type"`
+			Name       string          `json:"name"`
+			Parameters json.RawMessage `json:"parameters"`
+		} `json:"tools"`
+	}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// This handler runs on the server's goroutine, where t.Fatalf (FailNow)
+		// must not be called; record failures with t.Errorf instead.
+		if r.URL.Path != "/responses" {
+			t.Errorf("path = %q, want /responses", r.URL.Path)
+		}
+		if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
+			t.Errorf("Authorization = %q", got)
+		}
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Errorf("decode request: %v", err)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"output_text":"done","output":[]}`))
+	}))
+	defer srv.Close()
+
+	client := NewOpenAIClient("test-key")
+	client.BaseURL = srv.URL
+	client.Model = "gpt-test"
+	res, err := client.Generate(context.Background(), "hello", []ToolDefinition{{
+		Name:        "triage_incident",
+		Description: "triage",
+		InputSchema: json.RawMessage(`{"type":"object","properties":{"incident_id":{"type":"string"}}}`),
+	}}, nil)
+	if err != nil {
+		t.Fatalf("generate: %v", err)
+	}
+	if res.Text != "done" {
+		t.Fatalf("Text = %q, want done", res.Text)
+	}
+	if captured.Model != "gpt-test" {
+		t.Fatalf("model = %q", captured.Model)
+	}
+	if len(captured.Input) != 1 || captured.Input[0].Role != "user" || captured.Input[0].Content != "hello" {
+		t.Fatalf("input = %+v", captured.Input)
+	}
+	if len(captured.Tools) != 1 || captured.Tools[0].Type != "function" || captured.Tools[0].Name != "triage_incident" {
+		t.Fatalf("tools = %+v", captured.Tools)
+	}
+}
+
+// TestParseOpenAIResponseFunctionCall parses a response with one
+// function_call item and checks the resulting ToolCall: name, provider call
+// id, attached raw items and decoded arguments.
+func TestParseOpenAIResponseFunctionCall(t *testing.T) {
+	res, err := parseOpenAIResponse([]byte(`{
+ "output": [
+ {"type":"function_call","call_id":"call_1","name":"triage_incident","arguments":"{\"incident_id\":\"inc_abc\",\"snapshot\":true}"}
+ ]
+ }`))
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if n := len(res.ToolCalls); n != 1 {
+		t.Fatalf("len(ToolCalls) = %d", n)
+	}
+	call := res.ToolCalls[0]
+	switch {
+	case call.Name != "triage_incident":
+		t.Fatalf("tool name = %q", call.Name)
+	case call.ProviderID != "call_1":
+		t.Fatalf("ProviderID = %q, want call_1", call.ProviderID)
+	case len(call.ProviderRawItems) != 1:
+		t.Fatalf("ProviderRawItems len = %d, want 1", len(call.ProviderRawItems))
+	}
+	var args struct {
+		IncidentID string `json:"incident_id"`
+		Snapshot   bool   `json:"snapshot"`
+	}
+	if err := json.Unmarshal(call.Arguments, &args); err != nil {
+		t.Fatalf("args: %v", err)
+	}
+	if !(args.IncidentID == "inc_abc" && args.Snapshot) {
+		t.Fatalf("args = %+v", args)
+	}
+}
+
+// TestOpenAIRequestPreservesResponseOutputAndCallIDs parses a response with a
+// reasoning item and two function calls, replays the calls and their results
+// through buildRequest, and checks that the rebuilt input carries the
+// provider's raw items once and reuses the provider call ids for the outputs.
+func TestOpenAIRequestPreservesResponseOutputAndCallIDs(t *testing.T) {
+	res, err := parseOpenAIResponse([]byte(`{
+ "output": [
+ {"type":"reasoning","id":"rs_1","summary":[]},
+ {"type":"function_call","call_id":"call_1","name":"triage_incident","arguments":"{\"incident_id\":\"inc_abc\"}"},
+ {"type":"function_call","call_id":"call_2","name":"blast_radius","arguments":"{\"code\":\"PMT_502\"}"}
+ ]
+ }`))
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if n := len(res.ToolCalls); n != 2 {
+		t.Fatalf("len(ToolCalls) = %d, want 2", n)
+	}
+
+	history := []Turn{
+		{ToolCall: &res.ToolCalls[0]},
+		{ToolResult: &ToolResult{Name: "triage_incident", Result: map[string]string{"report_hash": "sha256:x"}}},
+		{ToolCall: &res.ToolCalls[1]},
+		{ToolResult: &ToolResult{Name: "blast_radius", Result: map[string]int{"requests": 12}}},
+	}
+	raw, err := NewOpenAIClient("test-key").buildRequest("triage", nil, history)
+	if err != nil {
+		t.Fatalf("buildRequest: %v", err)
+	}
+
+	var decoded struct {
+		Input []json.RawMessage `json:"input"`
+	}
+	if err := json.Unmarshal(raw, &decoded); err != nil {
+		t.Fatalf("decode request: %v", err)
+	}
+
+	// Collect the type of every typed item and the call id of those that have one.
+	var gotTypes, gotCallIDs []string
+	for _, item := range decoded.Input {
+		var head struct {
+			Type   string `json:"type"`
+			CallID string `json:"call_id"`
+		}
+		if err := json.Unmarshal(item, &head); err != nil {
+			t.Fatalf("decode input item: %v", err)
+		}
+		if head.Type == "" {
+			continue
+		}
+		gotTypes = append(gotTypes, head.Type)
+		if head.CallID != "" {
+			gotCallIDs = append(gotCallIDs, head.CallID)
+		}
+	}
+
+	// mustEqual fails the test with format when got and want differ.
+	mustEqual := func(format string, got, want []string) {
+		if len(got) != len(want) {
+			t.Fatalf(format, got, want)
+		}
+		for i, w := range want {
+			if got[i] != w {
+				t.Fatalf(format, got, want)
+			}
+		}
+	}
+	mustEqual("types = %v, want %v", gotTypes,
+		[]string{"reasoning", "function_call", "function_call", "function_call_output", "function_call_output"})
+	mustEqual("callIDs = %v, want %v", gotCallIDs,
+		[]string{"call_1", "call_2", "call_1", "call_2"})
+}
+
+// TestParseOpenAIResponseMessageText checks that, without a top-level
+// output_text, the text of a message item's output_text part becomes
+// Result.Text.
+func TestParseOpenAIResponseMessageText(t *testing.T) {
+	res, err := parseOpenAIResponse([]byte(`{
+ "output": [
+ {"type":"message","content":[{"type":"output_text","text":"hello"}]}
+ ]
+ }`))
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if got := res.Text; got != "hello" {
+		t.Fatalf("Text = %q, want hello", got)
+	}
+}
+
+// TestOpenAIGenerateAPIError checks that an HTTP 503 reply surfaces as a
+// retryable *ProviderError carrying the provider name and status code.
+func TestOpenAIGenerateAPIError(t *testing.T) {
+	unavailable := func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "unavailable", http.StatusServiceUnavailable)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(unavailable))
+	defer srv.Close()
+
+	client := NewOpenAIClient("test-key")
+	client.BaseURL = srv.URL
+	_, err := client.Generate(context.Background(), "hello", nil, nil)
+	if err == nil {
+		t.Fatalf("expected error")
+	}
+	switch pe := err.(type) {
+	case *ProviderError:
+		if pe.Provider != "openai" || !pe.Retryable || pe.StatusCode != http.StatusServiceUnavailable {
+			t.Fatalf("provider error = %+v", pe)
+		}
+	default:
+		t.Fatalf("err = %T, want *ProviderError", err)
+	}
+}
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
index c75ea4e..777b178 100644
--- a/internal/llm/provider.go
+++ b/internal/llm/provider.go
@@ -23,34 +23,57 @@ type Selection struct {
// SelectFromEnv resolves the LLM provider from environment variables.
//
-// WAYLOG_LLM_PROVIDER may be "none" or "gemini". When unset, a Gemini key
-// (GEMINI_API_KEY or GOOGLE_API_KEY) infers gemini; otherwise none.
-// Model precedence: WAYLOG_LLM_MODEL > GEMINI_MODEL > built-in default.
+// WAYLOG_LLM_PROVIDER may be "none", "gemini", "anthropic", or "openai".
+// When unset, a supported provider key infers the provider; otherwise none.
+// Model precedence: WAYLOG_LLM_MODEL > provider-specific model env > built-in default.
func SelectFromEnv() (Selection, error) {
raw := strings.ToLower(strings.TrimSpace(os.Getenv("WAYLOG_LLM_PROVIDER")))
- key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY"))
- if key == "" {
- key = strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
- }
switch raw {
case "":
- if key == "" {
- return Selection{Provider: "none"}, nil
+ if key := geminiKeyFromEnv(); key != "" {
+ return buildGemini(key, true), nil
}
- return buildGemini(key, true), nil
+ if key := strings.TrimSpace(os.Getenv("ANTHROPIC_API_KEY")); key != "" {
+ return buildAnthropic(key, true), nil
+ }
+ if key := strings.TrimSpace(os.Getenv("OPENAI_API_KEY")); key != "" {
+ return buildOpenAI(key, true), nil
+ }
+ return Selection{Provider: "none"}, nil
case "none":
return Selection{Provider: "none", Configured: true}, nil
case "gemini":
+ key := geminiKeyFromEnv()
if key == "" {
return Selection{Provider: "gemini", Configured: false}, nil
}
return buildGemini(key, true), nil
+ case "anthropic":
+ key := strings.TrimSpace(os.Getenv("ANTHROPIC_API_KEY"))
+ if key == "" {
+ return Selection{Provider: "anthropic", Configured: false}, nil
+ }
+ return buildAnthropic(key, true), nil
+ case "openai":
+ key := strings.TrimSpace(os.Getenv("OPENAI_API_KEY"))
+ if key == "" {
+ return Selection{Provider: "openai", Configured: false}, nil
+ }
+ return buildOpenAI(key, true), nil
default:
- return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini", raw)
+ return Selection{}, fmt.Errorf("unknown LLM provider %q; supported: none, gemini, anthropic, openai", raw)
}
}
+// geminiKeyFromEnv returns the trimmed GEMINI_API_KEY, falling back to the
+// trimmed GOOGLE_API_KEY when the former is blank. The result is empty when
+// neither is set.
+func geminiKeyFromEnv() string {
+	if key := strings.TrimSpace(os.Getenv("GEMINI_API_KEY")); key != "" {
+		return key
+	}
+	return strings.TrimSpace(os.Getenv("GOOGLE_API_KEY"))
+}
+
func buildGemini(key string, configured bool) Selection {
client := NewGeminiClient(key)
if model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL")); model != "" {
@@ -73,3 +96,41 @@ func buildGemini(key string, configured bool) Selection {
Impl: client,
}
}
+
+// buildAnthropic returns an ask-enabled Selection backed by an Anthropic
+// client for key.
+//
+// The model comes from WAYLOG_LLM_MODEL, then ANTHROPIC_MODEL, then the
+// client default. ANTHROPIC_API_BASE, when set, replaces the base URL.
+// configured is copied into the Selection as given.
+func buildAnthropic(key string, configured bool) Selection {
+	client := NewAnthropicClient(key)
+
+	model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL"))
+	if model == "" {
+		model = strings.TrimSpace(os.Getenv("ANTHROPIC_MODEL"))
+	}
+	if model != "" {
+		client.Model = model
+	}
+
+	if base := strings.TrimSpace(os.Getenv("ANTHROPIC_API_BASE")); base != "" {
+		client.BaseURL = base
+	}
+
+	sel := Selection{Provider: "anthropic", Configured: configured, AskEnabled: true}
+	sel.Model = client.Model
+	sel.Impl = client
+	return sel
+}
+
+// buildOpenAI returns an ask-enabled Selection backed by an OpenAI client
+// for key.
+//
+// The model comes from WAYLOG_LLM_MODEL, then OPENAI_MODEL, then the client
+// default. OPENAI_API_BASE, when set, replaces the base URL. configured is
+// copied into the Selection as given.
+func buildOpenAI(key string, configured bool) Selection {
+	client := NewOpenAIClient(key)
+
+	model := strings.TrimSpace(os.Getenv("WAYLOG_LLM_MODEL"))
+	if model == "" {
+		model = strings.TrimSpace(os.Getenv("OPENAI_MODEL"))
+	}
+	if model != "" {
+		client.Model = model
+	}
+
+	if base := strings.TrimSpace(os.Getenv("OPENAI_API_BASE")); base != "" {
+		client.BaseURL = base
+	}
+
+	sel := Selection{Provider: "openai", Configured: configured, AskEnabled: true}
+	sel.Model = client.Model
+	sel.Impl = client
+	return sel
+}
diff --git a/internal/llm/provider_test.go b/internal/llm/provider_test.go
index b73e466..a19b248 100644
--- a/internal/llm/provider_test.go
+++ b/internal/llm/provider_test.go
@@ -14,6 +14,12 @@ func clearProviderEnv(t *testing.T) {
t.Setenv("GEMINI_MODEL", "")
t.Setenv("GEMINI_API_BASE", "")
t.Setenv("GEMINI_TOOL_MODE", "")
+ t.Setenv("ANTHROPIC_API_KEY", "")
+ t.Setenv("ANTHROPIC_MODEL", "")
+ t.Setenv("ANTHROPIC_API_BASE", "")
+ t.Setenv("OPENAI_API_KEY", "")
+ t.Setenv("OPENAI_MODEL", "")
+ t.Setenv("OPENAI_API_BASE", "")
}
func TestSelectFromEnv_NoEnv(t *testing.T) {
@@ -129,16 +135,16 @@ func TestSelectFromEnv_GeminiMissingKey(t *testing.T) {
func TestSelectFromEnv_UnknownProvider(t *testing.T) {
clearProviderEnv(t)
- t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+ t.Setenv("WAYLOG_LLM_PROVIDER", "bogus")
_, err := SelectFromEnv()
if err == nil {
t.Fatal("expected error, got nil")
}
- if !strings.Contains(err.Error(), "anthropic") {
+ if !strings.Contains(err.Error(), "bogus") {
t.Errorf("error %q should mention provider name", err.Error())
}
- if !strings.Contains(err.Error(), "none, gemini") {
+ if !strings.Contains(err.Error(), "none, gemini, anthropic, openai") {
t.Errorf("error %q should list supported providers", err.Error())
}
}
@@ -158,3 +164,129 @@ func TestSelectFromEnv_WaylogModelOverridesGeminiModel(t *testing.T) {
t.Errorf("Model = %q, want %q", sel.Model, "foo")
}
}
+
+// TestSelectFromEnv_AnthropicWithKey checks that an explicit anthropic
+// provider with a key yields a configured, ask-enabled selection with a
+// non-empty model.
+func TestSelectFromEnv_AnthropicWithKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+	t.Setenv("ANTHROPIC_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := sel.Provider; got != "anthropic" {
+		t.Errorf("Provider = %q, want anthropic", got)
+	}
+	usable := sel.Configured && sel.AskEnabled && sel.Impl != nil
+	if !usable {
+		t.Fatalf("selection = %+v, want configured enabled provider", sel)
+	}
+	if sel.Model == "" {
+		t.Error("Model is empty, want default")
+	}
+}
+
+// TestSelectFromEnv_AnthropicMissingKey checks that choosing anthropic
+// without a key is not an error but yields an unconfigured, disabled
+// selection with no implementation.
+func TestSelectFromEnv_AnthropicMissingKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "anthropic")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := sel.Provider; got != "anthropic" {
+		t.Errorf("Provider = %q, want anthropic", got)
+	}
+	unavailable := !sel.Configured && !sel.AskEnabled && sel.Impl == nil
+	if !unavailable {
+		t.Fatalf("selection = %+v, want unavailable provider without startup error", sel)
+	}
+}
+
+// TestSelectFromEnv_OpenAIWithKey checks that an explicit openai provider
+// with a key yields a configured, ask-enabled selection with a non-empty
+// model.
+func TestSelectFromEnv_OpenAIWithKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "openai")
+	t.Setenv("OPENAI_API_KEY", "test-key")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := sel.Provider; got != "openai" {
+		t.Errorf("Provider = %q, want openai", got)
+	}
+	usable := sel.Configured && sel.AskEnabled && sel.Impl != nil
+	if !usable {
+		t.Fatalf("selection = %+v, want configured enabled provider", sel)
+	}
+	if sel.Model == "" {
+		t.Error("Model is empty, want default")
+	}
+}
+
+// TestSelectFromEnv_OpenAIMissingKey checks that choosing openai without a
+// key is not an error but yields an unconfigured, disabled selection with no
+// implementation.
+func TestSelectFromEnv_OpenAIMissingKey(t *testing.T) {
+	clearProviderEnv(t)
+	t.Setenv("WAYLOG_LLM_PROVIDER", "openai")
+
+	sel, err := SelectFromEnv()
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got := sel.Provider; got != "openai" {
+		t.Errorf("Provider = %q, want openai", got)
+	}
+	unavailable := !sel.Configured && !sel.AskEnabled && sel.Impl == nil
+	if !unavailable {
+		t.Fatalf("selection = %+v, want unavailable provider without startup error", sel)
+	}
+}
+
+// TestSelectFromEnv_InferredProviderKeys checks that, with no provider named,
+// setting only an Anthropic or OpenAI key selects that provider and enables
+// ask.
+func TestSelectFromEnv_InferredProviderKeys(t *testing.T) {
+	// Each case doubles as the subtest name and the expected provider.
+	cases := []struct {
+		provider string
+		keyEnv   string
+	}{
+		{provider: "anthropic", keyEnv: "ANTHROPIC_API_KEY"},
+		{provider: "openai", keyEnv: "OPENAI_API_KEY"},
+	}
+	for _, c := range cases {
+		t.Run(c.provider, func(t *testing.T) {
+			clearProviderEnv(t)
+			t.Setenv(c.keyEnv, "test-key")
+
+			sel, err := SelectFromEnv()
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if !(sel.Provider == c.provider && sel.AskEnabled) {
+				t.Fatalf("selection = %+v, want %s enabled", sel, c.provider)
+			}
+		})
+	}
+}
+
+// TestSelectFromEnv_WaylogModelOverridesProviderModel checks that
+// WAYLOG_LLM_MODEL wins over the provider-specific model variable for both
+// Anthropic and OpenAI.
+func TestSelectFromEnv_WaylogModelOverridesProviderModel(t *testing.T) {
+	// Each case doubles as the subtest name and the provider to select.
+	cases := []struct {
+		provider string
+		keyEnv   string
+		modelEnv string
+	}{
+		{provider: "anthropic", keyEnv: "ANTHROPIC_API_KEY", modelEnv: "ANTHROPIC_MODEL"},
+		{provider: "openai", keyEnv: "OPENAI_API_KEY", modelEnv: "OPENAI_MODEL"},
+	}
+	for _, c := range cases {
+		t.Run(c.provider, func(t *testing.T) {
+			clearProviderEnv(t)
+			t.Setenv("WAYLOG_LLM_PROVIDER", c.provider)
+			t.Setenv(c.keyEnv, "test-key")
+			t.Setenv("WAYLOG_LLM_MODEL", "waylog-model")
+			t.Setenv(c.modelEnv, "provider-model")
+
+			sel, err := SelectFromEnv()
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if got := sel.Model; got != "waylog-model" {
+				t.Fatalf("Model = %q, want waylog-model", got)
+			}
+		})
+	}
+}
diff --git a/internal/llm/types.go b/internal/llm/types.go
index 0c14aee..eaed6d9 100644
--- a/internal/llm/types.go
+++ b/internal/llm/types.go
@@ -12,8 +12,11 @@ type ToolDefinition struct {
}
type ToolCall struct {
- Name string
- Arguments json.RawMessage
+ Name string
+ Arguments json.RawMessage
+ ProviderID string `json:"-"`
+ ProviderRawItems []json.RawMessage `json:"-"`
+ ProviderRawIncluded bool `json:"-"`
}
type ToolResult struct {
diff --git a/internal/triage/idempotency_test.go b/internal/triage/idempotency_test.go
index 9327295..51dd6cd 100644
--- a/internal/triage/idempotency_test.go
+++ b/internal/triage/idempotency_test.go
@@ -60,3 +60,37 @@ func TestSnapshotModeUsesIncidentUpdatedAt(t *testing.T) {
t.Fatalf("snap report missing incident ref")
}
}
+
+// TestBuildHashDoesNotDependOnLLMProviderEnv builds the same report under
+// several WAYLOG_LLM_PROVIDER values (with all three provider API keys set)
+// and requires every build to produce the same ReportHash as the first one.
+func TestBuildHashDoesNotDependOnLLMProviderEnv(t *testing.T) {
+	fixedNow := time.Date(2026, 5, 6, 12, 0, 0, 0, time.UTC)
+	// baseline is the hash recorded by the first subtest that produced one;
+	// subtests run sequentially, so later ones compare against it.
+	var baseline string
+	for _, provider := range []string{"none", "gemini", "anthropic", "openai"} {
+		t.Run(provider, func(t *testing.T) {
+			t.Setenv("WAYLOG_LLM_PROVIDER", provider)
+			t.Setenv("GEMINI_API_KEY", "test-gemini-key")
+			t.Setenv("ANTHROPIC_API_KEY", "test-anthropic-key")
+			t.Setenv("OPENAI_API_KEY", "test-openai-key")
+
+			deps := Deps{
+				Incidents:  richIncidents{},
+				Blast:      richBlast{},
+				Story:      richStory{},
+				Signals:    richSignals{},
+				NextChecks: richNextChecks{},
+				Now:        func() time.Time { return fixedNow },
+			}
+			eng, err := NewEngine(deps)
+			if err != nil {
+				t.Fatalf("new engine: %v", err)
+			}
+			// NOTE(review): the second return value is discarded; confirm
+			// whether it is an error that should fail the test.
+			opts, _ := ParseBuildOptions("15m", true, deps.Now())
+			report, err := eng.Build(context.Background(), "inc_abc", opts)
+			if err != nil {
+				t.Fatalf("build: %v", err)
+			}
+			if baseline == "" {
+				baseline = report.ReportHash
+				return
+			}
+			if report.ReportHash != baseline {
+				t.Fatalf("provider %s hash = %q, want %q", provider, report.ReportHash, baseline)
+			}
+		})
+	}
+}
From cd615df7e63fd27c71120e2ec44f2ce8c2502908 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Fri, 8 May 2026 11:57:27 -0400
Subject: [PATCH 08/14] test: added rollup-correct attribution proof
- Add a runnable rollup-comparison demo proof that contrasts Waylog's
root-cause-counted PMT_502 rollup with a naive propagated service-hop count.
- Add a graph invariant test to keep the root-cause rollup behavior pinned,
extend the demo JSON helper with small extractors used by the proof script,
and document the new make rollup-comparison target in README.
---
Makefile | 6 +-
README.md | 3 +
.../graph/analysis/rollup_invariant_test.go | 33 +++++++
scripts/demo-acceptance-json/main.go | 47 +++++++++-
scripts/rollup-comparison.sh | 86 +++++++++++++++++++
5 files changed, 173 insertions(+), 2 deletions(-)
create mode 100644 internal/graph/analysis/rollup_invariant_test.go
create mode 100755 scripts/rollup-comparison.sh
diff --git a/Makefile b/Makefile
index e47aeb7..cb1937a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
SHELL := /bin/sh
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
help:
@echo "Targets:"
@@ -19,6 +19,7 @@ help:
@echo " demo - start dashboard demo locally (detached, no Docker)"
@echo " demo-stop - stop demo processes"
@echo " demo-acceptance - verify a running local demo end-to-end"
+ @echo " rollup-comparison - run demo proof for root-cause vs naive rollup counts"
@echo " demo-up - start v2 demo stack in Docker (detached)"
@echo " demo-down - stop Docker demo stack"
@echo " micro-demo - start 4-service micro-demo in foreground for debugging"
@@ -123,6 +124,9 @@ demo-stop:
demo-acceptance:
./scripts/demo-acceptance.sh
+rollup-comparison:
+ ./scripts/rollup-comparison.sh
+
demo-up: docker-dev
demo-down: docker-down
diff --git a/README.md b/README.md
index 6871fc1..22f3471 100644
--- a/README.md
+++ b/README.md
@@ -273,8 +273,11 @@ make test-race # race detector
make ts-test # TypeScript SDK vitest suite
make ci # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
make demo-acceptance # with make demo running, verify demo + CLI triage loop
+make rollup-comparison # demo proof: root-cause counts vs naive propagated counts
```
+`make rollup-comparison` runs the checkout demo burst and prints the PMT_502 root-cause count next to a naive propagated count across touched services. It is the quickest local proof that Waylog's default rollups count the originating failure once per failed request instead of inflating it by every downstream hop.
+
## Auth
Waylog uses three scoped keys. They are independent — the dashboard never holds the agent key.
diff --git a/internal/graph/analysis/rollup_invariant_test.go b/internal/graph/analysis/rollup_invariant_test.go
new file mode 100644
index 0000000..0607852
--- /dev/null
+++ b/internal/graph/analysis/rollup_invariant_test.go
@@ -0,0 +1,33 @@
+package analysis
+
+import (
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/graph/build"
+ "github.com/sssmaran/WaylogCLI/internal/graph/store"
+ "github.com/sssmaran/WaylogCLI/internal/tracestore"
+)
+
+// TestRollupInvariantRootCauseStaysBelowNaivePropagation ingests a fixed
+// number of failing cascades and pins the PMT_502 entry of RollupWindow's
+// PrimaryErrorCount to one count per failed request, which must stay below a
+// naive count of three per request.
+func TestRollupInvariantRootCauseStaysBelowNaivePropagation(t *testing.T) {
+	const failedRequests = 3
+
+	graphStore := store.NewStore()
+	traces := tracestore.NewStore()
+	builder := build.NewBuilder()
+	now := time.Now().UTC()
+	eventTime := now.Add(-20 * time.Second)
+
+	for i := range failedRequests {
+		ingestCascade(t, graphStore, traces, builder, i, eventTime)
+	}
+
+	// Roll up a two-minute window centred on now so every ingested event falls inside it.
+	windowStart, windowEnd := now.Add(-time.Minute), now.Add(time.Minute)
+	summary := RollupWindow(graphOf(graphStore), graphStore, traces, windowStart, windowEnd)
+	rootCauseCount := summary.PrimaryErrorCount["PMT_502"]
+	naivePropagatedCount := failedRequests * 3
+
+	switch {
+	case rootCauseCount != failedRequests:
+		t.Fatalf("PMT_502 root-cause count = %d, want %d", rootCauseCount, failedRequests)
+	case rootCauseCount >= naivePropagatedCount:
+		t.Fatalf("root-cause count should stay below naive propagated count: root=%d naive=%d", rootCauseCount, naivePropagatedCount)
+	}
+}
diff --git a/scripts/demo-acceptance-json/main.go b/scripts/demo-acceptance-json/main.go
index 1629b72..e4d5b67 100644
--- a/scripts/demo-acceptance-json/main.go
+++ b/scripts/demo-acceptance-json/main.go
@@ -13,6 +13,7 @@ type errorsResponse struct {
type errorRow struct {
ErrorFamily errorFamily `json:"error_family"`
+ Count int `json:"count"`
AffectedTraces int `json:"affected_traces"`
SampleTraces []string `json:"sample_traces"`
}
@@ -58,9 +59,13 @@ type triageReport struct {
ReportHash string `json:"report_hash"`
}
+// blastResponse holds the one field this helper decodes from a blast JSON
+// document: the affected_services count.
+type blastResponse struct {
+ AffectedServices int `json:"affected_services"`
+}
+
func main() {
if len(os.Args) != 2 {
- fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json ")
+ fmt.Fprintln(os.Stderr, "usage: demo-acceptance-json ")
os.Exit(2)
}
@@ -75,6 +80,10 @@ func main() {
if !hasPaymentError(body) {
os.Exit(1)
}
+ case "payment-error-count":
+ fmt.Println(paymentErrorCount(body))
+ case "payment-affected-traces":
+ fmt.Println(paymentAffectedTraces(body))
case "first-payment-trace":
fmt.Println(firstPaymentTrace(body))
case "first-event-id":
@@ -91,6 +100,8 @@ func main() {
fmt.Println(firstIncidentID(body))
case "triage-report-hash":
fmt.Println(triageReportHash(body))
+ case "blast-affected-services":
+ fmt.Println(blastAffectedServices(body))
default:
fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
os.Exit(2)
@@ -110,6 +121,32 @@ func hasPaymentError(body []byte) bool {
return false
}
+// paymentErrorCount decodes body as an errorsResponse and returns the Count of
+// the first row accepted by isPayment502. It returns 0 when body is not valid
+// JSON or when no row matches.
+func paymentErrorCount(body []byte) int {
+	var parsed errorsResponse
+	if json.Unmarshal(body, &parsed) != nil {
+		return 0
+	}
+	for _, r := range parsed.Rows {
+		if !isPayment502(r) {
+			continue
+		}
+		return r.Count
+	}
+	return 0
+}
+
+// paymentAffectedTraces decodes body as an errorsResponse and returns the
+// AffectedTraces value of the first row accepted by isPayment502. It returns 0
+// when body is not valid JSON or when no row matches.
+func paymentAffectedTraces(body []byte) int {
+	var parsed errorsResponse
+	if json.Unmarshal(body, &parsed) != nil {
+		return 0
+	}
+	for _, r := range parsed.Rows {
+		if !isPayment502(r) {
+			continue
+		}
+		return r.AffectedTraces
+	}
+	return 0
+}
+
func firstPaymentTrace(body []byte) string {
var resp errorsResponse
if err := json.Unmarshal(body, &resp); err != nil {
@@ -197,3 +234,11 @@ func triageReportHash(body []byte) string {
}
return rep.ReportHash
}
+
+// blastAffectedServices decodes body as a blastResponse and returns its
+// AffectedServices value, or 0 when body is not valid JSON.
+func blastAffectedServices(body []byte) int {
+	var parsed blastResponse
+	if json.Unmarshal(body, &parsed) != nil {
+		return 0
+	}
+	return parsed.AffectedServices
+}
diff --git a/scripts/rollup-comparison.sh b/scripts/rollup-comparison.sh
new file mode 100755
index 0000000..252e7c1
--- /dev/null
+++ b/scripts/rollup-comparison.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT"
+
+GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
+INGEST_URL="${INGEST_URL:-http://localhost:8080}"
+WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+REQUESTS="${REQUESTS:-20}"
+CONCURRENCY="${CONCURRENCY:-5}"
+TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
+USE_RUNNING="${WAYLOG_ROLLUP_USE_RUNNING_DEMO:-0}"
+
+CLI_BIN="${WAYLOG_CLI_BIN:-./data/demo-state/bin/waylog}"
+JSON_BIN="${WAYLOG_JSON_HELPER_BIN:-./data/demo-state/bin/demo-acceptance-json}"
+
+# fail prints "FAIL: <message>" to stderr and exits the script with status 1.
+fail() {
+  printf 'FAIL: %s\n' "$*" >&2
+  exit 1
+}
+
+# http_code prints the HTTP status code returned for a GET of URL $1, or
+# "000" when curl exits non-zero.
+#
+# curl normally still prints its --write-out value ("000") when a transfer
+# fails, so echoing a second "000" inside the same substitution would yield
+# "000000". Branch on curl's exit status instead so exactly one code is
+# printed.
+http_code() {
+  local code
+  if code="$(curl -s -o /dev/null -w "%{http_code}" "$1")"; then
+    echo "$code"
+  else
+    echo "000"
+  fi
+}
+
+# cleanup stops the demo unless the script was pointed at an already-running
+# demo (USE_RUNNING is "1").
+cleanup() {
+  if [[ "$USE_RUNNING" == "1" ]]; then
+    return 0
+  fi
+  # Best effort: a failing demo-stop must not change the script's outcome.
+  make demo-stop >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+if [[ "$USE_RUNNING" != "1" ]]; then
+ make demo
+elif [[ "$(http_code "${GATEWAY_URL}/demo")" != "200" ]] || [[ "$(http_code "${INGEST_URL}/healthz")" != "200" ]]; then
+ fail "running demo is not reachable. Start it with make demo or unset WAYLOG_ROLLUP_USE_RUNNING_DEMO"
+fi
+
+mkdir -p ./data/demo-state/bin
+go build -o "$CLI_BIN" ./cmd/waylog
+go build -o "$JSON_BIN" ./scripts/demo-acceptance-json
+
+CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TIMEOUT")
+
+# Fire the demo traffic burst and keep only the HTTP status code.
+burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
+# curl normally still prints its --write-out value ("000") when the request
+# fails, so fall back by assignment; echoing another "000" into the same
+# substitution would report "HTTP 000000" below.
+if ! burst_status="$(curl -s -o /tmp/waylog-rollup-burst.json -w "%{http_code}" \
+  -X POST "${GATEWAY_URL}/demo/burst" \
+  -H 'Content-Type: application/json' \
+  --data "$burst_body")"; then
+  burst_status="000"
+fi
+[[ "$burst_status" == "200" ]] || fail "traffic burst failed: HTTP $burst_status"
+
+errors_json=""
+for _ in $(seq 1 15); do
+ errors_json="$("${CLI[@]}" --json errors --window 15m --limit 10)" || fail "waylog errors failed"
+ if "$JSON_BIN" has-payment-error <<<"$errors_json"; then
+ break
+ fi
+ sleep 1
+done
+"$JSON_BIN" has-payment-error <<<"$errors_json" || fail "payment_502 error family did not appear in /v1/errors"
+
+blast_json="$("${CLI[@]}" --json blast checkout:payment.charge:PMT_502 --window 15m)" || fail "waylog blast failed"
+
+root_count="$("$JSON_BIN" payment-error-count <<<"$errors_json")"
+affected_traces="$("$JSON_BIN" payment-affected-traces <<<"$errors_json")"
+affected_services="$("$JSON_BIN" blast-affected-services <<<"$blast_json")"
+
+[[ "$root_count" =~ ^[0-9]+$ ]] || fail "root-cause count is not numeric: $root_count"
+[[ "$affected_services" =~ ^[0-9]+$ ]] || fail "affected services is not numeric: $affected_services"
+(( root_count > 0 )) || fail "root-cause count is empty"
+(( affected_services > 1 )) || fail "blast radius did not show cross-service spread"
+
+naive_count=$((root_count * affected_services))
+(( naive_count > root_count )) || fail "naive propagated count did not exceed root-cause count"
+
+cat <
Date: Fri, 8 May 2026 16:01:50 -0400
Subject: [PATCH 09/14] fix: preserve capabilities fields in CLI output
- Keep provider and incident metadata when decoding capabilities responses in the CLI.
- Render provider, Ask, incident persistence, and rebuild state in human-readable output, and add regression coverage for preserving those fields in JSON output.
---
internal/cli/v2/render.go | 16 ++++++++++++++
internal/cli/v2/render_test.go | 39 +++++++++++++++++++++++++++++++++-
internal/cli/v2/types.go | 15 +++++++++++++
3 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index e0bed00..3092b0a 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -286,6 +286,22 @@ func RenderSearch(w io.Writer, resp EventSearchResponse) {
func RenderCapabilities(w io.Writer, resp CapabilitiesResponse) {
fmt.Fprintf(w, "v2_reads: %s\n", enabledLabel(resp.V2Reads.Enabled))
fmt.Fprintf(w, "otlp_http_traces: %s\n", enabledLabel(resp.OTLP.HTTPTraces))
+ if resp.LLM.Provider != "" {
+ fmt.Fprintf(w, "llm: provider=%s configured=%t ask_enabled=%t", resp.LLM.Provider, resp.LLM.Configured, resp.LLM.AskEnabled)
+ if resp.LLM.Model != "" {
+ fmt.Fprintf(w, " model=%s", resp.LLM.Model)
+ }
+ if resp.LLM.ToolMode != "" {
+ fmt.Fprintf(w, " tool_mode=%s", resp.LLM.ToolMode)
+ }
+ fmt.Fprintln(w)
+ }
+ fmt.Fprintf(w, "incidents: enabled=%t persistent=%t rebuild_supported=%t",
+ resp.Incidents.Enabled, resp.Incidents.Persistent, resp.Incidents.Rebuild.Supported)
+ if resp.Incidents.Rebuild.Scope != "" {
+ fmt.Fprintf(w, " rebuild_scope=%s", resp.Incidents.Rebuild.Scope)
+ }
+ fmt.Fprintln(w)
}
func eventRoute(ev *Event) string {
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index a60e4cc..5748f05 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -137,8 +137,45 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
var out bytes.Buffer
resp := CapabilitiesResponse{}
resp.OTLP.HTTPTraces = true
+ resp.LLM.Provider = "none"
+ resp.Incidents.Enabled = true
+ resp.Incidents.Persistent = true
+ resp.Incidents.Rebuild.Supported = true
+ resp.Incidents.Rebuild.Scope = "hot-window"
RenderCapabilities(&out, resp)
- if !strings.Contains(out.String(), "v2_reads: disabled") || !strings.Contains(out.String(), "otlp_http_traces: enabled") {
+ for _, want := range []string{
+ "v2_reads: disabled",
+ "otlp_http_traces: enabled",
+ "llm: provider=none configured=false ask_enabled=false",
+ "incidents: enabled=true persistent=true rebuild_supported=true rebuild_scope=hot-window",
+ } {
+ if !strings.Contains(out.String(), want) {
+ t.Fatalf("output missing %q:\n%s", want, out.String())
+ }
+ }
+}
+
+func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
+ raw := []byte(`{
+ "v2_reads":{"enabled":true},
+ "otlp":{"http_traces":true},
+ "llm":{"provider":"none","model":"","tool_mode":"","configured":false,"ask_enabled":false},
+ "incidents":{"enabled":true,"persistent":true,"rebuild":{"supported":true,"scope":"hot-window"}}
+ }`)
+ var resp CapabilitiesResponse
+ if err := json.Unmarshal(raw, &resp); err != nil {
+ t.Fatal(err)
+ }
+ var out bytes.Buffer
+ if err := renderJSON(&out, resp); err != nil {
+ t.Fatal(err)
+ }
+ for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`} {
+ if !strings.Contains(out.String(), want) {
+ t.Fatalf("json missing %q:\n%s", want, out.String())
+ }
+ }
+ if !resp.Incidents.Rebuild.Supported {
t.Fatalf("output=%s", out.String())
}
}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index 58fcde9..dc707b5 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -15,6 +15,21 @@ type CapabilitiesResponse struct {
OTLP struct {
HTTPTraces bool `json:"http_traces"`
} `json:"otlp"`
+ LLM struct {
+ Provider string `json:"provider"`
+ Model string `json:"model"`
+ ToolMode string `json:"tool_mode"`
+ Configured bool `json:"configured"`
+ AskEnabled bool `json:"ask_enabled"`
+ } `json:"llm"`
+ Incidents struct {
+ Enabled bool `json:"enabled"`
+ Persistent bool `json:"persistent"`
+ Rebuild struct {
+ Supported bool `json:"supported"`
+ Scope string `json:"scope"`
+ } `json:"rebuild"`
+ } `json:"incidents"`
}
type EventSearchResponse = apiv2.EventSearchResponse
From 03c7d899ebe024f54d103eb23afcc23b5e1c1c7c Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Sun, 10 May 2026 15:42:02 -0400
Subject: [PATCH 10/14] feat: added OTLP gRPC trace ingest
- add OTLP TraceService gRPC receiver using the existing trace conversion path
- require write-scope bearer auth on gRPC metadata
- share OTLP export logic between HTTP and gRPC transports
- report OTLP gRPC state in capabilities and CLI output
- add deterministic OTLP conformance target
- add OpenTelemetry Collector example
- document OTLP gRPC env, capabilities, and status
---
Makefile | 8 +-
README.md | 16 ++--
cmd/ingest/main.go | 34 ++++++++
docs/env.md | 4 +-
docs/openapi.yaml | 12 ++-
examples/otel-collector/config.yaml | 21 +++++
go.mod | 3 +
go.sum | 4 +
go.work.sum | 6 +-
internal/cli/v2/render.go | 5 ++
internal/cli/v2/render_test.go | 7 +-
internal/cli/v2/types.go | 4 +-
internal/ingest/handler.go | 14 ++++
internal/ingest/handler_test.go | 32 ++++++++
internal/otel/grpc.go | 118 ++++++++++++++++++++++++++++
internal/otel/grpc_test.go | 95 ++++++++++++++++++++++
internal/otel/handler.go | 74 ++++++++++++-----
scripts/otlp-conformance.sh | 10 +++
18 files changed, 428 insertions(+), 39 deletions(-)
create mode 100644 examples/otel-collector/config.yaml
create mode 100644 internal/otel/grpc.go
create mode 100644 internal/otel/grpc_test.go
create mode 100755 scripts/otlp-conformance.sh
diff --git a/Makefile b/Makefile
index cb1937a..e9f21ee 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
SHELL := /bin/sh
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
help:
@echo "Targets:"
@@ -20,6 +20,7 @@ help:
@echo " demo-stop - stop demo processes"
@echo " demo-acceptance - verify a running local demo end-to-end"
@echo " rollup-comparison - run demo proof for root-cause vs naive rollup counts"
+ @echo " otlp-conformance - run deterministic OTLP HTTP/gRPC fixture checks"
@echo " demo-up - start v2 demo stack in Docker (detached)"
@echo " demo-down - stop Docker demo stack"
@echo " micro-demo - start 4-service micro-demo in foreground for debugging"
@@ -83,7 +84,7 @@ vet-sdk: ## Vet SDK modules
cd pkg && go vet ./...
cd pkg/transport/kafka && go vet ./...
-ci: fmt vet vet-sdk test-race test-sdk ts-test check-doc-links check-rollup-contract
+ci: fmt vet vet-sdk test-race test-sdk ts-test check-doc-links check-rollup-contract otlp-conformance
@echo "CI checks passed"
ts-install: ## Install TS SDK deps (skipped if node_modules is already present)
@@ -127,6 +128,9 @@ demo-acceptance:
rollup-comparison:
./scripts/rollup-comparison.sh
+otlp-conformance:
+ ./scripts/otlp-conformance.sh
+
demo-up: docker-dev
demo-down: docker-down
diff --git a/README.md b/README.md
index 22f3471..a0246aa 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
Structured logging that explains failed requests and active incidents.
- Drop-in SDKs (Go, TypeScript) or OTLP/HTTP. Agent-native by design.
+ Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC traces. Agent-native by design.
@@ -133,9 +133,9 @@ func main() {
The recommended SDK path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers. Low-level request APIs such as `Begin`, `Finalize`, and `setField` are for adapter authors, tests, and unusual custom integrations. Full copy-paste examples for `net/http`, chi, gin, echo, standalone TypeScript, Express, Hono, Next.js, and NestJS are in [`docs/sdk-examples.md`](docs/sdk-examples.md).
-### OTLP/HTTP traces
+### OTLP traces
-Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces`. Protobuf bodies are accepted (gzip optional) and HTTP spans convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. **Phase A covers traces over HTTP.** gRPC, logs, and metrics are not yet shipping.
+Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces` for OTLP/HTTP or `localhost:4317` for OTLP/gRPC. Protobuf trace exports convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. A collector config lives in [`examples/otel-collector/`](examples/otel-collector/). **Only traces are supported.** OTLP logs and metrics are not shipping yet.
### Alternative: local ingest server (no Docker)
@@ -244,8 +244,8 @@ The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs a
## Architecture
```text
-Go / TS services (SDK) · OTLP/HTTP collectors
- │ schema-2.0 WideEvents · OTLP/HTTP traces
+Go / TS services (SDK) · OTLP collectors
+ │ schema-2.0 WideEvents · OTLP HTTP/gRPC traces
▼
ingest server
├─ event log (append-only WAL, source of truth)
@@ -297,7 +297,7 @@ Public alpha. APIs may break before 1.0.
**Shipped:**
- Go SDK v2 (`net/http`, chi, gin, echo) and TypeScript SDK v2 (`@waylog/sdk`, ESM, Node 18+, standalone core, Express, Hono, Next.js, NestJS)
-- OTLP/HTTP traces at `/v1/otlp/v1/traces` (Phase A — traces only)
+- OTLP traces over HTTP at `/v1/otlp/v1/traces` and gRPC at `:4317` (traces only)
- durable ingest with WAL + replay
- hot graph with flattened 3-node model + dedicated trace store
- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
@@ -314,7 +314,7 @@ Public alpha. APIs may break before 1.0.
**Planned:**
-- OTLP gRPC, logs, and metrics (Phase B)
+- OTLP logs and metrics
- Python SDK
- Mintlify docs site
@@ -322,7 +322,7 @@ Public alpha. APIs may break before 1.0.
- Single-node only. No HA, no clustering.
- Alpha quality. APIs may break before 1.0.
-- OTLP is HTTP/traces only. gRPC, logs, and metrics are not shipping yet.
+- OTLP supports traces only. Logs and metrics are not shipping yet.
- Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index 579e3a6..b71153f 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"log/slog"
+ "net"
"net/http"
"os"
"os/signal"
@@ -41,6 +42,8 @@ import (
"github.com/sssmaran/WaylogCLI/internal/triagehttp"
apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
eventv2 "github.com/sssmaran/WaylogCLI/pkg/event/v2"
+ coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+ "google.golang.org/grpc"
)
var graphStore *graphstore.Store
@@ -130,6 +133,7 @@ func main() {
grafanaURL := config.Getenv("GRAFANA_URL", "")
graphUI := config.GetenvBool("GRAPH_UI", false)
otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
+ otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
@@ -262,6 +266,8 @@ func main() {
PlanStore: planStore,
GraphHotWindow: graphHotWindow,
OTLPEnabled: otlpEnabled,
+ OTLPGRPCEnabled: otlpEnabled && otlpGRPCAddr != "",
+ OTLPGRPCAddr: otlpGRPCAddr,
V2ReadsEnabled: v2ReadsEnabled,
IncidentsEnabled: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
IncidentsPersistent: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
@@ -390,10 +396,19 @@ func main() {
mux.Handle("/v1/signals", writeAuth(http.HandlerFunc(signalHandler.Signals)))
// OTLP/HTTP traces reuse the same schema-2.0 WAL and projector as the SDK path.
+ var otlpGRPCServer *grpc.Server
if otlpEnabled {
otlpHandler := otelhttp.NewHandler(eventsV2, m, maxBody)
mux.Handle("/v1/otlp/v1/traces", writeAuth(http.HandlerFunc(otlpHandler.ServeHTTP)))
slog.Info("otlp enabled", "endpoint", "/v1/otlp/v1/traces")
+ if otlpGRPCAddr != "" {
+ otlpGRPCServer = grpc.NewServer(
+ grpc.UnaryInterceptor(otelhttp.AuthUnaryInterceptor(authCfg.WriteKeys)),
+ grpc.MaxRecvMsgSize(int(maxBody)),
+ )
+ coltracepb.RegisterTraceServiceServer(otlpGRPCServer, otelhttp.NewTraceServiceServer(eventsV2, m, maxBody))
+ ingestServer.SetOTLPGRPC(true, otlpGRPCAddr)
+ }
}
// Read endpoints — CORS outermost so OPTIONS preflight passes without auth.
@@ -619,6 +634,21 @@ func main() {
}
}()
+ if otlpGRPCServer != nil {
+ lis, err := net.Listen("tcp", otlpGRPCAddr)
+ if err != nil {
+ slog.Error("otlp grpc listen failed", "addr", otlpGRPCAddr, "err", err)
+ os.Exit(1)
+ }
+ go func() {
+ slog.Info("otlp grpc enabled", "addr", otlpGRPCAddr)
+ if err := otlpGRPCServer.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
+ slog.Error("otlp grpc server error", "err", err)
+ os.Exit(1)
+ }
+ }()
+ }
+
// ---------------- Embedded CLI ----------------
if mcpStdio {
@@ -867,6 +897,10 @@ func main() {
} else {
slog.Info("ingest shutdown complete")
}
+ if otlpGRPCServer != nil {
+ otlpGRPCServer.GracefulStop()
+ slog.Info("otlp grpc shutdown complete")
+ }
planStore.Close()
diff --git a/docs/env.md b/docs/env.md
index 65fa480..de42e43 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -46,7 +46,9 @@ Scoped keys. See the Auth section of the [README](../README.md).
| Variable | Default | Purpose |
|---|---|---|
| `INGEST_ADDR` | `:8080` | Listen address |
-| `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events` |
+| `OTLP_ENABLED` | `true` | Enable OTLP trace ingest over HTTP and gRPC |
+| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver |
+| `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events`, `/v1/otlp/v1/traces`, and OTLP/gRPC receive messages |
| `READ_HEADER_TIMEOUT` | `5s` | HTTP read header timeout |
| `READ_TIMEOUT` | `10s` | HTTP read timeout |
| `WRITE_TIMEOUT` | `10s` | HTTP write timeout |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 7162edb..4d480f1 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -20,7 +20,7 @@ tags:
- name: Ingest
description: Schema-2.0 event ingest and validation.
- name: OTLP
- description: OTLP/HTTP trace ingest converted into schema-2.0 events.
+ description: OTLP trace ingest converted into schema-2.0 events. HTTP is documented here; gRPC uses OTLP TraceService on the configured OTLP_GRPC_ADDR.
- name: Signals
description: Production-context facts used by incident triage.
- name: Events
@@ -207,7 +207,9 @@ paths:
Accepts OTLP ExportTraceServiceRequest protobuf bodies. HTTP spans are
converted into schema-2.0 WideEvents and ingested through the same WAL
and projector as SDK events. Unsupported spans and validation rejects
- are reported via OTLP partial_success.
+ are reported via OTLP partial_success. The same trace conversion path
+ is available over OTLP/gRPC TraceService when the gRPC receiver is
+ enabled.
security:
- ApiKeyHeader: []
- BearerAuth: []
@@ -1949,6 +1951,8 @@ components:
example:
otlp:
http_traces: true
+ grpc_traces: true
+ grpc_addr: ":4317"
v2_reads:
enabled: true
graph: false
@@ -1989,6 +1993,10 @@ components:
properties:
http_traces:
type: boolean
+ grpc_traces:
+ type: boolean
+ grpc_addr:
+ type: string
v2_reads:
type: object
properties:
diff --git a/examples/otel-collector/config.yaml b/examples/otel-collector/config.yaml
new file mode 100644
index 0000000..d499b4f
--- /dev/null
+++ b/examples/otel-collector/config.yaml
@@ -0,0 +1,21 @@
+# Example OpenTelemetry Collector pipeline: receive OTLP traces over gRPC
+# (0.0.0.0:4317) and HTTP (0.0.0.0:4318) and forward them with the
+# otlp/waylog exporter.
+#
+# Two values are referenced through ${env:...} and must be supplied:
+#   WAYLOG_OTLP_GRPC_ENDPOINT - exporter endpoint (the Waylog OTLP/gRPC receiver)
+#   WAYLOG_WRITE_KEY          - key sent in the "authorization: Bearer" header
+#
+# NOTE(review): the exporter sets tls.insecure: true, which presumably sends
+# the bearer key without transport encryption - confirm this is only used on
+# trusted local networks.
+# NOTE(review): the gRPC receiver binds port 4317; confirm this does not clash
+# with another listener on the same host.
+receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+ http:
+ endpoint: 0.0.0.0:4318
+
+exporters:
+ otlp/waylog:
+ endpoint: ${env:WAYLOG_OTLP_GRPC_ENDPOINT}
+ tls:
+ insecure: true
+ headers:
+ authorization: Bearer ${env:WAYLOG_WRITE_KEY}
+
+service:
+ pipelines:
+ traces:
+ receivers: [otlp]
+ exporters: [otlp/waylog]
diff --git a/go.mod b/go.mod
index 0f46631..ad8dfec 100644
--- a/go.mod
+++ b/go.mod
@@ -54,8 +54,11 @@ require (
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
+ golang.org/x/net v0.50.0 // indirect
golang.org/x/sys v0.41.0 // indirect
golang.org/x/text v0.34.0 // indirect
+ google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect
+	google.golang.org/grpc v1.79.2
google.golang.org/protobuf v1.36.11 // indirect
modernc.org/libc v1.67.6 // indirect
modernc.org/mathutil v1.7.1 // indirect
diff --git a/go.sum b/go.sum
index 9147d0e..89a1ad1 100644
--- a/go.sum
+++ b/go.sum
@@ -117,6 +117,10 @@ golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
+google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
+google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/go.work.sum b/go.work.sum
index b5458b3..29d3784 100644
--- a/go.work.sum
+++ b/go.work.sum
@@ -57,6 +57,8 @@ github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtX
golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8=
golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw=
+golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
+golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY=
@@ -64,7 +66,3 @@ golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGN
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
-google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
-google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
diff --git a/internal/cli/v2/render.go b/internal/cli/v2/render.go
index 3092b0a..2e61981 100644
--- a/internal/cli/v2/render.go
+++ b/internal/cli/v2/render.go
@@ -286,6 +286,11 @@ func RenderSearch(w io.Writer, resp EventSearchResponse) {
func RenderCapabilities(w io.Writer, resp CapabilitiesResponse) {
fmt.Fprintf(w, "v2_reads: %s\n", enabledLabel(resp.V2Reads.Enabled))
fmt.Fprintf(w, "otlp_http_traces: %s\n", enabledLabel(resp.OTLP.HTTPTraces))
+ fmt.Fprintf(w, "otlp_grpc_traces: %s", enabledLabel(resp.OTLP.GRPCTraces))
+ if resp.OTLP.GRPCAddr != "" {
+ fmt.Fprintf(w, " addr=%s", resp.OTLP.GRPCAddr)
+ }
+ fmt.Fprintln(w)
if resp.LLM.Provider != "" {
fmt.Fprintf(w, "llm: provider=%s configured=%t ask_enabled=%t", resp.LLM.Provider, resp.LLM.Configured, resp.LLM.AskEnabled)
if resp.LLM.Model != "" {
diff --git a/internal/cli/v2/render_test.go b/internal/cli/v2/render_test.go
index 5748f05..4fa4f34 100644
--- a/internal/cli/v2/render_test.go
+++ b/internal/cli/v2/render_test.go
@@ -137,6 +137,8 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
var out bytes.Buffer
resp := CapabilitiesResponse{}
resp.OTLP.HTTPTraces = true
+ resp.OTLP.GRPCTraces = true
+ resp.OTLP.GRPCAddr = ":4317"
resp.LLM.Provider = "none"
resp.Incidents.Enabled = true
resp.Incidents.Persistent = true
@@ -146,6 +148,7 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
for _, want := range []string{
"v2_reads: disabled",
"otlp_http_traces: enabled",
+ "otlp_grpc_traces: enabled addr=:4317",
"llm: provider=none configured=false ask_enabled=false",
"incidents: enabled=true persistent=true rebuild_supported=true rebuild_scope=hot-window",
} {
@@ -158,7 +161,7 @@ func TestRenderCapabilitiesPrintsReadableFlags(t *testing.T) {
func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
raw := []byte(`{
"v2_reads":{"enabled":true},
- "otlp":{"http_traces":true},
+ "otlp":{"http_traces":true,"grpc_traces":true,"grpc_addr":":4317"},
"llm":{"provider":"none","model":"","tool_mode":"","configured":false,"ask_enabled":false},
"incidents":{"enabled":true,"persistent":true,"rebuild":{"supported":true,"scope":"hot-window"}}
}`)
@@ -170,7 +173,7 @@ func TestCapabilitiesJSONPreservesM2Fields(t *testing.T) {
if err := renderJSON(&out, resp); err != nil {
t.Fatal(err)
}
- for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`} {
+ for _, want := range []string{`"llm"`, `"provider": "none"`, `"incidents"`, `"scope": "hot-window"`, `"grpc_traces": true`} {
if !strings.Contains(out.String(), want) {
t.Fatalf("json missing %q:\n%s", want, out.String())
}
diff --git a/internal/cli/v2/types.go b/internal/cli/v2/types.go
index dc707b5..c281fef 100644
--- a/internal/cli/v2/types.go
+++ b/internal/cli/v2/types.go
@@ -13,7 +13,9 @@ type CapabilitiesResponse struct {
Enabled bool `json:"enabled"`
} `json:"v2_reads"`
OTLP struct {
- HTTPTraces bool `json:"http_traces"`
+ HTTPTraces bool `json:"http_traces"`
+ GRPCTraces bool `json:"grpc_traces"`
+ GRPCAddr string `json:"grpc_addr"`
} `json:"otlp"`
LLM struct {
Provider string `json:"provider"`
diff --git a/internal/ingest/handler.go b/internal/ingest/handler.go
index 6bf181d..452f86f 100644
--- a/internal/ingest/handler.go
+++ b/internal/ingest/handler.go
@@ -136,6 +136,8 @@ type Server struct {
// OTLP capability flag — reported by /v1/capabilities. Set via
// ServerConfig when the OTLP handler is mounted in main.go.
otlpEnabled bool
+ otlpGRPCEnabled bool
+ otlpGRPCAddr string
v2ReadsEnabled bool
incidentsEnabled bool
incidentsPersistent bool
@@ -204,6 +206,8 @@ type ServerConfig struct {
PlanStore *PlanStore
GraphHotWindow time.Duration
OTLPEnabled bool
+ OTLPGRPCEnabled bool
+ OTLPGRPCAddr string
V2ReadsEnabled bool
IncidentsEnabled bool
IncidentsPersistent bool
@@ -246,6 +250,8 @@ func NewServer(cfg ServerConfig) *Server {
planStore: cfg.PlanStore,
graphHotWindow: cfg.GraphHotWindow,
otlpEnabled: cfg.OTLPEnabled,
+ otlpGRPCEnabled: cfg.OTLPGRPCEnabled,
+ otlpGRPCAddr: cfg.OTLPGRPCAddr,
v2ReadsEnabled: cfg.V2ReadsEnabled,
incidentsEnabled: cfg.IncidentsEnabled,
incidentsPersistent: cfg.IncidentsPersistent,
@@ -405,6 +411,12 @@ func (s *Server) AcceptedPtr() *atomic.Uint64 { return &s.accepted }
// Called once at startup after the OTLP route has been registered.
func (s *Server) SetOTLPEnabled(enabled bool) { s.otlpEnabled = enabled }
+// SetOTLPGRPC marks the OTLP/gRPC trace receiver as mounted.
+func (s *Server) SetOTLPGRPC(enabled bool, addr string) {
+ s.otlpGRPCEnabled = enabled
+ s.otlpGRPCAddr = addr
+}
+
// EventSearch handles GET /v1/events/search.
// Both cold-store and JSONL paths return the same []coldstore.SearchResult shape.
func (s *Server) EventSearch(w http.ResponseWriter, r *http.Request) {
@@ -597,6 +609,8 @@ func (s *Server) Capabilities(w http.ResponseWriter, r *http.Request) {
"graph": s.graphUI,
"otlp": map[string]any{
"http_traces": s.otlpEnabled,
+ "grpc_traces": s.otlpGRPCEnabled,
+ "grpc_addr": s.otlpGRPCAddr,
},
"v2_reads": map[string]any{
"enabled": s.v2ReadsEnabled,
diff --git a/internal/ingest/handler_test.go b/internal/ingest/handler_test.go
index 28099a7..c8775b4 100644
--- a/internal/ingest/handler_test.go
+++ b/internal/ingest/handler_test.go
@@ -349,6 +349,38 @@ func TestCapabilities_V2ReadsEnabled(t *testing.T) {
}
}
+func TestCapabilities_OTLPGRPCBlock(t *testing.T) {
+ srv := NewServer(ServerConfig{
+ OTLPEnabled: true,
+ OTLPGRPCEnabled: true,
+ OTLPGRPCAddr: ":4317",
+ })
+
+ req := httptest.NewRequest(http.MethodGet, "/v1/capabilities", nil)
+ w := httptest.NewRecorder()
+ srv.Capabilities(w, req)
+
+ var resp struct {
+ OTLP struct {
+ HTTPTraces bool `json:"http_traces"`
+ GRPCTraces bool `json:"grpc_traces"`
+ GRPCAddr string `json:"grpc_addr"`
+ } `json:"otlp"`
+ }
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("invalid json: %v", err)
+ }
+ if !resp.OTLP.HTTPTraces {
+ t.Fatal("otlp.http_traces = false, want true")
+ }
+ if !resp.OTLP.GRPCTraces {
+ t.Fatal("otlp.grpc_traces = false, want true")
+ }
+ if resp.OTLP.GRPCAddr != ":4317" {
+ t.Fatalf("otlp.grpc_addr = %q, want :4317", resp.OTLP.GRPCAddr)
+ }
+}
+
func TestCapabilities_IncidentsBlock(t *testing.T) {
tests := []struct {
name string
diff --git a/internal/otel/grpc.go b/internal/otel/grpc.go
new file mode 100644
index 0000000..886be3b
--- /dev/null
+++ b/internal/otel/grpc.go
@@ -0,0 +1,118 @@
+package otel
+
+import (
+ "context"
+ "crypto/subtle"
+ "net/http"
+ "strings"
+ "time"
+
+ ingestv2 "github.com/sssmaran/WaylogCLI/internal/ingest/v2"
+ "github.com/sssmaran/WaylogCLI/internal/metrics"
+ coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/metadata"
+ "google.golang.org/grpc/status"
+ "google.golang.org/protobuf/proto"
+)
+
+// TraceServiceServer implements OTLP/gRPC TraceService over the same export
+// processor used by the OTLP/HTTP endpoint.
+type TraceServiceServer struct {
+ coltracepb.UnimplementedTraceServiceServer
+ handler *Handler // export processor built by NewHandler in NewTraceServiceServer
+ metrics *metrics.Metrics // may be nil; Export skips metric recording when it is
+}
+
+// NewTraceServiceServer returns a TraceServiceServer whose exports run through
+// a Handler built from the given ingest handler, metrics, and body limit.
+func NewTraceServiceServer(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *TraceServiceServer {
+	h := NewHandler(v2Ingest, m, maxBodyBytes)
+	return &TraceServiceServer{handler: h, metrics: m}
+}
+
+// Export handles one OTLP/gRPC trace export by delegating to Handler.Export
+// and translating an *ExportError into a gRPC status. Any other failure is
+// reported as codes.Internal under the "5xx" bucket, as ServeHTTP does.
+func (s *TraceServiceServer) Export(ctx context.Context, req *coltracepb.ExportTraceServiceRequest) (*coltracepb.ExportTraceServiceResponse, error) {
+	if s.metrics != nil {
+		start := time.Now()
+		defer func() { s.metrics.OTLPRequestDuration.Observe(time.Since(start).Seconds()) }()
+		if req != nil {
+			s.metrics.OTLPRequestSizeBytes.Observe(float64(proto.Size(req)))
+		}
+	}
+
+	resp, err := s.handler.Export(ctx, req)
+	if err == nil {
+		return resp, nil
+	}
+	// Count every failure, not only *ExportError, so no failed request goes unrecorded.
+	code, msg, bucket := codes.Internal, "infrastructure error", "5xx"
+	if exportErr, ok := err.(*ExportError); ok {
+		code, msg, bucket = grpcCode(exportErr.StatusCode), exportErr.Message, exportErr.Bucket
+	}
+	if s.metrics != nil {
+		s.metrics.OTLPRequestsTotal.WithLabelValues(bucket).Inc()
+	}
+	return nil, status.Error(code, msg)
+}
+
+// AuthUnaryInterceptor requires a bearer token equal to one of writeKeys
+// (trimmed, blanks dropped); with no usable keys every call is let through.
+func AuthUnaryInterceptor(writeKeys []string) grpc.UnaryServerInterceptor {
+	allowed := make([][]byte, 0, len(writeKeys))
+	for _, raw := range writeKeys {
+		if trimmed := strings.TrimSpace(raw); trimmed != "" {
+			allowed = append(allowed, []byte(trimmed))
+		}
+	}
+	return func(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) {
+		if len(allowed) > 0 {
+			if token := bearerToken(ctx); token == "" || !matchesAnyToken([]byte(token), allowed) {
+				return nil, status.Error(codes.Unauthenticated, "unauthorized")
+			}
+		}
+		return handler(ctx, req)
+	}
+}
+
+// bearerToken returns the trimmed credential of the first Bearer-scheme
+// "authorization" metadata value, or "" when there is none.
+func bearerToken(ctx context.Context) string {
+	if md, ok := metadata.FromIncomingContext(ctx); ok {
+		for _, value := range md.Get("authorization") {
+			if scheme, rest, found := strings.Cut(value, " "); found && strings.EqualFold(scheme, "bearer") {
+				return strings.TrimSpace(rest)
+			}
+		}
+	}
+	return ""
+}
+
+func matchesAnyToken(token []byte, keys [][]byte) bool {
+	found := 0
+	for i := range keys {
+		found |= subtle.ConstantTimeCompare(keys[i], token) // no early exit: every key is compared
+	}
+	return found != 0
+}
+
+// grpcCode maps an ExportError HTTP status to a gRPC code (unmapped: Internal if >= 500, else Unknown).
+func grpcCode(statusCode int) codes.Code {
+	switch statusCode {
+	case http.StatusUnauthorized:
+		return codes.Unauthenticated
+	case http.StatusForbidden:
+		return codes.PermissionDenied
+	case http.StatusServiceUnavailable:
+		return codes.Unavailable
+	case http.StatusBadRequest, http.StatusRequestEntityTooLarge, http.StatusUnsupportedMediaType:
+		return codes.InvalidArgument
+	}
+	if statusCode >= 500 {
+		return codes.Internal
+	}
+	return codes.Unknown
+}
diff --git a/internal/otel/grpc_test.go b/internal/otel/grpc_test.go
new file mode 100644
index 0000000..7e9beb1
--- /dev/null
+++ b/internal/otel/grpc_test.go
@@ -0,0 +1,95 @@
+package otel
+
+import (
+ "context"
+ "net"
+ "testing"
+
+ coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+ "google.golang.org/grpc"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/credentials/insecure"
+ "google.golang.org/grpc/metadata"
+ "google.golang.org/grpc/status"
+ "google.golang.org/grpc/test/bufconn"
+)
+
+const bufSize = 1024 * 1024
+
+// newBufconnClient serves a TraceServiceServer guarded by AuthUnaryInterceptor
+// on an in-memory listener and returns a client plus a teardown function.
+func newBufconnClient(t *testing.T, keys []string) (coltracepb.TraceServiceClient, func()) {
+	t.Helper()
+	lis := bufconn.Listen(bufSize)
+	srv := grpc.NewServer(grpc.UnaryInterceptor(AuthUnaryInterceptor(keys)))
+	coltracepb.RegisterTraceServiceServer(srv, NewTraceServiceServer(testV2Ingest(t), nil, 1<<20))
+	go func() { _ = srv.Serve(lis) }()
+
+	// NewClient resolves via dns by default; passthrough hands "bufnet" to the dialer as-is.
+	conn, err := grpc.NewClient("passthrough:///bufnet",
+		grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
+			return lis.Dial()
+		}),
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+	)
+	if err != nil {
+		t.Fatalf("dial bufconn: %v", err)
+	}
+	cleanup := func() {
+		_ = conn.Close()
+		srv.Stop()
+		_ = lis.Close()
+	}
+	return coltracepb.NewTraceServiceClient(conn), cleanup
+}
+
+func TestGRPCExportHappyPath(t *testing.T) {
+ client, cleanup := newBufconnClient(t, []string{"write-key"})
+ defer cleanup()
+
+ ctx := metadata.AppendToOutgoingContext(context.Background(), "authorization", "Bearer write-key")
+ resp, err := client.Export(ctx, validOTLPRequest())
+ if err != nil {
+ t.Fatalf("export: %v", err)
+ }
+ if resp.PartialSuccess != nil && resp.PartialSuccess.RejectedSpans != 0 {
+ t.Fatalf("partial_success = %+v, want none", resp.PartialSuccess)
+ }
+}
+
+func TestGRPCExportAuth(t *testing.T) {
+ tests := []struct {
+ name string
+ auth string
+ code codes.Code
+ }{
+ {name: "missing", code: codes.Unauthenticated},
+ {name: "bad key", auth: "Bearer wrong", code: codes.Unauthenticated},
+ {name: "read key", auth: "Bearer read-key", code: codes.Unauthenticated},
+ {name: "agent key", auth: "Bearer agent-key", code: codes.Unauthenticated},
+ }
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ client, cleanup := newBufconnClient(t, []string{"write-key"})
+ defer cleanup()
+
+ ctx := context.Background()
+ if tc.auth != "" {
+ ctx = metadata.AppendToOutgoingContext(ctx, "authorization", tc.auth)
+ }
+ _, err := client.Export(ctx, validOTLPRequest())
+ if status.Code(err) != tc.code {
+ t.Fatalf("code = %v, want %v (err=%v)", status.Code(err), tc.code, err)
+ }
+ })
+ }
+}
+
+func TestGRPCExportDevModeNoKeys(t *testing.T) {
+ client, cleanup := newBufconnClient(t, nil)
+ defer cleanup()
+
+ if _, err := client.Export(context.Background(), validOTLPRequest()); err != nil {
+ t.Fatalf("export without auth in dev mode: %v", err)
+ }
+}
diff --git a/internal/otel/handler.go b/internal/otel/handler.go
index 321c344..1bcac11 100644
--- a/internal/otel/handler.go
+++ b/internal/otel/handler.go
@@ -8,6 +8,7 @@ import (
"compress/gzip"
"context"
"encoding/json"
+ "errors"
"io"
"log/slog"
"mime"
@@ -34,6 +35,29 @@ type Handler struct {
maxBodyBytes int64
}
+// ExportError is returned when decoded OTLP spans cannot be processed after
+// transport-level parsing has succeeded.
+type ExportError struct {
+ StatusCode int // HTTP status for the failure; the gRPC server maps it with grpcCode
+ Bucket string // label value recorded on OTLPRequestsTotal, e.g. "5xx"
+ Message string // text sent back to the client; also the result of Error()
+ Cause error // underlying error returned by Unwrap; nil when there is none
+}
+
+func (e *ExportError) Error() string {
+ if e == nil {
+ return ""
+ }
+ return e.Message // Cause is not part of the text; reach it via Unwrap
+}
+
+func (e *ExportError) Unwrap() error {
+ if e == nil {
+ return nil
+ }
+ return e.Cause
+}
+
// NewHandler constructs an OTLP traces handler.
func NewHandler(v2Ingest *ingestv2.Handler, m *metrics.Metrics, maxBodyBytes int64) *Handler {
return &Handler{
@@ -104,7 +128,31 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
- convResult := convert.SpansToEvents(&req)
+ resp, err := h.Export(r.Context(), &req)
+ if err != nil {
+ var exportErr *ExportError
+ if errors.As(err, &exportErr) {
+ h.respondStatus(w, exportErr.Bucket, exportErr.StatusCode, exportErr.Message)
+ return
+ }
+ h.respondStatus(w, "5xx", http.StatusInternalServerError, "infrastructure error")
+ return
+ }
+ respBytes, err := proto.Marshal(resp)
+ if err != nil {
+ h.respondStatus(w, "5xx", http.StatusInternalServerError, "failed to encode response")
+ return
+ }
+
+ w.Header().Set("Content-Type", "application/x-protobuf")
+ w.WriteHeader(http.StatusOK)
+ _, _ = w.Write(respBytes)
+}
+
+// Export converts an OTLP trace export request into schema-2.0 events and
+// writes them through the same ingest path used by SDK events.
+func (h *Handler) Export(ctx context.Context, req *coltracepb.ExportTraceServiceRequest) (*coltracepb.ExportTraceServiceResponse, error) {
+ convResult := convert.SpansToEvents(req)
// Mirror the future-timestamp guard from Server.Events: drop any span
// dated more than 5 minutes ahead of wall-clock so a skewed collector
@@ -141,8 +189,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if h.metrics != nil {
h.metrics.OTLPInfraFailures.Inc()
}
- h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
- return
+ return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error"}
}
bodies, err := marshalEvents(convResult.Events)
if err != nil {
@@ -150,21 +197,18 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if h.metrics != nil {
h.metrics.OTLPInfraFailures.Inc()
}
- h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
- return
+ return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error", Cause: err}
}
- env, err = h.v2Ingest.IngestRaw(r.Context(), bodies, true)
+ env, err = h.v2Ingest.IngestRaw(ctx, bodies, true)
if err != nil {
if err == context.Canceled || err == context.DeadlineExceeded {
- h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "request canceled")
- return
+ return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "request canceled", Cause: err}
}
slog.Error("otlp: v2 ingest infrastructure failure", "err", err)
if h.metrics != nil {
h.metrics.OTLPInfraFailures.Inc()
}
- h.respondStatus(w, "5xx", http.StatusServiceUnavailable, "infrastructure error")
- return
+ return nil, &ExportError{StatusCode: http.StatusServiceUnavailable, Bucket: "5xx", Message: "infrastructure error", Cause: err}
}
if h.metrics != nil && len(env.Rejected) > 0 {
h.metrics.OTLPValidationRejects.Add(float64(len(env.Rejected)))
@@ -180,19 +224,11 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}
}
- respBytes, err := proto.Marshal(resp)
- if err != nil {
- h.respondStatus(w, "5xx", http.StatusInternalServerError, "failed to encode response")
- return
- }
-
if h.metrics != nil {
h.metrics.OTLPRequestsTotal.WithLabelValues("2xx").Inc()
}
- w.Header().Set("Content-Type", "application/x-protobuf")
- w.WriteHeader(http.StatusOK)
- _, _ = w.Write(respBytes)
+ return resp, nil
}
func (h *Handler) respondStatus(w http.ResponseWriter, bucket string, code int, msg string) {
diff --git a/scripts/otlp-conformance.sh b/scripts/otlp-conformance.sh
new file mode 100755
index 0000000..2078f59
--- /dev/null
+++ b/scripts/otlp-conformance.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Deterministic OTLP fixture checks for Waylog's HTTP and gRPC trace paths.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+echo "[otlp-conformance] running OTLP conversion and receiver tests"
+go test ./internal/otel/...
+echo "OK: OTLP HTTP/gRPC fixture checks passed"
From 5106be19f205a4800b9bc5beda82070c8d1b74b7 Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Sun, 10 May 2026 18:26:50 -0400
Subject: [PATCH 11/14] feat: add alert-linked operator reports
- add /v1/alerts intake for Waylog, Alertmanager, Grafana, and PagerDuty payloads
- store accepted alerts as alert signals and match them to incidents when possible
- include alert evidence in incident classification without changing cause priority
- add alert references to TriageReport and canonical report hashing
- add Markdown, Slack Block Kit, and PagerDuty note renderers
- expose /v1/triage/{incident_id}/report and render_triage_report
- extend demo acceptance to prove alert intake, stable triage hash, and cited reports
- document alert intake, report rendering, and ALERT_MATCH_WINDOW
---
README.md | 10 +-
cmd/ingest/main.go | 15 +
docs/env.md | 1 +
docs/openapi.yaml | 121 +++++++-
internal/alerts/alerts.go | 414 ++++++++++++++++++++++++++
internal/alerts/alerts_test.go | 234 +++++++++++++++
internal/incidents/classifier.go | 44 ++-
internal/incidents/classifier_test.go | 48 +++
internal/reports/reports.go | 152 ++++++++++
internal/reports/reports_test.go | 68 +++++
internal/tools/report.go | 69 +++++
internal/tools/report_test.go | 29 ++
internal/triage/adapter.go | 70 ++++-
internal/triage/adapter_test.go | 37 +++
internal/triage/engine.go | 13 +
internal/triagehttp/handler.go | 57 +++-
internal/triagehttp/handler_test.go | 17 ++
pkg/triage/report.go | 11 +
pkg/triage/report_test.go | 22 ++
scripts/demo-acceptance.sh | 22 ++
20 files changed, 1442 insertions(+), 12 deletions(-)
create mode 100644 internal/alerts/alerts.go
create mode 100644 internal/alerts/alerts_test.go
create mode 100644 internal/reports/reports.go
create mode 100644 internal/reports/reports_test.go
create mode 100644 internal/tools/report.go
create mode 100644 internal/tools/report_test.go
diff --git a/README.md b/README.md
index a0246aa..943ae6b 100644
--- a/README.md
+++ b/README.md
@@ -208,10 +208,12 @@ Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP
### Analysis tools
-All eleven tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+All twelve tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
+External alerts can be posted to `POST /v1/alerts` as Waylog-normalized JSON or Alertmanager, Grafana, or PagerDuty webhooks. Waylog stores them as alert signals, links them to active incidents when possible, and can render cited Markdown, Slack Block Kit, or PagerDuty-note reports from the same deterministic triage artifact.
+
| Tool | Answers |
| ------------------ | ------------------------------------------------------------- |
| `graph_stats` | Overall shape of the graph right now |
@@ -225,6 +227,7 @@ Agents can call the built-in triage plan template with `POST /v1/plans/execute`
| `compare_windows` | Diff error rates between two windows |
| `graph_insights` | Windowed rollup of top errors and patterns |
| `triage_incident` | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
+| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note text rendered from a TriageReport |
Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
@@ -303,8 +306,9 @@ Public alpha. APIs may break before 1.0.
- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
- SQLite cold store (events, deployments, signals, incidents, causal claims)
- signal-driven incident engine with `waylog incidents`, `waylog incident `, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
+- alert intake for Waylog, Alertmanager, Grafana, and PagerDuty webhooks, stored as signals and linked to incidents when possible
- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
-- 11 deterministic analysis tools, rollup-correct root-cause attribution
+- 12 deterministic analysis tools, rollup-correct root-cause attribution
- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
- `/v1/traces/story` and indented failure-path rendering in the dashboard
- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
@@ -327,7 +331,7 @@ Public alpha. APIs may break before 1.0.
- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
- Incident cause classification is deterministic and heuristic.
-- No built-in alerting or paging. Waylog answers questions, it doesn't wake you up.
+- No outbound alerting or paging delivery. Waylog accepts external alerts and renders operator reports, but it doesn't wake you up.
- No multi-tenancy. One instance = one trust boundary.
- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index b71153f..db2f2c7 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -16,6 +16,7 @@ import (
"time"
"github.com/prometheus/client_golang/prometheus"
+ "github.com/sssmaran/WaylogCLI/internal/alerts"
"github.com/sssmaran/WaylogCLI/internal/auth"
"github.com/sssmaran/WaylogCLI/internal/cli"
"github.com/sssmaran/WaylogCLI/internal/coldstore"
@@ -136,6 +137,13 @@ func main() {
otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
+ alertMatchWindow := config.GetenvDuration("ALERT_MATCH_WINDOW", 15*time.Minute)
+ if alertMatchWindow <= 0 {
+ alertMatchWindow = 15 * time.Minute
+ }
+ if alertMatchWindow > 24*time.Hour {
+ alertMatchWindow = 24 * time.Hour
+ }
incidentsEnabled := config.GetenvBool("WAYLOG_INCIDENTS_ENABLED", true)
incidentCfg := incidents.Config{
TickInterval: config.GetenvDuration("WAYLOG_INCIDENT_TICK_INTERVAL", 30*time.Second),
@@ -530,6 +538,7 @@ func main() {
},
),
Signals: triage.NewSignalQueryAdapter(signalStore),
+ Alerts: triage.NewAlertQueryAdapter(signalStore, alertMatchWindow),
NextChecks: triage.NewNextChecksAdapter(),
})
if err != nil {
@@ -540,6 +549,10 @@ func main() {
slog.Error("triage tool register failed", "err", err)
os.Exit(1)
}
+ if err := tools.RegisterTriageReportTool(reg, triageEng); err != nil {
+ slog.Error("triage report tool register failed", "err", err)
+ os.Exit(1)
+ }
triageHandler := triagehttp.NewHandler(triageEng)
mux.Handle("/v1/triage/", readCORS(triageHandler.Triage))
@@ -561,6 +574,8 @@ func main() {
mux.Handle("/v1/topology", readCORS(ingestServer.Topology))
mux.Handle("/v1/stream/dashboard", readCORS(ingestServer.SSEStream))
mux.Handle("/v1/insight", readCORS(ingestServer.Insight))
+ alertHandler := alerts.NewHandler(signalStore, incidentEngine, v2Reader, alertMatchWindow)
+ mux.Handle("/v1/alerts", writeAuth(http.HandlerFunc(alertHandler.Alerts)))
// Deployments — dual method: GET=read, POST=write.
mux.Handle("/v1/deployments", http.HandlerFunc(
diff --git a/docs/env.md b/docs/env.md
index de42e43..3de9e62 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -54,6 +54,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
| `WRITE_TIMEOUT` | `10s` | HTTP write timeout |
| `IDLE_TIMEOUT` | `120s` | HTTP idle timeout |
| `CORS_ORIGIN` | `*` | Allowed CORS origin for read APIs |
+| `ALERT_MATCH_WINDOW` | `15m` | Window for matching `/v1/alerts` to active incidents by `env + service + error_code`; non-positive values fall back to `15m`, and values above `24h` are capped at `24h` |
## CLI
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 4d480f1..4c31f77 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -198,6 +198,54 @@ paths:
schema:
$ref: '#/components/schemas/ReadError'
+ /v1/alerts:
+ post:
+ tags: [Signals]
+ operationId: ingestAlert
+ summary: Ingest an external alert and match it to an active incident
+ description: |
+ Accepts Waylog-normalized alerts plus Alertmanager, Grafana, and
+ PagerDuty webhook payloads. Accepted alerts are stored as `type=alert`
+ signals. Matching is best-effort and does not create incidents directly.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ responses:
+ '201':
+ description: Alert accepted and stored as a signal
+ content:
+ application/json:
+ schema:
+ type: object
+ required: [signal, match]
+ properties:
+ signal:
+ $ref: '#/components/schemas/Signal'
+ match:
+ type: object
+ required: [matched, strategy]
+ properties:
+ matched: {type: boolean}
+ incident_id: {type: string}
+ strategy:
+ type: string
+ enum: [incident_id, trace_id, family, none]
+ '400':
+ description: Invalid JSON, unsupported alert shape, or missing required fields
+ '401':
+ description: Unauthorized
+ '405':
+ description: Method Not Allowed
+ '503':
+ description: Signal store unavailable
+
/v1/otlp/v1/traces:
post:
tags: [OTLP]
@@ -559,6 +607,59 @@ paths:
'500':
description: Triage build failed
+ /v1/triage/{incident_id}/report:
+ get:
+ tags: [Triage]
+ operationId: renderTriageReport
+ summary: Render a cited operator report from a TriageReport
+ description: |
+ Builds the same deterministic TriageReport as `/v1/triage/{incident_id}`
+ and renders it as Markdown, Slack Block Kit JSON, or PagerDuty note text.
+ This endpoint does not deliver messages to Slack or PagerDuty.
+ security:
+ - ApiKeyHeader: []
+ - BearerAuth: []
+ parameters:
+ - name: incident_id
+ in: path
+ required: true
+ schema: {type: string}
+ - name: format
+ in: query
+ required: false
+ schema:
+ type: string
+ enum: [markdown, slack, pagerduty]
+ default: markdown
+ - name: window
+ in: query
+ required: false
+ schema: {type: string, default: "15m"}
+ - name: snapshot
+ in: query
+ required: false
+ schema: {type: boolean, default: false}
+ responses:
+ '200':
+ description: Rendered operator report
+ content:
+ text/markdown:
+ schema: {type: string}
+ text/plain:
+ schema: {type: string}
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ '400':
+ description: Missing or invalid parameters
+ '401':
+ description: Unauthorized
+ '404':
+ $ref: '#/components/responses/ReadNotFound'
+ '500':
+ description: Triage build or render failed
+
/v1/insight:
get:
tags: [Operational]
@@ -811,7 +912,10 @@ paths:
tags: [Operational]
operationId: executeTool
summary: Direct tool call
- description: Executes a registered structured tool by name.
+ description: |
+ Executes a registered structured tool by name. `triage_incident`
+ returns a TriageReport; `render_triage_report` renders that report as
+ Markdown, Slack Block Kit JSON, or PagerDuty note text without delivery.
security:
- ApiKeyHeader: []
- BearerAuth: []
@@ -1884,6 +1988,21 @@ components:
id: {type: string}
type: {type: string}
evidence_ids: {type: array, items: {type: string}}
+ alerts:
+ type: array
+ description: Alert signals linked to this incident and cited by operator reports.
+ items:
+ type: object
+ properties:
+ signal_id: {type: string}
+ alert_id: {type: string}
+ source: {type: string, enum: [waylog, alertmanager, grafana, pagerduty]}
+ severity: {type: string}
+ reason: {type: string}
+ provider_url: {type: string}
+ evidence_ids:
+ type: array
+ items: {type: string}
next_checks:
type: array
items:
diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go
new file mode 100644
index 0000000..edde578
--- /dev/null
+++ b/internal/alerts/alerts.go
@@ -0,0 +1,414 @@
+package alerts
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// Machine-readable error codes placed in the "error.code" field of the JSON
+// error responses written by this package.
+const (
+ CodeInvalidJSON = "INVALID_JSON"
+ CodeMissingFields = "MISSING_FIELDS"
+ CodeUnsupportedAlert = "UNSUPPORTED_ALERT"
+ CodeSignalUnavailable = "SIGNALS_UNAVAILABLE"
+ CodeInternal = "INTERNAL"
+)
+
+// IncidentSource is the incident lookup used while matching an alert:
+// Active supplies the candidate incidents scanned by findActive and Get
+// fetches a single incident by its ID.
+type IncidentSource interface {
+ Active(ctx context.Context) ([]incidents.Incident, error)
+ Get(ctx context.Context, id string) (incidents.Incident, error)
+}
+
+// TraceResolver resolves a trace ID to its story. Match treats the boolean
+// result as "found" and ignores the story when it is false.
+type TraceResolver interface {
+ TraceStoryByTraceID(traceID string) (apiv2.StoryResponse, bool)
+}
+
+// Handler accepts alert payloads over HTTP, normalizes them into signals,
+// matches them against incidents and persists them. Build it with NewHandler.
+type Handler struct {
+ store signals.Store // destination for accepted alert signals
+ incidents IncidentSource // may be nil, in which case Match reports "none"
+ traces TraceResolver // may be nil, which disables trace_id matching
+ now func() time.Time // clock, replaceable in tests
+ matchWindow time.Duration // slack added around an incident's start/update times
+ maxBody int64 // request body limit in bytes
+}
+
+// MatchResult reports whether an alert was linked to an incident and how.
+// Strategy is one of "incident_id", "trace_id", "family" or "none".
+type MatchResult struct {
+ Matched bool `json:"matched"`
+ IncidentID string `json:"incident_id,omitempty"`
+ Strategy string `json:"strategy"`
+}
+
+// NewHandler wires a Handler together.
+//
+// A nil store is replaced by signals.UnavailableStore. matchWindow defaults
+// to 15 minutes when it is zero or negative and is capped at 24 hours. The
+// request body limit is fixed at 1 MiB and the clock is time.Now.
+func NewHandler(store signals.Store, incidents IncidentSource, traces TraceResolver, matchWindow time.Duration) *Handler {
+	const (
+		defaultWindow = 15 * time.Minute
+		maxWindow     = 24 * time.Hour
+	)
+	switch {
+	case matchWindow <= 0:
+		matchWindow = defaultWindow
+	case matchWindow > maxWindow:
+		matchWindow = maxWindow
+	}
+
+	h := &Handler{
+		store:       store,
+		incidents:   incidents,
+		traces:      traces,
+		now:         time.Now,
+		matchWindow: matchWindow,
+		maxBody:     1 << 20,
+	}
+	if h.store == nil {
+		h.store = signals.UnavailableStore{}
+	}
+	return h
+}
+
+// Alerts is the HTTP handler for alert ingestion. It accepts POST only.
+//
+// The body (limited to h.maxBody bytes) is normalized into a signal, matched
+// against incidents, stamped with a server-generated ID and receive time,
+// validated and stored. On success it responds 201 with the stored signal
+// and the match result.
+//
+// Error responses:
+//   - 405 for non-POST methods
+//   - 413 when the body exceeds the size limit
+//   - 400 for unreadable, malformed, unsupported or invalid payloads
+//   - 503 when the signal store is unavailable
+//   - 500 for any other store failure
+func (h *Handler) Alerts(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		writeError(w, http.StatusMethodNotAllowed, "METHOD_NOT_ALLOWED", "method not allowed", "")
+		return
+	}
+	r.Body = http.MaxBytesReader(w, r.Body, h.maxBody)
+	body, err := io.ReadAll(r.Body)
+	if err != nil {
+		// An oversized body is a size problem, not a JSON problem: report it
+		// as 413 so senders can tell the two apart.
+		var tooLarge *http.MaxBytesError
+		if errors.As(err, &tooLarge) {
+			writeError(w, http.StatusRequestEntityTooLarge, "PAYLOAD_TOO_LARGE", "request body too large", err.Error())
+			return
+		}
+		writeError(w, http.StatusBadRequest, CodeInvalidJSON, "invalid body", err.Error())
+		return
+	}
+
+	// Sample the clock once so the default timestamp, received_at and the
+	// validation reference time all agree.
+	now := h.now().UTC()
+
+	sig, err := Normalize(body, now)
+	if err != nil {
+		var normErr *NormalizeError
+		if errors.As(err, &normErr) {
+			writeError(w, normErr.Status, normErr.Code, normErr.Message, normErr.Detail)
+			return
+		}
+		writeError(w, http.StatusBadRequest, CodeUnsupportedAlert, "unsupported alert", err.Error())
+		return
+	}
+	match := h.Match(r.Context(), sig)
+	sig.SignalID = signals.NewSignalID()
+	sig.ReceivedAt = now
+	if err := signals.Validate(sig, now, 5*time.Minute); err != nil {
+		writeError(w, http.StatusBadRequest, CodeMissingFields, "invalid alert", err.Error())
+		return
+	}
+	if err := h.store.Insert(r.Context(), sig); err != nil {
+		if errors.Is(err, signals.ErrUnavailable) {
+			writeError(w, http.StatusServiceUnavailable, CodeSignalUnavailable, "signals unavailable", "set SQLITE_PATH to enable alerts")
+			return
+		}
+		// Store errors are not echoed to the client.
+		writeError(w, http.StatusInternalServerError, CodeInternal, "internal error", "")
+		return
+	}
+	writeJSON(w, http.StatusCreated, map[string]any{"signal": sig, "match": match})
+}
+
+// Match links an alert signal to an incident, trying strategies in order:
+//
+//  1. "incident_id": metadata names an incident that Get can return
+//  2. "trace_id": the trace's story has an anchor whose service and error
+//     code identify an active incident
+//  3. "family": the signal's service plus its metadata error_code identify
+//     an active incident
+//
+// It returns Strategy "none" when nothing matches, or when sig or the
+// incident source is nil. A failed Get falls through to the next strategy.
+func (h *Handler) Match(ctx context.Context, sig *signals.Signal) MatchResult {
+	unmatched := MatchResult{Strategy: "none"}
+	if sig == nil || h.incidents == nil {
+		return unmatched
+	}
+	linked := func(incidentID, strategy string) MatchResult {
+		return MatchResult{Matched: true, IncidentID: incidentID, Strategy: strategy}
+	}
+
+	if explicitID := metaString(sig.Metadata, "incident_id"); explicitID != "" {
+		inc, err := h.incidents.Get(ctx, explicitID)
+		if err == nil {
+			return linked(inc.IncidentID, "incident_id")
+		}
+	}
+
+	if h.traces != nil {
+		if traceID := metaString(sig.Metadata, "trace_id"); traceID != "" {
+			story, found := h.traces.TraceStoryByTraceID(traceID)
+			if found && story.Anchor != nil {
+				inc, ok := h.findActive(ctx, sig.Env, story.Service, story.Anchor.ErrorCode, sig.Timestamp)
+				if ok {
+					return linked(inc.IncidentID, "trace_id")
+				}
+			}
+		}
+	}
+
+	if errorCode := metaString(sig.Metadata, "error_code"); errorCode != "" {
+		inc, ok := h.findActive(ctx, sig.Env, sig.Service, errorCode, sig.Timestamp)
+		if ok {
+			return linked(inc.IncidentID, "family")
+		}
+	}
+	return unmatched
+}
+
+// findActive returns the first incident from Active that has active status,
+// a compatible env (an empty env on either side matches anything), and the
+// given service and error code. When ts is non-zero it must also fall within
+// [StartedAt-matchWindow, UpdatedAt+matchWindow]. A failed Active call is
+// reported as "no match".
+func (h *Handler) findActive(ctx context.Context, env, service, errorCode string, ts time.Time) (incidents.Incident, bool) {
+	candidates, err := h.incidents.Active(ctx)
+	if err != nil {
+		return incidents.Incident{}, false
+	}
+	for _, candidate := range candidates {
+		switch {
+		case candidate.Status != incidents.StatusActive:
+			continue
+		case env != "" && candidate.Env != "" && candidate.Env != env:
+			continue
+		case candidate.ErrorFamily.Service != service,
+			candidate.ErrorFamily.ErrorCode != errorCode:
+			continue
+		}
+		if !ts.IsZero() {
+			earliest := candidate.StartedAt.Add(-h.matchWindow)
+			latest := candidate.UpdatedAt.Add(h.matchWindow)
+			if ts.Before(earliest) || ts.After(latest) {
+				continue
+			}
+		}
+		return candidate, true
+	}
+	return incidents.Incident{}, false
+}
+
+// NormalizeError describes why a payload could not be normalized. Its fields
+// map directly onto the HTTP error response written by Handler.Alerts.
+type NormalizeError struct {
+ Status int
+ Code string
+ Message string
+ Detail string
+}
+
+// Error implements the error interface and returns only Message.
+func (e *NormalizeError) Error() string {
+ return e.Message
+}
+
+// Normalize decodes an alert payload and converts it into a signal.
+//
+// Detection order matters and is:
+//  1. an explicit "source" other than "waylog": parsed in Waylog shape, with
+//     that source preserved
+//  2. Grafana markers (ruleId, ruleUID, evalMatches, or a Grafana-flavoured
+//     Alertmanager payload)
+//  3. "source" or "alert_id" present: Waylog
+//  4. "alerts" together with "receiver" or "commonLabels": Alertmanager
+//  5. "messages" or "event": PagerDuty
+//
+// now is the fallback timestamp for payloads that carry none. Failures are
+// returned as *NormalizeError.
+func Normalize(body []byte, now time.Time) (*signals.Signal, error) {
+	var root map[string]json.RawMessage
+	if err := json.Unmarshal(body, &root); err != nil {
+		return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeInvalidJSON, Message: "invalid json", Detail: err.Error()}
+	}
+
+	if declared := getString(root, "source"); declared != "" && declared != "waylog" {
+		return normalizeWaylog(root, now)
+	}
+	if has(root, "ruleId") || has(root, "ruleUID") || has(root, "evalMatches") || isGrafanaAlertmanagerPayload(root) {
+		return normalizeGrafana(root, now)
+	}
+	if has(root, "source") || has(root, "alert_id") {
+		return normalizeWaylog(root, now)
+	}
+	if has(root, "alerts") && (has(root, "receiver") || has(root, "commonLabels")) {
+		return normalizeAlertmanager(root, now)
+	}
+	if has(root, "messages") || has(root, "event") {
+		return normalizePagerDuty(root, now)
+	}
+	return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeUnsupportedAlert, Message: "unsupported alert", Detail: "expected waylog, alertmanager, grafana, or pagerduty payload"}
+}
+
+// isGrafanaAlertmanagerPayload reports whether an Alertmanager-shaped payload
+// carries Grafana markers: an org ID at the root, Grafana URL keys on the
+// first alert, a grafana_folder label, or dashboard/panel annotations.
+func isGrafanaAlertmanagerPayload(root map[string]json.RawMessage) bool {
+	for _, key := range []string{"orgId", "orgID"} {
+		if has(root, key) {
+			return true
+		}
+	}
+
+	first := firstObject(root, "alerts")
+	for _, key := range []string{"dashboardURL", "panelURL", "ruleURL"} {
+		if has(first, key) {
+			return true
+		}
+	}
+
+	if stringMapField(objectField(first, "labels"), "grafana_folder") != "" {
+		return true
+	}
+	annotations := objectField(first, "annotations")
+	for _, key := range []string{"__dashboardUid__", "__panelId__"} {
+		if stringMapField(annotations, key) != "" {
+			return true
+		}
+	}
+	return false
+}
+
+// normalizeWaylog converts a flat Waylog-shaped payload into a signal. The
+// source defaults to "waylog" and the timestamp to now when absent or
+// unparsable.
+func normalizeWaylog(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	source := getString(root, "source")
+	if source == "" {
+		source = "waylog"
+	}
+	return finalize(
+		source,
+		getString(root, "service"),
+		getString(root, "env"),
+		getSeverity(root, "severity"),
+		getString(root, "reason"),
+		getString(root, "message"),
+		getTime(root, "timestamp", now),
+		baseMeta(root, source),
+	)
+}
+
+// normalizeAlertmanager converts an Alertmanager webhook payload into a
+// signal. Only the first entry of "alerts" is considered. Service, env,
+// severity and error_code come from its labels; the reason is the summary,
+// then the description, then the alert name.
+func normalizeAlertmanager(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	first := firstObject(root, "alerts")
+	labels := objectField(first, "labels")
+	annotations := objectField(first, "annotations")
+	description := stringMapField(annotations, "description")
+
+	meta := map[string]any{"raw_source": "alertmanager"}
+	put(meta, "alert_id", firstString(first, labels, "fingerprint", "alertname"))
+	put(meta, "fingerprint", firstString(first, labels, "fingerprint"))
+	put(meta, "provider_url", firstString(first, annotations, "generatorURL", "runbook_url"))
+	put(meta, "error_code", stringMapField(labels, "error_code"))
+
+	reason := firstNonEmpty(
+		stringMapField(annotations, "summary"),
+		description,
+		stringMapField(labels, "alertname"),
+	)
+	return finalize(
+		"alertmanager",
+		stringMapField(labels, "service"),
+		stringMapField(labels, "env"),
+		severityFromString(stringMapField(labels, "severity")),
+		reason,
+		description,
+		timeField(first, "startsAt", now),
+		meta,
+	)
+}
+
+// normalizeGrafana converts a Grafana alert payload into a signal.
+//
+// When the payload has an "alerts" array its first entry is used; otherwise
+// the root object itself is treated as the alert. Labels and annotations are
+// read after that fallback is applied, so a payload without "alerts" that
+// carries them at the root is still honoured. Root-level service, env,
+// error_code, title, ruleName and state are used as fallbacks.
+func normalizeGrafana(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	alert := firstObject(root, "alerts")
+	if len(alert) == 0 {
+		alert = root
+	}
+	// Resolve these from the effective alert object; reading them before the
+	// fallback would leave them empty for payloads without an alerts array.
+	labels := objectField(alert, "labels")
+	annotations := objectField(alert, "annotations")
+
+	meta := map[string]any{"raw_source": "grafana"}
+	put(meta, "alert_id", firstNonEmpty(getString(root, "ruleUID"), getString(root, "ruleId"), stringMapField(labels, "alertname")))
+	put(meta, "fingerprint", stringMapField(alert, "fingerprint"))
+	put(meta, "provider_url", firstString(alert, root, "dashboardURL", "panelURL", "generatorURL", "ruleUrl"))
+	put(meta, "error_code", firstNonEmpty(stringMapField(labels, "error_code"), getString(root, "error_code")))
+
+	ts := timeField(alert, "startsAt", now)
+	reason := firstNonEmpty(stringMapField(annotations, "summary"), getString(root, "title"), getString(root, "ruleName"), stringMapField(labels, "alertname"))
+	service := firstNonEmpty(stringMapField(labels, "service"), getString(root, "service"))
+	env := firstNonEmpty(stringMapField(labels, "env"), getString(root, "env"))
+	severity := severityFromString(firstNonEmpty(stringMapField(labels, "severity"), getString(root, "state")))
+	return finalize("grafana", service, env, severity, reason, stringMapField(annotations, "description"), ts, meta)
+}
+
+// normalizePagerDuty converts a PagerDuty webhook payload into a signal.
+//
+// The event is taken from messages[0].event or, when that is absent, from a
+// root-level "event" object. Normalize routes payloads with only a root
+// "event" here, so it must be read. The incident data comes from event.data,
+// falling back to a root-level "incident" object.
+func normalizePagerDuty(root map[string]json.RawMessage, now time.Time) (*signals.Signal, error) {
+	event := objectField(firstObject(root, "messages"), "event")
+	if len(event) == 0 {
+		event = objectField(root, "event")
+	}
+	data := objectField(event, "data")
+	if len(data) == 0 {
+		data = objectField(root, "incident")
+	}
+	// data.service may be a plain string or an object with a summary.
+	serviceObj := objectField(data, "service")
+
+	meta := map[string]any{"raw_source": "pagerduty"}
+	put(meta, "alert_id", firstNonEmpty(stringMapField(data, "id"), stringMapField(event, "id")))
+	put(meta, "provider_url", stringMapField(data, "html_url"))
+	put(meta, "error_code", stringMapField(data, "error_code"))
+	put(meta, "incident_id", stringMapField(data, "incident_id"))
+
+	ts := timeField(data, "created_at", now)
+	reason := firstNonEmpty(stringMapField(data, "title"), stringMapField(data, "summary"), stringMapField(event, "event_type"))
+	service := firstNonEmpty(stringMapField(data, "service"), stringMapField(serviceObj, "summary"))
+	severity := severityFromString(firstNonEmpty(stringMapField(data, "urgency"), stringMapField(data, "severity")))
+	return finalize("pagerduty", service, stringMapField(data, "env"), severity, reason, stringMapField(data, "description"), ts, meta)
+}
+
+// finalize builds the alert signal shared by all provider normalizers.
+// service, env and reason must be non-blank, otherwise a MISSING_FIELDS
+// NormalizeError is returned. An empty severity becomes warning and the
+// timestamp is converted to UTC.
+func finalize(source, service, env string, severity signals.Severity, reason, message string, ts time.Time, meta map[string]any) (*signals.Signal, error) {
+	for _, required := range []string{service, env, reason} {
+		if strings.TrimSpace(required) == "" {
+			return nil, &NormalizeError{Status: http.StatusBadRequest, Code: CodeMissingFields, Message: "missing required fields", Detail: "service, env, and reason are required"}
+		}
+	}
+	sig := &signals.Signal{
+		Type:      signals.TypeAlert,
+		Source:    source,
+		Service:   service,
+		Env:       env,
+		Severity:  severity,
+		Reason:    reason,
+		Message:   message,
+		Metadata:  meta,
+		Timestamp: ts.UTC(),
+	}
+	if sig.Severity == "" {
+		sig.Severity = signals.SeverityWarning
+	}
+	return sig, nil
+}
+
+// writeJSON writes v as a JSON response with the given status code.
+// The encode error is discarded: the status line has already been sent.
+func writeJSON(w http.ResponseWriter, status int, v any) {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(status)
+ _ = json.NewEncoder(w).Encode(v)
+}
+
+// writeError writes the error envelope {"error": {"code", "message",
+// "detail"}} with the given status. detail is emitted even when empty.
+func writeError(w http.ResponseWriter, status int, code, message, detail string) {
+ writeJSON(w, status, map[string]any{"error": map[string]string{"code": code, "message": message, "detail": detail}})
+}
+
+// has reports whether key is present in root, whatever its value (a JSON
+// null counts as present). A nil map has no keys.
+func has(root map[string]json.RawMessage, key string) bool {
+	if _, present := root[key]; present {
+		return true
+	}
+	return false
+}
+
+// baseMeta builds signal metadata for a Waylog-shaped payload: raw_source
+// plus whichever of the known passthrough keys are non-empty strings.
+func baseMeta(root map[string]json.RawMessage, source string) map[string]any {
+	passthrough := [...]string{"alert_id", "error_code", "trace_id", "incident_id", "provider_url", "fingerprint"}
+
+	meta := map[string]any{"raw_source": source}
+	for _, key := range passthrough {
+		if value := getString(root, key); value != "" {
+			meta[key] = value
+		}
+	}
+	return meta
+}
+
+// put stores value under key unless value is empty, so absent fields never
+// appear in the metadata map.
+func put(m map[string]any, key, value string) {
+	if value == "" {
+		return
+	}
+	m[key] = value
+}
+
+// metaString returns m[key] when it holds a string, and "" when the key is
+// missing or holds any other type.
+func metaString(m map[string]any, key string) string {
+	text, _ := m[key].(string)
+	return text
+}
+
+// getString decodes root[key] as a JSON string and trims surrounding
+// whitespace. A missing key or a non-string value yields "".
+func getString(root map[string]json.RawMessage, key string) string {
+	var decoded string
+	// Best effort: on a decode error decoded stays empty, which is the
+	// value callers expect for "not present".
+	_ = json.Unmarshal(root[key], &decoded)
+	return strings.TrimSpace(decoded)
+}
+
+// getSeverity reads root[key] as a string and maps it to a severity via
+// severityFromString.
+func getSeverity(root map[string]json.RawMessage, key string) signals.Severity {
+ return severityFromString(getString(root, key))
+}
+
+// severityFromString maps a provider severity, urgency or state string to a
+// signal severity, ignoring case and surrounding whitespace. Anything not
+// recognised as critical or info (including the empty string) is a warning.
+func severityFromString(s string) signals.Severity {
+	switch strings.ToLower(strings.TrimSpace(s)) {
+	case "critical", "error", "high", "triggered":
+		return signals.SeverityCritical
+	case "info", "resolved", "ok":
+		return signals.SeverityInfo
+	}
+	return signals.SeverityWarning
+}
+
+// getTime parses root[key] like timeField, and also returns fallback when
+// the parsed value is the zero time.
+func getTime(root map[string]json.RawMessage, key string, fallback time.Time) time.Time {
+	parsed := timeField(root, key, time.Time{})
+	if parsed.IsZero() {
+		return fallback
+	}
+	return parsed
+}
+
+// timeField parses root[key] as an RFC 3339 timestamp (fractional seconds
+// allowed). It returns fallback when the key is missing, the value is not a
+// non-empty JSON string, or the string does not parse.
+func timeField(root map[string]json.RawMessage, key string, fallback time.Time) time.Time {
+	raw, present := root[key]
+	if !present {
+		return fallback
+	}
+	var text string
+	if json.Unmarshal(raw, &text) != nil || text == "" {
+		return fallback
+	}
+	parsed, err := time.Parse(time.RFC3339Nano, text)
+	if err != nil {
+		return fallback
+	}
+	return parsed
+}
+
+// firstObject returns the first element of the JSON array at root[key], or
+// nil when the key is missing, is not an array of objects, or is empty.
+func firstObject(root map[string]json.RawMessage, key string) map[string]json.RawMessage {
+	var items []map[string]json.RawMessage
+	err := json.Unmarshal(root[key], &items)
+	if err != nil || len(items) == 0 {
+		return nil
+	}
+	return items[0]
+}
+
+// objectField returns the JSON object stored at root[key], or nil when root
+// is nil, the key is missing, or the value is not an object.
+func objectField(root map[string]json.RawMessage, key string) map[string]json.RawMessage {
+	if root == nil {
+		return nil
+	}
+	var nested map[string]json.RawMessage
+	if json.Unmarshal(root[key], &nested) == nil {
+		return nested
+	}
+	return nil
+}
+
+// stringMapField decodes root[key] as a JSON string and trims surrounding
+// whitespace. A nil map, a missing key or a non-string value yields "".
+func stringMapField(root map[string]json.RawMessage, key string) string {
+	var decoded string
+	if root != nil {
+		// Best effort: on a decode error decoded simply stays empty.
+		_ = json.Unmarshal(root[key], &decoded)
+	}
+	return strings.TrimSpace(decoded)
+}
+
+// firstString returns the first non-empty string found while walking keys
+// in order, checking primary before secondary for each key. It returns ""
+// when nothing is found.
+func firstString(primary, secondary map[string]json.RawMessage, keys ...string) string {
+	sources := [2]map[string]json.RawMessage{primary, secondary}
+	for _, key := range keys {
+		for _, src := range sources {
+			if found := stringMapField(src, key); found != "" {
+				return found
+			}
+		}
+	}
+	return ""
+}
+
+// firstNonEmpty returns the first value that is not blank, trimmed of
+// surrounding whitespace, or "" when every value is blank.
+func firstNonEmpty(values ...string) string {
+	for _, candidate := range values {
+		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
+			return trimmed
+		}
+	}
+	return ""
+}
diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go
new file mode 100644
index 0000000..6b0b7da
--- /dev/null
+++ b/internal/alerts/alerts_test.go
@@ -0,0 +1,234 @@
+package alerts
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/incidents"
+ "github.com/sssmaran/WaylogCLI/internal/signals"
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+)
+
+// TestNormalizeWaylogAlert checks that a native Waylog payload becomes an
+// alert signal and that alert_id and error_code are copied into metadata.
+func TestNormalizeWaylogAlert(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ raw := []byte(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","severity":"critical","reason":"PMT_502 spike","error_code":"PMT_502","provider_url":"https://alerts/1","timestamp":"2026-05-10T12:00:00Z"}`)
+ sig, err := Normalize(raw, now)
+ if err != nil {
+ t.Fatalf("normalize: %v", err)
+ }
+ if sig.Type != signals.TypeAlert || sig.Source != "waylog" || sig.Service != "checkout" {
+ t.Fatalf("unexpected signal: %+v", sig)
+ }
+ if sig.Metadata["alert_id"] != "alert_1" || sig.Metadata["error_code"] != "PMT_502" {
+ t.Fatalf("metadata not preserved: %+v", sig.Metadata)
+ }
+}
+
+// TestNormalizeProviderPayloads feeds one representative payload per
+// provider through Normalize and checks the detected source plus the
+// extracted service and env.
+func TestNormalizeProviderPayloads(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ cases := []struct {
+ name string
+ raw string
+ source string
+ }{
+ {
+ name: "alertmanager",
+ raw: `{"receiver":"team","alerts":[{"fingerprint":"fp1","startsAt":"2026-05-10T12:00:00Z","generatorURL":"https://am/1","labels":{"alertname":"Payment502","service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike"}}]}`,
+ source: "alertmanager",
+ },
+ {
+ name: "grafana",
+ raw: `{"ruleUID":"rule1","title":"Payment failures","state":"alerting","alerts":[{"startsAt":"2026-05-10T12:00:00Z","dashboardURL":"https://grafana/d","labels":{"service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike"}}]}`,
+ source: "grafana",
+ },
+ {
+ name: "pagerduty",
+ raw: `{"messages":[{"event":{"event_type":"incident.trigger","data":{"id":"pd1","html_url":"https://pd/1","title":"PMT_502 spike","service":{"summary":"checkout"},"env":"prod","urgency":"high","error_code":"PMT_502"}}}]}`,
+ source: "pagerduty",
+ },
+ }
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ sig, err := Normalize([]byte(tc.raw), now)
+ if err != nil {
+ t.Fatalf("normalize: %v", err)
+ }
+ if sig.Source != tc.source || sig.Type != signals.TypeAlert {
+ t.Fatalf("unexpected signal: %+v", sig)
+ }
+ if sig.Service != "checkout" || sig.Env != "prod" {
+ t.Fatalf("service/env missing: %+v", sig)
+ }
+ })
+ }
+}
+
+// TestMatcherOrderAndUnmatched covers three Match outcomes against a single
+// active incident: an explicit incident_id wins over a non-matching error
+// code, an error-family match succeeds, and a timestamp outside the match
+// window yields no match.
+func TestMatcherOrderAndUnmatched(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ inc := incidents.Incident{
+ IncidentID: "inc_1",
+ Env: "prod",
+ Status: incidents.StatusActive,
+ StartedAt: now.Add(-time.Minute),
+ UpdatedAt: now,
+ ErrorFamily: apiv2.ErrorFamily{
+ Service: "checkout",
+ Step: "payment.charge",
+ ErrorCode: "PMT_502",
+ },
+ }
+ h := NewHandler(&memSignalStore{}, incidentSource{rows: []incidents.Incident{inc}}, traceResolver{}, 15*time.Minute)
+
+ sig := &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now, Metadata: map[string]any{"incident_id": "inc_1", "error_code": "OTHER"}}
+ got := h.Match(context.Background(), sig)
+ if !got.Matched || got.Strategy != "incident_id" {
+ t.Fatalf("incident id should win, got %+v", got)
+ }
+
+ sig = &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now, Metadata: map[string]any{"error_code": "PMT_502"}}
+ got = h.Match(context.Background(), sig)
+ if !got.Matched || got.Strategy != "family" {
+ t.Fatalf("family match failed: %+v", got)
+ }
+
+ sig = &signals.Signal{Env: "prod", Service: "checkout", Timestamp: now.Add(2 * time.Hour), Metadata: map[string]any{"error_code": "PMT_502"}}
+ got = h.Match(context.Background(), sig)
+ if got.Matched || got.Strategy != "none" {
+ t.Fatalf("outside window should be unmatched: %+v", got)
+ }
+}
+
+// TestMatcherExplicitIncidentIDCanMatchResolvedIncident pins that the
+// incident_id strategy does not filter on incident status.
+func TestMatcherExplicitIncidentIDCanMatchResolvedIncident(t *testing.T) {
+ inc := incidents.Incident{IncidentID: "inc_resolved", Status: incidents.StatusResolved}
+ h := NewHandler(&memSignalStore{}, incidentSource{rows: []incidents.Incident{inc}}, traceResolver{}, 15*time.Minute)
+
+ got := h.Match(context.Background(), &signals.Signal{Metadata: map[string]any{"incident_id": "inc_resolved"}})
+ if !got.Matched || got.Strategy != "incident_id" || got.IncidentID != "inc_resolved" {
+ t.Fatalf("explicit incident_id should be authoritative, got %+v", got)
+ }
+}
+
+// TestNormalizeGrafanaAlertmanagerPayload checks that an Alertmanager-shaped
+// payload carrying Grafana markers is attributed to grafana and keeps its
+// dashboard URL as provider_url.
+func TestNormalizeGrafanaAlertmanagerPayload(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ raw := []byte(`{"receiver":"grafana","status":"firing","externalURL":"https://grafana.example","alerts":[{"startsAt":"2026-05-10T12:00:00Z","dashboardURL":"https://grafana.example/d/abc","labels":{"service":"checkout","env":"prod","severity":"critical","error_code":"PMT_502"},"annotations":{"summary":"PMT_502 spike","__dashboardUid__":"abc"}}]}`)
+ sig, err := Normalize(raw, now)
+ if err != nil {
+ t.Fatalf("normalize: %v", err)
+ }
+ if sig.Source != "grafana" {
+ t.Fatalf("source=%q want grafana", sig.Source)
+ }
+ if sig.Metadata["provider_url"] != "https://grafana.example/d/abc" {
+ t.Fatalf("provider_url not preserved: %+v", sig.Metadata)
+ }
+}
+
+// TestNormalizeRejectsUnsupported checks that a payload with no recognised
+// provider markers fails with an UNSUPPORTED_ALERT NormalizeError.
+func TestNormalizeRejectsUnsupported(t *testing.T) {
+ _, err := Normalize([]byte(`{"hello":"world"}`), time.Now())
+ if err == nil {
+ t.Fatal("expected error")
+ }
+ var normErr *NormalizeError
+ if !errors.As(err, &normErr) || normErr.Code != CodeUnsupportedAlert {
+ t.Fatalf("unexpected error: %v", err)
+ }
+}
+
+// TestHandlerStoresWaylogAlert posts a valid Waylog alert with a fixed clock
+// and checks the 201 response, that the signal reached the store, and that
+// the match result is "none" when no incidents exist.
+func TestHandlerStoresWaylogAlert(t *testing.T) {
+ store := &recordingSignalStore{}
+ h := NewHandler(store, incidentSource{}, traceResolver{}, 15*time.Minute)
+ h.now = func() time.Time { return time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) }
+
+ req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","severity":"critical","reason":"PMT_502 spike"}`))
+ rr := httptest.NewRecorder()
+ h.Alerts(rr, req)
+
+ if rr.Code != http.StatusCreated {
+ t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+ }
+ if store.inserted == nil || store.inserted.Type != signals.TypeAlert {
+ t.Fatalf("alert signal was not inserted: %+v", store.inserted)
+ }
+ var out struct {
+ Match MatchResult `json:"match"`
+ }
+ if err := json.Unmarshal(rr.Body.Bytes(), &out); err != nil {
+ t.Fatalf("decode: %v", err)
+ }
+ if out.Match.Matched || out.Match.Strategy != "none" {
+ t.Fatalf("unexpected match for no active incidents: %+v", out.Match)
+ }
+}
+
+// TestHandlerRejectsInvalidJSON checks that a truncated JSON body yields 400.
+func TestHandlerRejectsInvalidJSON(t *testing.T) {
+ h := NewHandler(&recordingSignalStore{}, incidentSource{}, traceResolver{}, 15*time.Minute)
+ req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{`))
+ rr := httptest.NewRecorder()
+ h.Alerts(rr, req)
+ if rr.Code != http.StatusBadRequest {
+ t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+ }
+}
+
+// TestHandlerUnavailableSignalStore checks that a valid alert posted to a
+// handler backed by signals.UnavailableStore yields 503.
+func TestHandlerUnavailableSignalStore(t *testing.T) {
+ h := NewHandler(signals.UnavailableStore{}, incidentSource{}, traceResolver{}, 15*time.Minute)
+ req := httptest.NewRequest(http.MethodPost, "/v1/alerts", strings.NewReader(`{"source":"waylog","alert_id":"alert_1","service":"checkout","env":"prod","reason":"PMT_502 spike"}`))
+ rr := httptest.NewRecorder()
+ h.Alerts(rr, req)
+ if rr.Code != http.StatusServiceUnavailable {
+ t.Fatalf("status=%d body=%s", rr.Code, rr.Body.String())
+ }
+}
+
+// incidentSource is an in-memory IncidentSource stub backed by a fixed slice.
+type incidentSource struct {
+ rows []incidents.Incident
+}
+
+// Active returns every stored row regardless of its status.
+func (s incidentSource) Active(context.Context) ([]incidents.Incident, error) {
+ return s.rows, nil
+}
+
+// Get returns the row with the given ID, or incidents.ErrNotFound.
+func (s incidentSource) Get(_ context.Context, id string) (incidents.Incident, error) {
+ for _, inc := range s.rows {
+ if inc.IncidentID == id {
+ return inc, nil
+ }
+ }
+ return incidents.Incident{}, incidents.ErrNotFound
+}
+
+// traceResolver is a TraceResolver stub that resolves every trace ID to the
+// same checkout / PMT_502 story.
+type traceResolver struct{}
+
+func (traceResolver) TraceStoryByTraceID(string) (apiv2.StoryResponse, bool) {
+ return apiv2.StoryResponse{Service: "checkout", Anchor: &apiv2.StoryAnchor{Step: "payment.charge", ErrorCode: "PMT_502"}}, true
+}
+
+// memSignalStore is a no-op signals.Store stub: every call succeeds and
+// nothing is retained.
+type memSignalStore struct{}
+
+func (*memSignalStore) Insert(context.Context, *signals.Signal) error { return nil }
+func (*memSignalStore) Query(context.Context, signals.Filter) ([]signals.Signal, error) {
+ return nil, nil
+}
+func (*memSignalStore) PruneOlderThan(context.Context, time.Time) (int, error) { return 0, nil }
+
+// recordingSignalStore is a signals.Store stub that remembers the most
+// recently inserted signal so tests can inspect it.
+type recordingSignalStore struct {
+	inserted *signals.Signal
+}
+
+// Insert stores a shallow copy of sig, so later changes to the caller's
+// struct fields do not alter the recorded value.
+func (s *recordingSignalStore) Insert(_ context.Context, sig *signals.Signal) error {
+	snapshot := *sig
+	s.inserted = &snapshot
+	return nil
+}
+
+// Query is a no-op that returns no signals.
+func (*recordingSignalStore) Query(context.Context, signals.Filter) ([]signals.Signal, error) {
+	return nil, nil
+}
+
+// PruneOlderThan is a no-op that reports zero pruned signals.
+func (*recordingSignalStore) PruneOlderThan(context.Context, time.Time) (int, error) {
+	return 0, nil
+}
diff --git a/internal/incidents/classifier.go b/internal/incidents/classifier.go
index 5f10c66..6d57e82 100644
--- a/internal/incidents/classifier.go
+++ b/internal/incidents/classifier.go
@@ -28,6 +28,7 @@ type Classification struct {
func Classify(input ClassificationInput) Classification {
evidence := collectTraceEvidence(input.Events)
+ evidence = append(evidence, matchingAlertEvidence(input)...)
warnings := instrumentationWarnings(input.Events, input.Signals)
if dep := matchingDependencySignal(input); dep != nil {
@@ -142,6 +143,32 @@ func matchingSignal(input ClassificationInput, typ signals.Type) *signals.Signal
return nil
}
+// matchingAlertEvidence returns one Evidence entry per alert signal that
+// belongs to the incident's service, matches its env (when the incident has
+// one), and is timestamped between 15 minutes before the incident start and
+// input.Now (or the incident's UpdatedAt when Now is zero).
+func matchingAlertEvidence(input ClassificationInput) []Evidence {
+	incident := input.Incident
+	windowStart := incident.StartedAt.Add(-15 * time.Minute)
+	windowEnd := input.Now
+	if windowEnd.IsZero() {
+		windowEnd = incident.UpdatedAt
+	}
+
+	alerts := []Evidence{}
+	for _, sig := range input.Signals {
+		isAlert := sig.Type == signals.TypeAlert
+		envOK := incident.Env == "" || sig.Env == incident.Env
+		serviceOK := sig.Service == incident.Service
+		inWindow := !sig.Timestamp.Before(windowStart) && !sig.Timestamp.After(windowEnd)
+		if isAlert && envOK && serviceOK && inWindow {
+			alerts = append(alerts, signalEvidence(sig, "External alert overlaps incident window"))
+		}
+	}
+	return alerts
+}
+
func collectTraceEvidence(events []*eventv2.Event) []Evidence {
out := make([]Evidence, 0, 2)
for _, ev := range events {
@@ -173,6 +200,17 @@ func deploymentEvidence(dep Deployment) Evidence {
}
func signalEvidence(sig signals.Signal, title string) Evidence {
+ fields := map[string]any{
+ "type": string(sig.Type),
+ "severity": string(sig.Severity),
+ "source": sig.Source,
+ }
+ if alertID := stringField(sig.Metadata, "alert_id"); alertID != "" {
+ fields["alert_id"] = alertID
+ }
+ if providerURL := stringField(sig.Metadata, "provider_url"); providerURL != "" {
+ fields["provider_url"] = providerURL
+ }
return Evidence{
Kind: EvidenceSignal,
Title: title,
@@ -180,11 +218,7 @@ func signalEvidence(sig signals.Signal, title string) Evidence {
Service: sig.Service,
SignalID: sig.SignalID,
OccurredAt: sig.Timestamp,
- Fields: map[string]any{
- "type": string(sig.Type),
- "severity": string(sig.Severity),
- "source": sig.Source,
- },
+ Fields: fields,
}
}
diff --git a/internal/incidents/classifier_test.go b/internal/incidents/classifier_test.go
index 5c5a16d..10ae22c 100644
--- a/internal/incidents/classifier_test.go
+++ b/internal/incidents/classifier_test.go
@@ -202,3 +202,51 @@ func TestNextChecksRuntime(t *testing.T) {
t.Fatalf("expected non-empty next checks for runtime cause")
}
}
+
+// TestClassifyIncludesAlertEvidenceWithoutChangingCause checks that alert
+// signals add evidence (with alert_id in the fields) without overriding the
+// dependency cause, and that an alert from another env is excluded.
+func TestClassifyIncludesAlertEvidenceWithoutChangingCause(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ base := Incident{Service: "checkout", Env: "prod", StartedAt: now, ErrorFamily: testFamily()}
+ paymentEvent := testIncidentEvent("e1", "trace-a", now, "checkout", "payment.charge", "PMT_502", "payment")
+
+ got := Classify(ClassificationInput{
+ Incident: base,
+ Events: []*eventv2.Event{paymentEvent},
+ Signals: []signals.Signal{{
+ SignalID: "sig_alert",
+ Type: signals.TypeAlert,
+ Source: "grafana",
+ Service: "checkout",
+ Env: "prod",
+ Severity: signals.SeverityCritical,
+ Reason: "PMT_502 spike",
+ Timestamp: now,
+ Metadata: map[string]any{"alert_id": "alert_1", "provider_url": "https://grafana/alert"},
+ }, {
+ SignalID: "sig_other_env",
+ Type: signals.TypeAlert,
+ Source: "grafana",
+ Service: "checkout",
+ Env: "staging",
+ Severity: signals.SeverityCritical,
+ Reason: "staging alert",
+ Timestamp: now,
+ Metadata: map[string]any{"alert_id": "alert_staging"},
+ }},
+ Now: now,
+ })
+ if got.Cause != CauseDependency {
+ t.Fatalf("alert should not override dependency cause: %+v", got)
+ }
+ for _, ev := range got.Evidence {
+ if ev.SignalID == "sig_other_env" {
+ t.Fatalf("alert evidence from another env should not be included: %+v", ev)
+ }
+ if ev.SignalID == "sig_alert" && ev.Title == "External alert overlaps incident window" {
+ if ev.Fields["alert_id"] != "alert_1" {
+ t.Fatalf("alert metadata missing: %+v", ev.Fields)
+ }
+ return
+ }
+ }
+ t.Fatalf("alert evidence missing: %+v", got.Evidence)
+}
diff --git a/internal/reports/reports.go b/internal/reports/reports.go
new file mode 100644
index 0000000..9ad67d7
--- /dev/null
+++ b/internal/reports/reports.go
@@ -0,0 +1,152 @@
+package reports
+
+import (
+ "encoding/json"
+ "fmt"
+ "strings"
+
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+// Report formats accepted by Render.
+const (
+ FormatMarkdown = "markdown"
+ FormatSlack = "slack"
+ FormatPagerDuty = "pagerduty"
+)
+
+// Rendered is a report rendered in one format. Body is a string for the
+// markdown and pagerduty formats and a map for the slack format.
+type Rendered struct {
+ Format string `json:"format"`
+ ContentType string `json:"content_type"`
+ Body any `json:"body"`
+}
+
+// Render renders rep in the requested format; an empty format means
+// markdown. It returns an error for a nil report or an unknown format.
+func Render(rep *pkgtriage.Report, format string) (Rendered, error) {
+	if rep == nil {
+		return Rendered{}, fmt.Errorf("report required")
+	}
+	switch format {
+	case "", FormatMarkdown:
+		return Rendered{Format: FormatMarkdown, ContentType: "text/markdown", Body: Markdown(rep)}, nil
+	case FormatSlack:
+		return Rendered{Format: FormatSlack, ContentType: "application/json", Body: Slack(rep)}, nil
+	case FormatPagerDuty:
+		return Rendered{Format: FormatPagerDuty, ContentType: "text/plain", Body: PagerDuty(rep)}, nil
+	}
+	return Rendered{}, fmt.Errorf("unsupported report format %q", format)
+}
+
+// Markdown renders rep as a Markdown document with a header followed by the
+// Blast Snapshot, Alert Evidence, Signals, Sample Traces and Next Checks
+// sections. Every bullet cites the incident ID and/or report hash it came
+// from, and empty sections or values are written as "not available".
+func Markdown(rep *pkgtriage.Report) string {
+	incident := nz(rep.IncidentRef.ID)
+	hash := nz(rep.ReportHash)
+
+	var b strings.Builder
+	b.WriteString("# Waylog Triage Report\n\n")
+	fmt.Fprintf(&b, "- Incident: `%s`\n", incident)
+	fmt.Fprintf(&b, "- Window: `%s`\n", nz(rep.IncidentRef.Window))
+	fmt.Fprintf(&b, "- Confidence: `%s`\n", nz(string(rep.Confidence)))
+	fmt.Fprintf(&b, "- Report hash: `%s`\n\n", hash)
+
+	blast := rep.BlastSnapshot
+	b.WriteString("## Blast Snapshot\n\n")
+	fmt.Fprintf(&b, "- Requests: %d (incident `%s`, report `%s`)\n", blast.Requests, incident, hash)
+	fmt.Fprintf(&b, "- Users: %d (incident `%s`, report `%s`)\n", blast.Users, incident, hash)
+	fmt.Fprintf(&b, "- Services: %d (incident `%s`, report `%s`)\n", blast.Services, incident, hash)
+	if len(blast.TopErrorFamilies) == 0 {
+		fmt.Fprintf(&b, "- Error family: not available (incident `%s`)\n", incident)
+	}
+	for _, family := range blast.TopErrorFamilies {
+		fmt.Fprintf(&b, "- Error family: `%s/%s/%s` count=%d (incident `%s`, report `%s`)\n", nz(family.Service), nz(family.Step), nz(family.ErrorCode), family.Count, incident, hash)
+	}
+
+	b.WriteString("\n## Alert Evidence\n\n")
+	if len(rep.Alerts) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", hash)
+	}
+	for _, alert := range rep.Alerts {
+		fmt.Fprintf(&b, "- `%s` from `%s`: %s (signal `%s`, alert `%s`, report `%s`)\n", nz(alert.Severity), nz(alert.Source), nz(alert.Reason), nz(alert.SignalID), nz(alert.AlertID), hash)
+	}
+
+	b.WriteString("\n## Signals\n\n")
+	if len(rep.Signals) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", hash)
+	}
+	for _, sig := range rep.Signals {
+		fmt.Fprintf(&b, "- `%s` signal `%s` evidence=%s (report `%s`)\n", nz(sig.Type), nz(sig.ID), strings.Join(sig.EvidenceIDs, ","), hash)
+	}
+
+	b.WriteString("\n## Sample Traces\n\n")
+	if len(rep.SampleTraces) == 0 {
+		fmt.Fprintf(&b, "- not available (incident `%s`)\n", incident)
+	}
+	for _, trace := range rep.SampleTraces {
+		fmt.Fprintf(&b, "- trace `%s`: %s (incident `%s`)\n", nz(trace.TraceID), nz(trace.Summary), incident)
+	}
+
+	b.WriteString("\n## Next Checks\n\n")
+	if len(rep.NextChecks) == 0 {
+		fmt.Fprintf(&b, "- not available (report `%s`)\n", hash)
+	}
+	for _, check := range rep.NextChecks {
+		fmt.Fprintf(&b, "- %s (check `%s`, report `%s`)\n", nz(check.Prompt), nz(check.ID), hash)
+	}
+	return b.String()
+}
+
+// Slack renders rep as a Slack Block Kit payload: a header, a fields section
+// (incident, confidence, report hash), and one section each for the first
+// alert and the first next check. Only the first alert is shown.
+func Slack(rep *pkgtriage.Report) map[string]any {
+	mrkdwn := func(text string) map[string]string {
+		return map[string]string{"type": "mrkdwn", "text": text}
+	}
+
+	alertText := "not available"
+	if len(rep.Alerts) > 0 {
+		first := rep.Alerts[0]
+		alertText = fmt.Sprintf("`%s` %s (signal `%s`, alert `%s`)", nz(first.Source), nz(first.Reason), nz(first.SignalID), nz(first.AlertID))
+	}
+
+	summary := []map[string]string{
+		mrkdwn("*Incident*\n`" + nz(rep.IncidentRef.ID) + "`"),
+		mrkdwn("*Confidence*\n`" + nz(string(rep.Confidence)) + "`"),
+		mrkdwn("*Report hash*\n`" + nz(rep.ReportHash) + "`"),
+	}
+	blocks := []map[string]any{
+		{"type": "header", "text": map[string]string{"type": "plain_text", "text": "Waylog triage report"}},
+		{"type": "section", "fields": summary},
+		{"type": "section", "text": mrkdwn("*Alert evidence*\n" + alertText)},
+		{"type": "section", "text": mrkdwn("*Next check*\n" + firstCheck(rep))},
+	}
+	return map[string]any{"blocks": blocks}
+}
+
+// PagerDuty renders rep as a single-line note carrying the incident ID,
+// confidence, report hash, the first alert (if any) and the first next
+// check.
+func PagerDuty(rep *pkgtriage.Report) string {
+	alertNote := "not available"
+	if len(rep.Alerts) > 0 {
+		first := rep.Alerts[0]
+		alertNote = fmt.Sprintf("%s alert %s via signal %s", nz(first.Source), nz(first.AlertID), nz(first.SignalID))
+	}
+	return fmt.Sprintf(
+		"Waylog triage: incident %s confidence=%s report_hash=%s alert=%s next_check=%s",
+		nz(rep.IncidentRef.ID),
+		nz(string(rep.Confidence)),
+		nz(rep.ReportHash),
+		alertNote,
+		firstCheck(rep),
+	)
+}
+
+// EncodeBody serializes a rendered body for output. String bodies are
+// returned as raw bytes, except for the slack format, which is always
+// encoded as indented JSON; every other body is encoded as indented JSON.
+func EncodeBody(r Rendered) ([]byte, error) {
+	if text, isText := r.Body.(string); isText && r.Format != FormatSlack {
+		return []byte(text), nil
+	}
+	return json.MarshalIndent(r.Body, "", " ")
+}
+
+// firstCheck describes the report's first next check, citing its ID and the
+// report hash, or says "not available" when the report has no checks.
+func firstCheck(rep *pkgtriage.Report) string {
+	hash := nz(rep.ReportHash)
+	if len(rep.NextChecks) == 0 {
+		return "not available (report `" + hash + "`)"
+	}
+	first := rep.NextChecks[0]
+	return nz(first.Prompt) + " (check `" + nz(first.ID) + "`, report `" + hash + "`)"
+}
+
+// nz returns s unchanged unless it is empty or whitespace-only, in which
+// case it returns the placeholder "not available".
+func nz(s string) string {
+	if strings.TrimSpace(s) != "" {
+		return s
+	}
+	return "not available"
+}
diff --git a/internal/reports/reports_test.go b/internal/reports/reports_test.go
new file mode 100644
index 0000000..00befb8
--- /dev/null
+++ b/internal/reports/reports_test.go
@@ -0,0 +1,68 @@
+package reports
+
+import (
+ "encoding/json"
+ "strings"
+ "testing"
+
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+func TestMarkdownReportCitesEvidence(t *testing.T) {
+ out := Markdown(testReport())
+ for _, want := range []string{"Requests: 12 (incident `inc_abc`, report `sha256:test`)", "trace_1", "sig_alert", "alert_1", "check_0"} {
+ if !strings.Contains(out, want) {
+ t.Fatalf("markdown missing %q:\n%s", want, out)
+ }
+ }
+}
+
+func TestSlackReportIsJSONAndCitesEvidence(t *testing.T) {
+ rendered, err := Render(testReport(), FormatSlack)
+ if err != nil {
+ t.Fatal(err)
+ }
+ raw, err := EncodeBody(rendered)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !json.Valid(raw) {
+ t.Fatalf("invalid json: %s", raw)
+ }
+ for _, want := range []string{"sig_alert", "alert_1", "sha256:test"} {
+ if !strings.Contains(string(raw), want) {
+ t.Fatalf("slack payload missing %q:\n%s", want, raw)
+ }
+ }
+}
+
+func TestPagerDutyReportCitesEvidence(t *testing.T) {
+ out := PagerDuty(testReport())
+ for _, want := range []string{"inc_abc", "sig_alert", "alert_1", "sha256:test"} {
+ if !strings.Contains(out, want) {
+ t.Fatalf("pagerduty missing %q:\n%s", want, out)
+ }
+ }
+}
+
+func testReport() *pkgtriage.Report {
+ return &pkgtriage.Report{
+ SchemaVersion: pkgtriage.SchemaVersionV1,
+ IncidentRef: pkgtriage.IncidentRef{ID: "inc_abc", Window: "15m"},
+ BlastSnapshot: pkgtriage.BlastSnapshot{
+ Requests: 12,
+ Users: 2,
+ Services: 3,
+ TopErrorFamilies: []pkgtriage.ErrorFamily{
+ {Service: "checkout", Step: "payment.charge", ErrorCode: "PMT_502", Count: 12},
+ },
+ },
+ SampleTraces: []pkgtriage.TraceSample{{TraceID: "trace_1", Summary: "checkout payment failure"}},
+ Signals: []pkgtriage.SignalRef{{ID: "sig_alert", Type: "alert", EvidenceIDs: []string{"sig_alert"}}},
+ Alerts: []pkgtriage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_alert"}}},
+ NextChecks: []pkgtriage.NextCheck{{ID: "check_0", Prompt: "Check payment health"}},
+ Confidence: pkgtriage.ConfidenceHigh,
+ GeneratedAt: "2026-05-10T12:00:00Z",
+ ReportHash: "sha256:test",
+ }
+}
diff --git a/internal/tools/report.go b/internal/tools/report.go
new file mode 100644
index 0000000..57206c7
--- /dev/null
+++ b/internal/tools/report.go
@@ -0,0 +1,69 @@
+package tools
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "time"
+
+ "github.com/sssmaran/WaylogCLI/internal/reports"
+ "github.com/sssmaran/WaylogCLI/internal/triage"
+)
+
+const renderTriageReportInputSchema = `{
+ "type": "object",
+ "required": ["incident_id"],
+ "properties": {
+ "incident_id": {"type": "string"},
+ "format": {"type": "string", "enum": ["markdown", "slack", "pagerduty"], "default": "markdown"},
+ "window": {"type": "string", "description": "Go duration string, default 15m"},
+ "snapshot": {"type": "boolean"}
+ }
+}`
+
+const renderTriageReportOutputSchema = `{
+ "type": "object",
+ "required": ["format", "content_type", "body"],
+ "properties": {
+ "format": {"type": "string"},
+ "content_type": {"type": "string"},
+ "body": {}
+ }
+}`
+
+// RegisterTriageReportTool registers the render_triage_report tool, which builds the
+// triage report for incident_id with engine and renders it in the requested format.
+func RegisterTriageReportTool(reg *Registry, engine *triage.Engine) error {
+	render := func(ctx context.Context, _ Store, params json.RawMessage) (any, error) {
+		var in struct {
+			IncidentID string `json:"incident_id"`
+			Format     string `json:"format"`
+			Window     string `json:"window"`
+			Snapshot   bool   `json:"snapshot"`
+		}
+		if err := json.Unmarshal(params, &in); err != nil {
+			return nil, fmt.Errorf("render_triage_report: bad params: %w", err)
+		}
+		if in.IncidentID == "" {
+			return nil, fmt.Errorf("render_triage_report: incident_id required")
+		}
+		opts, err := triage.ParseBuildOptions(in.Window, in.Snapshot, time.Now())
+		if err != nil {
+			return nil, err
+		}
+		rep, err := engine.Build(ctx, in.IncidentID, opts)
+		if err != nil {
+			return nil, err
+		}
+		return reports.Render(rep, in.Format)
+	}
+	return reg.Register(Tool{
+		Name:         "render_triage_report",
+		Description:  "Render a deterministic operator report from a TriageReport.",
+		Version:      "triage-report.v1",
+		InputSchema:  json.RawMessage(renderTriageReportInputSchema),
+		OutputSchema: json.RawMessage(renderTriageReportOutputSchema),
+		Examples:     []string{`{"incident_id":"inc_01HX...","format":"markdown","snapshot":true}`, `{"incident_id":"inc_01HX...","format":"slack"}`},
+		Handler:      render,
+	})
+}
diff --git a/internal/tools/report_test.go b/internal/tools/report_test.go
new file mode 100644
index 0000000..29df20f
--- /dev/null
+++ b/internal/tools/report_test.go
@@ -0,0 +1,29 @@
+package tools_test
+
+import (
+ "context"
+ "encoding/json"
+ "testing"
+
+ "github.com/sssmaran/WaylogCLI/internal/reports"
+ "github.com/sssmaran/WaylogCLI/internal/tools"
+)
+
+func TestRenderTriageReportToolReturnsRenderedReport(t *testing.T) {
+ reg := tools.NewRegistry()
+ eng := newStubEngine(t)
+ if err := tools.RegisterTriageReportTool(reg, eng); err != nil {
+ t.Fatalf("register: %v", err)
+ }
+ out, err := reg.Call(context.Background(), nil, "render_triage_report", json.RawMessage(`{"incident_id":"inc_abc","format":"markdown"}`))
+ if err != nil {
+ t.Fatalf("call: %v", err)
+ }
+ rendered, ok := out.(reports.Rendered)
+ if !ok {
+ t.Fatalf("got %T, want reports.Rendered", out)
+ }
+ if rendered.Format != reports.FormatMarkdown || rendered.ContentType != "text/markdown" {
+ t.Fatalf("unexpected rendered report: %+v", rendered)
+ }
+}
diff --git a/internal/triage/adapter.go b/internal/triage/adapter.go
index 8188bd9..1723880 100644
--- a/internal/triage/adapter.go
+++ b/internal/triage/adapter.go
@@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"strconv"
+ "time"
"github.com/sssmaran/WaylogCLI/internal/incidents"
"github.com/sssmaran/WaylogCLI/internal/signals"
@@ -208,12 +209,26 @@ func storySummary(s apiv2.StoryResponse, inc IncidentSummary) string {
return "first failure"
}
-type signalQueryAdapter struct{ s SignalStore }
+type signalQueryAdapter struct {
+ s SignalStore
+ alertMatchWindow time.Duration
+}
func NewSignalQueryAdapter(s SignalStore) SignalQuery {
return signalQueryAdapter{s: s}
}
+// NewAlertQueryAdapter builds an AlertQuery over s. The optional first
+// matchWindow value widens the alert search on both sides of the incident; it
+// defaults to 15 minutes when absent or non-positive and is capped at 24 hours.
+func NewAlertQueryAdapter(s SignalStore, matchWindow ...time.Duration) AlertQuery {
+	window := 15 * time.Minute
+	if len(matchWindow) > 0 && matchWindow[0] > 0 {
+		window = min(matchWindow[0], 24*time.Hour)
+	}
+	return signalQueryAdapter{s: s, alertMatchWindow: window}
+}
+
func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error) {
end := opts.Now
if end.IsZero() {
@@ -249,6 +264,59 @@ func (a signalQueryAdapter) SignalsFor(ctx context.Context, inc IncidentSummary,
return out, nil
}
+// AlertsFor maps alert-type signals for the incident's env/service, found within
+// alertMatchWindow of the incident span, to AlertRefs; an unavailable store yields none.
+func (a signalQueryAdapter) AlertsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]pkgtriage.AlertRef, error) {
+	end := opts.Now
+	if end.IsZero() {
+		end = inc.UpdatedAt
+	}
+	since := end.Add(-defaultWindow)
+	switch {
+	case !inc.StartedAt.IsZero():
+		since = inc.StartedAt.Add(-a.alertMatchWindow)
+	case opts.Window > 0:
+		since = end.Add(-opts.Window)
+	}
+	rows, err := a.s.Query(ctx, signals.Filter{
+		Env:     inc.Env,
+		Service: inc.Service,
+		Types:   []signals.Type{signals.TypeAlert},
+		Since:   since,
+		Until:   end.Add(a.alertMatchWindow),
+		Limit:   200,
+	})
+	if errors.Is(err, signals.ErrUnavailable) {
+		return nil, nil
+	}
+	if err != nil {
+		return nil, err
+	}
+	refs := make([]pkgtriage.AlertRef, len(rows))
+	for i, sig := range rows {
+		refs[i] = pkgtriage.AlertRef{
+			SignalID:    sig.SignalID,
+			AlertID:     stringField(sig.Metadata, "alert_id"),
+			Source:      sig.Source,
+			Severity:    string(sig.Severity),
+			Reason:      sig.Reason,
+			ProviderURL: stringField(sig.Metadata, "provider_url"),
+			EvidenceIDs: []string{sig.SignalID},
+		}
+	}
+	return refs, nil
+}
+
+// stringField returns m[key] when it holds a string; a missing key, a
+// non-string value, or an empty (including nil) map all yield "".
+func stringField(m map[string]any, key string) string {
+	text, isString := m[key].(string)
+	if !isString {
+		return ""
+	}
+	return text
+}
+
type nextChecksAdapter struct{}
// NewNextChecksAdapter returns a passthrough that converts the incident's
diff --git a/internal/triage/adapter_test.go b/internal/triage/adapter_test.go
index 2e678d6..8eeaf9d 100644
--- a/internal/triage/adapter_test.go
+++ b/internal/triage/adapter_test.go
@@ -417,6 +417,43 @@ func TestSignalQueryAdapter_UnavailableReturnsEmpty(t *testing.T) {
}
}
+func TestAlertQueryAdapter_UsesIncidentWindowPlusMatchWindow(t *testing.T) {
+ now := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC)
+ started := now.Add(-2 * time.Hour)
+ store := &fakeSignalStore{out: []signals.Signal{{
+ SignalID: "sig_alert",
+ Type: signals.TypeAlert,
+ Source: "grafana",
+ Service: "checkout",
+ Env: "demo",
+ Severity: signals.SeverityCritical,
+ Reason: "PMT_502 spike",
+ Timestamp: started.Add(-20 * time.Minute),
+ Metadata: map[string]any{"alert_id": "alert_1"},
+ }}}
+ a := triage.NewAlertQueryAdapter(store, 30*time.Minute)
+ got, err := a.AlertsFor(context.Background(), triage.IncidentSummary{
+ Service: "checkout",
+ Env: "demo",
+ StartedAt: started,
+ UpdatedAt: now,
+ }, triage.BuildOptions{Window: 15 * time.Minute, Now: now})
+ if err != nil {
+ t.Fatalf("AlertsFor: %v", err)
+ }
+ wantSince := started.Add(-30 * time.Minute)
+ if !store.got.Since.Equal(wantSince) {
+ t.Fatalf("filter.Since = %v, want %v", store.got.Since, wantSince)
+ }
+ wantUntil := now.Add(30 * time.Minute)
+ if !store.got.Until.Equal(wantUntil) {
+ t.Fatalf("filter.Until = %v, want %v", store.got.Until, wantUntil)
+ }
+ if len(got) != 1 || got[0].AlertID != "alert_1" {
+ t.Fatalf("alert refs wrong: %+v", got)
+ }
+}
+
// ----- NextChecksAdapter -----
func TestNextChecksAdapter_ConsumesIncidentNextChecks(t *testing.T) {
diff --git a/internal/triage/engine.go b/internal/triage/engine.go
index 07f5a04..3df3ad2 100644
--- a/internal/triage/engine.go
+++ b/internal/triage/engine.go
@@ -59,6 +59,10 @@ type SignalQuery interface {
SignalsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]SignalEvidence, error)
}
+type AlertQuery interface { // AlertQuery looks up the alert references the engine attaches to an incident's triage report.
+ AlertsFor(ctx context.Context, inc IncidentSummary, opts BuildOptions) ([]pkgtriage.AlertRef, error)
+}
+
type NextChecksProvider interface {
NextChecks(ctx context.Context, inc IncidentSummary) ([]NextCheckSpec, error)
}
@@ -68,6 +72,7 @@ type Deps struct {
Blast BlastQuery
Story StoryBuilder
Signals SignalQuery
+ Alerts AlertQuery
NextChecks NextChecksProvider
Now func() time.Time
}
@@ -107,6 +112,13 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
if err != nil {
return nil, fmt.Errorf("triage: signals: %w", err)
}
+ var alerts []pkgtriage.AlertRef
+ if e.deps.Alerts != nil {
+ alerts, err = e.deps.Alerts.AlertsFor(ctx, inc, opts)
+ if err != nil {
+ return nil, fmt.Errorf("triage: alerts: %w", err)
+ }
+ }
checks, err := e.deps.NextChecks.NextChecks(ctx, inc)
if err != nil {
return nil, fmt.Errorf("triage: next_checks: %w", err)
@@ -122,6 +134,7 @@ func (e *Engine) Build(ctx context.Context, incidentID string, opts BuildOptions
FirstFailure: story.Payload,
SampleTraces: story.SampleTraces,
Signals: sigs,
+ Alerts: alerts,
NextChecks: checks,
Confidence: inc.Confidence,
GeneratedAt: e.deps.Now().UTC().Format(time.RFC3339Nano),
diff --git a/internal/triagehttp/handler.go b/internal/triagehttp/handler.go
index 0adc3c2..fe1484f 100644
--- a/internal/triagehttp/handler.go
+++ b/internal/triagehttp/handler.go
@@ -7,6 +7,7 @@ import (
"strings"
"time"
+ "github.com/sssmaran/WaylogCLI/internal/reports"
"github.com/sssmaran/WaylogCLI/internal/triage"
)
@@ -19,12 +20,15 @@ func NewHandler(engine *triage.Engine) *Handler {
}
func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
+ if strings.HasSuffix(strings.TrimRight(r.URL.Path, "/"), "/report") {
+ h.Report(w, r)
+ return
+ }
if r.Method != http.MethodGet {
writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
return
}
- id := strings.TrimPrefix(r.URL.Path, "/v1/triage/")
- id = strings.Trim(id, "/")
+ id := incidentIDFromPath(r.URL.Path)
if id == "" {
writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
return
@@ -47,6 +51,55 @@ func (h *Handler) Triage(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, rep)
}
+func (h *Handler) Report(w http.ResponseWriter, r *http.Request) { // Report serves GET /v1/triage/{id}/report: builds the triage report and writes it in the "format" query encoding.
+	if r.Method != http.MethodGet {
+		writeError(w, http.StatusMethodNotAllowed, "method_not_allowed", "method not allowed", "")
+		return
+	}
+	incidentID := incidentIDFromPath(strings.TrimSuffix(strings.TrimRight(r.URL.Path, "/"), "/report"))
+	if incidentID == "" {
+		writeError(w, http.StatusBadRequest, "missing_incident_id", "incident_id required in path", "")
+		return
+	}
+	query := r.URL.Query()
+	opts, err := triage.ParseBuildOptions(query.Get("window"), query.Get("snapshot") == "true", time.Now())
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_options", err.Error(), "")
+		return
+	}
+	rep, err := h.engine.Build(r.Context(), incidentID, opts)
+	switch {
+	case errors.Is(err, triage.ErrUnknownIncident):
+		writeError(w, http.StatusNotFound, "not_found", "incident not found", "")
+		return
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, "triage_build_failed", err.Error(), "")
+		return
+	}
+	rendered, err := reports.Render(rep, query.Get("format"))
+	if err != nil {
+		writeError(w, http.StatusBadRequest, "bad_format", err.Error(), "")
+		return
+	}
+	payload, err := reports.EncodeBody(rendered)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, "render_failed", err.Error(), "")
+		return
+	}
+	w.Header().Set("Content-Type", rendered.ContentType)
+	w.WriteHeader(http.StatusOK)
+	_, _ = w.Write(payload)
+}
+
+// incidentIDFromPath returns the single path segment after "/v1/triage/", or "" when it is missing or contains "/".
+func incidentIDFromPath(path string) string {
+	id := strings.Trim(strings.TrimPrefix(path, "/v1/triage/"), "/")
+	if strings.ContainsRune(id, '/') {
+		return ""
+	}
+	return id
+}
+
func writeJSON(w http.ResponseWriter, status int, v any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
diff --git a/internal/triagehttp/handler_test.go b/internal/triagehttp/handler_test.go
index fba414e..a274813 100644
--- a/internal/triagehttp/handler_test.go
+++ b/internal/triagehttp/handler_test.go
@@ -5,6 +5,7 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
+ "strings"
"testing"
"time"
@@ -82,6 +83,22 @@ func TestTriageHandlerUnknownIncidentIsNotFound(t *testing.T) {
}
}
+func TestTriageReportHandlerRendersMarkdown(t *testing.T) {
+ eng := newTriageEngineForHandler(t)
+ h := triagehttp.NewHandler(eng)
+
+ req := httptest.NewRequest(http.MethodGet, "/v1/triage/inc_abc/report?format=markdown", nil)
+ rr := httptest.NewRecorder()
+ h.Triage(rr, req)
+
+ if rr.Code != http.StatusOK {
+ t.Fatalf("status = %d body=%s", rr.Code, rr.Body.String())
+ }
+ if !strings.Contains(rr.Body.String(), "Waylog Triage Report") || !strings.Contains(rr.Body.String(), "inc_abc") {
+ t.Fatalf("unexpected report:\n%s", rr.Body.String())
+ }
+}
+
// helper: stub engine
func newTriageEngineForHandler(t *testing.T) *triage.Engine {
return newTriageEngineForHandlerWithIncidents(t, handlerStubIncidents{})
diff --git a/pkg/triage/report.go b/pkg/triage/report.go
index 388933d..c1920b6 100644
--- a/pkg/triage/report.go
+++ b/pkg/triage/report.go
@@ -23,6 +23,7 @@ type Report struct {
FirstFailure json.RawMessage `json:"first_failure,omitempty"`
SampleTraces []TraceSample `json:"sample_traces,omitempty"`
Signals []SignalRef `json:"signals,omitempty"`
+ Alerts []AlertRef `json:"alerts,omitempty"`
NextChecks []NextCheck `json:"next_checks,omitempty"`
Confidence Confidence `json:"confidence"`
GeneratedAt string `json:"generated_at"`
@@ -60,6 +61,16 @@ type SignalRef struct {
EvidenceIDs []string `json:"evidence_ids"`
}
+type AlertRef struct { // AlertRef is one entry of Report.Alerts: an alert cited as evidence in a triage report.
+ SignalID string `json:"signal_id"` // ID of the signal record the alert was read from
+ AlertID string `json:"alert_id,omitempty"` // presumably the provider's own alert identifier; omitted from JSON when empty
+ Source string `json:"source"`
+ Severity string `json:"severity"`
+ Reason string `json:"reason"`
+ ProviderURL string `json:"provider_url,omitempty"` // presumably a link to the alert in its provider; omitted from JSON when empty
+ EvidenceIDs []string `json:"evidence_ids"` // IDs of the evidence backing this reference
+}
+
type NextCheck struct {
ID string `json:"id"`
Prompt string `json:"prompt"`
diff --git a/pkg/triage/report_test.go b/pkg/triage/report_test.go
index f4f575f..db8bd4d 100644
--- a/pkg/triage/report_test.go
+++ b/pkg/triage/report_test.go
@@ -18,6 +18,7 @@ func TestReportJSONRoundTrip(t *testing.T) {
{Service: "payment", Step: "payment.charge", ErrorCode: "PMT_502", Count: 11},
},
},
+ Alerts: []triage.AlertRef{{SignalID: "sig_2", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_2"}}},
Signals: []triage.SignalRef{{ID: "sig_1", Type: "deploy", EvidenceIDs: []string{"e1"}}},
NextChecks: []triage.NextCheck{{ID: "check_1", Prompt: "verify x"}},
Confidence: triage.ConfidenceMedium,
@@ -41,6 +42,9 @@ func TestReportJSONRoundTrip(t *testing.T) {
if out.Confidence != triage.ConfidenceMedium {
t.Fatalf("confidence mismatch: got %q", out.Confidence)
}
+ if len(out.Alerts) != 1 || out.Alerts[0].AlertID != "alert_1" {
+ t.Fatalf("alerts round-trip lost data: %+v", out.Alerts)
+ }
}
func TestReportValidate(t *testing.T) {
@@ -116,6 +120,24 @@ func TestCanonicalHashChangesWhenContentChanges(t *testing.T) {
}
}
+func TestCanonicalHashChangesWhenAlertEvidenceChanges(t *testing.T) {
+ base := triage.Report{
+ SchemaVersion: "triage.v1",
+ IncidentRef: triage.IncidentRef{ID: "inc_1"},
+ Confidence: triage.ConfidenceMedium,
+ GeneratedAt: "t",
+ ReportHash: "h",
+ }
+ h1, _ := base.CanonicalHash()
+
+ withAlert := base
+ withAlert.Alerts = []triage.AlertRef{{SignalID: "sig_alert", AlertID: "alert_1", Source: "grafana", Severity: "critical", Reason: "PMT_502 spike", EvidenceIDs: []string{"sig_alert"}}}
+ h2, _ := withAlert.CanonicalHash()
+ if h1 == h2 {
+ t.Fatalf("hash must change when alert evidence changes")
+ }
+}
+
func TestCanonicalHashFormat(t *testing.T) {
r := triage.Report{
SchemaVersion: "triage.v1",
diff --git a/scripts/demo-acceptance.sh b/scripts/demo-acceptance.sh
index b40edb1..9e78dcc 100755
--- a/scripts/demo-acceptance.sh
+++ b/scripts/demo-acceptance.sh
@@ -4,6 +4,7 @@ set -euo pipefail
GATEWAY_URL="${GATEWAY_URL:-http://localhost:9081}"
INGEST_URL="${INGEST_URL:-http://localhost:8080}"
WAYLOG_READ_KEY="${WAYLOG_READ_KEY:-demo}"
+WAYLOG_WRITE_KEY="${WAYLOG_WRITE_KEY:-demo}"
REQUESTS="${REQUESTS:-20}"
CONCURRENCY="${CONCURRENCY:-5}"
TIMEOUT="${WAYLOG_CLI_TIMEOUT:-5s}"
@@ -73,6 +74,19 @@ CLI=("$CLI_BIN" --addr "$INGEST_URL" --api-key "$WAYLOG_READ_KEY" --timeout "$TI
"${CLI[@]}" --json capabilities >/dev/null || fail "waylog capabilities failed"
echo "PASS: waylog capabilities"
+alert_id="alert_demo_pmt_502"
+alert_timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+alert_body="{\"source\":\"waylog\",\"alert_id\":\"${alert_id}\",\"service\":\"checkout\",\"env\":\"demo\",\"severity\":\"critical\",\"reason\":\"PMT_502 spike\",\"message\":\"demo alert for checkout payment failures\",\"error_code\":\"PMT_502\",\"timestamp\":\"${alert_timestamp}\"}"
+alert_status="$(curl -s -o /tmp/waylog-demo-alert.json -w "%{http_code}" \
+ -X POST "${INGEST_URL}/v1/alerts" \
+ -H "Authorization: Bearer ${WAYLOG_WRITE_KEY}" \
+ -H 'Content-Type: application/json' \
+ --data "$alert_body" || echo "000")"
+[[ "$alert_status" == "201" ]] || fail "alert webhook failed: HTTP $alert_status"
+grep -q '"signal_id"' /tmp/waylog-demo-alert.json || fail "alert webhook response did not include a signal"
+grep -q '"matched"' /tmp/waylog-demo-alert.json || fail "alert webhook response did not include match state"
+echo "PASS: alert webhook accepted"
+
burst_body="{\"requests\":${REQUESTS},\"concurrency\":${CONCURRENCY}}"
burst_status="$(curl -s -o /tmp/waylog-demo-burst.json -w "%{http_code}" \
-X POST "${GATEWAY_URL}/demo/burst" \
@@ -146,4 +160,12 @@ hash_b="$(json_triage_report_hash <<<"$triage_b")"
[[ "$hash_a" == "$hash_b" ]] || fail "triage report_hash unstable across runs: A=$hash_a B=$hash_b"
echo "PASS: waylog triage stable report_hash=$hash_a"
+report_status="$(curl -s -o /tmp/waylog-demo-triage-report.md -w "%{http_code}" \
+ -H "Authorization: Bearer ${WAYLOG_READ_KEY}" \
+ "${INGEST_URL}/v1/triage/${incident_id}/report?format=markdown&snapshot=true" || echo "000")"
+[[ "$report_status" == "200" ]] || fail "triage markdown report failed: HTTP $report_status"
+grep -q "$hash_a" /tmp/waylog-demo-triage-report.md || fail "triage markdown report did not cite report_hash"
+grep -q "$alert_id" /tmp/waylog-demo-triage-report.md || fail "triage markdown report did not cite alert evidence"
+echo "PASS: triage markdown report cites alert evidence"
+
echo "Demo acceptance passed."
From ef1c0fb7bb69a3a1b4ccf3262299b4581f5c025e Mon Sep 17 00:00:00 2001
From: skota-hash
Date: Tue, 12 May 2026 19:01:03 -0400
Subject: [PATCH 12/14] feat: added proof-loop and RCA scorecard acceptance
gates Reproducible end-to-end harnesses (alert -> burst -> errors ->
incidents -> triage) for v2.1 incident triage, plus the demo-acceptance
JSON helpers, microdemo burst hooks, incident-store tests, auth config
tightening, and README/docs reframing they depend on.
---
.gitignore | 1 +
Makefile | 10 +-
README.md | 482 ++++++++++++++--------
cmd/ingest/main.go | 49 ++-
docs/env.md | 13 +-
docs/openapi.yaml | 5 +-
examples/cmd/api-gateway/main.go | 10 +-
examples/microdemo/gateway.go | 14 +
examples/microdemo/proof.go | 431 +++++++++++++++++++
examples/microdemo/proof_test.go | 88 ++++
examples/microdemo/ui.html | 278 ++++++++++++-
examples/microdemo/ui_test.go | 26 ++
internal/auth/config.go | 40 +-
internal/auth/config_test.go | 56 +++
internal/dashboard/static/index.html | 26 +-
internal/dashboard/static_test.go | 6 +
internal/incidents/store_test.go | 84 ++++
internal/ingest/handler.go | 4 +
internal/llm/openai.go | 2 +-
internal/llm/openai_test.go | 10 +
internal/reports/reports.go | 69 ++--
internal/reports/reports_test.go | 15 +-
internal/triagehttp/handler_test.go | 2 +-
scripts/demo-acceptance-json/main.go | 185 ++++++++-
scripts/demo-acceptance-json/main_test.go | 45 ++
scripts/demo.sh | 8 +-
scripts/proof-loop.sh | 162 ++++++++
scripts/rca-scorecard.sh | 173 ++++++++
28 files changed, 2046 insertions(+), 248 deletions(-)
create mode 100644 examples/microdemo/proof.go
create mode 100644 examples/microdemo/proof_test.go
create mode 100644 internal/incidents/store_test.go
create mode 100644 scripts/proof-loop.sh
create mode 100644 scripts/rca-scorecard.sh
diff --git a/.gitignore b/.gitignore
index 0dfc197..a8cb890 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ profile.cov
/*.log
/data/
+data/demo-state/
*.md
# Local runtime/build artifacts
diff --git a/Makefile b/Makefile
index e9f21ee..46f0d90 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
SHELL := /bin/sh
-.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
+.PHONY: help build build-examples ingest ingest-mcp waylog waylog-live checkout test test-race test-sdk lint ci fmt vet vet-sdk clean kafka-up kafka-down demo demo-stop demo-acceptance proof-loop rca-scorecard rollup-comparison otlp-conformance demo-up demo-down micro-demo micro-demo-stop docker-build docker-up docker-down docker-reset docker-dev docker-prod ts-install ts-build ts-test bench-gate
help:
@echo "Targets:"
@@ -19,6 +19,8 @@ help:
@echo " demo - start dashboard demo locally (detached, no Docker)"
@echo " demo-stop - stop demo processes"
@echo " demo-acceptance - verify a running local demo end-to-end"
+ @echo " proof-loop - run alert -> incident -> triage -> report -> rollup proof"
+ @echo " rca-scorecard - run deterministic RCA scorecard over the demo scenario"
@echo " rollup-comparison - run demo proof for root-cause vs naive rollup counts"
@echo " otlp-conformance - run deterministic OTLP HTTP/gRPC fixture checks"
@echo " demo-up - start v2 demo stack in Docker (detached)"
@@ -125,6 +127,12 @@ demo-stop:
demo-acceptance:
./scripts/demo-acceptance.sh
+proof-loop:
+ bash ./scripts/proof-loop.sh
+
+rca-scorecard:
+ bash ./scripts/rca-scorecard.sh
+
rollup-comparison:
./scripts/rollup-comparison.sh
diff --git a/README.md b/README.md
index 943ae6b..7f6d92f 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,47 @@
-
-
+
+

-
- Structured logging that explains failed requests and active incidents.
- Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC traces. Agent-native by design.
-
+
+ Production incident triage you can hash, cite, and hand to an agent.
+ Drop-in SDKs (Go, TypeScript) or OTLP HTTP/gRPC. Deterministic. Single Go binary.
+
-
- polyglot SDKs · agent-native API · failure tree · rollup-correct root cause
-
+
+ schema 2.0 WideEvents · signal-correlated incidents · cited operator reports · rollup-correct root cause · agent-native
+
-
- Public alpha — request triage plus signal-driven incident triage for backend systems.
-
+
+ alpha · go 1.24+ · single-node · OTLP-compatible · MCP-native · LLM-optional
+
+
+
+> **Public alpha for single-node production-style incident triage.** APIs may break before 1.0.
+
+---
+
+## Try it in 60 seconds
+
+```bash
+git clone https://github.com/sssmaran/WaylogCLI
+cd WaylogCLI
+make demo
+```
+
+Then open the demo controls at <http://localhost:9081> and click **Run proof loop**.
+
+You'll see the full v2.1 flow run end-to-end: an external alert is accepted, a payment-failure burst spikes an error family, the spike detector opens an incident, the cause is classified as `dependency`, a triage report is built across four surfaces (CLI, read endpoint, direct tool, plan template) and verified byte-stable, and Markdown / Slack / PagerDuty operator reports are rendered with citations to every alert, signal, and trace.
+
+Stop with `make demo-stop`. No Docker. No Kafka. No bridge process. SQLite + a single Go binary.
---
-## What Waylog does
+## What Waylog is
-A request hits your API gateway, fans out to three services, and one of them fails. The gateway returns 502. Your logs say "upstream error." Waylog tells you exactly what happened in the request, then groups repeated failures into an incident with signal-backed cause evidence:
+Waylog turns failed requests and external alerts into **incidents with deterministic, cited triage reports** that humans and agents can both consume. Three things make it different:
+
+- **Deterministic.** Every triage report has a `report_hash` that is byte-stable across CLI, REST read, direct tool call, and plan template — within a single engine tick. No LLM required to get an answer; LLMs are an optional UX layer over the same bytes.
+- **Agent-native.** The same tool registry powers the CLI, MCP stdio, REST `/v1/tools/*`, and `/v1/plans/execute`. Agents read the exact bytes the human read. Built-in triage plan template, idempotency keys, structured envelopes.
+- **Drop-in.** Go SDK (`net/http`, chi, gin, echo), TypeScript SDK (Express, Hono, Next.js, NestJS), or OTLP HTTP / gRPC. Use what you already have. Single binary + embedded SQLite. No Docker required.
```text
trace 7f3a2b9c… flow=purchase user=standard region=us-east-1
@@ -28,56 +51,44 @@ A request hits your API gateway, fans out to three services, and one of them fai
└─ db 200 — 3 ms
└─ payment 502 PMT_502 5 ms ← first failure
- blast radius: 12 requests · 8 users · 4 services
+ blast radius: 12 requests · 8 users · 4 services
+ incident: inc_a43c189fc63eff31 (cause=dependency · confidence=high)
+ report_hash: sha256:1ed7c21b… (stable across cli / read / tool / plan)
```
-This is not log search, metrics storage, or incident management. Waylog builds request-triage views from WideEvents, accepts production-context signals such as deploys and dependency health, and returns deterministic answers for "why did this trace fail?", "what incident is active?", and "who is affected by `PMT_502`?". Root-cause rollups count the originating failure once, not once per propagated hop.
-
-Run `make demo` and see it yourself.
-
-## Quick start
+Root-cause rollups count the originating failure **once**, not once per propagated hop. The same hash answers "what failed" identically whether the question came from a terminal, a webhook, or an LLM tool call.
-```bash
-make demo
-```
-
-This starts the ingest server plus four real Go demo services wired through the schema-2.0 Go SDK (`api-gateway → checkout → db/payment`), enables `WAYLOG_V2_READS=true`, stores demo signals/incidents in local SQLite, and does not require Docker, Kafka, or the bridge process.
+---
-Once the stack is up:
+## The incident loop
-1. Open demo controls at , or open the dashboard at . The local demo disables dashboard login.
-2. Click **Run traffic burst** to post demo deploy/dependency signals and fire a production-like mix through the checkout chain. For a focused single-trace look, click **Run payment outage** instead, or run:
- ```bash
- curl -s -X POST http://localhost:9081/purchase \
- -H 'Content-Type: application/json' \
- --data '{"sku":"X1","scenario":"payment_502"}'
- ```
-3. Investigate with the v2 CLI:
- ```bash
- ./waylog incidents
- ./waylog incident --snapshot
- ./waylog errors --window 15m
- ./waylog explain
- ./waylog blast --service checkout --step payment.charge --code PMT_502 --window 15m
- ./waylog blast --code PMT_502 --window 15m
- ./waylog triage
- ```
+The v2.1 product in one paragraph: services emit WideEvents (or push OTel spans). External systems post production-context signals (deploys, dependency health, runtime events) and alerts (Alertmanager, Grafana, PagerDuty webhooks, or Waylog-native JSON). When an error family spikes, Waylog opens an **incident**, correlates the signals and alerts that overlap its window, classifies the cause deterministically (`deploy | app | dependency | runtime | unknown`), and exposes a cited **TriageReport** to humans (CLI, dashboard) and agents (REST, plan template, MCP) — same bytes, same hash.
-The traffic burst posts fresh demo deploy/dependency signals on each run so the incident panel has evidence to attach. The demo also supports `happy` and `suppressed_payment_502` scenarios through the UI or `POST /purchase`.
+```bash
+# 1. Drive a real failure through the demo stack
+curl -X POST http://localhost:9081/purchase \
+ -H 'Content-Type: application/json' \
+ -d '{"sku":"X1","scenario":"payment_502"}'
-Stop with `make demo-stop`.
+# 2. List active incidents
+waylog incidents
-Prefer Docker? Use `make docker-dev` / `make docker-down`. Prefer foreground service logs while hacking on Go code? Use `make micro-demo` and stop with `make micro-demo-stop`.
+# 3. Get the cited triage report — and verify hash agreement across surfaces
+waylog triage inc_a43c189fc63eff31 --snapshot
+curl -H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
+ -d '{"incident_id":"inc_a43c189fc63eff31","snapshot":true}' \
+ http://localhost:8080/v1/tools/triage_incident
+# 4. Render an operator report (Markdown / Slack Block Kit / PagerDuty note)
+curl -H "Authorization: Bearer $WAYLOG_READ_KEY" \
+ "http://localhost:8080/v1/triage/inc_a43c189fc63eff31/report?format=slack"
+```
-## How it works
+> **Alerts correlate; they do not create incidents.** Incidents are opened by the spike detector. Alerts and signals attach as evidence and shape the deterministic cause classification.
-1. **Capture** — services emit [WideEvents](docs/waylog-sdk-contract.md) via the Go or TypeScript SDK, or push OpenTelemetry spans to `/v1/otlp/v1/traces`. Every event is durably logged (WAL + fsync) before it enters the derived read models.
-2. **Signal** — deploy systems, dependency monitors, or operators post small production-context facts to `/v1/signals`.
-3. **Triage** — the ingest server projects request views (`recent`, `errors`, `explain`, `blast`) and opens incidents when error families spike against overlapping signals.
-4. **Operator** — CLI, REST, MCP, TUI, and the embedded dashboard query the same derived views. Primary incident surfaces are `waylog incidents`, `waylog incident `, `/v1/incidents/*`, and the dashboard incident cards.
+---
-## Get traces in
+## Capture: send Waylog your traces
All three paths feed the same schema-2.0 ingest and read APIs. Pick whichever matches your stack.
@@ -105,7 +116,7 @@ app.post("/buy", (req, res) => {
});
```
-`@waylog/sdk` is ESM-only, Node 18+, and ships standalone core APIs plus Express, Hono, Next.js, and NestJS entrypoints (`@waylog/sdk/express`, `@waylog/sdk/hono`, `@waylog/sdk/next`, `@waylog/sdk/nest`).
+ESM-only, Node 18+. Standalone core plus framework entrypoints: `@waylog/sdk/express`, `@waylog/sdk/hono`, `@waylog/sdk/next`, `@waylog/sdk/nest`. Full examples in [`docs/sdk-examples.md`](docs/sdk-examples.md).
### Go SDK
@@ -131,208 +142,313 @@ func main() {
}
```
-The recommended SDK path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers. Low-level request APIs such as `Begin`, `Finalize`, and `setField` are for adapter authors, tests, and unusual custom integrations. Full copy-paste examples for `net/http`, chi, gin, echo, standalone TypeScript, Express, Hono, Next.js, and NestJS are in [`docs/sdk-examples.md`](docs/sdk-examples.md).
+Middleware adapters for `net/http`, chi, gin, and echo are in [`docs/sdk-examples.md`](docs/sdk-examples.md). The recommended path is framework middleware plus `waylog.From(ctx)` / `useLogger(...)` inside handlers — low-level `Begin` / `Finalize` / `setField` APIs are for adapter authors.
+
+### OTLP / OpenTelemetry
+
+Point your existing OTel collector at Waylog. Both protocols, same conversion path, same downstream views.
-### OTLP traces
+```yaml
+exporters:
+ otlphttp/waylog:
+ endpoint: http://localhost:8080/v1/otlp/v1/traces
+ headers:
+ authorization: "Bearer ${WAYLOG_WRITE_KEY}"
+ otlp/waylog:
+ endpoint: localhost:4317
+ headers:
+ authorization: "Bearer ${WAYLOG_WRITE_KEY}"
+```
+
+Sample collector config: [`examples/otel-collector/`](examples/otel-collector/). Only traces are accepted; OTLP logs and metrics are not shipping. Bind `OTLP_GRPC_ADDR=127.0.0.1:4317` for single-host installs that don't need cross-host collectors.
-Point your existing OpenTelemetry collector at `http://localhost:8080/v1/otlp/v1/traces` for OTLP/HTTP or `localhost:4317` for OTLP/gRPC. Protobuf trace exports convert to schema-2.0 WideEvents on the way in, then show up in the same errors, explain, blast, and recent-trace APIs as SDK events when `WAYLOG_V2_READS=true`. A collector config lives in [`examples/otel-collector/`](examples/otel-collector/). **Only traces are supported.** OTLP logs and metrics are not shipping yet.
+**Auth.** Both endpoints require `WAYLOG_WRITE_KEY` when `WAYLOG_PROFILE=prod`; the server refuses to boot with unauthenticated OTLP in prod. `make demo` runs unauthenticated by design.
-### Alternative: local ingest server (no Docker)
+### Local ingest only (no demo services)
```bash
make ingest
```
-Runs only the ingest server. Point your own services at it via an SDK or OTLP. Full env-var reference: [`docs/env.md`](docs/env.md).
+Runs only the ingest server. Point your own services at it via SDK or OTLP. Full env reference: [`docs/env.md`](docs/env.md).
-## What you can ask
+---
+
+## Operate: CLI, dashboard, agents
### CLI
```bash
-WAYLOG_V2_READS=true ./ingest
-
-waylog capabilities
-waylog recent --limit 5
-waylog errors --window 15m
-waylog blast checkout:payment.charge:PMT_502 --window 15m
-waylog explain trace_01HX...
-waylog trace trace_01HX...
-waylog event event_01HX...
-waylog search PMT_502 --window 1h
+./ingest # v2 reads are on by default
+
+waylog capabilities # diagnose server config / profile / feature flags
+waylog incidents # active incidents, deterministic order
+waylog incident <incident_id> --snapshot # full detail with frozen sample traces
+waylog triage <incident_id> # cited triage report
+waylog errors --window 15m # top error families
+waylog explain <trace_id> # first observable failing step
+waylog blast checkout:payment.charge:PMT_502 --window 15m
+waylog recent --limit 5
+waylog event <event_id>
+waylog trace <trace_id>
+waylog search PMT_502 --window 1h
```
-The `waylog` binary is now the v2 operator CLI over the running ingest server's read APIs. Most verbs require the server to advertise `v2_reads.enabled=true` from `/v1/capabilities`; `waylog capabilities` is intentionally ungated so it can diagnose server setup. The CLI uses `INGEST_ADDR`, `WAYLOG_READ_KEY`, and `WAYLOG_CLI_TIMEOUT` by default. Add `--json` to any verb for machine-readable output.
+`waylog capabilities` is intentionally ungated so it can diagnose server setup; other verbs require `v2_reads.enabled=true` (the default). Defaults: `INGEST_ADDR`, `WAYLOG_READ_KEY`, `WAYLOG_CLI_TIMEOUT`. Add `--json` to any verb for machine-readable output.
+
+### Dashboard
-### REST (direct tool call)
+Embedded Geist UI at http://localhost:8080/ui. Uses the dashboard session cookie for read-scope auth and runs against the default `WAYLOG_V2_READS=true` reader.
+
+- `#/errors` — top error families over `/v1/errors`
+- `#/incident/` — incident evidence and next checks over `/v1/incidents/{id}`
+- `#/explain/` — first observable failing step over `/v1/traces/story`
+- `#/blast/` — impact panel over `/v1/blast_radius`
+- recent-request stream from `/v1/traces/recent`, polled every 5 s
+
+No Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts. Just the triage loop.
+
+### Agent surface
+
+Twelve deterministic tools, exposed identically through CLI, REST `/v1/tools/{name}`, MCP stdio, and plan execution. Same idempotency keys, same structured envelopes, same bytes.
+
+| Tool | Answers |
+| ---------------------- | -------------------------------------------------------------------------------------------- |
+| `triage_incident` | Structured TriageReport for an open incident (blast + first failure + signals + next checks) |
+| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note from a TriageReport |
+| `explain_request` | Why did this specific trace fail? |
+| `trace_summary` | Span tree and timing for a trace |
+| `graph_failures` | Which requests are currently failing? |
+| `failure_patterns` | What error codes dominate this window? |
+| `blast_radius` | How many requests, users, and services does this error touch? |
+| `failure_chain` | How did this failure propagate through services? |
+| `graph_query` | DSL query over the graph (`expr` + `window`) |
+| `compare_windows` | Diff error rates between two windows |
+| `graph_insights` | Windowed rollup of top errors and patterns |
+| `graph_stats` | Overall shape of the graph right now |
```bash
+# Direct tool call
curl -X POST http://localhost:8080/v1/tools/blast_radius \
- -H 'Content-Type: application/json' \
-H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
-d '{"error_code":"PMT_502","window":"10m","include_services":true}'
-```
-### REST (multi-step plan)
-
-```bash
+# Built-in triage plan template — same hash as the CLI/read/tool surfaces
curl -X POST http://localhost:8080/v1/plans/execute \
- -H 'Content-Type: application/json' \
-H "Authorization: Bearer $WAYLOG_AGENT_KEY" \
- -d '{
- "steps": [
- {"id":"patterns", "tool":"failure_patterns", "params":{"window":"10m"}},
- {"id":"blast", "tool":"blast_radius", "params":{"error_code":"PMT_502","window":"10m"}}
- ]
- }'
+ -d '{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}'
```
-Plans execute deterministically server-side with SSE progress on `/v1/stream/plans/{id}`.
+Plans execute deterministically server-side with SSE progress on `/v1/stream/plans/{id}`. Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
-### Trace story
+### MCP
```bash
-curl "http://localhost:8080/v1/traces/story?trace_id=$TRACE" \
- -H "Authorization: Bearer $WAYLOG_READ_KEY"
+make ingest-mcp # MCP_STDIO=1
```
-Returns the first failing step, contributing path, logs, downstream calls, and linkage mode used by the dashboard and `waylog explain`.
+Same registry, same idempotency keys. Plugs into Claude, Cursor, and other MCP clients.
-### MCP (agent surface)
+### External alerts
```bash
-make ingest-mcp # MCP_STDIO=1
+curl -X POST http://localhost:8080/v1/alerts \
+ -H "Authorization: Bearer $WAYLOG_WRITE_KEY" \
+ -d @grafana-webhook.json
```
-Exposes the same tool registry over MCP stdio for Claude, Cursor, and other MCP clients. Same semantics as the REST API.
+`POST /v1/alerts` accepts Waylog-normalized JSON plus Alertmanager, Grafana, and PagerDuty webhook payloads. Accepted alerts are stored as `type=alert` signals and **correlated** with active incidents when possible — alerts do not create incidents. The matching alert evidence then appears in cited Markdown / Slack / PagerDuty triage reports.
-### Analysis tools
+---
-All twelve tools are deterministic, idempotent, and available via CLI, REST `/v1/tools/{name}`, MCP, and plan execution.
+## Architecture
-Agents can call the built-in triage plan template with `POST /v1/plans/execute` and `{"template":"triage","params":{"incident_id":"inc_...","snapshot":true}}`; the TriageReport is returned at `steps[0].result`.
+```text
+ Go / TS services (SDK) OTel collectors
+ │ │
+ schema-2.0 WideEvents OTLP HTTP / gRPC
+ ╰──────────────┬─────────────────╯
+ ▼
+ ingest server
+ ┌──────────────┴──────────────┐
+ │ │
+ event log (append-only WAL, SQLite cold store
+ source of truth) (events · deployments ·
+ │ signals · incidents ·
+ ▼ causal claims)
+ derived read models
+ (errors · explain · blast ·
+ recent · incidents · triage)
+ │
+ ├──▶ /ui dashboard (Geist, no vendored chart/topology)
+ ├──▶ /v1/tools/* (deterministic agent surface)
+ ├──▶ /v1/plans/execute (server-side plan execution + SSE)
+ └──▶ waylog CLI · TUI · MCP
+```
-External alerts can be posted to `POST /v1/alerts` as Waylog-normalized JSON or Alertmanager, Grafana, or PagerDuty webhooks. Waylog stores them as alert signals, links them to active incidents when possible, and can render cited Markdown, Slack Block Kit, or PagerDuty-note reports from the same deterministic triage artifact.
+- **Single binary** plus embedded SQLite. No Docker, no Kafka, no bridge.
+- **WAL is source of truth.** Crash → replay on next boot rebuilds the derived read models.
+- **Hot graph + dedicated trace store.** Pruned per snapshot tick to bound memory.
+- **`report_hash` excludes `generated_at`, `plan_run_id`, and itself.** Same upstream state → same bytes across every surface.
+- **OTLP path reuses the same WAL and projector** as the SDK path. No separate ingestion plane.
-| Tool | Answers |
-| ------------------ | ------------------------------------------------------------- |
-| `graph_stats` | Overall shape of the graph right now |
-| `explain_request` | Why did this specific trace fail? |
-| `trace_summary` | Span tree and timing for a trace |
-| `graph_failures` | Which requests are currently failing? |
-| `failure_patterns` | What error codes dominate this window? |
-| `blast_radius` | How many requests, users, and services does this error touch? |
-| `failure_chain` | How did this failure propagate through services? |
-| `graph_query` | DSL query over the graph (`expr` + `window`) |
-| `compare_windows` | Diff error rates between two windows |
-| `graph_insights` | Windowed rollup of top errors and patterns |
-| `triage_incident` | One structured TriageReport for an open incident (blast + first failure + signals + next checks) |
-| `render_triage_report` | Markdown, Slack Block Kit JSON, or PagerDuty note text rendered from a TriageReport |
+Durability model, retention, merge semantics, readiness policy, and counter buffer details: [`docs/internals.md`](docs/internals.md). Full HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml).
-Full schemas: `GET /v1/tools` or [`docs/openapi.yaml`](docs/openapi.yaml).
+---
-## Dashboard
+## Auth & profiles
-The embedded dashboard at `/ui` is a v2 triage surface over the same read APIs as the CLI. It requires `WAYLOG_V2_READS=true` and uses the dashboard session cookie for read-scope auth.
+Three independent scoped keys. The dashboard never holds the agent key.
-- dark, minimal Geist UI with aligned KPI modules and inline SVG mini-graphs
-- `#/errors` — top error families over `/v1/errors`
-- `#/explain/` — first observable failing step over `/v1/traces/story`
-- `#/blast/` — impact panel over `/v1/blast_radius`
-- `#/incident/` — incident evidence and next checks over `/v1/incidents/{id}`
-- recent-request stream from `/v1/traces/recent`, polled every 5s
-- no Chart.js, Cytoscape, topology-first UI, Ask panel, deploy diff, or large dashboard charts
+| Key | Protects |
+| ------------------ | ----------------------------------------------------------- |
+| `WAYLOG_WRITE_KEY` | `/v1/events`, OTLP HTTP + gRPC, `/v1/signals`, `/v1/alerts` |
+| `WAYLOG_READ_KEY` | Read APIs, dashboard session |
+| `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*` |
+
+`WAYLOG_API_KEY` is a legacy alias for the write scope. `ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
+`WAYLOG_PROFILE` controls auth strictness. The current profile is reported on `/v1/capabilities`.
-## Architecture
+| Profile | Use case | Defaults |
+| ------- | --------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `demo` | `make demo` showcase | All endpoints open. **Not safe to expose to a network.** |
+| `dev` | Local development (default) | Open OTLP, optional read auth |
+| `prod` | Real deployments | Refuses to boot unless all three scoped keys are set; OTLP cannot be enabled without `WAYLOG_WRITE_KEY` |
-```text
-Go / TS services (SDK) · OTLP collectors
- │ schema-2.0 WideEvents · OTLP HTTP/gRPC traces
- ▼
- ingest server
- ├─ event log (append-only WAL, source of truth)
- ├─ derived read models (errors · explain · blast · recent traces · incidents)
- ├─ SQLite cold store (events · deployments · signals · incidents · causal claims)
- ├─ tool registry · Ask · plan execution
- └─ v2 dashboard · health · metrics · OpenAPI
- │
- ├──▶ /ui dashboard (Geist, no vendored chart/topology libs)
- ├──▶ /v1/tools/* · /v1/plans/execute (agent-native)
- └──▶ CLI · TUI · MCP · agents
+Set `WAYLOG_PROFILE=prod` for any deployment that crosses a trust boundary.
+
+---
+
+## Try every claim locally
+
+```bash
+make demo # one-shot local stack (no Docker)
+make demo-acceptance # 15-check gate over CLI + browser proof
+make proof-loop # full alert → incident → triage → cited reports loop
+make rca-scorecard # cold-start latency + measured report_hash_stable
+make rollup-comparison # rollup-correct counts vs naive propagated counts
+make demo-stop
```
-Events are durably logged before projection — if the process crashes, replay rebuilds the read models from the WAL on next boot.
+`make proof-loop` writes shareable artifacts to `./data/demo-state/proof/`:
+
+- `triage.json` — the TriageReport from the read endpoint
+- `report.md`, `slack.json`, `pagerduty.txt` — the same triage rendered for three operator surfaces
+- `rollup-comparison.txt` — root-cause counts next to naive propagated counts
+- `scorecard.json` — measured `report_hash_stable`, `triage_latency_ms`, `scenario` (`cold-demo` for `rca-scorecard`, `warm-demo` chained from `proof-loop`), inflation-avoided count, and the `report_hash` itself
-Durability model, retention, merge semantics, readiness policy, and counter buffer: [`docs/internals.md`](docs/internals.md). Full v2 HTTP contract: [`docs/openapi.yaml`](docs/openapi.yaml).
+The browser has the same flow at http://localhost:9081/demo → **Run proof loop**.
+
+> Artifacts are local-only. Alert payloads include the demo write key (`Bearer demo`). `data/demo-state/` is gitignored.
+
+---
## Development
```bash
-make build # core binaries
-make build-examples # demo services
-make fmt vet test # checks
-make test-race # race detector
-make ts-test # TypeScript SDK vitest suite
-make ci # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
-make demo-acceptance # with make demo running, verify demo + CLI triage loop
-make rollup-comparison # demo proof: root-cause counts vs naive propagated counts
+make build # core binaries
+make build-examples # demo services
+make fmt vet test # checks
+make test-race # race detector
+make ts-test # TypeScript SDK vitest suite
+make ci # fmt + vet + test-race + test-sdk + ts-test + doc-link + rollup-contract
```
-`make rollup-comparison` runs the checkout demo burst and prints the PMT_502 root-cause count next to a naive propagated count across touched services. It is the quickest local proof that Waylog's default rollups count the originating failure once per failed request instead of inflating it by every downstream hop.
+Full env-var reference: [`docs/env.md`](docs/env.md). Reproducible demo gate: `make demo-acceptance`.
-## Auth
+---
-Waylog uses three scoped keys. They are independent — the dashboard never holds the agent key.
+## What's new in v2.1
-| Key | Protects |
-| ------------------ | ----------------------------------------------------- |
-| `WAYLOG_WRITE_KEY` | `/v1/events`, `/v1/otlp/v1/traces`, `/v1/signals` (SDKs, collectors, production signals) |
-| `WAYLOG_READ_KEY` | Read APIs, dashboard session |
-| `WAYLOG_AGENT_KEY` | `/v1/tools/*`, `/v1/ask`, `/v1/plans/*` |
+- **Signal-correlated incident engine** at `/v1/incidents/*` with deterministic cause classification (`deploy | app | dependency | runtime | unknown`).
+- **Deterministic TriageReport** (`triage.v1`) with stable per-tick `report_hash` across CLI, read endpoint, direct tool, and plan template.
+- **Alert intake** at `POST /v1/alerts` for Waylog-native, Alertmanager, Grafana, and PagerDuty webhooks. Alerts correlate with active incidents; they do not create incidents.
+- **Cited operator reports** rendered as Markdown, Slack Block Kit, or PagerDuty notes via `GET /v1/triage/{id}/report`.
+- **OTLP/gRPC trace receiver** on `OTLP_GRPC_ADDR` (default `:4317`).
+- **Provider-neutral Ask** configuration: `gemini`, `anthropic`, `openai`, or `none`. All deterministic surfaces work with no LLM configured.
+- **`WAYLOG_PROFILE=demo|dev|prod`** gates auth defaults; `prod` hard-fails on unsafe configs.
+- **`WAYLOG_V2_READS` defaults to `true`.** Set `false` only for legacy v1-only stacks.
+- **`/v1/insight`** retained as a compat shim returning the top active incident. New clients should use `/v1/incidents/*`.
-`WAYLOG_API_KEY` is a legacy alias for the write scope. `ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
+---
## Status
-Public alpha. APIs may break before 1.0.
+Public alpha for single-node production-style incident triage. APIs may break before 1.0.
-**Shipped:**
+**Shipped**
- Go SDK v2 (`net/http`, chi, gin, echo) and TypeScript SDK v2 (`@waylog/sdk`, ESM, Node 18+, standalone core, Express, Hono, Next.js, NestJS)
-- OTLP traces over HTTP at `/v1/otlp/v1/traces` and gRPC at `:4317` (traces only)
-- durable ingest with WAL + replay
-- hot graph with flattened 3-node model + dedicated trace store
-- schema-2.0 recent-index read APIs behind `WAYLOG_V2_READS=true`
+- OTLP HTTP at `/v1/otlp/v1/traces` and OTLP/gRPC at `OTLP_GRPC_ADDR` (traces only)
+- Durable ingest with WAL + replay
+- Hot graph with flattened 3-node model + dedicated trace store
+- Schema-2.0 recent-index read APIs (default)
- SQLite cold store (events, deployments, signals, incidents, causal claims)
-- signal-driven incident engine with `waylog incidents`, `waylog incident `, dashboard incident cards, runtime cause classification, and startup hot-window rebuild from the schema-2.0 WAL
-- alert intake for Waylog, Alertmanager, Grafana, and PagerDuty webhooks, stored as signals and linked to incidents when possible
-- provider-neutral Ask configuration via `WAYLOG_LLM_PROVIDER` (`none`, `gemini`, `anthropic`, `openai`); deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
-- 12 deterministic analysis tools, rollup-correct root-cause attribution
-- agent-native REST (`/v1/tools/*`, `/v1/ask`, `/v1/plans/execute`) with idempotency and structured envelopes
-- `/v1/traces/story` and indented failure-path rendering in the dashboard
-- dashboard: minimal v2 triage loop (errors, explain, blast, recent requests)
-- v2 operator CLI (`capabilities`, `recent`, `incidents`, `incident`, `triage`, `errors`, `event`, `trace`, `explain`, `blast`, `search`) over read APIs
-- live TUI (`waylog-live --dev` streams via SSE), MCP stdio
-- scoped auth (write/read/agent) with startup validation
-
-**Planned:**
+- Signal-correlated incident engine with stable IDs, deterministic classification, and startup hot-window rebuild from the schema-2.0 WAL
+- Alert intake from four webhook formats, stored as signals and correlated with active incidents
+- Deterministic triage report with stable hash across CLI / read endpoint / direct tool / plan template within a single engine tick
+- Provider-neutral Ask configuration; deterministic CLI, tools, plans, triage, and MCP work with no LLM configured
+- Twelve deterministic analysis tools, rollup-correct root-cause attribution
+- Agent-native REST with idempotency and structured envelopes
+- MCP stdio, live TUI (`waylog-live --dev` streams via SSE), embedded Geist dashboard
+- Scoped auth (write / read / agent) with startup validation and `WAYLOG_PROFILE=prod` hard-fail
+
+**Planned**
- OTLP logs and metrics
- Python SDK
+- Resolved-incident retention janitor
- Mintlify docs site
+---
+
## Known limitations
-- Single-node only. No HA, no clustering.
-- Alpha quality. APIs may break before 1.0.
-- OTLP supports traces only. Logs and metrics are not shipping yet.
-- Only Go and TypeScript SDKs today. Python / Java / Ruby are not available.
-- SQLite cold store fits demos and small deployments; not sized for production-scale retention.
-- Signal records are SQLite-backed. Incident rows are a SQLite read cache and can be rebuilt within the hot window from the schema-2.0 WAL plus signals.
-- Incident cause classification is deterministic and heuristic.
-- No outbound alerting or paging delivery. Waylog accepts external alerts and renders operator reports, but it doesn't wake you up.
-- No multi-tenancy. One instance = one trust boundary.
-- No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.
-
-**Fastest walkthrough:** `make demo`, open , click **Run traffic burst**, then use the dashboard or `waylog incidents`, `waylog recent`, `waylog errors`, `waylog explain`, and `waylog blast` to answer what failed, which downstream was involved, and how broad the impact is.
+- **Public alpha.** APIs may break before 1.0. Not production-ready. Not HA.
+- **Triage report hash is stable per tick, not forever.** Hash changes when the underlying recent-index window changes (≈30 s default). Use as a short-window dedup key, not a long-term incident fingerprint.
+- **Alerts correlate; they do not create incidents.** Incidents are opened by the spike detector. The alert path is for routing context, not paging primitives.
+- **Resolved incidents are not pruned automatically.** Per the v2.1 plan, the retention janitor is deferred. Manual cleanup:
+ ```sql
+ DELETE FROM incidents WHERE status = 'resolved' AND resolved_at < datetime('now', '-7 days');
+ ```
+- **Stale `active` rows after long downtime.** If `WAYLOG_REBUILD_INCIDENTS_ON_START=true` and the WAL has rolled past an incident's `started_at`, the engine transitions only those stale rows to `recovering` on next start; they resolve after `WAYLOG_INCIDENT_RESOLVE_AFTER` without new evidence.
+- **Single-node only.** No HA, no clustering, no multi-tenancy.
+- **SQLite cold store** fits demos and small deployments. Postgres is not shipping.
+- **OTLP supports traces only.** Logs and metrics are not shipping yet.
+- **Only Go and TypeScript SDKs today.** Python / Java / Ruby are not available.
+- **No outbound paging.** Waylog accepts external alerts and renders operator reports; it does not page.
+- **No multi-tenancy.** One instance = one trust boundary.
+- **No full log search, Slack/PagerDuty automation, RBAC/SSO, or automatic remediation.**
+- **Incident cause classification is heuristic and deterministic.** Intentionally explainable, not ML-based.
+
+---
+
+## Documentation
+
+| File | What's in it |
+| ------------------------------------------------------------ | ---------------------------------------------------------------------------------------- |
+| [`docs/env.md`](docs/env.md) | Full env-var reference (auth, profiles, retention, OTLP, incident engine, LLM providers) |
+| [`docs/sdk-examples.md`](docs/sdk-examples.md) | Copy-paste SDK examples for every supported framework |
+| [`docs/waylog-sdk-contract.md`](docs/waylog-sdk-contract.md) | WideEvent schema and validation rules |
+| [`docs/openapi.yaml`](docs/openapi.yaml) | Full HTTP contract |
+| [`docs/internals.md`](docs/internals.md) | Durability model, retention, merge semantics, readiness policy, counter buffer |
+
+---
+
+## Project layout
+
+```
+cmd/ executable binaries (ingest, waylog, waylog-live, ...)
+pkg/ public SDK importable by external services
+internal/ private implementation (auth, incidents, triage, ingest, ...)
+examples/ demo services + collector config + microdemo
+scripts/ demo + proof + ci helpers
+docs/ reference and contracts
+```
+
+---
+
+## License
+
+Not yet declared. The project is in public alpha; a license will be added before tagging 1.0. Until then, contact the maintainers if you'd like to use Waylog in a context where licensing matters to you.
diff --git a/cmd/ingest/main.go b/cmd/ingest/main.go
index db2f2c7..d2d3252 100644
--- a/cmd/ingest/main.go
+++ b/cmd/ingest/main.go
@@ -115,7 +115,7 @@ func main() {
var sm *auth.SessionManager
if authCfg.DashboardMode != "off" {
sm = auth.NewSessionManager(authCfg.SessionSecret, auth.DefaultSessionMaxAge)
- sm.Secure = os.Getenv("WAYLOG_PROFILE") == "prod"
+ sm.Secure = authCfg.Profile == auth.ProfileProd
}
sessionCheck := auth.SessionCheckFunc(sm)
@@ -135,7 +135,11 @@ func main() {
graphUI := config.GetenvBool("GRAPH_UI", false)
otlpEnabled := config.GetenvBool("OTLP_ENABLED", true)
otlpGRPCAddr := config.Getenv("OTLP_GRPC_ADDR", ":4317")
- v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", false)
+ if authCfg.Profile == auth.ProfileProd && otlpEnabled && len(authCfg.WriteKeys) == 0 {
+ slog.Error("WAYLOG_PROFILE=prod with OTLP enabled requires WAYLOG_WRITE_KEY — refusing to boot with unauthenticated OTLP")
+ os.Exit(1)
+ }
+ v2ReadsEnabled := config.GetenvBool("WAYLOG_V2_READS", true)
signalRetention := config.GetenvDuration("WAYLOG_SIGNAL_RETENTION", 72*time.Hour)
alertMatchWindow := config.GetenvDuration("ALERT_MATCH_WINDOW", 15*time.Minute)
if alertMatchWindow <= 0 {
@@ -280,6 +284,7 @@ func main() {
IncidentsEnabled: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
IncidentsPersistent: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
IncidentRebuildSupported: v2ReadsEnabled && incidentsEnabled && sqlitePath != "",
+ Profile: authCfg.Profile,
})
// SSE hub for real-time dashboard updates
@@ -496,8 +501,46 @@ func main() {
os.Exit(1)
}
if replay.Projected == 0 {
+ // Empty WAL replay while rebuild was explicitly requested.
+ // Transition only the seed rows whose StartedAt precedes
+ // replaySince — those are stale beyond the hot window and
+ // their continuing "active" status is no longer evidence-
+ // backed. Non-stale active rows in the same seed are left
+ // untouched and will be re-evaluated by the next live tick.
+ staleTransitioned := 0
if len(seed) > 0 {
- slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+ incidentStoreRef := incidentStore
+ now := time.Now().UTC()
+ for _, inc := range seed {
+ if inc.Status != incidents.StatusActive {
+ continue
+ }
+ if !inc.StartedAt.Before(replaySince) {
+ continue
+ }
+ row := inc
+ row.Status = incidents.StatusRecovering
+ t := now
+ row.RecoveringAt = &t
+ row.UpdatedAt = now
+ if err := incidentStoreRef.Upsert(context.Background(), row); err != nil {
+ slog.Warn("stale-active rebuild transition failed",
+ "incident_id", row.IncidentID, "err", err)
+ continue
+ }
+ staleTransitioned++
+ }
+ if staleTransitioned > 0 {
+ if err := incidentEngine.Bootstrap(context.Background()); err != nil {
+ slog.Error("incident engine re-bootstrap after stale transition failed", "err", err)
+ os.Exit(1)
+ }
+ slog.Info("incidents rebuild: stale active rows transitioned to recovering",
+ "transitioned", staleTransitioned,
+ "replay_since", replaySince)
+ } else {
+ slog.Warn("incidents rebuild skipped: WAL replay returned no events; preserving SQLite as-is")
+ }
}
} else {
result, err := incidents.Rebuild(context.Background(), incidents.RebuildDeps{
diff --git a/docs/env.md b/docs/env.md
index 3de9e62..0ef8836 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -32,14 +32,15 @@ Scoped keys. See the Auth section of the [README](../README.md).
| Variable | Scope |
|---|---|
-| `WAYLOG_WRITE_KEY` | Write auth for `/v1/events` (preferred) |
+| `WAYLOG_PROFILE` | Auth strictness profile: `demo` (open), `dev` (default — open OTLP, optional auth), `prod` (refuses to boot unless write, read, and agent keys are all set; OTLP HTTP+gRPC cannot run unauthenticated) |
+| `WAYLOG_WRITE_KEY` | Write auth for `/v1/events`, `/v1/otlp/v1/traces`, OTLP/gRPC, `/v1/signals`, `/v1/alerts` (preferred) |
| `WAYLOG_API_KEY` | Legacy alias for write scope. Supports `Authorization: Bearer` and `X-API-Key` headers |
| `WAYLOG_READ_KEY` | Read auth for read endpoints + dashboard session validation |
| `WAYLOG_AGENT_KEY` | Agent auth for `/v1/tools/*`, `/v1/ask`, `/v1/plans/*`. No session fallback |
| `DASHBOARD_AUTH` | Dashboard auth mode: `off` \| `basic::` \| `key:` |
-| `DASHBOARD_SESSION_SECRET` | Session signing key (derived from `DASHBOARD_AUTH` if unset) |
+| `DASHBOARD_SESSION_SECRET` | Session signing key (derived from `DASHBOARD_AUTH` if unset; required when `WAYLOG_PROFILE=prod`) |
-`ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination.
+`ParseConfig` validates the auth matrix at startup and refuses to boot with an unsafe combination. When `WAYLOG_PROFILE=prod`, all three scoped keys (`WAYLOG_WRITE_KEY`, `WAYLOG_READ_KEY`, `WAYLOG_AGENT_KEY`) are required, and OTLP cannot run unauthenticated.
## Ingest server
@@ -47,7 +48,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
|---|---|---|
| `INGEST_ADDR` | `:8080` | Listen address |
| `OTLP_ENABLED` | `true` | Enable OTLP trace ingest over HTTP and gRPC |
-| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver |
+| `OTLP_GRPC_ADDR` | `:4317` | OTLP/gRPC trace receiver listen address. Set empty to disable the gRPC receiver. For single-host installs, bind `127.0.0.1:4317`. When `WAYLOG_PROFILE=prod`, the server refuses to boot if OTLP is enabled without `WAYLOG_WRITE_KEY` |
| `MAX_BODY_BYTES` | `1048576` (1 MB) | Max body size for `/v1/events`, `/v1/otlp/v1/traces`, and OTLP/gRPC receive messages |
| `READ_HEADER_TIMEOUT` | `5s` | HTTP read header timeout |
| `READ_TIMEOUT` | `10s` | HTTP read timeout |
@@ -58,7 +59,7 @@ Scoped keys. See the Auth section of the [README](../README.md).
## CLI
-The `waylog` CLI calls the running ingest server's v2 read APIs. The server must run with `WAYLOG_V2_READS=true`.
+The `waylog` CLI calls the running ingest server's v2 read APIs. The server runs with `WAYLOG_V2_READS=true` by default; only set it to `false` for legacy v1-only stacks.
| Variable | Default | Purpose |
|---|---|---|
@@ -106,7 +107,7 @@ See [Internals](internals.md) for the full durability model.
| Variable | Default | Purpose |
|---|---|---|
| `GRAPH_UI` | `false` | Enable optional graph topology endpoint `/v1/graph/topology` |
-| `WAYLOG_V2_READS` | `false` | Route v2 read endpoints to the schema-2.0 recent index |
+| `WAYLOG_V2_READS` | `true` | Route v2 read endpoints to the schema-2.0 recent index. Set `false` only for legacy v1-only stacks |
| `CAUSAL_ENABLED` | `false` | Enable shadow-mode causal inference |
| `CAUSAL_INTERVAL` | `30s` | Causal inference ticker interval |
| `HAPPY_SAMPLE_RATE_PCT` | `2` | Success-event sampling rate. Set `100` in dev profiles |
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index 4c31f77..6cc08dd 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -202,11 +202,12 @@ paths:
post:
tags: [Signals]
operationId: ingestAlert
- summary: Ingest an external alert and match it to an active incident
+ summary: Ingest an external alert and correlate it with an active incident
description: |
Accepts Waylog-normalized alerts plus Alertmanager, Grafana, and
PagerDuty webhook payloads. Accepted alerts are stored as `type=alert`
- signals. Matching is best-effort and does not create incidents directly.
+ signals and correlated with active incidents when possible. Alerts do
+ not create incidents — the spike detector owns incident lifecycle.
security:
- ApiKeyHeader: []
- BearerAuth: []
diff --git a/examples/cmd/api-gateway/main.go b/examples/cmd/api-gateway/main.go
index 24aa63d..9662c36 100644
--- a/examples/cmd/api-gateway/main.go
+++ b/examples/cmd/api-gateway/main.go
@@ -16,16 +16,22 @@ func main() {
}
checkoutURL := config.Getenv("CHECKOUT_URL", "http://localhost:9082")
+ ingestURL := config.Getenv("INGEST_URL", "http://localhost:8080")
+ writeKey := config.Getenv("WAYLOG_WRITE_KEY", "")
+ readKey := config.Getenv("WAYLOG_READ_KEY", writeKey)
+ agentKey := config.Getenv("WAYLOG_AGENT_KEY", readKey)
gateway := microdemo.NewGatewayHandler(checkoutURL)
gateway.SetSignalPoster(microdemo.NewDemoSignalPoster(
- config.Getenv("INGEST_URL", "http://localhost:8080"),
- config.Getenv("WAYLOG_WRITE_KEY", ""),
+ ingestURL,
+ writeKey,
))
+ gateway.SetWaylogAPI(ingestURL, readKey, writeKey, agentKey)
mux := http.NewServeMux()
mux.Handle("/purchase", gateway.PurchaseHandler())
mux.HandleFunc("/demo", gateway.ServeDemo)
mux.HandleFunc("/demo/burst", gateway.ServeBurst)
+ mux.HandleFunc("/demo/proof", gateway.ServeProof)
microdemo.RunService("api-gateway", ":9081", mux)
}
diff --git a/examples/microdemo/gateway.go b/examples/microdemo/gateway.go
index f065440..e4a0a52 100644
--- a/examples/microdemo/gateway.go
+++ b/examples/microdemo/gateway.go
@@ -28,7 +28,12 @@ var uiHTML []byte
type GatewayHandler struct {
checkoutURL string
+ ingestURL string // base URL that doJSON prefixes to every proof request path; set by SetWaylogAPI
+ readKey string // bearer key the proof loop sends on its GET requests
+ writeKey string // bearer key the proof loop sends when posting the alert
+ agentKey string // bearer key the proof loop sends to the tool and plan endpoints
client *http.Client
+ proofClient *http.Client // client used by doJSON for all proof-loop requests
purchase http.Handler
signals SignalPoster
}
@@ -44,6 +49,7 @@ func NewGatewayHandler(checkoutURL string) *GatewayHandler {
client: &http.Client{
Transport: wayloghttp.NewTransport(demoHTTPTransport(), "checkout"),
},
+ proofClient: &http.Client{Timeout: 10 * demoSignalTimeout},
}
// Pre-wrap so the live /purchase route and /demo/burst dispatch share a
// single instance — and so callers can't forget to wire it up.
@@ -67,6 +73,14 @@ func (h *GatewayHandler) SetSignalPoster(poster SignalPoster) {
h.signals = poster
}
+// SetWaylogAPI points the demo-only proof loop proxy at a Waylog ingest
+// server. Every argument is whitespace-trimmed, and trailing slashes are
+// removed from ingestURL so request paths can be appended to it directly.
+func (h *GatewayHandler) SetWaylogAPI(ingestURL, readKey, writeKey, agentKey string) {
+	base := strings.TrimSpace(ingestURL)
+	h.ingestURL = strings.TrimRight(base, "/")
+
+	h.readKey = strings.TrimSpace(readKey)
+	h.writeKey = strings.TrimSpace(writeKey)
+	h.agentKey = strings.TrimSpace(agentKey)
+}
+
func (h *GatewayHandler) ServeDemo(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write(uiHTML)
diff --git a/examples/microdemo/proof.go b/examples/microdemo/proof.go
new file mode 100644
index 0000000..41236ac
--- /dev/null
+++ b/examples/microdemo/proof.go
@@ -0,0 +1,431 @@
+package microdemo
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "time"
+
+ apiv2 "github.com/sssmaran/WaylogCLI/pkg/api/v2"
+ pkgtriage "github.com/sssmaran/WaylogCLI/pkg/triage"
+)
+
+const (
+ proofWindow = "15m" // value of the "window" query parameter sent to /v1/errors and /v1/blast_radius
+ proofPollDelay = 750 * time.Millisecond // pause between consecutive poll attempts
+ proofPollAttempts = 24 // upper bound on requests made by each poll loop before it gives up
+)
+
+type ProofSummary struct { // JSON document ServeProof returns after a successful proof run
+ AlertID string `json:"alert_id"` // ID runProof generated for the alert it posted
+ IncidentID string `json:"incident_id"` // ID of the incident selected by pollIncident
+ ReportHash string `json:"report_hash"` // report hash from the first triage read
+ Hashes map[string]string `json:"hashes"` // report hash per surface; keys are read, repeat, tool, plan
+ Burst BurstSummary `json:"burst"` // burst result with the posted demo signal results attached
+ Evidence ProofEvidence `json:"evidence"`
+ Reports ProofReports `json:"reports"`
+ Scorecard ProofScorecard `json:"scorecard"`
+}
+
+type ProofEvidence struct { // evidence flags derived from the first triage report
+ TraceID string `json:"trace_id"` // trace ID of the report's first sample trace, or "" when it has none
+ AlertLinked bool `json:"alert_linked"` // report lists this run's alert ID with a non-empty signal ID
+ DependencySignal bool `json:"dependency_signal"` // report has a signal of type "dependency" with a non-empty ID
+ NextChecks bool `json:"next_checks"` // report contains at least one next check
+}
+
+type ProofReports struct { // rendered triage reports as returned by the report endpoint
+ Markdown string `json:"markdown"` // raw response body of the markdown report
+ Slack json.RawMessage `json:"slack"` // raw response body of the slack report, embedded as JSON
+ PagerDuty string `json:"pagerduty"` // raw response body of the pagerduty report
+}
+
+type ProofScorecard struct { // pass/fail checks and counters computed by runProof
+ RootCauseAccuracy bool `json:"root_cause_accuracy"` // report's top error families include checkout/payment.charge/PMT_502
+ CauseClassificationDependency bool `json:"cause_classification_dependency"` // selected incident is listed with cause "dependency"
+ ReportHashStable bool `json:"report_hash_stable"` // read, repeat, tool and plan hashes are non-empty and identical
+ PropagatedErrorInflationAvoided int `json:"propagated_error_inflation_avoided"` // NaivePropagatedCount minus RootCauseCount
+ TriageLatencyMS int64 `json:"triage_latency_ms"` // milliseconds from the end of the burst to the end of the second triage read
+ Scenario string `json:"scenario"` // runProof always sets "warm-demo"
+ RootCauseCount int `json:"root_cause_count"` // count reported for the payment error family
+ NaivePropagatedCount int `json:"naive_propagated_count"` // RootCauseCount multiplied by the blast radius' affected services
+}
+
+type planResult struct { // subset of the /v1/plans/execute response that postPlanTriage decodes
+ Steps []struct {
+ Result json.RawMessage `json:"result"` // kept raw; postPlanTriage decodes the first step's result as a triage report
+ } `json:"steps"`
+}
+
+// ServeProof handles the demo proof endpoint: it decodes an optional
+// BurstRequest body, runs the proof loop against the configured Waylog
+// ingest server, and writes the resulting ProofSummary as JSON.
+//
+// Responses: 405 for non-POST methods, 503 when no ingest URL has been
+// configured, 400 for a malformed, oversized, or unknown-field body, 502 when
+// a proof step fails, and 500 when the summary cannot be encoded.
+func (h *GatewayHandler) ServeProof(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		// A 405 response is expected to advertise the supported methods.
+		w.Header().Set("Allow", http.MethodPost)
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+	if h.ingestURL == "" {
+		http.Error(w, "INGEST_URL is not configured for the demo proof", http.StatusServiceUnavailable)
+		return
+	}
+
+	var req BurstRequest
+	if r.Body != nil {
+		defer r.Body.Close()
+		// The request is a small JSON object, so cap how much is read from
+		// the client instead of decoding an unbounded stream.
+		dec := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20))
+		dec.DisallowUnknownFields()
+		// io.EOF means the body was empty; that is accepted and leaves req
+		// at its zero value.
+		if err := dec.Decode(&req); err != nil && err != io.EOF {
+			http.Error(w, "invalid json: "+err.Error(), http.StatusBadRequest)
+			return
+		}
+	}
+
+	result, err := h.runProof(r.Context(), req)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusBadGateway)
+		return
+	}
+
+	// Encode before touching the response so an encoding failure (for example
+	// a malformed raw Slack payload) becomes a 500 rather than a 200 with an
+	// empty or truncated body.
+	payload, err := json.Marshal(result)
+	if err != nil {
+		http.Error(w, "encode proof summary: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	// A write error here means the client went away; nothing useful can be
+	// reported to it. The trailing newline matches what json.Encoder emitted.
+	_, _ = w.Write(append(payload, '\n'))
+}
+
+func (h *GatewayHandler) runProof(ctx context.Context, req BurstRequest) (ProofSummary, error) { // runs alert -> burst -> polling -> triage -> reports; the first failing step aborts the run
+ alertID := fmt.Sprintf("alert_demo_proof_pmt_502_%d", time.Now().Unix()) // ID has one-second resolution
+ if err := h.postProofAlert(ctx, alertID); err != nil {
+ return ProofSummary{}, err
+ }
+
+ signals := []SignalResult(nil)
+ if h.signals != nil {
+ signals = h.signals.PostDemoSignals(ctx)
+ }
+ burst := runBurst(ctx, h.purchase, req)
+ burst.Signals = signals
+ answerStart := time.Now() // latency clock starts once the burst has finished
+
+ errorsResp, err := h.pollErrors(ctx)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ inc, incidentsResp, err := h.pollIncident(ctx)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+
+ triageA, err := h.getTriage(ctx, inc.IncidentID)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ triageB, err := h.getTriage(ctx, inc.IncidentID) // second identical read, used only to compare report hashes
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ answerEnd := time.Now() // latency clock stops after the repeat read; tool and plan calls are not timed
+
+ toolReport, err := h.postToolTriage(ctx, inc.IncidentID)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ planReport, err := h.postPlanTriage(ctx, inc.IncidentID)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ hashes := map[string]string{
+ "read": triageA.ReportHash,
+ "repeat": triageB.ReportHash,
+ "tool": toolReport.ReportHash,
+ "plan": planReport.ReportHash,
+ }
+ hashStable := triageA.ReportHash != "" && // an empty hash never counts as stable
+ triageA.ReportHash == triageB.ReportHash &&
+ triageA.ReportHash == toolReport.ReportHash &&
+ triageA.ReportHash == planReport.ReportHash
+
+ blast, err := h.getBlast(ctx)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+ reports, err := h.getReports(ctx, inc.IncidentID)
+ if err != nil {
+ return ProofSummary{}, err
+ }
+
+ rootCount := paymentErrorCount(errorsResp)
+ naive := rootCount * blast.AffectedServices // NOTE(review): goes to zero when the blast radius reports no affected services
+ return ProofSummary{
+ AlertID: alertID,
+ IncidentID: inc.IncidentID,
+ ReportHash: triageA.ReportHash,
+ Hashes: hashes,
+ Burst: burst,
+ Evidence: ProofEvidence{
+ TraceID: firstTraceID(triageA),
+ AlertLinked: hasAlertID(triageA, alertID),
+ DependencySignal: hasSignalType(triageA, "dependency"),
+ NextChecks: len(triageA.NextChecks) > 0,
+ },
+ Reports: reports,
+ Scorecard: ProofScorecard{
+ RootCauseAccuracy: triageRootCauseAccurate(triageA),
+ CauseClassificationDependency: incidentCauseIsDependency(incidentsResp, inc.IncidentID),
+ ReportHashStable: hashStable,
+ PropagatedErrorInflationAvoided: naive - rootCount, // NOTE(review): negative whenever naive is smaller than rootCount
+ TriageLatencyMS: answerEnd.Sub(answerStart).Milliseconds(),
+ Scenario: "warm-demo",
+ RootCauseCount: rootCount,
+ NaivePropagatedCount: naive,
+ },
+ }, nil
+}
+
+// postProofAlert submits a synthetic critical PMT_502 alert for the checkout
+// service in the demo env to POST /v1/alerts, authenticated with the write
+// key. Any status other than 201 Created is reported as an error.
+func (h *GatewayHandler) postProofAlert(ctx context.Context, alertID string) error {
+	payload := map[string]any{
+		"alert_id":   alertID,
+		"source":     "waylog",
+		"service":    "checkout",
+		"env":        "demo",
+		"severity":   "critical",
+		"error_code": "PMT_502",
+		"reason":     "PMT_502 spike",
+		"message":    "browser demo alert for checkout payment failures",
+		"timestamp":  time.Now().UTC().Format(time.RFC3339),
+	}
+	code, _, err := h.doJSON(ctx, http.MethodPost, "/v1/alerts", h.writeKey, payload, nil)
+	switch {
+	case err != nil:
+		return err
+	case code != http.StatusCreated:
+		return fmt.Errorf("alert webhook failed: HTTP %d", code)
+	}
+	return nil
+}
+
+// pollErrors polls GET /v1/errors over proofWindow until the checkout
+// payment.charge PMT_502 family is reported with a non-zero count. It makes
+// at most proofPollAttempts requests, waiting proofPollDelay between them.
+//
+// A transport or decode error ends polling immediately. Non-200 responses and
+// responses that do not yet contain the family are retried. Cancellation of
+// ctx is returned as soon as it is observed.
+func (h *GatewayHandler) pollErrors(ctx context.Context) (apiv2.ErrorsResponse, error) {
+	query := url.Values{"window": {proofWindow}, "limit": {"10"}}.Encode()
+	for attempt := 1; attempt <= proofPollAttempts; attempt++ {
+		// Decode into a fresh value each time so nothing from an earlier
+		// response can survive into the one that is finally returned.
+		var resp apiv2.ErrorsResponse
+		status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/errors?"+query, h.readKey, nil, &resp)
+		if err != nil {
+			return apiv2.ErrorsResponse{}, err
+		}
+		if status == http.StatusOK && paymentErrorCount(resp) > 0 {
+			return resp, nil
+		}
+		if attempt == proofPollAttempts {
+			break // no further request follows, so there is nothing to wait for
+		}
+		sleepOrDone(ctx, proofPollDelay)
+		if err := ctx.Err(); err != nil {
+			return apiv2.ErrorsResponse{}, err
+		}
+	}
+	return apiv2.ErrorsResponse{}, fmt.Errorf("payment_502 error family did not appear")
+}
+
+// pollIncident polls GET /v1/incidents/active until the listing contains an
+// incident for the checkout payment.charge PMT_502 family whose cause is
+// "dependency" and whose status is "active". It returns that incident along
+// with the full listing it was found in.
+//
+// At most proofPollAttempts requests are made, proofPollDelay apart. A
+// transport or decode error ends polling immediately, non-200 responses are
+// retried, and cancellation of ctx is returned as soon as it is observed.
+func (h *GatewayHandler) pollIncident(ctx context.Context) (apiv2.Incident, apiv2.IncidentListResponse, error) {
+	for attempt := 1; attempt <= proofPollAttempts; attempt++ {
+		// Decode into a fresh value each time so nothing from an earlier
+		// listing can survive into the one that is finally returned.
+		var listing apiv2.IncidentListResponse
+		status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/incidents/active", h.readKey, nil, &listing)
+		if err != nil {
+			return apiv2.Incident{}, apiv2.IncidentListResponse{}, err
+		}
+		if status == http.StatusOK {
+			for _, inc := range listing.Incidents {
+				if isPaymentFamily(inc.ErrorFamily) && inc.Cause == "dependency" && inc.Status == "active" {
+					return inc, listing, nil
+				}
+			}
+		}
+		if attempt == proofPollAttempts {
+			break // no further request follows, so there is nothing to wait for
+		}
+		sleepOrDone(ctx, proofPollDelay)
+		if err := ctx.Err(); err != nil {
+			return apiv2.Incident{}, apiv2.IncidentListResponse{}, err
+		}
+	}
+	return apiv2.Incident{}, apiv2.IncidentListResponse{}, fmt.Errorf("dependency incident did not appear")
+}
+
+func (h *GatewayHandler) getTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+ var rep pkgtriage.Report
+ status, _, err := h.doJSON(ctx, http.MethodGet, "/v1/triage/"+url.PathEscape(incidentID)+"?snapshot=true", h.readKey, nil, &rep)
+ if err != nil {
+ return nil, err
+ }
+ if status != http.StatusOK {
+ return nil, fmt.Errorf("triage read failed: HTTP %d", status)
+ }
+ return &rep, nil
+}
+
+func (h *GatewayHandler) postToolTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+ var rep pkgtriage.Report
+ status, _, err := h.doJSON(ctx, http.MethodPost, "/v1/tools/triage_incident", h.agentKey, map[string]any{"incident_id": incidentID, "snapshot": true}, &rep)
+ if err != nil {
+ return nil, err
+ }
+ if status != http.StatusOK {
+ return nil, fmt.Errorf("triage tool failed: HTTP %d", status)
+ }
+ return &rep, nil
+}
+
+// postPlanTriage executes the "triage" plan template for incidentID through
+// POST /v1/plans/execute using the agent key, and decodes the result of the
+// plan's first step as a triage report.
+//
+// It fails when the request errors, the status is not 200 OK, the response
+// lists no steps, or the first step's result is not a decodable report.
+func (h *GatewayHandler) postPlanTriage(ctx context.Context, incidentID string) (*pkgtriage.Report, error) {
+	var plan planResult
+	status, _, err := h.doJSON(ctx, http.MethodPost, "/v1/plans/execute", h.agentKey, map[string]any{
+		"template": "triage",
+		"params":   map[string]any{"incident_id": incidentID, "snapshot": true},
+	}, &plan)
+	if err != nil {
+		return nil, err
+	}
+	if status != http.StatusOK {
+		return nil, fmt.Errorf("triage plan failed: HTTP %d", status)
+	}
+	// Reported separately from the status check: the request succeeded here,
+	// so blaming the HTTP status would point at the wrong problem.
+	if len(plan.Steps) == 0 {
+		return nil, fmt.Errorf("triage plan failed: response contained no steps")
+	}
+	var rep pkgtriage.Report
+	if err := json.Unmarshal(plan.Steps[0].Result, &rep); err != nil {
+		return nil, fmt.Errorf("triage plan result decode: %w", err)
+	}
+	return &rep, nil
+}
+
+// getBlast reads the blast radius of the checkout:payment.charge:PMT_502
+// error family over proofWindow from GET /v1/blast_radius using the read key.
+// Any status other than 200 OK is reported as an error.
+func (h *GatewayHandler) getBlast(ctx context.Context) (apiv2.BlastRadiusResponse, error) {
+	params := url.Values{}
+	params.Set("window", proofWindow)
+	params.Set("error_family", "checkout:payment.charge:PMT_502")
+
+	var result apiv2.BlastRadiusResponse
+	code, _, err := h.doJSON(ctx, http.MethodGet, "/v1/blast_radius?"+params.Encode(), h.readKey, nil, &result)
+	switch {
+	case err != nil:
+		return apiv2.BlastRadiusResponse{}, err
+	case code != http.StatusOK:
+		return apiv2.BlastRadiusResponse{}, fmt.Errorf("blast failed: HTTP %d", code)
+	}
+	return result, nil
+}
+
+// getReports fetches the snapshot triage report for incidentID in the
+// markdown, slack and pagerduty formats from GET /v1/triage/{id}/report using
+// the read key, and returns the three raw response bodies.
+//
+// The first request that errors or answers with a status other than 200 OK
+// aborts the whole call, as does a non-empty Slack body that is not valid
+// JSON.
+func (h *GatewayHandler) getReports(ctx context.Context, incidentID string) (ProofReports, error) {
+	var out ProofReports
+	for _, format := range []string{"markdown", "slack", "pagerduty"} {
+		path := "/v1/triage/" + url.PathEscape(incidentID) + "/report?format=" + format + "&snapshot=true"
+		status, raw, err := h.doJSON(ctx, http.MethodGet, path, h.readKey, nil, nil)
+		if err != nil {
+			return ProofReports{}, err
+		}
+		if status != http.StatusOK {
+			return ProofReports{}, fmt.Errorf("%s report failed: HTTP %d", format, status)
+		}
+		switch format {
+		case "markdown":
+			out.Markdown = string(raw)
+		case "slack":
+			// The Slack body is embedded verbatim as a json.RawMessage, so
+			// malformed bytes would make encoding the whole ProofSummary
+			// fail later. Reject them here with a specific error instead.
+			if len(raw) > 0 && !json.Valid(raw) {
+				return ProofReports{}, fmt.Errorf("slack report is not valid JSON")
+			}
+			// Copy so the stored message does not alias doJSON's buffer.
+			out.Slack = append(json.RawMessage(nil), raw...)
+		case "pagerduty":
+			out.PagerDuty = string(raw)
+		}
+	}
+	return out, nil
+}
+
+// doJSON performs one HTTP request against the configured ingest server.
+//
+// A non-nil body is JSON-encoded and sent with a JSON Content-Type, and a
+// non-empty key is sent as a Bearer token. At most 4 MiB of the response is
+// read and returned together with the status code. When out is non-nil and
+// the status is 2xx, the response is also decoded into out; if that decode
+// fails, the status and raw bytes are still returned alongside the error.
+func (h *GatewayHandler) doJSON(ctx context.Context, method, path, key string, body any, out any) (int, []byte, error) {
+	var payload io.Reader
+	if body != nil {
+		encoded, err := json.Marshal(body)
+		if err != nil {
+			return 0, nil, err
+		}
+		payload = bytes.NewReader(encoded)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, method, h.ingestURL+path, payload)
+	if err != nil {
+		return 0, nil, err
+	}
+	if body != nil {
+		req.Header.Set("Content-Type", "application/json")
+	}
+	if key != "" {
+		req.Header.Set("Authorization", "Bearer "+key)
+	}
+
+	// Fall back to the default client when no proof client was set.
+	httpClient := http.DefaultClient
+	if h.proofClient != nil {
+		httpClient = h.proofClient
+	}
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer resp.Body.Close()
+
+	data, err := io.ReadAll(io.LimitReader(resp.Body, 4<<20))
+	if err != nil {
+		return resp.StatusCode, nil, err
+	}
+	success := resp.StatusCode >= 200 && resp.StatusCode < 300
+	if out == nil || !success {
+		return resp.StatusCode, data, nil
+	}
+	if err := json.Unmarshal(data, out); err != nil {
+		return resp.StatusCode, data, err
+	}
+	return resp.StatusCode, data, nil
+}
+
+// sleepOrDone blocks until d has elapsed or ctx is done, whichever happens
+// first. It does not report which of the two ended the wait.
+func sleepOrDone(ctx context.Context, d time.Duration) {
+	wait := time.NewTimer(d)
+	defer wait.Stop()
+
+	select {
+	case <-wait.C:
+	case <-ctx.Done():
+	}
+}
+
+// paymentErrorCount returns the Count of the first row in resp that belongs
+// to the checkout payment.charge PMT_502 family, or 0 when no row matches.
+func paymentErrorCount(resp apiv2.ErrorsResponse) int {
+	count := 0
+	for _, entry := range resp.Rows {
+		if !isPaymentFamily(entry.ErrorFamily) {
+			continue
+		}
+		count = entry.Count
+		break
+	}
+	return count
+}
+
+// isPaymentFamily reports whether f is the error family the proof loop looks
+// for: service "checkout", step "payment.charge", error code "PMT_502".
+func isPaymentFamily(f apiv2.ErrorFamily) bool {
+	if f.Service != "checkout" || f.Step != "payment.charge" {
+		return false
+	}
+	return f.ErrorCode == "PMT_502"
+}
+
+// firstTraceID returns the trace ID of rep's first sample trace, or "" when
+// rep is nil or carries no sample traces.
+func firstTraceID(rep *pkgtriage.Report) string {
+	if rep != nil && len(rep.SampleTraces) > 0 {
+		return rep.SampleTraces[0].TraceID
+	}
+	return ""
+}
+
+// hasAlertID reports whether rep lists an alert whose AlertID equals alertID
+// and whose SignalID is non-empty. A nil rep yields false.
+func hasAlertID(rep *pkgtriage.Report, alertID string) bool {
+	linked := false
+	if rep != nil {
+		for _, candidate := range rep.Alerts {
+			if candidate.SignalID == "" || candidate.AlertID != alertID {
+				continue
+			}
+			linked = true
+			break
+		}
+	}
+	return linked
+}
+
+// hasSignalType reports whether rep lists a signal of type typ that has a
+// non-empty ID. A nil rep yields false.
+func hasSignalType(rep *pkgtriage.Report, typ string) bool {
+	found := false
+	if rep != nil {
+		for _, candidate := range rep.Signals {
+			if candidate.ID == "" || candidate.Type != typ {
+				continue
+			}
+			found = true
+			break
+		}
+	}
+	return found
+}
+
+// triageRootCauseAccurate reports whether the top error families in rep's
+// blast snapshot include checkout / payment.charge / PMT_502. A nil rep
+// yields false.
+func triageRootCauseAccurate(rep *pkgtriage.Report) bool {
+	accurate := false
+	if rep != nil {
+		for _, fam := range rep.BlastSnapshot.TopErrorFamilies {
+			if fam.Service != "checkout" || fam.Step != "payment.charge" || fam.ErrorCode != "PMT_502" {
+				continue
+			}
+			accurate = true
+			break
+		}
+	}
+	return accurate
+}
+
+// incidentCauseIsDependency looks up the first incident in resp whose ID is
+// incidentID and reports whether its cause is "dependency". It returns false
+// when no incident has that ID.
+func incidentCauseIsDependency(resp apiv2.IncidentListResponse, incidentID string) bool {
+	for _, candidate := range resp.Incidents {
+		if candidate.IncidentID != incidentID {
+			continue
+		}
+		return candidate.Cause == "dependency"
+	}
+	return false
+}
diff --git a/examples/microdemo/proof_test.go b/examples/microdemo/proof_test.go
new file mode 100644
index 0000000..12904ab
--- /dev/null
+++ b/examples/microdemo/proof_test.go
@@ -0,0 +1,88 @@
+package microdemo
+
+import (
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+)
+
+// TestServeProofRejectsNonPOST checks that a GET to the proof endpoint is
+// answered with 405 Method Not Allowed.
+func TestServeProofRejectsNonPOST(t *testing.T) {
+	h := NewGatewayHandler("http://checkout.example")
+	w := httptest.NewRecorder()
+
+	h.ServeProof(w, httptest.NewRequest(http.MethodGet, "/demo/proof", nil))
+
+	if got := w.Code; got != http.StatusMethodNotAllowed {
+		t.Fatalf("status = %d, want 405", got)
+	}
+}
+
+// TestServeProofRequiresIngestURL checks that a POST is answered with 503
+// Service Unavailable when SetWaylogAPI was never called on the handler.
+func TestServeProofRequiresIngestURL(t *testing.T) {
+	h := NewGatewayHandler("http://checkout.example")
+	h.SetPurchaseHandler(okBurstDispatch())
+	w := httptest.NewRecorder()
+
+	h.ServeProof(w, httptest.NewRequest(http.MethodPost, "/demo/proof", strings.NewReader(`{}`)))
+
+	if got := w.Code; got != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503", got)
+	}
+}
+
+// TestServeProofRejectsUnknownFields checks that a body carrying a field the
+// burst request does not define is answered with 400 Bad Request.
+func TestServeProofRejectsUnknownFields(t *testing.T) {
+	h := NewGatewayHandler("http://checkout.example")
+	h.SetWaylogAPI("http://ingest.example", "read", "write", "agent")
+	w := httptest.NewRecorder()
+
+	h.ServeProof(w, httptest.NewRequest(http.MethodPost, "/demo/proof", strings.NewReader(`{"foo":1}`)))
+
+	if got := w.Code; got != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", got)
+	}
+}
+
+// TestProofSummaryJSONShape marshals a populated ProofSummary and checks
+// that the IDs and the scorecard keys the demo UI reads appear in the output.
+func TestProofSummaryJSONShape(t *testing.T) {
+	summary := ProofSummary{
+		AlertID:    "alert_1",
+		IncidentID: "inc_1",
+		ReportHash: "sha256:x",
+		Hashes:     map[string]string{"read": "sha256:x"},
+		Evidence:   ProofEvidence{TraceID: "trace_1", AlertLinked: true, DependencySignal: true, NextChecks: true},
+		Scorecard: ProofScorecard{
+			RootCauseAccuracy:             true,
+			CauseClassificationDependency: true,
+			ReportHashStable:              true,
+			TriageLatencyMS:               42,
+			Scenario:                      "warm-demo",
+		},
+	}
+	encoded, err := json.Marshal(summary)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+
+	body := string(encoded)
+	wanted := []string{"alert_1", "inc_1", "trace_1", "cause_classification_dependency", `"triage_latency_ms":42`, `"scenario":"warm-demo"`}
+	for _, want := range wanted {
+		if !strings.Contains(body, want) {
+			t.Fatalf("json missing %q: %s", want, encoded)
+		}
+	}
+}
+
+// TestProofSummaryReportHashStableFalse checks that a false hash-stability
+// verdict is written out explicitly rather than omitted from the JSON.
+func TestProofSummaryReportHashStableFalse(t *testing.T) {
+	summary := ProofSummary{
+		AlertID:    "alert_1",
+		IncidentID: "inc_1",
+		ReportHash: "sha256:x",
+		Hashes:     map[string]string{"read": "sha256:x", "repeat": "sha256:y"},
+		Scorecard: ProofScorecard{
+			ReportHashStable: false,
+			TriageLatencyMS:  7,
+			Scenario:         "warm-demo",
+		},
+	}
+	encoded, err := json.Marshal(summary)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	if body := string(encoded); !strings.Contains(body, `"report_hash_stable":false`) {
+		t.Fatalf("json missing report_hash_stable:false: %s", encoded)
+	}
+}
diff --git a/examples/microdemo/ui.html b/examples/microdemo/ui.html
index b5ac343..b0d3cf0 100644
--- a/examples/microdemo/ui.html
+++ b/examples/microdemo/ui.html
@@ -346,6 +346,150 @@
display: grid;
gap: 16px;
}
+ .proof-grid {
+ display: grid;
+ grid-template-columns: minmax(0, 1fr);
+ gap: 20px;
+ align-items: start;
+ }
+ .proof-hero {
+ display: grid;
+ gap: 16px;
+ padding-bottom: 4px;
+ }
+ .proof-headline {
+ margin: 8px 0 0;
+ font-size: clamp(1.4rem, 3vw, 2rem);
+ line-height: 1.1;
+ letter-spacing: -0.035em;
+ max-width: 22ch;
+ }
+ .proof-summary {
+ display: grid;
+ grid-template-columns: repeat(3, minmax(0, 1fr));
+ gap: 10px;
+ }
+ .proof-panel {
+ display: grid;
+ gap: 12px;
+ }
+ .proof-columns {
+ display: grid;
+ grid-template-columns: minmax(0, 0.95fr) minmax(360px, 1.05fr);
+ gap: 18px;
+ align-items: start;
+ }
+ .proof-checklist {
+ display: grid;
+ gap: 8px;
+ margin: 0;
+ padding: 0;
+ list-style: none;
+ }
+ .proof-checklist li {
+ display: grid;
+ grid-template-columns: 18px minmax(0, 1fr);
+ gap: 9px;
+ align-items: start;
+ padding: 9px 10px;
+ background: var(--surface-2);
+ border: 1px solid var(--line);
+ border-radius: var(--radius);
+ color: var(--muted);
+ font-size: 0.86rem;
+ }
+ .proof-checklist .ok { color: var(--accent); font-family: var(--font-mono); font-weight: 600; }
+ .proof-checklist .bad { color: var(--danger); font-family: var(--font-mono); font-weight: 600; }
+ .proof-table {
+ width: 100%;
+ border-collapse: collapse;
+ background: var(--surface-2);
+ border: 1px solid var(--line);
+ border-radius: var(--radius);
+ overflow: hidden;
+ font-family: var(--font-mono);
+ font-size: 0.74rem;
+ }
+ .proof-table th,
+ .proof-table td {
+ padding: 8px 9px;
+ border-bottom: 1px solid var(--line);
+ text-align: left;
+ vertical-align: top;
+ }
+ .proof-table tr:last-child td { border-bottom: 0; }
+ .proof-table td { overflow-wrap: anywhere; }
+ .proof-table .hash-short { color: var(--ink); font-weight: 600; }
+ .proof-table .hash-full { color: var(--faint); display: block; margin-top: 3px; }
+ .proof-status {
+ color: var(--accent);
+ font-weight: 600;
+ white-space: nowrap;
+ }
+ .proof-status.fail { color: var(--danger); }
+ .proof-metric {
+ display: grid;
+ gap: 4px;
+ padding: 11px 12px;
+ background: var(--surface-2);
+ border: 1px solid var(--line);
+ border-radius: var(--radius);
+ min-width: 0;
+ }
+ .proof-metric span {
+ color: var(--muted);
+ font-family: var(--font-mono);
+ font-size: 0.68rem;
+ letter-spacing: 0.06em;
+ text-transform: uppercase;
+ }
+ .proof-metric strong {
+ font-family: var(--font-mono);
+ font-size: 0.86rem;
+ font-weight: 500;
+ overflow-wrap: anywhere;
+ }
+ .proof-report {
+ max-height: 360px;
+ overflow: auto;
+ white-space: pre-wrap;
+ overflow-wrap: anywhere;
+ background: var(--surface-2);
+ border: 1px solid var(--line);
+ border-radius: var(--radius);
+ padding: 14px;
+ margin: 0;
+ font-family: var(--font-mono);
+ font-size: 0.76rem;
+ line-height: 1.5;
+ }
+ .proof-report-head {
+ display: flex;
+ flex-wrap: wrap;
+ justify-content: space-between;
+ gap: 10px;
+ align-items: end;
+ }
+ .proof-tabs {
+ display: inline-flex;
+ flex-wrap: wrap;
+ gap: 6px;
+ }
+ .proof-tab {
+ border: 1px solid var(--line);
+ background: var(--surface-2);
+ border-radius: var(--radius-sm);
+ color: var(--muted);
+ font-family: var(--font-mono);
+ font-size: 0.68rem;
+ padding: 5px 7px;
+ text-transform: uppercase;
+ letter-spacing: 0.04em;
+ }
+ .proof-tab.active {
+ color: var(--ink);
+ border-color: var(--line-strong);
+ }
.burst-counts {
display: flex;
flex-wrap: wrap;
@@ -479,6 +623,7 @@
@media (max-width: 760px) {
.expectations { grid-template-columns: 1fr; }
.result-grid { grid-template-columns: 1fr; }
+ .proof-summary, .proof-columns { grid-template-columns: 1fr; }
.burst-row { grid-template-columns: 1fr; }
.burst-form { justify-content: flex-start; }
main { padding: 36px 0 64px; }
@@ -551,6 +696,14 @@ Production-like traffic mix
+
+
Result
Choose a scenario to send a real request through the demo services.
@@ -613,12 +766,15 @@ Result
const scenarioButtons = Array.from(document.querySelectorAll("[data-scenario]"));
const burstForm = document.getElementById("burst-form");
const burstButton = document.getElementById("burst-run");
+ const proofButton = document.getElementById("proof-run");
const burstControls = Array.from(burstForm.elements);
let burstInFlight = false;
+ let proofInFlight = false;
scenarioButtons.forEach(button => {
button.addEventListener("click", () => purchase(button.dataset.scenario, button));
});
burstForm.addEventListener("submit", runBurst);
+ proofButton.addEventListener("click", runProofLoop);
function esc(value) {
return String(value ?? "").replace(/[&<>"']/g, ch => ({
@@ -627,7 +783,7 @@ Result
}
function setLoading(activeButton, loading) {
- [...scenarioButtons, ...burstControls].forEach(control => {
+ [...scenarioButtons, ...burstControls, proofButton].forEach(control => {
control.disabled = loading;
control.setAttribute("aria-busy", loading ? "true" : "false");
control.querySelector?.(".spinner")?.remove();
@@ -637,6 +793,14 @@ Result
}
}
+ function burstPayloadFromForm() {
+ const formData = new FormData(burstForm);
+ return {
+ requests: Number(formData.get("requests")),
+ concurrency: Number(formData.get("concurrency"))
+ };
+ }
+
async function purchase(scenario, activeButton) {
const result = document.getElementById("result");
setLoading(activeButton, true);
@@ -664,11 +828,7 @@ Result
if (burstInFlight) return;
burstInFlight = true;
setLoading(burstButton, true);
- const formData = new FormData(burstForm);
- const payload = {
- requests: Number(formData.get("requests")),
- concurrency: Number(formData.get("concurrency"))
- };
+ const payload = burstPayloadFromForm();
const result = document.getElementById("result");
result.className = "result-empty";
result.textContent = "Posting demo signals and running production-like traffic through the checkout chain…";
@@ -689,6 +849,30 @@ Result
}
}
+ async function runProofLoop() {
+ if (proofInFlight) return;
+ proofInFlight = true;
+ setLoading(proofButton, true);
+ const result = document.getElementById("result");
+ result.className = "result-empty";
+ result.textContent = "Running alert → incident → triage → reports → scorecard. This takes a few seconds…";
+ try {
+ const resp = await fetch("/demo/proof", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(burstPayloadFromForm())
+ });
+ if (!resp.ok) throw new Error(await resp.text() || "proof loop failed");
+ renderProofLoop(await resp.json());
+ } catch (err) {
+ result.className = "result-empty";
+ result.textContent = "Error: " + err.message;
+ } finally {
+ proofInFlight = false;
+ setLoading(proofButton, false);
+ }
+ }
+
async function pollStory(scenario, data) {
if (!data.trace_id) return;
const attempts = 16;
@@ -793,6 +977,88 @@ Result
`;
}
+
+ function renderProofLoop(proof) {
+ const score = proof.scorecard || {};
+ const evidence = proof.evidence || {};
+ const markdown = proof.reports?.markdown || "Report not available";
+ const hashes = proof.hashes || {};
+ const hashOK = Object.values(hashes).length > 0 && Object.values(hashes).every(value => value === proof.report_hash);
+ const rootCause = "checkout:payment.charge:PMT_502";
+ const fullHash = proof.report_hash || "";
+ const shortHash = shortHashValue(fullHash);
+ const checklist = [
+ ["Alert accepted", Boolean(proof.alert_id), proof.alert_id || "not available"],
+ ["Incident opened", Boolean(proof.incident_id), proof.incident_id || "not available"],
+ ["Triage built", Boolean(proof.report_hash), shortHash || "not available"],
+ ["Read/tool/plan hashes agree", hashOK, "direct read, direct tool, plan template, repeat snapshot"],
+ ["Reports rendered", Boolean(markdown && proof.reports?.slack && proof.reports?.pagerduty), "Markdown, Slack Block Kit, PagerDuty note"],
+ ["Scorecard passed", Boolean(score.root_cause_accuracy && score.cause_classification_dependency && score.report_hash_stable), "root cause, dependency classification, hash stability"]
+ ];
+ const hashRows = [
+ ["read endpoint", hashes.read],
+ ["repeat snapshot", hashes.repeat],
+ ["direct tool", hashes.tool],
+ ["plan template", hashes.plan]
+ ].map(([label, value]) => `| ${esc(label)} | ${esc(shortHashValue(value) || "not available")}${esc(value || "")} | ${value === proof.report_hash ? "pass" : "fail"} |
`).join("");
+ const result = document.getElementById("result");
+ result.className = "bracketed";
+ result.innerHTML = `
+
+
+
Proof loop complete
+
Alert correlated. Root cause identified. Report verified.
+
Waylog correlated an external alert with an active incident, found ${esc(rootCause)}, and produced a cited operator report whose hash was stable across CLI, read, direct tool, and plan-template surfaces in this run.
+
+
+
Root cause${esc(rootCause)}
+
Stable report hash${esc(shortHash || "not available")}
+
Inflation avoided${esc(score.propagated_error_inflation_avoided ?? "not available")} propagated errors
+
+
+
+
+
+ ${checklist.map(([label, ok, detail]) => `- ${ok ? "✓" : "!"}${esc(label)}
${esc(detail)} `).join("")}
+
+
Evidence IDsalert=${esc(proof.alert_id || "not available")} · trace=${esc(evidence.trace_id || "not available")} · incident=${esc(proof.incident_id || "not available")}
+
Evidence completenessalert linked · dependency signal present · next checks ready
+
+ | Surface | Report hash | Status |
+ ${hashRows}
+
+
RCA scorecard${humanBool(score.root_cause_accuracy, "Root cause identified")} · ${humanBool(score.cause_classification_dependency, "Dependency cause confirmed")} · ${humanBool(score.report_hash_stable, "Hash stable")} · time to answer ${esc(score.time_to_answer_ms || 0)}ms
+
+
+
+
+
Operator reportCited, deterministic, LLM-free
+
+ Markdown
+ Slack JSON
+ PagerDuty note
+
+
+
${esc(markdown)}
+
+
+
`;
+ }
+
+ function shortHashValue(value) {
+ value = String(value || "");
+ if (value.length <= 22) return value;
+ if (value.startsWith("sha256:")) return "sha256:" + value.slice(7, 15) + "…" + value.slice(-6);
+ return value.slice(0, 12) + "…" + value.slice(-6);
+ }
+
+ function humanBool(ok, label) {
+ return ok ? label : "Missing " + label.toLowerCase();
+ }