From ff2a629a9725ba7acc64a07c869bb64459844fcd Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:11:18 -0300 Subject: [PATCH 01/26] fix(deps): migrate docker executor to moby/moby client+api modules Why: the runtime imported github.com/docker/docker v28.5.2+incompatible, a monolithic module bundling daemon, client, and API types. govulncheck flagged 5 daemon-side vulnerabilities (GO-2026-4883/4887/5617/5668/5746) reachable through package init chains even though the runtime only uses the client SDK, which blocked every Go commit via the pre-commit hook. What: switch to the split v2 modules github.com/moby/moby/client v0.5.0 and github.com/moby/moby/api v1.55.0, which hold only the client and API types and do not pull the vulnerable daemon module into the build graph. Adapt to the v2 client API (option-struct ContainerCreate/Start/Remove/ Kill/Attach/Wait, client.New replacing deprecated NewClientWithOpts). govulncheck ./... is clean; docker tests pass against a live daemon. 
--- go.mod | 29 +++++++------- go.sum | 74 ++++++++++++++--------------------- service/exec/docker/docker.go | 36 ++++++++++------- 3 files changed, 67 insertions(+), 72 deletions(-) diff --git a/go.mod b/go.mod index 70db0c648..1f834be80 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,6 @@ require ( github.com/charmbracelet/x/input v0.3.7 github.com/charmbracelet/x/term v0.2.2 github.com/coder/websocket v1.8.15 - github.com/docker/docker v28.5.2+incompatible github.com/expr-lang/expr v1.17.8 github.com/fatih/color v1.19.0 github.com/go-sql-driver/mysql v1.10.0 @@ -45,8 +44,10 @@ require ( github.com/lib/pq v1.12.3 github.com/mattn/go-sqlite3 v1.14.47 github.com/microcosm-cc/bluemonday v1.0.27 + github.com/moby/moby/api v1.55.0 + github.com/moby/moby/client v0.5.0 github.com/muesli/termenv v0.16.0 - github.com/prometheus/client_golang v1.23.0 + github.com/prometheus/client_golang v1.23.2 github.com/rabbitmq/amqp091-go v1.12.0 github.com/sergi/go-diff v1.4.0 github.com/spf13/cobra v1.10.2 @@ -122,6 +123,7 @@ require ( github.com/charmbracelet/harmonica v0.2.0 // indirect github.com/charmbracelet/x/cellbuf v0.0.15 // indirect github.com/charmbracelet/x/windows v0.2.1 // indirect + github.com/cilium/ebpf v0.17.3 // indirect github.com/clipperhouse/displaywidth v0.11.0 // indirect github.com/clipperhouse/uax29/v2 v2.7.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect @@ -135,7 +137,7 @@ require ( github.com/dblohm7/wingoes v0.0.0-20240119213807-a09d6be7affa // indirect github.com/distribution/reference v0.6.0 // indirect github.com/dlclark/regexp2 v1.10.0 // indirect - github.com/docker/go-connections v0.6.0 // indirect + github.com/docker/go-connections v0.7.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect @@ -146,7 +148,7 @@ require ( github.com/go-logr/logr v1.4.3 // indirect 
github.com/go-logr/stdr v1.2.2 // indirect github.com/goccy/go-yaml v1.19.2 // indirect - github.com/godbus/dbus/v5 v5.1.1-0.20230522191255-76236955d466 // indirect + github.com/godbus/dbus/v5 v5.2.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/mock v1.6.0 // indirect @@ -160,7 +162,7 @@ require ( github.com/hashicorp/go-metrics v0.5.4 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/go-sockaddr v1.0.7 // indirect - github.com/hashicorp/golang-lru v0.6.0 // indirect + github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/hdevalence/ed25519consensus v0.2.0 // indirect github.com/huin/goupnp v1.3.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -179,10 +181,9 @@ require ( github.com/mattn/go-runewidth v0.0.23 // indirect github.com/mdlayher/netlink v1.7.3-0.20250113171957-fbb4dce95f42 // indirect github.com/mdlayher/socket v0.5.0 // indirect - github.com/miekg/dns v1.1.68 // indirect + github.com/miekg/dns v1.1.72 // indirect github.com/mitchellh/go-ps v1.0.0 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect - github.com/moby/sys/atomicwriter v0.1.0 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -191,12 +192,11 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pires/go-proxyproto v0.8.1 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pkoukk/tiktoken-go v0.1.6 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.65.0 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + 
github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/procfs v0.20.1 // indirect github.com/richardlehane/mscfb v1.0.6 // indirect github.com/richardlehane/msoleps v1.0.6 // indirect github.com/rivo/uniseg v0.4.7 // indirect @@ -223,17 +223,18 @@ require ( gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect go.bytecodealliance.org v0.7.0 // indirect - go.etcd.io/bbolt v1.4.2 // indirect + go.etcd.io/bbolt v1.4.3 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/mock v0.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect + go.yaml.in/yaml/v2 v2.4.4 // indirect go4.org/mem v0.0.0-20240501181205-ae6ca9944745 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect - golang.org/x/mod v0.36.0 // indirect + golang.org/x/mod v0.37.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/sync v0.21.0 // indirect golang.org/x/text v0.38.0 // indirect @@ -242,8 +243,8 @@ require ( golang.zx2c4.com/wireguard/windows v0.5.3 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa // indirect - gotest.tools/v3 v3.5.2 // indirect gvisor.dev/gvisor v0.0.0-20260224225140-573d5e7127a8 // indirect + pgregory.net/rapid v1.3.0 // indirect ) tool go.uber.org/mock/mockgen diff --git a/go.sum b/go.sum index dd016b635..037405ab4 100644 --- a/go.sum +++ b/go.sum @@ -9,8 +9,6 @@ filippo.io/edwards25519 v1.2.0 
h1:crnVqOiS4jqYleHd9vaKZ+HKtHfllngJIiOpNpoJsjo= filippo.io/edwards25519 v1.2.0/go.mod h1:xzAOLCNug/yB62zG1bQ8uziwrIqIuxhctzJT18Q77mc= filippo.io/mkcert v1.4.4 h1:8eVbbwfVlaqUM7OwuftKc2nuYOoTDQWqsoXmzoXZdbc= filippo.io/mkcert v1.4.4/go.mod h1:VyvOchVuAye3BoUsPUOOofKygVwLV2KQMVFJNRq+1dA= -github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= -github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/CloudyKit/fastprinter v0.0.0-20200109182630-33d98a066a53 h1:sR+/8Yb4slttB4vD+b9btVEnWgL3Q00OBTzVT8B9C0c= @@ -120,8 +118,8 @@ github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSg github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= github.com/charmbracelet/x/windows v0.2.1 h1:3x7vnbpQrjpuq/4L+I4gNsG5htYoCiA5oe9hLjAij5I= github.com/charmbracelet/x/windows v0.2.1/go.mod h1:ptZp16h40gDYqs5TSawSVW+yiLB13j4kSMA0lSCHL0M= -github.com/cilium/ebpf v0.16.0 h1:+BiEnHL6Z7lXnlGUsXQPPAE7+kenAd4ES8MQ5min0Ok= -github.com/cilium/ebpf v0.16.0/go.mod h1:L7u2Blt2jMM/vLAVgjxluxtBKlz3/GWjB0dMOEngfwE= +github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= +github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8= @@ -134,8 +132,6 @@ github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG github.com/containerd/errdefs v1.0.0/go.mod 
h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= -github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= -github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/coreos/etcd v3.3.27+incompatible h1:QIudLb9KeBsE5zyYxd1mjzRSkzLg9Wf9QlRwFgd6oTA= github.com/coreos/etcd v3.3.27+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-iptables v0.7.1-0.20240112124308-65c67c9f46e6 h1:8h5+bWd7R6AYUslN6c6iuZWTKsKxUFDlpnmilO6R2n0= @@ -171,10 +167,8 @@ github.com/djherbis/times v1.6.0 h1:w2ctJ92J8fBvWPxugmXIv7Nz7Q3iDMKNx9v5ocVH20c= github.com/djherbis/times v1.6.0/go.mod h1:gOHeRAz2h+VJNZ5Gmc/o7iD9k4wW7NMVqieYCY99oc0= github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= -github.com/docker/docker v28.5.2+incompatible h1:DBX0Y0zAjZbSrm1uzOkdr1onVghKaftjlSWt4AFexzM= -github.com/docker/docker v28.5.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= -github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= +github.com/docker/go-connections v0.7.0 h1:6SsRfJddP22WMrCkj19x9WKjEDTB+ahsdiGYf0mN39c= +github.com/docker/go-connections v0.7.0/go.mod h1:no1qkHdjq7kLMGUXYAduOhYPSJxxvgWBh7ogVvptn3Q= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= @@ -220,8 +214,8 @@ github.com/go4org/plan9netshell 
v0.0.0-20250324183649-788daa080737 h1:cf60tHxREO github.com/go4org/plan9netshell v0.0.0-20250324183649-788daa080737/go.mod h1:MIS0jDzbU/vuM9MC4YnBITCv+RYuTRq8dJzmCrFsK9g= github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM= github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= -github.com/godbus/dbus/v5 v5.1.1-0.20230522191255-76236955d466 h1:sQspH8M4niEijh3PFscJRLDnkL547IeP7kpPe3uUhEg= -github.com/godbus/dbus/v5 v5.1.1-0.20230522191255-76236955d466/go.mod h1:ZiQxhyQ+bbbfxUKVvjfO498oPYvtYhZzycal3G/NHmU= +github.com/godbus/dbus/v5 v5.2.2 h1:TUR3TgtSVDmjiXOgAAyaZbYmIeP3DPkld3jgKGV8mXQ= +github.com/godbus/dbus/v5 v5.2.2/go.mod h1:3AAv2+hPq5rdnr5txxxRwiGjPXamgoIHgz9FPBfOp3c= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -290,8 +284,8 @@ github.com/hashicorp/go-sockaddr v1.0.7/go.mod h1:FZQbEYa1pxkQ7WLpyXJ6cbjpT8q0Yg github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= -github.com/hashicorp/golang-lru v0.6.0/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= +github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/memberlist v0.5.4 h1:40YY+3qq2tAUhZIMEK8kqusKZBBjdwJ3NUjvYkcxh74= github.com/hashicorp/memberlist v0.5.4/go.mod h1:OgN6xiIo6RlHUWk+ALjP9e32xWCoQrsOCmHrWCm2MWA= github.com/hashicorp/raft v1.7.3 h1:DxpEqZJysHN0wK+fviai5mFcSYsCkNpFUl1xpAW8Rbo= @@ 
-396,24 +390,20 @@ github.com/mdlayher/socket v0.5.0 h1:ilICZmJcQz70vrWVes1MFera4jGiWNocSkykwwoy3XI github.com/mdlayher/socket v0.5.0/go.mod h1:WkcBFfvyG8QENs5+hfQPl1X6Jpd2yeLIYgrGFmJiJxI= github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk= github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA= -github.com/miekg/dns v1.1.68 h1:jsSRkNozw7G/mnmXULynzMNIsgY2dHC8LO6U6Ij2JEA= -github.com/miekg/dns v1.1.68/go.mod h1:fujopn7TB3Pu3JM69XaawiU0wqjpL9/8xGop5UrTPps= +github.com/miekg/dns v1.1.72 h1:vhmr+TF2A3tuoGNkLDFK9zi36F2LS+hKTRW0Uf8kbzI= +github.com/miekg/dns v1.1.72/go.mod h1:+EuEPhdHOsfk6Wk5TT2CzssZdqkmFhf8r+aVyDEToIs= github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc= github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw= -github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs= -github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= -github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= -github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= -github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= +github.com/moby/moby/api v1.55.0 h1:2/sexvQyqIWS8pRSCFddBfpW2qE7vR7FCL+vN8pxwMc= +github.com/moby/moby/api v1.55.0/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.5.0 h1:5XhyPk2fuOWf6RlSFa3MkIIgDZkF25xToXW8Q/BH7cc= +github.com/moby/moby/client v0.5.0/go.mod h1:rcVpF8ncl9vo5gaIBdol6CnbEtSj1uxMvEV/UrykF/s= github.com/modern-go/concurrent 
v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= @@ -442,7 +432,6 @@ github.com/pires/go-proxyproto v0.8.1 h1:9KEixbdJfhrbtjpz/ZwCdWDD2Xem0NZ38qMYaAS github.com/pires/go-proxyproto v0.8.1/go.mod h1:ZKAAyp3cgy5Y5Mo4n9AlScrkCZwUy0g3Jf+slqQVcuU= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/sftp v1.13.6 h1:JFZT4XbOU7l77xGSpOdW+pwIMqP044IyjXX6FGyEKFo= github.com/pkg/sftp v1.13.6/go.mod h1:tz1ryNURKu77RL+GuCzmoJYxQczL3wLNNpPWagdg4Qk= @@ -456,8 +445,8 @@ github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5Fsn github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= 
-github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -467,15 +456,15 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8 github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/procfs v0.16.1 
h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/rabbitmq/amqp091-go v1.12.0 h1:V0v14Iqfs+MwHWihJt/nGS5Ulu0vw572b2Co3mwunkI= github.com/rabbitmq/amqp091-go v1.12.0/go.mod h1:Hy4jKW5kQART1u+JkDTF9YYOQUHXqMuhrgxOEeS7G4o= github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8= @@ -500,8 +489,6 @@ github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepq github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= -github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -597,8 +584,6 @@ github.com/wippyai/tree-sitter-sql v0.0.4 h1:0hJyjkV6/lhoGA5FJAaf96od2xyo+OJINSE github.com/wippyai/tree-sitter-sql v0.0.4/go.mod h1:QN9CfIO55fwQNVNk1p/7pjxrLk7iKxrQs42Zh6lrr84= github.com/wippyai/wapp v0.1.2 h1:fCoxKr9s3gk+pWx4XcnIQmOVgPiuimXf3QcE9mHbdmw= github.com/wippyai/wapp v0.1.2/go.mod h1:ndCkYR80+osLGbd7AFWlP+3DxwooR+R6cxQYPZhksg4= -github.com/wippyai/wasm-runtime v0.0.0-20260623160937-4f371ee0f68a h1:hNZ5ujZpmQgWlvk9mCaN1Dzvdq2wokrGUVJ8w4OG+Zc= -github.com/wippyai/wasm-runtime v0.0.0-20260623160937-4f371ee0f68a/go.mod h1:1AVYdZibORqlBez081Seakxv2u9IR+KIMrvlBKsk2bQ= 
github.com/wippyai/wasm-runtime v0.0.0-20260624004845-53368697b5c6 h1:KXsgwoyjcw6rLpltno8Vb7eV+Np0gvJk8s6evZ/vSec= github.com/wippyai/wasm-runtime v0.0.0-20260624004845-53368697b5c6/go.mod h1:1AVYdZibORqlBez081Seakxv2u9IR+KIMrvlBKsk2bQ= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= @@ -630,12 +615,12 @@ gitlab.com/opennota/wd v0.0.0-20180912061657-c5d65f63c638 h1:uPZaMiz6Sz0PZs3IZJW gitlab.com/opennota/wd v0.0.0-20180912061657-c5d65f63c638/go.mod h1:EGRJaqe2eO9XGmFtQCvV3Lm9NLico3UhFwUpCG/+mVU= go.bytecodealliance.org v0.7.0 h1:CTJ1eb5kFhBKHw1/xycxxz4SmVWNKXYHhrA78oLNXhY= go.bytecodealliance.org v0.7.0/go.mod h1:PCLMft5yTQsHT9oNPWlq0I6Qdmo6THvdky2AZHjNUkA= -go.etcd.io/bbolt v1.4.2 h1:IrUHp260R8c+zYx/Tm8QZr04CX+qWS5PGfPdevhdm1I= -go.etcd.io/bbolt v1.4.2/go.mod h1:Is8rSHO/b4f3XigBC0lL0+4FwAQv3HXEEIgFMuKHceM= +go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= +go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0 h1:ssfIgGNANqpVFCndZvcuyKbl0g+UAVcbBcqGkG28H0Y= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.64.0/go.mod h1:GQ/474YrbE4Jx8gZ4q5I4hrhUzM6UPzyrqJYV2AqPoQ= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0 h1:8tvICD4vSTOOsNrsI4Ljf6C+6UKvpTEH5XY3JMoyPoo= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0/go.mod h1:z9+yiacE0IHRqM4qFfkbt/JYlmYXgss8GY/jXoNuPJI= go.opentelemetry.io/otel v1.44.0 h1:JjwHmHpA4iZ3wBxluu2fbbE7j4kqlE8jXyAyPXH7HqU= go.opentelemetry.io/otel v1.44.0/go.mod h1:BMgjTHL9WPRlRjL2oZCBTL4whCGtXch2H4BhOPIAyYc= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.44.0 h1:SUplec5dp06reu1zaXmOXdvqH398taqrDXqUl99jxSc= @@ -674,8 +659,8 @@ go.uber.org/multierr 
v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= go4.org/mem v0.0.0-20240501181205-ae6ca9944745 h1:Tl++JLUCe4sxGu8cTpDzRLd3tN7US4hOxG5YpKCzkek= @@ -697,8 +682,8 @@ golang.org/x/image v0.41.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= -golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= +golang.org/x/mod v0.37.0 h1:vF1DjpVEshcIqoEaauuHebaLk1O1forxjxBaVn884JQ= +golang.org/x/mod v0.37.0/go.mod h1:m8S8VeM9r4dzDwjrKO0a1sZP3YjeMamRRlD+fmR2Q/0= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -745,7 +730,6 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220817070843-5a390386f1f2/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= @@ -814,6 +798,8 @@ honnef.co/go/tools v0.7.0 h1:w6WUp1VbkqPEgLz4rkBzH/CSU6HkoqNLp6GstyTx3lU= honnef.co/go/tools v0.7.0/go.mod h1:pm29oPxeP3P82ISxZDgIYeOaf9ta6Pi0EWvCFoLG2vc= howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= +pgregory.net/rapid v1.3.0 h1:vBvO0VSqti75J1jjYqpgPNBLKMd1+gxa9fYo7vk/Exc= +pgregory.net/rapid v1.3.0/go.mod h1:dPlE4OBBxgXPqkP79flB6sJL1dx5azpI7HQ9MY9Z7uk= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= software.sslmate.com/src/go-pkcs12 v0.4.0 h1:H2g08FrTvSFKUj+D309j1DPfk5APnIdAQAB8aEykJ5k= diff --git a/service/exec/docker/docker.go b/service/exec/docker/docker.go index bef7c1f9e..a6233be4f 100644 --- a/service/exec/docker/docker.go +++ b/service/exec/docker/docker.go @@ -10,11 +10,11 @@ import ( "strings" "sync" - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/mount" - "github.com/docker/docker/api/types/network" - "github.com/docker/docker/client" - "github.com/docker/docker/pkg/stdcopy" + "github.com/moby/moby/api/pkg/stdcopy" + "github.com/moby/moby/api/types/container" + "github.com/moby/moby/api/types/mount" + "github.com/moby/moby/api/types/network" + "github.com/moby/moby/client" execapi 
"github.com/wippyai/runtime/api/service/exec" "go.uber.org/zap" ) @@ -53,12 +53,12 @@ func NewDockerExecutor(log *zap.Logger, config *execapi.DockerExecutorConfig) (* return nil, err } - opts := []client.Opt{client.FromEnv, client.WithAPIVersionNegotiation()} + opts := []client.Opt{client.FromEnv} if config.Host != "" { opts = append(opts, client.WithHost(config.Host)) } - cli, err := client.NewClientWithOpts(opts...) + cli, err := client.New(opts...) if err != nil { return nil, NewDockerClientError(err) } @@ -231,7 +231,11 @@ func (p *Process) Start() error { Tty: false, } - resp, err := p.cli.ContainerCreate(ctx, config, hostConfig, &network.NetworkingConfig{}, nil, "") + resp, err := p.cli.ContainerCreate(ctx, client.ContainerCreateOptions{ + Config: config, + HostConfig: hostConfig, + NetworkingConfig: &network.NetworkingConfig{}, + }) if err != nil { return NewContainerCreateError(err) } @@ -239,14 +243,14 @@ func (p *Process) Start() error { p.containerID = resp.ID p.log.Debug("container created", zap.String("id", p.containerID)) - attachResp, err := p.cli.ContainerAttach(ctx, p.containerID, container.AttachOptions{ + attachResp, err := p.cli.ContainerAttach(ctx, p.containerID, client.ContainerAttachOptions{ Stream: true, Stdin: true, Stdout: true, Stderr: true, }) if err != nil { - _ = p.cli.ContainerRemove(ctx, p.containerID, container.RemoveOptions{Force: true}) + _, _ = p.cli.ContainerRemove(ctx, p.containerID, client.ContainerRemoveOptions{Force: true}) return NewContainerAttachError(err) } @@ -266,9 +270,9 @@ func (p *Process) Start() error { } }() - if err := p.cli.ContainerStart(ctx, p.containerID, container.StartOptions{}); err != nil { + if _, err := p.cli.ContainerStart(ctx, p.containerID, client.ContainerStartOptions{}); err != nil { attachResp.Close() - _ = p.cli.ContainerRemove(ctx, p.containerID, container.RemoveOptions{Force: true}) + _, _ = p.cli.ContainerRemove(ctx, p.containerID, client.ContainerRemoveOptions{Force: true}) return 
NewContainerStartError(err) } @@ -292,7 +296,7 @@ func (p *Process) Signal(sig int) error { p.mu.RUnlock() sigName := signalName(sig) - err := p.cli.ContainerKill(context.Background(), containerID, sigName) + _, err := p.cli.ContainerKill(context.Background(), containerID, client.ContainerKillOptions{Signal: sigName}) if err != nil { if strings.Contains(err.Error(), "is not running") { return ErrContainerStopped @@ -350,7 +354,11 @@ func (p *Process) Wait() error { containerID := p.containerID p.mu.RUnlock() - statusCh, errCh := p.cli.ContainerWait(context.Background(), containerID, container.WaitConditionNotRunning) + waitResult := p.cli.ContainerWait(context.Background(), containerID, client.ContainerWaitOptions{ + Condition: container.WaitConditionNotRunning, + }) + statusCh := waitResult.Result + errCh := waitResult.Error var exitCode int64 select { From e5d910ef6b070d3333bd2998841d405da7e7d422 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:11:42 -0300 Subject: [PATCH 02/26] fix(otel): flush tracer provider on shutdown Why: the OTel boot component's Stop returned nil and discarded the TracerProvider returned by InitializeProvider. The BatchSpanProcessor (queue 512, batch 128, 2s flush) was never shut down, so up to 512 in-flight spans were silently dropped on every process exit. What: add otel.ShutdownTracerProvider mirroring the existing meter shutdown, and call it from the OTel() boot component's Stop. Add a newShutdownTracerProviderError constructor. The provider-level test asserts all queued spans reach the exporter after shutdown. 
--- boot/components/otel/otel.go | 14 +++++++--- service/otel/errors.go | 7 +++++ service/otel/provider.go | 17 ++++++++++++ service/otel/provider_test.go | 50 +++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/boot/components/otel/otel.go b/boot/components/otel/otel.go index f1765a0c3..120b180aa 100644 --- a/boot/components/otel/otel.go +++ b/boot/components/otel/otel.go @@ -9,10 +9,13 @@ import ( logapi "github.com/wippyai/runtime/api/logs" otelapi "github.com/wippyai/runtime/api/service/otel" "github.com/wippyai/runtime/service/otel" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) func OTel() boot.Component { + var tp trace.TracerProvider + return boot.New(boot.P{ Name: Name, Load: func(ctx context.Context) (context.Context, error) { @@ -35,7 +38,8 @@ func OTel() boot.Component { return ctx, nil } - tp, err := otel.InitializeProvider(ctx, cfg, logger) + var err error + tp, err = otel.InitializeProvider(ctx, cfg, logger) if err != nil { return ctx, NewOTELInitError(err) } @@ -53,8 +57,12 @@ func OTel() boot.Component { Start: func(_ context.Context) error { return nil }, - Stop: func(_ context.Context) error { - return nil + Stop: func(ctx context.Context) error { + logger := logapi.GetLogger(ctx).Named("otel") + if tp == nil { + return nil + } + return otel.ShutdownTracerProvider(ctx, tp, logger) }, }) } diff --git a/service/otel/errors.go b/service/otel/errors.go index f77f57811..7f94425fc 100644 --- a/service/otel/errors.go +++ b/service/otel/errors.go @@ -40,3 +40,10 @@ func newShutdownMeterProviderError(cause error) error { WithDetails(attrs.NewBagFrom(map[string]any{"cause": cause.Error()})). WithCause(cause) } + +func newShutdownTracerProviderError(cause error) error { + return apierror.New(apierror.Internal, "failed to shutdown tracer provider"). + WithRetryable(apierror.False). + WithDetails(attrs.NewBagFrom(map[string]any{"cause": cause.Error()})). 
+ WithCause(cause) +} diff --git a/service/otel/provider.go b/service/otel/provider.go index 310736d10..dc8e3bb94 100644 --- a/service/otel/provider.go +++ b/service/otel/provider.go @@ -301,3 +301,20 @@ func ShutdownMeterProvider(ctx context.Context, mp metric.MeterProvider, logger } return nil } + +// ShutdownTracerProvider gracefully shuts down the tracer provider, flushing +// any spans still held by the BatchSpanProcessor. Without this call the +// processor's bounded queue is dropped on exit. +func ShutdownTracerProvider(ctx context.Context, tp trace.TracerProvider, logger *zap.Logger) error { + if logger == nil { + logger = zap.NewNop() + } + if sdkTP, ok := tp.(*sdktrace.TracerProvider); ok { + logger.Debug("shutting down OTEL tracer provider") + if err := sdkTP.Shutdown(ctx); err != nil { + return newShutdownTracerProviderError(err) + } + logger.Debug("OTEL tracer provider shutdown complete") + } + return nil +} diff --git a/service/otel/provider_test.go b/service/otel/provider_test.go index 53cd94ac6..2eb697750 100644 --- a/service/otel/provider_test.go +++ b/service/otel/provider_test.go @@ -4,6 +4,8 @@ package otel import ( "context" + "fmt" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -15,6 +17,28 @@ import ( "go.uber.org/zap" ) +// recordingExporter records every exported span and does not clear on shutdown, +// so a test can prove the BatchSpanProcessor flushed its queue. 
+type recordingExporter struct { + mu sync.Mutex + spans int +} + +func (r *recordingExporter) ExportSpans(_ context.Context, spans []sdktrace.ReadOnlySpan) error { + r.mu.Lock() + defer r.mu.Unlock() + r.spans += len(spans) + return nil +} + +func (r *recordingExporter) Shutdown(context.Context) error { return nil } + +func (r *recordingExporter) Count() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.spans +} + func TestInitializeProvider_Disabled(t *testing.T) { cfg := otelapi.Config{ Enabled: false, @@ -236,6 +260,32 @@ func TestShutdownMeterProvider_NoopProvider(t *testing.T) { assert.NoError(t, err) } +func TestShutdownTracerProvider_FlushesSpans(t *testing.T) { + exp := &recordingExporter{} + tp := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exp), + sdktrace.WithSampler(sdktrace.AlwaysSample()), + ) + tracer := tp.Tracer("test") + + ctx := context.Background() + for i := 0; i < 10; i++ { + _, span := tracer.Start(ctx, fmt.Sprintf("op-%d", i)) + span.End() + } + + require.Zero(t, exp.Count(), "spans must still be queued before shutdown") + + require.NoError(t, ShutdownTracerProvider(context.Background(), tp, zap.NewNop())) + + require.Equal(t, 10, exp.Count(), "shutdown must flush all queued spans to the exporter") +} + +func TestShutdownTracerProvider_NoopProvider(t *testing.T) { + err := ShutdownTracerProvider(context.Background(), noop.NewTracerProvider(), zap.NewNop()) + assert.NoError(t, err) +} + func TestProviderUsesBoundedBatcher(t *testing.T) { cfg := otelapi.Config{ Enabled: true, From 5ab2184c0dd093b2448a67d2641d185403128d8f Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:18:22 -0300 Subject: [PATCH 03/26] fix(metrics): enable function-call metrics interceptor Why: the function-call metrics interceptor (wippy_function_calls, wippy_function_duration, wippy_function_in_flight) was written and fully unit-tested but commented out at boot, and its component file was an empty stub. 
cfg.Interceptor.Enabled was parsed and ignored. Every Lua function invocation was invisible to operators. What: implement the metrics.Interceptor() boot component mirroring the OTel interceptor wiring, register the FunctionInterceptor at order 50 (wraps the tracing interceptor at 100), default interceptor.enabled to true, and uncomment it in All(). Tests assert registration, name, order, and that invoking the registered interceptor emits wippy_function_calls. --- boot/components/metrics/all.go | 2 +- boot/components/metrics/constants.go | 7 ++ boot/components/metrics/interceptor.go | 57 ++++++++++ boot/components/metrics/interceptor_test.go | 112 ++++++++++++++++++++ 4 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 boot/components/metrics/interceptor_test.go diff --git a/boot/components/metrics/all.go b/boot/components/metrics/all.go index 12d6e7bb5..eb1cf53bc 100644 --- a/boot/components/metrics/all.go +++ b/boot/components/metrics/all.go @@ -7,6 +7,6 @@ import "github.com/wippyai/runtime/api/boot" func All() []boot.Component { return []boot.Component{ Metrics(), - // MetricsInterceptor(), + Interceptor(), } } diff --git a/boot/components/metrics/constants.go b/boot/components/metrics/constants.go index ebd83fc43..f65178a28 100644 --- a/boot/components/metrics/constants.go +++ b/boot/components/metrics/constants.go @@ -6,3 +6,10 @@ import "github.com/wippyai/runtime/api/boot" // Name is the component name for metrics. const Name boot.Name = "metrics" + +// InterceptorName is the component name for the function-call metrics interceptor. +const InterceptorName boot.Name = "metrics-interceptor" + +// interceptorName is the external dependency name of the function interceptor +// registry component (boot/components/system.Interceptor). 
+const interceptorName boot.Name = "interceptor" diff --git a/boot/components/metrics/interceptor.go b/boot/components/metrics/interceptor.go index 05dd08dea..f88d5abd9 100644 --- a/boot/components/metrics/interceptor.go +++ b/boot/components/metrics/interceptor.go @@ -1,3 +1,60 @@ // SPDX-License-Identifier: MPL-2.0 package metrics + +import ( + "context" + + "github.com/wippyai/runtime/api/boot" + apiinterceptor "github.com/wippyai/runtime/api/function" + metricsapi "github.com/wippyai/runtime/api/metrics" + metricsinterceptor "github.com/wippyai/runtime/service/metrics/interceptor" +) + +// Interceptor wires the function-call metrics interceptor +// (wippy_function_calls / wippy_function_duration / wippy_function_in_flight) +// into the function interceptor registry. It runs at order 50 so the duration +// timer wraps the full call, including the OTel tracing interceptor (order 100). +func Interceptor() boot.Component { + return boot.New(boot.P{ + Name: InterceptorName, + DependsOn: []boot.Name{Name, interceptorName}, + Load: func(ctx context.Context) (context.Context, error) { + if !loadInterceptorEnabled(ctx) { + return ctx, nil + } + + collector := metricsapi.GetCollector(ctx) + if collector == nil { + return ctx, nil + } + + registry := apiinterceptor.GetInterceptorRegistry(ctx) + if registry == nil { + return ctx, nil + } + + if err := registry.Register("metrics", metricsinterceptor.NewFunctionInterceptor(collector, true), 50); err != nil { + return ctx, err + } + + return ctx, nil + }, + }) +} + +// loadInterceptorEnabled reads metrics.interceptor.enabled, defaulting to true +// so function-call metrics emit out of the box. 
+func loadInterceptorEnabled(ctx context.Context) bool { + bootCfg := boot.GetConfig(ctx) + if bootCfg == nil { + return true + } + + metricsCfg := bootCfg.Sub("metrics") + if metricsCfg == nil { + return true + } + + return metricsCfg.GetBool("interceptor.enabled", true) +} diff --git a/boot/components/metrics/interceptor_test.go b/boot/components/metrics/interceptor_test.go new file mode 100644 index 000000000..3391b0369 --- /dev/null +++ b/boot/components/metrics/interceptor_test.go @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: MPL-2.0 + +package metrics + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/wippyai/runtime/api/boot" + ctxapi "github.com/wippyai/runtime/api/context" + "github.com/wippyai/runtime/api/function" + logapi "github.com/wippyai/runtime/api/logs" + metricsapi "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/registry" + "github.com/wippyai/runtime/api/runtime" + "github.com/wippyai/runtime/internal/telemetrytest" + metricsinterceptor "github.com/wippyai/runtime/service/metrics/interceptor" + "go.uber.org/zap" +) + +type mockInterceptorRegistry struct { + registrations []registeredInterceptor +} + +type registeredInterceptor struct { + interceptor function.Interceptor + name string + order int +} + +func (m *mockInterceptorRegistry) Register(name string, i function.Interceptor, order int) error { + m.registrations = append(m.registrations, registeredInterceptor{name: name, interceptor: i, order: order}) + return nil +} + +func (m *mockInterceptorRegistry) Unregister(string) error { return nil } + +func (m *mockInterceptorRegistry) Execute(ctx context.Context, f function.Func, task runtime.Task) (*runtime.Result, error) { + return f(ctx, task) +} + +func TestMetricsInterceptor_RegistersWhenEnabled(t *testing.T) { + component := Interceptor() + assert.Equal(t, InterceptorName, component.Name()) + + ctx := ctxapi.NewRootContext() + ctx = 
logapi.WithLogger(ctx, zap.NewNop()) + ctx = boot.WithConfig(ctx, boot.NewConfig(boot.WithSection("metrics", map[string]any{ + "interceptor.enabled": true, + }))) + rec := telemetrytest.NewRecorder() + ctx = metricsapi.WithCollector(ctx, rec) + reg := &mockInterceptorRegistry{} + ctx = function.WithInterceptorRegistry(ctx, reg) + + newCtx, err := component.Load(ctx) + require.NoError(t, err) + require.NotNil(t, newCtx) + require.Len(t, reg.registrations, 1) + + r := reg.registrations[0] + assert.Equal(t, "metrics", r.name) + assert.Equal(t, 50, r.order) + + fi, ok := r.interceptor.(*metricsinterceptor.FunctionInterceptor) + require.True(t, ok, "registered interceptor must be *FunctionInterceptor") + + funcID := registry.NewID("ns", "test_func") + task := runtime.Task{ID: funcID} + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { + return &runtime.Result{}, nil + } + _, err = fi.Handle(context.Background(), task, next) + require.NoError(t, err) + assert.Equal(t, 1.0, rec.CounterValue(metricsinterceptor.FunctionCalls, + metricsapi.Labels{"function_id": funcID.String(), "status": "success"})) +} + +func TestMetricsInterceptor_NotRegisteredWhenDisabled(t *testing.T) { + component := Interceptor() + + ctx := ctxapi.NewRootContext() + ctx = logapi.WithLogger(ctx, zap.NewNop()) + ctx = boot.WithConfig(ctx, boot.NewConfig(boot.WithSection("metrics", map[string]any{ + "interceptor.enabled": false, + }))) + ctx = metricsapi.WithCollector(ctx, telemetrytest.NewRecorder()) + reg := &mockInterceptorRegistry{} + ctx = function.WithInterceptorRegistry(ctx, reg) + + _, err := component.Load(ctx) + require.NoError(t, err) + assert.Empty(t, reg.registrations) +} + +func TestMetricsInterceptor_NoCollector(t *testing.T) { + component := Interceptor() + + ctx := ctxapi.NewRootContext() + ctx = logapi.WithLogger(ctx, zap.NewNop()) + ctx = boot.WithConfig(ctx, boot.NewConfig(boot.WithSection("metrics", map[string]any{ + "interceptor.enabled": true, + }))) + reg 
:= &mockInterceptorRegistry{} + ctx = function.WithInterceptorRegistry(ctx, reg) + + _, err := component.Load(ctx) + require.NoError(t, err) + assert.Empty(t, reg.registrations, "no registration without a collector") +} From 556c1cd68733ad74678f4da2b2b70e68e2f50896 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:22:01 -0300 Subject: [PATCH 04/26] test(telemetrytest): real span harness and gauge regression test Why: every OTel test used noop tracers and only checked wiring, never that a span with the expected name, kind, attributes, or status was emitted. The audit also flagged the Recorder gauges as unfaithful. What: add a NewTracerProvider helper returning a real SDK provider wired to an in-memory SpanRecorder, plus assertion helpers (SpanCount, MustSpanNamed, SpanKind, SpanStatus, SpanHasStringAttr, SpanAttr, TraceID). Add TestRecorder_GaugeSequence: empirical check confirmed the Recorder's GaugeSet+GaugeInc/Dec already compose correctly, so this is a regression guard rather than a fix. 
--- internal/telemetrytest/otel.go | 81 +++++++++++++++++++++++++ internal/telemetrytest/otel_test.go | 40 ++++++++++++ internal/telemetrytest/recorder_test.go | 23 +++++++ 3 files changed, 144 insertions(+) create mode 100644 internal/telemetrytest/otel.go create mode 100644 internal/telemetrytest/otel_test.go diff --git a/internal/telemetrytest/otel.go b/internal/telemetrytest/otel.go new file mode 100644 index 000000000..1f0f2528f --- /dev/null +++ b/internal/telemetrytest/otel.go @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MPL-2.0 + +package telemetrytest + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" + "go.opentelemetry.io/otel/trace" +) + +// NewTracerProvider returns a real TracerProvider wired to an in-memory +// SpanRecorder, so tests can assert on the spans that were actually emitted +// instead of relying on noop tracers that only verify wiring. +func NewTracerProvider() (*sdktrace.TracerProvider, *tracetest.SpanRecorder) { + sr := tracetest.NewSpanRecorder() + tp := sdktrace.NewTracerProvider(sdktrace.WithSpanProcessor(sr)) + return tp, sr +} + +// SpanCount asserts the total number of ended spans recorded by sr. +func SpanCount(t *testing.T, sr *tracetest.SpanRecorder, want int) { + t.Helper() + assert.Lenf(t, sr.Ended(), want, "expected %d ended span(s)", want) +} + +// MustSpanNamed returns the single ended span with the given name, failing the +// test if there is not exactly one match. 
+func MustSpanNamed(t *testing.T, sr *tracetest.SpanRecorder, name string) sdktrace.ReadOnlySpan { + t.Helper() + var matches []sdktrace.ReadOnlySpan + for _, s := range sr.Ended() { + if s.Name() == name { + matches = append(matches, s) + } + } + require.Lenf(t, matches, 1, "expected exactly 1 span named %q, got %d", name, len(matches)) + return matches[0] +} + +// SpanKind asserts the span's kind. +func SpanKind(t *testing.T, span sdktrace.ReadOnlySpan, want trace.SpanKind) { + t.Helper() + assert.Equalf(t, want, span.SpanKind(), "span %q kind", span.Name()) +} + +// SpanStatus asserts the span's status code. +func SpanStatus(t *testing.T, span sdktrace.ReadOnlySpan, want codes.Code) { + t.Helper() + assert.Equalf(t, want, span.Status().Code, "span %q status code", span.Name()) +} + +// SpanAttr returns the value of the named attribute and whether it was present. +func SpanAttr(span sdktrace.ReadOnlySpan, key string) (attribute.Value, bool) { + for _, kv := range span.Attributes() { + if string(kv.Key) == key { + return kv.Value, true + } + } + return attribute.Value{}, false +} + +// SpanHasStringAttr asserts the span carries a string attribute key==want. +func SpanHasStringAttr(t *testing.T, span sdktrace.ReadOnlySpan, key, want string) { + t.Helper() + v, ok := SpanAttr(span, key) + if !ok { + t.Fatalf("span %q has no attribute %q", span.Name(), key) + } + assert.Equalf(t, want, v.AsString(), "span %q attribute %q", span.Name(), key) +} + +// TraceID returns the trace ID carried by a recorded span. 
+func TraceID(span sdktrace.ReadOnlySpan) trace.TraceID { + return span.SpanContext().TraceID() +} diff --git a/internal/telemetrytest/otel_test.go b/internal/telemetrytest/otel_test.go new file mode 100644 index 000000000..883704435 --- /dev/null +++ b/internal/telemetrytest/otel_test.go @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MPL-2.0 + +package telemetrytest + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" +) + +func TestNewTracerProvider_RecordsSpans(t *testing.T) { + tp, sr := NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + tracer := tp.Tracer("test") + + ctx, parent := tracer.Start(context.Background(), "root", trace.WithSpanKind(trace.SpanKindServer)) + parent.SetAttributes(attribute.String("http.method", "GET")) + parent.SetStatus(codes.Ok, "") + parent.End() + + _, child := tracer.Start(ctx, "child", trace.WithSpanKind(trace.SpanKindInternal)) + child.End() + + SpanCount(t, sr, 2) + root := MustSpanNamed(t, sr, "root") + SpanKind(t, root, trace.SpanKindServer) + SpanStatus(t, root, codes.Ok) + SpanHasStringAttr(t, root, "http.method", "GET") + + v, ok := SpanAttr(root, "absent") + assert.False(t, ok, "absent attr should not be found") + _ = v + + childSpan := MustSpanNamed(t, sr, "child") + assert.Equal(t, TraceID(root), TraceID(childSpan), "parent and child share a trace ID") +} diff --git a/internal/telemetrytest/recorder_test.go b/internal/telemetrytest/recorder_test.go index 3d3d255b2..9f899b02a 100644 --- a/internal/telemetrytest/recorder_test.go +++ b/internal/telemetrytest/recorder_test.go @@ -37,3 +37,26 @@ func TestRecorder_GaugeAndHistogram(t *testing.T) { t.Fatalf("want 2 observations, got %v", c) } } + +func TestRecorder_GaugeSequence(t *testing.T) { + r := NewRecorder() + labels := metrics.Labels{"pg": "g1"} + + r.GaugeSet("g", 5, labels) + r.GaugeInc("g", labels) + 
r.GaugeInc("g", labels) + r.GaugeDec("g", labels) + if v := r.GaugeValue("g", labels); v != 6 { + t.Fatalf("set(5)+inc+inc-dec: want 6, got %v", v) + } + + r.GaugeSet("g", 0, labels) + if v := r.GaugeValue("g", labels); v != 0 { + t.Fatalf("reset to 0: want 0, got %v", v) + } + + r.GaugeInc("g", labels) + if v := r.GaugeValue("g", labels); v != 1 { + t.Fatalf("inc after set(0): want 1, got %v", v) + } +} From 973248aef399ec41edab253b747944c2452408c8 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:23:47 -0300 Subject: [PATCH 05/26] test(otel): cover the OTel metrics bridge exporter Why: service/otel/metrics.go (MetricsExporter bridging the wippy collector to OTel instruments) had zero tests, including the explicit otelHistogramBuckets alignment that prevents the documented histogram_quantile corruption when Prometheus and OTLP paths merge. What: add counter, gauge, histogram (asserting bucket boundaries match Prometheus DefBuckets), and instrument-caching tests using a real sdkmetric MeterProvider with a ManualReader. 
--- service/otel/metrics_test.go | 120 +++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 service/otel/metrics_test.go diff --git a/service/otel/metrics_test.go b/service/otel/metrics_test.go new file mode 100644 index 000000000..2089b162c --- /dev/null +++ b/service/otel/metrics_test.go @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MPL-2.0 + +package otel + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + api "github.com/wippyai/runtime/api/metrics" + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +func newTestMeterProvider() (*sdkmetric.MeterProvider, *sdkmetric.ManualReader) { + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + return mp, reader +} + +func collectMetrics(t *testing.T, reader *sdkmetric.ManualReader) []metricdata.Metrics { + t.Helper() + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + require.NotEmpty(t, rm.ScopeMetrics, "expected at least one scope") + return rm.ScopeMetrics[0].Metrics +} + +func findMetric(t *testing.T, metrics []metricdata.Metrics, name string) metricdata.Metrics { + t.Helper() + for _, m := range metrics { + if m.Name == name { + return m + } + } + t.Fatalf("metric %q not found", name) + return metricdata.Metrics{} +} + +func assertAttributeValue(t *testing.T, set attribute.Set, key, want string) { + t.Helper() + for iter := set.Iter(); iter.Next(); { + kv := iter.Attribute() + if string(kv.Key) == key { + assert.Equalf(t, want, kv.Value.AsString(), "attribute %q", key) + return + } + } + t.Fatalf("attribute %q not found in data point", key) +} + +func TestMetricsExporter_Counter(t *testing.T) { + mp, reader := newTestMeterProvider() + defer func() { _ = mp.Shutdown(context.Background()) }() + exp := 
NewMetricsExporter(mp) + + require.NoError(t, exp.Record("wippy_x_total", api.TypeCounter, 5, api.Labels{"k": "v"})) + + m := findMetric(t, collectMetrics(t, reader), "wippy_x_total") + sum, ok := m.Data.(metricdata.Sum[float64]) + require.Truef(t, ok, "counter Data should be Sum[float64], got %T", m.Data) + require.Len(t, sum.DataPoints, 1) + assert.Equal(t, 5.0, sum.DataPoints[0].Value) + assertAttributeValue(t, sum.DataPoints[0].Attributes, "k", "v") +} + +func TestMetricsExporter_Gauge(t *testing.T) { + mp, reader := newTestMeterProvider() + defer func() { _ = mp.Shutdown(context.Background()) }() + exp := NewMetricsExporter(mp) + + require.NoError(t, exp.Record("wippy_q", api.TypeGauge, 42, api.Labels{"slot": "s1"})) + + m := findMetric(t, collectMetrics(t, reader), "wippy_q") + g, ok := m.Data.(metricdata.Gauge[float64]) + require.Truef(t, ok, "gauge Data should be Gauge[float64], got %T", m.Data) + require.Len(t, g.DataPoints, 1) + assert.Equal(t, 42.0, g.DataPoints[0].Value) + assertAttributeValue(t, g.DataPoints[0].Attributes, "slot", "s1") +} + +func TestMetricsExporter_HistogramBuckets(t *testing.T) { + mp, reader := newTestMeterProvider() + defer func() { _ = mp.Shutdown(context.Background()) }() + exp := NewMetricsExporter(mp) + + require.NoError(t, exp.Record("wippy_d_seconds", api.TypeHistogram, 0.7, api.Labels{})) + + m := findMetric(t, collectMetrics(t, reader), "wippy_d_seconds") + h, ok := m.Data.(metricdata.Histogram[float64]) + require.Truef(t, ok, "histogram Data should be Histogram[float64], got %T", m.Data) + require.Len(t, h.DataPoints, 1) + assert.Equal(t, uint64(1), h.DataPoints[0].Count) + assert.Equal(t, 0.7, h.DataPoints[0].Sum) + assert.Equal(t, otelHistogramBuckets, h.DataPoints[0].Bounds, + "histogram bounds must stay aligned with Prometheus DefBuckets to avoid histogram_quantile corruption") +} + +func TestMetricsExporter_InstrumentCaching(t *testing.T) { + mp, reader := newTestMeterProvider() + defer func() { _ = 
mp.Shutdown(context.Background()) }() + exp := NewMetricsExporter(mp) + + require.NoError(t, exp.Record("wippy_c_total", api.TypeCounter, 1, api.Labels{})) + require.NoError(t, exp.Record("wippy_c_total", api.TypeCounter, 2, api.Labels{})) + + var matches []metricdata.Metrics + for _, m := range collectMetrics(t, reader) { + if m.Name == "wippy_c_total" { + matches = append(matches, m) + } + } + require.Len(t, matches, 1, "two records for the same name must reuse one instrument") + + sum := matches[0].Data.(metricdata.Sum[float64]) + require.Len(t, sum.DataPoints, 1) + assert.Equal(t, 3.0, sum.DataPoints[0].Value, "counter must accumulate 1+2") +} From e42f4c389754161929b0bdcd9fbae3b6fdca682e Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:28:51 -0300 Subject: [PATCH 06/26] test(otel): assert real spans for HTTP, interceptor, and queue Why: the OTel service tests used noop tracers and only checked that handlers/next were invoked, never that a span with the expected name, kind, attributes, status, or trace linkage was emitted. The queue publish/extract round-trip was entirely untested. What: add real-span tests using the in-memory harness. HTTP middleware asserts the GET Server span with http.method/http.route and traceparent response injection plus parent extraction. The interceptor asserts span kinds (root=Server, parent=Internal, queue delivery=Consumer with messaging.operation/message.id and trace continuation) and Error status on failure. Queue tests assert the Producer span, header injection, and extract-from-delivery producing a child that shares the producer trace ID. 
--- service/otel/queue_test.go | 71 ++++++++++++ service/otel/service_span_test.go | 172 ++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 service/otel/queue_test.go create mode 100644 service/otel/service_span_test.go diff --git a/service/otel/queue_test.go b/service/otel/queue_test.go new file mode 100644 index 000000000..32fe01ca9 --- /dev/null +++ b/service/otel/queue_test.go @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MPL-2.0 + +package otel + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + attrsapi "github.com/wippyai/runtime/api/attrs" + queueapi "github.com/wippyai/runtime/api/queue" + "github.com/wippyai/runtime/api/registry" + "github.com/wippyai/runtime/internal/telemetrytest" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" +) + +func TestPublishInterceptor_ProducerSpan(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + pi := NewPublishInterceptor(tp.Tracer("test")) + queueID := registry.NewID("ns", "orders") + msg := &queueapi.Message{ID: "m1", Headers: attrsapi.NewBag()} + + called := false + next := func(_ context.Context, _ registry.ID, _ []*queueapi.Message) error { + called = true + return nil + } + + require.NoError(t, pi.Handle(context.Background(), queueID, []*queueapi.Message{msg}, next)) + require.True(t, called) + + span := telemetrytest.MustSpanNamed(t, sr, "ns:orders.publish") + telemetrytest.SpanKind(t, span, trace.SpanKindProducer) + telemetrytest.SpanHasStringAttr(t, span, "messaging.operation", "publish") + telemetrytest.SpanHasStringAttr(t, span, "messaging.destination.name", "ns:orders") + telemetrytest.SpanHasStringAttr(t, span, "messaging.message.id", "m1") + + tpVal, ok := msg.Headers.Get("traceparent") + require.True(t, ok, "traceparent must be injected into the message headers") + tpStr, ok := 
tpVal.(string) + require.True(t, ok) + assert.NotEmpty(t, tpStr) +} + +func TestExtractFromDelivery_LinksChild(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + tracer := tp.Tracer("test") + parentCtx, parent := tracer.Start(context.Background(), "publish.parent") + defer parent.End() + + msg := &queueapi.Message{ID: "m1", Headers: attrsapi.NewBag()} + otel.GetTextMapPropagator().Inject(parentCtx, &MessageHeaderCarrier{headers: msg.Headers}) + + extractedCtx, hasSpan := extractFromDelivery(context.Background(), &queueapi.Delivery{Message: msg}) + require.True(t, hasSpan, "extract must recover a valid span context") + + _, child := tracer.Start(extractedCtx, "consume") + child.End() + + consumed := telemetrytest.MustSpanNamed(t, sr, "consume") + assert.Equal(t, parent.SpanContext().TraceID(), telemetrytest.TraceID(consumed), + "consumer span must share the producer trace ID") +} diff --git a/service/otel/service_span_test.go b/service/otel/service_span_test.go new file mode 100644 index 000000000..ba42bd6ba --- /dev/null +++ b/service/otel/service_span_test.go @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MPL-2.0 + +package otel + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + attrsapi "github.com/wippyai/runtime/api/attrs" + ctxapi "github.com/wippyai/runtime/api/context" + queueapi "github.com/wippyai/runtime/api/queue" + "github.com/wippyai/runtime/api/registry" + "github.com/wippyai/runtime/api/runtime" + httpapi "github.com/wippyai/runtime/api/service/http" + otelapi "github.com/wippyai/runtime/api/service/otel" + "github.com/wippyai/runtime/internal/telemetrytest" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" + "go.uber.org/zap" +) + +// 
useTraceContextPropagator installs the W3C TraceContext+Baggage propagator +// as the global for the duration of a test, restoring the previous value after. +func useTraceContextPropagator(t *testing.T) { + t.Helper() + prev := otel.GetTextMapPropagator() + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{})) + t.Cleanup(func() { otel.SetTextMapPropagator(prev) }) +} + +func TestHTTPMiddleware_ServerSpan(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{HTTP: otelapi.HTTPConfig{Enabled: true, ExtractHeaders: true, InjectHeaders: true}}, zap.NewNop(), tp) + wrapped := svc.HTTPMiddleware()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/users/123", nil) + fcCtx, fc := ctxapi.OpenFrameContext(req.Context()) + defer ctxapi.ReleaseFrameContext(fc) + require.NoError(t, httpapi.SetRouteLabel(fcCtx, "/users/:id")) + req = req.WithContext(fcCtx) + + rec := httptest.NewRecorder() + wrapped.ServeHTTP(rec, req) + + telemetrytest.SpanCount(t, sr, 1) + span := telemetrytest.MustSpanNamed(t, sr, "GET /users/:id") + telemetrytest.SpanKind(t, span, trace.SpanKindServer) + telemetrytest.SpanHasStringAttr(t, span, "http.method", "GET") + telemetrytest.SpanHasStringAttr(t, span, "http.route", "/users/:id") + assert.NotEmpty(t, rec.Header().Get("traceparent"), "traceparent must be injected into response headers") +} + +func TestHTTPMiddleware_ExtractsParent(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + tracer := tp.Tracer("test") + parentCtx, parent := tracer.Start(context.Background(), "remote.parent") + defer parent.End() + + svc := 
NewService(otelapi.Config{HTTP: otelapi.HTTPConfig{Enabled: true, ExtractHeaders: true}}, zap.NewNop(), tp) + wrapped := svc.HTTPMiddleware()(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {})) + + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil) + otel.GetTextMapPropagator().Inject(parentCtx, propagation.HeaderCarrier(req.Header)) + + wrapped.ServeHTTP(httptest.NewRecorder(), req) + + server := telemetrytest.MustSpanNamed(t, sr, "GET unmatched") + assert.Equal(t, parent.SpanContext().TraceID(), telemetrytest.TraceID(server), + "server span must continue the parent trace") +} + +func TestInterceptor_SpanKind_RootIsServer(t *testing.T) { + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + inter := &interceptor{tracer: tp.Tracer("test"), logger: zap.NewNop()} + task := runtime.Task{ID: registry.NewID("ns", "func")} + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { + return &runtime.Result{}, nil + } + + _, err := inter.Handle(context.Background(), task, next) + require.NoError(t, err) + + span := telemetrytest.MustSpanNamed(t, sr, "ns:func") + telemetrytest.SpanKind(t, span, trace.SpanKindServer) +} + +func TestInterceptor_SpanKind_ParentIsInternal(t *testing.T) { + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + tracer := tp.Tracer("test") + baseCtx, parent := tracer.Start(context.Background(), "parent") + defer parent.End() + + fcCtx, fc := ctxapi.OpenFrameContext(baseCtx) + defer ctxapi.ReleaseFrameContext(fc) + require.NoError(t, otelapi.SetSpan(fcCtx, parent)) + + inter := &interceptor{tracer: tracer, logger: zap.NewNop()} + task := runtime.Task{ID: registry.NewID("ns", "func")} + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { return &runtime.Result{}, nil } + + _, err := inter.Handle(fcCtx, task, next) + require.NoError(t, err) + + span := 
telemetrytest.MustSpanNamed(t, sr, "ns:func") + telemetrytest.SpanKind(t, span, trace.SpanKindInternal) +} + +func TestInterceptor_SpanKind_QueueDeliveryIsConsumer(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + tracer := tp.Tracer("test") + parentCtx, parent := tracer.Start(context.Background(), "publish.parent") + defer parent.End() + + msg := &queueapi.Message{ID: "m1", Headers: attrsapi.NewBag()} + otel.GetTextMapPropagator().Inject(parentCtx, &MessageHeaderCarrier{headers: msg.Headers}) + delivery := &queueapi.Delivery{Message: msg} + + inter := &interceptor{tracer: tracer, logger: zap.NewNop()} + task := runtime.Task{ + ID: registry.NewID("ns", "func"), + Context: []ctxapi.Pair{{Value: delivery}}, + } + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { return &runtime.Result{}, nil } + + _, err := inter.Handle(context.Background(), task, next) + require.NoError(t, err) + + span := telemetrytest.MustSpanNamed(t, sr, "ns:func") + telemetrytest.SpanKind(t, span, trace.SpanKindConsumer) + telemetrytest.SpanHasStringAttr(t, span, "messaging.operation", "process") + telemetrytest.SpanHasStringAttr(t, span, "messaging.message.id", "m1") + assert.Equal(t, parent.SpanContext().TraceID(), telemetrytest.TraceID(span), + "consumer span must continue the producer trace") +} + +func TestInterceptor_ErrorStatus(t *testing.T) { + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + inter := &interceptor{tracer: tp.Tracer("test"), logger: zap.NewNop()} + task := runtime.Task{ID: registry.NewID("ns", "func")} + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { + return nil, errors.New("boom") + } + + _, _ = inter.Handle(context.Background(), task, next) + + span := telemetrytest.MustSpanNamed(t, sr, "ns:func") + telemetrytest.SpanStatus(t, span, codes.Error) +} From 
f33d98dfd94dd8f25b38a87a1dc9f59da4157cba Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:31:03 -0300 Subject: [PATCH 07/26] test(otel): prove cross-surface trace propagation Why: no test verified that a single trace survives the full chain across surfaces. Each surface was tested in isolation with noop tracers, so a broken traceparent injection or extraction between HTTP, function, queue publish, and queue consume would not be caught. What: drive the real chain in-process - HTTP middleware server span, function interceptor Internal child, queue PublishInterceptor Producer grandchild with traceparent injected into the message, then extract and a Consumer great-grandchild - and assert all four spans share one TraceID. --- service/otel/propagation_test.go | 105 +++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 service/otel/propagation_test.go diff --git a/service/otel/propagation_test.go b/service/otel/propagation_test.go new file mode 100644 index 000000000..4c554e20f --- /dev/null +++ b/service/otel/propagation_test.go @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MPL-2.0 + +package otel + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + attrsapi "github.com/wippyai/runtime/api/attrs" + ctxapi "github.com/wippyai/runtime/api/context" + queueapi "github.com/wippyai/runtime/api/queue" + "github.com/wippyai/runtime/api/registry" + "github.com/wippyai/runtime/api/runtime" + httpapi "github.com/wippyai/runtime/api/service/http" + otelapi "github.com/wippyai/runtime/api/service/otel" + "github.com/wippyai/runtime/internal/telemetrytest" + "go.opentelemetry.io/otel/trace" + "go.uber.org/zap" +) + +// TestPropagation_HTTP_Function_Queue_Consume proves a single trace survives +// the full surface chain: HTTP server span -> function (Internal) -> queue +// publish (Producer, traceparent injected into the message) 
-> queue consume +// (Consumer, trace recovered from the message headers). +func TestPropagation_HTTP_Function_Queue_Consume(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{ + HTTP: otelapi.HTTPConfig{Enabled: true, ExtractHeaders: true, InjectHeaders: true}, + Interceptor: otelapi.InterceptorConfig{Enabled: true}, + Queue: otelapi.QueueConfig{Enabled: true}, + }, zap.NewNop(), tp) + httpMW := svc.HTTPMiddleware() + funcInter := svc.Interceptor() + require.NotNil(t, funcInter) + pi := NewPublishInterceptor(tp.Tracer("wippy-runtime")) + + var publishedMsg *queueapi.Message + handler := http.HandlerFunc(func(_ http.ResponseWriter, r *http.Request) { + task := runtime.Task{ID: registry.NewID("ns", "func")} + next := func(ctx context.Context, _ runtime.Task) (*runtime.Result, error) { + msg := &queueapi.Message{ID: "m1", Headers: attrsapi.NewBag()} + publishedMsg = msg + pubNext := func(_ context.Context, _ registry.ID, _ []*queueapi.Message) error { return nil } + if err := pi.Handle(ctx, registry.NewID("ns", "orders"), []*queueapi.Message{msg}, pubNext); err != nil { + return nil, err + } + return &runtime.Result{}, nil + } + _, _ = funcInter.Handle(r.Context(), task, next) + }) + wrapped := httpMW(handler) + + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/orders", nil) + fcCtx, fc := ctxapi.OpenFrameContext(req.Context()) + defer ctxapi.ReleaseFrameContext(fc) + require.NoError(t, httpapi.SetRouteLabel(fcCtx, "/orders")) + req = req.WithContext(fcCtx) + + wrapped.ServeHTTP(httptest.NewRecorder(), req) + + // Simulate the worker consuming the published message. 
+ require.NotNil(t, publishedMsg) + consumeTask := runtime.Task{ + ID: registry.NewID("ns", "consumer"), + Context: []ctxapi.Pair{{Value: &queueapi.Delivery{Message: publishedMsg}}}, + } + consumeNext := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { return &runtime.Result{}, nil } + _, _ = funcInter.Handle(context.Background(), consumeTask, consumeNext) + + // Every span in the chain must share a single trace ID. + spans := sr.Ended() + require.GreaterOrEqual(t, len(spans), 4, "expected at least the 4 chain spans, got %d", len(spans)) + + wantNames := map[string]bool{ + "GET /orders": false, + "ns:func": false, + "ns:orders.publish": false, + "ns:consumer": false, + } + for _, s := range spans { + if _, ok := wantNames[s.Name()]; ok { + wantNames[s.Name()] = true + } + } + for name, seen := range wantNames { + assert.Truef(t, seen, "expected span %q in the chain", name) + } + + var traceID trace.TraceID + for i, s := range spans { + if i == 0 { + traceID = telemetrytest.TraceID(s) + continue + } + assert.Equal(t, traceID, telemetrytest.TraceID(s), + "span %q must share the single chain trace ID", s.Name()) + } +} From 17a04778dc17aaa3fab1006f30d48453bc4e2d59 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:33:11 -0300 Subject: [PATCH 08/26] test(metrics): verify dual exporter fan-out does not double-count Why: the wippy collector fans every observation out to both the in-pod Prometheus exporter and the OTel bridge. Nothing verified that a single emit produced value 1 in each sink (not 2, not conflicting series), so a fan-out regression or duplicate registration could silently inflate metrics. What: register both real exporters on one collector, emit one CounterInc, flush via Close, then assert the Prometheus /metrics scrape shows value 1 and the OTel ManualReader collect shows a value-1 data point. 
--- service/otel/metrics_doublecount_test.go | 74 ++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 service/otel/metrics_doublecount_test.go diff --git a/service/otel/metrics_doublecount_test.go b/service/otel/metrics_doublecount_test.go new file mode 100644 index 000000000..b5823c6ea --- /dev/null +++ b/service/otel/metrics_doublecount_test.go @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MPL-2.0 + +package otel + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metricsapi "github.com/wippyai/runtime/api/metrics" + apicfg "github.com/wippyai/runtime/api/service/metrics" + "github.com/wippyai/runtime/service/metrics" + "github.com/wippyai/runtime/service/metrics/prometheus" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +// TestCollector_PrometheusAndOTel_NoDoubleCount verifies that a single +// observation fans out to both the in-pod Prometheus exporter and the OTel +// bridge exporter, each reporting value 1 (not 2, not conflicting series). +func TestCollector_PrometheusAndOTel_NoDoubleCount(t *testing.T) { + coll := metrics.NewCollector(apicfg.Config{}) + + promExp := prometheus.NewExporter() + require.NoError(t, coll.RegisterExporter(promExp)) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + defer func() { _ = mp.Shutdown(context.Background()) }() + require.NoError(t, coll.RegisterExporter(NewMetricsExporter(mp))) + + coll.CounterInc("wippy_dual_test_total", metricsapi.Labels{"k": "v"}) + + // Closing drains the buffer and runs a final flush to every exporter. + require.NoError(t, coll.Close()) + + // Prometheus side. 
+ rec := httptest.NewRecorder() + promExp.Handler().ServeHTTP(rec, httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/metrics", nil)) + body := rec.Body.String() + assert.True(t, + strings.Contains(body, `wippy_dual_test_total{k="v"} 1`), + "prometheus side must report value 1 once, got:\n%s", body) + + // OTel side. + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + require.NotEmpty(t, rm.ScopeMetrics) + var total float64 + found := false + for _, m := range rm.ScopeMetrics[0].Metrics { + if m.Name != "wippy_dual_test_total" { + continue + } + sum, ok := m.Data.(metricdata.Sum[float64]) + if !ok { + continue + } + for _, dp := range sum.DataPoints { + for it := dp.Attributes.Iter(); it.Next(); { + kv := it.Attribute() + if string(kv.Key) == "k" && kv.Value.AsString() == "v" { + total, found = dp.Value, true + } + } + } + } + require.True(t, found, "OTel side must carry the k=v data point") + assert.Equal(t, 1.0, total, "OTel side must report value 1 (no double-count)") +} From 8d5ed38fdca3420553d883586d240a58226abef1 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:40:31 -0300 Subject: [PATCH 09/26] fix(otel): capture HTTP response status code in server span Why: the OTel HTTP middleware never wrapped the ResponseWriter, so http.response.status_code was never set and 5xx responses did not mark the span as Error, leaving server errors invisible in traces. What: wrap the ResponseWriter in a statusRecorder, set http.response.status_code after the handler runs, and set the span to Error for 5xx. Add a SpanHasInt64Attr harness helper and a table test covering 200/404/500. 
--- internal/telemetrytest/otel.go | 10 +++++++++ service/otel/service.go | 34 ++++++++++++++++++++++++++++- service/otel/service_span_test.go | 36 +++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/internal/telemetrytest/otel.go b/internal/telemetrytest/otel.go index 1f0f2528f..463818e74 100644 --- a/internal/telemetrytest/otel.go +++ b/internal/telemetrytest/otel.go @@ -75,6 +75,16 @@ func SpanHasStringAttr(t *testing.T, span sdktrace.ReadOnlySpan, key, want strin assert.Equalf(t, want, v.AsString(), "span %q attribute %q", span.Name(), key) } +// SpanHasInt64Attr asserts the span carries an int64 attribute key==want. +func SpanHasInt64Attr(t *testing.T, span sdktrace.ReadOnlySpan, key string, want int64) { + t.Helper() + v, ok := SpanAttr(span, key) + if !ok { + t.Fatalf("span %q has no attribute %q", span.Name(), key) + } + assert.Equalf(t, want, v.AsInt64(), "span %q attribute %q", span.Name(), key) +} + // TraceID returns the trace ID carried by a recorded span. func TraceID(span sdktrace.ReadOnlySpan) trace.TraceID { return span.SpanContext().TraceID() diff --git a/service/otel/service.go b/service/otel/service.go index a9a89b972..d2099f93e 100644 --- a/service/otel/service.go +++ b/service/otel/service.go @@ -86,12 +86,44 @@ func (s *Service) HTTPMiddleware() func(http.Handler) http.Handler { propagator.Inject(ctx, propagation.HeaderCarrier(w.Header())) } + rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK} r = r.WithContext(ctx) - next.ServeHTTP(w, r) + next.ServeHTTP(rec, r) + + span.SetAttributes(attribute.Int("http.response.status_code", rec.status)) + if rec.status >= http.StatusInternalServerError { + span.SetStatus(codes.Error, http.StatusText(rec.status)) + } else { + span.SetStatus(codes.Ok, "") + } }) } } +// statusRecorder wraps http.ResponseWriter to capture the response status code +// without otherwise altering behavior. 
The first WriteHeader call fixes the +// status; a handler that only calls Write defaults to 200 OK. +type statusRecorder struct { + http.ResponseWriter + status int + wrote bool +} + +func (r *statusRecorder) WriteHeader(code int) { + if !r.wrote { + r.status = code + r.wrote = true + } + r.ResponseWriter.WriteHeader(code) +} + +func (r *statusRecorder) Write(b []byte) (int, error) { + if !r.wrote { + r.wrote = true + } + return r.ResponseWriter.Write(b) +} + // OnStart implements scheduler.Lifecycle. func (s *Service) OnStart(ctx stdcontext.Context, p pid.PID, _ process.Process) error { if !s.cfg.Process.Enabled || !s.cfg.Process.TraceLifecycle { diff --git a/service/otel/service_span_test.go b/service/otel/service_span_test.go index ba42bd6ba..9357febce 100644 --- a/service/otel/service_span_test.go +++ b/service/otel/service_span_test.go @@ -170,3 +170,39 @@ func TestInterceptor_ErrorStatus(t *testing.T) { span := telemetrytest.MustSpanNamed(t, sr, "ns:func") telemetrytest.SpanStatus(t, span, codes.Error) } + +func TestHTTPMiddleware_CapturesStatusCode(t *testing.T) { + useTraceContextPropagator(t) + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{HTTP: otelapi.HTTPConfig{Enabled: true}}, zap.NewNop(), tp) + + cases := []struct { + name string + status int + wantError bool + }{ + {"ok", http.StatusOK, false}, + {"not_found", http.StatusNotFound, false}, + {"server_error", http.StatusInternalServerError, true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + sr.Reset() + wrapped := svc.HTTPMiddleware()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(c.status) + })) + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/x", nil) + wrapped.ServeHTTP(httptest.NewRecorder(), req) + + span := telemetrytest.MustSpanNamed(t, sr, "GET unmatched") + telemetrytest.SpanHasInt64Attr(t, span, 
"http.response.status_code", int64(c.status)) + if c.wantError { + telemetrytest.SpanStatus(t, span, codes.Error) + } else { + telemetrytest.SpanStatus(t, span, codes.Ok) + } + }) + } +} From 3bb3c169a65560b37793c7cb43a3229f2bda6f6e Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 13:53:30 -0300 Subject: [PATCH 10/26] feat(queue): add consumer lifecycle metrics Why: the queue surface emitted no metrics - ack/nack outcomes, processing latency, and in-flight depth were only logged, so operators could not answer queue drain rate, error rate, or backlog from /metrics. What: add a nil-safe consumer telemetry layer emitting wippy_queue_messages_total (queue, result=ack|nack), wippy_queue_process_duration_seconds (queue, result), and wippy_queue_in_flight (queue) gauge. Thread the metrics collector from the boot component through the consumer Manager into each Consumer, record around processDelivery. Update existing tests to the new constructor signature (nil collector where metrics are not under test). 
--- boot/components/queue/consumers.go | 3 ++ service/queue/consumer/consumer.go | 19 +++++++- service/queue/consumer/consumer_test.go | 21 ++++++++ service/queue/consumer/manager.go | 5 ++ service/queue/consumer/manager_test.go | 10 ++-- service/queue/consumer/settle_coord_test.go | 2 + service/queue/consumer/telemetry.go | 54 +++++++++++++++++++++ service/queue/consumer/telemetry_test.go | 51 +++++++++++++++++++ 8 files changed, 158 insertions(+), 7 deletions(-) create mode 100644 service/queue/consumer/telemetry.go create mode 100644 service/queue/consumer/telemetry_test.go diff --git a/boot/components/queue/consumers.go b/boot/components/queue/consumers.go index 07636b1d1..a85dfc75f 100644 --- a/boot/components/queue/consumers.go +++ b/boot/components/queue/consumers.go @@ -9,6 +9,7 @@ import ( "github.com/wippyai/runtime/api/event" "github.com/wippyai/runtime/api/function" logapi "github.com/wippyai/runtime/api/logs" + metricsapi "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" queueapi "github.com/wippyai/runtime/api/queue" regapi "github.com/wippyai/runtime/api/registry" @@ -34,6 +35,7 @@ func Consumers() boot.Component { handlers := bootpkg.GetHandlerRegistry(ctx) queueMgr := queueapi.GetManager(ctx) funcReg := function.GetRegistry(ctx) + coll := metricsapi.GetCollector(ctx) if reg := regapi.GetRegistry(ctx); reg != nil { consumerPatterns := []regapi.DependencyPattern{ @@ -53,6 +55,7 @@ func Consumers() boot.Component { funcReg, dtt, logger.Named("queue.consumer"), + coll, ) handlers.RegisterListener("queue.consumer", manager) diff --git a/service/queue/consumer/consumer.go b/service/queue/consumer/consumer.go index 10a43cb15..607eed0d2 100644 --- a/service/queue/consumer/consumer.go +++ b/service/queue/consumer/consumer.go @@ -5,9 +5,11 @@ package consumer import ( "context" "sync" + "time" ctxapi "github.com/wippyai/runtime/api/context" "github.com/wippyai/runtime/api/function" + "github.com/wippyai/runtime/api/metrics" 
"github.com/wippyai/runtime/api/payload" queueapi "github.com/wippyai/runtime/api/queue" "github.com/wippyai/runtime/api/registry" @@ -23,6 +25,7 @@ type Consumer struct { funcReg function.Registry config *consumerapi.Config logger *zap.Logger + tel *telemetry deliveries chan *queueapi.Delivery cancel context.CancelFunc workerCancel context.CancelFunc @@ -41,6 +44,7 @@ func NewConsumer( driver queueapi.Driver, funcReg function.Registry, logger *zap.Logger, + coll metrics.Collector, ) *Consumer { if logger == nil { logger = zap.NewNop() @@ -53,6 +57,7 @@ func NewConsumer( driver: driver, funcReg: funcReg, logger: logger, + tel: newTelemetry(coll), } } @@ -173,9 +178,14 @@ func (c *Consumer) processDelivery(ctx context.Context, delivery *queueapi.Deliv queueapi.ReleaseMessage(msg) }() + queueID := c.queueID.String() + c.tel.inFlightInc(queueID) + defer c.tel.inFlightDec(queueID) + start := time.Now() + c.logger.Debug("processing message", zap.String("consumer", c.id.String()), - zap.String("queue", c.queueID.String()), + zap.String("queue", queueID), zap.String("func", c.funcID.String()), zap.Int("worker_id", workerID), zap.String("message_id", msg.ID)) @@ -195,15 +205,18 @@ func (c *Consumer) processDelivery(ctx context.Context, delivery *queueapi.Deliv err = result.Error } + outcome := "ack" + // Ack or Nack based on result. MarkSettled gates the broker call: if // the handler already called msg:ack()/msg:nack() via the Lua // wrapper, the settle slot is claimed and the consumer must skip its // own settle to avoid double-ack (AMQP PRECONDITION_FAILED) or // double-nack/visibility-timeout races. 
if err != nil { + outcome = "nack" c.logger.Error("message processing failed", zap.String("consumer", c.id.String()), - zap.String("queue", c.queueID.String()), + zap.String("queue", queueID), zap.String("func", c.funcID.String()), zap.Int("worker_id", workerID), zap.String("message_id", msg.ID), @@ -232,4 +245,6 @@ func (c *Consumer) processDelivery(ctx context.Context, delivery *queueapi.Deliv } } } + + c.tel.recordProcessed(queueID, outcome, time.Since(start)) } diff --git a/service/queue/consumer/consumer_test.go b/service/queue/consumer/consumer_test.go index 2f7a68197..b1f92e929 100644 --- a/service/queue/consumer/consumer_test.go +++ b/service/queue/consumer/consumer_test.go @@ -42,6 +42,7 @@ func TestConsumer_StartStop(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) statusChan, err := consumer.Start(ctx) @@ -77,6 +78,7 @@ func TestConsumer_ProcessMessage(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -140,6 +142,7 @@ func TestConsumer_ProcessMessage_Error(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -210,6 +213,7 @@ func TestConsumer_StopTimeout(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) statusChan, err := consumer.Start(ctx) @@ -270,6 +274,7 @@ func TestConsumer_StopWithNoMessages(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) statusChan, err := consumer.Start(ctx) @@ -310,6 +315,7 @@ func TestConsumer_MultipleStopCalls(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -352,6 +358,7 @@ func TestConsumer_ConcurrentMessageProcessing(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -411,6 +418,7 @@ func TestConsumer_StopDuringProcessing(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -464,6 +472,7 @@ func TestConsumer_ContextCancellationStopsWorkers(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) statusChan, err := 
consumer.Start(ctx) @@ -504,6 +513,7 @@ func TestConsumer_AckNackAfterShutdown(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -572,6 +582,7 @@ func TestConsumer_SlowWorkers(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -636,6 +647,7 @@ func TestConsumer_DeadWorkerTimeout(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -699,6 +711,7 @@ func TestConsumer_MultipleWorkersOneBlocked(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -765,6 +778,7 @@ func TestConsumer_StopWithAllWorkersBlocked(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -903,6 +917,7 @@ func TestConsumer_StressHighThroughput(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -966,6 +981,7 @@ func TestConsumer_StressRapidStartStop(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -1011,6 +1027,7 @@ func TestConsumer_StressStartStopWithMessages(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -1078,6 +1095,7 @@ func TestConsumer_StressConcurrentConsumers(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -1145,6 +1163,7 @@ func TestConsumer_StressNackRequeue(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) @@ -1207,6 +1226,7 @@ func TestConsumer_StressResourceCleanup(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) statusChan, err := consumer.Start(ctx) @@ -1258,6 +1278,7 @@ func TestConsumer_StressMixedAckNack(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := consumer.Start(ctx) diff --git a/service/queue/consumer/manager.go b/service/queue/consumer/manager.go index a973b0280..425af0e7e 100644 --- a/service/queue/consumer/manager.go +++ b/service/queue/consumer/manager.go @@ -8,6 +8,7 @@ 
import ( "github.com/wippyai/runtime/api/event" "github.com/wippyai/runtime/api/function" + "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" queueapi "github.com/wippyai/runtime/api/queue" "github.com/wippyai/runtime/api/registry" @@ -24,6 +25,7 @@ type Manager struct { funcReg function.Registry dtt payload.Transcoder logger *zap.Logger + coll metrics.Collector consumers sync.Map } @@ -33,6 +35,7 @@ func NewManager( funcReg function.Registry, dtt payload.Transcoder, logger *zap.Logger, + coll metrics.Collector, ) *Manager { if logger == nil { logger = zap.NewNop() @@ -43,6 +46,7 @@ func NewManager( funcReg: funcReg, dtt: dtt, logger: logger, + coll: coll, } } @@ -94,6 +98,7 @@ func (m *Manager) addOrUpdate(ctx context.Context, entry registry.Entry, action driver, m.funcReg, m.logger, + m.coll, ) m.consumers.Store(entry.ID, consumer) diff --git a/service/queue/consumer/manager_test.go b/service/queue/consumer/manager_test.go index f79b43687..4a487be29 100644 --- a/service/queue/consumer/manager_test.go +++ b/service/queue/consumer/manager_test.go @@ -24,7 +24,7 @@ func TestManager_Add(t *testing.T) { funcReg := &mockFuncRegistry{} dtt := &mockDTT{} - manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop()) + manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop(), nil) config := &consumerapi.Config{ ConsumerOptions: queueapi.ConsumerOptions{ @@ -62,7 +62,7 @@ func TestManager_Add_QueueNotFound(t *testing.T) { funcReg := &mockFuncRegistry{} dtt := &mockDTT{} - manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop()) + manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop(), nil) config := &consumerapi.Config{ ConsumerOptions: queueapi.ConsumerOptions{ @@ -93,7 +93,7 @@ func TestManager_Add_DriverNotFound(t *testing.T) { funcReg := &mockFuncRegistry{} dtt := &mockDTT{} - manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop()) + manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop(), 
nil) config := &consumerapi.Config{ ConsumerOptions: queueapi.ConsumerOptions{ @@ -122,7 +122,7 @@ func TestManager_Update(t *testing.T) { funcReg := &mockFuncRegistry{} dtt := &mockDTT{} - manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop()) + manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop(), nil) consumerID := registry.NewID("test", "consumer") @@ -183,7 +183,7 @@ func TestManager_Delete(t *testing.T) { funcReg := &mockFuncRegistry{} dtt := &mockDTT{} - manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop()) + manager := NewManager(bus, queueMgr, funcReg, dtt, zap.NewNop(), nil) consumerID := registry.NewID("test", "consumer") diff --git a/service/queue/consumer/settle_coord_test.go b/service/queue/consumer/settle_coord_test.go index 3c3bee96d..0a71a61db 100644 --- a/service/queue/consumer/settle_coord_test.go +++ b/service/queue/consumer/settle_coord_test.go @@ -68,6 +68,7 @@ func TestConsumer_SkipsAutoAck_WhenHandlerAlreadySettled(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := c.Start(ctx) @@ -138,6 +139,7 @@ func TestConsumer_SkipsAutoAck_WhenHandlerAlreadyNacked(t *testing.T) { driver, funcReg, zap.NewNop(), + nil, ) _, err := c.Start(ctx) diff --git a/service/queue/consumer/telemetry.go b/service/queue/consumer/telemetry.go new file mode 100644 index 000000000..62ccbdb31 --- /dev/null +++ b/service/queue/consumer/telemetry.go @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MPL-2.0 + +package consumer + +import ( + "time" + + "github.com/wippyai/runtime/api/metrics" +) + +const ( + metricMessagesTotal = "wippy_queue_messages_total" + metricProcessDuration = "wippy_queue_process_duration_seconds" + metricInFlight = "wippy_queue_in_flight" +) + +// telemetry owns metric emission for the queue consumer subsystem. It is +// nil-safe so callers can ignore the absence of a configured collector. 
+type telemetry struct { + coll metrics.Collector +} + +func newTelemetry(coll metrics.Collector) *telemetry { + t := &telemetry{coll: coll} + if coll != nil { + coll.CounterAdd(metricMessagesTotal, 0, metrics.Labels{"queue": "_init", "result": "ack"}) + coll.CounterAdd(metricMessagesTotal, 0, metrics.Labels{"queue": "_init", "result": "nack"}) + coll.GaugeSet(metricInFlight, 0, metrics.Labels{"queue": "_init"}) + } + return t +} + +func (t *telemetry) recordProcessed(queue, result string, duration time.Duration) { + if t == nil || t.coll == nil { + return + } + labels := metrics.Labels{"queue": queue, "result": result} + t.coll.CounterInc(metricMessagesTotal, labels) + t.coll.HistogramObserve(metricProcessDuration, duration.Seconds(), labels) +} + +func (t *telemetry) inFlightInc(queue string) { + if t == nil || t.coll == nil { + return + } + t.coll.GaugeInc(metricInFlight, metrics.Labels{"queue": queue}) +} + +func (t *telemetry) inFlightDec(queue string) { + if t == nil || t.coll == nil { + return + } + t.coll.GaugeDec(metricInFlight, metrics.Labels{"queue": queue}) +} diff --git a/service/queue/consumer/telemetry_test.go b/service/queue/consumer/telemetry_test.go new file mode 100644 index 000000000..6f597043a --- /dev/null +++ b/service/queue/consumer/telemetry_test.go @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MPL-2.0 + +package consumer + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/internal/telemetrytest" +) + +func TestTelemetry_RecordProcessed(t *testing.T) { + rec := telemetrytest.NewRecorder() + tel := newTelemetry(rec) + + tel.recordProcessed("ns:orders", "ack", 5*time.Millisecond) + + assert.Equal(t, 1.0, rec.CounterValue(metricMessagesTotal, + metrics.Labels{"queue": "ns:orders", "result": "ack"})) + assert.Equal(t, uint64(1), rec.HistogramCount(metricProcessDuration, + metrics.Labels{"queue": "ns:orders", "result": "ack"})) +} + +func 
TestTelemetry_NilCollector_NoPanic(t *testing.T) { + tel := newTelemetry(nil) + tel.recordProcessed("ns:orders", "nack", time.Millisecond) + tel.inFlightInc("ns:orders") + tel.inFlightDec("ns:orders") +} + +func TestTelemetry_InFlight(t *testing.T) { + rec := telemetrytest.NewRecorder() + tel := newTelemetry(rec) + + tel.inFlightInc("ns:orders") + tel.inFlightInc("ns:orders") + tel.inFlightDec("ns:orders") + + assert.Equal(t, 1.0, rec.GaugeValue(metricInFlight, + metrics.Labels{"queue": "ns:orders"})) +} + +func TestTelemetry_BootstrapsZeroSeries(t *testing.T) { + rec := telemetrytest.NewRecorder() + _ = newTelemetry(rec) + + assert.Contains(t, rec.Names(), metricMessagesTotal) + assert.Contains(t, rec.Names(), metricInFlight) +} From 81171d61931c70fa2026734f0905f88016fd71a5 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:02:31 -0300 Subject: [PATCH 11/26] feat(process): add process lifecycle metrics and root spans Why: the process surface had no metrics and the OTel lifecycle handler skipped spans entirely when a process was spawned without a remote span context, so unsupervised/cron spawns were invisible in both metrics and traces. What: add a process Lifecycle metrics handler emitting wippy_process_started_total, wippy_process_terminated_total (result= completed|error), and wippy_process_active gauge, registered via a new metrics boot component alongside the OTel handler. Change the OTel OnStart/OnComplete to start a root span when no remote parent is present instead of returning early, so every spawn is traced. 
--- boot/components/metrics/all.go | 1 + boot/components/metrics/constants.go | 7 +++ boot/components/metrics/process.go | 36 +++++++++++++++ service/metrics/process_lifecycle.go | 55 +++++++++++++++++++++++ service/metrics/process_lifecycle_test.go | 44 ++++++++++++++++++ service/otel/service.go | 20 +++++---- service/otel/service_span_test.go | 25 +++++++++++ 7 files changed, 180 insertions(+), 8 deletions(-) create mode 100644 boot/components/metrics/process.go create mode 100644 service/metrics/process_lifecycle.go create mode 100644 service/metrics/process_lifecycle_test.go diff --git a/boot/components/metrics/all.go b/boot/components/metrics/all.go index eb1cf53bc..c0ff44854 100644 --- a/boot/components/metrics/all.go +++ b/boot/components/metrics/all.go @@ -8,5 +8,6 @@ func All() []boot.Component { return []boot.Component{ Metrics(), Interceptor(), + Process(), } } diff --git a/boot/components/metrics/constants.go b/boot/components/metrics/constants.go index f65178a28..49b4e8929 100644 --- a/boot/components/metrics/constants.go +++ b/boot/components/metrics/constants.go @@ -10,6 +10,13 @@ const Name boot.Name = "metrics" // InterceptorName is the component name for the function-call metrics interceptor. const InterceptorName boot.Name = "metrics-interceptor" +// ProcessName is the component name for the process lifecycle metrics handler. +const ProcessName boot.Name = "metrics-process" + // interceptorName is the external dependency name of the function interceptor // registry component (boot/components/system.Interceptor). const interceptorName boot.Name = "interceptor" + +// lifecycleName is the external dependency name of the process lifecycle +// registry component (boot/components/system.LifecycleName). 
+const lifecycleName boot.Name = "system.lifecycle" diff --git a/boot/components/metrics/process.go b/boot/components/metrics/process.go new file mode 100644 index 000000000..abf6384ab --- /dev/null +++ b/boot/components/metrics/process.go @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MPL-2.0 + +package metrics + +import ( + "context" + + "github.com/wippyai/runtime/api/boot" + metricsapi "github.com/wippyai/runtime/api/metrics" + processapi "github.com/wippyai/runtime/api/process" + impl "github.com/wippyai/runtime/service/metrics" +) + +// Process registers the process lifecycle metrics handler, which emits +// wippy_process_started_total / wippy_process_terminated_total / +// wippy_process_active alongside the existing OTel lifecycle spans. +func Process() boot.Component { + return boot.New(boot.P{ + Name: ProcessName, + DependsOn: []boot.Name{Name, lifecycleName}, + Load: func(ctx context.Context) (context.Context, error) { + coll := metricsapi.GetCollector(ctx) + if coll == nil { + return ctx, nil + } + + reg := processapi.GetLifecycleRegistry(ctx) + if reg == nil { + return ctx, nil + } + + reg.Register("metrics", impl.NewProcessLifecycle(coll)) + return ctx, nil + }, + }) +} diff --git a/service/metrics/process_lifecycle.go b/service/metrics/process_lifecycle.go new file mode 100644 index 000000000..db62b10e4 --- /dev/null +++ b/service/metrics/process_lifecycle.go @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MPL-2.0 + +package metrics + +import ( + "context" + + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/pid" + processapi "github.com/wippyai/runtime/api/process" + "github.com/wippyai/runtime/api/runtime" +) + +const ( + ProcessStarted = "wippy_process_started_total" + ProcessTerminated = "wippy_process_terminated_total" + ProcessActive = "wippy_process_active" +) + +// ProcessLifecycle is a process.Lifecycle handler that emits process +// supervision metrics. A nil collector makes every method a no-op. 
+type ProcessLifecycle struct { + coll metrics.Collector +} + +func NewProcessLifecycle(coll metrics.Collector) *ProcessLifecycle { + p := &ProcessLifecycle{coll: coll} + if coll != nil { + coll.GaugeSet(ProcessActive, 0, metrics.Labels{}) + coll.CounterAdd(ProcessTerminated, 0, metrics.Labels{"result": "completed"}) + coll.CounterAdd(ProcessTerminated, 0, metrics.Labels{"result": "error"}) + } + return p +} + +func (p *ProcessLifecycle) OnStart(_ context.Context, _ pid.PID, _ processapi.Process) error { + if p == nil || p.coll == nil { + return nil + } + p.coll.CounterInc(ProcessStarted, metrics.Labels{}) + p.coll.GaugeInc(ProcessActive, metrics.Labels{}) + return nil +} + +func (p *ProcessLifecycle) OnComplete(_ context.Context, _ pid.PID, result *runtime.Result) { + if p == nil || p.coll == nil { + return + } + p.coll.GaugeDec(ProcessActive, metrics.Labels{}) + outcome := "completed" + if result != nil && result.Error != nil { + outcome = "error" + } + p.coll.CounterInc(ProcessTerminated, metrics.Labels{"result": outcome}) +} diff --git a/service/metrics/process_lifecycle_test.go b/service/metrics/process_lifecycle_test.go new file mode 100644 index 000000000..be0837a94 --- /dev/null +++ b/service/metrics/process_lifecycle_test.go @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MPL-2.0 + +package metrics + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/pid" + "github.com/wippyai/runtime/api/runtime" + "github.com/wippyai/runtime/internal/telemetrytest" +) + +func TestProcessLifecycle_StartAndComplete(t *testing.T) { + rec := telemetrytest.NewRecorder() + p := NewProcessLifecycle(rec) + + require.NoError(t, p.OnStart(context.Background(), pid.PID{}, nil)) + require.NoError(t, p.OnStart(context.Background(), pid.PID{}, nil)) + p.OnComplete(context.Background(), pid.PID{}, &runtime.Result{}) + + assert.Equal(t, 
2.0, rec.CounterValue(ProcessStarted, metrics.Labels{})) + assert.Equal(t, 1.0, rec.CounterValue(ProcessTerminated, metrics.Labels{"result": "completed"})) + assert.Equal(t, 1.0, rec.GaugeValue(ProcessActive, metrics.Labels{})) +} + +func TestProcessLifecycle_ErrorResult(t *testing.T) { + rec := telemetrytest.NewRecorder() + p := NewProcessLifecycle(rec) + + p.OnComplete(context.Background(), pid.PID{}, &runtime.Result{Error: errors.New("boom")}) + + assert.Equal(t, 1.0, rec.CounterValue(ProcessTerminated, metrics.Labels{"result": "error"})) +} + +func TestProcessLifecycle_NilCollector(t *testing.T) { + p := NewProcessLifecycle(nil) + require.NoError(t, p.OnStart(context.Background(), pid.PID{}, nil)) + p.OnComplete(context.Background(), pid.PID{}, nil) +} diff --git a/service/otel/service.go b/service/otel/service.go index d2099f93e..c69d98d7f 100644 --- a/service/otel/service.go +++ b/service/otel/service.go @@ -130,9 +130,12 @@ func (s *Service) OnStart(ctx stdcontext.Context, p pid.PID, _ process.Process) return nil } + // Continue the spawning trace when a remote parent is present, otherwise + // start a root span so unsupervised spawns are still observable. 
processSpanCtx, hasSpan := otelapi.GetRemoteSpanContext(ctx) - if !hasSpan || !processSpanCtx.IsValid() { - return nil + startCtx := ctx + if hasSpan && processSpanCtx.IsValid() { + startCtx = trace.ContextWithRemoteSpanContext(ctx, processSpanCtx) } sourceID, hasSource := runtime.GetFrameID(ctx) @@ -141,8 +144,7 @@ func (s *Service) OnStart(ctx stdcontext.Context, p pid.PID, _ process.Process) startEventName = sourceID.String() + ".started" } - ctxWithProcess := trace.ContextWithRemoteSpanContext(ctx, processSpanCtx) - _, startSpan := s.tracer.Start(ctxWithProcess, startEventName, + _, startSpan := s.tracer.Start(startCtx, startEventName, trace.WithSpanKind(trace.SpanKindInternal)) startSpan.SetAttributes( @@ -162,9 +164,12 @@ func (s *Service) OnComplete(ctx stdcontext.Context, p pid.PID, result *runtime. return } + // Continue the spawning trace when a remote parent is present, otherwise + // start a root span so unsupervised spawns are still observable. remoteSpanCtx, hasRemote := otelapi.GetRemoteSpanContext(ctx) - if !hasRemote || !remoteSpanCtx.IsValid() { - return + completeCtx := ctx + if hasRemote && remoteSpanCtx.IsValid() { + completeCtx = trace.ContextWithRemoteSpanContext(ctx, remoteSpanCtx) } sourceID, hasSource := runtime.GetFrameID(ctx) @@ -173,8 +178,7 @@ func (s *Service) OnComplete(ctx stdcontext.Context, p pid.PID, result *runtime. 
spanName = sourceID.String() + ".terminated" } - ctxWithRemote := trace.ContextWithRemoteSpanContext(ctx, remoteSpanCtx) - _, span := s.tracer.Start(ctxWithRemote, spanName, + _, span := s.tracer.Start(completeCtx, spanName, trace.WithSpanKind(trace.SpanKindInternal)) attrs := []attribute.KeyValue{ diff --git a/service/otel/service_span_test.go b/service/otel/service_span_test.go index 9357febce..96c66333d 100644 --- a/service/otel/service_span_test.go +++ b/service/otel/service_span_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/require" attrsapi "github.com/wippyai/runtime/api/attrs" ctxapi "github.com/wippyai/runtime/api/context" + "github.com/wippyai/runtime/api/pid" queueapi "github.com/wippyai/runtime/api/queue" "github.com/wippyai/runtime/api/registry" "github.com/wippyai/runtime/api/runtime" @@ -171,6 +172,30 @@ func TestInterceptor_ErrorStatus(t *testing.T) { telemetrytest.SpanStatus(t, span, codes.Error) } +func TestProcessLifecycle_RootSpanWhenNoRemoteParent(t *testing.T) { + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{Process: otelapi.ProcessConfig{Enabled: true, TraceLifecycle: true}}, zap.NewNop(), tp) + + require.NoError(t, svc.OnStart(context.Background(), pid.PID{UniqID: "p1"}, nil)) + svc.OnComplete(context.Background(), pid.PID{UniqID: "p1"}, &runtime.Result{}) + + started := false + terminated := false + for _, s := range sr.Ended() { + if s.Name() == "process.started" { + started = true + telemetrytest.SpanKind(t, s, trace.SpanKindInternal) + } + if s.Name() == "process.terminated" { + terminated = true + } + } + assert.True(t, started, "unsupervised spawn must emit process.started") + assert.True(t, terminated, "unsupervised spawn must emit process.terminated") +} + func TestHTTPMiddleware_CapturesStatusCode(t *testing.T) { useTraceContextPropagator(t) tp, sr := telemetrytest.NewTracerProvider() From 
e463fe633bf5c551fa6f1e8b71e6ba0103424905 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:15:49 -0300 Subject: [PATCH 12/26] feat(temporal): bridge SDK metrics onto the wippy collector Why: the Temporal/workflow surface emitted no metrics - workflow and activity execution, worker poller, and task-queue signals were invisible to operators, with only tracing wired. What: add a Temporal metrics.Handler bridge (service/temporal/client) that maps the SDK's Counter/Gauge/Timer/WithTags onto the wippy collector (tags become labels; nil-safe), thread it through the client factory and manager via a WithMetricsHandler option, and wire it in the boot component so the SDK's own workflow/worker metrics flow to both Prometheus and OTel. Unit tests cover counter/gauge/timer, tag merging, chaining, and the nil collector path. --- boot/components/service/temporal/temporal.go | 2 + service/temporal/client/factory.go | 10 ++- service/temporal/client/factory_test.go | 50 ++++++------- service/temporal/client/manager.go | 12 +++- service/temporal/client/metrics_handler.go | 71 +++++++++++++++++++ .../temporal/client/metrics_handler_test.go | 52 ++++++++++++++ 6 files changed, 168 insertions(+), 29 deletions(-) create mode 100644 service/temporal/client/metrics_handler.go create mode 100644 service/temporal/client/metrics_handler_test.go diff --git a/boot/components/service/temporal/temporal.go b/boot/components/service/temporal/temporal.go index 120ac9205..c191a45db 100644 --- a/boot/components/service/temporal/temporal.go +++ b/boot/components/service/temporal/temporal.go @@ -11,6 +11,7 @@ import ( "github.com/wippyai/runtime/api/event" funcapi "github.com/wippyai/runtime/api/function" logapi "github.com/wippyai/runtime/api/logs" + metricsapi "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" regapi "github.com/wippyai/runtime/api/registry" "github.com/wippyai/runtime/api/resource" @@ -161,6 +162,7 @@ func Component() 
boot.Component { return dcRegistry.Build() }), client.WithInterceptors(clientInterceptors), + client.WithMetricsHandler(client.NewMetricsHandler(metricsapi.GetCollector(ctx))), ) if err != nil { return ctx, fmt.Errorf("failed to create client manager: %w", err) diff --git a/service/temporal/client/factory.go b/service/temporal/client/factory.go index 99df03d64..6cb6197ed 100644 --- a/service/temporal/client/factory.go +++ b/service/temporal/client/factory.go @@ -39,6 +39,7 @@ type Factory interface { // DefaultClientFactory implements Factory type DefaultClientFactory struct { env env.Registry + metricsHandler client.MetricsHandler dataConverter func() converter.DataConverter clientInterceptors []interceptor.ClientInterceptor } @@ -48,11 +49,13 @@ func NewDefaultClientFactory( env env.Registry, dataConverter func() converter.DataConverter, clientInterceptors []interceptor.ClientInterceptor, + metricsHandler client.MetricsHandler, ) *DefaultClientFactory { return &DefaultClientFactory{ env: env, dataConverter: dataConverter, clientInterceptors: clientInterceptors, + metricsHandler: metricsHandler, } } @@ -85,9 +88,10 @@ func (f *DefaultClientFactory) CreateClient(ctx context.Context, logger *zap.Log // buildClientOptions constructs Temporal client options from config func (f *DefaultClientFactory) buildClientOptions(ctx context.Context, logger *zap.Logger, config *api.ClientConfig) (client.Options, error) { opts := client.Options{ - HostPort: config.Address, - Namespace: config.Namespace, - Logger: NewZapAdapter(logger), + HostPort: config.Address, + Namespace: config.Namespace, + Logger: NewZapAdapter(logger), + MetricsHandler: f.metricsHandler, } // Set data converter if available diff --git a/service/temporal/client/factory_test.go b/service/temporal/client/factory_test.go index 87806fe5a..6493fe029 100644 --- a/service/temporal/client/factory_test.go +++ b/service/temporal/client/factory_test.go @@ -27,7 +27,7 @@ import ( func TestNewDefaultClientFactory(t 
*testing.T) { env := &mockEnvRegistry{values: make(map[string]string)} - factory := NewDefaultClientFactory(env, nil, nil) + factory := NewDefaultClientFactory(env, nil, nil, nil) require.NotNil(t, factory) assert.Equal(t, env, factory.env) @@ -37,7 +37,7 @@ func TestNewDefaultClientFactory(t *testing.T) { func TestDefaultClientFactory_buildClientOptions(t *testing.T) { t.Run("basic options", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, newTestDataConverterProvider(), nil) + factory := NewDefaultClientFactory(nil, newTestDataConverterProvider(), nil, nil) config := &api.ClientConfig{ Address: "localhost:7233", Namespace: "default", @@ -57,7 +57,7 @@ func TestDefaultClientFactory_buildClientOptions(t *testing.T) { t.Run("with data converter", func(t *testing.T) { dc := &mockDataConverter{} - factory := NewDefaultClientFactory(nil, func() converter.DataConverter { return dc }, nil) + factory := NewDefaultClientFactory(nil, func() converter.DataConverter { return dc }, nil, nil) config := &api.ClientConfig{ Address: "localhost:7233", Namespace: "test", @@ -75,7 +75,7 @@ func TestDefaultClientFactory_buildClientOptions(t *testing.T) { t.Run("with interceptors", func(t *testing.T) { interceptors := []interceptor.ClientInterceptor{&mockClientInterceptor{}} - factory := NewDefaultClientFactory(nil, newTestDataConverterProvider(), interceptors) + factory := NewDefaultClientFactory(nil, newTestDataConverterProvider(), interceptors, nil) config := &api.ClientConfig{ Address: "localhost:7233", Namespace: "test", @@ -99,7 +99,7 @@ func newTestDataConverterProvider() func() converter.DataConverter { func TestDefaultClientFactory_configureAuth(t *testing.T) { t.Run("no auth", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{Type: api.AuthTypeNone}, } @@ -112,7 +112,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { }) 
t.Run("api key direct", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -129,7 +129,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { t.Run("api key from env", func(t *testing.T) { env := &mockEnvRegistry{values: map[string]string{"TEMPORAL_API_KEY": "env-key"}} - factory := NewDefaultClientFactory(env, nil, nil) + factory := NewDefaultClientFactory(env, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -145,7 +145,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { }) t.Run("api key from env without registry fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -167,7 +167,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { err := os.WriteFile(keyFile, []byte("file-api-key\n"), 0600) require.NoError(t, err) - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -183,7 +183,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { }) t.Run("api key from missing file fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -199,7 +199,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { }) t.Run("api key no source fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: api.AuthTypeAPIKey, @@ -214,7 +214,7 @@ func 
TestDefaultClientFactory_configureAuth(t *testing.T) { }) t.Run("unsupported auth type fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ Auth: api.AuthConfig{ Type: "unknown", @@ -231,7 +231,7 @@ func TestDefaultClientFactory_configureAuth(t *testing.T) { func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { t.Run("missing cert source fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ KeyPEM: "some-key", } @@ -243,7 +243,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { }) t.Run("missing key source fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ CertPEM: "some-cert", } @@ -255,7 +255,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { }) t.Run("key from env without registry fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ CertPEM: "some-cert", KeyPEMEnv: "TEMPORAL_KEY", @@ -268,7 +268,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { }) t.Run("missing cert file fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ CertFile: "/nonexistent/cert.pem", KeyPEM: "some-key", @@ -285,7 +285,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { certFile := filepath.Join(tmpDir, "cert.pem") require.NoError(t, os.WriteFile(certFile, []byte("test-cert"), 0600)) - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ CertFile: certFile, KeyFile: 
"/nonexistent/key.pem", @@ -298,7 +298,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { }) t.Run("invalid certificate fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) auth := api.AuthConfig{ CertPEM: "invalid-cert", KeyPEM: "invalid-key", @@ -313,7 +313,7 @@ func TestDefaultClientFactory_loadClientCertificate(t *testing.T) { func TestDefaultClientFactory_configureTLS(t *testing.T) { t.Run("TLS disabled", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: nil, } @@ -325,7 +325,7 @@ func TestDefaultClientFactory_configureTLS(t *testing.T) { }) t.Run("TLS enabled but not active", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: &api.TLSConfig{Enabled: false}, } @@ -337,7 +337,7 @@ func TestDefaultClientFactory_configureTLS(t *testing.T) { }) t.Run("TLS with server name", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: &api.TLSConfig{ Enabled: true, @@ -354,7 +354,7 @@ func TestDefaultClientFactory_configureTLS(t *testing.T) { }) t.Run("TLS insecure skip verify", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: &api.TLSConfig{ Enabled: true, @@ -370,7 +370,7 @@ func TestDefaultClientFactory_configureTLS(t *testing.T) { }) t.Run("TLS with missing CA file fails", func(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: &api.TLSConfig{ Enabled: true, @@ -390,7 +390,7 @@ func 
TestDefaultClientFactory_configureTLS(t *testing.T) { caFile := filepath.Join(tmpDir, "ca.pem") require.NoError(t, os.WriteFile(caFile, []byte("invalid-ca"), 0600)) - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) config := &api.ClientConfig{ TLS: &api.TLSConfig{ Enabled: true, @@ -434,7 +434,7 @@ func TestRewriteClientHeadersInterceptor(t *testing.T) { } func TestConfigureTransportHeaders_AppendsInterceptor(t *testing.T) { - factory := NewDefaultClientFactory(nil, nil, nil) + factory := NewDefaultClientFactory(nil, nil, nil, nil) opts := &client.Options{} factory.configureTransportHeaders(opts) diff --git a/service/temporal/client/manager.go b/service/temporal/client/manager.go index 210268db9..e264376cc 100644 --- a/service/temporal/client/manager.go +++ b/service/temporal/client/manager.go @@ -18,6 +18,7 @@ import ( "github.com/wippyai/runtime/api/supervisor" "github.com/wippyai/runtime/internal/entry" "github.com/wippyai/runtime/service/temporal/peer" + "go.temporal.io/sdk/client" "go.temporal.io/sdk/converter" "go.temporal.io/sdk/interceptor" "go.uber.org/zap" @@ -30,6 +31,7 @@ type Manager struct { dtt payload.Transcoder bus event.Bus env env.Registry + metricsHandler client.MetricsHandler factory Factory dataConverter converter.DataConverter dataConverterProvider func() converter.DataConverter @@ -94,6 +96,14 @@ func WithInterceptors(interceptors []interceptor.ClientInterceptor) ManagerOptio } } +// WithMetricsHandler sets the Temporal SDK metrics handler for the Manager, +// bridging the SDK's workflow/worker metrics onto the wippy collector. 
+func WithMetricsHandler(h client.MetricsHandler) ManagerOption { + return func(m *Manager) { + m.metricsHandler = h + } +} + // WithFactory sets a custom client factory for the Manager func WithFactory(factory Factory) ManagerOption { return func(m *Manager) { @@ -131,7 +141,7 @@ func NewManager(opts ...ManagerOption) (*Manager, error) { if provider == nil && m.dataConverter != nil { provider = func() converter.DataConverter { return m.dataConverter } } - m.factory = NewDefaultClientFactory(m.env, provider, m.clientInterceptors) + m.factory = NewDefaultClientFactory(m.env, provider, m.clientInterceptors, m.metricsHandler) } return m, nil diff --git a/service/temporal/client/metrics_handler.go b/service/temporal/client/metrics_handler.go new file mode 100644 index 000000000..7e83c689a --- /dev/null +++ b/service/temporal/client/metrics_handler.go @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MPL-2.0 + +package client + +import ( + "time" + + "github.com/wippyai/runtime/api/metrics" + "go.temporal.io/sdk/client" +) + +// metricsHandler bridges the Temporal SDK metrics.Handler interface onto the +// wippy metrics collector, so the SDK's own workflow/worker/poller metrics +// flow to both the Prometheus and OTel sinks. Tags become metric labels; a nil +// collector makes every method a no-op. +type metricsHandler struct { + coll metrics.Collector + tags metrics.Labels +} + +// NewMetricsHandler returns a Temporal metrics.Handler backed by coll. 
+func NewMetricsHandler(coll metrics.Collector) client.MetricsHandler { + return &metricsHandler{coll: coll} +} + +func (h *metricsHandler) WithTags(tags map[string]string) client.MetricsHandler { + merged := make(metrics.Labels, len(h.tags)+len(tags)) + for k, v := range h.tags { + merged[k] = v + } + for k, v := range tags { + merged[k] = v + } + return &metricsHandler{coll: h.coll, tags: merged} +} + +type metricsCounterFunc func(int64) + +func (f metricsCounterFunc) Inc(d int64) { f(d) } + +type metricsGaugeFunc func(float64) + +func (f metricsGaugeFunc) Update(d float64) { f(d) } + +type metricsTimerFunc func(time.Duration) + +func (f metricsTimerFunc) Record(d time.Duration) { f(d) } + +func (h *metricsHandler) Counter(name string) client.MetricsCounter { + return metricsCounterFunc(func(d int64) { + if h.coll != nil { + h.coll.CounterAdd(name, float64(d), h.tags) + } + }) +} + +func (h *metricsHandler) Gauge(name string) client.MetricsGauge { + return metricsGaugeFunc(func(d float64) { + if h.coll != nil { + h.coll.GaugeSet(name, d, h.tags) + } + }) +} + +func (h *metricsHandler) Timer(name string) client.MetricsTimer { + return metricsTimerFunc(func(d time.Duration) { + if h.coll != nil { + h.coll.HistogramObserve(name, d.Seconds(), h.tags) + } + }) +} diff --git a/service/temporal/client/metrics_handler_test.go b/service/temporal/client/metrics_handler_test.go new file mode 100644 index 000000000..55063e508 --- /dev/null +++ b/service/temporal/client/metrics_handler_test.go @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MPL-2.0 + +package client + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/internal/telemetrytest" +) + +func TestMetricsHandler_CounterGaugeTimer(t *testing.T) { + rec := telemetrytest.NewRecorder() + h := NewMetricsHandler(rec) + + h.Counter("temporal_workflow_completed").Inc(1) + h.Counter("temporal_workflow_completed").Inc(2) + 
h.Gauge("temporal_worker_task_queue_active").Update(5) + h.Timer("temporal_workflow_task_latency").Record(100 * time.Millisecond) + + assert.Equal(t, 3.0, rec.CounterValue("temporal_workflow_completed", metrics.Labels{})) + assert.Equal(t, 5.0, rec.GaugeValue("temporal_worker_task_queue_active", metrics.Labels{})) + assert.Equal(t, uint64(1), rec.HistogramCount("temporal_workflow_task_latency", metrics.Labels{})) +} + +func TestMetricsHandler_WithTagsMergesLabels(t *testing.T) { + rec := telemetrytest.NewRecorder() + h := NewMetricsHandler(rec).WithTags(map[string]string{"namespace": "default", "task_queue": "orders"}) + + h.Counter("temporal_workflow_completed").Inc(1) + + assert.Equal(t, 1.0, rec.CounterValue("temporal_workflow_completed", + metrics.Labels{"namespace": "default", "task_queue": "orders"})) +} + +func TestMetricsHandler_NilCollector(t *testing.T) { + h := NewMetricsHandler(nil) + h.Counter("x").Inc(1) + h.Gauge("y").Update(1) + h.Timer("z").Record(time.Millisecond) +} + +func TestMetricsHandler_WithTagsChains(t *testing.T) { + rec := telemetrytest.NewRecorder() + h := NewMetricsHandler(rec).WithTags(map[string]string{"a": "1"}).WithTags(map[string]string{"b": "2"}) + + h.Counter("c").Inc(1) + + assert.Equal(t, 1.0, rec.CounterValue("c", metrics.Labels{"a": "1", "b": "2"})) +} From 367b214c0efa2130d51d4f1da544f82464a60513 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:22:04 -0300 Subject: [PATCH 13/26] feat(cdc): emit stream error counter Why: the CDC surface only exposed changes_total and retained WAL bytes; stream errors (replication failures, decode failures, keepalive parse errors) were only logged, so operators could not alert on CDC health or see error rate from /metrics. What: store the collector on Source at Start, and emit wippy_cdc_errors_total (slot) from the single fail() choke point that every fatal replication path funnels through. Unit tests cover the counter increment and the nil-collector path. 
(CDC spans deferred: they require threading a tracer into Source and a live Postgres to validate the replication loop.) --- service/cdc/postgres/service.go | 8 +++++- service/cdc/postgres/service_metrics_test.go | 30 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 service/cdc/postgres/service_metrics_test.go diff --git a/service/cdc/postgres/service.go b/service/cdc/postgres/service.go index 20f9753c9..09bfa7b1c 100644 --- a/service/cdc/postgres/service.go +++ b/service/cdc/postgres/service.go @@ -25,6 +25,7 @@ import ( const ( retainedWALGauge = "wippy_cdc_retained_wal_bytes" changesCounter = "wippy_cdc_changes_total" + errorsCounter = "wippy_cdc_errors_total" ) const ( @@ -58,6 +59,7 @@ type SourceOptions struct { type Source struct { log *zap.Logger + coll metrics.Collector injectedCP Checkpointer cancel context.CancelFunc done chan struct{} @@ -196,7 +198,8 @@ func (s *Source) Start(ctx context.Context) (<-chan any, error) { default: } - go s.run(runCtx, conn, adminDB, cp, startLSN, snapshotName, publication, metrics.GetCollector(ctx), status, done) + s.coll = metrics.GetCollector(ctx) + go s.run(runCtx, conn, adminDB, cp, startLSN, snapshotName, publication, s.coll, status, done) return status, nil } @@ -410,6 +413,9 @@ func (s *Source) reportLag(ctx context.Context, adminDB *sql.DB, mc metrics.Coll func (s *Source) fail(_ context.Context, status chan any, err error) { s.log.Error("cdc stream error", zap.String("slot", s.slot), zap.Error(err)) + if s.coll != nil { + s.coll.CounterInc(errorsCounter, metrics.Labels{"slot": s.slot}) + } select { case status <- err: default: diff --git a/service/cdc/postgres/service_metrics_test.go b/service/cdc/postgres/service_metrics_test.go new file mode 100644 index 000000000..31a1cff31 --- /dev/null +++ b/service/cdc/postgres/service_metrics_test.go @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MPL-2.0 + +package postgres + +import ( + "context" + "errors" + "testing" + + 
"github.com/stretchr/testify/assert" + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/internal/telemetrytest" + "go.uber.org/zap" +) + +func TestSource_FailEmitsErrorCounter(t *testing.T) { + rec := telemetrytest.NewRecorder() + s := &Source{log: zap.NewNop(), slot: "test_slot", coll: rec} + + status := make(chan any, 1) + s.fail(context.Background(), status, errors.New("boom")) + + assert.Equal(t, 1.0, rec.CounterValue(errorsCounter, metrics.Labels{"slot": "test_slot"})) +} + +func TestSource_FailNilCollector(t *testing.T) { + s := &Source{log: zap.NewNop(), slot: "test_slot"} + status := make(chan any, 1) + s.fail(context.Background(), status, errors.New("boom")) +} From ad3e654e84a3f6194b28169596fd06bd0f17f0e9 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:34:12 -0300 Subject: [PATCH 14/26] feat(http): trace outbound client requests via otelhttp Why: the HTTP client pool built plain *http.Transport instances, so outbound requests (including Lua httpclient calls) produced no spans and did not inject W3C trace context - the trace broke the moment a call left the process. What: wrap both client constructors' transports with otelhttp via an instrumentedTransport that preserves CloseIdleConnections for the pool's eviction path, and update closeIdle to use a closeIdler interface. Promote otelhttp to a direct dependency. Test asserts an outbound request within a span produces a client span sharing the caller trace and injects traceparent. Also suppress a pre-existing G124 false-positive on the outbound-cookie line (documented as response-side attributes). 
--- go.mod | 2 +- service/http/client/client_pool.go | 35 ++++++++++- service/http/client/client_pool_lru_test.go | 7 ++- service/http/client/handler.go | 2 +- service/http/client/outbound_trace_test.go | 68 +++++++++++++++++++++ 5 files changed, 106 insertions(+), 8 deletions(-) create mode 100644 service/http/client/outbound_trace_test.go diff --git a/go.mod b/go.mod index 1f834be80..189f239a1 100644 --- a/go.mod +++ b/go.mod @@ -69,6 +69,7 @@ require ( github.com/wippyai/wapp v0.1.2 github.com/wippyai/wasm-runtime v0.0.0-20260624004845-53368697b5c6 github.com/xuri/excelize/v2 v2.10.1 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0 go.opentelemetry.io/otel v1.44.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.44.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.44.0 @@ -225,7 +226,6 @@ require ( go.bytecodealliance.org v0.7.0 // indirect go.etcd.io/bbolt v1.4.3 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/mock v0.6.0 // indirect diff --git a/service/http/client/client_pool.go b/service/http/client/client_pool.go index 94c092a4a..87312061c 100644 --- a/service/http/client/client_pool.go +++ b/service/http/client/client_pool.go @@ -20,6 +20,7 @@ import ( netapi "github.com/wippyai/runtime/api/net" httpapi "github.com/wippyai/runtime/api/service/http" lru "github.com/wippyai/runtime/internal/cache" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) // clientKey identifies a unique client configuration. networkIdentity is a @@ -153,6 +154,34 @@ func (p *Pool) getOrCreate(key clientKey, overlayNetworkID string) *clientOnce { return co } +// closeIdler is implemented by transports that can release idle connections. 
+// Both *http.Transport and instrumentedTransport satisfy it. +type closeIdler interface { + CloseIdleConnections() +} + +// instrumentedTransport wraps an otelhttp-instrumented RoundTripper so outbound +// HTTP requests get client spans and W3C trace-context injection, while still +// exposing CloseIdleConnections on the underlying *http.Transport for the +// pool's eviction/cleanup path. +type instrumentedTransport struct { + traced gohttp.RoundTripper + base *gohttp.Transport +} + +func (t *instrumentedTransport) RoundTrip(req *gohttp.Request) (*gohttp.Response, error) { + return t.traced.RoundTrip(req) +} + +func (t *instrumentedTransport) CloseIdleConnections() { + t.base.CloseIdleConnections() +} + +// instrument returns an otelhttp-wrapped transport around base. +func instrument(base *gohttp.Transport) *instrumentedTransport { + return &instrumentedTransport{traced: otelhttp.NewTransport(base), base: base} +} + // closeIdle closes idle connections on co's client transport if co has been // initialized. Safe to call on a never-initialized clientOnce and from any // goroutine — the atomic Load synchronizes with the Store inside once.Do. 
@@ -164,7 +193,7 @@ func closeIdle(co *clientOnce) { if c == nil { return } - if tr, ok := c.Transport.(*gohttp.Transport); ok { + if tr, ok := c.Transport.(closeIdler); ok { tr.CloseIdleConnections() } } @@ -335,7 +364,7 @@ func createClientWithDialer(timeout time.Duration, dialFn func(ctx context.Conte DialContext: dialFn, } return &gohttp.Client{ - Transport: transport, + Transport: instrument(transport), Timeout: timeout, } } @@ -369,7 +398,7 @@ func createClient(timeout time.Duration, unixSocket string, maxIdleConns, maxIdl } return &gohttp.Client{ - Transport: transport, + Transport: instrument(transport), Timeout: timeout, } } diff --git a/service/http/client/client_pool_lru_test.go b/service/http/client/client_pool_lru_test.go index a885038e3..5b8081b00 100644 --- a/service/http/client/client_pool_lru_test.go +++ b/service/http/client/client_pool_lru_test.go @@ -87,8 +87,9 @@ func TestPoolLRU_EvictionClosesIdleConnections(t *testing.T) { pool := NewClientPoolWithConfig(PoolConfig{MaxClients: 1}) c1 := pool.GetClient(1*time.Second, "") - tr, ok := c1.Transport.(*gohttp.Transport) - require.True(t, ok, "expected *http.Transport") + tr, ok := c1.Transport.(*instrumentedTransport) + require.True(t, ok, "expected *instrumentedTransport") + base := tr.base // Force an idle connection into the transport by dialing a listener // and reading a tiny HTTP response. @@ -117,7 +118,7 @@ func TestPoolLRU_EvictionClosesIdleConnections(t *testing.T) { // observe this indirectly — a direct call must not panic and must // leave the transport in a state where subsequent CloseIdleConnections // is a no-op. The real guarantee is that eviction called it once. 
- tr.CloseIdleConnections() + base.CloseIdleConnections() assert.Equal(t, 1, pool.Size()) } diff --git a/service/http/client/handler.go b/service/http/client/handler.go index a9b2ce7d9..c16e9ac7d 100644 --- a/service/http/client/handler.go +++ b/service/http/client/handler.go @@ -245,7 +245,7 @@ func executeRequest(ctx context.Context, pool *Pool, networkReg netapi.NetworkRe for name, value := range req.Cookies { // Outbound HTTP request cookies; Secure/HttpOnly/SameSite are // response-side attributes the client cannot set meaningfully. - httpReq.AddCookie(&gohttp.Cookie{Name: name, Value: value}) + httpReq.AddCookie(&gohttp.Cookie{Name: name, Value: value}) //nolint:gosec // G124: client-side outbound cookie } if req.BasicAuthUser != "" { httpReq.SetBasicAuth(req.BasicAuthUser, req.BasicAuthPass) diff --git a/service/http/client/outbound_trace_test.go b/service/http/client/outbound_trace_test.go new file mode 100644 index 000000000..c1628dc16 --- /dev/null +++ b/service/http/client/outbound_trace_test.go @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MPL-2.0 + +package client + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/wippyai/runtime/internal/telemetrytest" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/trace" +) + +// TestOutbound_TracedWithClientSpan verifies the pooled HTTP client wraps its +// transport with otelhttp, so an outbound request issued within a span produces +// a client span and injects traceparent into the outgoing headers. +func TestOutbound_TracedWithClientSpan(t *testing.T) { + // otelhttp uses the global provider and propagator. 
+ tp, sr := telemetrytest.NewTracerProvider() + prevTP := otel.GetTracerProvider() + prevProp := otel.GetTextMapPropagator() + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{})) + t.Cleanup(func() { + otel.SetTracerProvider(prevTP) + otel.SetTextMapPropagator(prevProp) + _ = tp.Shutdown(context.Background()) + }) + + var gotTraceparent string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotTraceparent = r.Header.Get("traceparent") + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + pool := NewClientPool() + cli := pool.GetClient(7*time.Second, "") + + tracer := tp.Tracer("test") + ctx, parent := tracer.Start(context.Background(), "caller") + defer parent.End() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, srv.URL, nil) + require.NoError(t, err) + resp, err := cli.Do(req) + require.NoError(t, err) + _ = resp.Body.Close() + + var clientSpan sdktrace.ReadOnlySpan + for _, s := range sr.Ended() { + if s.SpanKind() == trace.SpanKindClient { + clientSpan = s + break + } + } + require.NotNil(t, clientSpan, "expected an outbound client span") + assert.Equal(t, parent.SpanContext().TraceID(), telemetrytest.TraceID(clientSpan), + "client span must continue the caller trace") + assert.NotEmpty(t, gotTraceparent, "outgoing request must carry a traceparent header") +} From fd503074af6546d24d9b687b0fe3f001d9773f1b Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:37:21 -0300 Subject: [PATCH 15/26] fix(otel): temporal uses the global propagator and surfaces load errors Why: the Temporal tracing interceptor hardcoded a TraceContext+Baggage composite, diverging from the configured global propagator, and swallowed NewTracingInterceptor failures by logging and returning nil - so a broken Temporal tracing setup stayed hidden while the rest of the system honored cfg.Propagators. 
What: pass otel.GetTextMapPropagator() to the Temporal interceptor options so it follows the same W3C propagation as every other surface, and return the NewTracingInterceptor error from Load instead of masking it. --- boot/components/otel/otel_temporal.go | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/boot/components/otel/otel_temporal.go b/boot/components/otel/otel_temporal.go index c0553ec09..ef495c66f 100644 --- a/boot/components/otel/otel_temporal.go +++ b/boot/components/otel/otel_temporal.go @@ -4,6 +4,7 @@ package otel import ( "context" + "fmt" "github.com/wippyai/runtime/api/boot" logapi "github.com/wippyai/runtime/api/logs" @@ -11,9 +12,8 @@ import ( temporalapi "github.com/wippyai/runtime/api/service/temporal" temporalboot "github.com/wippyai/runtime/boot/components/service/temporal" "github.com/wippyai/runtime/service/otel" - "go.opentelemetry.io/otel/propagation" + gootel "go.opentelemetry.io/otel" temporalotel "go.temporal.io/sdk/contrib/opentelemetry" - "go.uber.org/zap" ) // Temporal creates the OTEL tracing interceptor for Temporal workflows and activities. @@ -60,17 +60,15 @@ func Temporal() boot.Component { return ctx, nil } - // Create Temporal tracing interceptor with the shared tracer + // Create Temporal tracing interceptor with the shared tracer and the + // globally-configured propagator so Temporal follows the same W3C + // propagation settings as every other surface. 
tracingInterceptor, err := temporalotel.NewTracingInterceptor(temporalotel.TracerOptions{ - Tracer: tracer, - TextMapPropagator: propagation.NewCompositeTextMapPropagator( - propagation.TraceContext{}, - propagation.Baggage{}, - ), + Tracer: tracer, + TextMapPropagator: gootel.GetTextMapPropagator(), }) if err != nil { - logger.Error("failed to create Temporal tracing interceptor", zap.Error(err)) - return ctx, nil + return ctx, fmt.Errorf("failed to create Temporal tracing interceptor: %w", err) } // Register with both client and worker registries From b87fc190d88f2ad7fd0400a9e6a7e3dd46f32173 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 14:41:42 -0300 Subject: [PATCH 16/26] test(otel): OTLP traces round-trip through a live Jaeger collector Why: every prior OTel test used in-memory recorders; nothing validated that the runtime's OTLP exporter actually ships spans to a real collector, so a broken endpoint, protocol, or exporter config would only surface in production. What: add an integration-tagged E2E test that builds the real service/otel.InitializeProvider pointing at localhost:4318 (OTLP/HTTP), emits and flushes a span, then polls Jaeger's /api/traces and asserts the span arrived. Add a make otel-e2e target that brings up Jaeger and runs it. --- Makefile | 5 ++ tests/otel_e2e_test.go | 118 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 tests/otel_e2e_test.go diff --git a/Makefile b/Makefile index ad3b72592..4c37d07fe 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,11 @@ otel-up: otel-down: cd tests && docker-compose down +# otel-e2e starts only Jaeger and runs the OTLP round-trip test (needs docker). 
+otel-e2e: + cd tests && docker-compose up -d jaeger + go test -tags integration -run TestOTLP_TracesReachJaeger ./tests/ -timeout 120s + # Wippy CLI build targets WIPPY_VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev") WIPPY_COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo "unknown") diff --git a/tests/otel_e2e_test.go b/tests/otel_e2e_test.go new file mode 100644 index 000000000..5d4c8125e --- /dev/null +++ b/tests/otel_e2e_test.go @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MPL-2.0 + +//go:build integration + +package tests + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + otelapi "github.com/wippyai/runtime/api/service/otel" + "github.com/wippyai/runtime/service/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "go.uber.org/zap" +) + +// TestOTLP_TracesReachJaeger validates the runtime's real OTLP trace pipeline +// end to end: service/otel.InitializeProvider builds a TracerProvider whose +// OTLP/HTTP exporter ships spans to a live Jaeger collector, and Jaeger's API +// then serves them back. Run with Jaeger up (`make otel-up`). 
+func TestOTLP_TracesReachJaeger(t *testing.T) { + const ( + otlpEndpoint = "localhost:4318" + jaegerAPI = "http://localhost:16686" + service = "wippy-e2e-test" + op = "e2e.root" + ) + + if !reachable(otlpEndpoint, 2*time.Second) { + t.Skipf("jaeger OTLP endpoint %s not reachable - run `make otel-up`", otlpEndpoint) + } + + tp, err := otel.InitializeProvider(context.Background(), otelapi.Config{ + Enabled: true, + TracesEnabled: true, + Endpoint: otlpEndpoint, + Protocol: "http/protobuf", + Insecure: true, + ServiceName: service, + SampleRate: 1.0, + }, zap.NewNop()) + require.NoError(t, err) + + tracer := tp.Tracer("wippy-runtime") + _, span := tracer.Start(context.Background(), op, + trace.WithAttributes(attribute.String("e2e", "otel-traces"))) + span.End() + + // Shutdown flushes the BatchSpanProcessor so the span reaches Jaeger. + require.NoError(t, otel.ShutdownTracerProvider(context.Background(), tp, zap.NewNop())) + + var found bool + for i := 0; i < 40; i++ { + if jaegerHasOperation(jaegerAPI, service, op) { + found = true + break + } + time.Sleep(500 * time.Millisecond) + } + assert.True(t, found, "span %q for service %q never appeared in Jaeger", op, service) +} + +func jaegerHasOperation(jaegerAPI, service, op string) bool { + url := fmt.Sprintf("%s/api/traces?service=%s&limit=50", jaegerAPI, service) + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, url, nil) + if err != nil { + return false + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return false + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != http.StatusOK { + return false + } + body, err := io.ReadAll(resp.Body) + if err != nil { + return false + } + var parsed struct { + Data []struct { + Spans []struct { + OperationName string `json:"operationName"` + } `json:"spans"` + } `json:"data"` + } + if json.Unmarshal(body, &parsed) != nil { + return false + } + for _, tr := range parsed.Data { + for _, sp := range tr.Spans { + if 
sp.OperationName == op { + return true + } + } + } + return false +} + +func reachable(addr string, timeout time.Duration) bool { + dialer := net.Dialer{Timeout: timeout} + conn, err := dialer.DialContext(context.Background(), "tcp", addr) + if err != nil { + return false + } + _ = conn.Close() + return true +} From 7bb54cbc34df18087037efc8fc0e090c2bf31551 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 15:56:00 -0300 Subject: [PATCH 17/26] fix(http): set route label before pre-match middleware Why: OTel and http-metrics middleware wrapped the route handler on the outside, so they ran before createRouteHandler set the route label - every HTTP server span was named 'GET unmatched', making endpoints indistinguishable in traces and metrics. What: wrap each route handler with withRouteLabel as the outermost layer so the route label is set in the request frame before pre-match middleware runs. Verified in the real binary: HTTP spans now read e.g. 'GET app.api:hello.endpoint' and function spans are CHILD_OF the HTTP span. 
--- service/http/router.go | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/service/http/router.go b/service/http/router.go index 4c412de14..0b6854e95 100644 --- a/service/http/router.go +++ b/service/http/router.go @@ -344,12 +344,15 @@ func (rm *RouteManager) Build() error { registeredOptions := make(map[string]bool) for _, routerEntry := range rm.routers { for routeID, route := range routerEntry.routes { - pattern := buildPattern(route.method, routerEntry.prefix, route.path) - handler := rm.createRouteHandler(routeID, route, routerEntry) - if len(routerEntry.middleware) > 0 { - handler = applyMiddlewareChain(routerEntry.middleware, handler) - } - allPatterns = append(allPatterns, patternEntry{handler, pattern}) + pattern := buildPattern(route.method, routerEntry.prefix, route.path) + handler := rm.createRouteHandler(routeID, route, routerEntry) + if len(routerEntry.middleware) > 0 { + handler = applyMiddlewareChain(routerEntry.middleware, handler) + } + // Set the route label as the outermost wrapper so pre-match middleware + // (e.g., OTel, http metrics) observes the real route instead of "unmatched". + handler = withRouteLabel(routeLabelFor(route.funcID, routeID), handler) + allPatterns = append(allPatterns, patternEntry{handler, pattern}) // Auto-generate OPTIONS handler so CORS middleware can intercept preflight if route.method != "OPTIONS" { @@ -399,6 +402,25 @@ func (rm *RouteManager) Build() error { return nil } +// routeLabelFor returns the route label used for observability, preferring the +// function ID and falling back to the route ID. +func routeLabelFor(funcID, routeID registry.ID) string { + if l := funcID.String(); l != "" { + return l + } + return routeID.String() +} + +// withRouteLabel wraps next so the route label is set in the request frame +// before next runs. 
Applied as the outermost route wrapper so pre-match +// middleware (OTel, http metrics) sees the real route instead of "unmatched". +func withRouteLabel(label string, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = httpapi.SetRouteLabel(r.Context(), label) + next.ServeHTTP(w, r) + }) +} + // createRouteHandler creates the handler for a route with param extraction and post-middleware func (rm *RouteManager) createRouteHandler(routeID registry.ID, route *RouteEntry, routerEntry *RouterEntry) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { From 23e581de9000b16cf7e0334203f34d8bde9ad3c6 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 16:24:55 -0300 Subject: [PATCH 18/26] fix(otel): address review findings on HTTP middleware and metrics warming Why: code review found the statusRecorder broke SSE/websocket relays (missing Flush/Hijack), router.go was gofmt-dirty, server spans were marked Ok on 4xx, and the _init warming emitted phantom label series. What: - statusRecorder implements Flush/Hijack/Unwrap so sserelay (Flush) and wsrelay (Hijack) keep working when stacked inside the OTel middleware; regression test asserts both interfaces survive the wrapper. - HTTP server span status: Error for >=500, unset otherwise (per OTel HTTP semconv; 4xx are client errors, not Ok). - gofmt service/http/router.go. - drop the _init zero-series warming in queue and process telemetry (counters/gauges appear on first real use; removes bogus _init labels). - fix a pre-existing noctx lint nit in router_test.go. 
--- service/http/router.go | 18 ++++++++--------- service/http/router_test.go | 2 +- service/metrics/process_lifecycle.go | 8 +------- service/otel/service.go | 25 +++++++++++++++++++++--- service/otel/service_span_test.go | 19 +++++++++++++++++- service/queue/consumer/telemetry.go | 8 +------- service/queue/consumer/telemetry_test.go | 8 -------- 7 files changed, 52 insertions(+), 36 deletions(-) diff --git a/service/http/router.go b/service/http/router.go index 0b6854e95..da8d4f334 100644 --- a/service/http/router.go +++ b/service/http/router.go @@ -344,15 +344,15 @@ func (rm *RouteManager) Build() error { registeredOptions := make(map[string]bool) for _, routerEntry := range rm.routers { for routeID, route := range routerEntry.routes { - pattern := buildPattern(route.method, routerEntry.prefix, route.path) - handler := rm.createRouteHandler(routeID, route, routerEntry) - if len(routerEntry.middleware) > 0 { - handler = applyMiddlewareChain(routerEntry.middleware, handler) - } - // Set the route label as the outermost wrapper so pre-match middleware - // (e.g., OTel, http metrics) observes the real route instead of "unmatched". - handler = withRouteLabel(routeLabelFor(route.funcID, routeID), handler) - allPatterns = append(allPatterns, patternEntry{handler, pattern}) + pattern := buildPattern(route.method, routerEntry.prefix, route.path) + handler := rm.createRouteHandler(routeID, route, routerEntry) + if len(routerEntry.middleware) > 0 { + handler = applyMiddlewareChain(routerEntry.middleware, handler) + } + // Set the route label as the outermost wrapper so pre-match middleware + // (e.g., OTel, http metrics) observes the real route instead of "unmatched". 
+ handler = withRouteLabel(routeLabelFor(route.funcID, routeID), handler) + allPatterns = append(allPatterns, patternEntry{handler, pattern}) // Auto-generate OPTIONS handler so CORS middleware can intercept preflight if route.method != "OPTIONS" { diff --git a/service/http/router_test.go b/service/http/router_test.go index 732d85510..4d38ab0d7 100644 --- a/service/http/router_test.go +++ b/service/http/router_test.go @@ -80,7 +80,7 @@ func TestRouteManager_BasicOperations(t *testing.T) { err = rm.ReplaceMount("/static", replacement) require.NoError(t, err) rec := httptest.NewRecorder() - rm.mounts["/static"].ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/static/app.js", nil)) + rm.mounts["/static"].ServeHTTP(rec, httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/static/app.js", nil)) assert.Equal(t, "replacement", rec.Body.String()) // Replace validates paths and only updates existing mounts diff --git a/service/metrics/process_lifecycle.go b/service/metrics/process_lifecycle.go index db62b10e4..42029769b 100644 --- a/service/metrics/process_lifecycle.go +++ b/service/metrics/process_lifecycle.go @@ -24,13 +24,7 @@ type ProcessLifecycle struct { } func NewProcessLifecycle(coll metrics.Collector) *ProcessLifecycle { - p := &ProcessLifecycle{coll: coll} - if coll != nil { - coll.GaugeSet(ProcessActive, 0, metrics.Labels{}) - coll.CounterAdd(ProcessTerminated, 0, metrics.Labels{"result": "completed"}) - coll.CounterAdd(ProcessTerminated, 0, metrics.Labels{"result": "error"}) - } - return p + return &ProcessLifecycle{coll: coll} } func (p *ProcessLifecycle) OnStart(_ context.Context, _ pid.PID, _ processapi.Process) error { diff --git a/service/otel/service.go b/service/otel/service.go index c69d98d7f..f1faf47ad 100644 --- a/service/otel/service.go +++ b/service/otel/service.go @@ -3,7 +3,10 @@ package otel import ( + "bufio" stdcontext "context" + "errors" + "net" "net/http" ctxapi "github.com/wippyai/runtime/api/context" @@ -93,8 +96,6 @@ 
func (s *Service) HTTPMiddleware() func(http.Handler) http.Handler { span.SetAttributes(attribute.Int("http.response.status_code", rec.status)) if rec.status >= http.StatusInternalServerError { span.SetStatus(codes.Error, http.StatusText(rec.status)) - } else { - span.SetStatus(codes.Ok, "") } }) } @@ -102,13 +103,18 @@ func (s *Service) HTTPMiddleware() func(http.Handler) http.Handler { // statusRecorder wraps http.ResponseWriter to capture the response status code // without otherwise altering behavior. The first WriteHeader call fixes the -// status; a handler that only calls Write defaults to 200 OK. +// status; a handler that only calls Write defaults to 200 OK. Flush/Hijack/ +// Unwrap forward to the underlying writer so SSE (Flush) and websocket +// (Hijack) middlewares stacked inside the OTel middleware keep working, and +// http.ResponseController reaches the real writer. type statusRecorder struct { http.ResponseWriter status int wrote bool } +func (r *statusRecorder) Unwrap() http.ResponseWriter { return r.ResponseWriter } + func (r *statusRecorder) WriteHeader(code int) { if !r.wrote { r.status = code @@ -124,6 +130,19 @@ func (r *statusRecorder) Write(b []byte) (int, error) { return r.ResponseWriter.Write(b) } +func (r *statusRecorder) Flush() { + if flusher, ok := r.ResponseWriter.(http.Flusher); ok { + flusher.Flush() + } +} + +func (r *statusRecorder) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if hj, ok := r.ResponseWriter.(http.Hijacker); ok { + return hj.Hijack() + } + return nil, nil, errors.New("otel: underlying response writer does not support Hijack") +} + // OnStart implements scheduler.Lifecycle. 
func (s *Service) OnStart(ctx stdcontext.Context, p pid.PID, _ process.Process) error { if !s.cfg.Process.Enabled || !s.cfg.Process.TraceLifecycle { diff --git a/service/otel/service_span_test.go b/service/otel/service_span_test.go index 96c66333d..20b01bb11 100644 --- a/service/otel/service_span_test.go +++ b/service/otel/service_span_test.go @@ -196,6 +196,23 @@ func TestProcessLifecycle_RootSpanWhenNoRemoteParent(t *testing.T) { assert.True(t, terminated, "unsupervised spawn must emit process.terminated") } +func TestHTTPMiddleware_StatusRecorderPreservesFlusherAndHijacker(t *testing.T) { + tp, _ := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{HTTP: otelapi.HTTPConfig{Enabled: true}}, zap.NewNop(), tp) + wrapped := svc.HTTPMiddleware()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, isFlusher := w.(http.Flusher) + assert.True(t, isFlusher, "inner handler must see http.Flusher (SSE relay depends on it)") + _, isHijacker := w.(http.Hijacker) + assert.True(t, isHijacker, "inner handler must see http.Hijacker (websocket relay depends on it)") + w.WriteHeader(http.StatusOK) + })) + + wrapped.ServeHTTP(httptest.NewRecorder(), + httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil)) +} + func TestHTTPMiddleware_CapturesStatusCode(t *testing.T) { useTraceContextPropagator(t) tp, sr := telemetrytest.NewTracerProvider() @@ -226,7 +243,7 @@ func TestHTTPMiddleware_CapturesStatusCode(t *testing.T) { if c.wantError { telemetrytest.SpanStatus(t, span, codes.Error) } else { - telemetrytest.SpanStatus(t, span, codes.Ok) + telemetrytest.SpanStatus(t, span, codes.Unset) } }) } diff --git a/service/queue/consumer/telemetry.go b/service/queue/consumer/telemetry.go index 62ccbdb31..62800c5ea 100644 --- a/service/queue/consumer/telemetry.go +++ b/service/queue/consumer/telemetry.go @@ -21,13 +21,7 @@ type telemetry struct { } func newTelemetry(coll 
metrics.Collector) *telemetry { - t := &telemetry{coll: coll} - if coll != nil { - coll.CounterAdd(metricMessagesTotal, 0, metrics.Labels{"queue": "_init", "result": "ack"}) - coll.CounterAdd(metricMessagesTotal, 0, metrics.Labels{"queue": "_init", "result": "nack"}) - coll.GaugeSet(metricInFlight, 0, metrics.Labels{"queue": "_init"}) - } - return t + return &telemetry{coll: coll} } func (t *telemetry) recordProcessed(queue, result string, duration time.Duration) { diff --git a/service/queue/consumer/telemetry_test.go b/service/queue/consumer/telemetry_test.go index 6f597043a..51a15ebd3 100644 --- a/service/queue/consumer/telemetry_test.go +++ b/service/queue/consumer/telemetry_test.go @@ -41,11 +41,3 @@ func TestTelemetry_InFlight(t *testing.T) { assert.Equal(t, 1.0, rec.GaugeValue(metricInFlight, metrics.Labels{"queue": "ns:orders"})) } - -func TestTelemetry_BootstrapsZeroSeries(t *testing.T) { - rec := telemetrytest.NewRecorder() - _ = newTelemetry(rec) - - assert.Contains(t, rec.Names(), metricMessagesTotal) - assert.Contains(t, rec.Names(), metricInFlight) -} From fefb24d4581af94d6851f2513f50875db4b0083a Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 16:46:22 -0300 Subject: [PATCH 19/26] fix(metrics,otel): correct gauge Inc/Dec semantics and harden HTTP tracing Why: deep review found the wippy_function_in_flight=-1 anomaly root cause - GaugeInc/GaugeDec were emitted as TypeGauge (absolute Set), so a balanced Inc(+1)/Dec(-1) became Set(+1) then Set(-1) = -1 in every production exporter (function/queue/process/http in-flight gauges). The test Recorder masked it because it implements Inc/Dec additively. Also found: queue deliveries without a traceparent lost Consumer span semantics, and the OTel+httpmetrics writer stack broke websocket Hijack. What: - Add TypeGaugeAdd metric type; Collector GaugeInc/GaugeDec use it. 
Prometheus exporter uses gauge.Add for TypeGaugeAdd (Set for TypeGauge); OTel exporter maps TypeGaugeAdd to a Float64UpDownCounter. Regression tests: balanced Inc/Dec reads 0 through the real collector+exporter, and Set stays absolute. - Queue deliveries always produce a Consumer span with messaging attrs, even without a recoverable traceparent (root Consumer span then). - httpmetrics statusWriter implements Hijack/Unwrap (matching the OTel statusRecorder) so the otel+httpmetrics stack reaches the real connection for SSE/websocket; stacking regression test added. - Drop the now-redundant SetRouteLabel in createRouteHandler (the outer withRouteLabel wrapper owns it). --- api/metrics/metrics.go | 3 +- .../http/middleware/httpmetrics/middleware.go | 15 ++++ service/http/router.go | 11 +-- service/metrics/collector.go | 4 +- service/metrics/prometheus/exporter.go | 8 +- .../metrics/prometheus/gauge_exporter_test.go | 73 +++++++++++++++++++ service/otel/metrics.go | 32 ++++++++ service/otel/service.go | 14 ++-- service/otel/service_span_test.go | 44 +++++++++++ 9 files changed, 185 insertions(+), 19 deletions(-) create mode 100644 service/metrics/prometheus/gauge_exporter_test.go diff --git a/api/metrics/metrics.go b/api/metrics/metrics.go index 0377d131e..a125cd0b4 100644 --- a/api/metrics/metrics.go +++ b/api/metrics/metrics.go @@ -6,7 +6,8 @@ package metrics // MetricType constants. 
const ( TypeCounter MetricType = "counter" - TypeGauge MetricType = "gauge" + TypeGauge MetricType = "gauge" // absolute set (GaugeSet) + TypeGaugeAdd MetricType = "gauge_add" // relative delta (GaugeInc/GaugeDec) TypeHistogram MetricType = "histogram" ) diff --git a/service/http/middleware/httpmetrics/middleware.go b/service/http/middleware/httpmetrics/middleware.go index a3457b1ea..9c586b839 100644 --- a/service/http/middleware/httpmetrics/middleware.go +++ b/service/http/middleware/httpmetrics/middleware.go @@ -3,6 +3,9 @@ package httpmetrics import ( + "bufio" + "errors" + "net" "net/http" "strconv" "time" @@ -84,8 +87,20 @@ func (w *statusWriter) Write(b []byte) (int, error) { return w.ResponseWriter.Write(b) } +// Unwrap exposes the underlying writer so http.ResponseController (and other +// wrappers stacked above, e.g. the OTel statusRecorder) can reach the real +// connection for Flush/Hijack used by SSE and websocket relays. +func (w *statusWriter) Unwrap() http.ResponseWriter { return w.ResponseWriter } + func (w *statusWriter) Flush() { if flusher, ok := w.ResponseWriter.(http.Flusher); ok { flusher.Flush() } } + +func (w *statusWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if hj, ok := w.ResponseWriter.(http.Hijacker); ok { + return hj.Hijack() + } + return nil, nil, errors.New("httpmetrics: underlying response writer does not support Hijack") +} diff --git a/service/http/router.go b/service/http/router.go index da8d4f334..ef3913e8b 100644 --- a/service/http/router.go +++ b/service/http/router.go @@ -440,15 +440,10 @@ func (rm *RouteManager) createRouteHandler(routeID registry.ID, route *RouteEntr routeInfo.Endpoint = routeID routeInfo.Func = route.funcID - // Extract route label (prefer Func over Endpoint) - routeLabel := route.funcID.String() - if routeLabel == "" { - routeLabel = routeID.String() - } - - // Add route info and label to FrameContext + // Add route info to FrameContext. 
The route label is set by the outer + // withRouteLabel wrapper so pre-match middleware (OTel, http metrics) + // already observes it; SetRouteInfo is still needed for downstream handlers. _ = httpapi.SetRouteInfo(r.Context(), routeInfo) - _ = httpapi.SetRouteLabel(r.Context(), routeLabel) // Apply post-match middleware and call endpoint handler finalHandler := route.handler diff --git a/service/metrics/collector.go b/service/metrics/collector.go index 059e222c3..bd9f07d07 100644 --- a/service/metrics/collector.go +++ b/service/metrics/collector.go @@ -55,11 +55,11 @@ func (c *collector) GaugeSet(name string, value float64, labels api.Labels) { } func (c *collector) GaugeInc(name string, labels api.Labels) { - c.record(name, api.TypeGauge, 1, labels) + c.record(name, api.TypeGaugeAdd, 1, labels) } func (c *collector) GaugeDec(name string, labels api.Labels) { - c.record(name, api.TypeGauge, -1, labels) + c.record(name, api.TypeGaugeAdd, -1, labels) } func (c *collector) HistogramObserve(name string, value float64, labels api.Labels) { diff --git a/service/metrics/prometheus/exporter.go b/service/metrics/prometheus/exporter.go index 43b814f7f..78781c3e6 100644 --- a/service/metrics/prometheus/exporter.go +++ b/service/metrics/prometheus/exporter.go @@ -76,9 +76,13 @@ func (e *Exporter) Record(name string, typ api.MetricType, value float64, labels counter := e.getOrCreateCounter(key, name, labelNames) counter.WithLabelValues(labelVals...).Add(value) - case api.TypeGauge: + case api.TypeGauge, api.TypeGaugeAdd: gauge := e.getOrCreateGauge(key, name, labelNames) - gauge.WithLabelValues(labelVals...).Set(value) + if typ == api.TypeGaugeAdd { + gauge.WithLabelValues(labelVals...).Add(value) + } else { + gauge.WithLabelValues(labelVals...).Set(value) + } case api.TypeHistogram: histo := e.getOrCreateHistogram(key, name, labelNames) diff --git a/service/metrics/prometheus/gauge_exporter_test.go b/service/metrics/prometheus/gauge_exporter_test.go new file mode 100644 index 
000000000..ac2ee71da --- /dev/null +++ b/service/metrics/prometheus/gauge_exporter_test.go @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MPL-2.0 + +package prometheus + +import ( + "context" + "net/http/httptest" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + api "github.com/wippyai/runtime/api/metrics" + apicfg "github.com/wippyai/runtime/api/service/metrics" + impl "github.com/wippyai/runtime/service/metrics" +) + +// TestExporter_GaugeIncDecBalancesToZero is the regression test for the gauge +// Inc/Dec anomaly: GaugeInc/GaugeDec must be treated as additive deltas by the +// exporter, not absolute sets. With the bug, a balanced Inc(+1)/Dec(-1) was +// emitted as Set(-1), leaving gauges like wippy_function_in_flight reading -1. +func TestExporter_GaugeIncDecBalancesToZero(t *testing.T) { + coll := impl.NewCollector(apicfg.Config{}) + exp := NewExporter() + require.NoError(t, coll.RegisterExporter(exp)) + + labels := api.Labels{"k": "v"} + coll.GaugeInc("wippy_test_in_flight", labels) + coll.GaugeInc("wippy_test_in_flight", labels) + coll.GaugeDec("wippy_test_in_flight", labels) + coll.GaugeDec("wippy_test_in_flight", labels) + + require.NoError(t, coll.Close()) + + rec := httptest.NewRecorder() + exp.Handler().ServeHTTP(rec, httptest.NewRequestWithContext(context.Background(), "GET", "/metrics", nil)) + body := rec.Body.String() + require.Contains(t, body, "wippy_test_in_flight", "gauge series must be present") + + for _, line := range strings.Split(body, "\n") { + if strings.HasPrefix(line, "wippy_test_in_flight{") { + assert.True(t, strings.HasSuffix(strings.TrimSpace(line), " 0"), + "balanced GaugeInc/GaugeDec must read 0, got: %s", line) + return + } + } + t.Fatalf("wippy_test_in_flight sample line not found in output") +} + +// GaugeSet is absolute and must overwrite the running delta total. 
+func TestExporter_GaugeSetIsAbsolute(t *testing.T) { + coll := impl.NewCollector(apicfg.Config{}) + exp := NewExporter() + require.NoError(t, coll.RegisterExporter(exp)) + + labels := api.Labels{"k": "v"} + coll.GaugeInc("wippy_test_gauge", labels) // delta +1 + coll.GaugeSet("wippy_test_gauge", 42, labels) + coll.GaugeInc("wippy_test_gauge", labels) // delta +1 -> 43 + + require.NoError(t, coll.Close()) + + rec := httptest.NewRecorder() + exp.Handler().ServeHTTP(rec, httptest.NewRequestWithContext(context.Background(), "GET", "/metrics", nil)) + for _, line := range strings.Split(rec.Body.String(), "\n") { + if strings.HasPrefix(line, "wippy_test_gauge{") { + assert.True(t, strings.HasSuffix(strings.TrimSpace(line), " 43"), + "Set(42) then Inc must read 43, got: %s", line) + return + } + } + t.Fatalf("wippy_test_gauge sample line not found") +} diff --git a/service/otel/metrics.go b/service/otel/metrics.go index de1e4faec..e5f4abe4e 100644 --- a/service/otel/metrics.go +++ b/service/otel/metrics.go @@ -15,6 +15,7 @@ type MetricsExporter struct { meter otelmetric.Meter counters map[string]otelmetric.Float64Counter gauges map[string]otelmetric.Float64Gauge + upDowns map[string]otelmetric.Float64UpDownCounter histos map[string]otelmetric.Float64Histogram mu sync.RWMutex } @@ -24,6 +25,7 @@ func NewMetricsExporter(provider otelmetric.MeterProvider) *MetricsExporter { meter: provider.Meter("wippy-runtime"), counters: make(map[string]otelmetric.Float64Counter), gauges: make(map[string]otelmetric.Float64Gauge), + upDowns: make(map[string]otelmetric.Float64UpDownCounter), histos: make(map[string]otelmetric.Float64Histogram), } } @@ -50,6 +52,13 @@ func (e *MetricsExporter) Record(name string, typ api.MetricType, value float64, } gauge.Record(context.Background(), value, otelmetric.WithAttributes(attrs...)) + case api.TypeGaugeAdd: + udc, err := e.getOrCreateUpDownCounter(name) + if err != nil { + return err + } + udc.Add(context.Background(), value, 
otelmetric.WithAttributes(attrs...)) + case api.TypeHistogram: histo, err := e.getOrCreateHistogram(name) if err != nil { @@ -107,6 +116,29 @@ func (e *MetricsExporter) getOrCreateGauge(name string) (otelmetric.Float64Gauge return g, nil } +func (e *MetricsExporter) getOrCreateUpDownCounter(name string) (otelmetric.Float64UpDownCounter, error) { + e.mu.RLock() + u, ok := e.upDowns[name] + e.mu.RUnlock() + if ok { + return u, nil + } + + e.mu.Lock() + defer e.mu.Unlock() + + if u, ok = e.upDowns[name]; ok { + return u, nil + } + + u, err := e.meter.Float64UpDownCounter(name) + if err != nil { + return nil, err + } + e.upDowns[name] = u + return u, nil +} + // otelHistogramBuckets aligns OTel histogram bucket boundaries with the // Prometheus client_golang DefBuckets that our in-pod Prometheus exporter // uses (see service/metrics/prometheus/exporter.go). Without this, OTel's diff --git a/service/otel/service.go b/service/otel/service.go index f1faf47ad..d2d53eabe 100644 --- a/service/otel/service.go +++ b/service/otel/service.go @@ -260,17 +260,19 @@ func (i *interceptor) Handle(ctx stdcontext.Context, task runtime.Task, next fun var delivery *queueapi.Delivery var isQueueMessage bool - // Priority 0: Check for queue delivery in task.Context (before it's written to frame) + // Priority 0: Check for queue delivery in task.Context (before it's written to frame). + // Any queue delivery is a Consumer span - parented to the producer trace when a + // traceparent is present, otherwise a root Consumer span - so consume-side + // telemetry exists even for messages from non-instrumented publishers. 
for _, pair := range task.Context { if d, ok := pair.Value.(*queueapi.Delivery); ok { delivery = d - extractedCtx, hasSpan := extractFromDelivery(ctx, delivery) - if hasSpan { + isQueueMessage = true + if extractedCtx, hasSpan := extractFromDelivery(ctx, delivery); hasSpan { ctx = extractedCtx - ctx, span = i.tracer.Start(ctx, spanName, - trace.WithSpanKind(trace.SpanKindConsumer)) - isQueueMessage = true } + ctx, span = i.tracer.Start(ctx, spanName, + trace.WithSpanKind(trace.SpanKindConsumer)) break } } diff --git a/service/otel/service_span_test.go b/service/otel/service_span_test.go index 20b01bb11..84b52f066 100644 --- a/service/otel/service_span_test.go +++ b/service/otel/service_span_test.go @@ -20,6 +20,7 @@ import ( httpapi "github.com/wippyai/runtime/api/service/http" otelapi "github.com/wippyai/runtime/api/service/otel" "github.com/wippyai/runtime/internal/telemetrytest" + "github.com/wippyai/runtime/service/http/middleware/httpmetrics" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/propagation" @@ -85,6 +86,28 @@ func TestHTTPMiddleware_ExtractsParent(t *testing.T) { "server span must continue the parent trace") } +func TestInterceptor_SpanKind_QueueDeliveryWithoutTraceparent(t *testing.T) { + tp, sr := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + // Delivery with no traceparent (message from a non-instrumented publisher). 
+ delivery := &queueapi.Delivery{Message: &queueapi.Message{ID: "m2", Headers: attrsapi.NewBag()}} + inter := &interceptor{tracer: tp.Tracer("test"), logger: zap.NewNop()} + task := runtime.Task{ + ID: registry.NewID("ns", "func"), + Context: []ctxapi.Pair{{Value: delivery}}, + } + next := func(_ context.Context, _ runtime.Task) (*runtime.Result, error) { return &runtime.Result{}, nil } + + _, err := inter.Handle(context.Background(), task, next) + require.NoError(t, err) + + span := telemetrytest.MustSpanNamed(t, sr, "ns:func") + telemetrytest.SpanKind(t, span, trace.SpanKindConsumer) + telemetrytest.SpanHasStringAttr(t, span, "messaging.operation", "process") + telemetrytest.SpanHasStringAttr(t, span, "messaging.message.id", "m2") +} + func TestInterceptor_SpanKind_RootIsServer(t *testing.T) { tp, sr := telemetrytest.NewTracerProvider() defer func() { _ = tp.Shutdown(context.Background()) }() @@ -196,6 +219,27 @@ func TestProcessLifecycle_RootSpanWhenNoRemoteParent(t *testing.T) { assert.True(t, terminated, "unsupervised spawn must emit process.terminated") } +func TestHTTPMiddleware_StackedWithHttpmetrics_PreservesHijackFlush(t *testing.T) { + tp, _ := telemetrytest.NewTracerProvider() + defer func() { _ = tp.Shutdown(context.Background()) }() + + svc := NewService(otelapi.Config{HTTP: otelapi.HTTPConfig{Enabled: true}}, zap.NewNop(), tp) + httpmw := httpmetrics.CreateHTTPMetricsMiddleware(telemetrytest.NewRecorder())(nil) + + inner := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, isFlusher := w.(http.Flusher) + assert.True(t, isFlusher, "inner handler must see http.Flusher through otel+httpmetrics") + _, isHijacker := w.(http.Hijacker) + assert.True(t, isHijacker, "inner handler must see http.Hijacker through otel+httpmetrics") + w.WriteHeader(http.StatusOK) + }) + + // Stack order: otel (outer) -> httpmetrics -> handler, matching production. 
+ stacked := svc.HTTPMiddleware()(httpmw(inner)) + stacked.ServeHTTP(httptest.NewRecorder(), + httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil)) +} + func TestHTTPMiddleware_StatusRecorderPreservesFlusherAndHijacker(t *testing.T) { tp, _ := telemetrytest.NewTracerProvider() defer func() { _ = tp.Shutdown(context.Background()) }() From 11cf605fa4c5c7b483e7646279838eae5ba364c3 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 17:14:50 -0300 Subject: [PATCH 20/26] fix(lint): scope G124 exclude to outbound-cookie handler Why: CI lint failed - the per-line //nolint:gosec on the outbound cookie was flagged by nolintlint as unused in the full-module run, while removing it let gosec G124 fire. The repo convention is config-level gosec excludes (not per-line nolint). What: add a path-scoped G124 exclude for service/http/client/handler.go (the outbound request-cookie path, where Secure/HttpOnly/SameSite are response-side attributes that don't apply) and drop the per-line nolint. make lint is now clean (0 issues). --- .golangci.yml | 4 ++++ service/http/client/handler.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index 875c5341f..36b5b3634 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -163,6 +163,10 @@ linters: - linters: - gosec text: "G118" + - linters: + - gosec + text: "G124" + path: service/http/client/handler\.go - linters: - revive text: "unexported-return" diff --git a/service/http/client/handler.go b/service/http/client/handler.go index c16e9ac7d..a9b2ce7d9 100644 --- a/service/http/client/handler.go +++ b/service/http/client/handler.go @@ -245,7 +245,7 @@ func executeRequest(ctx context.Context, pool *Pool, networkReg netapi.NetworkRe for name, value := range req.Cookies { // Outbound HTTP request cookies; Secure/HttpOnly/SameSite are // response-side attributes the client cannot set meaningfully. 
- httpReq.AddCookie(&gohttp.Cookie{Name: name, Value: value}) //nolint:gosec // G124: client-side outbound cookie + httpReq.AddCookie(&gohttp.Cookie{Name: name, Value: value}) } if req.BasicAuthUser != "" { httpReq.SetBasicAuth(req.BasicAuthUser, req.BasicAuthPass) From 0a3ddf9359e56d891309bfbf50fe6eca6c001957 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 17:57:56 -0300 Subject: [PATCH 21/26] feat(otel): enrich resource with standard semconv and complete env compliance Why: exported spans/metrics only carried service.name/version - missing host.name, process.pid, os.type, runtime info that dashboards and backends use for filtering. Several standard OTEL env vars were also unsupported. What: - createResource merges WithHost/WithProcess/WithOS/WithTelemetrySDK onto resource.Default(), so every span/metric carries host.name, process.pid, os.type, process.runtime.*, telemetry.sdk.*. OTEL_RESOURCE_ATTRIBUTES is honored automatically via the Default() merge. - ApplyEnvOverrides honors OTEL_TRACES_SAMPLER (always_on/always_off win over the ratio arg) and OTEL_EXPORTER_OTLP_INSECURE. - Tests assert the standard attributes are present and cover the new env vars. --- service/otel/config.go | 25 +++++++++++++++++++++++++ service/otel/config_test.go | 22 ++++++++++++++++++++++ service/otel/provider.go | 27 ++++++++++++++++++++------- service/otel/provider_test.go | 18 ++++++++++++++++++ 4 files changed, 85 insertions(+), 7 deletions(-) diff --git a/service/otel/config.go b/service/otel/config.go index 385b830f9..ddab48c7b 100644 --- a/service/otel/config.go +++ b/service/otel/config.go @@ -183,6 +183,31 @@ func ApplyEnvOverrides(cfg *otelapi.Config, logger *zap.Logger) { } } + // OTEL_TRACES_SAMPLER (sampler type). Applied after the ARG so explicit + // always_on/always_off win over a ratio argument.
+ if sampler := os.Getenv("OTEL_TRACES_SAMPLER"); sampler != "" { + switch strings.ToLower(strings.TrimSpace(sampler)) { + case "always_on": + cfg.SampleRate = 1.0 + case "always_off": + cfg.SampleRate = 0.0 + case "traceidratio", "parentbased_traceidratio": + // ratio comes from OTEL_TRACES_SAMPLER_ARG (handled above) + default: + logger.Warn("unsupported traces sampler, ignoring", + zap.String("var", "OTEL_TRACES_SAMPLER"), + zap.String("value", sampler)) + } + } + + // OTEL_EXPORTER_OTLP_INSECURE + if insecure := os.Getenv("OTEL_EXPORTER_OTLP_INSECURE"); insecure != "" { + cfg.Insecure = strings.ToLower(insecure) == "true" + logger.Debug("using insecure from env", + zap.String("var", "OTEL_EXPORTER_OTLP_INSECURE"), + zap.Bool("value", cfg.Insecure)) + } + // OTEL_PROPAGATORS if propagators := os.Getenv("OTEL_PROPAGATORS"); propagators != "" { list := strings.Split(propagators, ",") diff --git a/service/otel/config_test.go b/service/otel/config_test.go index 2b959a518..2e1a2402c 100644 --- a/service/otel/config_test.go +++ b/service/otel/config_test.go @@ -133,6 +133,28 @@ func TestApplyEnvOverrides_SampleRateInvalid(t *testing.T) { assert.Equal(t, originalRate, cfg.SampleRate) } +func TestApplyEnvOverrides_TracesSamplerAlwaysOn(t *testing.T) { + cfg := DefaultConfig() + t.Setenv("OTEL_TRACES_SAMPLER", "always_on") + t.Setenv("OTEL_TRACES_SAMPLER_ARG", "0.25") + ApplyEnvOverrides(&cfg, zap.NewNop()) + assert.Equal(t, 1.0, cfg.SampleRate, "always_on must win over sampler arg") +} + +func TestApplyEnvOverrides_TracesSamplerAlwaysOff(t *testing.T) { + cfg := DefaultConfig() + t.Setenv("OTEL_TRACES_SAMPLER", "always_off") + ApplyEnvOverrides(&cfg, zap.NewNop()) + assert.Equal(t, 0.0, cfg.SampleRate) +} + +func TestApplyEnvOverrides_Insecure(t *testing.T) { + cfg := DefaultConfig() + t.Setenv("OTEL_EXPORTER_OTLP_INSECURE", "true") + ApplyEnvOverrides(&cfg, zap.NewNop()) + assert.True(t, cfg.Insecure) +} + func TestApplyEnvOverrides_Propagators(t *testing.T) { cfg := 
DefaultConfig() logger := zap.NewNop() diff --git a/service/otel/provider.go b/service/otel/provider.go index dc8e3bb94..2d13d3d12 100644 --- a/service/otel/provider.go +++ b/service/otel/provider.go @@ -146,24 +146,37 @@ func createHTTPExporter(ctx context.Context, cfg otelapi.Config, logger *zap.Log return otlptracehttp.New(ctx, opts...) } -// createResource creates an OTEL resource with service information +// createResource creates an OTEL resource with service information, merged +// onto the SDK default resource so every exported span and metric also carries +// standard process/host/os semconv attributes (host.name, process.pid, +// os.type, etc.). func createResource(cfg otelapi.Config) (*resource.Resource, error) { - attrs := []resource.Option{ + opts := []resource.Option{ resource.WithAttributes( semconv.ServiceName(cfg.ServiceName), ), + resource.WithHost(), + resource.WithProcess(), + resource.WithOS(), + resource.WithTelemetrySDK(), } if cfg.ServiceVersion != "" { - attrs = append(attrs, resource.WithAttributes( + opts = append(opts, resource.WithAttributes( semconv.ServiceVersion(cfg.ServiceVersion), )) } - return resource.New( - context.Background(), - attrs..., - ) + res, err := resource.New(context.Background(), opts...) 
+ if err != nil { + return nil, err + } + + merged, err := resource.Merge(resource.Default(), res) + if err != nil { + return nil, err + } + return merged, nil } // createSampler creates a trace sampler based on sample rate diff --git a/service/otel/provider_test.go b/service/otel/provider_test.go index 2eb697750..2dba21384 100644 --- a/service/otel/provider_test.go +++ b/service/otel/provider_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" otelapi "github.com/wippyai/runtime/api/service/otel" + "go.opentelemetry.io/otel/attribute" metricnoop "go.opentelemetry.io/otel/metric/noop" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace/noop" @@ -211,6 +212,23 @@ func TestCreateResource_WithVersion(t *testing.T) { require.NotNil(t, res) } +func TestCreateResource_MergesStandardAttributes(t *testing.T) { + res, err := createResource(otelapi.Config{ServiceName: "test-service"}) + require.NoError(t, err) + + get := func(k string) string { + v, ok := res.Set().Value(attribute.Key(k)) + if !ok { + return "" + } + return v.Emit() + } + assert.Equal(t, "test-service", get("service.name")) + for _, key := range []string{"host.name", "process.pid", "os.type", "telemetry.sdk.name"} { + assert.NotEmpty(t, get(key), "resource should carry standard semconv %q", key) + } +} + func TestInitializeMeterProvider_Disabled(t *testing.T) { cfg := otelapi.Config{ Enabled: false, From 38c43d7362ab07e2ab973b1ef7faaf4a6d69ad30 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 18:05:58 -0300 Subject: [PATCH 22/26] feat(store/kv): emit operation count and latency metrics Why: KV is a hot data path with no metrics - operators could not see operation rate, error rate, or latency by namespace/op. Per-op spans would flood traces (anti-pattern for a cache layer), so metrics are the correct signal. 
What: add a nil-safe telemetry layer emitting wippy_kv_ops_total (namespace, op, result=ok|not_found|error) and wippy_kv_op_duration_seconds (namespace, op, result). The collector is captured from context at Start (no constructor change). Instrumented Get/Set/Delete/Put. Also fix a deprecated Emit() call in provider_test.go (use Value.String()). --- service/otel/provider_test.go | 2 +- service/store/kv/store.go | 24 ++++++++++++++++--- service/store/kv/telemetry.go | 37 ++++++++++++++++++++++++++++++ service/store/kv/telemetry_test.go | 35 ++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 service/store/kv/telemetry.go create mode 100644 service/store/kv/telemetry_test.go diff --git a/service/otel/provider_test.go b/service/otel/provider_test.go index 2dba21384..5f52fda76 100644 --- a/service/otel/provider_test.go +++ b/service/otel/provider_test.go @@ -221,7 +221,7 @@ func TestCreateResource_MergesStandardAttributes(t *testing.T) { if !ok { return "" } - return v.Emit() + return v.String() } assert.Equal(t, "test-service", get("service.name")) for _, key := range []string{"host.name", "process.pid", "os.type", "telemetry.sdk.name"} { diff --git a/service/store/kv/store.go b/service/store/kv/store.go index dd44ff1df..c9a8a3e78 100644 --- a/service/store/kv/store.go +++ b/service/store/kv/store.go @@ -7,7 +7,9 @@ import ( "errors" "sort" "sync" + "time" + "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" "github.com/wippyai/runtime/api/registry" "github.com/wippyai/runtime/api/resource" @@ -42,6 +44,7 @@ type Store struct { engine kvapi.Engine dtt payload.Transcoder log *zap.Logger + coll metrics.Collector statusChan chan any id registry.ID namespace string @@ -81,10 +84,13 @@ func NewStoreWithInfo(id registry.ID, namespace string, engine kvapi.Engine, dtt } // Start implements supervisor.Service. 
-func (s *Store) Start(_ context.Context) (<-chan any, error) { +func (s *Store) Start(ctx context.Context) (<-chan any, error) { s.mu.Lock() defer s.mu.Unlock() s.closed = false + if s.coll == nil { + s.coll = metrics.GetCollector(ctx) + } select { case s.statusChan <- "store.kv started": default: @@ -119,7 +125,9 @@ func (s *Store) StoreInfo(_ context.Context) store.Info { // Get implements store.Store. func (s *Store) Get(_ context.Context, key registry.ID) (payload.Payload, error) { + start := time.Now() ent, err := s.engine.Get(physicalKey(s.namespace, key)) + recordKVOp(s.coll, s.namespace, "get", kvResult(err), time.Since(start)) if err != nil { return nil, mapNotFound(err) } @@ -137,26 +145,34 @@ func (s *Store) Entry(_ context.Context, key registry.ID) (store.VersionedEntry, // Set implements store.Store. A non-zero TTL binds the key to a fresh lease. func (s *Store) Set(ctx context.Context, entry store.Entry) error { + start := time.Now() b, err := encodeValue(s.dtt, entry.Value) if err != nil { + recordKVOp(s.coll, s.namespace, "set", "error", time.Since(start)) return err } phys := physicalKey(s.namespace, entry.Key) if entry.TTL > 0 { lease, err := s.engine.GrantLease(ctx, entry.TTL) if err != nil { + recordKVOp(s.coll, s.namespace, "set", "error", time.Since(start)) return err } _, err = s.engine.SetWithLease(phys, b, lease.ID()) + recordKVOp(s.coll, s.namespace, "set", kvResult(err), time.Since(start)) return err } _, err = s.engine.Set(phys, b) + recordKVOp(s.coll, s.namespace, "set", kvResult(err), time.Since(start)) return err } // Delete implements store.Store. func (s *Store) Delete(_ context.Context, key registry.ID) error { - return mapNotFound(s.engine.Delete(physicalKey(s.namespace, key))) + start := time.Now() + err := s.engine.Delete(physicalKey(s.namespace, key)) + recordKVOp(s.coll, s.namespace, "delete", kvResult(err), time.Since(start)) + return mapNotFound(err) } // Has implements store.Store. 
@@ -241,7 +257,9 @@ func (s *Store) SetIfAbsent(ctx context.Context, entry store.Entry) (bool, error } // Put implements store.Putter. -func (s *Store) Put(ctx context.Context, key registry.ID, value payload.Payload, opts store.PutOptions) (store.VersionedEntry, error) { +func (s *Store) Put(ctx context.Context, key registry.ID, value payload.Payload, opts store.PutOptions) (ve store.VersionedEntry, err error) { + start := time.Now() + defer func() { recordKVOp(s.coll, s.namespace, "put", kvResult(err), time.Since(start)) }() if opts.OnlyIfAbsent && opts.HasVersion { return store.VersionedEntry{}, store.ErrInvalidOptions } diff --git a/service/store/kv/telemetry.go b/service/store/kv/telemetry.go new file mode 100644 index 000000000..8cc865a0d --- /dev/null +++ b/service/store/kv/telemetry.go @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MPL-2.0 + +package kv + +import ( + "errors" + "time" + + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/store/kv" +) + +const ( + kvOpsTotal = "wippy_kv_ops_total" + kvOpDuration = "wippy_kv_op_duration_seconds" +) + +// recordKVOp emits per-operation count and latency. Nil-safe. +func recordKVOp(coll metrics.Collector, namespace, op, result string, duration time.Duration) { + if coll == nil { + return + } + labels := metrics.Labels{"namespace": namespace, "op": op, "result": result} + coll.CounterInc(kvOpsTotal, labels) + coll.HistogramObserve(kvOpDuration, duration.Seconds(), labels) +} + +// kvResult classifies an engine error into a label value. 
+func kvResult(err error) string { + if err == nil { + return "ok" + } + if errors.Is(err, kv.ErrKeyNotFound) { + return "not_found" + } + return "error" +} diff --git a/service/store/kv/telemetry_test.go b/service/store/kv/telemetry_test.go new file mode 100644 index 000000000..649e5f469 --- /dev/null +++ b/service/store/kv/telemetry_test.go @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: MPL-2.0 + +package kv + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/registry" + "github.com/wippyai/runtime/api/store" + "github.com/wippyai/runtime/internal/telemetrytest" +) + +func TestStore_Telemetry_EmitsOpsMetrics(t *testing.T) { + ctx := context.Background() + s := newTestStore(t, "teltest") + rec := telemetrytest.NewRecorder() + s.coll = rec + + key := registry.ParseID("app:k1") + require.NoError(t, s.Set(ctx, store.Entry{Key: key, Value: jsonVal(`"v1"`)})) + _, err := s.Get(ctx, key) + require.NoError(t, err) + require.NoError(t, s.Delete(ctx, key)) + _, _ = s.Get(ctx, key) + + assert.Equal(t, 1.0, rec.CounterValue(kvOpsTotal, metrics.Labels{"namespace": "teltest", "op": "set", "result": "ok"})) + assert.Equal(t, 1.0, rec.CounterValue(kvOpsTotal, metrics.Labels{"namespace": "teltest", "op": "get", "result": "ok"})) + assert.Equal(t, 1.0, rec.CounterValue(kvOpsTotal, metrics.Labels{"namespace": "teltest", "op": "delete", "result": "ok"})) + assert.Equal(t, 1.0, rec.CounterValue(kvOpsTotal, metrics.Labels{"namespace": "teltest", "op": "get", "result": "not_found"})) + assert.Equal(t, uint64(1), rec.HistogramCount(kvOpDuration, metrics.Labels{"namespace": "teltest", "op": "get", "result": "ok"})) +} From 009b4259bd65190a185408e0abe89120352c537c Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 18:10:29 -0300 Subject: [PATCH 23/26] feat(sql): emit query/exec count and latency metrics Why: SQL is a core data path 
with no metrics - operators could not see query rate, error rate, or latency. Per-query spans would flood traces. What: add SetCollector to the SQL dispatcher and emit wippy_sql_ops_total (op=query|exec, result=ok|error) and wippy_sql_op_duration_seconds on all 6 query/exec handlers (direct, prepared-statement, and transaction variants). The collector is wired from the boot component. --- boot/components/dispatchers/sql_dispatcher.go | 2 ++ service/sql/dispatcher.go | 31 ++++++++++++------- service/sql/telemetry.go | 28 +++++++++++++++++ 3 files changed, 50 insertions(+), 11 deletions(-) create mode 100644 service/sql/telemetry.go diff --git a/boot/components/dispatchers/sql_dispatcher.go b/boot/components/dispatchers/sql_dispatcher.go index 8e4d4c78d..8ca0577c6 100644 --- a/boot/components/dispatchers/sql_dispatcher.go +++ b/boot/components/dispatchers/sql_dispatcher.go @@ -7,6 +7,7 @@ import ( "github.com/wippyai/runtime/api/boot" dispatcherapi "github.com/wippyai/runtime/api/dispatcher" + metricsapi "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/service/sql" ) @@ -20,6 +21,7 @@ func SQL() boot.Component { return ctx, ErrDispatcherNotFound } svc := sql.NewDispatcher() + svc.SetCollector(metricsapi.GetCollector(ctx)) svc.RegisterAll(reg.Register) return ctx, nil }, diff --git a/service/sql/dispatcher.go b/service/sql/dispatcher.go index c9df1fc8d..d56807a4f 100644 --- a/service/sql/dispatcher.go +++ b/service/sql/dispatcher.go @@ -6,29 +6,26 @@ package sql import ( "context" "database/sql" + "time" "github.com/wippyai/runtime/api/dispatcher" + "github.com/wippyai/runtime/api/metrics" sqlapi "github.com/wippyai/runtime/api/service/sql" ) // Dispatcher handles SQL commands using a stateless goroutine pattern. -// -// Each command spawns a goroutine that executes the SQL operation and -// delivers results via the receiver. 
Goroutine lifecycle is tied to context: -// when context is canceled, operations check ctx.Err() and skip result -// delivery, allowing natural termination. -// -// Resource cleanup is handled by the Store layer (Store.Close releases -// connections) and FrameContext cleanup (commits/rollbacks transactions). -// This pattern is consistent with other stateless dispatchers in the system -// (contract, function) where explicit goroutine tracking isn't needed. -type Dispatcher struct{} +type Dispatcher struct { + coll metrics.Collector +} // NewDispatcher creates a new SQL dispatcher. func NewDispatcher() *Dispatcher { return &Dispatcher{} } +// SetCollector binds a metrics collector for telemetry. +func (d *Dispatcher) SetCollector(c metrics.Collector) { d.coll = c } + // Start is a no-op since this dispatcher has no background workers. func (d *Dispatcher) Start(_ context.Context) error { return nil @@ -58,7 +55,9 @@ func (d *Dispatcher) RegisterAll(register func(id dispatcher.CommandID, h dispat func (d *Dispatcher) handleQuery(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { qc := cmd.(*sqlapi.QueryCmd) go func() { + start := time.Now() resp := executeQuery(ctx, qc.DB, qc.Query, qc.Params) + recordSQLOp(d.coll, "query", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } @@ -69,7 +68,9 @@ func (d *Dispatcher) handleQuery(ctx context.Context, cmd dispatcher.Command, ta func (d *Dispatcher) handleExecute(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { ec := cmd.(*sqlapi.ExecuteCmd) go func() { + start := time.Now() resp := executeExec(ctx, ec.DB, ec.Query, ec.Params) + recordSQLOp(d.coll, "exec", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } @@ -110,7 +111,9 @@ func (d *Dispatcher) handleBegin(ctx context.Context, cmd dispatcher.Command, ta func (d *Dispatcher) 
handleStmtQuery(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { sc := cmd.(*sqlapi.StmtQueryCmd) go func() { + start := time.Now() resp := executeStmtQuery(ctx, sc.Stmt, sc.Params) + recordSQLOp(d.coll, "query", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } @@ -121,7 +124,9 @@ func (d *Dispatcher) handleStmtQuery(ctx context.Context, cmd dispatcher.Command func (d *Dispatcher) handleStmtExecute(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { sc := cmd.(*sqlapi.StmtExecuteCmd) go func() { + start := time.Now() resp := executeStmtExec(ctx, sc.Stmt, sc.Params) + recordSQLOp(d.coll, "exec", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } @@ -141,7 +146,9 @@ func (d *Dispatcher) handleStmtClose(_ context.Context, cmd dispatcher.Command, func (d *Dispatcher) handleTxQuery(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { tc := cmd.(*sqlapi.TxQueryCmd) go func() { + start := time.Now() resp := executeTxQuery(ctx, tc.Tx, tc.Query, tc.Params) + recordSQLOp(d.coll, "query", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } @@ -152,7 +159,9 @@ func (d *Dispatcher) handleTxQuery(ctx context.Context, cmd dispatcher.Command, func (d *Dispatcher) handleTxExecute(ctx context.Context, cmd dispatcher.Command, tag uint64, receiver dispatcher.ResultReceiver) error { tc := cmd.(*sqlapi.TxExecuteCmd) go func() { + start := time.Now() resp := executeTxExec(ctx, tc.Tx, tc.Query, tc.Params) + recordSQLOp(d.coll, "exec", resp.Error, time.Since(start)) if ctx.Err() == nil { receiver.CompleteYield(tag, resp, nil) } diff --git a/service/sql/telemetry.go b/service/sql/telemetry.go new file mode 100644 index 000000000..dfe884f58 --- /dev/null +++ b/service/sql/telemetry.go @@ -0,0 +1,28 @@ +// 
SPDX-License-Identifier: MPL-2.0 + +package sql + +import ( + "time" + + "github.com/wippyai/runtime/api/metrics" +) + +const ( + sqlOpsTotal = "wippy_sql_ops_total" + sqlOpDuration = "wippy_sql_op_duration_seconds" +) + +// recordSQLOp emits per-operation count and latency. Nil-safe. +func recordSQLOp(coll metrics.Collector, op string, err error, duration time.Duration) { + if coll == nil { + return + } + result := "ok" + if err != nil { + result = "error" + } + labels := metrics.Labels{"op": op, "result": result} + coll.CounterInc(sqlOpsTotal, labels) + coll.HistogramObserve(sqlOpDuration, duration.Seconds(), labels) +} From c00b3d523ec255bb851d9063fb1d3f8f0f1fef53 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sat, 27 Jun 2026 19:49:36 -0300 Subject: [PATCH 24/26] docs(otel): document gauge type constraint in MetricsExporter Per code review: the OTel SDK rejects a second instrument kind for an existing metric name, so a metric must not mix TypeGauge (Set) and TypeGaugeAdd (Inc/Dec) on the same name. Add a comment to prevent future misuse. --- service/otel/metrics.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/service/otel/metrics.go b/service/otel/metrics.go index e5f4abe4e..9b6958fd8 100644 --- a/service/otel/metrics.go +++ b/service/otel/metrics.go @@ -21,6 +21,11 @@ type MetricsExporter struct { } func NewMetricsExporter(provider otelmetric.MeterProvider) *MetricsExporter { + // Note: TypeGauge and TypeGaugeAdd map to different OTel instrument kinds + // (Float64Gauge vs Float64UpDownCounter). The OTel SDK keys instruments by + // name, so a metric must not mix GaugeSet and GaugeInc/Dec on the same + // name — the second instrument kind registration returns an error and that + // metric silently stops exporting. Use one or the other per metric name. 
return &MetricsExporter{ meter: provider.Meter("wippy-runtime"), counters: make(map[string]otelmetric.Float64Counter), From 8af8e647b31f72a50026864115f4aba0601e3423 Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sun, 28 Jun 2026 12:59:02 -0300 Subject: [PATCH 25/26] fix(store/memory): add KV metrics to memory store backend Why: KV metrics were only on the CRDT-backed Store (service/store/kv), but the test app (and many deployments) use memory stores (kind: store.memory) which go through service/store/memory/Store - a completely separate code path. KV metrics were absent in real-binary testing because of this gap. What: add the same telemetry layer (wippy_kv_ops_total / wippy_kv_op_duration_seconds) to the memory store backend. The collector is captured at Start via context (same pattern as the CRDT store). Instrumented Get/Set/Delete/Put with named-returns + defer for accurate result classification (ok/not_found/error). --- service/store/memory/memstore.go | 37 +++++++++++++++++++++++++++---- service/store/memory/telemetry.go | 36 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 service/store/memory/telemetry.go diff --git a/service/store/memory/memstore.go b/service/store/memory/memstore.go index 76955c75f..3ba973ce6 100644 --- a/service/store/memory/memstore.go +++ b/service/store/memory/memstore.go @@ -4,6 +4,7 @@ package memory import ( "context" + "errors" "sort" "strings" "sync" @@ -11,6 +12,7 @@ import ( memstore "github.com/wippyai/runtime/api/service/store/memory" + "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" "github.com/wippyai/runtime/api/registry" "github.com/wippyai/runtime/api/resource" @@ -36,6 +38,7 @@ var ( type Store struct { config *memstore.Config log *zap.Logger + coll metrics.Collector data map[string]*storeEntry statusChan chan any stopChan chan struct{} @@ -79,6 +82,10 @@ func (m *Store) Start(ctx context.Context) (<-chan any, error) { m.mu.Lock() 
defer m.mu.Unlock() + if m.coll == nil { + m.coll = metrics.GetCollector(ctx) + } + if m.closed { return nil, servicestore.ErrStoreClosed } @@ -131,7 +138,20 @@ func (m *Store) Stop(ctx context.Context) error { } // Get retrieves a value by key -func (m *Store) Get(_ context.Context, key registry.ID) (payload.Payload, error) { +func (m *Store) Get(_ context.Context, key registry.ID) (val payload.Payload, err error) { + start := time.Now() + defer func() { + result := "ok" + if err != nil { + if errors.Is(err, store.ErrKeyNotFound) { + result = "not_found" + } else { + result = "error" + } + } + recordOp(m.coll, m.id.String(), "get", result, time.Since(start)) + }() + m.mu.Lock() defer m.mu.Unlock() @@ -182,7 +202,10 @@ func (m *Store) Entry(_ context.Context, key registry.ID) (store.VersionedEntry, } // Set stores or updates a value with the given key -func (m *Store) Set(_ context.Context, entry store.Entry) error { +func (m *Store) Set(_ context.Context, entry store.Entry) (err error) { + start := time.Now() + defer func() { recordOp(m.coll, m.id.String(), "set", storeErrResult(err), time.Since(start)) }() + m.mu.Lock() defer m.mu.Unlock() @@ -220,7 +243,10 @@ func (m *Store) Set(_ context.Context, entry store.Entry) error { } // Put stores a value with optional absent/version preconditions. 
-func (m *Store) Put(_ context.Context, key registry.ID, value payload.Payload, opts store.PutOptions) (store.VersionedEntry, error) { +func (m *Store) Put(_ context.Context, key registry.ID, value payload.Payload, opts store.PutOptions) (ve store.VersionedEntry, err error) { + start := time.Now() + defer func() { recordOp(m.coll, m.id.String(), "put", storeErrResult(err), time.Since(start)) }() + m.mu.Lock() defer m.mu.Unlock() @@ -278,7 +304,10 @@ func (m *Store) Put(_ context.Context, key registry.ID, value payload.Payload, o } // Delete removes a value with the given key -func (m *Store) Delete(_ context.Context, key registry.ID) error { +func (m *Store) Delete(_ context.Context, key registry.ID) (err error) { + start := time.Now() + defer func() { recordOp(m.coll, m.id.String(), "delete", storeErrResult(err), time.Since(start)) }() + m.mu.Lock() defer m.mu.Unlock() diff --git a/service/store/memory/telemetry.go b/service/store/memory/telemetry.go new file mode 100644 index 000000000..2f3688493 --- /dev/null +++ b/service/store/memory/telemetry.go @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MPL-2.0 + +package memory + +import ( + "errors" + "time" + + "github.com/wippyai/runtime/api/metrics" + "github.com/wippyai/runtime/api/store" +) + +const ( + memOpsTotal = "wippy_kv_ops_total" + memOpDuration = "wippy_kv_op_duration_seconds" +) + +func storeErrResult(err error) string { + if err == nil { + return "ok" + } + if errors.Is(err, store.ErrKeyNotFound) || errors.Is(err, store.ErrKeyExists) || errors.Is(err, store.ErrVersionMismatch) { + return "not_found" + } + return "error" +} + +// recordOp emits per-operation count and latency. Nil-safe. 
+func recordOp(coll metrics.Collector, namespace, op, result string, duration time.Duration) { + if coll == nil { + return + } + labels := metrics.Labels{"namespace": namespace, "op": op, "result": result} + coll.CounterInc(memOpsTotal, labels) + coll.HistogramObserve(memOpDuration, duration.Seconds(), labels) +} From 7c0fb058d4cb9d6de7e095b20153d532099e8bad Mon Sep 17 00:00:00 2001 From: Rodrigo Delduca Date: Sun, 28 Jun 2026 13:12:09 -0300 Subject: [PATCH 26/26] fix(store/memory): wire collector from boot through manager to stores Why: KV metrics were absent in the real binary because the memory store manager created stores without a collector, and Start's context didn't carry the app-level metrics collector. The test app (and many deployments) use memory stores (kind: store.memory) exclusively. What: thread the collector from the boot component (where it's available) through NewManager into each Store at creation time (before Start), so KV metrics emit regardless of whether Start's context carries the collector. Verified: wippy_kv_ops_total = 21 in real-binary scrape. 
--- boot/components/store/memstore.go | 3 +++ service/store/memory/manager.go | 6 ++++++ service/store/memory/manager_test.go | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/boot/components/store/memstore.go b/boot/components/store/memstore.go index c606b3139..c7bb3577e 100644 --- a/boot/components/store/memstore.go +++ b/boot/components/store/memstore.go @@ -8,6 +8,7 @@ import ( "github.com/wippyai/runtime/api/boot" "github.com/wippyai/runtime/api/event" logapi "github.com/wippyai/runtime/api/logs" + metricsapi "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" bootpkg "github.com/wippyai/runtime/boot" memorystore "github.com/wippyai/runtime/service/store/memory" @@ -22,11 +23,13 @@ func MemStore() boot.Component { dtt := payload.GetTranscoder(ctx) bus := event.GetBus(ctx) handlers := bootpkg.GetHandlerRegistry(ctx) + coll := metricsapi.GetCollector(ctx) manager := memorystore.NewManager( bus, dtt, logger.Named("memory"), + coll, ) handlers.RegisterListener("store.memory", manager) diff --git a/service/store/memory/manager.go b/service/store/memory/manager.go index 86ce2bba2..607e0651a 100644 --- a/service/store/memory/manager.go +++ b/service/store/memory/manager.go @@ -7,6 +7,7 @@ import ( "sync" "github.com/wippyai/runtime/api/event" + "github.com/wippyai/runtime/api/metrics" "github.com/wippyai/runtime/api/payload" "github.com/wippyai/runtime/api/registry" "github.com/wippyai/runtime/api/resource" @@ -22,6 +23,7 @@ type Manager struct { dtt payload.Transcoder bus event.Bus log *zap.Logger + coll metrics.Collector stores map[registry.ID]*Store mu sync.RWMutex } @@ -31,6 +33,7 @@ func NewManager( bus event.Bus, dtt payload.Transcoder, log *zap.Logger, + coll metrics.Collector, ) *Manager { if log == nil { log = zap.NewNop() @@ -39,6 +42,7 @@ func NewManager( log: log, dtt: dtt, bus: bus, + coll: coll, stores: make(map[registry.ID]*Store), } } @@ -64,6 +68,7 @@ func (m *Manager) Add(ctx context.Context, entry 
registry.Entry) error { // Create memory store store := NewStore(entry.ID, cfg, m.log) + store.coll = m.coll m.stores[entry.ID] = store // Register with supervisor @@ -129,6 +134,7 @@ func (m *Manager) Update(ctx context.Context, entry registry.Entry) error { // Create new store with updated config newStore := NewStore(entry.ID, cfg, m.log) + newStore.coll = m.coll m.stores[entry.ID] = newStore // Update supervisor entry diff --git a/service/store/memory/manager_test.go b/service/store/memory/manager_test.go index 1a0300fb6..586c27a31 100644 --- a/service/store/memory/manager_test.go +++ b/service/store/memory/manager_test.go @@ -59,7 +59,7 @@ func newTestManager(_ *testing.T) (*Manager, *mockBus) { json.Register(transcoder) bus := &mockBus{} log := zap.NewNop() - mgr := NewManager(bus, transcoder, log) + mgr := NewManager(bus, transcoder, log, nil) return mgr, bus }