diff --git a/docker-compose/otel-compose.yaml b/docker-compose/otel-compose.yaml index 95379d5..584823d 100644 --- a/docker-compose/otel-compose.yaml +++ b/docker-compose/otel-compose.yaml @@ -1,6 +1,6 @@ services: otel-collector: - image: otel/opentelemetry-collector-contrib:0.118.0 + image: otel/opentelemetry-collector-contrib:0.144.0 container_name: otel-collector command: [ @@ -14,10 +14,7 @@ services: # Host filesystem mounts so the hostmetrics receiver can read the VM's # CPU / memory / disk / network / process stats rather than the # collector container's own (containerized) view. - - /proc:/hostfs/proc:ro - - /sys:/hostfs/sys:ro - - /etc/os-release:/hostfs/etc/os-release:ro - - /:/hostfs/root:ro,rslave + - /:/hostfs:ro,rslave ports: - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver @@ -40,6 +37,11 @@ services: # this because they read via the Docker socket. - nginx_network - apache_network + # Note: bridge networking is required here for active prometheus scraping + # of named docker networks. For scraping host-networked containers + # (network_mode: host), switch to network_mode: host on this service, + # remove the networks: block, use localhost: as scrape target, + # and change logspout command to syslog+tcp://localhost:2255. logspout: image: "gliderlabs/logspout:v3.2.14" diff --git a/docker-compose/otel-config.yaml b/docker-compose/otel-config.yaml index c8b1244..c84b6c5 100644 --- a/docker-compose/otel-config.yaml +++ b/docker-compose/otel-config.yaml @@ -89,19 +89,40 @@ receivers: metrics: system.filesystem.utilization: enabled: true + exclude_fs_types: + fs_types: + - squashfs + - tmpfs + - devtmpfs + - overlay + - nsfs + - autofs + - cgroup + - cgroup2 + - proc + - sysfs + match_type: strict exclude_mount_points: mount_points: - - /hostfs/sys/* - - /hostfs/proc/* - - /var/lib/docker/* - - /hostfs/root/var/lib/docker/* + - /hostfs/sys/.* + - /hostfs/proc/.* + - /hostfs/dev/.* + - /hostfs/run/.* + - /hostfs/var/lib/docker/.* + - /hostfs/snap/.* + - /hostfs/boot.* + - /var/lib/docker/.* match_type: regexp network: paging: processes: - # mute_process_*_error: true silences the routine "permission denied" - # noise for kernel threads and short-lived processes whose - # /proc//{exe,io,cmdline} entries can't be read. + # mute_process_*_error: true silences "permission denied" noise for + # kernel threads and short-lived processes. Note: these flags do NOT + # cover TOCTOU races — if a process exits between PID discovery and + # stat read, /proc//stat returns ENOENT and is still logged as + # an error. On busy hosts with many short-lived processes (kworkers, + # sshd child processes), remove the process: scraper entirely and rely + # on docker_stats for per-container granularity instead. process: mute_process_user_error: true mute_process_io_error: true diff --git a/gemini-cli/.env.example b/gemini-cli/.env.example new file mode 100644 index 0000000..4e45a33 --- /dev/null +++ b/gemini-cli/.env.example @@ -0,0 +1,5 @@ +# Last9 OTLP credentials — get these from: +# https://app.last9.io/integrations?integration=OpenTelemetry + +LAST9_OTLP_ENDPOINT=https://otlp.last9.io +LAST9_OTLP_AUTH=Basic diff --git a/gemini-cli/.gitignore b/gemini-cli/.gitignore new file mode 100644 index 0000000..f0a6075 --- /dev/null +++ b/gemini-cli/.gitignore @@ -0,0 +1,11 @@ +# Environment/secrets +.env +.env.local +.env.*.local + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/gemini-cli/README.md b/gemini-cli/README.md new file mode 100644 index 0000000..16170b9 --- /dev/null +++ b/gemini-cli/README.md @@ -0,0 +1,137 @@ +# Gemini CLI — OpenTelemetry to Last9 + +Send Google's [Gemini CLI](https://github.com/google-gemini/gemini-cli) telemetry — traces, metrics, and logs — to Last9 via OpenTelemetry. + +Gemini CLI emits all three OpenTelemetry signal types. Keep all three on — combined volume is low and traces add per-call model latency / token attribution that metrics alone cannot. + +- **Traces** — spans for tool calls, API requests, agent runs (set `GEMINI_TELEMETRY_TRACES_ENABLED=true`) +- **Metrics** — session counts, token usage, latency, file ops, agent durations +- **Logs** — structured events for prompts, API requests/responses, slash commands, file operations + +## Prerequisites + +- A Last9 account ([app.last9.io](https://app.last9.io)) +- OTLP credentials from **Integrations → OpenTelemetry** in the Last9 dashboard +- Gemini CLI installed (`npm install -g @google/gemini-cli`) + +## Option 1 — Direct Export to Last9 (no Collector) + +Gemini CLI's OTLP exporters take `url` only, but the underlying OpenTelemetry JS SDK reads standard `OTEL_EXPORTER_OTLP_HEADERS` from env for authentication — so direct export works. + +```bash +# Gemini CLI telemetry switches +export GEMINI_TELEMETRY_ENABLED=true +export GEMINI_TELEMETRY_TARGET=local +export GEMINI_TELEMETRY_OTLP_ENDPOINT="https://" +export GEMINI_TELEMETRY_OTLP_PROTOCOL=http +export GEMINI_TELEMETRY_TRACES_ENABLED=true + +# Standard OTel env — picked up by SDK for auth headers +export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic " +``` + +Reload and run: + +```bash +source ~/.zshrc +gemini -p "explain this repo" +``` + +## Option 2 — Local OpenTelemetry Collector + +Useful for batching, retries, or fan-out to multiple backends. + +1. Copy `.env.example` to `.env` and fill in your Last9 credentials: + + ```bash + cp .env.example .env + ``` + +2. Start the collector: + + ```bash + docker compose up -d + ``` + +3. Point Gemini CLI at the local collector: + + ```bash + export GEMINI_TELEMETRY_ENABLED=true + export GEMINI_TELEMETRY_TARGET=local + export GEMINI_TELEMETRY_OTLP_ENDPOINT=http://localhost:4317 + export GEMINI_TELEMETRY_OTLP_PROTOCOL=grpc + export GEMINI_TELEMETRY_TRACES_ENABLED=true + unset OTEL_EXPORTER_OTLP_HEADERS # collector handles auth + ``` + +4. Run Gemini CLI: + + ```bash + gemini + ``` + +## Configuration Reference + +### Gemini CLI-specific + +| Variable | Default | Purpose | +|---|---|---| +| `GEMINI_TELEMETRY_ENABLED` | `false` | Master toggle | +| `GEMINI_TELEMETRY_TARGET` | `local` | `local` (OTLP) or `gcp` (Google Cloud) | +| `GEMINI_TELEMETRY_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP endpoint (base URL — `/v1/traces` etc. appended) | +| `GEMINI_TELEMETRY_OTLP_PROTOCOL` | `grpc` | `grpc` or `http` | +| `GEMINI_TELEMETRY_TRACES_ENABLED` | `false` | Set `true` to export spans (recommended) | +| `GEMINI_TELEMETRY_LOG_PROMPTS` | `true` | Include prompt text in logs | +| `GEMINI_TELEMETRY_OUTFILE` | — | Write to local file instead of OTLP | + +### Standard OTel env (used for auth headers) + +| Variable | Purpose | +|---|---| +| `OTEL_EXPORTER_OTLP_HEADERS` | `Authorization=Basic ` | +| `OTEL_EXPORTER_OTLP_{TRACES,METRICS,LOGS}_HEADERS` | Per-signal override | + +### `.env` variables (Collector path) + +| Variable | Purpose | +|---|---| +| `LAST9_OTLP_ENDPOINT` | Last9 OTLP endpoint (e.g. `https://otlp.last9.io`) | +| `LAST9_OTLP_AUTH` | Last9 Basic auth header (`Basic `) | + +## Verification + +After running a Gemini CLI session for a minute: + +1. **Traces** — filter by `service.name = gemini-cli` in Last9 Traces Explorer. Span name: `llm_call`. +2. **Metrics** — search for `gemini_cli_session_count_total`, `gemini_cli_api_request_count_total`, `gemini_cli_token_usage_total` +3. **Logs** — filter by `service.name = gemini-cli` in Last9 Logs Explorer + +
+Notable metrics + +| Metric | Type | +|---|---| +| `gemini_cli.session.count` | counter | +| `gemini_cli.tool.call.count` | counter | +| `gemini_cli.tool.call.latency` | histogram | +| `gemini_cli.api.request.count` | counter | +| `gemini_cli.api.request.latency` | histogram | +| `gemini_cli.token.usage` | counter | +| `gemini_cli.file.operation.count` | counter | +| `gemini_cli.lines.changed` | counter | +| `gemini_cli.agent.run.count` | counter | +| `gemini_cli.agent.duration` | histogram | +| `gemini_cli.startup.duration` | histogram | +| `gemini_cli.memory.usage` | gauge | +| `gemini_cli.cpu.usage` | gauge | +| `gen_ai.client.token.usage` | counter (GenAI semconv) | +| `gen_ai.client.operation.duration` | histogram (GenAI semconv) | + +
+ +## Troubleshooting + +- **No data in Last9** — confirm `echo $GEMINI_TELEMETRY_ENABLED` returns `true` in the shell that ran `gemini`. Restart shell after editing `~/.zshrc`. +- **Authentication errors** — verify `OTEL_EXPORTER_OTLP_HEADERS` is `Authorization=Basic ` (key=value format, not HTTP colon syntax). Trailing whitespace breaks it. +- **Traces missing** — make sure `GEMINI_TELEMETRY_TRACES_ENABLED=true` is set. Gemini CLI defaults to `false`, so spans only flow when you explicitly enable them. +- **gRPC connection refused** — make sure `-p 4317:4317` is exposed (collector path) and you're using `grpc` protocol with `http://localhost:4317` (not `https`). diff --git a/gemini-cli/docker-compose.yaml b/gemini-cli/docker-compose.yaml new file mode 100644 index 0000000..0f6b064 --- /dev/null +++ b/gemini-cli/docker-compose.yaml @@ -0,0 +1,19 @@ +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:0.144.0 + container_name: gemini-cli-otel-collector + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel/config.yaml:ro + env_file: .env + environment: + - DEPLOYMENT_ENV=${DEPLOYMENT_ENV:-development} + ports: + - "4317:4317" # gRPC receiver + - "4318:4318" # HTTP receiver + - "13133:13133" # Health check + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133"] + interval: 10s + timeout: 5s + retries: 3 diff --git a/gemini-cli/otel-collector-config.yaml b/gemini-cli/otel-collector-config.yaml new file mode 100644 index 0000000..344e559 --- /dev/null +++ b/gemini-cli/otel-collector-config.yaml @@ -0,0 +1,44 @@ +receivers: + # Receive logs, traces, and metrics from Gemini CLI via gRPC and HTTP + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 1000 + + # Add organization-level resource attributes + resource: + attributes: + - key: deployment.environment + value: "${env:DEPLOYMENT_ENV}" + action: upsert + +exporters: + otlp/last9: + endpoint: "${env:LAST9_OTLP_ENDPOINT}" + headers: + "Authorization": "${env:LAST9_OTLP_AUTH}" + + debug: + verbosity: basic + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] + metrics: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] + logs: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] diff --git a/go/custom-metrics/.gitignore b/go/custom-metrics/.gitignore new file mode 100644 index 0000000..c3ad10f --- /dev/null +++ b/go/custom-metrics/.gitignore @@ -0,0 +1,2 @@ +custom-metrics-example +example diff --git a/go/custom-metrics/go.mod b/go/custom-metrics/go.mod new file mode 100644 index 0000000..2217b96 --- /dev/null +++ b/go/custom-metrics/go.mod @@ -0,0 +1,31 @@ +module custom-metrics-example + +go 1.22.7 + +toolchain go1.22.12 + +require ( + go.opentelemetry.io/otel v1.33.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0 + go.opentelemetry.io/otel/metric v1.33.0 + go.opentelemetry.io/otel/sdk v1.33.0 + go.opentelemetry.io/otel/sdk/metric v1.33.0 +) + +require ( + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/otel/trace v1.33.0 // indirect + go.opentelemetry.io/proto/otlp v1.4.0 // indirect + golang.org/x/net v0.32.0 // indirect + golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 // indirect + google.golang.org/grpc v1.68.1 // indirect + google.golang.org/protobuf v1.35.2 // indirect +) diff --git a/go/custom-metrics/go.sum b/go/custom-metrics/go.sum new file mode 100644 index 0000000..b3e9c6c --- /dev/null +++ b/go/custom-metrics/go.sum @@ -0,0 +1,53 @@ +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.33.0 h1:/FerN9bax5LoK51X/sI0SVYrjSE0/yUL7DpxW4K3FWw= +go.opentelemetry.io/otel v1.33.0/go.mod h1:SUUkR6csvUQl+yjReHu5uM3EtVV7MBm5FHKRlNx4I8I= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0 h1:bSjzTvsXZbLSWU8hnZXcKmEVaJjjnandxD0PxThhVU8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0/go.mod h1:aj2rilHL8WjXY1I5V+ra+z8FELtk681deydgYT8ikxU= +go.opentelemetry.io/otel/metric v1.33.0 h1:r+JOocAyeRVXD8lZpjdQjzMadVZp2M4WmQ+5WtEnklQ= +go.opentelemetry.io/otel/metric v1.33.0/go.mod h1:L9+Fyctbp6HFTddIxClbQkjtubW6O9QS3Ann/M82u6M= +go.opentelemetry.io/otel/sdk v1.33.0 h1:iax7M131HuAm9QkZotNHEfstof92xM+N8sr3uHXc2IM= +go.opentelemetry.io/otel/sdk v1.33.0/go.mod h1:A1Q5oi7/9XaMlIWzPSxLRWOI8nG3FnzHJNbiENQuihM= +go.opentelemetry.io/otel/sdk/metric v1.33.0 h1:Gs5VK9/WUJhNXZgn8MR6ITatvAmKeIuCtNbsP3JkNqU= +go.opentelemetry.io/otel/sdk/metric v1.33.0/go.mod h1:dL5ykHZmm1B1nVRk9dDjChwDmt81MjVp3gLkQRwKf/Q= +go.opentelemetry.io/otel/trace v1.33.0 h1:cCJuF7LRjUFso9LPnEAHJDB2pqzp+hbO8eu1qqW2d/s= +go.opentelemetry.io/otel/trace v1.33.0/go.mod h1:uIcdVUZMpTAmz0tI1z04GoVSezK37CbGV4fr1f2nBck= +go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= +go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= +golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576 h1:CkkIfIt50+lT6NHAVoRYEyAvQGFM7xEwXUUywFvEb3Q= +google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576/go.mod h1:1R3kvZ1dtP3+4p4d3G8uJ8rFk/fWlScl38vanWACI08= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 h1:8ZmaLZE4XWrtU3MyClkYqqtl6Oegr3235h7jxsDyqCY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU= +google.golang.org/grpc v1.68.1 h1:oI5oTa11+ng8r8XMMN7jAOmWfPZWbYpCFaMUTACxkM0= +google.golang.org/grpc v1.68.1/go.mod h1:+q1XYFJjShcqn0QZHvCyeR4CXPA+llXIeUIfIe00waw= +google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io= +google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/go/custom-metrics/main.go b/go/custom-metrics/main.go new file mode 100644 index 0000000..9a735be --- /dev/null +++ b/go/custom-metrics/main.go @@ -0,0 +1,144 @@ +package main + +import ( + "context" + "log" + "math/rand" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + otelmetric "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" +) + +func initMeterProvider(ctx context.Context) (*metric.MeterProvider, error) { + exporter, err := otlpmetrichttp.New(ctx) + if err != nil { + return nil, err + } + + res, err := resource.New(ctx, + resource.WithFromEnv(), + resource.WithAttributes( + semconv.ServiceNameKey.String("custom-metrics-example"), + semconv.DeploymentEnvironmentKey.String("development"), + ), + ) + if err != nil { + return nil, err + } + + mp := metric.NewMeterProvider( + metric.WithResource(res), + metric.WithReader(metric.NewPeriodicReader(exporter, + metric.WithInterval(15*time.Second), + )), + ) + otel.SetMeterProvider(mp) + return mp, nil +} + +func main() { + ctx := context.Background() + + mp, err := initMeterProvider(ctx) + if err != nil { + log.Fatalf("init meter provider: %v", err) + } + defer func() { + if err := mp.ForceFlush(ctx); err != nil { + log.Printf("flush error: %v", err) + } + if err := mp.Shutdown(ctx); err != nil { + log.Printf("shutdown error: %v", err) + } + }() + + meter := otel.Meter("custom-metrics-example") + + // Counter: tracks how many times an event occurred + resolutionCounter, err := meter.Int64Counter( + "subscription.upgrade.notification.resolution", + otelmetric.WithDescription("Count of subscription upgrade notification resolutions"), + ) + if err != nil { + log.Fatalf("create counter: %v", err) + } + + // Histogram: tracks distribution of a value (e.g. latency) + requestDuration, err := meter.Float64Histogram( + "request.duration", + otelmetric.WithDescription("Request duration in seconds"), + otelmetric.WithUnit("s"), + ) + if err != nil { + log.Fatalf("create histogram: %v", err) + } + + // Gauge: tracks a current value that can go up or down + queueDepth, err := meter.Int64Gauge( + "queue.depth", + otelmetric.WithDescription("Current number of items in the queue"), + ) + if err != nil { + log.Fatalf("create gauge: %v", err) + } + + statuses := []string{"success", "failure", "timeout"} + reasons := []string{"completed", "rejected", "expired"} + products := []string{"premium", "basic", "trial"} + + log.Println("emitting metrics every 5s, press Ctrl+C to stop") + + for i := 0; i < 10; i++ { + status := statuses[rand.Intn(len(statuses))] + reason := reasons[rand.Intn(len(reasons))] + product := products[rand.Intn(len(products))] + + // IMPORTANT: never pass empty string as attribute value. + // Last9 follows the Prometheus data model: labels with empty values + // silently — the metric is recorded but that label dimension is absent. + // Use a sentinel like "unknown" if the value may be empty. + if status == "" { + status = "unknown" + } + if reason == "" { + reason = "unknown" + } + if product == "" { + product = "unknown" + } + + resolutionCounter.Add(ctx, 1, otelmetric.WithAttributes( + attribute.String("status", status), + attribute.String("reason", reason), + attribute.String("product_type", product), + )) + + duration := 0.1 + rand.Float64()*0.9 + requestDuration.Record(ctx, duration, otelmetric.WithAttributes( + attribute.String("method", "POST"), + attribute.String("status_code", "200"), + attribute.String("product_type", product), + )) + + depth := int64(rand.Intn(100)) + queueDepth.Record(ctx, depth, otelmetric.WithAttributes( + attribute.String("queue_name", "notifications"), + )) + + log.Printf("recorded: status=%s reason=%s product_type=%s duration=%.3fs queue_depth=%d", + status, reason, product, duration, depth) + + time.Sleep(5 * time.Second) + } + + log.Println("done — metrics will appear in Last9 as:") + log.Println(" subscription_upgrade_notification_resolution_total") + log.Println(" request_duration_seconds_bucket / _count / _sum") + log.Println(" queue_depth") +} diff --git a/java-vertx3-rxjava/auto-instrumentation/Dockerfile b/java-vertx3-rxjava/auto-instrumentation/Dockerfile index 743a90d..42ac01e 100644 --- a/java-vertx3-rxjava/auto-instrumentation/Dockerfile +++ b/java-vertx3-rxjava/auto-instrumentation/Dockerfile @@ -4,7 +4,9 @@ FROM maven:3.9-eclipse-temurin-11 AS build WORKDIR /app # Install Last9 auto-configure JARs into local Maven repo. -COPY lib/ /tmp/lib/ +# agent JAR excluded — only needed at runtime, not compile time +COPY lib/vertx-otel-core.jar /tmp/lib/vertx-otel-core.jar +COPY lib/vertx3-rxjava2-otel-autoconfigure.jar /tmp/lib/vertx3-rxjava2-otel-autoconfigure.jar RUN mvn install:install-file \ -Dfile=/tmp/lib/vertx-otel-core.jar \ -DgroupId=io.last9 \ diff --git a/java-vertx3-rxjava/auto-instrumentation/INTEGRATION.md b/java-vertx3-rxjava/auto-instrumentation/INTEGRATION.md index 3c4c6ec..64de1e1 100644 --- a/java-vertx3-rxjava/auto-instrumentation/INTEGRATION.md +++ b/java-vertx3-rxjava/auto-instrumentation/INTEGRATION.md @@ -124,3 +124,4 @@ java -jar target/your-app.jar run com.yourpackage.MainVerticle - HTTP client calls (downstream services) - RxJava2 context propagation - Log correlation (trace_id, span_id in logs) +- **Exceptions** on SERVER spans when handlers use `ctx.fail(throwable)` or unhandled errors reach the failure handler (see `README.md` → *Verify automatic exception capture*) diff --git a/java-vertx3-rxjava/auto-instrumentation/README.md b/java-vertx3-rxjava/auto-instrumentation/README.md index 469c6ca..627266f 100644 --- a/java-vertx3-rxjava/auto-instrumentation/README.md +++ b/java-vertx3-rxjava/auto-instrumentation/README.md @@ -63,10 +63,21 @@ java -javaagent:vertx3-otel-agent.jar \ That's it. Every Router endpoint, JDBC query, Kafka message, Aerospike operation, and outbound HTTP call is automatically traced. -### 4. Start infrastructure and test +### 4. Local E2E (collector + app on host) + +Minimal flow with a **debug-only** collector — no Last9 credentials, no full Docker stack: ```bash -# Start Postgres, Kafka, Aerospike, MySQL +cd java-vertx3-rxjava/auto-instrumentation +./scripts/local-e2e.sh +``` + +Uses collector on host ports **24317/24318** and Postgres on **5433**. Override `VERTX_OTEL_REPO` if your vertx-opentelemetry checkout is elsewhere. + +### 5. Full Docker stack (optional) + +```bash +# Start Postgres, Kafka, Aerospike, MySQL + collector + services docker compose up -d # Test endpoints @@ -78,6 +89,29 @@ curl -X POST http://localhost:8080/v1/holding \ curl http://localhost:8080/v1/portfolio-full/user1 ``` +### Verify automatic exception capture + +These endpoints demonstrate when stack traces appear on SERVER spans (no manual `recordException` in app code for the first two): + +```bash +# Automatic: ctx.fail(throwable) → exception event + ERROR status on SERVER span +curl -s http://localhost:8080/v1/error/fail + +# Automatic: unhandled handler error routed through failure handler +curl -s "http://localhost:8080/v1/error/http?type=runtime" + +# No stack trace (by design): HTTP 500 without a Throwable on the routing context +curl -s http://localhost:8080/v1/error/direct-500 +``` + +In the OTel collector debug output, look for an `exception` event on the SERVER span with `exception.type`, `exception.message`, and `exception.stacktrace`: + +```bash +docker logs vertx3-otel-collector-local 2>&1 | grep -E "(Name |exception\.|Status )" +``` + +Use `ctx.fail(status, error)` (or the global failure handler) in production handlers so exceptions are captured automatically — returning `setStatusCode(500).end()` alone marks the span as ERROR but does not attach a stack trace. + ## What Gets Auto-Traced | Component | Span Kind | Agent Advice | Attributes | @@ -140,8 +174,10 @@ Application-specific variables: | DELETE | `/v1/cache/:key` | Cache delete | Router + Aerospike | | GET | `/v1/mysql/ping` | MySQL health | Router + MySQL | | GET | `/v1/portfolio-full/:userId` | Multi-system query | Router + JDBC + Aerospike + WebClient + Kafka | -| GET | `/v1/error/http` | Error scenario | Router (exception recording) | -| GET | `/v1/error/try-catch` | Manual span error | Router + `Span.recordException()` | +| GET | `/v1/error/fail` | `ctx.fail()` — automatic exception event | Router | +| GET | `/v1/error/direct-500` | 500 without throwable (no stack) | Router | +| GET | `/v1/error/http` | Simulated error (`?type=runtime\|npe\|illegal`) | Router | +| GET | `/v1/error/try-catch` | Manual `Span.recordException()` in handler | Router | ## Key Difference: Zero-Code vs Manual diff --git a/java-vertx3-rxjava/auto-instrumentation/docker-compose.yaml b/java-vertx3-rxjava/auto-instrumentation/docker-compose.yaml index 122f879..1939b2a 100644 --- a/java-vertx3-rxjava/auto-instrumentation/docker-compose.yaml +++ b/java-vertx3-rxjava/auto-instrumentation/docker-compose.yaml @@ -116,6 +116,9 @@ services: AEROSPIKE_HOST: aerospike AEROSPIKE_PORT: "3000" AEROSPIKE_NAMESPACE: test + VERTX_OTEL_BODY_CAPTURE_ENABLED: "true" + VERTX_OTEL_BODY_CAPTURE_REQUEST: "true" + VERTX_OTEL_BODY_CAPTURE_RESPONSE: "true" ports: - "8080:8080" depends_on: diff --git a/java-vertx3-rxjava/auto-instrumentation/src/main/java/com/example/holding/MainVerticle.java b/java-vertx3-rxjava/auto-instrumentation/src/main/java/com/example/holding/MainVerticle.java index a435a86..c037cc6 100644 --- a/java-vertx3-rxjava/auto-instrumentation/src/main/java/com/example/holding/MainVerticle.java +++ b/java-vertx3-rxjava/auto-instrumentation/src/main/java/com/example/holding/MainVerticle.java @@ -197,7 +197,11 @@ public Completable rxStart() { // Complex multi-system endpoint — DB + Aerospike + Kafka + outbound HTTP router.get("/v1/portfolio-full/:userId").handler(this::handleFullPortfolio); - // Exception scenario endpoints + // Exception scenario endpoints — verify exception events on SERVER spans in the collector + router.get("/v1/error/fail").handler(ctx -> + ctx.fail(new RuntimeException("Simulated failure via ctx.fail()"))); + router.get("/v1/error/direct-500").handler(ctx -> + ctx.response().setStatusCode(500).end("error without throwable")); router.get("/v1/error/http").handler(this::handleErrorHttp); router.get("/v1/error/try-catch").handler(this::handleErrorTryCatch); diff --git a/java-vertx3-rxjava/resteasy-agent-test/.env.example b/java-vertx3-rxjava/resteasy-agent-test/.env.example new file mode 100644 index 0000000..e0b29bd --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/.env.example @@ -0,0 +1,14 @@ +# OTel exporter — point to your Last9 OTLP endpoint +OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp.last9.io +OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic +OTEL_SERVICE_NAME=resteasy-agent-test +OTEL_TRACES_EXPORTER=otlp + +# Body capture (FDE-196 Bug 1) +# Option A: capture all requests +VERTX_OTEL_BODY_CAPTURE_ENABLED=true + +# Option B: capture only on errors (4xx/5xx) — no ENABLED needed +# VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY=true + +PORT=8080 diff --git a/java-vertx3-rxjava/resteasy-agent-test/Dockerfile b/java-vertx3-rxjava/resteasy-agent-test/Dockerfile new file mode 100644 index 0000000..074d5ff --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/Dockerfile @@ -0,0 +1,5 @@ +FROM eclipse-temurin:11-jre +WORKDIR /app +COPY target/resteasy-agent-test-1.0.0.jar app.jar +COPY target/vertx3-otel-agent.jar vertx3-otel-agent.jar +ENTRYPOINT ["java", "-javaagent:vertx3-otel-agent.jar", "-jar", "app.jar"] diff --git a/java-vertx3-rxjava/resteasy-agent-test/README.md b/java-vertx3-rxjava/resteasy-agent-test/README.md new file mode 100644 index 0000000..d45b75c --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/README.md @@ -0,0 +1,86 @@ +# RESTEasy Agent Test App + +Tests three FDE-196 fixes in `vertx3-otel-agent`: + +| Bug | Endpoint | What to verify on span | +|-----|----------|------------------------| +| Bug 1 — body capture | `POST /api/v1/contests/{id}/submit` | `http.request.body = {"teamId":"t1","wsId":123}` | +| Bug 2 — async exception | `POST /api/v1/contests/{id}/fail` | exception event + `status = ERROR` | +| Bug 3 — url.query | `GET /api/v1/contests/{id}?wsId=123` | `url.query = wsId=123` | + +## Prerequisites + +The `pom.xml` currently references `2.3.5-beta.9` (released, no FDE-196 fixes). + +To test FDE-196 fixes, build the agent from the `errors` branch and update `agent.version`: + +```bash +cd ~/Projects/vertx-rxjava3-otel-autoconfigure +git checkout errors +mvn install -DskipTests +# Then in pom.xml: 2.3.5-SNAPSHOT +``` + +Without that change the app still runs and traces, but: +- Bug 2 (`/fail`): no exception event on the span +- Bug 1 (`/submit`): no `http.request.body` attribute +- Bug 3 (`/contests/42?wsId=...`): no `url.query` attribute + +## Run locally (no Docker) + +```bash +cp .env.example .env +# edit .env with your OTLP endpoint + +mvn package -DskipTests +source .env +java -javaagent:target/vertx3-otel-agent.jar -jar target/resteasy-agent-test-1.0.0.jar +``` + +## Run with Docker Compose + +```bash +cp .env.example .env +# edit .env + +mvn package -DskipTests +docker-compose up --build +``` + +## Test calls + +```bash +# Bug 3: url.query +curl "http://localhost:8080/api/v1/contests/42?wsId=123&tournamentId=456" +# Span should have: url.query=wsId=123&tournamentId=456 + +# Bug 1: body capture (needs VERTX_OTEL_BODY_CAPTURE_ENABLED=true) +curl -X POST http://localhost:8080/api/v1/contests/42/submit \ + -H "Content-Type: application/json" \ + -d '{"teamId":"t1","wsId":123}' +# Span should have: http.request.body={"teamId":"t1","wsId":123} + +# Bug 2: async exception — RESTEasy calls writeException, not invoke throw +curl -X POST http://localhost:8080/api/v1/contests/42/fail \ + -H "Content-Type: application/json" \ + -d '{"teamId":"t1"}' +# Span should have: status=ERROR, exception event "simulated team submission failure for contest 42" +# Also: http.request.body={"teamId":"t1"} (with body capture + errorOnly=true, status is unknown at startSpan so endSpan uses thrown!=null check) + +# Baseline: sync exception (always worked, for comparison) +curl -X POST http://localhost:8080/api/v1/contests/42/fail-sync \ + -H "Content-Type: application/json" \ + -d '{"teamId":"t1"}' + +# Error-only body capture (VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY=true alone) +# Set VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY=true, remove VERTX_OTEL_BODY_CAPTURE_ENABLED +# Then hit /fail — body should appear; hit /submit (200) — body should NOT appear +``` + +## Env vars reference + +| Var | Effect | +|-----|--------| +| `VERTX_OTEL_BODY_CAPTURE_ENABLED=true` | Capture body on all requests | +| `VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY=true` | Capture body only on 4xx/5xx (or when exception thrown). No `ENABLED` needed. | +| `VERTX_OTEL_BODY_CAPTURE_MAX_BYTES=8192` | Truncate body at N bytes (default 8192) | diff --git a/java-vertx3-rxjava/resteasy-agent-test/docker-compose.yml b/java-vertx3-rxjava/resteasy-agent-test/docker-compose.yml new file mode 100644 index 0000000..a9a0ae0 --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/docker-compose.yml @@ -0,0 +1,25 @@ +version: "3.8" + +services: + app: + build: . + ports: + - "8080:8080" + environment: + OTEL_EXPORTER_OTLP_ENDPOINT: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://collector:4318} + OTEL_EXPORTER_OTLP_HEADERS: ${OTEL_EXPORTER_OTLP_HEADERS:-} + OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-resteasy-agent-test} + OTEL_TRACES_EXPORTER: otlp + VERTX_OTEL_BODY_CAPTURE_ENABLED: ${VERTX_OTEL_BODY_CAPTURE_ENABLED:-true} + VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY: ${VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY:-false} + PORT: "8080" + depends_on: + - collector + + collector: + image: otel/opentelemetry-collector-contrib:0.137.0 + volumes: + - ./otel-collector-config.yaml:/etc/otel/config.yaml + command: ["--config", "/etc/otel/config.yaml"] + ports: + - "4318:4318" diff --git a/java-vertx3-rxjava/resteasy-agent-test/otel-collector-config.yaml b/java-vertx3-rxjava/resteasy-agent-test/otel-collector-config.yaml new file mode 100644 index 0000000..3914130 --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/otel-collector-config.yaml @@ -0,0 +1,19 @@ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + +exporters: + otlp/last9: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT} + headers: + Authorization: ${OTEL_EXPORTER_OTLP_HEADERS} + debug: + verbosity: detailed + +service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp/last9, debug] diff --git a/java-vertx3-rxjava/resteasy-agent-test/pom.xml b/java-vertx3-rxjava/resteasy-agent-test/pom.xml new file mode 100644 index 0000000..97f515b --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/pom.xml @@ -0,0 +1,144 @@ + + + 4.0.0 + + com.example + resteasy-agent-test + 1.0.0 + jar + + RESTEasy + vertx3-otel-agent Test App + + Tests FDE-196 fixes in vertx3-otel-agent: + - Body capture on JAX-RS endpoints (Bug 1) + - Async exception recording via writeException (Bug 2) + - url.query / url.scheme on spans (Bug 3) + + + + UTF-8 + 11 + 11 + + 3.15.6.Final + 1.2.13 + 1.7.36 + + + + 2.3.5-beta.9 + + com.example.Main + + + + + + org.jboss.resteasy + resteasy-undertow + ${resteasy.version} + + + + + org.jboss.resteasy + resteasy-jaxrs + ${resteasy.version} + + + + + org.jboss.resteasy + resteasy-rxjava2 + ${resteasy.version} + + + + + org.jboss.resteasy + resteasy-jackson2-provider + ${resteasy.version} + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + ch.qos.logback + logback-classic + ${logback.version} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-agent + prepare-package + copy + + + + io.last9 + vertx3-otel-agent + ${agent.version} + ${project.build.directory} + vertx3-otel-agent.jar + + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.1 + + + package + shade + + + + ${main.class} + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + false + + + + + + + diff --git a/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/Main.java b/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/Main.java new file mode 100644 index 0000000..d7b3892 --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/Main.java @@ -0,0 +1,46 @@ +package com.example; + +import com.example.api.ContestResource; +import io.undertow.Undertow; +import org.jboss.resteasy.plugins.server.undertow.UndertowJaxrsServer; +import org.jboss.resteasy.spi.ResteasyDeployment; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.Application; +import java.util.HashSet; +import java.util.Set; + +public class Main { + + private static final Logger log = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) throws Exception { + int port = Integer.parseInt(System.getenv().getOrDefault("PORT", "8080")); + + ResteasyDeployment deployment = new ResteasyDeployment(); + deployment.setApplication(new Application() { + @Override + public Set> getClasses() { + Set> classes = new HashSet<>(); + classes.add(ContestResource.class); + return classes; + } + }); + + Undertow.Builder builder = Undertow.builder() + .addHttpListener(port, "0.0.0.0"); + + UndertowJaxrsServer server = new UndertowJaxrsServer() + .start(builder); + server.deploy(deployment, "/"); + + log.info("RESTEasy server started on port {}", port); + log.info("Endpoints:"); + log.info(" GET http://localhost:{}/api/v1/contests/42?wsId=123 (url.query test)", port); + log.info(" POST http://localhost:{}/api/v1/contests/42/submit (body capture + async CompletionStage)", port); + log.info(" POST http://localhost:{}/api/v1/contests/42/fail (async exception via writeException)", port); + + Thread.currentThread().join(); + } +} diff --git a/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/api/ContestResource.java b/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/api/ContestResource.java new file mode 100644 index 0000000..96c4531 --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/src/main/java/com/example/api/ContestResource.java @@ -0,0 +1,111 @@ +package com.example.api; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.*; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionStage; + +/** + * JAX-RS resource that exercises the three FDE-196 fixes in vertx3-otel-agent. + * + * Endpoints: + * + * GET /api/v1/contests/{id} — url.query test (Bug 3) + * POST /api/v1/contests/{id}/submit — body capture + async CompletionStage (Bug 1 + 2) + * POST /api/v1/contests/{id}/fail — async exception via writeException (Bug 2) + * POST /api/v1/contests/{id}/fail-sync — synchronous exception (baseline, always worked) + */ +@Path("/api/v1/contests") +@Produces(MediaType.APPLICATION_JSON) +public class ContestResource { + + private static final Logger log = LoggerFactory.getLogger(ContestResource.class); + + // ---- Bug 3: url.query ---- + + /** + * GET /api/v1/contests/{id}?wsId=123&tournamentId=456 + * + * Verify on span: url.query = "wsId=123&tournamentId=456" + */ + @GET + @Path("/{id}") + public Response getContest(@PathParam("id") String id, + @QueryParam("wsId") String wsId, + @QueryParam("tournamentId") String tournamentId) { + log.info("getContest id={} wsId={} tournamentId={}", id, wsId, tournamentId); + return Response.ok( + "{\"contestId\":\"" + id + "\",\"wsId\":\"" + wsId + "\"}" + ).build(); + } + + // ---- Bug 1 + 2: body capture + async CompletionStage success ---- + + /** + * POST /api/v1/contests/{id}/submit + * Content-Type: application/json + * Body: {"teamId":"t1","wsId":123} + * + * Verify on span with VERTX_OTEL_BODY_CAPTURE_ENABLED=true: + * http.request.body = {"teamId":"t1","wsId":123} + * + * With VERTX_OTEL_BODY_CAPTURE_ERROR_ONLY=true: body appears only on 4xx/5xx. + */ + @POST + @Path("/{id}/submit") + @Consumes(MediaType.APPLICATION_JSON) + public CompletionStage submitTeam(@PathParam("id") String id, String body) { + log.info("submitTeam id={} body={}", id, body); + return CompletableFuture.supplyAsync(() -> + Response.ok("{\"status\":\"ok\",\"contestId\":\"" + id + "\"}").build() + ); + } + + // ---- Bug 2: async exception → writeException → exception event on span ---- + + /** + * POST /api/v1/contests/{id}/fail + * Content-Type: application/json + * Body: {"teamId":"t1"} + * + * Simulates a failed CompletionStage (equivalent to RxJava onErrorResumeNext throwing). + * RESTEasy calls SynchronousDispatcher.writeException() instead of propagating the + * exception through invoke() — so @Advice.Thrown is null without the FDE-196 fix. + * + * Verify on span: + * status = ERROR + * exception event with message "simulated team submission failure" + * http.request.body = {"teamId":"t1"} (with VERTX_OTEL_BODY_CAPTURE_ENABLED=true) + */ + @POST + @Path("/{id}/fail") + @Consumes(MediaType.APPLICATION_JSON) + public CompletionStage failAsync(@PathParam("id") String id, String body) { + log.info("failAsync id={} body={}", id, body); + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally( + new RuntimeException("simulated team submission failure for contest " + id) + ); + return future; + } + + // ---- Baseline: synchronous 5xx (always worked) ---- + + /** + * POST /api/v1/contests/{id}/fail-sync + * + * Synchronous exception — @Advice.Thrown picks this up even without FDE-196. + * Use as a baseline to confirm the agent is working at all. + */ + @POST + @Path("/{id}/fail-sync") + @Consumes(MediaType.APPLICATION_JSON) + public Response failSync(@PathParam("id") String id, String body) { + throw new WebApplicationException("sync error for contest " + id, + Response.Status.INTERNAL_SERVER_ERROR); + } +} diff --git a/java-vertx3-rxjava/resteasy-agent-test/src/main/resources/logback.xml b/java-vertx3-rxjava/resteasy-agent-test/src/main/resources/logback.xml new file mode 100644 index 0000000..5c3cc71 --- /dev/null +++ b/java-vertx3-rxjava/resteasy-agent-test/src/main/resources/logback.xml @@ -0,0 +1,13 @@ + + + + %d{HH:mm:ss} %-5level %logger{36} - %msg%n + + + + + + + + + diff --git a/otel-collector/delta-cumulative-routing/.gitignore b/otel-collector/delta-cumulative-routing/.gitignore new file mode 100644 index 0000000..77e5a0e --- /dev/null +++ b/otel-collector/delta-cumulative-routing/.gitignore @@ -0,0 +1,2 @@ +.env +delta_cumulative_demo diff --git a/otel-collector/delta-cumulative-routing/docker-compose.yaml b/otel-collector/delta-cumulative-routing/docker-compose.yaml new file mode 100644 index 0000000..218206c --- /dev/null +++ b/otel-collector/delta-cumulative-routing/docker-compose.yaml @@ -0,0 +1,73 @@ +services: + + # Simulates K8s Service load-balancing across N collector pods + nginx: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + ports: + - "4318:4318" + depends_on: + - collector-1 + - collector-2 + networks: + - demo + + # Collector pod 1 + collector-1: + image: otel/opentelemetry-collector-contrib:0.137.0 + hostname: collector-1 + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-config.yaml:/etc/otel-collector-config.yaml:ro + environment: + - HOSTNAME=collector-1 + - LAST9_OTLP_ENDPOINT=${LAST9_OTLP_ENDPOINT} + - LAST9_OTLP_AUTH_HEADER=${LAST9_OTLP_AUTH_HEADER} + networks: + - demo + + # Collector pod 2 — identical config, different hostname → different collector_id on delta series + collector-2: + image: otel/opentelemetry-collector-contrib:0.137.0 + hostname: collector-2 + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-config.yaml:/etc/otel-collector-config.yaml:ro + environment: + - HOSTNAME=collector-2 + - LAST9_OTLP_ENDPOINT=${LAST9_OTLP_ENDPOINT} + - LAST9_OTLP_AUTH_HEADER=${LAST9_OTLP_AUTH_HEADER} + networks: + - demo + + # Metrics generator — sends both delta and cumulative counters + generator: + build: ./generator + environment: + - OTEL_EXPORTER_OTLP_ENDPOINT=nginx:4318 + depends_on: + - nginx + restart: on-failure + networks: + - demo + + # Local VictoriaMetrics — receives all metrics via remote write + victoriametrics: + image: victoriametrics/victoria-metrics:v1.115.0 + command: + - --storageDataPath=/storage + - --retentionPeriod=1d + ports: + - "8428:8428" + volumes: + - vm-data:/storage + networks: + - demo + +networks: + demo: + name: delta_cumulative_demo + +volumes: + vm-data: diff --git a/otel-collector/delta-cumulative-routing/generator/Dockerfile b/otel-collector/delta-cumulative-routing/generator/Dockerfile new file mode 100644 index 0000000..f17f9e1 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/Dockerfile @@ -0,0 +1,10 @@ +FROM golang:1.24-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY main.go . +RUN CGO_ENABLED=0 go build -o generator . + +FROM alpine:3.20 +COPY --from=builder /app/generator /generator +ENTRYPOINT ["/generator"] diff --git a/otel-collector/delta-cumulative-routing/generator/go.mod b/otel-collector/delta-cumulative-routing/generator/go.mod new file mode 100644 index 0000000..1794c9b --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/go.mod @@ -0,0 +1,30 @@ +module delta_cumulative_demo + +go 1.24.0 + +require ( + go.opentelemetry.io/otel v1.39.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0 + go.opentelemetry.io/otel/metric v1.39.0 + go.opentelemetry.io/otel/sdk v1.39.0 + go.opentelemetry.io/otel/sdk/metric v1.39.0 +) + +require ( + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/trace v1.39.0 // indirect + go.opentelemetry.io/proto/otlp v1.9.0 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/sys v0.39.0 // indirect + golang.org/x/text v0.31.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/grpc v1.77.0 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) diff --git a/otel-collector/delta-cumulative-routing/generator/go.sum b/otel-collector/delta-cumulative-routing/generator/go.sum new file mode 100644 index 0000000..51df1e6 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/go.sum @@ -0,0 +1,57 @@ +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= +go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0 h1:nKP4Z2ejtHn3yShBb+2KawiXgpn8In5cT7aO2wXuOTE= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0/go.mod h1:NwjeBbNigsO4Aj9WgM0C+cKIrxsZUaRmZUO7A8I7u8o= +go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= +go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= +go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= +go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= +go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= +go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= +go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= +go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/otel-collector/delta-cumulative-routing/generator/main.go b/otel-collector/delta-cumulative-routing/generator/main.go new file mode 100644 index 0000000..aff0b24 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/main.go @@ -0,0 +1,112 @@ +package main + +import ( + "context" + "log" + "math/rand" + "os" + "os/signal" + "syscall" + "time" + + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/metric" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" +) + +func deltaTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality +} + +func cumulativeTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.CumulativeTemporality +} + +func newMeterProvider(ctx context.Context, endpoint string, temporality func(sdkmetric.InstrumentKind) metricdata.Temporality, svcName string) (*sdkmetric.MeterProvider, error) { + exp, err := otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(endpoint), + otlpmetrichttp.WithInsecure(), + otlpmetrichttp.WithTemporalitySelector(temporality), + ) + if err != nil { + return nil, err + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceName(svcName), + semconv.ServiceVersion("1.0.0"), + ), + ) + if err != nil { + return nil, err + } + + return sdkmetric.NewMeterProvider( + sdkmetric.WithReader( + sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(5*time.Second)), + ), + sdkmetric.WithResource(res), + ), nil +} + +func main() { + endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + if endpoint == "" { + endpoint = "localhost:4318" + } + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + // Provider 1: delta temporality — simulates SDK that sends deltas (e.g. Golang default for some instruments) + deltaMp, err := newMeterProvider(ctx, endpoint, deltaTemporality, "demo-delta") + if err != nil { + log.Fatalf("delta provider: %v", err) + } + defer deltaMp.Shutdown(context.Background()) + + // Provider 2: cumulative temporality — simulates SDK that sends cumulative (e.g. Claude Code, Java default) + cumulativeMp, err := newMeterProvider(ctx, endpoint, cumulativeTemporality, "demo-cumulative") + if err != nil { + log.Fatalf("cumulative provider: %v", err) + } + defer cumulativeMp.Shutdown(context.Background()) + + deltaCounter, err := deltaMp.Meter("demo").Int64Counter( + "demo_requests_delta_total", + metric.WithDescription("Request counter with DELTA temporality"), + ) + if err != nil { + log.Fatalf("delta counter: %v", err) + } + + cumulativeCounter, err := cumulativeMp.Meter("demo").Int64Counter( + "demo_requests_cumulative_total", + metric.WithDescription("Request counter with CUMULATIVE temporality"), + ) + if err != nil { + log.Fatalf("cumulative counter: %v", err) + } + + log.Printf("sending to %s — delta + cumulative, every 2s", endpoint) + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Println("shutting down") + return + case <-ticker.C: + inc := int64(rand.Intn(10) + 1) + deltaCounter.Add(ctx, inc) + cumulativeCounter.Add(ctx, inc) + log.Printf("added %d to both counters", inc) + } + } +} diff --git a/otel-collector/delta-cumulative-routing/nginx.conf b/otel-collector/delta-cumulative-routing/nginx.conf new file mode 100644 index 0000000..ebf7889 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/nginx.conf @@ -0,0 +1,24 @@ +events { + worker_connections 1024; +} + +http { + # Round-robin across two collector replicas — simulates K8s Service + upstream otel_collectors { + server collector-1:4318; + server collector-2:4318; + } + + server { + listen 4318; + + location / { + proxy_pass http://otel_collectors; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header Connection ""; + # Force short keepalive so round-robin actually distributes across pods + keepalive_requests 2; + } + } +} diff --git a/otel-collector/delta-cumulative-routing/otel-config.yaml b/otel-collector/delta-cumulative-routing/otel-config.yaml new file mode 100644 index 0000000..2039a65 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/otel-config.yaml @@ -0,0 +1,98 @@ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + +processors: + # Drop cumulative metrics so only delta passes through this pipeline + filter/keep_delta: + error_mode: ignore + metrics: + metric: + - 'aggregation_temporality == AGGREGATION_TEMPORALITY_CUMULATIVE' + + # Drop delta metrics so only cumulative passes through this pipeline + filter/keep_cumulative: + error_mode: ignore + metrics: + metric: + - 'aggregation_temporality == AGGREGATION_TEMPORALITY_DELTA' + + # Adds host.name to resource attributes from the OS hostname. + # Required so transform/setshardkey can read resource.attributes["host.name"]. + resourcedetection: + detectors: [system] + system: + hostname_sources: [os] + override: false + + # Stamps collector_id on delta metrics only — needed because each collector + # replica independently accumulates deltas from 0; without isolation the + # same series gets conflicting values from different pods. + transform/setshardkey: + metric_statements: + - context: datapoint + statements: + - set(attributes["collector_id"], resource.attributes["host.name"]) + + # Converts delta to cumulative in-memory per pod. + # Safe here because setshardkey already isolates each pod's series. + deltatocumulative: + max_stale: 5m + max_streams: 10000 + + # Renames delta metric to expose the broken (no shard key) variant alongside correct one. + # Applied AFTER deltatocumulative so d2c state is keyed on original name. + transform/rename_broken: + metric_statements: + - context: metric + statements: + - set(name, "demo_requests_delta_broken_total") where name == "demo_requests_delta_total" + + batch: + send_batch_size: 1000 + timeout: 5s + +exporters: + prometheusremotewrite: + endpoint: http://victoriametrics:8428/api/v1/write + tls: + insecure: true + + # Last9 OTLP exporter — for e2e verification via Last9 MCP + otlphttp/last9: + endpoint: ${env:LAST9_OTLP_ENDPOINT} + headers: + Authorization: ${env:LAST9_OTLP_AUTH_HEADER} + + # Debug exporter — shows what each pipeline actually processes + debug: + verbosity: basic + +service: + telemetry: + logs: + level: info + + pipelines: + # Delta path: filter → detect host.name → stamp collector_id → d2c → write + # Queries must use sum without(collector_id) to aggregate across pods + metrics/delta: + receivers: [otlp] + processors: [filter/keep_delta, resourcedetection, transform/setshardkey, deltatocumulative, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] + + # Broken delta path: no setshardkey → both pods write same series → last-write-wins in VM + # Use demo_requests_delta_broken_total to observe the incorrect value vs correct sum + metrics/delta_broken: + receivers: [otlp] + processors: [filter/keep_delta, resourcedetection, deltatocumulative, transform/rename_broken, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] + + # Cumulative path: no collector_id, no d2c — SDK owns the counter + # Queries work as-is; no dedup label needed + metrics/cumulative: + receivers: [otlp] + processors: [filter/keep_cumulative, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] diff --git a/otel-collector/delta-cumulative-routing/verify.sh b/otel-collector/delta-cumulative-routing/verify.sh new file mode 100755 index 0000000..8bfaa08 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/verify.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Queries VictoriaMetrics to verify correct routing of delta vs cumulative metrics. +# Run after: docker compose up -d && sleep 30 + +VM="http://localhost:8428" + +query() { + local label=$1 + local expr=$2 + echo "" + echo "=== $label ===" + curl -sg "${VM}/api/v1/query" --data-urlencode "query=${expr}" \ + | python3 -c " +import sys, json +d = json.load(sys.stdin) +results = d.get('data', {}).get('result', []) +if not results: + print(' (no data)') +for r in results: + print(' labels:', r['metric']) + print(' value: ', r['value'][1]) +" +} + +echo "========================================" +echo " Verifying delta vs cumulative routing " +echo "========================================" + +# 1. Delta series: should have collector_id label (one per pod) +query "Delta counter — expect 2 series, one per collector_id" \ + 'demo_requests_delta_total' + +# 2. Cumulative series: should have NO collector_id label (one series total) +query "Cumulative counter — expect 1 series, no collector_id" \ + 'demo_requests_cumulative_total' + +# 3. Correct delta total: sum across collector_ids +query "Delta total (correct) — sum without(collector_id)" \ + 'sum without(collector_id)(demo_requests_delta_total)' + +# 4. Naive delta total WITHOUT dedup — shows overcount risk if collector_id absent +query "Delta rate per pod (each pod's partial view)" \ + 'rate(demo_requests_delta_total[1m])' + +# 5. Combined delta rate — sum across pods +query "Delta rate total (correct) — sum of per-pod rates" \ + 'sum without(collector_id)(rate(demo_requests_delta_total[1m]))' + +# 6. Cumulative rate — works directly, no dedup needed +query "Cumulative rate (works natively) — rate(cumulative[1m])" \ + 'rate(demo_requests_cumulative_total[1m])' + +echo "" +echo "========================================" +echo " Series count sanity check " +echo "========================================" + +count() { + local label=$1 + local expr=$2 + local n + n=$(curl -sg "${VM}/api/v1/query" --data-urlencode "query=count(${expr})" \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data']['result'][0]['value'][1] if d['data']['result'] else 0)") + echo " $label: $n series" +} + +count "demo_requests_delta_total (expect 2, one per pod)" "demo_requests_delta_total" +count "demo_requests_cumulative_total (expect 1)" "demo_requests_cumulative_total"