diff --git a/docker-compose/otel-compose.yaml b/docker-compose/otel-compose.yaml index 95379d5..584823d 100644 --- a/docker-compose/otel-compose.yaml +++ b/docker-compose/otel-compose.yaml @@ -1,6 +1,6 @@ services: otel-collector: - image: otel/opentelemetry-collector-contrib:0.118.0 + image: otel/opentelemetry-collector-contrib:0.144.0 container_name: otel-collector command: [ @@ -14,10 +14,7 @@ services: # Host filesystem mounts so the hostmetrics receiver can read the VM's # CPU / memory / disk / network / process stats rather than the # collector container's own (containerized) view. - - /proc:/hostfs/proc:ro - - /sys:/hostfs/sys:ro - - /etc/os-release:/hostfs/etc/os-release:ro - - /:/hostfs/root:ro,rslave + - /:/hostfs:ro,rslave ports: - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver @@ -40,6 +37,11 @@ services: # this because they read via the Docker socket. - nginx_network - apache_network + # Note: bridge networking is required here for active prometheus scraping + # of named docker networks. For scraping host-networked containers + # (network_mode: host), switch to network_mode: host on this service, + # remove the networks: block, use localhost: as scrape target, + # and change logspout command to syslog+tcp://localhost:2255. logspout: image: "gliderlabs/logspout:v3.2.14" diff --git a/docker-compose/otel-config.yaml b/docker-compose/otel-config.yaml index c8b1244..c84b6c5 100644 --- a/docker-compose/otel-config.yaml +++ b/docker-compose/otel-config.yaml @@ -89,19 +89,40 @@ receivers: metrics: system.filesystem.utilization: enabled: true + exclude_fs_types: + fs_types: + - squashfs + - tmpfs + - devtmpfs + - overlay + - nsfs + - autofs + - cgroup + - cgroup2 + - proc + - sysfs + match_type: strict exclude_mount_points: mount_points: - - /hostfs/sys/* - - /hostfs/proc/* - - /var/lib/docker/* - - /hostfs/root/var/lib/docker/* + - /hostfs/sys/.* + - /hostfs/proc/.* + - /hostfs/dev/.* + - /hostfs/run/.* + - /hostfs/var/lib/docker/.* + - /hostfs/snap/.* + - /hostfs/boot.* + - /var/lib/docker/.* match_type: regexp network: paging: processes: - # mute_process_*_error: true silences the routine "permission denied" - # noise for kernel threads and short-lived processes whose - # /proc//{exe,io,cmdline} entries can't be read. + # mute_process_*_error: true silences "permission denied" noise for + # kernel threads and short-lived processes. Note: these flags do NOT + # cover TOCTOU races — if a process exits between PID discovery and + # stat read, /proc//stat returns ENOENT and is still logged as + # an error. On busy hosts with many short-lived processes (kworkers, + # sshd child processes), remove the process: scraper entirely and rely + # on docker_stats for per-container granularity instead. process: mute_process_user_error: true mute_process_io_error: true diff --git a/gemini-cli/.env.example b/gemini-cli/.env.example new file mode 100644 index 0000000..4e45a33 --- /dev/null +++ b/gemini-cli/.env.example @@ -0,0 +1,5 @@ +# Last9 OTLP credentials — get these from: +# https://app.last9.io/integrations?integration=OpenTelemetry + +LAST9_OTLP_ENDPOINT=https://otlp.last9.io +LAST9_OTLP_AUTH=Basic diff --git a/gemini-cli/.gitignore b/gemini-cli/.gitignore new file mode 100644 index 0000000..f0a6075 --- /dev/null +++ b/gemini-cli/.gitignore @@ -0,0 +1,11 @@ +# Environment/secrets +.env +.env.local +.env.*.local + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/gemini-cli/README.md b/gemini-cli/README.md new file mode 100644 index 0000000..16170b9 --- /dev/null +++ b/gemini-cli/README.md @@ -0,0 +1,137 @@ +# Gemini CLI — OpenTelemetry to Last9 + +Send Google's [Gemini CLI](https://github.com/google-gemini/gemini-cli) telemetry — traces, metrics, and logs — to Last9 via OpenTelemetry. + +Gemini CLI emits all three OpenTelemetry signal types. Keep all three on — combined volume is low and traces add per-call model latency / token attribution that metrics alone cannot. + +- **Traces** — spans for tool calls, API requests, agent runs (set `GEMINI_TELEMETRY_TRACES_ENABLED=true`) +- **Metrics** — session counts, token usage, latency, file ops, agent durations +- **Logs** — structured events for prompts, API requests/responses, slash commands, file operations + +## Prerequisites + +- A Last9 account ([app.last9.io](https://app.last9.io)) +- OTLP credentials from **Integrations → OpenTelemetry** in the Last9 dashboard +- Gemini CLI installed (`npm install -g @google/gemini-cli`) + +## Option 1 — Direct Export to Last9 (no Collector) + +Gemini CLI's OTLP exporters take `url` only, but the underlying OpenTelemetry JS SDK reads standard `OTEL_EXPORTER_OTLP_HEADERS` from env for authentication — so direct export works. + +```bash +# Gemini CLI telemetry switches +export GEMINI_TELEMETRY_ENABLED=true +export GEMINI_TELEMETRY_TARGET=local +export GEMINI_TELEMETRY_OTLP_ENDPOINT="https://" +export GEMINI_TELEMETRY_OTLP_PROTOCOL=http +export GEMINI_TELEMETRY_TRACES_ENABLED=true + +# Standard OTel env — picked up by SDK for auth headers +export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic " +``` + +Reload and run: + +```bash +source ~/.zshrc +gemini -p "explain this repo" +``` + +## Option 2 — Local OpenTelemetry Collector + +Useful for batching, retries, or fan-out to multiple backends. + +1. Copy `.env.example` to `.env` and fill in your Last9 credentials: + + ```bash + cp .env.example .env + ``` + +2. Start the collector: + + ```bash + docker compose up -d + ``` + +3. Point Gemini CLI at the local collector: + + ```bash + export GEMINI_TELEMETRY_ENABLED=true + export GEMINI_TELEMETRY_TARGET=local + export GEMINI_TELEMETRY_OTLP_ENDPOINT=http://localhost:4317 + export GEMINI_TELEMETRY_OTLP_PROTOCOL=grpc + export GEMINI_TELEMETRY_TRACES_ENABLED=true + unset OTEL_EXPORTER_OTLP_HEADERS # collector handles auth + ``` + +4. Run Gemini CLI: + + ```bash + gemini + ``` + +## Configuration Reference + +### Gemini CLI-specific + +| Variable | Default | Purpose | +|---|---|---| +| `GEMINI_TELEMETRY_ENABLED` | `false` | Master toggle | +| `GEMINI_TELEMETRY_TARGET` | `local` | `local` (OTLP) or `gcp` (Google Cloud) | +| `GEMINI_TELEMETRY_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP endpoint (base URL — `/v1/traces` etc. appended) | +| `GEMINI_TELEMETRY_OTLP_PROTOCOL` | `grpc` | `grpc` or `http` | +| `GEMINI_TELEMETRY_TRACES_ENABLED` | `false` | Set `true` to export spans (recommended) | +| `GEMINI_TELEMETRY_LOG_PROMPTS` | `true` | Include prompt text in logs | +| `GEMINI_TELEMETRY_OUTFILE` | — | Write to local file instead of OTLP | + +### Standard OTel env (used for auth headers) + +| Variable | Purpose | +|---|---| +| `OTEL_EXPORTER_OTLP_HEADERS` | `Authorization=Basic ` | +| `OTEL_EXPORTER_OTLP_{TRACES,METRICS,LOGS}_HEADERS` | Per-signal override | + +### `.env` variables (Collector path) + +| Variable | Purpose | +|---|---| +| `LAST9_OTLP_ENDPOINT` | Last9 OTLP endpoint (e.g. `https://otlp.last9.io`) | +| `LAST9_OTLP_AUTH` | Last9 Basic auth header (`Basic `) | + +## Verification + +After running a Gemini CLI session for a minute: + +1. **Traces** — filter by `service.name = gemini-cli` in Last9 Traces Explorer. Span name: `llm_call`. +2. **Metrics** — search for `gemini_cli_session_count_total`, `gemini_cli_api_request_count_total`, `gemini_cli_token_usage_total` +3. **Logs** — filter by `service.name = gemini-cli` in Last9 Logs Explorer + +
+Notable metrics + +| Metric | Type | +|---|---| +| `gemini_cli.session.count` | counter | +| `gemini_cli.tool.call.count` | counter | +| `gemini_cli.tool.call.latency` | histogram | +| `gemini_cli.api.request.count` | counter | +| `gemini_cli.api.request.latency` | histogram | +| `gemini_cli.token.usage` | counter | +| `gemini_cli.file.operation.count` | counter | +| `gemini_cli.lines.changed` | counter | +| `gemini_cli.agent.run.count` | counter | +| `gemini_cli.agent.duration` | histogram | +| `gemini_cli.startup.duration` | histogram | +| `gemini_cli.memory.usage` | gauge | +| `gemini_cli.cpu.usage` | gauge | +| `gen_ai.client.token.usage` | counter (GenAI semconv) | +| `gen_ai.client.operation.duration` | histogram (GenAI semconv) | + +
+ +## Troubleshooting + +- **No data in Last9** — confirm `echo $GEMINI_TELEMETRY_ENABLED` returns `true` in the shell that ran `gemini`. Restart shell after editing `~/.zshrc`. +- **Authentication errors** — verify `OTEL_EXPORTER_OTLP_HEADERS` is `Authorization=Basic ` (key=value format, not HTTP colon syntax). Trailing whitespace breaks it. +- **Traces missing** — make sure `GEMINI_TELEMETRY_TRACES_ENABLED=true` is set. Gemini CLI defaults to `false`, so spans only flow when you explicitly enable them. +- **gRPC connection refused** — make sure `-p 4317:4317` is exposed (collector path) and you're using `grpc` protocol with `http://localhost:4317` (not `https`). diff --git a/gemini-cli/docker-compose.yaml b/gemini-cli/docker-compose.yaml new file mode 100644 index 0000000..0f6b064 --- /dev/null +++ b/gemini-cli/docker-compose.yaml @@ -0,0 +1,19 @@ +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:0.144.0 + container_name: gemini-cli-otel-collector + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel/config.yaml:ro + env_file: .env + environment: + - DEPLOYMENT_ENV=${DEPLOYMENT_ENV:-development} + ports: + - "4317:4317" # gRPC receiver + - "4318:4318" # HTTP receiver + - "13133:13133" # Health check + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133"] + interval: 10s + timeout: 5s + retries: 3 diff --git a/gemini-cli/otel-collector-config.yaml b/gemini-cli/otel-collector-config.yaml new file mode 100644 index 0000000..344e559 --- /dev/null +++ b/gemini-cli/otel-collector-config.yaml @@ -0,0 +1,44 @@ +receivers: + # Receive logs, traces, and metrics from Gemini CLI via gRPC and HTTP + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 1000 + + # Add organization-level resource attributes + resource: + attributes: + - key: deployment.environment + value: "${env:DEPLOYMENT_ENV}" + action: upsert + +exporters: + otlp/last9: + endpoint: "${env:LAST9_OTLP_ENDPOINT}" + headers: + "Authorization": "${env:LAST9_OTLP_AUTH}" + + debug: + verbosity: basic + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] + metrics: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] + logs: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/last9, debug] diff --git a/otel-collector/delta-cumulative-routing/.gitignore b/otel-collector/delta-cumulative-routing/.gitignore new file mode 100644 index 0000000..77e5a0e --- /dev/null +++ b/otel-collector/delta-cumulative-routing/.gitignore @@ -0,0 +1,2 @@ +.env +delta_cumulative_demo diff --git a/otel-collector/delta-cumulative-routing/docker-compose.yaml b/otel-collector/delta-cumulative-routing/docker-compose.yaml new file mode 100644 index 0000000..218206c --- /dev/null +++ b/otel-collector/delta-cumulative-routing/docker-compose.yaml @@ -0,0 +1,73 @@ +services: + + # Simulates K8s Service load-balancing across N collector pods + nginx: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + ports: + - "4318:4318" + depends_on: + - collector-1 + - collector-2 + networks: + - demo + + # Collector pod 1 + collector-1: + image: otel/opentelemetry-collector-contrib:0.137.0 + hostname: collector-1 + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-config.yaml:/etc/otel-collector-config.yaml:ro + environment: + - HOSTNAME=collector-1 + - LAST9_OTLP_ENDPOINT=${LAST9_OTLP_ENDPOINT} + - LAST9_OTLP_AUTH_HEADER=${LAST9_OTLP_AUTH_HEADER} + networks: + - demo + + # Collector pod 2 — identical config, different hostname → different collector_id on delta series + collector-2: + image: otel/opentelemetry-collector-contrib:0.137.0 + hostname: collector-2 + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-config.yaml:/etc/otel-collector-config.yaml:ro + environment: + - HOSTNAME=collector-2 + - LAST9_OTLP_ENDPOINT=${LAST9_OTLP_ENDPOINT} + - LAST9_OTLP_AUTH_HEADER=${LAST9_OTLP_AUTH_HEADER} + networks: + - demo + + # Metrics generator — sends both delta and cumulative counters + generator: + build: ./generator + environment: + - OTEL_EXPORTER_OTLP_ENDPOINT=nginx:4318 + depends_on: + - nginx + restart: on-failure + networks: + - demo + + # Local VictoriaMetrics — receives all metrics via remote write + victoriametrics: + image: victoriametrics/victoria-metrics:v1.115.0 + command: + - --storageDataPath=/storage + - --retentionPeriod=1d + ports: + - "8428:8428" + volumes: + - vm-data:/storage + networks: + - demo + +networks: + demo: + name: delta_cumulative_demo + +volumes: + vm-data: diff --git a/otel-collector/delta-cumulative-routing/generator/Dockerfile b/otel-collector/delta-cumulative-routing/generator/Dockerfile new file mode 100644 index 0000000..f17f9e1 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/Dockerfile @@ -0,0 +1,10 @@ +FROM golang:1.24-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY main.go . +RUN CGO_ENABLED=0 go build -o generator . + +FROM alpine:3.20 +COPY --from=builder /app/generator /generator +ENTRYPOINT ["/generator"] diff --git a/otel-collector/delta-cumulative-routing/generator/go.mod b/otel-collector/delta-cumulative-routing/generator/go.mod new file mode 100644 index 0000000..1794c9b --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/go.mod @@ -0,0 +1,30 @@ +module delta_cumulative_demo + +go 1.24.0 + +require ( + go.opentelemetry.io/otel v1.39.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0 + go.opentelemetry.io/otel/metric v1.39.0 + go.opentelemetry.io/otel/sdk v1.39.0 + go.opentelemetry.io/otel/sdk/metric v1.39.0 +) + +require ( + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/trace v1.39.0 // indirect + go.opentelemetry.io/proto/otlp v1.9.0 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/sys v0.39.0 // indirect + golang.org/x/text v0.31.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/grpc v1.77.0 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) diff --git a/otel-collector/delta-cumulative-routing/generator/go.sum b/otel-collector/delta-cumulative-routing/generator/go.sum new file mode 100644 index 0000000..51df1e6 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/go.sum @@ -0,0 +1,57 @@ +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= +go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0 h1:nKP4Z2ejtHn3yShBb+2KawiXgpn8In5cT7aO2wXuOTE= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.39.0/go.mod h1:NwjeBbNigsO4Aj9WgM0C+cKIrxsZUaRmZUO7A8I7u8o= +go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= +go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= +go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= +go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= +go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= +go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= +go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= +go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/otel-collector/delta-cumulative-routing/generator/main.go b/otel-collector/delta-cumulative-routing/generator/main.go new file mode 100644 index 0000000..aff0b24 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/generator/main.go @@ -0,0 +1,112 @@ +package main + +import ( + "context" + "log" + "math/rand" + "os" + "os/signal" + "syscall" + "time" + + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/metric" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" +) + +func deltaTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality +} + +func cumulativeTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.CumulativeTemporality +} + +func newMeterProvider(ctx context.Context, endpoint string, temporality func(sdkmetric.InstrumentKind) metricdata.Temporality, svcName string) (*sdkmetric.MeterProvider, error) { + exp, err := otlpmetrichttp.New(ctx, + otlpmetrichttp.WithEndpoint(endpoint), + otlpmetrichttp.WithInsecure(), + otlpmetrichttp.WithTemporalitySelector(temporality), + ) + if err != nil { + return nil, err + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceName(svcName), + semconv.ServiceVersion("1.0.0"), + ), + ) + if err != nil { + return nil, err + } + + return sdkmetric.NewMeterProvider( + sdkmetric.WithReader( + sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(5*time.Second)), + ), + sdkmetric.WithResource(res), + ), nil +} + +func main() { + endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + if endpoint == "" { + endpoint = "localhost:4318" + } + + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + // Provider 1: delta temporality — simulates SDK that sends deltas (e.g. Golang default for some instruments) + deltaMp, err := newMeterProvider(ctx, endpoint, deltaTemporality, "demo-delta") + if err != nil { + log.Fatalf("delta provider: %v", err) + } + defer deltaMp.Shutdown(context.Background()) + + // Provider 2: cumulative temporality — simulates SDK that sends cumulative (e.g. Claude Code, Java default) + cumulativeMp, err := newMeterProvider(ctx, endpoint, cumulativeTemporality, "demo-cumulative") + if err != nil { + log.Fatalf("cumulative provider: %v", err) + } + defer cumulativeMp.Shutdown(context.Background()) + + deltaCounter, err := deltaMp.Meter("demo").Int64Counter( + "demo_requests_delta_total", + metric.WithDescription("Request counter with DELTA temporality"), + ) + if err != nil { + log.Fatalf("delta counter: %v", err) + } + + cumulativeCounter, err := cumulativeMp.Meter("demo").Int64Counter( + "demo_requests_cumulative_total", + metric.WithDescription("Request counter with CUMULATIVE temporality"), + ) + if err != nil { + log.Fatalf("cumulative counter: %v", err) + } + + log.Printf("sending to %s — delta + cumulative, every 2s", endpoint) + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Println("shutting down") + return + case <-ticker.C: + inc := int64(rand.Intn(10) + 1) + deltaCounter.Add(ctx, inc) + cumulativeCounter.Add(ctx, inc) + log.Printf("added %d to both counters", inc) + } + } +} diff --git a/otel-collector/delta-cumulative-routing/nginx.conf b/otel-collector/delta-cumulative-routing/nginx.conf new file mode 100644 index 0000000..ebf7889 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/nginx.conf @@ -0,0 +1,24 @@ +events { + worker_connections 1024; +} + +http { + # Round-robin across two collector replicas — simulates K8s Service + upstream otel_collectors { + server collector-1:4318; + server collector-2:4318; + } + + server { + listen 4318; + + location / { + proxy_pass http://otel_collectors; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header Connection ""; + # Force short keepalive so round-robin actually distributes across pods + keepalive_requests 2; + } + } +} diff --git a/otel-collector/delta-cumulative-routing/otel-config.yaml b/otel-collector/delta-cumulative-routing/otel-config.yaml new file mode 100644 index 0000000..2039a65 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/otel-config.yaml @@ -0,0 +1,98 @@ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + +processors: + # Drop cumulative metrics so only delta passes through this pipeline + filter/keep_delta: + error_mode: ignore + metrics: + metric: + - 'aggregation_temporality == AGGREGATION_TEMPORALITY_CUMULATIVE' + + # Drop delta metrics so only cumulative passes through this pipeline + filter/keep_cumulative: + error_mode: ignore + metrics: + metric: + - 'aggregation_temporality == AGGREGATION_TEMPORALITY_DELTA' + + # Adds host.name to resource attributes from the OS hostname. + # Required so transform/setshardkey can read resource.attributes["host.name"]. + resourcedetection: + detectors: [system] + system: + hostname_sources: [os] + override: false + + # Stamps collector_id on delta metrics only — needed because each collector + # replica independently accumulates deltas from 0; without isolation the + # same series gets conflicting values from different pods. + transform/setshardkey: + metric_statements: + - context: datapoint + statements: + - set(attributes["collector_id"], resource.attributes["host.name"]) + + # Converts delta to cumulative in-memory per pod. + # Safe here because setshardkey already isolates each pod's series. + deltatocumulative: + max_stale: 5m + max_streams: 10000 + + # Renames delta metric to expose the broken (no shard key) variant alongside correct one. + # Applied AFTER deltatocumulative so d2c state is keyed on original name. + transform/rename_broken: + metric_statements: + - context: metric + statements: + - set(name, "demo_requests_delta_broken_total") where name == "demo_requests_delta_total" + + batch: + send_batch_size: 1000 + timeout: 5s + +exporters: + prometheusremotewrite: + endpoint: http://victoriametrics:8428/api/v1/write + tls: + insecure: true + + # Last9 OTLP exporter — for e2e verification via Last9 MCP + otlphttp/last9: + endpoint: ${env:LAST9_OTLP_ENDPOINT} + headers: + Authorization: ${env:LAST9_OTLP_AUTH_HEADER} + + # Debug exporter — shows what each pipeline actually processes + debug: + verbosity: basic + +service: + telemetry: + logs: + level: info + + pipelines: + # Delta path: filter → detect host.name → stamp collector_id → d2c → write + # Queries must use sum without(collector_id) to aggregate across pods + metrics/delta: + receivers: [otlp] + processors: [filter/keep_delta, resourcedetection, transform/setshardkey, deltatocumulative, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] + + # Broken delta path: no setshardkey → both pods write same series → last-write-wins in VM + # Use demo_requests_delta_broken_total to observe the incorrect value vs correct sum + metrics/delta_broken: + receivers: [otlp] + processors: [filter/keep_delta, resourcedetection, deltatocumulative, transform/rename_broken, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] + + # Cumulative path: no collector_id, no d2c — SDK owns the counter + # Queries work as-is; no dedup label needed + metrics/cumulative: + receivers: [otlp] + processors: [filter/keep_cumulative, batch] + exporters: [prometheusremotewrite, otlphttp/last9, debug] diff --git a/otel-collector/delta-cumulative-routing/verify.sh b/otel-collector/delta-cumulative-routing/verify.sh new file mode 100755 index 0000000..8bfaa08 --- /dev/null +++ b/otel-collector/delta-cumulative-routing/verify.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Queries VictoriaMetrics to verify correct routing of delta vs cumulative metrics. +# Run after: docker compose up -d && sleep 30 + +VM="http://localhost:8428" + +query() { + local label=$1 + local expr=$2 + echo "" + echo "=== $label ===" + curl -sg "${VM}/api/v1/query" --data-urlencode "query=${expr}" \ + | python3 -c " +import sys, json +d = json.load(sys.stdin) +results = d.get('data', {}).get('result', []) +if not results: + print(' (no data)') +for r in results: + print(' labels:', r['metric']) + print(' value: ', r['value'][1]) +" +} + +echo "========================================" +echo " Verifying delta vs cumulative routing " +echo "========================================" + +# 1. Delta series: should have collector_id label (one per pod) +query "Delta counter — expect 2 series, one per collector_id" \ + 'demo_requests_delta_total' + +# 2. Cumulative series: should have NO collector_id label (one series total) +query "Cumulative counter — expect 1 series, no collector_id" \ + 'demo_requests_cumulative_total' + +# 3. Correct delta total: sum across collector_ids +query "Delta total (correct) — sum without(collector_id)" \ + 'sum without(collector_id)(demo_requests_delta_total)' + +# 4. Naive delta total WITHOUT dedup — shows overcount risk if collector_id absent +query "Delta rate per pod (each pod's partial view)" \ + 'rate(demo_requests_delta_total[1m])' + +# 5. Combined delta rate — sum across pods +query "Delta rate total (correct) — sum of per-pod rates" \ + 'sum without(collector_id)(rate(demo_requests_delta_total[1m]))' + +# 6. Cumulative rate — works directly, no dedup needed +query "Cumulative rate (works natively) — rate(cumulative[1m])" \ + 'rate(demo_requests_cumulative_total[1m])' + +echo "" +echo "========================================" +echo " Series count sanity check " +echo "========================================" + +count() { + local label=$1 + local expr=$2 + local n + n=$(curl -sg "${VM}/api/v1/query" --data-urlencode "query=count(${expr})" \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['data']['result'][0]['value'][1] if d['data']['result'] else 0)") + echo " $label: $n series" +} + +count "demo_requests_delta_total (expect 2, one per pod)" "demo_requests_delta_total" +count "demo_requests_cumulative_total (expect 1)" "demo_requests_cumulative_total"