From 398bc87ed2b59b578f80604f18d8fe5e36b0e2c9 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Wed, 1 Apr 2026 12:12:49 +0530 Subject: [PATCH 01/15] DLPX-96312 Add InfluxDB/Telegraf infrastructure for Engine Performance Analytics PR URL: https://www.github.com/delphix/performance-diagnostics/pull/119 --- debian/control | 2 +- debian/postinst | 3 + debian/rules | 3 + influxdb/delphix-influxdb-init | 215 ++++++++++++++++++++++++++++++ influxdb/delphix-influxdb-service | 23 ++++ influxdb/delphix-influxdb.service | 16 +++ influxdb/influxdb-init.conf | 12 ++ influxdb/influxdb.toml | 10 ++ influxdb/perf_influxdb | 47 +++++++ telegraf/delphix-telegraf-service | 13 ++ telegraf/telegraf.base | 9 +- 11 files changed, 346 insertions(+), 7 deletions(-) create mode 100644 influxdb/delphix-influxdb-init create mode 100644 influxdb/delphix-influxdb-service create mode 100644 influxdb/delphix-influxdb.service create mode 100644 influxdb/influxdb-init.conf create mode 100644 influxdb/influxdb.toml create mode 100644 influxdb/perf_influxdb diff --git a/debian/control b/debian/control index 1a86b83..3ed5ee5 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker.io +Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker.io, influxdb2, curl Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. diff --git a/debian/postinst b/debian/postinst index ea9a0ce..af216ef 100644 --- a/debian/postinst +++ b/debian/postinst @@ -24,6 +24,9 @@ if ! groups "$USER" | grep -q "\b$GROUP\b"; then fi fi +# Remove the influxdb2 package default config — we use influxdb.toml exclusively. 
+rm -f /etc/influxdb/config.toml + #DEBHELPER# exit 0 \ No newline at end of file diff --git a/debian/rules b/debian/rules index d6f4f00..00b38aa 100755 --- a/debian/rules +++ b/debian/rules @@ -26,3 +26,6 @@ override_dh_auto_install: dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin dh_install telegraf/delphix-telegraf.service /lib/systemd/system dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf + dh_install influxdb/delphix-influxdb-service influxdb/delphix-influxdb-init influxdb/perf_influxdb /usr/bin + dh_install influxdb/delphix-influxdb.service /lib/systemd/system + dh_install influxdb/influxdb.toml influxdb/influxdb-init.conf /etc/influxdb diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init new file mode 100644 index 0000000..0009c1a --- /dev/null +++ b/influxdb/delphix-influxdb-init @@ -0,0 +1,215 @@ +#!/bin/bash -eu +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# One-time InfluxDB initialization: creates org, bucket, admin token, +# a read-only token for DCT Smart Proxy, and writes the +# [[outputs.influxdb_v2]] stanza to /etc/telegraf/telegraf.outputs.influxdb, +# which is included by delphix-telegraf-service when INFLUXDB_ENABLED flag exists. +# Skips setup if InfluxDB is already initialized. +# + +INFLUXDB_URL="http://127.0.0.1:8086" +INFLUXDB_CONFIG_DIR="/etc/influxdb" +INFLUXDB_META_FILE="$INFLUXDB_CONFIG_DIR/influxdb_meta" +# State file written immediately after /api/v2/setup so the script can resume +# if it is interrupted before the metadata file is fully written. +INFLUXDB_SETUP_STATE_FILE="$INFLUXDB_CONFIG_DIR/influxdb_setup_state" +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb +INFLUXDB_INIT_CONF="$INFLUXDB_CONFIG_DIR/influxdb-init.conf" + +# Load tunable configuration (org, bucket, retention, wait parameters). 
+# shellcheck source=/etc/influxdb/influxdb-init.conf +# shellcheck disable=SC1091 +source "$INFLUXDB_INIT_CONF" + +INFLUXDB_ADMIN_USER="admin" +INFLUXDB_ADMIN_PASSWORD="" + +# +# Log a message to stderr with a timestamp. +# +log() { + echo "[$(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*" >&2 +} + +# +# Extract a field from a JSON string using python3. +# +json_field() { + local json="$1" + local field="$2" + echo "$json" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())$field)" || + { log "ERROR: Failed to parse field '$field' from JSON response."; return 1; } +} + +# +# POST to the InfluxDB HTTP API. Exits with an error if the request fails. +# +influx_post() { + local endpoint="$1" + local data="$2" + local auth_header="${3:-}" + + local curl_args=(-sf -X POST "$INFLUXDB_URL$endpoint" -H 'Content-Type: application/json' -d "$data") + [[ -n "$auth_header" ]] && curl_args+=(-H "Authorization: Token $auth_header") + + local response + response=$(curl "${curl_args[@]}") || + { log "ERROR: HTTP POST to '$endpoint' failed."; return 1; } + echo "$response" +} + +mkdir -p "$INFLUXDB_CONFIG_DIR" + +# Skip if already fully initialized. +if [[ -f "$INFLUXDB_META_FILE" ]]; then + log "InfluxDB already initialized, skipping." + exit 0 +fi + +# +# Wait for InfluxDB to be ready. +# +ready=false +for i in $(seq 1 "$INFLUXDB_WAIT_RETRIES"); do + if curl -sf "$INFLUXDB_URL/health" &>/dev/null; then + ready=true + break + fi + sleep "$INFLUXDB_WAIT_INTERVAL" +done + +if [[ "$ready" != "true" ]]; then + log "ERROR: InfluxDB did not become ready after $((INFLUXDB_WAIT_RETRIES * INFLUXDB_WAIT_INTERVAL))s." + exit 1 +fi + +# +# Initial setup — creates org, bucket, and returns admin token + IDs. +# /api/v2/setup is a one-shot operation; if the script is interrupted after +# this point and re-run, the state file lets us skip setup and reuse the +# already-created admin token. 
+# +ADMIN_TOKEN="" +ORG_ID="" +BUCKET_ID="" + +if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then + while IFS= read -r line; do + key="${line%%=*}" + value="${line#*=}" + case "$key" in + ADMIN_TOKEN) ADMIN_TOKEN="$value" ;; + ORG_ID) ORG_ID="$value" ;; + BUCKET_ID) BUCKET_ID="$value" ;; + INFLUXDB_ADMIN_PASSWORD) INFLUXDB_ADMIN_PASSWORD="$value" ;; + WRITE_TOKEN) WRITE_TOKEN="$value" ;; + READ_TOKEN) READ_TOKEN="$value" ;; + esac + done <"$INFLUXDB_SETUP_STATE_FILE" +else + # Generate password only when actually running setup for the first time. + INFLUXDB_ADMIN_PASSWORD="$(openssl rand -hex 16)" + SETUP_RESPONSE=$(influx_post "/api/v2/setup" "{ + \"username\": \"$INFLUXDB_ADMIN_USER\", + \"password\": \"$INFLUXDB_ADMIN_PASSWORD\", + \"org\": \"$INFLUXDB_ORG\", + \"bucket\": \"$INFLUXDB_BUCKET\", + \"retentionPeriodSeconds\": $INFLUXDB_RETENTION_SECONDS + }") || exit 1 + + ADMIN_TOKEN=$(json_field "$SETUP_RESPONSE" "['auth']['token']") || exit 1 + ORG_ID=$(json_field "$SETUP_RESPONSE" "['org']['id']") || exit 1 + BUCKET_ID=$(json_field "$SETUP_RESPONSE" "['bucket']['id']") || exit 1 + + # Persist admin token + IDs + password immediately so a subsequent re-run + # can resume without repeating the one-shot setup call, and so the password + # stored in influxdb_meta always matches what InfluxDB was initialised with. + old_umask="$(umask)" + umask 077 + tmp_state="$(mktemp "${INFLUXDB_SETUP_STATE_FILE}.XXXXXX")" + printf 'ADMIN_TOKEN=%s\nORG_ID=%s\nBUCKET_ID=%s\nINFLUXDB_ADMIN_PASSWORD=%s\n' \ + "$ADMIN_TOKEN" "$ORG_ID" "$BUCKET_ID" "$INFLUXDB_ADMIN_PASSWORD" >"$tmp_state" + chmod 600 "$tmp_state" + mv "$tmp_state" "$INFLUXDB_SETUP_STATE_FILE" + umask "$old_umask" +fi + +# Token creation is guarded so that on crash-resume (setup state exists but +# meta file not yet written), we reuse already-created tokens rather than +# creating orphaned duplicates in InfluxDB on each retry. 
+WRITE_TOKEN="${WRITE_TOKEN:-}" +READ_TOKEN="${READ_TOKEN:-}" + +# +# Create a write-only token for Telegraf (skipped if already persisted in state). +# +if [[ -z "$WRITE_TOKEN" ]]; then + WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + WRITE_TOKEN=$(json_field "$WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'WRITE_TOKEN=%s\n' "$WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Create a read-only token for DCT Smart Proxy (skipped if already persisted in state). +# +if [[ -z "$READ_TOKEN" ]]; then + READ_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"dct-read-token\", + \"permissions\": [ + {\"action\": \"read\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + READ_TOKEN=$(json_field "$READ_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'READ_TOKEN=%s\n' "$READ_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Write the [[outputs.influxdb_v2]] stanza to a dedicated telegraf output file +# and enable it via the INFLUXDB_ENABLED flag. The flag is read by +# delphix-telegraf-service to conditionally include this output. 
+# +cat >"$INFLUXDB_OUTPUT" <"$tmp_meta" <&2 + kill "$INFLUXDB_PID" 2>/dev/null + exit 1 +fi + +wait "$INFLUXDB_PID" diff --git a/influxdb/delphix-influxdb.service b/influxdb/delphix-influxdb.service new file mode 100644 index 0000000..ec69c0b --- /dev/null +++ b/influxdb/delphix-influxdb.service @@ -0,0 +1,16 @@ +[Unit] +Description=Delphix InfluxDB Time Series Database +Documentation=https://docs.influxdata.com/influxdb/v2/ +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +User=root +ExecStart=/usr/bin/delphix-influxdb-service +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/influxdb/influxdb-init.conf b/influxdb/influxdb-init.conf new file mode 100644 index 0000000..54edf11 --- /dev/null +++ b/influxdb/influxdb-init.conf @@ -0,0 +1,12 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Configuration for delphix-influxdb-init. +# Sourced by /usr/bin/delphix-influxdb-init at runtime. +# + +INFLUXDB_ORG="delphix" +INFLUXDB_BUCKET="default" +INFLUXDB_RETENTION_SECONDS=2592000 # 30 days (720h) +INFLUXDB_WAIT_RETRIES=30 +INFLUXDB_WAIT_INTERVAL=2 diff --git a/influxdb/influxdb.toml b/influxdb/influxdb.toml new file mode 100644 index 0000000..49b7e3f --- /dev/null +++ b/influxdb/influxdb.toml @@ -0,0 +1,10 @@ +# +# Copyright 2026 Delphix. All rights reserved. +# +# InfluxDB 2.x Configuration +# + +bolt-path = "/var/lib/influxdb/influxd.bolt" +engine-path = "/var/lib/influxdb/engine" +http-bind-address = "127.0.0.1:8086" +log-level = "warn" diff --git a/influxdb/perf_influxdb b/influxdb/perf_influxdb new file mode 100644 index 0000000..94583bd --- /dev/null +++ b/influxdb/perf_influxdb @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Script that enables and disables InfluxDB metric output for Telegraf. 
+# + +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb + +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_influxdb() { + date + [[ ! -f $INFLUXDB_OUTPUT ]] && die "$INFLUXDB_OUTPUT not found. Run delphix-influxdb-init first." + echo "Enabling InfluxDB Metric Output" + touch $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +function disable_influxdb() { + date + echo "Disabling InfluxDB Metric Output" + rm -rf $INFLUXDB_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_influxdb ;; +disable) disable_influxdb ;; +*) usage ;; +esac diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service index 72df797..b15cce2 100755 --- a/telegraf/delphix-telegraf-service +++ b/telegraf/delphix-telegraf-service @@ -3,7 +3,9 @@ BASE_CONFIG=/etc/telegraf/telegraf.base DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose DCT_INPUTS=/etc/telegraf/telegraf.inputs.dct PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf @@ -21,6 +23,10 @@ function playbook_is_enabled() { [[ -f $PLAYBOOK_FLAG ]] } +function influxdb_is_enabled() { + [[ -f $INFLUXDB_FLAG ]] +} + rm -f $TELEGRAF_CONFIG if engine_is_object_based; then @@ -43,4 +49,11 @@ else fi fi +if influxdb_is_enabled && [[ -f $INFLUXDB_OUTPUT ]]; then + cat $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG +fi + +# Restrict permissions so the InfluxDB write token is not world-readable. 
+chmod 640 $TELEGRAF_CONFIG + /usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 7abd9a4..fc2c319 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -44,12 +44,9 @@ data_format = "json" namepass = ["agg_*"] -# Enable Live Monitoring, intended for internal Delphix use only: -#[[outputs.influxdb]] -# urls = ["http://dbsvr.company.com:8086"] -# database = "live_metrics" -# skip_database_creation = true -# data_format = "influx" +# InfluxDB output is managed via /etc/telegraf/telegraf.outputs.influxdb (written by +# delphix-influxdb-init) and the /etc/telegraf/INFLUXDB_ENABLED flag. +# Use 'perf_influxdb enable|disable' to toggle and restart Telegraf. ############################################################################### # INPUT PLUGINS # From e5e058753db77d23e368e1b3ad8823bad840e168 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Wed, 8 Apr 2026 21:20:43 +0530 Subject: [PATCH 02/15] nginx --- debian/postinst | 5 +++++ debian/rules | 4 +++- influxdb/influxdb-nginx.conf | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 influxdb/influxdb-nginx.conf diff --git a/debian/postinst b/debian/postinst index af216ef..44224e3 100644 --- a/debian/postinst +++ b/debian/postinst @@ -27,6 +27,11 @@ fi # Remove the influxdb2 package default config — we use influxdb.toml exclusively. rm -f /etc/influxdb/config.toml +# Reload nginx to pick up the InfluxDB proxy location block. +if nginx -t -c /etc/nginx/nginx.conf &>/dev/null && systemctl is-active --quiet nginx; then + nginx -s reload +fi + #DEBHELPER# exit 0 \ No newline at end of file diff --git a/debian/rules b/debian/rules index 00b38aa..c84f85c 100755 --- a/debian/rules +++ b/debian/rules @@ -13,11 +13,12 @@ # need to rename a couple files, so do that here. 
# override_dh_auto_build: - mkdir -p build/cmd/ + mkdir -p build/cmd/ build/influxdb/ cp cmd/estat.py build/cmd/estat cp cmd/stbtrace.py build/cmd/stbtrace cp cmd/nfs_threads.py build/cmd/nfs_threads cp cmd/dsp.py build/cmd/dsp + cp influxdb/influxdb-nginx.conf build/influxdb/influxdb.conf override_dh_auto_install: dh_install build/cmd/* /usr/bin @@ -29,3 +30,4 @@ override_dh_auto_install: dh_install influxdb/delphix-influxdb-service influxdb/delphix-influxdb-init influxdb/perf_influxdb /usr/bin dh_install influxdb/delphix-influxdb.service /lib/systemd/system dh_install influxdb/influxdb.toml influxdb/influxdb-init.conf /etc/influxdb + dh_install build/influxdb/influxdb.conf /opt/delphix/server/etc/nginx/conf.d diff --git a/influxdb/influxdb-nginx.conf b/influxdb/influxdb-nginx.conf new file mode 100644 index 0000000..ba2a74a --- /dev/null +++ b/influxdb/influxdb-nginx.conf @@ -0,0 +1,17 @@ +# +# Copyright (c) 2026 by Delphix. All rights reserved. +# +# Proxy InfluxDB 2.x API through nginx so external clients (DCT, Grafana) +# can reach it over HTTPS using the engine's existing TLS certificate. +# InfluxDB itself binds to 127.0.0.1:8086 (HTTP, localhost only). 
+# +location /influxdb/ { + proxy_pass http://127.0.0.1:8086/; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + proxy_read_timeout 999d; + proxy_buffering off; +} From e7b0552bbf29c252cbd2efbf1e1966ddc70204fb Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Tue, 14 Apr 2026 17:08:50 +0530 Subject: [PATCH 03/15] DLPX-96312 Add nfs and iscsi data in influxDb --- influxdb/delphix-influxdb-init | 2 +- telegraf/delphix-telegraf-service | 5 +++++ telegraf/telegraf.inputs.nfs_iscsi | 36 ++++++++++++++++++++++++++++++ telegraf/telegraf.inputs.playbook | 30 +++---------------------- 4 files changed, 45 insertions(+), 28 deletions(-) create mode 100644 telegraf/telegraf.inputs.nfs_iscsi diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index 0009c1a..9134f81 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -184,7 +184,7 @@ cat >"$INFLUXDB_OUTPUT" <> $TELEGRAF_CONFIG +fi + if influxdb_is_enabled && [[ -f $INFLUXDB_OUTPUT ]]; then cat $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG fi diff --git a/telegraf/telegraf.inputs.nfs_iscsi b/telegraf/telegraf.inputs.nfs_iscsi new file mode 100644 index 0000000..0ee5f93 --- /dev/null +++ b/telegraf/telegraf.inputs.nfs_iscsi @@ -0,0 +1,36 @@ +############################################################################## +# NFS server I/O and iSCSI target I/O — included when InfluxDB is enabled +# but Performance Playbook is not (playbook already contains these inputs). 
+ +[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Convert estat string fields to integers so they are not dropped by Telegraf. +[[processors.converter]] + namepass = ["estat_nfs", "estat_iscsi"] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# End of NFS/iSCSI section +############################################################################## diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index 5ed7e21..62f87ea 100644 --- a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -1,31 +1,7 @@ ############################################################################## -# Performance Playbook (estat, nfs_threads) collection - -# Collect output from "estat nfs -jm 10" -[[inputs.execd]] - command = ["estat", "nfs", "-jm", "10"] - name_override = "estat_nfs" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] - -# Collect output from "estat iscsi -jm 10" -[[inputs.execd]] - command = ["estat", "iscsi", "-jm", "10"] - name_override = "estat_iscsi" - signal = "none" - restart_delay = "30s" - data_format = "json" - tag_keys = [ - "name", - "axis" - ] - json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", 
"microseconds"] +# Performance Playbook (estat, nfs_threads) collection +# Note: estat_nfs and estat_iscsi are in telegraf.inputs.nfs_iscsi so they +# are also available when only InfluxDB is enabled (without the full playbook). # Collect output from "estat zpl -jm 10" [[inputs.execd]] From bfbb2706bc6881d840c0eee7f759f6d9d42c7e0d Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Tue, 14 Apr 2026 21:46:02 +0530 Subject: [PATCH 04/15] DLPX-96312 Sending all data to influxDB now --- influxdb/delphix-influxdb-init | 1 - telegraf/connstat-stats.sh | 4 ++ telegraf/delphix-telegraf-service | 8 +-- telegraf/telegraf.base | 63 ++++++------------- telegraf/telegraf.inputs.dct | 7 --- telegraf/telegraf.inputs.playbook | 23 +++---- ...s.nfs_iscsi => telegraf.inputs.storage_io} | 23 +++++-- 7 files changed, 52 insertions(+), 77 deletions(-) create mode 100755 telegraf/connstat-stats.sh rename telegraf/{telegraf.inputs.nfs_iscsi => telegraf.inputs.storage_io} (57%) diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index 9134f81..5ff3c23 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -184,7 +184,6 @@ cat >"$INFLUXDB_OUTPUT" < Date: Tue, 21 Apr 2026 12:10:38 +0530 Subject: [PATCH 05/15] DLPX-96312 Resolved Slack Review comments --- telegraf/connstat-stats.sh | 66 +++++++++++++++++++++++++++++++++++++- telegraf/telegraf.base | 25 +++++++++++---- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh index c6225fb..e2bdfc0 100755 --- a/telegraf/connstat-stats.sh +++ b/telegraf/connstat-stats.sh @@ -1,4 +1,68 @@ #!/bin/sh +# +# Collect per-connection TCP stats from connstat and aggregate by remote +# endpoint (laddr:raddr:rport) to bound cardinality on engines with many +# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or +# Elastic Data (many connections per object storage endpoint IP). 
+# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack. +# +# Service name lookup reads from /etc/services, matching LocalTCPStatsCollector +# exactly. lport is checked before rport so that listening services (where the +# engine is the server) are identified correctly. Falls back to "unknown". +# +# Output fields per aggregated endpoint: +# laddr, raddr, rport, service +# inbytes, outbytes, retranssegs, suna, unsent (summed across connections) +# swnd, cwnd, rwnd, rtt (averaged across connections) +# connections (count of aggregated conns) +# /usr/bin/connstat -PLe -i 10 -T u \ -o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \ - | grep --line-buffered -v "^=" + | awk -F',' ' +BEGIN { + # Load port->service mapping from /etc/services, same as LocalTCPStatsCollector. + # Pattern matches lines of the form: "servicename port/tcp" + while ((getline line < "/etc/services") > 0) { + sub(/^[[:space:]]+/, "", line) + if (line ~ /^(#|$)/) continue + n = split(line, f, /[[:space:]]+/) + if (n >= 2 && f[2] ~ /\/tcp/) { + split(f[2], pf, "/") + port = pf[1] + 0 + if (!(port in svc)) svc[port] = f[1] + } + } + close("/etc/services") + # Delphix DSP (Session Protocol) port — not present in /etc/services. + # Matches the ServiceProtocol special-case in LocalTCPStatsCollector. 
+ svc[50001] = "dlpx-sp" +} +/^=/ { + for (key in cnt) { + n = cnt[key] + split(key, k, SUBSEP) + print k[1] "," k[2] "," k[3] "," k[4] "," \ + inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ + int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n + } + delete cnt; delete inb; delete outb; delete ret + delete sun; delete uns; delete sw; delete cw; delete rw; delete rt + next +} +NF == 13 { + lport = $2 + 0 + rport = $4 + 0 + if (lport in svc) { + service = svc[lport] + } else if (rport in svc) { + service = svc[rport] + } else { + service = "unknown" + } + key = $1 SUBSEP $3 SUBSEP rport SUBSEP service + inb[key] += $5; outb[key] += $6; ret[key] += $7 + sun[key] += $8; uns[key] += $9 + sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 + cnt[key]++ +} +' diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 39d2313..0ed7754 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -20,9 +20,11 @@ # INPUT PLUGINS # ############################################################################### -# Get CPU usage +# Get CPU usage — only cpu-total, not per-core (reduces data volume on +# many-CPU engines; agg_cpu automatically inherits this restriction). +# percpu defaults to true so must be explicitly set to false. [[inputs.cpu]] - percpu = true + percpu = false totalcpu = true collect_cpu_time = false report_active = false @@ -34,8 +36,10 @@ # Get disk I/O stats, excluding ZFS zvol devices (zd*) which are internal # ZFS block devices not useful for performance diagnostics. +# wwid is a redundant 100+ char tag; the short-form name tag is sufficient. [[inputs.diskio]] tagdrop = {name = ["zd*"]} + tagexclude = ["wwid"] # Get Memory stats [[inputs.mem]] @@ -44,8 +48,13 @@ [[inputs.net]] fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] -# Per-connection TCP stats (bytes, RTT, window sizes) via connstat. -# Mirrors the TCP_STATS collected by the mgmt stack into analytics_datapoint. 
+# Per-endpoint TCP stats (bytes, RTT, window sizes) via connstat. +# Aggregated by remote endpoint (laddr:raddr:rport) to mirror the aggregation +# in LocalTCPStatsCollector — avoids cardinality explosion on Oracle dNFS +# engines (hundreds of connections per VDB host) and Elastic Data engines +# (many connections per object storage endpoint IP). +# Cumulative fields (inbytes, outbytes, etc.) are summed; window/RTT fields +# are averaged; connections = number of TCP connections aggregated. [[inputs.execd]] command = ["/etc/telegraf/connstat-stats.sh"] name_override = "tcp_stats" @@ -54,9 +63,9 @@ data_format = "csv" csv_delimiter = "," csv_trim_space = true - csv_column_names = ["laddr", "lport", "raddr", "rport", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt"] - csv_column_types = ["string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] - csv_tag_columns = ["laddr", "lport", "raddr", "rport"] + csv_column_names = ["laddr", "raddr", "rport", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] + csv_column_types = ["string", "string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] + csv_tag_columns = ["laddr", "raddr", "rport", "service"] # Track CPU and Memory for the "delphix-mgmt" service (and children). [[inputs.procstat]] @@ -79,8 +88,10 @@ [[inputs.system]] # ZFS kstats (arcstat, abdstat, zfetch, etc) +# arcstats_l2_* fields are L2ARC stats — unused on all appliances (no L2ARC). 
[[inputs.zfs]] interval = "1m" + fielddrop = ["arcstats_l2_*"] # Detailed ZFS pool metrics from "zpool_influxdb" (noisy) #[[inputs.exec]] From b30f511137a83a3edd33b4c37cdf112fe2853abf Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Thu, 23 Apr 2026 11:35:26 +0530 Subject: [PATCH 06/15] DLPX-96312 Add perf-diagnostics-deploy skill for Telegraf/InfluxDB deploy workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a Claude Code skill that automates the change → deploy → verify workflow: SSH to a Delphix test engine, copy changed config files to their correct engine paths, restart services, wait for data, and query InfluxDB to confirm changes are working. Co-Authored-By: Claude Sonnet 4.6 --- .../skills/perf-diagnostics-deploy/SKILL.md | 233 ++++++++++++++++++ telegraf/telegraf.base | 15 +- telegraf/telegraf.inputs.storage_io | 33 +-- 3 files changed, 254 insertions(+), 27 deletions(-) create mode 100644 .claude/skills/perf-diagnostics-deploy/SKILL.md diff --git a/.claude/skills/perf-diagnostics-deploy/SKILL.md b/.claude/skills/perf-diagnostics-deploy/SKILL.md new file mode 100644 index 0000000..1a4e819 --- /dev/null +++ b/.claude/skills/perf-diagnostics-deploy/SKILL.md @@ -0,0 +1,233 @@ +--- +name: perf-diagnostics-deploy +description: Use when making changes to Telegraf or InfluxDB config files in the performance-diagnostics repo and wanting to test those changes on a live Delphix engine via SSH +--- + +# Performance Diagnostics Deploy & Verify + +## Overview + +Workflow for making config changes to the performance-diagnostics repo, deploying them to a Delphix test engine over SSH, and verifying the changes are working correctly via InfluxDB queries. + +## Workflow + +```dot +digraph deploy { + "Make changes" -> "Ask: test on engine?"; + "Ask: test on engine?" -> "Done" [label="no"]; + "Ask: test on engine?" -> "Ask: which engine hostname?" [label="yes"]; + "Ask: which engine hostname?" 
-> "SSH as delphix, ask for password"; + "SSH as delphix, ask for password" -> "Deploy changed files"; + "Deploy changed files" -> "Restart services"; + "Restart services" -> "Wait 5 min"; + "Wait 5 min" -> "Query InfluxDB, verify changes"; +} +``` + +## Step 1 — Make Changes + +Make the requested code/config changes. Summarise what was changed and why before asking about testing. + +## Step 2 — Ask to Test + +``` +Changes done. Do you want to test these on a Delphix engine? +``` + +If no → stop. If yes → proceed. + +## Step 3 — Get Engine + Password + +``` +Which Delphix engine hostname should I deploy to? +``` + +Then SSH using `sshpass`: +```bash +sshpass -p "$PASSWORD" ssh -o StrictHostKeyChecking=no delphix@$HOST "..." +``` + +Ask for the SSH password if not already known. Default user is always `delphix`. + +## Step 4 — Deploy Changed Files + +**File locations on engine:** + +| Repo path | Engine path | +|---|---| +| `telegraf/telegraf.base` | `/etc/telegraf/telegraf.base` | +| `telegraf/telegraf.inputs.*` | `/etc/telegraf/telegraf.inputs.*` | +| `telegraf/connstat-stats.sh` | `/etc/telegraf/connstat-stats.sh` | +| `telegraf/nfs-threads.sh` | `/etc/telegraf/nfs-threads.sh` | +| `telegraf/zcache-stats.sh` | `/etc/telegraf/zcache-stats.sh` | +| `telegraf/zpool-iostat-o.sh` | `/etc/telegraf/zpool-iostat-o.sh` | +| `telegraf/delphix-telegraf-service` | `/usr/bin/delphix-telegraf-service` | +| `telegraf/perf_playbook` | `/usr/bin/perf_playbook` | +| `telegraf/delphix-telegraf.service` | `/lib/systemd/system/delphix-telegraf.service` | +| `influxdb/delphix-influxdb-init` | `/usr/bin/delphix-influxdb-init` | +| `influxdb/delphix-influxdb-service` | `/usr/bin/delphix-influxdb-service` | +| `influxdb/perf_influxdb` | `/usr/bin/perf_influxdb` | +| `influxdb/influxdb.toml` | `/etc/influxdb/influxdb.toml` | +| `influxdb/influxdb-init.conf` | `/etc/influxdb/influxdb-init.conf` | +| `influxdb/delphix-influxdb.service` | `/lib/systemd/system/delphix-influxdb.service` | +| 
`influxdb/influxdb-nginx.conf` | `/opt/delphix/server/etc/nginx/conf.d/influxdb.conf` | + +Only copy files that were actually changed. Use `scp` to `/tmp/` first, then `sudo cp` to destination. Set `chmod +x` on any shell scripts. + +**InfluxDB data directory:** `/var/lib/influxdb/engine` + +## Step 5 — Restart Services + +```bash +sudo systemctl restart delphix-influxdb +sleep 5 +sudo systemctl restart delphix-telegraf + +# Confirm both are active +systemctl is-active delphix-influxdb +systemctl is-active delphix-telegraf +``` + +## Step 6 — Wait 5 Minutes + +Wait for data to flow into InfluxDB. Use `ScheduleWakeup` with `delaySeconds: 270` (within cache window). + +## Step 7 — Verify via InfluxDB Query + +Get the InfluxDB credentials from the engine: +```bash +sudo cat /etc/influxdb/influxdb_meta +``` + +Query InfluxDB using the Flux API to verify the changes for the **last 5 minutes** of data. Tailor the query to what was changed: + +| Change type | What to verify | +|---|---| +| New measurement added | `from(bucket:"default") |> range(start: -5m) |> filter(fn: (r) => r._measurement == "new_measurement") |> count()` | +| Field removed (e.g. `wwid` tag) | Check tag keys don't include the removed tag | +| Histogram processors | Verify `hist_estat_*` measurements exist with bucket fields | +| `microseconds` field | Check field exists in relevant measurements | +| `connstat` aggregation | Verify `tcp_stats` has `service` and `connections` fields | + +Query via curl: +```bash +curl -s -X POST "http://localhost:8086/api/v2/query?org=delphix" \ + -H "Authorization: Token $INFLUXDB_READ_TOKEN" \ + -H "Content-Type: application/vnd.flux" \ + -d 'from(bucket:"default") |> range(start: -5m) |> filter(fn:(r) => r._measurement == "MEASUREMENT") |> limit(n:5)' +``` + +Report results clearly: what measurements exist, what fields/tags are present, and whether the change is confirmed working. + +Then ask: + +``` +Verification done. Do you want to commit these changes? 
+``` + +If no → stop. If yes → proceed to Step 8. + +## Step 8 — Commit Changes + +Show the latest commit: +```bash +git log -1 --oneline +``` + +Ask: +``` +Latest commit: "<hash> <subject>" +Do you want to (1) amend that commit or (2) create a new commit? +``` + +**If amend:** +```bash +git add -A +git commit --amend --no-edit +``` + +**If new commit:** +Ask: +``` +What should the commit message be? (include the Jira ID, e.g. "DLPX-12345 Fix xyz") +``` + +Then: +```bash +git add -A +git commit -m "<commit message>" +``` + +Then ask: +``` +Commit done. Do you want to push and update/raise a PR? +``` + +If no → stop. If yes → proceed to Step 9. + +## Step 9 — Push and PR + +`git review` is a Delphix tool that pushes the branch and creates/updates the PR in one command. Use it instead of `git push` + `gh pr create`. + +First check for an existing open PR on the current branch: +```bash +gh pr list --head "$(git branch --show-current)" --state open +``` + +**If PR exists → update it:** + +```bash +git review -r +``` + +Then fetch the current PR description and update it to reflect the new changes: +```bash +gh pr view --json body +gh pr edit --body "..." +``` +Keep the existing structure but add/update the relevant sections. + +**If no PR exists → raise a new one:** + +Ask: +``` +What is the Jira ticket number for this PR? (e.g. DLPX-12345) +``` + +Fetch the Jira issue using the Jira MCP tool (`mcp__jira__jira_get_issue`) to understand the problem context. Then run: + +```bash +git review +``` + +This creates the PR as a draft. Get the PR URL from the output, then set a full description: + +```bash +gh pr edit --title "<JIRA-ID>: <short title>" --body "$(cat <<'EOF' +## Summary +- <one bullet per change> + +## Problem +<problem statement, taken from the Jira issue> + +## Solution +<how this change addresses the problem> + +## Testing +- [ ] Deployed to test engine +- [ ] InfluxDB queries confirmed data flowing correctly +- [ ] <any additional check specific to this change> + +Jira: <Jira issue URL> +EOF +)" +``` + +Return the PR URL to the user. 
+ +## Common Mistakes + +- Forgetting `chmod +x` on shell scripts → Telegraf fails with `EXEC` error +- Restarting Telegraf before InfluxDB is ready → Telegraf starts with `[[outputs.discard]]` +- Querying before 5 minutes pass → no data in range, looks broken but isn't +- Copying the wrong file path (influxdb vs telegraf directories) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 0ed7754..9309328 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -33,12 +33,17 @@ # Get mount point stats [[inputs.disk]] mount_points = ["/","/domain0"] - -# Get disk I/O stats, excluding ZFS zvol devices (zd*) which are internal -# ZFS block devices not useful for performance diagnostics. + tagexclude = ["fstype", "mode"] + +# Get disk I/O stats for whole disks only — partitions add cardinality without +# diagnostic value and account for ~30% of diskio/agg_diskio line volume. +# Excluded: +# zd* — ZFS zvol internal block devices +# *p[0-9]* — NVMe partitions (nvme0n1p1, nvme0n1p9, etc.) +# sd*[0-9]* — SCSI/SATA partitions (sda1, sdb2, etc.) # wwid is a redundant 100+ char tag; the short-form name tag is sufficient. [[inputs.diskio]] - tagdrop = {name = ["zd*"]} + tagdrop = {name = ["zd*", "*p[0-9]*", "sd*[0-9]*"]} tagexclude = ["wwid"] # Get Memory stats @@ -73,6 +78,7 @@ include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Track CPU and Memory for the "zfs-object-agent" service (and children). 
[[inputs.procstat]] @@ -80,6 +86,7 @@ include_systemd_children = true namedrop = ["procstat_lookup"] fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + tagexclude = ["cgroup_full"] # Get process counts [[inputs.processes]] diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 0845ab9..6fc249c 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -50,35 +50,22 @@ [processors.converter.fields] integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] -# Parse microseconds latency histograms for all estat_* measurements. -# Transforms "{20000,5},{30000,15}" → {"20000":5,"30000":15} as a JSON object. -# Covers both storage_io (estat_nfs/iscsi/backend-io) and playbook measurements -# (estat_zpl/zvol/zio/etc) — no duplication needed in telegraf.inputs.playbook. +# Clone estat_* measurements as hist_estat_* to hold histogram data only. +# microseconds is removed from the originals (order 2 below) so it lives in +# hist_estat_* exclusively — no duplication. +# Keeps the original format "{20000,5},{30000,15}" compatible with import code. [[processors.clone]] order = 1 name_prefix = "hist_" namepass = ["estat_*"] -[[processors.regex]] +# Drop microseconds from all original estat_* measurements after cloning. +# Covers both storage_io (estat_nfs/iscsi/backend-io) and playbook +# (estat_zpl/zvol/zio/etc) measurements in one place. 
+[[processors.strings]] order = 2 - namepass = ["hist_estat_*"] - [[processors.regex.fields]] - key = "microseconds" - pattern = "{(\\d+),(\\d+)}" - replacement = "\"${1}\":${2}" - [[processors.regex.fields]] - key = "microseconds" - pattern = ".*" - replacement = "{$0}" - -[[processors.parser]] - order = 3 - merge = "override" - parse_fields = ["microseconds"] - drop_original = false - data_format = "json" - namepass = ["hist_estat_*"] - fieldpass = ["microseconds"] + namepass = ["estat_*"] + fieldexclude = ["microseconds"] # End of Storage I/O section ############################################################################## From 0fb7aea217ecde323d501b41b16740bf580ae7e8 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Thu, 23 Apr 2026 15:25:12 +0530 Subject: [PATCH 07/15] DLPX-96312 Filter garbage stat names from estat metaslab-alloc output Wraps estat metaslab-alloc in a shell script that drops JSON lines whose "name" tag contains non-standard characters (backslashes, hashes, etc.). Addresses DLPX-88427 where a kernel bug causes random memory bytes or C macro strings to appear as stat names, producing unreadable metrics. Co-Authored-By: Claude Sonnet 4.6 --- telegraf/connstat-stats.sh | 28 +++++++++++++++------------- telegraf/metaslab-alloc-stats.sh | 9 +++++++++ telegraf/telegraf.base | 6 +++--- telegraf/telegraf.inputs.playbook | 5 +++-- telegraf/telegraf.inputs.storage_io | 27 +++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 18 deletions(-) create mode 100755 telegraf/metaslab-alloc-stats.sh diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh index e2bdfc0..4347a06 100755 --- a/telegraf/connstat-stats.sh +++ b/telegraf/connstat-stats.sh @@ -1,7 +1,7 @@ #!/bin/sh # # Collect per-connection TCP stats from connstat and aggregate by remote -# endpoint (laddr:raddr:rport) to bound cardinality on engines with many +# endpoint (laddr:raddr:service) to bound cardinality on engines with many # connections — e.g. 
Oracle dNFS (hundreds of connections per VDB host) or # Elastic Data (many connections per object storage endpoint IP). # Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack. @@ -11,7 +11,7 @@ # engine is the server) are identified correctly. Falls back to "unknown". # # Output fields per aggregated endpoint: -# laddr, raddr, rport, service +# laddr, raddr, service # inbytes, outbytes, retranssegs, suna, unsent (summed across connections) # swnd, cwnd, rwnd, rtt (averaged across connections) # connections (count of aggregated conns) @@ -33,33 +33,35 @@ BEGIN { } } close("/etc/services") - # Delphix DSP (Session Protocol) port — not present in /etc/services. - # Matches the ServiceProtocol special-case in LocalTCPStatsCollector. - svc[50001] = "dlpx-sp" + # Delphix-specific ports not present in /etc/services. + # Matches LocalTCPStatsCollector.getService() special-cases exactly. + svc[8415] = "dlpx-sp" # DSP (ServiceProtocol.PORT) + svc[50001] = "network-throughput-test" # TtcpPerfSession.DEFAULT_PORT + svc[8341] = "oracle-logsync" # HTTP server (TunableRegistry.HTTP_SERVER_PORT default) + svc[9100] = "dlpx-connector" # Host Connector (Connector.DEFAULT_PORT) } /^=/ { for (key in cnt) { n = cnt[key] split(key, k, SUBSEP) - print k[1] "," k[2] "," k[3] "," k[4] "," \ + print k[1] "," k[2] "," k[3] "," \ inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n } + fflush() delete cnt; delete inb; delete outb; delete ret delete sun; delete uns; delete sw; delete cw; delete rw; delete rt next } NF == 13 { - lport = $2 + 0 - rport = $4 + 0 - if (lport in svc) { - service = svc[lport] - } else if (rport in svc) { - service = svc[rport] + if (($2 + 0) in svc) { + service = svc[$2 + 0] + } else if (($4 + 0) in svc) { + service = svc[$4 + 0] } else { service = "unknown" } - key = $1 SUBSEP $3 SUBSEP rport SUBSEP service + key = $1 SUBSEP $3 SUBSEP service inb[key] += 
$5; outb[key] += $6; ret[key] += $7 sun[key] += $8; uns[key] += $9 sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 diff --git a/telegraf/metaslab-alloc-stats.sh b/telegraf/metaslab-alloc-stats.sh new file mode 100755 index 0000000..aaee3fc --- /dev/null +++ b/telegraf/metaslab-alloc-stats.sh @@ -0,0 +1,9 @@ +#!/bin/sh +# +# Wrapper around "estat metaslab-alloc -jm 10" that filters out metrics whose +# "name" tag contains garbage characters (DLPX-88427). A kernel bug causes +# estat to occasionally emit stat names containing raw memory bytes or C macro +# strings. Only names consisting of printable ASCII letters, digits, spaces, +# and common punctuation are passed through. --line-buffered makes grep flush +# every record to the Telegraf execd pipe instead of block-buffering it. +estat metaslab-alloc -jm 10 | grep --line-buffered -E '"name":"[A-Za-z0-9 ,_()/.-]+"' diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 9309328..0933bb6 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -68,9 +68,9 @@ data_format = "csv" csv_delimiter = "," csv_trim_space = true - csv_column_names = ["laddr", "raddr", "rport", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] - csv_column_types = ["string", "string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] - csv_tag_columns = ["laddr", "raddr", "rport", "service"] + csv_column_names = ["laddr", "raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] + csv_column_types = ["string", "string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] + csv_tag_columns = ["laddr", "raddr", "service"] # Track CPU and Memory for the "delphix-mgmt" service (and children). 
[[inputs.procstat]] diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index dcfa71e..d478ab2 100644 --- a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -55,9 +55,10 @@ ] json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] -# Collect output from "estat metaslab-alloc -jm 10" +# Collect output from "estat metaslab-alloc -jm 10" via wrapper script. +# The wrapper filters out metrics with garbage "name" tags (DLPX-88427). [[inputs.execd]] - command = ["estat", "metaslab-alloc", "-jm", "10"] + command = ["/etc/telegraf/metaslab-alloc-stats.sh"] name_override = "estat_metaslab-alloc" signal = "none" restart_delay = "30s" diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index 6fc249c..c4ed37c 100644 --- a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -67,5 +67,32 @@ namepass = ["estat_*"] fieldexclude = ["microseconds"] +# Expand hist_estat_* microseconds histogram strings into per-bucket rows for +# Grafana heatmap support. Each "{upper_bound_us,count}" pair becomes a +# separate metric with le=<upper_bound_us> tag and count field, replacing the +# opaque string. Runs after clone (order=1); strings (order=2) strips microseconds +# only from estat_*, so the field is still present in hist_estat_* at this point. 
+[[processors.starlark]] + order = 3 + namepass = ["hist_estat_*"] + source = ''' +def apply(metric): + ms = metric.fields.get("microseconds") + if ms == None: + return [metric] + + result = [] + for pair in ms[1:-1].split("},{"): + parts = pair.split(",") + if len(parts) == 2: + m = deepcopy(metric) + m.tags["le"] = parts[0] + m.fields["count"] = int(parts[1]) + m.fields.pop("microseconds") + result.append(m) + + return result if result else [metric] +''' + # End of Storage I/O section ############################################################################## From 60660daa58794a53d468e295b7eaaf1107dbe61d Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 27 Apr 2026 17:23:34 +0530 Subject: [PATCH 08/15] DLPX-96312 Trim Telegraf field sets and intervals to reduce InfluxDB volume --- telegraf/telegraf.base | 31 ++++++++++++++++++++++++++++- telegraf/telegraf.inputs.storage_io | 3 ++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 0933bb6..5abc3e2 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -32,8 +32,10 @@ # Get mount point stats [[inputs.disk]] + interval = "60s" mount_points = ["/","/domain0"] tagexclude = ["fstype", "mode"] + fieldpass = ["used", "free", "total"] # Get disk I/O stats for whole disks only — partitions add cardinality without # diagnostic value and account for ~30% of diskio/agg_diskio line volume. @@ -43,11 +45,14 @@ # sd*[0-9]* — SCSI/SATA partitions (sda1, sdb2, etc.) # wwid is a redundant 100+ char tag; the short-form name tag is sufficient. 
[[inputs.diskio]] + interval = "60s" tagdrop = {name = ["zd*", "*p[0-9]*", "sd*[0-9]*"]} tagexclude = ["wwid"] + fieldpass = ["reads", "writes", "read_bytes", "write_bytes", "read_time", "write_time", "iops_in_progress"] # Get Memory stats [[inputs.mem]] + fieldpass = ["used", "available", "total", "free", "cached", "buffered", "dirty", "slab"] # Get some network interface stats [[inputs.net]] @@ -98,7 +103,31 @@ # arcstats_l2_* fields are L2ARC stats — unused on all appliances (no L2ARC). [[inputs.zfs]] interval = "1m" - fielddrop = ["arcstats_l2_*"] + fieldpass = [ + "arcstats_anon_data", "arcstats_anon_evictable_data", + "arcstats_anon_evictable_metadata", "arcstats_anon_metadata", + "arcstats_arc_need_free", "arcstats_arc_no_grow", "arcstats_arc_prune", + "arcstats_arc_sys_free", "arcstats_async_upgrade_sync", + "arcstats_c", "arcstats_data_size", + "arcstats_demand_data_hits", "arcstats_demand_data_misses", + "arcstats_demand_hit_predictive_prefetch", + "arcstats_evict_not_enough", "arcstats_evict_skip", + "arcstats_hits", "arcstats_misses", + "arcstats_memory_available_bytes", "arcstats_memory_direct_count", + "arcstats_memory_free_bytes", "arcstats_memory_indirect_count", + "arcstats_metadata_size", + "arcstats_mfu_data", "arcstats_mfu_evictable_data", + "arcstats_mfu_evictable_metadata", "arcstats_mfu_ghost_hits", + "arcstats_mfu_hits", "arcstats_mfu_metadata", + "arcstats_mru_data", "arcstats_mru_evictable_data", + "arcstats_mru_evictable_metadata", "arcstats_mru_ghost_hits", + "arcstats_mru_hits", "arcstats_mru_metadata", + "arcstats_prefetch_data_hits", "arcstats_prefetch_data_misses", + "arcstats_size", + "zil_commit_count", "zil_itx_count", "zil_commit_stall_count", + "zfetchstats_hits", "zfetchstats_misses", + "dmu_tx_dirty_throttle", "dmu_tx_delay" + ] # Detailed ZFS pool metrics from "zpool_influxdb" (noisy) #[[inputs.exec]] diff --git a/telegraf/telegraf.inputs.storage_io b/telegraf/telegraf.inputs.storage_io index c4ed37c..2341147 100644 --- 
a/telegraf/telegraf.inputs.storage_io +++ b/telegraf/telegraf.inputs.storage_io @@ -87,8 +87,9 @@ def apply(metric): if len(parts) == 2: m = deepcopy(metric) m.tags["le"] = parts[0] + for k in list(m.fields.keys()): + m.fields.pop(k) m.fields["count"] = int(parts[1]) - m.fields.pop("microseconds") result.append(m) return result if result else [metric] From c2403c339fce203814ee2abba2b0df2ecb34aba5 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Mon, 27 Apr 2026 20:21:07 +0530 Subject: [PATCH 09/15] DLPX-96312 Route non-Grafana metrics to separate support_metrics InfluxDB bucket --- influxdb/delphix-influxdb-init | 51 +++++++++++++++++++++++++++++++--- influxdb/influxdb-init.conf | 2 ++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index f7335ee..8abffba 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -95,6 +95,7 @@ fi ADMIN_TOKEN="" ORG_ID="" BUCKET_ID="" +SUPPORT_BUCKET_ID="" if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then while IFS= read -r line; do @@ -104,9 +105,11 @@ if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then ADMIN_TOKEN) ADMIN_TOKEN="$value" ;; ORG_ID) ORG_ID="$value" ;; BUCKET_ID) BUCKET_ID="$value" ;; + SUPPORT_BUCKET_ID) SUPPORT_BUCKET_ID="$value" ;; INFLUXDB_ADMIN_PASSWORD) INFLUXDB_ADMIN_PASSWORD="$value" ;; WRITE_TOKEN) WRITE_TOKEN="$value" ;; READ_TOKEN) READ_TOKEN="$value" ;; + SUPPORT_WRITE_TOKEN) SUPPORT_WRITE_TOKEN="$value" ;; esac done <"$INFLUXDB_SETUP_STATE_FILE" else @@ -137,11 +140,25 @@ else umask "$old_umask" fi +# +# Create the support_metrics bucket (skipped if already persisted in state). 
+# +if [[ -z "$SUPPORT_BUCKET_ID" ]]; then + SUPPORT_BUCKET_RESPONSE=$(influx_post "/api/v2/buckets" "{ + \"orgID\": \"$ORG_ID\", + \"name\": \"$INFLUXDB_SUPPORT_BUCKET\", + \"retentionRules\": [{\"type\": \"expire\", \"everySeconds\": $INFLUXDB_SUPPORT_RETENTION_SECONDS}] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_BUCKET_ID=$(json_field "$SUPPORT_BUCKET_RESPONSE" "['id']") || exit 1 + printf 'SUPPORT_BUCKET_ID=%s\n' "$SUPPORT_BUCKET_ID" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + # Token creation is guarded so that on crash-resume (setup state exists but # meta file not yet written), we reuse already-created tokens rather than # creating orphaned duplicates in InfluxDB on each retry. WRITE_TOKEN="${WRITE_TOKEN:-}" READ_TOKEN="${READ_TOKEN:-}" +SUPPORT_WRITE_TOKEN="${SUPPORT_WRITE_TOKEN:-}" # # Create a write-only token for Telegraf (skipped if already persisted in state). @@ -174,9 +191,26 @@ if [[ -z "$READ_TOKEN" ]]; then fi # -# Write the [[outputs.influxdb_v2]] stanza to a dedicated telegraf output file -# and enable it via the INFLUXDB_ENABLED flag. The flag is read by -# delphix-telegraf-service to conditionally include this output. +# Create a write-only token for the support_metrics bucket (skipped if already persisted). 
+# +if [[ -z "$SUPPORT_WRITE_TOKEN" ]]; then + SUPPORT_WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{ + \"orgID\": \"$ORG_ID\", + \"description\": \"telegraf-support-write-token\", + \"permissions\": [ + {\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$SUPPORT_BUCKET_ID\", \"orgID\": \"$ORG_ID\"}} + ] + }" "$ADMIN_TOKEN") || exit 1 + SUPPORT_WRITE_TOKEN=$(json_field "$SUPPORT_WRITE_TOKEN_RESPONSE" "['token']") || exit 1 + printf 'SUPPORT_WRITE_TOKEN=%s\n' "$SUPPORT_WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE" +fi + +# +# Write two [[outputs.influxdb_v2]] stanzas to a dedicated telegraf output file: +# - default bucket: Grafana-facing measurements (cpu, mem, disk, net, zfs, estat_*, hist_estat_*) +# - support_metrics bucket: operational measurements not displayed in Grafana +# (tcp_stats, processes, system, procstat, agg_*) +# The flag is read by delphix-telegraf-service to conditionally include this output. # cat >"$INFLUXDB_OUTPUT" <"$INFLUXDB_OUTPUT" <"$tmp_meta" < Date: Tue, 12 May 2026 14:31:20 +0530 Subject: [PATCH 10/15] DLPX-96312 Route tcp_stats to default InfluxDB bucket instead of support bucket Co-Authored-By: Claude Sonnet 4.6 --- influxdb/delphix-influxdb-init | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index 8abffba..e3036d6 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -218,14 +218,14 @@ cat >"$INFLUXDB_OUTPUT" < Date: Tue, 12 May 2026 18:42:22 +0530 Subject: [PATCH 11/15] fix(connstat): replace mawk with Python to fix stdout buffering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mawk 1.3.4 does not reliably flush stdout to Telegraf execd's pipe, causing batches to be held in the buffer for hours before a single dump — resulting in no data or garbage derivatives when data arrives. 
Python with sys.stdout.flush() after every 10-second batch gives the same aggregation (laddr:raddr:service) and flushes deterministically. Co-Authored-By: Claude Sonnet 4.6 --- telegraf/connstat-stats.sh | 142 +++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 53 deletions(-) diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh index 4347a06..eec32ef 100755 --- a/telegraf/connstat-stats.sh +++ b/telegraf/connstat-stats.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env python3 # # Collect per-connection TCP stats from connstat and aggregate by remote # endpoint (laddr:raddr:service) to bound cardinality on engines with many @@ -16,55 +16,91 @@ # swnd, cwnd, rwnd, rtt (averaged across connections) # connections (count of aggregated conns) # -/usr/bin/connstat -PLe -i 10 -T u \ - -o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \ - | awk -F',' ' -BEGIN { - # Load port->service mapping from /etc/services, same as LocalTCPStatsCollector. - # Pattern matches lines of the form: "servicename port/tcp" - while ((getline line < "/etc/services") > 0) { - sub(/^[[:space:]]+/, "", line) - if (line ~ /^(#|$)/) continue - n = split(line, f, /[[:space:]]+/) - if (n >= 2 && f[2] ~ /\/tcp/) { - split(f[2], pf, "/") - port = pf[1] + 0 - if (!(port in svc)) svc[port] = f[1] - } - } - close("/etc/services") - # Delphix-specific ports not present in /etc/services. - # Matches LocalTCPStatsCollector.getService() special-cases exactly. 
- svc[8415] = "dlpx-sp" # DSP (ServiceProtocol.PORT) - svc[50001] = "network-throughput-test" # TtcpPerfSession.DEFAULT_PORT - svc[8341] = "oracle-logsync" # HTTP server (TunableRegistry.HTTP_SERVER_PORT default) - svc[9100] = "dlpx-connector" # Host Connector (Connector.DEFAULT_PORT) -} -/^=/ { - for (key in cnt) { - n = cnt[key] - split(key, k, SUBSEP) - print k[1] "," k[2] "," k[3] "," \ - inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \ - int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n - } - fflush() - delete cnt; delete inb; delete outb; delete ret - delete sun; delete uns; delete sw; delete cw; delete rw; delete rt - next -} -NF == 13 { - if (($2 + 0) in svc) { - service = svc[$2 + 0] - } else if (($4 + 0) in svc) { - service = svc[$4 + 0] - } else { - service = "unknown" - } - key = $1 SUBSEP $3 SUBSEP service - inb[key] += $5; outb[key] += $6; ret[key] += $7 - sun[key] += $8; uns[key] += $9 - sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13 - cnt[key]++ -} -' +import subprocess +import sys + +# Load port->service mapping from /etc/services, same as LocalTCPStatsCollector. +svc = {} +try: + with open('/etc/services') as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + parts = line.split() + if len(parts) >= 2 and '/tcp' in parts[1]: + try: + port = int(parts[1].split('/')[0]) + if port not in svc: + svc[port] = parts[0] + except ValueError: + pass +except OSError: + pass + +# Delphix-specific ports not present in /etc/services. +# Matches LocalTCPStatsCollector.getService() special-cases exactly. 
+svc[8415] = 'dlpx-sp' +svc[50001] = 'network-throughput-test' +svc[8341] = 'oracle-logsync' +svc[9100] = 'dlpx-connector' + +proc = subprocess.Popen( + ['/usr/bin/connstat', '-PLe', '-i', '10', '-T', 'u', + '-o', 'laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,' + 'suna,unsent,swnd,cwnd,rwnd,rtt'], + stdout=subprocess.PIPE, + text=True, + bufsize=1, +) + +cnt = {} +inb = {} +outb = {} +ret_ = {} +sun = {} +uns = {} +sw = {} +cw = {} +rw = {} +rt = {} + +for raw in proc.stdout: + line = raw.rstrip('\n') + if line.startswith('='): + for key, n in cnt.items(): + la, ra, sv = key + sys.stdout.write( + f"{la},{ra},{sv}," + f"{inb[key]},{outb[key]},{ret_[key]},{sun[key]},{uns[key]}," + f"{sw[key]//n},{cw[key]//n},{rw[key]//n},{rt[key]//n},{n}\n" + ) + sys.stdout.flush() + cnt.clear(); inb.clear(); outb.clear(); ret_.clear() + sun.clear(); uns.clear(); sw.clear(); cw.clear(); rw.clear(); rt.clear() + continue + + fields = line.split(',') + if len(fields) != 13: + continue + la, lp, ra, rp = fields[0], fields[1], fields[2], fields[3] + lp_i = int(lp) if lp.isdigit() else 0 + rp_i = int(rp) if rp.isdigit() else 0 + if lp_i in svc: + sv = svc[lp_i] + elif rp_i in svc: + sv = svc[rp_i] + else: + sv = 'unknown' + + key = (la, ra, sv) + cnt[key] = cnt.get(key, 0) + 1 + inb[key] = inb.get(key, 0) + int(fields[4]) + outb[key] = outb.get(key, 0) + int(fields[5]) + ret_[key] = ret_.get(key, 0) + int(fields[6]) + sun[key] = sun.get(key, 0) + int(fields[7]) + uns[key] = uns.get(key, 0) + int(fields[8]) + sw[key] = sw.get(key, 0) + int(fields[9]) + cw[key] = cw.get(key, 0) + int(fields[10]) + rw[key] = rw.get(key, 0) + int(fields[11]) + rt[key] = rt.get(key, 0) + int(fields[12]) From 9a5175acc7650ad471458bdafbde1f129009a676 Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Fri, 15 May 2026 00:05:23 +0530 Subject: [PATCH 12/15] DLPX-96312 Fix tcp_stats data to have right fields only in default bucket --- influxdb/delphix-influxdb-init | 28 ++++++++++++++++++++++------ 1 file 
changed, 22 insertions(+), 6 deletions(-) diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index e3036d6..a59e42b 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -206,10 +206,15 @@ if [[ -z "$SUPPORT_WRITE_TOKEN" ]]; then fi # -# Write two [[outputs.influxdb_v2]] stanzas to a dedicated telegraf output file: -# - default bucket: Grafana-facing measurements (cpu, mem, disk, net, zfs, estat_*, hist_estat_*) -# - support_metrics bucket: operational measurements not displayed in Grafana -# (tcp_stats, processes, system, procstat, agg_*) +# Write three [[outputs.influxdb_v2]] stanzas to a dedicated telegraf output file: +# - default bucket: Grafana-facing measurements (cpu, disk, diskio, net, zfs, +# estat_nfs, estat_iscsi, hist_estat_*, tcp_stats slim) +# - default bucket (second stanza): tcp_stats slim — only the 4 fields needed by Grafana +# dashboards (connections, inbytes, outbytes, retranssegs) +# - support_metrics bucket: operational/diagnostic measurements — estat_backend-io +# (raw latency scalars, not used by Grafana; hist_estat_backend-io stays in default +# for the heatmap), mem (not used by Grafana), processes, system, procstat, agg_*, +# and full tcp_stats with all TCP internals # The flag is read by delphix-telegraf-service to conditionally include this output. # cat >"$INFLUXDB_OUTPUT" <"$INFLUXDB_OUTPUT" < Date: Wed, 20 May 2026 19:30:56 +0530 Subject: [PATCH 13/15] Drop laddr tag from tcp_stats; keep raddr for per-host breakdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit laddr is always the engine's own IP — constant, adds no diagnostic value as a tag. Dropping it reduces row size (~15 bytes/row) and removes a tag that was arbitrary in LocalTCPStatsCollector's default mode anyway. raddr is kept so callers can split tcp_stats by remote host (e.g. NFS throughput per VDB host, as Craig confirmed is needed for PerfDB). 
Aggregation key changes from (laddr, raddr, service) → (raddr, service). telegraf.base updated to remove laddr from csv_column_names and csv_tag_columns. Co-Authored-By: Claude Sonnet 4.6 --- telegraf/connstat-stats.sh | 23 ++++++++++++----------- telegraf/telegraf.base | 14 +++++++------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/telegraf/connstat-stats.sh b/telegraf/connstat-stats.sh index eec32ef..23b1fd4 100755 --- a/telegraf/connstat-stats.sh +++ b/telegraf/connstat-stats.sh @@ -1,17 +1,18 @@ #!/usr/bin/env python3 # -# Collect per-connection TCP stats from connstat and aggregate by remote -# endpoint (laddr:raddr:service) to bound cardinality on engines with many -# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or -# Elastic Data (many connections per object storage endpoint IP). -# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack. +# Collect per-connection TCP stats from connstat and aggregate by +# (raddr, service) to mirror the behaviour of LocalTCPStatsCollector. +# +# laddr (local address) is intentionally excluded: it is always the engine's +# own IP and adds no diagnostic value as a tag. raddr is kept so that callers +# can split stats by remote host — e.g. to see NFS throughput per VDB host. # # Service name lookup reads from /etc/services, matching LocalTCPStatsCollector # exactly. lport is checked before rport so that listening services (where the # engine is the server) are identified correctly. Falls back to "unknown". 
# -# Output fields per aggregated endpoint: -# laddr, raddr, service +# Output fields per aggregated (raddr, service) group: +# raddr, service # inbytes, outbytes, retranssegs, suna, unsent (summed across connections) # swnd, cwnd, rwnd, rtt (averaged across connections) # connections (count of aggregated conns) @@ -69,9 +70,9 @@ for raw in proc.stdout: line = raw.rstrip('\n') if line.startswith('='): for key, n in cnt.items(): - la, ra, sv = key + ra, sv = key sys.stdout.write( - f"{la},{ra},{sv}," + f"{ra},{sv}," f"{inb[key]},{outb[key]},{ret_[key]},{sun[key]},{uns[key]}," f"{sw[key]//n},{cw[key]//n},{rw[key]//n},{rt[key]//n},{n}\n" ) @@ -83,7 +84,7 @@ for raw in proc.stdout: fields = line.split(',') if len(fields) != 13: continue - la, lp, ra, rp = fields[0], fields[1], fields[2], fields[3] + lp, ra, rp = fields[1], fields[2], fields[3] lp_i = int(lp) if lp.isdigit() else 0 rp_i = int(rp) if rp.isdigit() else 0 if lp_i in svc: @@ -93,7 +94,7 @@ for raw in proc.stdout: else: sv = 'unknown' - key = (la, ra, sv) + key = (ra, sv) cnt[key] = cnt.get(key, 0) + 1 inb[key] = inb.get(key, 0) + int(fields[4]) outb[key] = outb.get(key, 0) + int(fields[5]) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 5abc3e2..f8759aa 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -59,10 +59,10 @@ fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] # Per-endpoint TCP stats (bytes, RTT, window sizes) via connstat. -# Aggregated by remote endpoint (laddr:raddr:rport) to mirror the aggregation -# in LocalTCPStatsCollector — avoids cardinality explosion on Oracle dNFS -# engines (hundreds of connections per VDB host) and Elastic Data engines -# (many connections per object storage endpoint IP). +# Aggregated by (raddr, service): laddr (always the engine's own IP) is dropped +# as it adds no diagnostic value. raddr is kept so stats can be split by remote +# host — e.g. NFS throughput per VDB host. 
Matches LocalTCPStatsCollector's +# default aggregation behaviour. # Cumulative fields (inbytes, outbytes, etc.) are summed; window/RTT fields # are averaged; connections = number of TCP connections aggregated. [[inputs.execd]] @@ -73,9 +73,9 @@ data_format = "csv" csv_delimiter = "," csv_trim_space = true - csv_column_names = ["laddr", "raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] - csv_column_types = ["string", "string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] - csv_tag_columns = ["laddr", "raddr", "service"] + csv_column_names = ["raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"] + csv_column_types = ["string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"] + csv_tag_columns = ["raddr", "service"] # Track CPU and Memory for the "delphix-mgmt" service (and children). [[inputs.procstat]] From ae889c57828625ed880786c16f2ddaaef2b5f46e Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Wed, 20 May 2026 19:33:36 +0530 Subject: [PATCH 14/15] Drop usage_guest and usage_guest_nice from cpu input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both fields are always zero on Delphix engines — we don't run guest VMs. Confirmed by Craig Alder (support). usage_nice is kept as it is uncertain whether it will always be zero. Co-Authored-By: Claude Sonnet 4.6 --- telegraf/telegraf.base | 3 +++ 1 file changed, 3 insertions(+) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index f8759aa..5786194 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -29,6 +29,9 @@ collect_cpu_time = false report_active = false fieldpass = ["usage*"] + # usage_guest and usage_guest_nice are always zero on Delphix engines — + # we don't run guest VMs. Dropping them saves two fields per interval. 
+ fielddrop = ["usage_guest", "usage_guest_nice"] # Get mount point stats [[inputs.disk]] From 2e8baa628f50ad82c06e7e32b3ce0fc186a5237e Mon Sep 17 00:00:00 2001 From: dbshah12 Date: Thu, 21 May 2026 12:24:21 +0530 Subject: [PATCH 15/15] DLPX-96312 Persist InfluxDB bucket IDs in influxdb_meta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit delphix-influxdb-init already knows both bucket IDs (it just created them via /api/v2/setup and /api/v2/buckets). Persist them into /etc/influxdb/influxdb_meta as INFLUXDB_BUCKET_ID and INFLUXDB_SUPPORT_BUCKET_ID so callers (support_info, future tooling) can look up either bucket without having to scan the engine data directory. This avoids a class of bugs where a scan-the-data-dir heuristic returns the wrong bucket on engines that contain more than just default and support_metrics — notably InfluxDB's internal _monitoring bucket, which alphabetically sorts ahead of support_metrics by hex bucket ID on this engine and was being silently substituted in every bundle. Co-Authored-By: Claude Sonnet 4.6 --- influxdb/delphix-influxdb-init | 2 ++ 1 file changed, 2 insertions(+) diff --git a/influxdb/delphix-influxdb-init b/influxdb/delphix-influxdb-init index a59e42b..7630985 100644 --- a/influxdb/delphix-influxdb-init +++ b/influxdb/delphix-influxdb-init @@ -259,7 +259,9 @@ tmp_meta="$(mktemp "${INFLUXDB_META_FILE}.XXXXXX")" cat >"$tmp_meta" <