Skip to content

Commit 53b635b

Browse files
committed
DLPX-96312 Add InfluxDB/Telegraf infrastructure for Engine Performance Analytics
PR URL: https://www.github.com/delphix/performance-diagnostics/pull/119
1 parent 1a46d8d commit 53b635b

10 files changed

Lines changed: 343 additions & 7 deletions

debian/control

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ Standards-Version: 4.1.2
1313

1414
Package: performance-diagnostics
1515
Architecture: any
16-
Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker.io
16+
Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker.io, influxdb2, curl
1717
Description: eBPF-based Performance Diagnostic Tools
1818
A collection of eBPF-based tools for diagnosing performance issues.

debian/rules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ override_dh_auto_install:
2626
dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin
2727
dh_install telegraf/delphix-telegraf.service /lib/systemd/system
2828
dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf
29+
dh_install influxdb/delphix-influxdb-service influxdb/delphix-influxdb-init influxdb/perf_influxdb /usr/bin
30+
dh_install influxdb/delphix-influxdb.service /lib/systemd/system
31+
dh_install influxdb/influxdb.toml influxdb/influxdb-init.conf /etc/influxdb

influxdb/delphix-influxdb-init

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#!/bin/bash -eu
2+
#
3+
# Copyright (c) 2026 by Delphix. All rights reserved.
4+
#
5+
# One-time InfluxDB initialization: creates org, bucket, admin token,
6+
# a write-only token for Telegraf, a read-only token for DCT Smart Proxy, and writes the
7+
# [[outputs.influxdb_v2]] stanza to /etc/telegraf/telegraf.outputs.influxdb,
8+
# which is included by delphix-telegraf-service when INFLUXDB_ENABLED flag exists.
9+
# Skips setup if InfluxDB is already initialized.
10+
#
11+
12+
INFLUXDB_URL="http://127.0.0.1:8086"
13+
INFLUXDB_CONFIG_DIR="/etc/influxdb"
14+
INFLUXDB_META_FILE="$INFLUXDB_CONFIG_DIR/influxdb_meta"
15+
# State file written immediately after /api/v2/setup so the script can resume
16+
# if it is interrupted before the metadata file is fully written.
17+
INFLUXDB_SETUP_STATE_FILE="$INFLUXDB_CONFIG_DIR/influxdb_setup_state"
18+
INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED
19+
INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb
20+
INFLUXDB_INIT_CONF="$INFLUXDB_CONFIG_DIR/influxdb-init.conf"
21+
22+
# Load tunable configuration (org, bucket, retention, wait parameters).
23+
# shellcheck source=/etc/influxdb/influxdb-init.conf
24+
# shellcheck disable=SC1091
25+
source "$INFLUXDB_INIT_CONF"
26+
27+
INFLUXDB_ADMIN_USER="admin"
28+
INFLUXDB_ADMIN_PASSWORD=""
29+
30+
#
# Emit one diagnostic line on stderr, prefixed with the current UTC time
# in ISO-8601 form. All arguments are joined into a single message.
#
log() {
	printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" >&2
}
36+
37+
#
# Extract a field from a JSON string using python3.
#
# Arguments:
#   $1 - JSON document.
#   $2 - Python subscript expression selecting the field, e.g. "['auth']['token']".
#        It is spliced into the python program, so it must come from trusted
#        code (callers in this script pass literals only).
# Outputs: the field value on stdout.
# Returns: 0 on success; 1 (after logging) if the document cannot be parsed,
#          the field is missing, or the value is JSON null / an empty string.
#          Without the null check python would print the literal "None" and
#          that string would be stored as a token or ID.
#
json_field() {
	local json="$1"
	local field="$2"

	# Here-string instead of 'echo |': no pipeline, and python's exit status
	# is the status that the '||' below tests.
	python3 -c "import json,sys; v=json.loads(sys.stdin.read())$field; sys.exit(1) if v in (None, '') else print(v)" <<<"$json" ||
		{
			log "ERROR: Failed to parse field '$field' from JSON response."
			return 1
		}
}
46+
47+
#
# POST a JSON document to the InfluxDB HTTP API.
#
# Globals:   INFLUXDB_URL (read)
# Arguments:
#   $1 - API endpoint path, appended to INFLUXDB_URL.
#   $2 - JSON request body.
#   $3 - optional API token; sent as "Authorization: Token <token>".
# Outputs:   the response body on stdout.
# Returns:   0 on success; 1 (after logging) if curl reports a failure
#            (with -f this includes HTTP error statuses).
#
# The request body (which carries the admin password during setup) and the
# token are kept off curl's command line so they are not exposed through the
# process list: the body is streamed on stdin and the auth header is read
# from a process-substitution file.
#
influx_post() {
	local endpoint="$1"
	local data="$2"
	local auth_header="${3:-}"

	# --data-binary @- sends stdin verbatim, matching the previous -d "$data".
	local curl_args=(-sf -X POST "$INFLUXDB_URL$endpoint" -H 'Content-Type: application/json' --data-binary @-)

	local response
	if [[ -n "$auth_header" ]]; then
		response=$(printf '%s' "$data" |
			curl "${curl_args[@]}" -H @<(printf 'Authorization: Token %s\n' "$auth_header")) ||
			{
				log "ERROR: HTTP POST to '$endpoint' failed."
				return 1
			}
	else
		response=$(printf '%s' "$data" | curl "${curl_args[@]}") ||
			{
				log "ERROR: HTTP POST to '$endpoint' failed."
				return 1
			}
	fi
	echo "$response"
}
63+
64+
mkdir -p "$INFLUXDB_CONFIG_DIR"
65+
66+
# Skip if already fully initialized.
67+
if [[ -f "$INFLUXDB_META_FILE" ]]; then
68+
log "InfluxDB already initialized, skipping."
69+
exit 0
70+
fi
71+
72+
#
# Wait for InfluxDB to be ready: probe the /health endpoint up to
# INFLUXDB_WAIT_RETRIES times, sleeping INFLUXDB_WAIT_INTERVAL seconds
# after every failed probe. Give up with an error if it never answers.
#
ready=false
for ((attempt = 1; attempt <= INFLUXDB_WAIT_RETRIES; attempt++)); do
	if curl -sf "$INFLUXDB_URL/health" &>/dev/null; then
		ready=true
		break
	fi
	sleep "$INFLUXDB_WAIT_INTERVAL"
done

if [[ "$ready" != "true" ]]; then
	log "ERROR: InfluxDB did not become ready after $((INFLUXDB_WAIT_RETRIES * INFLUXDB_WAIT_INTERVAL))s."
	exit 1
fi
88+
89+
#
90+
# Initial setup — creates org, bucket, and returns admin token + IDs.
91+
# /api/v2/setup is a one-shot operation; if the script is interrupted after
92+
# this point and re-run, the state file lets us skip setup and reuse the
93+
# already-created admin token.
94+
#
95+
ADMIN_TOKEN=""
96+
ORG_ID=""
97+
BUCKET_ID=""
98+
99+
if [[ -f "$INFLUXDB_SETUP_STATE_FILE" ]]; then
100+
while IFS= read -r line; do
101+
key="${line%%=*}"
102+
value="${line#*=}"
103+
case "$key" in
104+
ADMIN_TOKEN) ADMIN_TOKEN="$value" ;;
105+
ORG_ID) ORG_ID="$value" ;;
106+
BUCKET_ID) BUCKET_ID="$value" ;;
107+
INFLUXDB_ADMIN_PASSWORD) INFLUXDB_ADMIN_PASSWORD="$value" ;;
108+
WRITE_TOKEN) WRITE_TOKEN="$value" ;;
109+
READ_TOKEN) READ_TOKEN="$value" ;;
110+
esac
111+
done <"$INFLUXDB_SETUP_STATE_FILE"
112+
else
113+
# Generate password only when actually running setup for the first time.
114+
INFLUXDB_ADMIN_PASSWORD="$(openssl rand -hex 16)"
115+
SETUP_RESPONSE=$(influx_post "/api/v2/setup" "{
116+
\"username\": \"$INFLUXDB_ADMIN_USER\",
117+
\"password\": \"$INFLUXDB_ADMIN_PASSWORD\",
118+
\"org\": \"$INFLUXDB_ORG\",
119+
\"bucket\": \"$INFLUXDB_BUCKET\",
120+
\"retentionPeriodSeconds\": $INFLUXDB_RETENTION_SECONDS
121+
}") || exit 1
122+
123+
ADMIN_TOKEN=$(json_field "$SETUP_RESPONSE" "['auth']['token']") || exit 1
124+
ORG_ID=$(json_field "$SETUP_RESPONSE" "['org']['id']") || exit 1
125+
BUCKET_ID=$(json_field "$SETUP_RESPONSE" "['bucket']['id']") || exit 1
126+
127+
# Persist admin token + IDs + password immediately so a subsequent re-run
128+
# can resume without repeating the one-shot setup call, and so the password
129+
# stored in influxdb_meta always matches what InfluxDB was initialised with.
130+
old_umask="$(umask)"
131+
umask 077
132+
tmp_state="$(mktemp "${INFLUXDB_SETUP_STATE_FILE}.XXXXXX")"
133+
printf 'ADMIN_TOKEN=%s\nORG_ID=%s\nBUCKET_ID=%s\nINFLUXDB_ADMIN_PASSWORD=%s\n' \
134+
"$ADMIN_TOKEN" "$ORG_ID" "$BUCKET_ID" "$INFLUXDB_ADMIN_PASSWORD" >"$tmp_state"
135+
chmod 600 "$tmp_state"
136+
mv "$tmp_state" "$INFLUXDB_SETUP_STATE_FILE"
137+
umask "$old_umask"
138+
fi
139+
140+
# Token creation is guarded so that on crash-resume (setup state exists but
141+
# meta file not yet written), we reuse already-created tokens rather than
142+
# creating orphaned duplicates in InfluxDB on each retry.
143+
WRITE_TOKEN="${WRITE_TOKEN:-}"
144+
READ_TOKEN="${READ_TOKEN:-}"
145+
146+
#
147+
# Create a write-only token for Telegraf (skipped if already persisted in state).
148+
#
149+
if [[ -z "$WRITE_TOKEN" ]]; then
150+
WRITE_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{
151+
\"orgID\": \"$ORG_ID\",
152+
\"description\": \"telegraf-write-token\",
153+
\"permissions\": [
154+
{\"action\": \"write\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}}
155+
]
156+
}" "$ADMIN_TOKEN") || exit 1
157+
WRITE_TOKEN=$(json_field "$WRITE_TOKEN_RESPONSE" "['token']") || exit 1
158+
printf 'WRITE_TOKEN=%s\n' "$WRITE_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE"
159+
fi
160+
161+
#
162+
# Create a read-only token for DCT Smart Proxy (skipped if already persisted in state).
163+
#
164+
if [[ -z "$READ_TOKEN" ]]; then
165+
READ_TOKEN_RESPONSE=$(influx_post "/api/v2/authorizations" "{
166+
\"orgID\": \"$ORG_ID\",
167+
\"description\": \"dct-read-token\",
168+
\"permissions\": [
169+
{\"action\": \"read\", \"resource\": {\"type\": \"buckets\", \"id\": \"$BUCKET_ID\", \"orgID\": \"$ORG_ID\"}}
170+
]
171+
}" "$ADMIN_TOKEN") || exit 1
172+
READ_TOKEN=$(json_field "$READ_TOKEN_RESPONSE" "['token']") || exit 1
173+
printf 'READ_TOKEN=%s\n' "$READ_TOKEN" >>"$INFLUXDB_SETUP_STATE_FILE"
174+
fi
175+
176+
#
# Write the [[outputs.influxdb_v2]] stanza to a dedicated telegraf output file
# and enable it via the INFLUXDB_ENABLED flag. The flag is read by
# delphix-telegraf-service to conditionally include this output.
#
# The stanza embeds the write token, so it is assembled in a temp file created
# under a restrictive umask and then moved into place atomically. Redirecting
# straight into the final path and chmod'ing afterwards would leave a window
# in which the token is world-readable.
#
old_umask="$(umask)"
umask 077
tmp_output="$(mktemp "${INFLUXDB_OUTPUT}.XXXXXX")"
cat >"$tmp_output" <<EOF
[[outputs.influxdb_v2]]
urls = ["http://127.0.0.1:8086"]
token = "$WRITE_TOKEN"
organization = "$INFLUXDB_ORG"
bucket = "$INFLUXDB_BUCKET"
namepass = ["cpu", "disk", "diskio", "mem", "net", "procstat", "processes", "swap", "system", "zfs"]
EOF
# Final mode stays 640, as before; it is applied before the file becomes visible.
chmod 640 "$tmp_output"
mv "$tmp_output" "$INFLUXDB_OUTPUT"
umask "$old_umask"
touch "$INFLUXDB_FLAG"
192+
193+
#
194+
# Persist org/bucket/admin credentials/tokens so DE APIs can expose them to DCT
195+
# and so the admin can access the InfluxDB UI. File is chmod 600 (root-only).
196+
#
197+
# Use a restrictive umask and a temp file to avoid a window where tokens are
198+
# readable by non-root users, then atomically move the file into place.
199+
old_umask="$(umask)"
200+
umask 077
201+
tmp_meta="$(mktemp "${INFLUXDB_META_FILE}.XXXXXX")"
202+
cat >"$tmp_meta" <<EOF
203+
INFLUXDB_ORG=$INFLUXDB_ORG
204+
INFLUXDB_BUCKET=$INFLUXDB_BUCKET
205+
INFLUXDB_ADMIN_USER=$INFLUXDB_ADMIN_USER
206+
INFLUXDB_ADMIN_PASSWORD=$INFLUXDB_ADMIN_PASSWORD
207+
INFLUXDB_WRITE_TOKEN=$WRITE_TOKEN
208+
INFLUXDB_READ_TOKEN=$READ_TOKEN
209+
EOF
210+
chmod 600 "$tmp_meta"
211+
mv "$tmp_meta" "$INFLUXDB_META_FILE"
212+
umask "$old_umask"
213+
214+
rm -f "$INFLUXDB_SETUP_STATE_FILE"
215+
log "InfluxDB initialized successfully."

influxdb/delphix-influxdb-service

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
#
# Copyright (c) 2026 by Delphix. All rights reserved.
#
# Service wrapper: launches InfluxDB 2.x, performs first-time initialization
# against it, then stays in the foreground for as long as influxd runs.
#

INFLUXDB_CONFIG=/etc/influxdb/influxdb.toml
INFLUXDB_INIT=/usr/bin/delphix-influxdb-init

# influxd has no --config-path flag, so the config file location is handed
# over through the INFLUXD_CONFIG_PATH environment variable. The daemon is
# started in the background so initialization can run alongside it.
INFLUXD_CONFIG_PATH="$INFLUXDB_CONFIG" /usr/bin/influxd &
influxd_pid=$!

# The init script does its own waiting for InfluxDB to become ready. If it
# fails, take influxd down and report failure.
"$INFLUXDB_INIT" || {
	echo "ERROR: delphix-influxdb-init failed, stopping influxd" >&2
	kill "$influxd_pid" 2>/dev/null
	exit 1
}

# Block on influxd; its exit status becomes the exit status of this script.
wait "$influxd_pid"

influxdb/delphix-influxdb.service

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[Unit]
2+
Description=Delphix InfluxDB Time Series Database
3+
Documentation=https://docs.influxdata.com/influxdb/v2/
4+
PartOf=delphix.target
5+
After=delphix-platform.service
6+
PartOf=delphix-platform.service
7+
8+
[Service]
9+
User=root
10+
ExecStart=/usr/bin/delphix-influxdb-service
11+
Restart=on-failure
12+
RestartForceExitStatus=SIGPIPE
13+
KillMode=control-group
14+
15+
[Install]
16+
WantedBy=delphix.target

influxdb/influxdb-init.conf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#
2+
# Copyright (c) 2026 by Delphix. All rights reserved.
3+
#
4+
# Configuration for delphix-influxdb-init.
5+
# Sourced by /usr/bin/delphix-influxdb-init at runtime.
6+
#
7+
8+
INFLUXDB_ORG="delphix"
9+
INFLUXDB_BUCKET="default"
10+
INFLUXDB_RETENTION_SECONDS=2592000 # 30 days (720h)
11+
INFLUXDB_WAIT_RETRIES=30
12+
INFLUXDB_WAIT_INTERVAL=2

influxdb/influxdb.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#
2+
# Copyright 2026 Delphix. All rights reserved.
3+
#
4+
# InfluxDB 2.x Configuration
5+
#
6+
7+
bolt-path = "/var/lib/influxdb/influxd.bolt"
8+
engine-path = "/var/lib/influxdb/engine"
9+
http-bind-address = "127.0.0.1:8086"
10+
log-level = "warn"

influxdb/perf_influxdb

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash
2+
#
3+
# Copyright (c) 2026 by Delphix. All rights reserved.
4+
#
5+
# Script that enables and disables InfluxDB metric output for Telegraf.
6+
#
7+
8+
INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED
9+
INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb
10+
11+
# Print a timestamped error message, prefixed with the script name, to stderr
# and terminate the script with exit status 1.
# Arguments: the message words (joined with spaces; backslash escapes are
#            interpreted because of 'echo -e').
function die() {
	# "$0" is quoted and guarded with '--' so a script path containing spaces
	# or starting with '-' cannot break basename.
	echo -e "$(date +%T:%N:%z): $(basename -- "$0"): $*" >&2
	exit 1
}
15+
16+
[[ $EUID -ne 0 ]] && die "must be run as root"
17+
18+
# Print an optional error message followed by the usage synopsis, then exit
# with status 2.
# Arguments: optional message words describing what was wrong.
# Everything goes to stderr: this function is only reached on invalid
# invocations, so nothing should land on stdout.
function usage() {
	local prog
	prog="$(basename -- "$0")"

	# Skip the message line when no message was supplied, instead of
	# printing a dangling "prog: ".
	if [[ $# -gt 0 ]]; then
		echo "$prog: $*" >&2
	fi
	echo "Usage: $prog [enable|disable]" >&2
	exit 2
}
23+
24+
# Turn on the InfluxDB output for Telegraf: requires the output stanza file to
# exist already, creates the enable flag, and restarts delphix-telegraf so the
# new configuration is picked up.
function enable_influxdb() {
	date
	if [[ ! -f "$INFLUXDB_OUTPUT" ]]; then
		die "$INFLUXDB_OUTPUT not found. Run delphix-influxdb-init first."
	fi
	echo "Enabling InfluxDB Metric Output"
	touch "$INFLUXDB_FLAG"
	systemctl restart delphix-telegraf
}
31+
32+
# Turn off the InfluxDB output for Telegraf: removes the enable flag and
# restarts delphix-telegraf so the output is dropped from its configuration.
function disable_influxdb() {
	date
	printf '%s\n' "Disabling InfluxDB Metric Output"
	rm -rf -- "$INFLUXDB_FLAG"
	systemctl restart delphix-telegraf
}
38+
39+
# Exactly one sub-command is accepted; anything else prints usage and exits.
(($# == 1)) || usage

# Dispatch on the requested sub-command.
case "$1" in
enable)
	enable_influxdb
	;;
disable)
	disable_influxdb
	;;
*)
	usage
	;;
esac

telegraf/delphix-telegraf-service

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@ BASE_CONFIG=/etc/telegraf/telegraf.base
33
DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose
44
DCT_INPUTS=/etc/telegraf/telegraf.inputs.dct
55
PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook
6+
INFLUXDB_OUTPUT=/etc/telegraf/telegraf.outputs.influxdb
67
PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED
8+
INFLUXDB_FLAG=/etc/telegraf/INFLUXDB_ENABLED
79
TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf
810

911

@@ -21,6 +23,10 @@ function playbook_is_enabled() {
2123
[[ -f $PLAYBOOK_FLAG ]]
2224
}
2325

26+
# Succeeds when the INFLUXDB_ENABLED flag exists as a regular file.
function influxdb_is_enabled() {
	test -f "$INFLUXDB_FLAG"
}
29+
2430
rm -f $TELEGRAF_CONFIG
2531

2632
if engine_is_object_based; then
@@ -43,4 +49,11 @@ else
4349
fi
4450
fi
4551

52+
# Restrict permissions so the InfluxDB write token is not world-readable.
# This is done BEFORE the token-bearing stanza is appended, so there is no
# window in which the token sits in a file with default permissions. The
# touch makes sure the file exists so that chmod has something to act on.
touch "$TELEGRAF_CONFIG"
chmod 640 "$TELEGRAF_CONFIG"

# Include the InfluxDB output only when the flag is set and the stanza file
# (written by delphix-influxdb-init) is present.
if influxdb_is_enabled && [[ -f "$INFLUXDB_OUTPUT" ]]; then
	cat "$INFLUXDB_OUTPUT" >>"$TELEGRAF_CONFIG"
fi
58+
4659
/usr/bin/telegraf -config $TELEGRAF_CONFIG

telegraf/telegraf.base

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,9 @@
4444
data_format = "json"
4545
namepass = ["agg_*"]
4646

47-
# Enable Live Monitoring, intended for internal Delphix use only:
48-
#[[outputs.influxdb]]
49-
# urls = ["http://dbsvr.company.com:8086"]
50-
# database = "live_metrics"
51-
# skip_database_creation = true
52-
# data_format = "influx"
47+
# InfluxDB output is managed via /etc/telegraf/telegraf.outputs.influxdb (written by
48+
# delphix-influxdb-init) and the /etc/telegraf/INFLUXDB_ENABLED flag.
49+
# Use 'perf_influxdb enable|disable' to toggle and restart Telegraf.
5350

5451
###############################################################################
5552
# INPUT PLUGINS #

0 commit comments

Comments
 (0)