Skip to content

Commit 07eec3f

Browse files
committed
DLPX-96312 Resolved Slack Review comments
1 parent d27a122 commit 07eec3f

2 files changed

Lines changed: 83 additions & 8 deletions

File tree

telegraf/connstat-stats.sh

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,68 @@
11
#!/bin/sh
2+
#
3+
# Collect per-connection TCP stats from connstat and aggregate by remote
4+
# endpoint (laddr:raddr:rport) to bound cardinality on engines with many
5+
# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or
6+
# Elastic Data (many connections per object storage endpoint IP).
7+
# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack.
8+
#
9+
# Service name lookup reads from /etc/services, matching LocalTCPStatsCollector
10+
# exactly. lport is checked before rport so that listening services (where the
11+
# engine is the server) are identified correctly. Falls back to "unknown".
12+
#
13+
# Output fields per aggregated endpoint:
14+
# laddr, raddr, rport, service
15+
# inbytes, outbytes, retranssegs, suna, unsent (summed across connections)
16+
# swnd, cwnd, rwnd, rtt (averaged across connections)
17+
# connections (count of aggregated conns)
18+
#
219
/usr/bin/connstat -PLe -i 10 -T u \
320
-o laddr,lport,raddr,rport,inbytes,outbytes,retranssegs,suna,unsent,swnd,cwnd,rwnd,rtt \
4-
| grep --line-buffered -v "^="
21+
| awk -F',' '
22+
BEGIN {
    # Build the svc[] lookup table (TCP port number -> service name) from
    # /etc/services, the same source LocalTCPStatsCollector is said to use.
    # Entries are expected in the form: "servicename port/tcp".
    services_db = "/etc/services"
    while ((getline entry < services_db) > 0) {
        # Drop leading whitespace, then skip blank lines and comment lines.
        sub(/^[[:space:]]+/, "", entry)
        if (entry ~ /^(#|$)/) continue

        # Column 1 is the service name, column 2 is "port/protocol".
        ncol = split(entry, col, /[[:space:]]+/)
        if (ncol < 2) continue
        if (col[2] !~ /\/tcp/) continue

        split(col[2], portproto, "/")
        portnum = portproto[1] + 0

        # The first name listed for a port is kept; later ones are ignored.
        if (portnum in svc) continue
        svc[portnum] = col[1]
    }
    close(services_db)

    # Delphix DSP (Session Protocol) port -- not present in /etc/services.
    # Matches the ServiceProtocol special-case in LocalTCPStatsCollector.
    # Assigned unconditionally, so it also overrides any loaded entry.
    svc[50001] = "dlpx-sp"
}
40+
/^=/ {
    # A line starting with "=" is treated as the boundary between sampling
    # intervals (presumably the timestamp line produced by the -T u option).
    # Emit one CSV row per endpoint key accumulated since the previous
    # boundary, then reset all accumulators.
    #
    # Row layout: laddr,raddr,rport,service, the five summed counters, the
    # four averaged values (integer-truncated), and the connection count.
    for (key in cnt) {
        n = cnt[key]
        split(key, k, SUBSEP)
        print k[1] "," k[2] "," k[3] "," k[4] "," \
            inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \
            int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n
    }

    # When this script runs under telegraf execd, stdout is a pipe and awk
    # typically block-buffers it. Without an explicit flush the rows stay in
    # the buffer until it fills, so metrics would arrive late and in bursts.
    fflush()

    delete cnt; delete inb; delete outb; delete ret
    delete sun; delete uns; delete sw; delete cw; delete rw; delete rt
    next
}
52+
NF == 13 {
    # One connstat record with the 13 columns requested via -o:
    # laddr,lport,raddr,rport followed by nine numeric counters.
    lport = $2 + 0
    rport = $4 + 0

    # The local port is looked up first, then the remote port; a connection
    # matching neither is labelled "unknown".
    service = (lport in svc) ? svc[lport] : ((rport in svc) ? svc[rport] : "unknown")

    # Aggregation key: local address, remote address, remote port, service.
    # NOTE(review): when lport is the matched service port, rport may be an
    # ephemeral client port, in which case connections from one client would
    # not collapse into a single key -- confirm this is the intended grouping.
    key = $1 SUBSEP $3 SUBSEP rport SUBSEP service

    # Number of connections folded into this key during the interval.
    cnt[key]++

    # Counters reported as sums across the aggregated connections.
    inb[key]  += $5
    outb[key] += $6
    ret[key]  += $7
    sun[key]  += $8
    uns[key]  += $9

    # Values reported as averages; accumulated here, divided at flush time.
    sw[key] += $10
    cw[key] += $11
    rw[key] += $12
    rt[key] += $13
}
68+
'

telegraf/telegraf.base

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@
2020
# INPUT PLUGINS #
2121
###############################################################################
2222

23-
# Get CPU usage
23+
# Get CPU usage — only cpu-total, not per-core (reduces data volume on
24+
# many-CPU engines; agg_cpu automatically inherits this restriction).
25+
# percpu defaults to true so must be explicitly set to false.
2426
[[inputs.cpu]]
25-
percpu = true
27+
percpu = false
2628
totalcpu = true
2729
collect_cpu_time = false
2830
report_active = false
@@ -34,8 +36,10 @@
3436

3537
# Get disk I/O stats, excluding ZFS zvol devices (zd*) which are internal
3638
# ZFS block devices not useful for performance diagnostics.
39+
# wwid is a redundant 100+ char tag; the short-form name tag is sufficient.
3740
[[inputs.diskio]]
3841
tagdrop = {name = ["zd*"]}
42+
tagexclude = ["wwid"]
3943

4044
# Get Memory stats
4145
[[inputs.mem]]
@@ -44,8 +48,13 @@
4448
[[inputs.net]]
4549
fieldpass = ["tcp*","bytes*","packets*","err*","drop*"]
4650

47-
# Per-connection TCP stats (bytes, RTT, window sizes) via connstat.
48-
# Mirrors the TCP_STATS collected by the mgmt stack into analytics_datapoint.
51+
# Per-endpoint TCP stats (bytes, RTT, window sizes) via connstat.
52+
# Aggregated by remote endpoint (laddr:raddr:rport) to mirror the aggregation
53+
# in LocalTCPStatsCollector — avoids cardinality explosion on Oracle dNFS
54+
# engines (hundreds of connections per VDB host) and Elastic Data engines
55+
# (many connections per object storage endpoint IP).
56+
# Cumulative fields (inbytes, outbytes, etc.) are summed; window/RTT fields
57+
# are averaged; connections = number of TCP connections aggregated.
4958
[[inputs.execd]]
5059
command = ["/etc/telegraf/connstat-stats.sh"]
5160
name_override = "tcp_stats"
@@ -54,9 +63,9 @@
5463
data_format = "csv"
5564
csv_delimiter = ","
5665
csv_trim_space = true
57-
csv_column_names = ["laddr", "lport", "raddr", "rport", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt"]
58-
csv_column_types = ["string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"]
59-
csv_tag_columns = ["laddr", "lport", "raddr", "rport"]
66+
csv_column_names = ["laddr", "raddr", "rport", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"]
67+
csv_column_types = ["string", "string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"]
68+
csv_tag_columns = ["laddr", "raddr", "rport", "service"]
6069

6170
# Track CPU and Memory for the "delphix-mgmt" service (and children).
6271
[[inputs.procstat]]
@@ -79,8 +88,10 @@
7988
[[inputs.system]]
8089

8190
# ZFS kstats (arcstat, abdstat, zfetch, etc)
91+
# arcstats_l2_* fields are L2ARC stats — unused on all appliances (no L2ARC).
8292
[[inputs.zfs]]
8393
interval = "1m"
94+
fielddrop = ["arcstats_l2_*"]
8495

8596
# Detailed ZFS pool metrics from "zpool_influxdb" (noisy)
8697
#[[inputs.exec]]

0 commit comments

Comments
 (0)