Skip to content

Commit 0fb7aea

Browse files
dbshah12 and claude
committed
DLPX-96312 Filter garbage stat names from estat metaslab-alloc output
Wraps estat metaslab-alloc in a shell script that drops JSON lines whose "name" tag contains non-standard characters (backslashes, hashes, etc.). Addresses DLPX-88427 where a kernel bug causes random memory bytes or C macro strings to appear as stat names, producing unreadable metrics. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent b30f511 commit 0fb7aea

5 files changed

Lines changed: 57 additions & 18 deletions

File tree

telegraf/connstat-stats.sh

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/sh
22
#
33
# Collect per-connection TCP stats from connstat and aggregate by remote
4-
# endpoint (laddr:raddr:rport) to bound cardinality on engines with many
4+
# endpoint (laddr:raddr:service) to bound cardinality on engines with many
55
# connections — e.g. Oracle dNFS (hundreds of connections per VDB host) or
66
# Elastic Data (many connections per object storage endpoint IP).
77
# Mirrors the aggregation done by LocalTCPStatsCollector in the mgmt stack.
@@ -11,7 +11,7 @@
1111
# engine is the server) are identified correctly. Falls back to "unknown".
1212
#
1313
# Output fields per aggregated endpoint:
14-
# laddr, raddr, rport, service
14+
# laddr, raddr, service
1515
# inbytes, outbytes, retranssegs, suna, unsent (summed across connections)
1616
# swnd, cwnd, rwnd, rtt (averaged across connections)
1717
# connections (count of aggregated conns)
@@ -33,33 +33,35 @@ BEGIN {
3333
}
3434
}
3535
close("/etc/services")
36-
# Delphix DSP (Session Protocol) port — not present in /etc/services.
37-
# Matches the ServiceProtocol special-case in LocalTCPStatsCollector.
38-
svc[50001] = "dlpx-sp"
36+
# Delphix-specific ports not present in /etc/services.
37+
# Matches LocalTCPStatsCollector.getService() special-cases exactly.
38+
svc[8415] = "dlpx-sp" # DSP (ServiceProtocol.PORT)
39+
svc[50001] = "network-throughput-test" # TtcpPerfSession.DEFAULT_PORT
40+
svc[8341] = "oracle-logsync" # HTTP server (TunableRegistry.HTTP_SERVER_PORT default)
41+
svc[9100] = "dlpx-connector" # Host Connector (Connector.DEFAULT_PORT)
3942
}
4043
/^=/ {
4144
for (key in cnt) {
4245
n = cnt[key]
4346
split(key, k, SUBSEP)
44-
print k[1] "," k[2] "," k[3] "," k[4] "," \
47+
print k[1] "," k[2] "," k[3] "," \
4548
inb[key] "," outb[key] "," ret[key] "," sun[key] "," uns[key] "," \
4649
int(sw[key]/n) "," int(cw[key]/n) "," int(rw[key]/n) "," int(rt[key]/n) "," n
4750
}
51+
fflush()
4852
delete cnt; delete inb; delete outb; delete ret
4953
delete sun; delete uns; delete sw; delete cw; delete rw; delete rt
5054
next
5155
}
5256
NF == 13 {
53-
lport = $2 + 0
54-
rport = $4 + 0
55-
if (lport in svc) {
56-
service = svc[lport]
57-
} else if (rport in svc) {
58-
service = svc[rport]
57+
if (($2 + 0) in svc) {
58+
service = svc[$2 + 0]
59+
} else if (($4 + 0) in svc) {
60+
service = svc[$4 + 0]
5961
} else {
6062
service = "unknown"
6163
}
62-
key = $1 SUBSEP $3 SUBSEP rport SUBSEP service
64+
key = $1 SUBSEP $3 SUBSEP service
6365
inb[key] += $5; outb[key] += $6; ret[key] += $7
6466
sun[key] += $8; uns[key] += $9
6567
sw[key] += $10; cw[key] += $11; rw[key] += $12; rt[key] += $13

telegraf/metaslab-alloc-stats.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/sh
#
# Wrapper around "estat metaslab-alloc -jm 10" that filters out metrics whose
# "name" tag contains garbage characters (DLPX-88427). A kernel bug causes
# estat to occasionally emit stat names containing raw memory bytes or C macro
# strings. Only lines whose "name" value consists solely of ASCII letters,
# digits, spaces, and the punctuation , _ ( ) / . - are passed through; lines
# with any other character in the name (or with no "name" tag at all) are
# dropped.
#
# grep block-buffers its output when stdout is a pipe, so --line-buffered is
# required here: without it, matching lines are held back until grep's output
# buffer fills, delaying every sample handed to whatever reads this script's
# stdout.
#
estat metaslab-alloc -jm 10 | grep --line-buffered -E '"name":"[A-Za-z0-9 ,_()/.-]+"'

telegraf/telegraf.base

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@
6868
data_format = "csv"
6969
csv_delimiter = ","
7070
csv_trim_space = true
71-
csv_column_names = ["laddr", "raddr", "rport", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"]
72-
csv_column_types = ["string", "string", "int", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"]
73-
csv_tag_columns = ["laddr", "raddr", "rport", "service"]
71+
csv_column_names = ["laddr", "raddr", "service", "inbytes", "outbytes", "retranssegs", "suna", "unsent", "swnd", "cwnd", "rwnd", "rtt", "connections"]
72+
csv_column_types = ["string", "string", "string", "int", "int", "int", "int", "int", "int", "int", "int", "int", "int"]
73+
csv_tag_columns = ["laddr", "raddr", "service"]
7474

7575
# Track CPU and Memory for the "delphix-mgmt" service (and children).
7676
[[inputs.procstat]]

telegraf/telegraf.inputs.playbook

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,10 @@
5555
]
5656
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"]
5757

58-
# Collect output from "estat metaslab-alloc -jm 10"
58+
# Collect output from "estat metaslab-alloc -jm 10" via wrapper script.
59+
# The wrapper filters out metrics with garbage "name" tags (DLPX-88427).
5960
[[inputs.execd]]
60-
command = ["estat", "metaslab-alloc", "-jm", "10"]
61+
command = ["/etc/telegraf/metaslab-alloc-stats.sh"]
6162
name_override = "estat_metaslab-alloc"
6263
signal = "none"
6364
restart_delay = "30s"

telegraf/telegraf.inputs.storage_io

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,5 +67,32 @@
6767
namepass = ["estat_*"]
6868
fieldexclude = ["microseconds"]
6969

70+
# Expand hist_estat_* microseconds histogram strings into per-bucket rows for
71+
# Grafana heatmap support. Each "{upper_bound_us,count}" pair becomes a
72+
# separate metric with le=<upper_bound_us> tag and count field, replacing the
73+
# opaque string. Runs after clone (order=1) and strings (order=2) so the
74+
# microseconds field is still present in hist_estat_* at this point.
75+
[[processors.starlark]]
76+
order = 3
77+
namepass = ["hist_estat_*"]
78+
source = '''
79+
def apply(metric):
    # Expand the "microseconds" histogram string of one metric into one
    # metric per bucket. The string is treated as a list of
    # "{upper_bound_us,count}" pairs: the outermost brace on each end is
    # sliced off and the remainder is split on "},{".
    #
    # Returns a list of metrics. A metric without a "microseconds" field, or
    # one whose string yields no well-formed pair, is returned unchanged.
    raw = metric.fields.get("microseconds")
    if raw == None:
        return [metric]

    expanded = []
    for bucket in raw[1:-1].split("},{"):
        pieces = bucket.split(",")
        if len(pieces) != 2:
            # Not exactly "<upper_bound>,<count>": skip this bucket.
            continue
        row = deepcopy(metric)
        # Bucket upper bound becomes the "le" tag; the count becomes a field.
        row.tags["le"] = pieces[0]
        row.fields["count"] = int(pieces[1])
        # Drop the original histogram string from the per-bucket copy.
        row.fields.pop("microseconds")
        expanded.append(row)

    if expanded:
        return expanded
    return [metric]
95+
'''
96+
7097
# End of Storage I/O section
7198
##############################################################################

0 commit comments

Comments
 (0)