Skip to content

Commit 1b0d9e1

Browse files
committed
Back merged
2 parents bfbb270 + 4d7aae4 commit 1b0d9e1

6 files changed

Lines changed: 30 additions & 23 deletions

File tree

debian/control

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ Standards-Version: 4.1.2
1313

1414
Package: performance-diagnostics
1515
Architecture: any
16-
Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker.io, influxdb2, curl
16+
Depends: python3-bpfcc, python3-minimal, python3-psutil, telegraf, docker-ce, influxdb2, curl
1717
Description: eBPF-based Performance Diagnostic Tools
1818
A collection of eBPF-based tools for diagnosing performance issues.

influxdb/delphix-influxdb-init

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,3 +212,7 @@ umask "$old_umask"
212212

213213
rm -f "$INFLUXDB_SETUP_STATE_FILE"
214214
log "InfluxDB initialized successfully."
215+
216+
# Restart Telegraf so it picks up the InfluxDB output stanza written above.
217+
# This is only reached on first boot (subsequent boots exit early at the top).
218+
systemctl restart delphix-telegraf 2>/dev/null || true

influxdb/perf_influxdb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ function enable_influxdb() {
3232
function disable_influxdb() {
3333
date
3434
echo "Disabling InfluxDB Metric Output"
35-
rm -rf $INFLUXDB_FLAG
35+
rm -f $INFLUXDB_FLAG
3636
systemctl restart delphix-telegraf
3737
}
3838

telegraf/delphix-telegraf-service

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,14 @@ else
5151
fi
5252

5353
if influxdb_is_enabled && [[ -f $INFLUXDB_OUTPUT ]]; then
54-
cat $STORAGE_IO_INPUTS $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG
54+
if [[ -f $STORAGE_IO_INPUTS ]]; then
55+
cat $STORAGE_IO_INPUTS >> $TELEGRAF_CONFIG
56+
fi
57+
cat $INFLUXDB_OUTPUT >> $TELEGRAF_CONFIG
58+
else
59+
# No InfluxDB output configured. Add discard so Telegraf can start —
60+
# it requires at least one output plugin.
61+
echo "[[outputs.discard]]" >> $TELEGRAF_CONFIG
5562
fi
5663

5764
# Restrict permissions so the InfluxDB write token is not world-readable.

telegraf/telegraf.inputs.playbook

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
##############################################################################
22
# Performance Playbook (estat, nfs_threads) collection
33
# Note: estat_nfs, estat_iscsi, and estat_backend-io live in
4-
# telegraf.inputs.nfs_iscsi and are always collected when InfluxDB is
4+
# telegraf.inputs.storage_io and are always collected when InfluxDB is
55
# enabled, independent of playbook state.
66

77
# Collect output from "estat zpl -jm 10"
@@ -88,31 +88,25 @@
8888
# PROCESSOR PLUGINS #
8989
###############################################################################
9090
# Convert strings from estat into integer values so they don't get dropped.
91-
# Scoped to playbook-only metrics; estat_nfs/iscsi have their own converter
92-
# in telegraf.inputs.nfs_iscsi.
91+
# Scoped to playbook-only metrics; estat_nfs/iscsi/backend-io have their own
92+
# converter in telegraf.inputs.storage_io.
9393
[[processors.converter]]
9494
namepass = ["estat_zpl", "estat_zio", "estat_zvol", "estat_zio-queue", "estat_metaslab-alloc"]
9595
[processors.converter.fields]
9696
integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"]
9797

98-
# The estat output contains a nested latency histogram, so we need to
99-
# parse that out as a new array metric rather than a non-JSON string.
100-
#
101-
# From this:
102-
# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}"
103-
# to this:
104-
# "microseconds":"{20000:5,30000:15,40000:3,50000:24}"
105-
#
106-
# Clone the original so we have a "new" metric with a "hist_" name prefix
98+
# Parse microseconds latency histograms for playbook estat measurements.
99+
# Transforms "{20000,5},{30000,15}" → {"20000":5,"30000":15} as a JSON object.
100+
# Scoped to playbook-only measurements; storage_io (estat_nfs/iscsi/backend-io)
101+
# does not collect microseconds to keep always-on data volume low.
107102
[[processors.clone]]
108103
order = 1
109104
name_prefix = "hist_"
110-
namepass = ["estat_*"]
105+
namepass = ["estat_zpl", "estat_zvol", "estat_zio", "estat_zio-queue", "estat_metaslab-alloc"]
111106

112-
# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects
113107
[[processors.regex]]
114108
order = 2
115-
namepass = ["hist_estat_*"]
109+
namepass = ["hist_estat_zpl", "hist_estat_zvol", "hist_estat_zio", "hist_estat_zio-queue", "hist_estat_metaslab-alloc"]
116110
[[processors.regex.fields]]
117111
key = "microseconds"
118112
pattern = "{(\\d+),(\\d+)}"
@@ -122,14 +116,13 @@
122116
pattern = ".*"
123117
replacement = "{$0}"
124118

125-
# Now parse out the arrays for "hist_estat_*" metrics
126119
[[processors.parser]]
127120
order = 3
128121
merge = "override"
129122
parse_fields = ["microseconds"]
130123
drop_original = false
131124
data_format = "json"
132-
namepass = ["hist_estat_*"]
125+
namepass = ["hist_estat_zpl", "hist_estat_zvol", "hist_estat_zio", "hist_estat_zio-queue", "hist_estat_metaslab-alloc"]
133126
fieldpass = ["microseconds"]
134127

135128
# End of Processor section

telegraf/telegraf.inputs.storage_io

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
##############################################################################
22
# Storage I/O collection: NFS server, iSCSI target, and backend disk I/O.
33
# Always included when InfluxDB is enabled, independent of playbook state.
4+
# Note: microseconds histogram is NOT collected here to keep always-on data
5+
# volume low. Full histogram expansion lives in telegraf.inputs.playbook and
6+
# is only active when the playbook is enabled.
47

58
# Collect output from "estat nfs -jm 10"
69
[[inputs.execd]]
@@ -13,7 +16,7 @@
1316
"name",
1417
"axis"
1518
]
16-
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"]
19+
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"]
1720

1821
# Collect output from "estat iscsi -jm 10"
1922
[[inputs.execd]]
@@ -26,7 +29,7 @@
2629
"name",
2730
"axis"
2831
]
29-
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"]
32+
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"]
3033

3134
# Collect output from "estat backend-io -jm 10" (stbtrace io equivalent)
3235
[[inputs.execd]]
@@ -39,7 +42,7 @@
3942
"name",
4043
"axis"
4144
]
42-
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"]
45+
json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"]
4346

4447
# Convert estat string fields to integers so they are not dropped by Telegraf.
4548
[[processors.converter]]

0 commit comments

Comments
 (0)