diff --git a/qa/1559 b/qa/1559 index e5cf4329861..8333eed9a3a 100755 --- a/qa/1559 +++ b/qa/1559 @@ -56,7 +56,7 @@ _filter() # real QA test starts here -for arch in `echo archives/amdgpu-*.index | sed -e 's/\.index$//'` +for arch in `ls archives/amdgpu-*.index 2>/dev/null | sed -e 's/\.index$//'` do echo echo "=== $arch ===" diff --git a/qa/1559.out b/qa/1559.out index af7fab10f0a..e2e788c2b57 100644 --- a/qa/1559.out +++ b/qa/1559.out @@ -77,7 +77,7 @@ metadata diffs Help: The maximum GDDRx memory clock speed in MHz -=== archives/amdgpu-1.index === +=== archives/amdgpu-1 === metadata diffs --- TMP.old ... +++ TMP.new ... diff --git a/qa/1633 b/qa/1633 index a48771a16ee..39d88b44f53 100755 --- a/qa/1633 +++ b/qa/1633 @@ -23,7 +23,7 @@ _filter() $PCP_AWK_PROG ' BEGIN { out = "head" } NR == 3 { out = "body" } - { print >"'$tmp'." out }' + { print >("'$tmp'." out) }' cat $tmp.head LC_COLLATE=POSIX sort $tmp.body } diff --git a/qa/1670.out b/qa/1670.out index 8494b13598c..758fb903188 100644 --- a/qa/1670.out +++ b/qa/1670.out @@ -87,12 +87,12 @@ metadata diffs Help: The maximum GDDRx memory clock speed in MHz -=== archives/amdgpu-1.index === +=== archives/amdgpu-1 === === std out === === std err === === filtered valgrind report === Memcheck, a memory error detector -Command: pmlogrewrite -c /etc/pcp/pmlogrewrite/amdgpu.conf archives/amdgpu-1.index TMP +Command: pmlogrewrite -c /etc/pcp/pmlogrewrite/amdgpu.conf archives/amdgpu-1 TMP LEAK SUMMARY: definitely lost: 0 bytes in 0 blocks indirectly lost: 0 bytes in 0 blocks diff --git a/qa/1674 b/qa/1674 index af19867089d..c08ae1770b9 100755 --- a/qa/1674 +++ b/qa/1674 @@ -85,26 +85,26 @@ in_mem_use == 1 && $1 == "TOTAL_VRAM:" { if ($3 == "MB") mem_total[inst] = $2 * 1024 * 1024 else - print >"'"$tmp.err"'" NR ":" $0 ": unit not MB" + print NR ":" $0 ": unit not MB" >"'"$tmp.err"'" } in_mem_use == 1 && $1 == "USED_VRAM:" { if ($3 == "MB") mem_used[inst] = $2 * 1024 * 1024 else - print >"'"$tmp.err"'" NR ":" $0 ": unit not MB" + print NR ":" $0 ": unit not MB" >"'"$tmp.err"'" } in_mem_use == 1 && $1 == "FREE_VRAM:" { if ($3 == "MB") mem_free[inst] = $2 * 1024 * 1024 else - print >"'"$tmp.err"'" NR ":" $0 ": unit not MB" + print NR ":" $0 ": unit not MB" >"'"$tmp.err"'" } in_temp == 1 && $1 == "EDGE:" { temp[inst] = $2 } in_clock == 1 && $1 == "MAX_CLK:" { if ($3 == "MHz") clock_max[inst] = $2 else - print >"'"$tmp.err"'" NR ":" $0 ": unit not MHz" + print NR ":" $0 ": unit not MHz" >"'"$tmp.err"'" } NF == 1 { in_mem_use = in_temp = in_clock = 0 } END { printf "amdgpu.memory.total 2 %d %d\n",mem_total[1],mem_total[0] @@ -113,7 +113,7 @@ END { printf "amdgpu.memory.total 2 %d %d\n",mem_total[1],mem_total[0] printf "amdgpu.gpu.temperature 2 %d %d\n",temp[1],temp[0] #TODO# printf "amdgpu.gpu.clock_max 2 %d %d\n",clock_max[1],clock_max[0] }' \ -| _fixval | sort >$tmp.smi +| _fixval | sort >$tmp.smi echo "=== pcp metrics ===" >>$seq_full cat $tmp.pcp >>$seq_full diff --git a/qa/1675 b/qa/1675 new file mode 100755 index 00000000000..44942b352cd --- /dev/null +++ b/qa/1675 @@ -0,0 +1,86 @@ +#!/bin/sh +# PCP QA Test No. 1675 +# pmlogrewrite extensions to accommodate nvidia PMDA evolution +# (extra units) +# +# Copyright (c) 2026 Ken McDonell. All Rights Reserved. +# + +if [ $# -eq 0 ] +then + seq=`basename $0` + echo "QA output created by $seq" +else + # use $seq from caller, unless not set + [ -n "$seq" ] || seq=`basename $0` + echo "QA output created by `basename $0` $*" +fi + +# get standard environment, filters and checks +. ./common.product +. ./common.filter +. ./common.check + +[ -f $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf ] || _notrun "nvidia logrewrite config not installed" + +do_valgrind=false +if [ "$1" = "--valgrind" ] +then + _check_valgrind + do_valgrind=true +elif which valgrind >/dev/null 2>&1 +then + [ "$PCPQA_VALGRIND" = both ] || \ + _notrun "valgrind variant qa/1685 will be run" +fi + +_cleanup() +{ + cd $here + $sudo rm -rf $tmp $tmp.* +} + +status=0 # success is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +_filter() +{ + sed \ + -e "s@$tmp@TMP@g" \ + -e '/^---/s/old.*/old .../' \ + -e '/^+++/s/new.*/new .../' \ + # end +} + +# real QA test starts here + +for arch in `ls archives/nvidiagpu-*.index 2>/dev/null | sed -e 's/\.index$//'` +do + echo + echo "=== $arch ===" + rm -f $tmp.* + if $do_valgrind + then + _run_valgrind pmlogrewrite -c $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf $arch $tmp + else + if ! pmlogrewrite -c $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf $arch $tmp + then + echo "Arrgh: pmlogrewrite failed" + _exit 1 + fi + fi \ + | _filter + + export PCP_DERIVED_CONFIG= + metrics_old=`pminfo -a $arch nvidia | sort` + metrics_new=`pminfo -a $tmp nvidia | sort` + + echo metadata diffs + pminfo -dmTtl -a $arch $metrics_old >$tmp.old + pminfo -dmTtl -a $tmp $metrics_new >$tmp.new + + diff -u $tmp.old $tmp.new | _filter +done + +# success, all done +exit diff --git a/qa/1675.out b/qa/1675.out new file mode 100644 index 00000000000..a88c8695633 --- /dev/null +++ b/qa/1675.out @@ -0,0 +1,24 @@ +QA output created by 1675 + +=== archives/nvidiagpu-0 === +metadata diffs +--- TMP.old ... ++++ TMP.new ... +@@ -143,7 +143,7 @@ + + nvidia.power PMID: 120.0.21 [Total GPU power consumption ] + Data Type: 32-bit unsigned int InDom: 120.0 0x1e000000 +- Semantics: instant Units: none ++ Semantics: instant Units: power (W) + labels {"agent":"nvidia","device_type":"gpu","domainname":"localdomain","groupid":1000,"hostname":"haro","indom_name":"per gpu","machineid":"9ccf3a72f40b4727b71179ddd6a6d765","units":"milliwatts","userid":1000} + Help: + Current power usage for this GPU and its associated circuitry, in milliwatts. +@@ -594,7 +594,7 @@ + + nvidia.temperature PMID: 120.0.4 [The temperature of the card] + Data Type: 32-bit unsigned int InDom: 120.0 0x1e000000 +- Semantics: instant Units: none ++ Semantics: instant Units: temperature (C) + labels {"agent":"nvidia","device_type":"gpu","domainname":"localdomain","groupid":1000,"hostname":"haro","indom_name":"per gpu","machineid":"9ccf3a72f40b4727b71179ddd6a6d765","units":"degrees celsius","userid":1000} + Help: + The temperature of the GPU on the NVIDIA card in degrees celsius. diff --git a/qa/671 b/qa/671 index fc6289989f1..457b0cef5fe 100755 --- a/qa/671 +++ b/qa/671 @@ -71,7 +71,7 @@ s/ at .*/ at DATE/ | $PCP_AWK_PROG ' BEGIN { part = 1 } part == 2 && NF == 0 { part = 3 } - { print >"'$tmp.out.'" part } + { print >("'$tmp.out.'" part) } part == 1 && $1 == "Ordinal" { part = 2 }' if [ -f $tmp.out.1 ] diff --git a/qa/681 b/qa/681 index f1d36c179ad..bfa45f84d15 100755 --- a/qa/681 +++ b/qa/681 @@ -39,7 +39,7 @@ _filter() | $PCP_AWK_PROG ' BEGIN { state = 1 } /^=== filtered/ { state = 3 } - { print >"'$tmp'." state } + { print >("'$tmp'." state) } /^=== std err/ { state = 2 }' cat $tmp.1 [ -s $tmp.2 ] && LC_COLLATE=POSIX sort $tmp.2 diff --git a/qa/archives/mk.nvidiagpu b/qa/archives/mk.nvidiagpu index d31e4f76377..4004ebd4cb8 100755 --- a/qa/archives/mk.nvidiagpu +++ b/qa/archives/mk.nvidiagpu @@ -32,4 +32,25 @@ End-of-File . ./mk.common +rm -f "$archive.smi.txt" +if which nvidia-smi >/dev/null 2>&1 +then + nvidia-smi >"$archive.smi.txt" 2>"$tmp.err" + if [ $? != 0 -o -s "$tmp.err" ] + then + echo >&2 "mk.nvidiagpu: Warning: nvidia-smi failed" + cat >&2 "$tmp.err" + rm -f "$tmp.err" + fi + nvidia-smi -q >>"$archive.smi.txt" 2>>"$tmp.err" + if [ $? != 0 -o -s "$tmp.err" ] + then + echo >&2 "mk.nvidiagpu: Warning: nvidia-smi metric failed" + cat >&2 "$tmp.err" + rm -f "$tmp.err" + fi +else + echo >&2 "mk.nvidiagpu: Warning: nvidia-smi not installed, cannot recreate $archive.smi.txt" +fi + exit diff --git a/qa/group b/qa/group index 44cfcb30bfc..78e724398ac 100644 --- a/qa/group +++ b/qa/group @@ -2222,7 +2222,7 @@ suse 1672 secure pmcd local 1673 pmda.bpf local 1674 pmda.amdgpu pmprobe local -1675:reserved pmlogrewrite pmda.amdgpu +1675 pmlogrewrite pmda.nvidia local 1676:reserved pmlogrewrite pmda.amdgpu 1677:reserved pmlogrewrite pmda.amdgpu 1678 pmda.opentelemetry local python diff --git a/qa/new b/qa/new index 668187972ed..1f1dcd8e17e 100755 --- a/qa/new +++ b/qa/new @@ -444,7 +444,7 @@ echo "$id$tag $ans" >>group $PCP_AWK_PROG "'$tmp'." state }' + { print >("'$tmp'." state) }' sort -n $tmp.list >>$tmp.head cp $tmp.head group diff --git a/qa/new-dup b/qa/new-dup index d1ed6b3a736..8c03bc77ffc 100755 --- a/qa/new-dup +++ b/qa/new-dup @@ -107,7 +107,7 @@ p $PCP_AWK_PROG <$tmp.tmp ' BEGIN { state = "group" } state == "group" && /^[0-9]/ { state = "list" } - { print >"'$tmp'." state }' + { print >("'$tmp'." state) }' sort -n $tmp.list >>$tmp.group for o in $1*out* diff --git a/qa/new-grind b/qa/new-grind index ad567439af6..4a895d881d8 100755 --- a/qa/new-grind +++ b/qa/new-grind @@ -240,7 +240,7 @@ sed "'$tmp'." state }' + { print >("'$tmp'." state) }' echo "$newgroup" >>$tmp.list sort -n $tmp.list >>$tmp.head cp $tmp.head group diff --git a/qa/src/mv-me b/qa/src/mv-me index 9631f63d333..517e38b4922 100755 --- a/qa/src/mv-me +++ b/qa/src/mv-me @@ -67,7 +67,7 @@ sed "'$tmp'.make." part }' + { print >("'$tmp'.make." part) }' for arch do diff --git a/src/ctl-tools/ctl-tools.sh b/src/ctl-tools/ctl-tools.sh index 742b5ef786a..b14fd74ff96 100755 --- a/src/ctl-tools/ctl-tools.sh +++ b/src/ctl-tools/ctl-tools.sh @@ -1195,7 +1195,7 @@ state == 3 && $1 !~ /^#/ { state = 4; part = 2 } /^\/\/ --- START GENERATED SECTION (do not change this section) ---/ \ { part = 1 } /^\[access]/ { part = 3 } - { print >"'$tmp/'" part } + { print >("'$tmp/'" part) } /^# DO NOT UPDATE THE INITIAL SECTION OF THIS FILE/ \ { state = 1 } /^# DO NOT UPDATE THE FILE ABOVE THIS LINE/ \ diff --git a/src/pmdas/nvidia/GNUmakefile b/src/pmdas/nvidia/GNUmakefile index aed336d982f..477b84b3e74 100644 --- a/src/pmdas/nvidia/GNUmakefile +++ b/src/pmdas/nvidia/GNUmakefile @@ -31,6 +31,8 @@ PMDAADMDIR = $(PCP_PMDASADM_DIR)/$(IAM) PMDATMPDIR = $(PCP_PMDAS_DIR)/$(IAM) LOGCONFDIR = $(PCP_SYSCONF_DIR)/pmlogconf/$(IAM) LOGCONFVARDIR = $(PCP_VAR_DIR)/config/pmlogconf/$(IAM) +REWRITEDIR = $(PCP_SYSCONF_DIR)/pmlogrewrite +REWRITEVARDIR = $(PCP_VAR_DIR)/config/pmlogrewrite default: $(LIBTARGET) $(CMDTARGET) @@ -46,6 +48,7 @@ install: default $(INSTALL) -m 755 -d $(LOGCONFVARDIR) $(INSTALL) -m 644 -t $(LOGCONFVARDIR)/config pmlogconf.config $(LOGCONFDIR)/config $(INSTALL) -m 644 -t $(LOGCONFVARDIR)/summary pmlogconf.summary $(LOGCONFDIR)/summary + $(INSTALL) -m 644 -t $(REWRITEVARDIR)/$(IAM).conf rewrite.conf $(REWRITEDIR)/$(IAM).conf $(OBJECTS): domain.h diff --git a/src/pmdas/nvidia/nvidia.c b/src/pmdas/nvidia/nvidia.c index 525c4f55cd9..4644cdd6ece 100644 --- a/src/pmdas/nvidia/nvidia.c +++ b/src/pmdas/nvidia/nvidia.c @@ -94,7 +94,7 @@ static pmdaMetric metrictab[] = { { NULL, { PMDA_PMID(0, NVIDIA_BUSID), PM_TYPE_STRING, GCARD_INDOM, PM_SEM_DISCRETE, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, { NULL, { PMDA_PMID(0, NVIDIA_TEMPERATURE), PM_TYPE_U32, GCARD_INDOM, - PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, + PM_SEM_INSTANT, PMDA_EXTRAUNITS(0, 0, 0, 0, 0, 0, PM_UNIT_TEMPERATURE, PM_TEMPERATURE_C) } }, { NULL, { PMDA_PMID(0, NVIDIA_FANSPEED), PM_TYPE_U32, GCARD_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, { NULL, { PMDA_PMID(0, NVIDIA_PERFSTATE), PM_TYPE_U32, GCARD_INDOM, @@ -126,7 +126,7 @@ static pmdaMetric metrictab[] = { { NULL, { PMDA_PMID(0, NVIDIA_ENERGY), PM_TYPE_U64, GCARD_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, { NULL, { PMDA_PMID(0, NVIDIA_POWER), PM_TYPE_U32, GCARD_INDOM, - PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, + PM_SEM_INSTANT, PMDA_EXTRAUNITS(0, 0, 0, 0, 0, 0, PM_UNIT_POWER, PM_POWER_mW) } }, { NULL, { PMDA_PMID(0, NVIDIA_NPROCS), PM_TYPE_U32, GCARD_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, { NULL, { PMDA_PMID(0, NVIDIA_SAMPLES), PM_TYPE_U64, GCARD_INDOM, diff --git a/src/pmdas/nvidia/rewrite.conf b/src/pmdas/nvidia/rewrite.conf new file mode 100755 index 00000000000..b3e546dccc4 --- /dev/null +++ b/src/pmdas/nvidia/rewrite.conf @@ -0,0 +1,11 @@ +# nvidia PMDA rewriting rules +# + +# changes for extra units +# +metric nvidia.temperature { + extraunits -> 0,0,0,0,0,0,TEMPERATURE,C +} +metric nvidia.power { + extraunits -> 0,0,0,0,0,0,POWER,mW +}