Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion qa/1559
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ _filter()

# real QA test starts here

for arch in `echo archives/amdgpu-*.index | sed -e 's/\.index$//'`
for arch in `ls archives/amdgpu-*.index 2>/dev/null | sed -e 's/\.index$//'`
do
echo
echo "=== $arch ==="
Expand Down
2 changes: 1 addition & 1 deletion qa/1559.out
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ metadata diffs
Help:
The maximum GDDRx memory clock speed in MHz

=== archives/amdgpu-1.index ===
=== archives/amdgpu-1 ===
metadata diffs
--- TMP.old ...
+++ TMP.new ...
Expand Down
2 changes: 1 addition & 1 deletion qa/1633
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ _filter()
$PCP_AWK_PROG '
BEGIN { out = "head" }
NR == 3 { out = "body" }
{ print >"'$tmp'." out }'
{ print >("'$tmp'." out) }'
cat $tmp.head
LC_COLLATE=POSIX sort $tmp.body
}
Expand Down
4 changes: 2 additions & 2 deletions qa/1670.out
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ metadata diffs
Help:
The maximum GDDRx memory clock speed in MHz

=== archives/amdgpu-1.index ===
=== archives/amdgpu-1 ===
=== std out ===
=== std err ===
=== filtered valgrind report ===
Memcheck, a memory error detector
Command: pmlogrewrite -c /etc/pcp/pmlogrewrite/amdgpu.conf archives/amdgpu-1.index TMP
Command: pmlogrewrite -c /etc/pcp/pmlogrewrite/amdgpu.conf archives/amdgpu-1 TMP
LEAK SUMMARY:
definitely lost: 0 bytes in 0 blocks
indirectly lost: 0 bytes in 0 blocks
Expand Down
10 changes: 5 additions & 5 deletions qa/1674
Original file line number Diff line number Diff line change
Expand Up @@ -85,26 +85,26 @@ in_mem_use == 1 && $1 == "TOTAL_VRAM:" {
if ($3 == "MB")
mem_total[inst] = $2 * 1024 * 1024
else
print >"'"$tmp.err"'" NR ":" $0 ": unit not MB"
print NR ":" $0 ": unit not MB" >"'"$tmp.err"'"
}
in_mem_use == 1 && $1 == "USED_VRAM:" {
if ($3 == "MB")
mem_used[inst] = $2 * 1024 * 1024
else
print >"'"$tmp.err"'" NR ":" $0 ": unit not MB"
print NR ":" $0 ": unit not MB" >"'"$tmp.err"'"
}
in_mem_use == 1 && $1 == "FREE_VRAM:" {
if ($3 == "MB")
mem_free[inst] = $2 * 1024 * 1024
else
print >"'"$tmp.err"'" NR ":" $0 ": unit not MB"
print NR ":" $0 ": unit not MB" >"'"$tmp.err"'"
}
in_temp == 1 && $1 == "EDGE:" { temp[inst] = $2 }
in_clock == 1 && $1 == "MAX_CLK:" {
if ($3 == "MHz")
clock_max[inst] = $2
else
print >"'"$tmp.err"'" NR ":" $0 ": unit not MHz"
print NR ":" $0 ": unit not MHz" >"'"$tmp.err"'"
}
NF == 1 { in_mem_use = in_temp = in_clock = 0 }
END { printf "amdgpu.memory.total 2 %d %d\n",mem_total[1],mem_total[0]
Expand All @@ -113,7 +113,7 @@ END { printf "amdgpu.memory.total 2 %d %d\n",mem_total[1],mem_total[0]
printf "amdgpu.gpu.temperature 2 %d %d\n",temp[1],temp[0]
#TODO# printf "amdgpu.gpu.clock_max 2 %d %d\n",clock_max[1],clock_max[0]
}' \
| _fixval | sort >$tmp.smi
| _fixval | sort >$tmp.smi

echo "=== pcp metrics ===" >>$seq_full
cat $tmp.pcp >>$seq_full
Expand Down
86 changes: 86 additions & 0 deletions qa/1675
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/sh
# PCP QA Test No. 1675
# pmlogrewrite extensions to accommodate nvidia PMDA evolution
# (extra units)
#
# Copyright (c) 2026 Ken McDonell. All Rights Reserved.
#

# Emit the banner line that heads the test output and settle on $seq.
# Run with no arguments, $seq is this script's basename.  Run with
# arguments, a $seq already set by the caller is kept and the banner
# also echoes the arguments.
if [ $# -eq 0 ]
then
seq=`basename $0`
echo "QA output created by $seq"
else
# use $seq from caller, unless not set
[ -n "$seq" ] || seq=`basename $0`
echo "QA output created by `basename $0` $*"
fi

# get standard environment, filters and checks
. ./common.product
. ./common.filter
. ./common.check

# Nothing to test unless the nvidia rewrite rules are installed.
[ -f $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf ] || _notrun "nvidia logrewrite config not installed"

# Pick the run mode.  do_valgrind is later executed as a command
# ("if $do_valgrind"), so it must stay exactly "true" or "false".
do_valgrind=false
if [ "$1" = "--valgrind" ]
then
# valgrind run explicitly requested by the caller
_check_valgrind
do_valgrind=true
elif which valgrind >/dev/null 2>&1
then
# valgrind is installed but was not requested: skip this plain run
# unless PCPQA_VALGRIND is "both"
# NOTE(review): the message names qa/1685 as the valgrind variant; that
# test is not visible here -- confirm the number is right
[ "$PCPQA_VALGRIND" = both ] || \
_notrun "valgrind variant qa/1685 will be run"
fi

# Exit-time cleanup: go back to $here, then remove $tmp and everything
# matching $tmp.* (the rm is prefixed with $sudo).  $here, $sudo and $tmp
# are not set in this script -- presumably they come from the common.*
# files sourced above.
_cleanup()
{
cd $here
$sudo rm -rf $tmp $tmp.*
}

status=0 # success is the default!
# On normal exit (0) and on signals 1, 2, 3 and 15: clean up, then exit
# with $status.  The \$ defers expansion of status until the trap fires.
trap "_cleanup; exit \$status" 0 1 2 3 15

# Make stdin deterministic for comparison with the expected output:
#  - every occurrence of the per-run $tmp prefix becomes TMP
#  - on "---" lines everything after "old", and on "+++" lines everything
#    after "new", is replaced by " ..." (this masks the timestamps that
#    diff -u puts on its two header lines)
# Reads stdin, writes stdout.
_filter()
{
    sed -e "s@$tmp@TMP@g" \
	-e '/^---/s/old.*/old .../' -e '/^+++/s/new.*/new .../'
}

# real QA test starts here

# For every nvidiagpu archive (located by its .index file, with the
# suffix stripped to give the archive base name): rewrite it into $tmp
# using the installed nvidia.conf rules, then show how the metadata of
# the nvidia metrics differs between the original and rewritten archive.
# If no archive matches, ls prints nothing (its error is discarded) and
# the loop body never runs.
for arch in `ls archives/nvidiagpu-*.index 2>/dev/null | sed -e 's/\.index$//'`
do
echo
echo "=== $arch ==="
# start clean: drop any $tmp.* files left by the previous iteration
rm -f $tmp.*
if $do_valgrind
then
_run_valgrind pmlogrewrite -c $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf $arch $tmp
else
if ! pmlogrewrite -c $PCP_SYSCONF_DIR/pmlogrewrite/nvidia.conf $arch $tmp
then
echo "Arrgh: pmlogrewrite failed"
# NOTE(review): this if/fi is the left-hand side of a pipeline, which
# shells normally run in a subshell; if so, _exit 1 ends only that
# subshell and the loop carries on -- confirm against _exit's definition
_exit 1
fi
fi \
| _filter

# NOTE(review): PCP_DERIVED_CONFIG is exported empty, presumably so that
# derived metrics do not appear in the pminfo output -- TODO confirm
export PCP_DERIVED_CONFIG=
# sorted names of the nvidia metrics in the original and rewritten archives
metrics_old=`pminfo -a $arch nvidia | sort`
metrics_new=`pminfo -a $tmp nvidia | sort`

echo metadata diffs
# metadata dump (pminfo -dmTtl) for each list, from its own archive
pminfo -dmTtl -a $arch $metrics_old >$tmp.old
pminfo -dmTtl -a $tmp $metrics_new >$tmp.new

# report only the differences, with temp names and header timestamps
# masked by _filter
diff -u $tmp.old $tmp.new | _filter
done

# success, all done
exit
24 changes: 24 additions & 0 deletions qa/1675.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
QA output created by 1675

=== archives/nvidiagpu-0 ===
metadata diffs
--- TMP.old ...
+++ TMP.new ...
@@ -143,7 +143,7 @@

nvidia.power PMID: 120.0.21 [Total GPU power consumption ]
Data Type: 32-bit unsigned int InDom: 120.0 0x1e000000
- Semantics: instant Units: none
+ Semantics: instant Units: power (W)
labels {"agent":"nvidia","device_type":"gpu","domainname":"localdomain","groupid":1000,"hostname":"haro","indom_name":"per gpu","machineid":"9ccf3a72f40b4727b71179ddd6a6d765","units":"milliwatts","userid":1000}
Help:
Current power usage for this GPU and its associated circuitry, in milliwatts.
@@ -594,7 +594,7 @@

nvidia.temperature PMID: 120.0.4 [The temperature of the card]
Data Type: 32-bit unsigned int InDom: 120.0 0x1e000000
- Semantics: instant Units: none
+ Semantics: instant Units: temperature (C)
labels {"agent":"nvidia","device_type":"gpu","domainname":"localdomain","groupid":1000,"hostname":"haro","indom_name":"per gpu","machineid":"9ccf3a72f40b4727b71179ddd6a6d765","units":"degrees celsius","userid":1000}
Help:
The temperature of the GPU on the NVIDIA card in degrees celsius.
2 changes: 1 addition & 1 deletion qa/671
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ s/ at .*/ at DATE/
| $PCP_AWK_PROG '
BEGIN { part = 1 }
part == 2 && NF == 0 { part = 3 }
{ print >"'$tmp.out.'" part }
{ print >("'$tmp.out.'" part) }
part == 1 && $1 == "Ordinal" { part = 2 }'

if [ -f $tmp.out.1 ]
Expand Down
2 changes: 1 addition & 1 deletion qa/681
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ _filter()
| $PCP_AWK_PROG '
BEGIN { state = 1 }
/^=== filtered/ { state = 3 }
{ print >"'$tmp'." state }
{ print >("'$tmp'." state) }
/^=== std err/ { state = 2 }'
cat $tmp.1
[ -s $tmp.2 ] && LC_COLLATE=POSIX sort $tmp.2
Expand Down
21 changes: 21 additions & 0 deletions qa/archives/mk.nvidiagpu
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,25 @@ End-of-File

. ./mk.common

# Record what nvidia-smi reports on this host next to the archive, in
# $archive.smi.txt: the default summary first, then the -q query output
# appended to the same file.  A stale copy is always removed first.  Any
# failure (non-zero exit or something written to stderr) only produces a
# warning on stderr; the script carries on.
rm -f "$archive.smi.txt"
if ! which nvidia-smi >/dev/null 2>&1
then
    echo >&2 "mk.nvidiagpu: Warning: nvidia-smi not installed, cannot recreate $archive.smi.txt"
else
    # pass 1: default summary, starts the file afresh
    nvidia-smi >"$archive.smi.txt" 2>"$tmp.err"
    if [ $? != 0 ] || [ -s "$tmp.err" ]
    then
	echo >&2 "mk.nvidiagpu: Warning: nvidia-smi failed"
	cat >&2 "$tmp.err"
	rm -f "$tmp.err"
    fi
    # pass 2: -q query output, appended after the summary
    nvidia-smi -q >>"$archive.smi.txt" 2>>"$tmp.err"
    if [ $? != 0 ] || [ -s "$tmp.err" ]
    then
	echo >&2 "mk.nvidiagpu: Warning: nvidia-smi metric failed"
	cat >&2 "$tmp.err"
	rm -f "$tmp.err"
    fi
fi

exit
2 changes: 1 addition & 1 deletion qa/group
Original file line number Diff line number Diff line change
Expand Up @@ -2222,7 +2222,7 @@ suse
1672 secure pmcd local
1673 pmda.bpf local
1674 pmda.amdgpu pmprobe local
1675:reserved pmlogrewrite pmda.amdgpu
1675 pmlogrewrite pmda.nvidia local
1676:reserved pmlogrewrite pmda.amdgpu
1677:reserved pmlogrewrite pmda.amdgpu
1678 pmda.opentelemetry local python
Expand Down
2 changes: 1 addition & 1 deletion qa/new
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ echo "$id$tag $ans" >>group
$PCP_AWK_PROG <group '
BEGIN { state = "head" }
state == "head" && /^[0-9]/ { state = "list" }
{ print >"'$tmp'." state }'
{ print >("'$tmp'." state) }'
sort -n $tmp.list >>$tmp.head
cp $tmp.head group

Expand Down
2 changes: 1 addition & 1 deletion qa/new-dup
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ p
$PCP_AWK_PROG <$tmp.tmp '
BEGIN { state = "group" }
state == "group" && /^[0-9]/ { state = "list" }
{ print >"'$tmp'." state }'
{ print >("'$tmp'." state) }'
sort -n $tmp.list >>$tmp.group

for o in $1*out*
Expand Down
2 changes: 1 addition & 1 deletion qa/new-grind
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ sed <group \
| $PCP_AWK_PROG '
BEGIN { state = "head" }
state == "head" && /^[0-9]/ { state = "list" }
{ print >"'$tmp'." state }'
{ print >("'$tmp'." state) }'
echo "$newgroup" >>$tmp.list
sort -n $tmp.list >>$tmp.head
cp $tmp.head group
Expand Down
2 changes: 1 addition & 1 deletion qa/src/mv-me
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ sed <GNUlocaldefs \
BEGIN { part = "head" }
part == "head" && /^ '"$macro"'[ ]=/ { part = "macro" }
part == "macro" && NF == 0 { part = "tail" }
{ print >"'$tmp'.make." part }'
{ print >("'$tmp'.make." part) }'

for arch
do
Expand Down
2 changes: 1 addition & 1 deletion src/ctl-tools/ctl-tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1195,7 +1195,7 @@ state == 3 && $1 !~ /^#/ { state = 4; part = 2 }
/^\/\/ --- START GENERATED SECTION (do not change this section) ---/ \
{ part = 1 }
/^\[access]/ { part = 3 }
{ print >"'$tmp/'" part }
{ print >("'$tmp/'" part) }
/^# DO NOT UPDATE THE INITIAL SECTION OF THIS FILE/ \
{ state = 1 }
/^# DO NOT UPDATE THE FILE ABOVE THIS LINE/ \
Expand Down
3 changes: 3 additions & 0 deletions src/pmdas/nvidia/GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ PMDAADMDIR = $(PCP_PMDASADM_DIR)/$(IAM)
PMDATMPDIR = $(PCP_PMDAS_DIR)/$(IAM)
LOGCONFDIR = $(PCP_SYSCONF_DIR)/pmlogconf/$(IAM)
LOGCONFVARDIR = $(PCP_VAR_DIR)/config/pmlogconf/$(IAM)
REWRITEDIR = $(PCP_SYSCONF_DIR)/pmlogrewrite
REWRITEVARDIR = $(PCP_VAR_DIR)/config/pmlogrewrite

default: $(LIBTARGET) $(CMDTARGET)

Expand All @@ -46,6 +48,7 @@ install: default
$(INSTALL) -m 755 -d $(LOGCONFVARDIR)
$(INSTALL) -m 644 -t $(LOGCONFVARDIR)/config pmlogconf.config $(LOGCONFDIR)/config
$(INSTALL) -m 644 -t $(LOGCONFVARDIR)/summary pmlogconf.summary $(LOGCONFDIR)/summary
$(INSTALL) -m 644 -t $(REWRITEVARDIR)/$(IAM).conf rewrite.conf $(REWRITEDIR)/$(IAM).conf

$(OBJECTS): domain.h

Expand Down
4 changes: 2 additions & 2 deletions src/pmdas/nvidia/nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ static pmdaMetric metrictab[] = {
{ NULL, { PMDA_PMID(0, NVIDIA_BUSID), PM_TYPE_STRING, GCARD_INDOM,
PM_SEM_DISCRETE, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
{ NULL, { PMDA_PMID(0, NVIDIA_TEMPERATURE), PM_TYPE_U32, GCARD_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
PM_SEM_INSTANT, PMDA_EXTRAUNITS(0, 0, 0, 0, 0, 0, PM_UNIT_TEMPERATURE, PM_TEMPERATURE_C) } },
{ NULL, { PMDA_PMID(0, NVIDIA_FANSPEED), PM_TYPE_U32, GCARD_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
{ NULL, { PMDA_PMID(0, NVIDIA_PERFSTATE), PM_TYPE_U32, GCARD_INDOM,
Expand Down Expand Up @@ -126,7 +126,7 @@ static pmdaMetric metrictab[] = {
{ NULL, { PMDA_PMID(0, NVIDIA_ENERGY), PM_TYPE_U64, GCARD_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
{ NULL, { PMDA_PMID(0, NVIDIA_POWER), PM_TYPE_U32, GCARD_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
PM_SEM_INSTANT, PMDA_EXTRAUNITS(0, 0, 0, 0, 0, 0, PM_UNIT_POWER, PM_POWER_mW) } },
{ NULL, { PMDA_PMID(0, NVIDIA_NPROCS), PM_TYPE_U32, GCARD_INDOM,
PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
{ NULL, { PMDA_PMID(0, NVIDIA_SAMPLES), PM_TYPE_U64, GCARD_INDOM,
Expand Down
11 changes: 11 additions & 0 deletions src/pmdas/nvidia/rewrite.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# nvidia PMDA rewriting rules
#
# Rules for pmlogrewrite, so that archives recorded by an earlier version
# of the PMDA can be brought into line with the metadata it exports now.
#

# changes for extra units
#
# These two metrics used to be exported with no units; they now carry
# extra units (temperature in C, power in mW).
#
# NOTE(review): the expected output for the QA test of these rules shows
# the rewritten nvidia.power as "Units: power (W)", while the rule below
# says mW -- confirm whether that is only how the units are printed or
# whether the expected output needs regenerating.
#
metric nvidia.temperature {
extraunits -> 0,0,0,0,0,0,TEMPERATURE,C
}
metric nvidia.power {
extraunits -> 0,0,0,0,0,0,POWER,mW
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Loading