From d66a19f334fed0b627479ba88fd271a0a6d3bff4 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:04:58 -0400 Subject: [PATCH 01/25] docs(health): add GB200 NVSWITCH telemetry matrix Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- dev/bin/generate_nvswitch_gb200_matrix.py | 337 ++++++++++++++++++ .../nvswitch_telemetry_gb200_matrix.csv | 194 ++++++++++ .../health/nvswitch_telemetry_gb200_matrix.md | 43 +++ 3 files changed, 574 insertions(+) create mode 100755 dev/bin/generate_nvswitch_gb200_matrix.py create mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv create mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_matrix.md diff --git a/dev/bin/generate_nvswitch_gb200_matrix.py b/dev/bin/generate_nvswitch_gb200_matrix.py new file mode 100755 index 0000000000..00f31c878a --- /dev/null +++ b/dev/bin/generate_nvswitch_gb200_matrix.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +"""Generate the GB200 NVSWITCH telemetry source matrix from OMX catalog artifacts. + +Input artifacts are intentionally under .omx because the source workbook is not tracked. +The generated CSV and Markdown summary are tracked under docs for MR review. 
+""" + +from __future__ import annotations + +import csv +import json +import re +from collections import Counter +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +ROWS_CSV = ROOT / ".omx/artifacts/nvswitch_rows.csv" +COVERAGE_JSON = ROOT / ".omx/artifacts/nvswitch_catalog_coverage_heuristic.json" +OUT_DIR = ROOT / "docs/architecture/health" +OUT_CSV = OUT_DIR / "nvswitch_telemetry_gb200_matrix.csv" +OUT_MD = OUT_DIR / "nvswitch_telemetry_gb200_matrix.md" + +GB200_COLUMNS = [ + "Applicable for \nGB200 NVL HMC", + "Applicable for \nGB200 NVL BMC", + "Applicable for\nGB200 NVL NvswitchTray", +] + +COL_METRIC = "Metric (ParamName)" +COL_GUID = "Telemetry GUID (Device+ParamName)" +COL_DEVICE = "Device \n(CompClass)" +COL_CATEGORY = "Category\n(ParamClass)" +COL_DATA_TYPE = "Data\nType" +COL_DESC = "Description" +COL_AVAIL = "Availability\n(IB/OOB/BOTH/NONE)" +COL_WILDCARD = "OOB API - Wildcards\n(Redfish URI and Field. N/A for NvSwitch Tray)" +COL_URI_DOMAIN = "URI Search Domain" +COL_URI_MATCH = "URI Match Criteria for Search Domain" +COL_OTLP = "OTLP" +COL_ONBOARD = "Onboard API (dbus path etc. within HMC/BMC)" +COL_NMXT = "Hi @zhillel@nvidia.com, IIUC these interfaces will be applicable even if there is single or no compute node at all correct in the rack ? so no need to say its \"applicable for multi node\" ?\n_Assigned to Ziv Hillel IL_\n-Pradeep Kumar Shima US\nNMX-T(applicable for MultiNode)" +COL_GNMI = "NVOS gNMI(applicable for MultiNode)" +COL_CLI_2502 = "Format of this column:\nline-1: nvos cli command with any placeholder for Id starting with \"$\"\nline-2 (Optional): Search criteria/filter for finding the applicable IDs for the placeholder in column. If this line isn't present, we'll look at all available Ids (interfaces, fans etc.)\nline-3: Property to check enclosed in curly braces. For example, {voltage}. For nested properties, curly braces can be used. E.g. 
{link{counters}}\n-Afsana Chowdhury US\nNVOS CLI v25.02.4282 (applicable for MultiNode)" +COL_CLI_2503 = "NVOS CLI v25.03.XXXX (applicable for MultiNode)" +COL_REDFISH_GB = "OOB API on GH200 NVL/GB200 NVL/GB300 NVL/MGX-4U-NVL16/Vera Rubin NVL72\n(Redfish URI and Field. N/A for NvSwitch Tray)" +COL_REDFISH_DGX = "Candidate to get rid of\n-Afsana Chowdhury US\nCheck with Jim and Joe about partners' usage\n-Afsana Chowdhury US\nOOB API on GH200/C2/DGX Station GB300\n(Redfish URI and Field. N/A for NvSwitch Tray)" +COL_MRD = "MRD URI on Hopper-HGX-8-GPU/Blackwell-HGX-8-GPU/GH200/GB200/HGX B300 NVL8/GB300/MGX-4U-NVL16\n(N/A for NvSwitch Tray)" + +SOURCE_COLUMNS = { + "redfish_gb": COL_REDFISH_GB, + "redfish_dgx_or_c2": COL_REDFISH_DGX, + "redfish_wildcard": COL_WILDCARD, + "mrd": COL_MRD, + "nvos_gnmi": COL_GNMI, + "nmx_t": COL_NMXT, + "nvos_cli_2503": COL_CLI_2503, + "nvos_cli_2502": COL_CLI_2502, + "onboard_dbus": COL_ONBOARD, + "otlp": COL_OTLP, +} + +NA_VALUES = {"", "NA", "N/A", "#N/A", "NONE", "TBD", "N.A."} + + +def clean(value: str | None) -> str: + if value is None: + return "" + return re.sub(r"\s+", " ", value.replace("\xa0", " ")).strip() + + +def has_value(value: str | None) -> bool: + c = clean(value) + return bool(c) and c.upper() not in NA_VALUES + + +def yes(value: str | None) -> bool: + return clean(value).lower() == "yes" + + +def snake(metric: str) -> str: + return re.sub(r"[^a-z0-9]+", "_", metric.lower()).strip("_") + + +def load_coverage() -> dict[int, dict[str, str]]: + if not COVERAGE_JSON.exists(): + return {} + data = json.loads(COVERAGE_JSON.read_text()) + out: dict[int, dict[str, str]] = {} + for section in ("covered", "partial", "gaps"): + for item in data.get(section, []): + out[int(item["row"])] = item + return out + + +def extract_sources(row: dict[str, str]) -> dict[str, str]: + sources = {} + for name, col in SOURCE_COLUMNS.items(): + val = row.get(col, "") + if has_value(val): + sources[name] = clean(val) + return sources + + +def 
source_family(source_name: str, value: str) -> str: + text = f"{source_name} {value}".lower() + if "telemetryservice" in text or "metricreport" in text or source_name == "mrd": + return "Redfish TelemetryService" + if source_name.startswith("redfish"): + return "Redfish Fabric/Switch/Port" + if source_name == "nvos_gnmi": + return "NVOS gNMI" + if source_name == "nmx_t": + return "NMX-T" + if source_name.startswith("nvos_cli"): + return "NVOS CLI" + if source_name == "onboard_dbus": + return "Onboard DBus" + if source_name == "otlp": + return "OTLP" + return source_name + + +def choose_sources(row: dict[str, str], sources: dict[str, str], metric: str = "") -> tuple[str, str, str, str]: + existing_primary = { + "PORT-RCV-ERRORS": "nvos_gnmi", + "PORT-XMIT-CONSTRAINTS-ERRORS": "nvos_gnmi", + "EFFECTIVE-BER": "nvos_gnmi", + "SYMBOL-BER": "nvos_gnmi", + "PHY-SYMBOL-ERRORS": "nmx_t", + } + if not sources: + return "BLOCKER source resolution", "", "No catalog source listed for GB200 row", "source-resolution blocker" + + availability = clean(row.get(COL_AVAIL, "")).upper() + tray = yes(row.get("Applicable for\nGB200 NVL NvswitchTray")) + hmc_or_bmc = yes(row.get("Applicable for \nGB200 NVL HMC")) or yes(row.get("Applicable for \nGB200 NVL BMC")) + tray_only = tray and not hmc_or_bmc + + ordered_names = [] + if any(k in sources for k in ("mrd",)): + ordered_names.append("mrd") + if any(k in sources for k in ("redfish_gb", "redfish_wildcard", "redfish_dgx_or_c2")) and not tray_only: + ordered_names.extend(["redfish_wildcard", "redfish_gb", "redfish_dgx_or_c2"]) + if "nvos_gnmi" in sources: + if "IB" in availability or tray_only: + ordered_names.insert(0, "nvos_gnmi") + else: + ordered_names.append("nvos_gnmi") + if "nmx_t" in sources: + ordered_names.append("nmx_t") + if "nvos_cli_2503" in sources: + ordered_names.append("nvos_cli_2503") + if "nvos_cli_2502" in sources: + ordered_names.append("nvos_cli_2502") + if "onboard_dbus" in sources: + 
ordered_names.append("onboard_dbus") + if "otlp" in sources: + ordered_names.append("otlp") + + seen = set() + available_ordered = [] + for name in ordered_names: + if name in sources and name not in seen: + seen.add(name) + available_ordered.append(name) + for name in sources: + if name not in seen: + available_ordered.append(name) + + if metric in existing_primary and existing_primary[metric] in available_ordered: + available_ordered.remove(existing_primary[metric]) + available_ordered.insert(0, existing_primary[metric]) + + primary_name = available_ordered[0] + fallback_name = available_ordered[1] if len(available_ordered) > 1 else "" + primary = source_family(primary_name, sources[primary_name]) + fallback = source_family(fallback_name, sources[fallback_name]) if fallback_name else "" + precedence_parts = [] + for name in available_ordered: + family = source_family(name, sources[name]) + if family not in precedence_parts: + precedence_parts.append(family) + precedence = " then ".join(precedence_parts) + return primary, fallback, precedence, "one canonical series unless source-qualified duplicate is justified" + + +def target_collector(primary: str, sources: dict[str, str]) -> str: + if primary == "BLOCKER source resolution": + return "BLOCKER: source resolution required" + if primary == "Redfish TelemetryService": + return "new NvSwitchTelemetryServiceCollector behind collectors.telemetry_service" + if primary == "Redfish Fabric/Switch/Port": + return "new NvSwitchRedfishCollector for switch BMC endpoints" + if primary == "NVOS gNMI": + return "extend NvueGnmiCollector sample paths/processors" + if primary == "NMX-T": + return "extend NmxtCollector mapping" + if primary == "NVOS CLI": + if "nvos_gnmi" in sources: + return "prefer NVOS gNMI equivalent; CLI-only path is blocker if no streamed equivalent exists" + return "BLOCKER: no current NVOS CLI collector; source equivalent required" + if primary == "Onboard DBus": + return "prefer Redfish exposure; otherwise 
BLOCKER: no current DBus collector" + if primary == "OTLP": + return "BLOCKER: upstream OTLP source contract required" + return "TBD collector" + + +def emitted_surface(metric: str, data_type: str, coverage: str) -> str: + existing = { + "PORT-RCV-ERRORS": "existing interface_in_errors MetricSample", + "PORT-XMIT-CONSTRAINTS-ERRORS": "existing interface_out_errors MetricSample", + "EFFECTIVE-BER": "existing interface_effective_ber MetricSample", + "SYMBOL-BER": "existing interface_symbol_ber MetricSample", + "PHY-SYMBOL-ERRORS": "existing switch_nmxt symbol_errors MetricSample", + } + if metric in existing and coverage.startswith("covered"): + return existing[metric] + dtype = clean(data_type).lower() + base = f"nvswitch_{snake(metric)}" + if "text" in dtype or "string" in dtype: + return f"{base} as inventory/info event or state metric with bounded labels" + if "bool" in dtype or "enum" in dtype or "status" in dtype: + return f"{base} as numeric state MetricSample" + return f"{base} MetricSample" + + +def main() -> None: + coverage = load_coverage() + OUT_DIR.mkdir(parents=True, exist_ok=True) + with ROWS_CSV.open(newline="") as f: + rows = list(csv.DictReader(f)) + + out_rows = [] + for row in rows: + if "nvswitch" not in clean(row.get(COL_DEVICE, "")).lower(): + continue + applicable_cols = [col for col in GB200_COLUMNS if yes(row.get(col))] + if not applicable_cols: + continue + row_no = int(row["__ods_row_number"]) + metric = clean(row.get(COL_METRIC, "")) + sources = extract_sources(row) + primary, fallback, precedence, duplicate_policy = choose_sources(row, sources, metric) + cov = coverage.get(row_no, {}) + cov_status = clean(cov.get("coverage", "gap")) or "gap" + cov_reason = clean(cov.get("coverage_reason", "")) + if primary.startswith("BLOCKER"): + implementation_status = "blocker-source-resolution" + elif cov_status.startswith("covered"): + implementation_status = "already-covered-regression-required" + elif cov_status.startswith("partial"): + 
implementation_status = "partial-needs-implementation" + else: + implementation_status = "gap-needs-implementation" + + out_rows.append({ + "catalog_row": row_no, + "guid": clean(row.get(COL_GUID, "")), + "metric_param_name": metric, + "description": clean(row.get(COL_DESC, "")), + "category": clean(row.get(COL_CATEGORY, "")), + "data_type": clean(row.get(COL_DATA_TYPE, "")), + "gb200_applicability": "; ".join(col.replace("Applicable for", "").replace("\n", " ").strip() for col in applicable_cols), + "availability": clean(row.get(COL_AVAIL, "")), + "source_families": "; ".join(dict.fromkeys(source_family(k, v) for k, v in sources.items())), + "primary_source": primary, + "fallback_source": fallback, + "source_precedence": precedence, + "duplicate_alias_policy": duplicate_policy, + "target_collector": target_collector(primary, sources), + "target_emitted_surface": emitted_surface(metric, row.get(COL_DATA_TYPE, ""), cov_status), + "current_coverage": cov_status, + "implementation_status": implementation_status, + "coverage_reason": cov_reason, + "redfish_or_mrd_path": clean(row.get(COL_URI_DOMAIN)) or clean(row.get(COL_WILDCARD)) or clean(row.get(COL_REDFISH_GB)) or clean(row.get(COL_MRD)), + "nvos_gnmi_path": clean(row.get(COL_GNMI, "")), + "nmx_t_field": clean(row.get(COL_NMXT, "")), + "nvos_cli_reference": clean(row.get(COL_CLI_2503, "")) or clean(row.get(COL_CLI_2502, "")), + "onboard_dbus_reference": clean(row.get(COL_ONBOARD, "")), + "test_fixture_plan": "required: parser fixture plus metric emission assertion; live GB evidence before review pause", + "live_validation_plan": "validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review", + }) + + fieldnames = list(out_rows[0].keys()) if out_rows else [] + with OUT_CSV.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(out_rows) + + counts = Counter(r["implementation_status"] for r in out_rows) + primary_counts = 
Counter(r["primary_source"] for r in out_rows) + coverage_counts = Counter(r["current_coverage"] for r in out_rows) + md = [ + "# NVSWITCH telemetry GB200 source matrix", + "", + "Generated from `.omx/artifacts/nvswitch_rows.csv` for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`:", + "", + "- `Applicable for GB200 NVL HMC`", + "- `Applicable for GB200 NVL BMC`", + "- `Applicable for GB200 NVL NvswitchTray`", + "", + f"CSV matrix: `{OUT_CSV.relative_to(ROOT)}`", + "", + "## Counts", + "", + f"- Total GB200-applicable NVSWITCH rows: {len(out_rows)}", + "", + "### Implementation status", + "", + ] + for key, value in sorted(counts.items()): + md.append(f"- {key}: {value}") + md.extend(["", "### Current coverage", ""]) + for key, value in sorted(coverage_counts.items()): + md.append(f"- {key}: {value}") + md.extend(["", "### Primary source", ""]) + for key, value in sorted(primary_counts.items()): + md.append(f"- {key}: {value}") + md.extend([ + "", + "## Execution rules", + "", + "- Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete.", + "- Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale.", + "- Rows marked `blocker-source-resolution` are not deferred; they require immediate source-resolution or escalation.", + "- Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed.", + "", + ]) + OUT_MD.write_text("\n".join(md)) + print(f"wrote {OUT_CSV}") + print(f"wrote {OUT_MD}") + print(f"rows {len(out_rows)}") + + +if __name__ == "__main__": + main() diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv new file mode 100644 index 0000000000..a2e18c0ac5 --- /dev/null +++ 
b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -0,0 +1,194 @@ +catalog_row,guid,metric_param_name,description,category,data_type,gb200_applicability,availability,source_families,primary_source,fallback_source,source_precedence,duplicate_alias_policy,target_collector,target_emitted_surface,current_coverage,implementation_status,coverage_reason,redfish_or_mrd_path,nvos_gnmi_path,nmx_t_field,nvos_cli_reference,onboard_dbus_reference,test_fixture_plan,live_validation_plan +763,NVSWITCH-NET-FW-VER,NET-FW-VER,Switch ASIC Firmware Version,Config,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_net_fw_ver as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId {FirmwareVersion},NA,FW_Version,nv show platform firmware $name {name: {Name: ASIC}} {actual-firmware},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/software/HGX_FW_NVSwitch_{InstanceId} xyz.openbmc_project.Software.Version Version,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +764,NVSWITCH-OS-VERSION,OS-VERSION,OS version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_os_version as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show system version {kernel},NA,required: 
parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +765,NVSWITCH-OS-KERNEL,OS-KERNEL,OS Kernel version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_os_kernel as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show system version {image{build-id}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +766,NVSWITCH-EROT-FW-VERSION,EROT-FW-VERSION,ERoT FW version,Config,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_erot_fw_version as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform firmware $name {name: {Name: EROT}} {actual-firmware},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +767,NVSWITCH-BMC-VERSION,BMC-VERSION,BMC firmware version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_bmc_version as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform firmware $name {name: {Name: BMC}} 
{actual-firmware},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +794,NVSWITCH-LINK-DOWNED-COUNTER,LINK-DOWNED-COUNTER,Total number of times the Port Training state machine has failed the link error recovery process and downed the link.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_downed_counter MetricSample,partial_host,partial-needs-implementation,"NMX-T maps Link_Down to link_down; gNMI code listens for unintentional-link-down-events, not catalog link-downed path",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkDownedCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-downed,Link_Down,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{counters{link-downed}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +795,NVSWITCH-PORT-MALFORMED-PACKET-ERRORS,PORT-MALFORMED-PACKET-ERRORS,"Total number of packets received on the port that contain malformed packet errors • Data packets: LVer, length, VL • Link packets: operand, length, VL",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample 
paths/processors,nvswitch_port_malformed_packet_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{MalformedPackets}}},/interfaces/interface [name]/phy-diag/state/port-malformed-packet-errors,PortMalformedPacketErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-malformed-packet-errors}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +796,NVSWITCH-PORT-NEIGHBOR-MTU-DISCARDS,PORT-NEIGHBOR-MTU-DISCARDS,Number of outbound packets discarded by the port because packet length exceeded the NeighborMTU.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_neighbor_mtu_discards MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{NeighborMTUDiscards}}},/interfaces/interface [name]/phy-diag/state/port-neighbor-mtu-discards,PortNeighborMTUDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-neighbor-mtu-discards}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +797,NVSWITCH-PORT-RCV-ERRORS,PORT-RCV-ERRORS,"Total number of packets containing an error that were received on the port. 
These errors include: • Local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine) • Malformed data packet errors (LVer, length, VL) • Malformed link packet errors (operand, length, VL) • Packets discarded due to buffer overrun",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_in_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_in_errors,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXErrors},interfaces/interface [name]/state/counters/in-errors,PortRcvErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-errors}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +798,NVSWITCH-PORT-XMIT-DISCARDS,PORT-XMIT-DISCARDS,Total number of outbound packets discarded by the port because the port is down or congested.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_discards MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b 
/redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXDiscards}},interfaces/interface[name=*]/state/counters/out-discards,PortXmitDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-drops}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +799,NVSWITCH-PORT-RCV-REMOTE-PHYSICAL-ERRORS,PORT-RCV-REMOTE-PHYSICAL-ERRORS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_remote_physical_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXRemotePhysicalErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-remote-phy-errors,PortRcvRemotePhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+800,NVSWITCH-PORT-RCV-SWITCH-RELAY-ERRORS,PORT-RCV-SWITCH-RELAY-ERRORS,"Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay.This might happen if, for instance, the destination port is congested or there are internal switch errors.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_switch_relay_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXSwitchRelayErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-switch-relay-errors,PortRcvSwitchRelayErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +801,NVSWITCH-QP1Dropped,QP1Dropped,"Number of QP1 MADs (packets) dropped due to resource limitations (e.g., lack of buffers or receives posted) on the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_qp1dropped MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish 
Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{QP1Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/qp1-dropped,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{counters{qp1-drops}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +802,NVSWITCH-VL15-DROPPED,VL15-DROPPED,"Number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers) of the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl15_dropped MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{VL15Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/vl15-dropped,VL15Dropped,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +804,NVSWITCH-SERIAL,SERIAL,Serial Number,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serial as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c 
/redfish/v1/Chassis/$ChassisId {SerialNumber},NA,sw_serial_number,nv show platform {serial-number},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/NVSwitch1 xyz.openbmc_project.Inventory.Decorator.Asset SerialNumber,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +806,NVSWITCH-NODE-GUID,NODE-GUID,"GUID of the HCA, switch, GPU, or router itself. All ports on the same node shall report the same NodeGUID. Provides a means to uniquely identify a node within a subnet and determine co-location of ports.",Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_node_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Node_GUID,nv show ib device $IbDeviceId {IbDeviceId: {type: NVLink*}} {guid},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/HGX_NVSwitch_{InstanceId} xyz.openbmc_project.Common.UUID UUID,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +807,NVSWITCH-PORT-GUID,PORT-GUID,GUID of the port. 
All ports on the same switch shall report the same NodeGUID.,Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T,NMX-T,Redfish Fabric/Switch/Port,NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Port_GUID,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +834,NVSWITCH-NVLINK-STATUS,@pshima@nvidia.com should be called PORT-PHYSICAL-STATE -Ziv Hillel IL NVLINK-STATUS,NVLink Link status (e.g. LinkUp),Status,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI; Onboard DBus,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_pshima_nvidia_com_should_be_called_port_physical_state_ziv_hillel_il_nvlink_status as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId {LinkStatus},interfaces/interface[name=$port_name]/infiniband/state/physical-port-state,phy_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{physical-state}},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/fabrics/HGX_NVLinkFabric_{InstanceId}/Switches/NVSwitch_{InstanceId}/Ports/NVLink_{InstanceId} xyz.openbmc_project.Inventory.Item.Port 
LinkStatus,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +846,NVSWITCH-LINK-ERROR-RECOVERY-COUNTER,LINK-ERROR-RECOVERY-COUNTER,Total number of times the Port Training state machine has successfully completed the link error recovery process. This entry is applicable for platforms with NVL5.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_error_recovery_counter MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkErrorRecoveryCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-error-recovery,LinkErrorRecoveryCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{error-recovery}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +847,NVSWITCH-PORT-MULTICAST-RCV-PKTS,PORT-MULTICAST-RCV-PKTS,"Total number of multicast packets, including multicast packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample
paths/processors,nvswitch_port_multicast_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,@pshima@nvidia.com spelling is wrong RXMulitcastFrames -> RXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXMulticastFrames}},/interfaces/interface [name]/phy-diag/state/port-multi-cast-rcv-pkts,PortMultiCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-multicast-pkts}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +848,NVSWITCH-PORT-MULTICAST-XMIT-PKTS,PORT-MULTICAST-XMIT-PKTS,Total number of multicast packets transmitted on all VLs from the port. This may include multicast packets with errors.,Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,"@pshima@nvidia.com spelling issue , should be TXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXMulticastFrames}}}",/interfaces/interface [name]/phy-diag/state/port-multi-cast-xmit-pkts,PortMultiCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-multicast-pkts}}},,required: parser fixture plus metric emission assertion; live GB evidence before review 
pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +849,NVSWITCH-PORT-RCV-DATA,PORT-RCV-DATA,"Total number of data octets, divided by 4, received on all VLs at the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_data MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXBytes},interfaces/interface[name=*]/state/counters/in-octets,PortRcvDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-bytes}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +850,NVSWITCH-PORT-RCV-PKTS,PORT-RCV-PKTS,"Total number of received packets, including packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXFrames}},interfaces/interface[name=*]/state/counters/in-pkts,PortRcvPktsExtended,nv show interface $InterfaceId {InterfaceId: 
{type: nvl}} {counters{in-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +851,NVSWITCH-PORT-UNICAST-RCV-PKTS,PORT-UNICAST-RCV-PKTS,"Total number of unicast packets, including unicast packets containing errors.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXUnicastFrames}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-rcv-pkts,PortUniCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-unicast-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +852,NVSWITCH-PORT-UNICAST-XMIT-PKTS,PORT-UNICAST-XMIT-PKTS,Total number of unicast packets transmitted on all VLs from the port. 
This may include unicast packets with errors.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXUnicastFrames}}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-xmit-pkts,PortUniCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-unicast-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +853,NVSWITCH-PORT-XMIT-DATA,PORT-XMIT-DATA,"Total number of data octets, divided by 4, transmitted on all VLs from the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors. 
It excludes all link packets.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_data MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {TXBytes}},interfaces/interface[name=*]/state/counters/out-octets,PortXmitDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-bytes}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +854,NVSWITCH-PORT-XMIT-PKTS,PORT-XMIT-PKTS,Total number of packets transmitted on all VLs from the port. 
This may include packets with errors,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXFrames}},interfaces/interface[name=*]/state/counters/out-pkts,PortXmitPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +855,NVSWITCH-PORT-XMIT-WAIT,PORT-XMIT-WAIT,The number of ticks during which the port selected by PortSelect had data to transmit but no data was sent during the entire tick either because of insufficient credits or because of lack of arbitration.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_wait MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{TXWait}}},interfaces/interface[name=*]/infiniband/state/counters/port/xmit-wait,PortXmitWait,nv show interface $InterfaceId 
{InterfaceId: {type: nvl}} {counters{out-wait}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +862,NVSWITCH-CONTACT,CONTACT,UTF-8 encoded string to describe contact person.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_contact as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/contact,NA,TBD,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +863,NVSWITCH-LOCATION,LOCATION,UTF-8 encoded string to describe location of the device.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_location as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/location,NA,nv show platform chassis-location {slot-number},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +864,NVSWITCH-NODE-DESCRIPTION,NODE-DESCRIPTION,UTF-8 encoded string to describe node in text format.,Inventory,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_node_description as inventory/info event or state metric with bounded 
labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/platform-name,node_description,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +865,NVSWITCH-LID,LID,Local ID- Link layer address of an end port.,NetworkId,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_lid MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,lid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +866,NVSWITCH-PORT-NUMBER,PORT-NUMBER,Port number,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_number as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Port_Number,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +867,NVSWITCH-PORT-LABEL,PORT-LABEL,Front panel label of the port,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_label as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,port_label,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +868,NVSWITCH-REVISION,REVISION,Switch HW 
revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_revision MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,sw_revision,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +869,NVSWITCH-DEVICE-HARDWARE-REVISION,DEVICE-HARDWARE-REVISION,Device HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_hardware_revision MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,device_hw_rev,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +870,NVSWITCH-CPU_CORE_NUMBER,CPU_CORE_NUMBER,Number of cores,System,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_core_number MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show system cpu {core-count},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +872,NVSWITCH-ASIC-TEMP-CRITICAL,ASIC-TEMP-CRITICAL,"Critical temperature threshold for NVSwitch ASIC.
Above this level, the system will shutdown.",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +873,NVSWITCH-ASIC-TEMP-MAX,ASIC-TEMP-MAX,Max temperature threshold for NVSwitch ASIC.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {max}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +874,NVSWITCH-ASIC-TEMP-STATE,ASIC-TEMP-STATE,NVSwitch ASIC state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch 
BMC/HOST after branch build-test-lint review +875,NVSWITCH-ASIC-TEMP-CURRENT,ASIC-TEMP-CURRENT,NVSwitch ASIC current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=ASIC*]/asic/state/asic-temp,Chip_Temp,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +876,NVSWITCH-ASIC-NAME,ASIC-NAME,NVSwitch ASIC current temperature,Platform,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_name MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=ASIC*]/state/name,NA,nv show platform {asic-model},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +879,NVSWITCH-AMBIENT-MNG-TEMP-STATE,AMBIENT-MNG-TEMP-STATE,Ambient temperature located in port side state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_ambient_mng_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature 
$TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +880,NVSWITCH-AMBIENT-MNG-TEMP-CURRENT,AMBIENT-MNG-TEMP-CURRENT,Ambient temperature located in port side,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_ambient_mng_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +881,NVSWITCH-CPU_PACK_TEMP_CRITICAL,CPU_PACK_TEMP_CRITICAL,"Critical temperature threshold for CPU PACK, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +882,NVSWITCH-CPU_PACK_TEMP_MAX,CPU_PACK_TEMP_MAX,Max temperature threshold for CPU PACK,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: 
no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +883,NVSWITCH-CPU_PACK_TEMP_STATE,CPU_PACK_TEMP_STATE,CPU PACK temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +884,NVSWITCH-CPU_PACK_TEMP_CURRENT,CPU_PACK_TEMP_CURRENT,CPU PACK temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +885,NVSWITCH-CPU-UTIL,CPU-UTIL,ComE CPU utilization,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS 
gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_util MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=cpu]/cpu/utilization/state/avg,NA,nv show system cpu {total-utilization},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +886,NVSWITCH-MEM-UTIL,MEM-UTIL,Memory in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_util MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/memory-used,NA,nv show system memory {physical{utilization}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +887,NVSWITCH-MEM-TOTAL-SIZE,MEM-TOTAL-SIZE,Memory total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_total_size MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/memory-total-size,NA,nv show system memory {physical{total}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +888,NVSWITCH-DISK-TOTAL-SIZE,DISK-TOTAL-SIZE,Disk total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_total_size MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/disk-total-size,NA,TBD,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +889,NVSWITCH-DISK-USED,DISK-USED,Disk space in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_used MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/disk-used,NA,TBD,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +890,NVSWITCH-SODIMM_TEMP_CRITICAL,SODIMM_TEMP_CRITICAL,"Critical temperature threshold for SODIMM temperature, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +891,NVSWITCH-SODIMM_TEMP_MAX,SODIMM_TEMP_MAX,Max temperature threshold for SODIMM temperature,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate
is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {max}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +892,NVSWITCH-SODIMM_TEMP_STATE,SODIMM_TEMP_STATE,SODIMM temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +893,NVSWITCH-SODIMM_TEMP_CURRENT,SODIMM_TEMP_CURRENT,SODIMM temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +894,FAN-MAX-SPEED,MAX-SPEED,Chassis fan reading range (max),Config,Float,GB200 NVL BMC; GB200 NVL 
NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NVOS CLI,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port then NVOS CLI,one canonical series unless source-qualified duplicate is justified,new NvSwitchRedfishCollector for switch BMC endpoints,nvswitch_max_speed MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,2023.3 /redfish/v1/Chassis/$ChassisId/Sensors/$SensorId {ReadingRangeMax},NA,NA,nv show platform environment fan $FanId {max-speed},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +897,NVSWITCH-PORT-LOGICAL-STATE,PORT-LOGICAL-STATE,Port State. Enumerated as: 0: No State Change; 1: Down (includes failed links) 2: Initialize 3: Armed 4: Active,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_logical_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/logical-port-state,logical_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{logical-state}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +898,NVSWITCH-FEC-MODE-ACTIVE,FEC-MODE-ACTIVE,"FEC mode active: 0: No_FEC 1: Firecode_FEC 2: Standard_RS_FEC - RS(528,514) 3: Standard_LL_RS_FEC - RS(271,257) 6: Interleaved_Standard_RS-FEC - (544,514) 7: Standard_RS-FEC - (544,514)",Status,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_fec_mode_active 
MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Active_FEC,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +899,NVSWITCH-RAW-BER,RAW-BER,Raw BER- calculated by the following: bits 15:8 - raw_ber_magnitude bits 3:0 - raw_ber_coef Raw_BER = raw_ber_coef*10^(-raw_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber,Total_Raw_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{raw-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +900,NVSWITCH-EFFECTIVE-BER,EFFECTIVE-BER,Effective BER- calculated by the following: bits 15:8 - effective_ber_magnitude bits 3:0 - effective_ber_coef Effective_BER = effective_ber_coef*10^(-effective_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_effective_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,NA,/interfaces/interface [name]/phy-diag/state/effective-ber,Effective_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{health{effective-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +901,NVSWITCH-SYMBOL-BER,SYMBOL-BER,Symbol BER- calculated by the following: bits 15:8 - symbol_ber_magnitude bits 3:0 - symbol_ber_coef Symbol_BER = symbol_ber_coef*10^(-symbol_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_symbol_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{BitErrorRate}}},/interfaces/interface [name]/phy-diag/state/symbol-ber,Symbol_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{symbol-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +902,NVSWITCH-ZERO-HIST,ZERO-HIST,First FEC histogram bin with value of 0 while all higher bins are only with 0 value as well.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_zero_hist MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/zero-hist,fc_zero_hist,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{detail{zero-hist}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +903,NVSWITCH-PHY-RAW-ERRORS-LANE0,PHY-RAW-ERRORS-LANE0,This counter provides information on error bits that were identified on lane 0. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-1,Raw_Errors_Lane_0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{phy-raw-errors}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +904,NVSWITCH-PHY-RAW-ERRORS-LANE1,PHY-RAW-ERRORS-LANE1,This counter provides information on error bits that were identified on lane 1. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-2,Raw_Errors_Lane_1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{phy-raw-errors}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +905,NVSWITCH-RAW-BER-LANE0,RAW-BER-LANE0,Raw BER for lane 0. 
same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-1,raw_ber_lane0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{raw-ber}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +906,NVSWITCH-RAW-BER-LANE1,RAW-BER-LANE1,Raw BER for lane 1. same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-2,raw_ber_lane1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{raw-ber}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +907,NVSWITCH-PHY-EFFECTIVE-ERRORS,PHY-EFFECTIVE-ERRORS,This counter provides information on error bits that were not corrected by FEC correction algorithm or that FEC is not active. 
(post FEC pre PLR),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_effective_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2025.1 /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{EffectiveError}}},/interfaces/interface [name]/phy-diag/state/effective-errors,Effective_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-errors}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +908,NVSWITCH-PHY-SYMBOL-ERRORS,PHY-SYMBOL-ERRORS,Total number of minor link errors detected on one or more physical lanes. This counter provides information on error bits that were not corrected by phy correction mechanisms. 
(post FEC & PLR),Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NMX-T,NVOS gNMI,NMX-T then NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,existing switch_nmxt symbol_errors MetricSample,covered_host_nmxt,already-covered-regression-required,NMX-T maps Symbol_Errors to symbol_errors,NA,/interfaces/interface [name]/phy-diag/state/symbol-errors,Symbol_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{nvl{errors{symbol-errors{receive}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +909,NVSWITCH-TIME-SINCE-LASTS-CLEAR,TIME-SINCE-LASTS-CLEAR,The time passed since the last counters clear event in msec- time since the port was raised to up.,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_time_since_lasts_clear MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/time-since-last-clear-min,Time_since_last_clear_Min,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{time-since-last-clear-min}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +910,NVSWITCH-DEVICE-ID,DEVICE-ID,Device ID information as assigned by device manufacturer.,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_id as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact 
current collector mapping found,,NA,Device_ID,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +911,NVSWITCH-FEC-HIST-0,FEC-HIST-0,Value of RS FEC Histogram (Reed Solomon error correction) bin0,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin0,hist0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{0{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +912,NVSWITCH-FEC-HIST-1,FEC-HIST-1,Value of RS FEC Histogram (Reed Solomon error correction) bin1,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin1,hist1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{1{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +913,NVSWITCH-FEC-HIST-2,FEC-HIST-2,Value of RS FEC Histogram (Reed Solomon error correction) bin2,Link-Quality,Integer,GB200 NVL 
NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_2 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin2,hist2,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{2{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +914,NVSWITCH-FEC-HIST-3,FEC-HIST-3,Value of RS FEC Histogram (Reed Solomon error correction) bin3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_3 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin3,hist3,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{3{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +915,NVSWITCH-FEC-HIST-4,FEC-HIST-4,Value of RS FEC Histogram (Reed Solomon error correction) bin4,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_4 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface 
[name]/phy-diag/state/rs-num-corr-err-bin4,hist4,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{4{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +916,NVSWITCH-FEC-HIST-5,FEC-HIST-5,Value of RS FEC Histogram (Reed Solomon error correction) bin5,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_5 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin5,hist5,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{5{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +917,NVSWITCH-FEC-HIST-6,FEC-HIST-6,Value of RS FEC Histogram (Reed Solomon error correction) bin6,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_6 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin6,Hist6,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{6{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+918,NVSWITCH-FEC-HIST-7,FEC-HIST-7,Value of RS FEC Histogram (Reed Solomon error correction) bin7,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_7 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin7,Hist7,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{7{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +919,NVSWITCH-FEC-HIST-8,FEC-HIST-8,Value of RS FEC Histogram (Reed Solomon error correction) bin8,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_8 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin8,Hist8,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{8{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +920,NVSWITCH-FEC-HIST-9,FEC-HIST-9,Value of RS FEC Histogram (Reed Solomon error correction) bin9,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample 
paths/processors,nvswitch_fec_hist_9 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin9,Hist9,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{9{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +921,NVSWITCH-FEC-HIST-10,FEC-HIST-10,Value of RS FEC Histogram (Reed Solomon error correction) bin10,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_10 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin10,Hist10,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{10{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +922,NVSWITCH-FEC-HIST-11,FEC-HIST-11,Value of RS FEC Histogram (Reed Solomon error correction) bin11,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_11 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin11,Hist11,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{11{count}}}}}}},,required: parser fixture plus 
metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +923,NVSWITCH-FEC-HIST-12,FEC-HIST-12,Value of RS FEC Histogram (Reed Solomon error correction) bin12,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_12 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin12,hist12,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{12{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +924,NVSWITCH-FEC-HIST-13,FEC-HIST-13,Value of RS FEC Histogram (Reed Solomon error correction) bin13,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_13 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin13,hist13,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{13{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +925,NVSWITCH-FEC-HIST-14,FEC-HIST-14,Value of RS FEC Histogram (Reed Solomon error correction) bin14,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS 
gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_14 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin14,hist14,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{14{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +926,NVSWITCH-FEC-HIST-15,FEC-HIST-15,Value of RS FEC Histogram (Reed Solomon error correction) bin15,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_15 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin15,hist15,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{15{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +931,NVSWITCH-PLR-CODES-LOSS,PLR-CODES-LOSS,Recieved bandwidth loss due to codes retransmission. 
calculated in resolution of: (plr_rcv_code_err / plr_rcv_codes) * 10^10 BW Loss % = (plr_codes_loss / 10^10 ) *100,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_plr_codes_loss MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,HiRetransmissionRate,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-codes-loss}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +932,NVSWITCH-PORT-BUFFER-OVERRUN-ERRORS,PORT-BUFFER-OVERRUN-ERRORS,Total number of packets received on the port that were discarded due to buffer overrun.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_buffer_overrun_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/counters/port/excessive-buffer-overrun,ExcessiveBufferOverrunErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{buffer-overrun-errors}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +933,NVSWITCH-LINK-SPEED-ACTIVE,LINK-SPEED-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_speed_active 
MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/speed,Link_speed_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{speed}}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +934,NVSWITCH-PLR-RCV-CODES,PLR-RCV-CODES,Number of received PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-codes,PlrRcvCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +935,NVSWITCH-PLR-RCV-CODES-ERR,PLR-RCV-CODES-ERR,The total number of rejected PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes_err MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-code-err,PlrRcvCodeErr,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes-err}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+936,NVSWITCH-PLR-RCV-UNCORRECTABLES-CODE,PLR-RCV-UNCORRECTABLES-CODE,The total number of uncorrectable PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_uncorrectables_code MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-uncorrectable-code,PlrRcvUncorrectableCode,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-uncorrectable-code}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +937,NVSWITCH-PLR-XMIT-CODES,PLR-XMIT-CODES,Number of transmitted PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-codes,PlrXmitCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +938,NVSWITCH-PLR-XMIT-RETRYS-CODES,PLR-XMIT-RETRYS-CODES,The total number of PLR codewords retransmitted,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample 
paths/processors,nvswitch_plr_xmit_retrys_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-codes,PlrXmitRetryCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +939,NVSWITCH-PLR-XMIT-RETRYS-EVENTS,PLR-XMIT-RETRYS-EVENTS,The total number of retransmitted events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_events MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events,PlrXmitRetryEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +940,NVSWITCH-PLR-SYNC-EVENTS,PLR-SYNC-EVENTS,The number of PLR sync events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_sync_events MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-sync-events,PlrSyncEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-sync-events}}}},NA,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +941,NVSWITCH-PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,The maximum number of retransmitted events in 60 sec window based upon the action of undertaking PLR (physical layer retry),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retry_codes_within_minute MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events-within-t-sec-max,PlrXmitRetryCodesWithinTSecMax,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +942,NVSWITCH-PLR-BW-LOSS-PERCENT,PLR-BW-LOSS-PERCENT,The bandwidth loss (percentage) based upon PLR on the NVLink.,Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_plr_bw_loss_percent MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +943,NVSWITCH-RQ-GENERAL-ERROR,RQ-GENERAL-ERROR,The total number of packets that were dropped since it contained errors. 
Reasons for this include: Dropped due to MPR mismatch.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_rq_general_error MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/rq-general-error,rq_general_error,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{rq-general-error}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +944,NVSWITCH-TIME-TO-LINKS-UP,TIME-TO-LINKS-UP,"Time in msec to link up from disable until phy up state. While the phy manager did not reach phy up state the timer will return 0. The timer resets to 0 in one of the following cases: 1. When moving to disable or rx disable state. 2. When moving from active or phy up to polling state, while working at force mode.",Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_to_links_up MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,time_to_link_up_ext_msec,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +945,NVSWITCH-STATUS-OPCODE,STATUS-OPCODE,Opcode for advanced debug. 
String representation can be found in STATUS-MESSAGE.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_opcode MetricSample,partial_host,partial-needs-implementation,"NVUE REST link_diagnostic emits opcode as label and boolean state, not opcode as metric value",NA,NA,Advanced_Status_Opcode,"nv show interface --view link-diagnostics ""code""",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +946,NVSWITCH-STATUS-MESSAGE,STATUS-MESSAGE,String representation of STATUS-OPCODE. All Messages are terminated by a Null character ‘\0’,Status,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_message as inventory/info event or state metric with bounded labels,partial_host,partial-needs-implementation,"NVUE REST link_diagnostic emits diagnostic_status label, not message metric",NA,NA,Status_Message,"nv show interface --view link-diagnostics ""status""",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +947,NVSWITCH-DOWN-BLAME,DOWN-BLAME,Which receiver caused last link down: 0: Unknown 1: Local_phy 2: Remote_phy,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_down_blame MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,down_blame,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink
Switch BMC/HOST after branch build-test-lint review +948,NVSWITCH-LOCAL-REASON-OPCODE,LOCAL-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 3: Block_Lock_loss 4: Alignment_loss 5: FEC_sync_loss 6: PLL_lock_loss 7: FIFO_overflow 8: false_SKIP_condition 9: Minor_Error_threshold_exceeded 10: Physical_layer_retransmission_timeout 11: Heartbeat_errors 12: Link_Layer_credit_monitoring_watchdog 13: Link_Layer_integrity_threshold_exceeded 14: Link_Layer_buffer_overrun 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 19: Down_by_verification_GW 20: Received_Remote_Fault 21: Received_TS1 22: Down_by_management_command 23: Cable_was_unplugged 24: Cable_access_issue 25: Thermal_shutdown 26: Current_issue 27: Power_budget 28: Fast_recovery_raw_ber 29: Fast_recovery_effective_ber 30: Fast_recovery_symbol_ber 31: Fast_recovery_credit_watchdog 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_local_reason_opcode MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,local_reason_opcode,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +949,NVSWITCH-REMOTE-REASON-OPCODE,REMOTE-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 4: Alignment_loss 10: Physical_layer_retransmission_timeout 15: Down_by_outband_command_with_healthy_link 16:
Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 21: Received_TS1 22: Down_by_management_command 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event 38: Reset_no_power_cycle 40: Down_due_to_HW_force_event 41: Down_due_to_thermal_event 42: L1_exit_failure 43: too_many_link_error_recoveries 44: Down_due_to_contain_mode 45: BW_loss_threshold_exceeded 47: Hi_SER 48: down_by_nmx_adminstate_cmd,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_remote_reason_opcode MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,remote_reason_opcode,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +950,NVSWITCH-PHY-RECEIVED-BITS,PHY-RECEIVED-BITS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_received_bits MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/phy-received-bits,phy_received_bits,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{phy-received-bits}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +951,NVSWITCH-PORT-RCV-CONSTRAINT-ERRORS,PORT-RCV-CONSTRAINT-ERRORS,Total number of packets received on the switch physical port that are discarded for the following reasons: • FilterRawInbound is true and packet is raw • PartitionEnforcementInbound is true and packet fails partition key check or IP version check,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_constraint_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/counters/port/rcv-constraints-errors,PortRcvConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-constraint-errors}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+952,NVSWITCH-PORT-XMIT-CONSTRAINTS-ERRORS,PORT-XMIT-CONSTRAINTS-ERRORS,Total number of packets not transmitted from the switch physical port for the following reasons: • FilterRawOutbound is true and packet is raw • PartitionEnforcementOutbound is true and packet fails partition key check or IP version check,Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_out_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_out_errors,NA,/interfaces/interface [name]/state/counters/out-errors,PortXmitConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-errors}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +953,NVSWITCH-PORT-LOCAL-PHYSICAL-ERRORS,PORT-LOCAL-PHYSICAL-ERRORS,"Total number of packets received on the port that contain local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine).",Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_local_physical_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-local-physical-errors,PortLocalPhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-local-physical-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate 
on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +954,NVSWITCH-SYNC-HEADER-ERROR-COUNTER,SYNC-HEADER-ERROR-COUNTER,Count of errored block sync header on one or more lanes.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_sync_header_error_counter MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/sync-header-error-counter,SyncHeaderErrorCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{sync-header-error-counter}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +955,NVSWITCH-PORT-DLID-MAPPING-ERRORS,PORT-DLID-MAPPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to DLID mapping errors.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_dlid_mapping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-dlid-mapping-errors,PortDLIDMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-dlid-mapping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +956,NVSWITCH-LOCAL-LINK-INTEGRITY-ERRORS,LOCAL-LINK-INTEGRITY-ERRORS,The number of times that the count of local physical errors
exceeded the threshold specified by LocalPhyErrors;,Error,Integer,GB200 NVL HMC; GB200 NVL BMC; GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_local_link_integrity_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name]/infiniband/state/counters/port/local-link-integrity-errors,LocalLinkIntegrityErrors,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +957,NVSWITCH-PORT-VL-MAPPING-ERRORS,PORT-VL-MAPPING-ERRORS,"Packet discards due to VL mapping behavior are not considered errors, so the behavior of this counter is implementation dependent. However, it is recommended that this counter be used to count the total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to VL mapping behavior",Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_vl_mapping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-vl-mapping-errors,PortVLMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-vl-mapping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +958,NVSWITCH-PORT-LOOPING-ERRORS,PORT-LOOPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by 
the switch relay due to looping errors (output port = input port). This applies to switches only.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_looping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-looping-errors,PortLoopingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-looping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +959,NVSWITCH-PORT-INACTIVE-DISCARDS,PORT-INACTIVE-DISCARDS,Number of outbound packets discarded by the port because it is not in the active state.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_inactive_discards MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-inactive-discards,PortInactiveDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-inactive-discards}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +960,NVSWITCH-LINK-WIDTH-ACTIVE,LINK-WIDTH-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector
sample paths/processors,nvswitch_link_width_active MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/width,Link_width_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{lanes}}",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +961,NVSWITCH-PHY-MANAGER-STATE,PHY-MANAGER-STATE,Show some more info about the PHY state: 0:Disabled 1:Open_port 2:Polling 3:Active_or_Linkup 4:Close_port 5:Phy_up 7:Rx_disable,Status,Text,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_manager_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/phy-manager-state,Phy_Manager_State,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +962,NVSWITCH-MTU,MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mtu MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/mtu,NA,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{mtu}}",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+963,NVSWITCH-MAX-SUPPORTED-MTU,MAX-SUPPORTED-MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_max_supported_mtu MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/max-supported-mtus,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{max-supported-mtu}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +964,NVSWITCH-SUPPORTED-WIDTH,SUPPORTED-WIDTH,Maximum Transmission Unit,Specs,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_supported_width MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/supported-widths,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{supported-lanes}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +965,NVSWITCH-VL-CAPABILITIES,VL-CAPABILITIES,Maximum Transmission Unit,Specs,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl_capabilities as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping 
found,NA,interfaces/interface[name=$port_name]/infiniband/state/vl-capabilities,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{vl-capabilities}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +966,NVSWITCH-FAN-STATE,FAN-STATE,Fan status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fan_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name=FAN1/1]/state/oper-status,NA,nv show platform environment fan $FanId {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +967,NVSWITCH-FAN-LED,FAN-LED,Fan LED color,Sensor.Other,Text,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_fan_led as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,,nv show platform environment led $LedID {color},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +968,NVSWITCH-CABLE-PART-NUMBER,CABLE-PART-NUMBER,Cable part num,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_part_number as inventory/info event or state metric 
with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Cable_PN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-pn},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +969,NVSWITCH-CABLE-SERIAL-NUMBER,CABLE-SERIAL-NUMBER,Cable Serial num,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_serial_number MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,Na,NA,Cable_SN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-sn},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +970,NVSWITCH-CABLE-TRANSMITTER-TECHNOLOGY,CABLE-TRANSMITTER-TECHNOLOGY,Active/Passive cable,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_transmitter_technology as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_technology,TBD,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +971,NVSWITCH-CABLE-TYPE,CABLE-TYPE,Cable/module type: 0: Unidentified 1: Active_cable - (active copper / optics) 2: Optical_Module - (separated) 3: Passive_copper_cable 4: Cable_unplugged 5: Twisted_pair,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless
source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_type as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_type,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-type},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +972,NVSWITCH-CABLE-VENDOR,CABLE-VENDOR,Cable vendor: 0: Other 1: Mellanox 2: Known_OUI 3: NVIDIA,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_vendor as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_vendor,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-name},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +973,NVSWITCH-CABLE-LENGTH,CABLE-LENGTH,Cable length in 1m units. For CMIS modules: bits 6:7 represent cable_length_multiplier for calculating cable length 00 - 0.1 multiplier (0.1 to 6.3m) 01- 1 multiplier (1 to 63m) 10 - 10 multiplier (10 to 630m) 11 - 100 multiplier (100 to 6300m) bits 0:5 represent cable_length_value for calculating cable length. 
length is calculated with cable_length_value * cable_length_- multiplier,Specs,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_length MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_length,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-length},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +974,NVSWITCH-CABLE-IDENTIFIER,CABLE-IDENTIFIER,"0: QSFP28 1: QSFP_Plus 2: SFP28_or_SFP_Plus 3: QSA - (QSFP->SFP) 4: Backplane 5: SFP_DD 6: QSFP_DD 7: QSFP_CMIS 8: OSFP 9: C2C 10: DSFP 11: QSFP_Split_Cable identifiers that are CMIS compliant are: 5,6,7,8,10",Specs,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_identifier MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_identifier,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {identifier},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +975,NVSWITCH-CABLE-REV,CABLE-REV,ASCII Vendor revision aligned to right padded with 0h on the left,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rev as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,vendor_rev,nv show platform transceiver 
$TransceiverId {TransceiverId: {status: Inserted}} {vendor-rev},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +976,NVSWITCH-CABLE-FW-VERSION,CABLE-FW-VERSION,module FW version (relevant for optic only),Config,,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_fw_version MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_fw_version,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {firmware},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +977,NVSWITCH-CABLE-RX-POWER-LANE0,CABLE-RX-POWER-LANE0,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rx_power_lane_0,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-1{rx-power{Power}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +978,NVSWITCH-CABLE-RX-POWER-LANE1,CABLE-RX-POWER-LANE1,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rx_power_lane_1,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-2{rx-power{Power}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +979,NVSWITCH-CABLE-DIAG-SUPPLY-VOLTAGE,CABLE-DIAG-SUPPLY-VOLTAGE,Internally measured supply voltage in 100uV (relevant for optic only),Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_diag_supply_voltage MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Module_Voltage,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {voltage{voltage}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +980,NVSWITCH-CABLE-TEMP,CABLE-TEMP,Module main temperature sensor measured on a unit scale of 1/256 C degrees(relevant for optic only),Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_temp MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Module_Temperature,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {temperature{temperature}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+981,NVSWITCH-CABLE-TEMP-ALARM,CABLE-TEMP-ALARM,Temperature warning threshold on a unit scale of 1/256 C degrees.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_temp_alarm MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag,Temp_flags,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +982,NVSWITCH-CABLE-VOLTAGE-ALARM,CABLE-VOLTAGE-ALARM,Voltage warning threshold on a unit scale of 100uV.,Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_voltage_alarm MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag,Vcc_flags,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +983,NVSWITCH-CABLE-TX-CDR-LOL,CABLE-TX-CDR-LOL,Bitmask for latched Tx cdr loss of lock flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_cdr_lol as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-cdr-lol,tx_cdr_lol,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +984,NVSWITCH-CABLE-RX-CDR-LOL,CABLE-RX-CDR-LOL,Bitmask for latched Rx cdr loss of lock flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_cdr_lol as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-cdr-lol,rx_cdr_lol,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +985,NVSWITCH-CABLE-TX-LOS,CABLE-TX-LOS,Bitmask for latched Tx loss of signal flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_los as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-los,tx_los,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +986,NVSWITCH-CABLE-RX-LOS,CABLE-RX-LOS,Bitmask for latched Rx loss of signal flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_los as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-los,rx_los,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +987,NVSWITCH-LINK-PARTNER-DESCRIPTION,LINK-PARTNER-DESCRIPTION,Description of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_description as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_description,NA,,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +988,NVSWITCH-LINK-PARTNER-NODE-GUID,LINK-PARTNER-NODE-GUID,GUID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_node_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_node_guid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +989,NVSWITCH-LINK-PARTNER-LID,LINK-PARTNER-LID,LID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_lid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_lid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +990,NVSWITCH-LINK-PARTNER-PORT-NUM,LINK-PARTNER-PORT-NUM,Port number of the link partner side (port that is connected to the port),Inventory,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_port_num MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_port_num,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint 
review +1174,NVSWITCH-CPU-STATE,CPU-STATE,CPU status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name=cpu]/state/oper-status,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1241,NVSWITCH-DRIVE-TEMP-CRITICAL,DRIVE-TEMP-CRITICAL,"Critical temperature threshold for drive, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1242,NVSWITCH-DRIVE-TEMP-MAX,DRIVE-TEMP-MAX,Max temperature threshold for drive,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate 
on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1243,NVSWITCH-DRIVE-TEMP-STATE,DRIVE-TEMP-STATE,Drive Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1244,NVSWITCH-DRIVE-TEMP-CURRENT,DRIVE-TEMP-CURRENT,Drive Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1245,NVSWITCH-HSC-VINDC-TEMP-CRITICAL,HSC-VINDC-TEMP-CRITICAL,"Critical temperature threshold for HSC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform 
environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1246,NVSWITCH-HSC-VINDC-TEMP-MAX,HSC-VINDC-TEMP-MAX,Max temperature threshold for HSC,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1247,NVSWITCH-HSC-VINDC-TEMP-STATE,HSC-VINDC-TEMP-STATE,HSC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1248,NVSWITCH-HSC-VINDC-TEMP-CURRENT,HSC-VINDC-TEMP-CURRENT,HSC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent 
required,nvswitch_hsc_vindc_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1249,NVSWITCH-PDB-CONV-TEMP-CRITICAL,PDB-CONV-TEMP-CRITICAL,"Critical temperature threshold for PDB, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1251,NVSWITCH-PDB-CONV-TEMP-STATE,PDB-CONV-TEMP-STATE,PDB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1252,NVSWITCH-PDB-CONV-TEMP-CURRENT,PDB-CONV-TEMP-CURRENT,PDB Temperature - current 
temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1253,NVSWITCH-PMIC-TEMP-CRITICAL,PMIC-TEMP-CRITICAL,"Critical temperature threshold for PMIC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1255,NVSWITCH-PMIC-TEMP-STATE,PMIC-TEMP-STATE,PMIC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1256,NVSWITCH-PMIC-TEMP-CURRENT,PMIC-TEMP-CURRENT,PMIC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1259,NVSWITCH-SWB-ASIC-PCB-TEMP-STATE,SWB-ASIC-PCB-TEMP-STATE,SWB ASIC PCB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_swb_asic_pcb_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1260,NVSWITCH-SWB-ASIC-PCB-TEMP-CURRENT,SWB-ASIC-PCB-TEMP-CURRENT,SWB ASIC PCB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_swb_asic_pcb_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector 
mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1688,NVSWITCH-LINK-RECOVERY-SUCCESS-CNT,LINK-RECOVERY-SUCCESS-CNT,Successful recovery count in an active link. Counter resets on link flap.,"Status, Event",Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_recovery_success_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,successful_recovery_events,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1689,NVSWITCH-TOTAL-LINK-RECOVERY-SUCCESS-CNT,TOTAL-LINK-RECOVERY-SUCCESS-CNT,Total successful recovery count accumulated across link flaps.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_total_link_recovery_success_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,total_successful_recovery_events,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1690,NVSWITCH-TIME-SINCE-LAST-RECOVERY,TIME-SINCE-LAST-RECOVERY,"Elapsed time since last recovery event, measured in seconds.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_since_last_recovery MetricSample,gap,gap-needs-implementation,No exact current collector 
mapping found,,NA,time_since_last_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1691,NVSWITCH-TIME-BTWN-TWO-RECOVERIES,TIME-BTWN-TWO-RECOVERIES,"Time in msec between two last consecutive recoveries (success or fail) from exit of first to entry of second. When value is 0xFFFF, time is more than 1 minute.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_btwn_two_recoveries MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_between_last_2_recoveries,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1692,NVSWITCH-RECOVERY-ATTEMPTS-L1-CNT,RECOVERY-ATTEMPTS-L1-CNT,Number of first level (logical) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l1_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,last_host_logical_recovery_attempts_count,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1693,NVSWITCH-RECOVERY-ATTEMPTS-L2-CNT,RECOVERY-ATTEMPTS-L2-CNT,Number of second level (Serdes) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l2_cnt MetricSample,gap,gap-needs-implementation,No exact current collector
mapping found,,NA,last_host_serdes_feq_attempts_count,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1694,NVSWITCH-RECOVERY-CYCLE-DURATION,RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last logical recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_cycle_duration MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_in_last_host_logical_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1695,NVSWITCH-SERDES-RECOVERY-CYCLE-DURATION,SERDES-RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last Serdes recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serdes_recovery_cycle_duration MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_in_last_host_serdes_feq_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1696,NVSWITCH-CONTAIN-DRAIN-XMIT-DISCARD,CONTAIN-DRAIN-XMIT-DISCARD,Number of transmit discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_xmit_discard MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,contain_n_drain_xmit_discards,NA,,required: 
parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1697,NVSWITCH-CONTAIN-DRAIN-RCV-DISCARD,CONTAIN-DRAIN-RCV-DISCARD,Number of receive discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_rcv_discard MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,contain_n_drain_rcv_discards,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1698,NVSWITCH-DEVICE-NUM,DEVICE-NUM,Device number on tray,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_num MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,device_num_on_tray,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1699,NVSWITCH-BOARD-TYPE,BOARD-TYPE,board type,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_board_type as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,board_type,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1700,NVSWITCH-CHASSIS-SLOT-IDX,CHASSIS-SLOT-IDX,chassis slot index,Config,Integer,GB200 NVL 
NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_slot_idx MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,chassis_slot_index,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1701,NVSWITCH-TRAY-IDX,TRAY-IDX,Tray index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_tray_idx MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,tray_index,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1702,NVSWITCH-TOPOLOGY-ID,TOPOLOGY-ID,Topology Id,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_topology_id MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,topology_id,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1703,NVSWITCH-CHASSIS-ID,CHASSIS-ID,Chassis Id,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_id as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,chassis_id,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch 
build-test-lint review +1704,NVSWITCH-RAW-ERR-LANE-2,RAW-ERR-LANE-2,Raw errors lane 2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_2 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Raw_Errors_Lane_2,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1705,NVSWITCH-RAW-ERR-LANE-3,RAW-ERR-LANE-3,Raw errors lane 3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_3 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Raw_Errors_Lane_3,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1706,NVSWITCH-RQ-NUM-WRFE,RQ-NUM-WRFE,RQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_wrfe MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rq_num_wrfe,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1707,NVSWITCH-RQ-NUM-LLE,RQ-NUM-LLE,RQ num LLE,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_lle MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rq_num_lle,NA,,required: parser fixture plus 
metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1708,NVSWITCH-SQ-NUM-WRFE,SQ-NUM-WRFE,SQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_sq_num_wrfe MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,sq_num_wrfe,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2293,NVSWITCH-CABLE-OPER-STATUS,CABLE-OPER-STATUS,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_oper_status MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/transceiver-diag/state/module-oper-status,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2294,NVSWITCH-CABLE-SNR-MEDIA-LANE-N,CABLE-SNR-MEDIA-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,BLOCKER source resolution,,No catalog source listed for GB200 row,source-resolution blocker,BLOCKER: source resolution required,nvswitch_cable_snr_media_lane_n MetricSample,catalog_no_source_gap,blocker-source-resolution,Catalog row has no source listed,,NA,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2295,NVSWITCH-CABLE-SNR-HOST-LANE-N,CABLE-SNR-HOST-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,BLOCKER source resolution,,No catalog 
source listed for GB200 row,source-resolution blocker,BLOCKER: source resolution required,nvswitch_cable_snr_host_lane_n MetricSample,catalog_no_source_gap,blocker-source-resolution,Catalog row has no source listed,,NA,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2296,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_low_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-lower,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2297,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_low_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-lower,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2298,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless 
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_high_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-upper,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2299,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_high_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-upper,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md new file mode 100644 index 0000000000..184aad30dc --- /dev/null +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -0,0 +1,43 @@ +# NVSWITCH telemetry GB200 source matrix + +Generated from `.omx/artifacts/nvswitch_rows.csv` for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`: + +- `Applicable for GB200 NVL HMC` +- `Applicable for GB200 NVL BMC` +- `Applicable for GB200 NVL NvswitchTray` + +CSV matrix: `docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv` + +## Counts + +- Total GB200-applicable NVSWITCH rows: 193 + +### Implementation status + +- already-covered-regression-required: 5 +- 
blocker-source-resolution: 2 +- gap-needs-implementation: 183 +- partial-needs-implementation: 3 + +### Current coverage + +- catalog_no_source_gap: 2 +- covered_host_gnmi: 4 +- covered_host_nmxt: 1 +- gap: 183 +- partial_host: 3 + +### Primary source + +- BLOCKER source resolution: 2 +- NMX-T: 57 +- NVOS CLI: 36 +- NVOS gNMI: 97 +- Redfish Fabric/Switch/Port: 1 + +## Execution rules + +- Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete. +- Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale. +- Rows marked `blocker-source-resolution` are not deferred; they require immediate source-resolution or escalation. +- Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed. From aaf72b4a679984c81b7e85090b4778d774857844 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:07:09 -0400 Subject: [PATCH 02/25] docs(health): record nv-redfish dependency path Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- ...vswitch_telemetry_nv_redfish_dependency.md | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md diff --git a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md new file mode 100644 index 0000000000..b24a4c4b06 --- /dev/null +++ b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md @@ -0,0 +1,36 @@ +# NVSWITCH telemetry nv-redfish dependency notes + +Generated during the GB200 NVSWITCH telemetry branch setup. + +## Current infra-controller dependency state + +- `Cargo.toml` pins `nv-redfish = { version = "0.10.0" }`. 
+- `crates/health/Cargo.toml` enables standard health features but not `telemetry-service`. +- The GB200 branch uses a local `nv-redfish` worktree for development only: + - `/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps` + - Branch: `nvswitch_telemetry_gaps` + - Base: `origin/main` at `dbd2789c987fd320d263d87524fc25fde305bc7f` + +## Refreshed upstream state + +- Local `/Users/mkoci/Projects/nv-redfish` was fetched from `origin` on 2026-06-18. +- Latest observed public tags: `v0.10.2`, `v0.10.1`, `v0.10.0`. +- `origin/main` includes a `telemetry-service` feature in `redfish/features.toml`. +- `origin/main` exposes `ServiceRoot::telemetry_service()` behind the `telemetry-service` feature. +- Neither `origin/main` nor `v0.10.2` has a `fabrics` feature or generated/wrapper hits for Fabric, Switch, Port, SwitchMetrics, or PortMetrics in the inspected source. + +## Dependency conclusion + +TelemetryService MetricReports can be wired by enabling `telemetry-service` and updating `crates/health` to consume the typed `TelemetryService` APIs. + +Redfish Fabric/Switch/Port support needs companion `nv-redfish` work if GB200 live hardware or the catalog requires those paths. The companion work should add standard DMTF schema XMLs and feature entries for Fabric, Switch, Port, SwitchMetrics, PortMetrics, Endpoint, and Zone families, plus ergonomic ServiceRoot/Fabric/Switch navigation wrappers and mock tests. + +## Local development strategy + +During local development, use the `nv-redfish` worktree as a path dependency or patch only on the GB200 feature branch. Do not leave a user-local absolute path in the final MR. Before final review, replace local path usage with one of: + +1. A released `nv-redfish` version containing the companion support. +2. A reviewer-approved git revision dependency if release timing blocks final integration. +3. 
A documented two-MR handoff where the infra-controller MR names the required `nv-redfish` companion MR and keeps local-path changes out of the final diff. + +Because upstream source manifests use workspace version `0.1.0` while crates.io currently publishes `0.10.x`, local path development should use an explicit path dependency in the workspace dependency table rather than relying on the crates.io version constraint. From 8f213f13fd4835c4be7372228c9d465f19be5a6c Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:10:01 -0400 Subject: [PATCH 03/25] docs(health): clarify nv-redfish local patch strategy Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- ...vswitch_telemetry_nv_redfish_dependency.md | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md index b24a4c4b06..fb7d2ed36d 100644 --- a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md +++ b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md @@ -5,8 +5,9 @@ Generated during the GB200 NVSWITCH telemetry branch setup. ## Current infra-controller dependency state - `Cargo.toml` pins `nv-redfish = { version = "0.10.0" }`. +- `Cargo.lock` resolves `nv-redfish`, `nv-redfish-bmc-http`, `nv-redfish-core`, `nv-redfish-schema`, and `nv-redfish-csdl-compiler` to `0.10.0` from crates.io. - `crates/health/Cargo.toml` enables standard health features but not `telemetry-service`. 
-- The GB200 branch uses a local `nv-redfish` worktree for development only: +- The GB200 branch has a local `nv-redfish` worktree available for companion development only: - `/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps` - Branch: `nvswitch_telemetry_gaps` - Base: `origin/main` at `dbd2789c987fd320d263d87524fc25fde305bc7f` @@ -15,22 +16,41 @@ Generated during the GB200 NVSWITCH telemetry branch setup. - Local `/Users/mkoci/Projects/nv-redfish` was fetched from `origin` on 2026-06-18. - Latest observed public tags: `v0.10.2`, `v0.10.1`, `v0.10.0`. +- `v0.10.2` does not appear to contain Fabric/Switch/Port/NVSwitch changes relevant to this work. - `origin/main` includes a `telemetry-service` feature in `redfish/features.toml`. - `origin/main` exposes `ServiceRoot::telemetry_service()` behind the `telemetry-service` feature. - Neither `origin/main` nor `v0.10.2` has a `fabrics` feature or generated/wrapper hits for Fabric, Switch, Port, SwitchMetrics, or PortMetrics in the inspected source. ## Dependency conclusion -TelemetryService MetricReports can be wired by enabling `telemetry-service` and updating `crates/health` to consume the typed `TelemetryService` APIs. +TelemetryService MetricReports can be wired in infra-controller by enabling `telemetry-service` and consuming the typed `TelemetryService` APIs already available in nv-redfish 0.10.x. Redfish Fabric/Switch/Port support needs companion `nv-redfish` work if GB200 live hardware or the catalog requires those paths. The companion work should add standard DMTF schema XMLs and feature entries for Fabric, Switch, Port, SwitchMetrics, PortMetrics, Endpoint, and Zone families, plus ergonomic ServiceRoot/Fabric/Switch navigation wrappers and mock tests. ## Local development strategy -During local development, use the `nv-redfish` worktree as a path dependency or patch only on the GB200 feature branch. Do not leave a user-local absolute path in the final MR. 
Before final review, replace local path usage with one of: +During local development, keep user-local absolute paths out of committed manifests. Use Cargo local patching via command-line `--config` for experiments against the companion `nv-redfish` worktree, for example: -1. A released `nv-redfish` version containing the companion support. -2. A reviewer-approved git revision dependency if release timing blocks final integration. -3. A documented two-MR handoff where the infra-controller MR names the required `nv-redfish` companion MR and keeps local-path changes out of the final diff. +```bash +cargo test -p carbide-health --lib --no-run \ + --config 'patch.crates-io.nv-redfish.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/redfish"' +``` + +If companion changes touch internal nv-redfish crates, patch the affected packages too: + +```bash +cargo test -p carbide-health --lib --no-run \ + --config 'patch.crates-io.nv-redfish.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/redfish"' \ + --config 'patch.crates-io.nv-redfish-core.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/core"' \ + --config 'patch.crates-io.nv-redfish-schema.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/schema"' \ + --config 'patch.crates-io.nv-redfish-csdl-compiler.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/csdl-compiler"' \ + --config 'patch.crates-io.nv-redfish-bmc-http.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/bmc-http"' +``` -Because upstream source manifests use workspace version `0.1.0` while crates.io currently publishes `0.10.x`, local path development should use an explicit path dependency in the workspace dependency table rather than relying on the crates.io version constraint. +## Final MR strategy + +Do not commit local absolute path dependencies. 
Before final review, use one of these acceptable states: + +1. A released `nv-redfish` version containing companion support, with `Cargo.toml` and `Cargo.lock` updated accordingly. +2. A reviewer-approved git revision dependency if release timing blocks final integration. +3. A documented split where infra-controller names the required `nv-redfish` companion MR and keeps local path overrides out of the final diff. From 0fdc4c9a98219fdb8ff89d694435787120f010db Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:29:18 -0400 Subject: [PATCH 04/25] feat(health): collect GB200 NVSwitch telemetry gaps Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/Cargo.toml | 1 + crates/health/example/config.example.toml | 11 +- crates/health/src/collectors/mod.rs | 2 + crates/health/src/collectors/nmxt.rs | 237 +++++++- .../health/src/collectors/nvue/gnmi/client.rs | 27 +- .../collectors/nvue/gnmi/sample_processor.rs | 392 ++++++++++-- crates/health/src/collectors/sensors.rs | 110 +++- .../src/collectors/telemetry_service.rs | 564 ++++++++++++++++++ crates/health/src/config.rs | 171 +++++- crates/health/src/discovery/cleanup.rs | 2 + crates/health/src/discovery/context.rs | 15 +- crates/health/src/discovery/spawn.rs | 67 ++- crates/health/src/endpoint/sources.rs | 4 +- dev/bin/generate_nvswitch_gb200_matrix.py | 219 +++++-- ...vswitch_telemetry_gb200_live_validation.md | 157 +++++ .../nvswitch_telemetry_gb200_matrix.csv | 388 ++++++------ .../health/nvswitch_telemetry_gb200_matrix.md | 39 +- ...vswitch_telemetry_nv_redfish_dependency.md | 24 +- .../files/carbide-bmc-proxy.toml | 4 +- 19 files changed, 2091 insertions(+), 343 deletions(-) create mode 100644 crates/health/src/collectors/telemetry_service.rs create mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index 5423024726..1605b2bee8 100644 --- 
a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -73,6 +73,7 @@ nv-redfish = { workspace = true, features = [ "processors", "sensors", "storages", + "telemetry-service", "thermal", "update-service", "resource-status", diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 459babc400..f9f98ed26f 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -56,7 +56,7 @@ port = 443 mac = "11:22:33:44:55:77" username = "admin" password = "secret" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, nmxt_enabled = true, slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" @@ -146,6 +146,13 @@ include_sensor_thresholds = true fetch_interval = "2m" fetch_concurrency = 4 +[collectors.telemetry_service] +poll_interval = "1m" +fetch_concurrency = 4 +# Empty means collect all Redfish TelemetryService MetricReports exposed by the switch BMC. +# For GB200 live validation this can be narrowed to ["NvidiaNMMetrics_0"] if needed. +metric_report_ids = [] + [collectors.firmware] firmware_refresh_interval = "30m" @@ -219,6 +226,8 @@ system_events_enabled = true [collectors.nvue.gnmi.paths] components_enabled = true interfaces_enabled = true +# Defaults to false; enable for GB200 NVSwitch platform-general catalog leaves. 
+platform_general_enabled = true # ============================================================================== # Processors diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 6499644edf..2d3a699766 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -25,6 +25,7 @@ mod nmxt; mod nvue; mod runtime; mod sensors; +mod telemetry_service; pub use discovery::{EntityDiscoveryCollector, EntityDiscoveryCollectorConfig}; pub use entity_metrics::{MetricsCollector, MetricsCollectorConfig}; @@ -45,3 +46,4 @@ pub use runtime::{ StreamingCollectorStartContext, open_sse_stream, }; pub use sensors::{SensorCollector, SensorCollectorConfig}; +pub use telemetry_service::{TelemetryServiceCollector, TelemetryServiceCollectorConfig}; diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 7f762a2ed8..ffa24bfee3 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -17,7 +17,8 @@ //! This module collects metrics from NMX-T telemetry endpoints on NVLink switches if the service is enabled. //! Scrapes HTTP on 9352 (default for NMX-T) - NOT A Redfish collector! -//! Currently scraping for Effective BER, Symbol Errors and Link Down counter. +//! Known switch metrics are emitted with existing canonical names; all other +//! numeric Prometheus samples are preserved as source-qualified NMX-T metrics. 
use std::borrow::Cow; use std::collections::HashMap; @@ -223,30 +224,41 @@ impl NmxtCollector { let port_num = sample_labels.remove("Port_Number").unwrap_or_default(); let node_guid = sample_labels.remove("Node_GUID").unwrap_or_default(); + let known_legacy_metric = matches!( + name.as_str(), + "Effective_BER" | "Symbol_Errors" | "Link_Down" + ); let metric_type = match name.as_str() { - "Effective_BER" => "effective_ber", - "Symbol_Errors" => "symbol_errors", - "Link_Down" => "link_down", - _ => continue, + "Effective_BER" => "effective_ber".to_string(), + "Symbol_Errors" => "symbol_errors".to_string(), + "Link_Down" => "link_down".to_string(), + _ => sanitize_metric_token(&name), }; - let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); - metric_key.push_str(metric_type); - metric_key.push(':'); - metric_key.push_str(&port_num); + let metric_key = if known_legacy_metric { + legacy_metric_key(&metric_type, &port_num) + } else { + generic_metric_key(&metric_type, &name, &port_num, &node_guid, &sample_labels) + }; - let labels = vec![ + let mut labels = vec![ (Cow::Borrowed("switch_id"), self.switch_id.clone()), (Cow::Borrowed("switch_ip"), switch_ip.clone()), (Cow::Borrowed("node_guid"), node_guid), (Cow::Borrowed("port_num"), port_num), ]; + if !known_legacy_metric { + labels.push((Cow::Borrowed("source_metric"), name)); + } + for (label_name, label_value) in sample_labels { + labels.push((Cow::Owned(sanitize_label_name(&label_name)), label_value)); + } self.emit_event(CollectorEvent::Metric( MetricSample { key: metric_key, name: "switch_nmxt".to_string(), - metric_type: metric_type.to_string(), + metric_type, unit: "count".to_string(), value, labels, @@ -262,6 +274,122 @@ impl NmxtCollector { } } +fn legacy_metric_key(metric_type: &str, port_num: &str) -> String { + let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); + metric_key.push_str(metric_type); + metric_key.push(':'); + 
metric_key.push_str(port_num); + metric_key +} + +fn generic_metric_key( + metric_type: &str, + source_metric: &str, + port_num: &str, + node_guid: &str, + sample_labels: &HashMap, +) -> String { + let mut metric_key = metric_type.to_string(); + + append_metric_key_identity(&mut metric_key, "port_num", port_num); + append_metric_key_identity(&mut metric_key, "source_metric", source_metric); + append_metric_key_identity(&mut metric_key, "node_guid", node_guid); + + let mut identity_labels = sample_labels + .iter() + .map(|(label_name, label_value)| (sanitize_label_name(label_name), label_name, label_value)) + .collect::>(); + identity_labels.sort_by( + |(left_sanitized, left_name, left_value), (right_sanitized, right_name, right_value)| { + left_sanitized + .cmp(right_sanitized) + .then_with(|| left_name.cmp(right_name)) + .then_with(|| left_value.cmp(right_value)) + }, + ); + + for (_, label_name, label_value) in identity_labels { + append_metric_key_identity(&mut metric_key, "label_name", label_name); + append_metric_key_identity(&mut metric_key, "label_value", label_value); + } + + metric_key +} + +fn append_metric_key_identity( + metric_key: &mut String, + component_name: &str, + component_value: &str, +) { + if component_value.is_empty() { + return; + } + metric_key.push(':'); + metric_key.push_str(&escape_metric_key_component(component_name)); + metric_key.push('='); + metric_key.push_str(&escape_metric_key_component(component_value)); +} + +fn escape_metric_key_component(value: &str) -> String { + let mut escaped = String::with_capacity(value.len()); + for byte in value.bytes() { + match byte { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' 
| b'~' => { + escaped.push(byte as char); + } + _ => { + escaped.push('%'); + escaped.push(hex_digit(byte >> 4)); + escaped.push(hex_digit(byte & 0x0f)); + } + } + } + escaped +} + +fn hex_digit(nibble: u8) -> char { + match nibble { + 0..=9 => (b'0' + nibble) as char, + 10..=15 => (b'A' + nibble - 10) as char, + _ => unreachable!("hex nibble is always <= 15"), + } +} + +fn sanitize_metric_token(value: &str) -> String { + let mut token = String::with_capacity(value.len()); + let mut previous_was_separator = false; + let chars = value.chars().collect::>(); + for (index, ch) in chars.iter().copied().enumerate() { + if ch.is_ascii_alphanumeric() { + let previous = index.checked_sub(1).and_then(|i| chars.get(i)).copied(); + let next = chars.get(index + 1).copied(); + let starts_word = ch.is_ascii_uppercase() + && !previous_was_separator + && previous.is_some_and(|prev| prev.is_ascii_alphanumeric()) + && (previous + .is_some_and(|prev| prev.is_ascii_lowercase() || prev.is_ascii_digit()) + || next.is_some_and(|next| next.is_ascii_lowercase())); + if starts_word { + token.push('_'); + } + token.push(ch.to_ascii_lowercase()); + previous_was_separator = false; + } else if !previous_was_separator { + token.push('_'); + previous_was_separator = true; + } + } + token.trim_matches('_').to_string() +} + +fn sanitize_label_name(value: &str) -> String { + let mut label = sanitize_metric_token(value); + if label.chars().next().is_some_and(|ch| ch.is_ascii_digit()) { + label.insert(0, '_'); + } + label +} + #[cfg(test)] mod tests { use super::*; @@ -304,4 +432,91 @@ Link_Down{Port_Number="1"} 5 let samples = parse_prometheus_metrics(body); assert_eq!(samples.len(), 4); } + + #[test] + fn unknown_nmxt_metric_names_are_sanitized_instead_of_dropped() { + assert_eq!( + sanitize_metric_token("PortMalformedPacketErrors"), + "port_malformed_packet_errors" + ); + assert_eq!(sanitize_label_name("Lane-Number"), "lane_number"); + assert_eq!(sanitize_label_name("8b10b"), "_8b10b"); + } + + 
#[test] + fn generic_metric_key_includes_sorted_extra_label_identity() { + let labels = HashMap::from([ + ("Lane-Number".to_string(), "3".to_string()), + ("Device".to_string(), "nvswitch0".to_string()), + ]); + + assert_eq!( + generic_metric_key( + "port_malformed_packet_errors", + "PortMalformedPacketErrors", + "4", + "0x8e2161c8803caf64", + &labels, + ), + "port_malformed_packet_errors:port_num=4:source_metric=PortMalformedPacketErrors:node_guid=0x8e2161c8803caf64:label_name=Device:label_value=nvswitch0:label_name=Lane-Number:label_value=3" + ); + } + + #[test] + fn generic_metric_key_includes_raw_source_metric_to_avoid_sanitized_name_aliasing() { + let labels = HashMap::new(); + + assert_ne!( + generic_metric_key("rx_errors", "RxErrors", "1", "", &labels), + generic_metric_key("rx_errors", "rx-errors", "1", "", &labels), + ); + } + + #[test] + fn generic_metric_key_escapes_identity_delimiters_to_avoid_aliasing() { + let labels_with_delimiter_value = HashMap::from([("b".to_string(), "c:d=e".to_string())]); + let labels_split_by_delimiters = HashMap::from([ + ("b".to_string(), "c".to_string()), + ("d".to_string(), "e".to_string()), + ]); + + assert_ne!( + generic_metric_key( + "rx_errors", + "RxErrors", + "1", + "", + &labels_with_delimiter_value + ), + generic_metric_key( + "rx_errors", + "RxErrors", + "1", + "", + &labels_split_by_delimiters + ) + ); + + assert_ne!( + generic_metric_key( + "rx_errors", + "RxErrors:node_guid=x", + "1", + "", + &HashMap::new() + ), + generic_metric_key("rx_errors", "RxErrors", "1", "x", &HashMap::new()) + ); + } + + #[test] + fn generic_metric_key_distinguishes_same_port_samples_by_extra_labels() { + let first = HashMap::from([("Lane".to_string(), "0".to_string())]); + let second = HashMap::from([("Lane".to_string(), "1".to_string())]); + + assert_ne!( + generic_metric_key("rx_errors", "RxErrors", "1", "", &first), + generic_metric_key("rx_errors", "RxErrors", "1", "", &second) + ); + } } diff --git 
a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 3560b81d99..01544d0b08 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -31,7 +31,7 @@ use crate::HealthError; use crate::config::NvueGnmiPaths; pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { - let mut paths = Vec::with_capacity(2); + let mut paths = Vec::with_capacity(3); if paths_config.components_enabled { paths.push(Path { elem: vec![ @@ -62,6 +62,15 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { ..Default::default() }); } + if paths_config.platform_general_enabled { + paths.push(Path { + elem: vec![PathElem { + name: "platform-general".into(), + key: Default::default(), + }], + ..Default::default() + }); + } paths } @@ -439,7 +448,7 @@ mod tests { } #[test] - fn test_nvue_subscribe_paths_all_enabled() { + fn test_nvue_subscribe_paths_defaults_do_not_enable_platform_general() { let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); assert_eq!(paths.len(), 2); @@ -452,11 +461,24 @@ mod tests { assert_eq!(paths[1].elem[1].name, "interface"); } + #[test] + fn test_nvue_subscribe_paths_all_enabled() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: true, + interfaces_enabled: true, + platform_general_enabled: true, + }); + assert_eq!(paths.len(), 3); + assert_eq!(paths[2].elem.len(), 1); + assert_eq!(paths[2].elem[0].name, "platform-general"); + } + #[test] fn test_nvue_subscribe_paths_selective() { let paths = nvue_subscribe_paths(&NvueGnmiPaths { components_enabled: false, interfaces_enabled: true, + platform_general_enabled: false, }); assert_eq!(paths.len(), 1); assert_eq!(paths[0].elem.len(), 2); @@ -469,6 +491,7 @@ mod tests { let paths = nvue_subscribe_paths(&NvueGnmiPaths { components_enabled: false, interfaces_enabled: false, + platform_general_enabled: false, }); assert!(paths.is_empty()); } diff --git 
a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 20c06e3854..2d34f0dc45 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -94,10 +94,20 @@ impl GnmiSampleProcessor { if let Some(iface) = find_elem_key_ref(&combined, "interface", "name") { entities.insert(("interface", iface)); - self.process_interface_metric(&combined, iface, val); + if !self.process_interface_metric(&combined, iface, val) { + self.emit_generic_leaf_metric(&combined, "interface", iface, val); + } } else if let Some(comp) = find_elem_key_ref(&combined, "component", "name") { entities.insert(("component", comp)); - self.process_component_metric(&combined, comp, val); + if !self.process_component_metric(&combined, comp, val) { + self.emit_generic_leaf_metric(&combined, "component", comp, val); + } + } else if combined + .first() + .is_some_and(|elem| elem.name == "platform-general") + { + entities.insert(("platform", "platform-general")); + self.emit_generic_leaf_metric(&combined, "platform", "platform-general", val); } } @@ -109,7 +119,7 @@ impl GnmiSampleProcessor { elems: &[&PathElem], iface_name: &str, val: &proto::TypedValue, - ) { + ) -> bool { if leaf_matches(elems, &["state", "oper-status"]) { let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); self.emit_data_metric( @@ -120,63 +130,62 @@ impl GnmiSampleProcessor { "interface_name", iface_name, ); - } else if leaf_matches(elems, &["state", "counters", "in-errors"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( + true + } else if leaf_matches(elems, &["state", "counters", "in-errors"]) { + self.emit_numeric_metric_if_valid( "interface_in_errors", - iface_name, - v, "count", "interface_name", iface_name, + elems, + val, ); - } else if leaf_matches(elems, &["state", "counters", "out-errors"]) - && let Some(v) = typed_value_to_f64(val) - { - 
self.emit_data_metric( + true + } else if leaf_matches(elems, &["state", "counters", "out-errors"]) { + self.emit_numeric_metric_if_valid( "interface_out_errors", - iface_name, - v, "count", "interface_name", iface_name, + elems, + val, ); - } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( + true + } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) { + self.emit_numeric_metric_if_valid( "interface_effective_ber", - iface_name, - v, "ratio", "interface_name", iface_name, + elems, + val, ); - } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( + true + } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) { + self.emit_numeric_metric_if_valid( "interface_symbol_ber", - iface_name, - v, "ratio", "interface_name", iface_name, + elems, + val, ); + true } else if leaf_matches( elems, &["phy-diag", "state", "unintentional-link-down-events"], - ) && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( + ) { + self.emit_numeric_metric_if_valid( "interface_link_down_events", - iface_name, - v, "count", "interface_name", iface_name, + elems, + val, ); + true + } else { + false } } @@ -185,7 +194,7 @@ impl GnmiSampleProcessor { elems: &[&PathElem], comp_name: &str, val: &proto::TypedValue, - ) { + ) -> bool { if leaf_matches(elems, &["healthz", "state", "status"]) { let v = component_health_to_f64(typed_value_to_string(val).as_deref()); self.emit_data_metric( @@ -196,18 +205,94 @@ impl GnmiSampleProcessor { "component_name", comp_name, ); - } else if leaf_matches(elems, &["state", "temperature", "instant"]) - && let Some(v) = typed_value_to_f64(val) - { - self.emit_data_metric( + true + } else if leaf_matches(elems, &["state", "temperature", "instant"]) { + self.emit_numeric_metric_if_valid( "component_temperature_celsius", - comp_name, - v, 
"celsius", "component_name", comp_name, + elems, + val, + ); + true + } else { + false + } + } + + fn emit_generic_leaf_metric( + &self, + elems: &[&PathElem], + entity_label_name: &'static str, + entity_label_value: &str, + val: &proto::TypedValue, + ) { + let Some(sink) = &self.data_sink else { return }; + let Some(leaf_name) = elems.last().map(|elem| elem.name.as_str()) else { + return; + }; + let Some((value, unit)) = typed_value_to_metric_value(val) else { + return; + }; + let metric_type = catalog_metric_type_for_leaf(leaf_name) + .map(str::to_string) + .unwrap_or_else(|| { + let leaf = sanitize_metric_token(leaf_name); + format!("nvswitch_{leaf}") + }); + let path = path_string(elems); + + let key = format!("{metric_type}:{entity_label_value}:{path}"); + let labels = vec![ + ( + Cow::Borrowed(entity_label_name), + entity_label_value.to_string(), + ), + (Cow::Borrowed("source_path"), path), + ]; + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type, + unit, + value, + labels, + context: None, + })), + ); + } + + fn emit_numeric_metric_if_valid( + &self, + metric_type: &str, + unit: &str, + entity_label_name: &'static str, + entity_label_value: &str, + elems: &[&PathElem], + val: &proto::TypedValue, + ) { + if let Some(value) = typed_value_to_f64(val).filter(|value| value.is_finite()) { + self.emit_data_metric( + metric_type, + entity_label_value, + value, + unit, + entity_label_name, + entity_label_value, ); + return; } + + tracing::warn!( + metric_type, + source_path = %path_string(elems), + entity_label_name, + entity_label_value, + "nvue_gnmi SAMPLE: skipping known numeric leaf with invalid value" + ); } fn emit_data_metric( @@ -248,6 +333,103 @@ impl GnmiSampleProcessor { } } +fn typed_value_to_metric_value(value: &proto::TypedValue) -> Option<(f64, String)> { + if let Some(value) = typed_value_to_f64(value) { + return 
value.is_finite().then_some((value, "value".to_string())); + } + let raw = typed_value_to_string(value)?; + if raw.eq_ignore_ascii_case("up") + || raw.eq_ignore_ascii_case("healthy") + || raw.eq_ignore_ascii_case("true") + { + return Some((1.0, "state".to_string())); + } + if raw.eq_ignore_ascii_case("down") + || raw.eq_ignore_ascii_case("unhealthy") + || raw.eq_ignore_ascii_case("false") + { + return Some((0.0, "state".to_string())); + } + Some((1.0, "info".to_string())) +} + +fn path_string(elems: &[&PathElem]) -> String { + elems + .iter() + .map(|elem| { + if elem.key.is_empty() { + elem.name.clone() + } else { + let mut keys = elem + .key + .iter() + .map(|(key, value)| format!("{key}={value}")) + .collect::>(); + keys.sort(); + format!("{}[{}]", elem.name, keys.join(",")) + } + }) + .collect::>() + .join("/") +} + +fn catalog_metric_type_for_leaf(leaf_name: &str) -> Option<&'static str> { + match leaf_name { + "link-downed" => Some("nvswitch_link_downed_counter"), + "port-malformed-packet-errors" => Some("nvswitch_port_malformed_packet_errors"), + "port-neighbor-mtu-discards" => Some("nvswitch_port_neighbor_mtu_discards"), + "out-discards" => Some("nvswitch_port_xmit_discards"), + "rcv-remote-phy-errors" => Some("nvswitch_port_rcv_remote_physical_errors"), + "rcv-switch-relay-errors" => Some("nvswitch_port_rcv_switch_relay_errors"), + "qp1-dropped" => Some("nvswitch_qp1dropped"), + "vl15-dropped" => Some("nvswitch_vl15_dropped"), + "physical-port-state" => Some("nvswitch_nvlink_status"), + "link-error-recovery" => Some("nvswitch_link_error_recovery_counter"), + "port-multi-cast-rcv-pkts" => Some("nvswitch_port_multicast_rcv_pkts"), + "port-multi-cast-xmit-pkts" => Some("nvswitch_port_multicast_xmit_pkts"), + "in-octets" => Some("nvswitch_port_rcv_data"), + "in-pkts" => Some("nvswitch_port_rcv_pkts"), + "port-uni-cast-rcv-pkts" => Some("nvswitch_port_unicast_rcv_pkts"), + "port-uni-cast-xmit-pkts" => Some("nvswitch_port_unicast_xmit_pkts"), + "out-octets" => 
Some("nvswitch_port_xmit_data"), + "out-pkts" => Some("nvswitch_port_xmit_pkts"), + "xmit-wait" => Some("nvswitch_port_xmit_wait"), + "raw-ber" => Some("nvswitch_raw_ber"), + "zero-hist" => Some("nvswitch_zero_hist"), + "raw-errors-ch-1" => Some("nvswitch_phy_raw_errors_lane0"), + "raw-errors-ch-2" => Some("nvswitch_phy_raw_errors_lane1"), + "raw-ber-ch-1" => Some("nvswitch_raw_ber_lane0"), + "raw-ber-ch-2" => Some("nvswitch_raw_ber_lane1"), + "effective-errors" => Some("nvswitch_phy_effective_errors"), + "time-since-last-clear-min" => Some("nvswitch_time_since_lasts_clear"), + "excessive-buffer-overrun" => Some("nvswitch_port_buffer_overrun_errors"), + "speed" => Some("nvswitch_link_speed_active"), + "width" => Some("nvswitch_link_width_active"), + "mtu" => Some("nvswitch_mtu"), + "max-supported-mtus" => Some("nvswitch_max_supported_mtu"), + "supported-widths" => Some("nvswitch_supported_width"), + "vl-capabilities" => Some("nvswitch_vl_capabilities"), + "local-link-integrity-errors" => Some("nvswitch_local_link_integrity_errors"), + "module-oper-status" => Some("nvswitch_cable_oper_status"), + _ => None, + } +} + +fn sanitize_metric_token(value: &str) -> String { + let mut token = String::with_capacity(value.len()); + let mut previous_was_separator = false; + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + token.push(ch.to_ascii_lowercase()); + previous_was_separator = false; + } else if !previous_was_separator { + token.push('_'); + previous_was_separator = true; + } + } + token.trim_matches('_').to_string() +} + fn find_elem_key_ref<'a>( elems: &[&'a PathElem], elem_name: &str, @@ -460,6 +642,138 @@ mod tests { assert_eq!(count, 1); } + #[test] + fn unmapped_interface_leaf_emits_catalog_metric_sample() { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + 
make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("port-malformed-packet-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_uint(9)), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1); + let CollectorEvent::Metric(sample) = &events[0].1 else { + panic!("expected metric event"); + }; + assert_eq!(sample.metric_type, "nvswitch_port_malformed_packet_errors"); + assert_eq!(sample.value, 9.0); + assert!( + sample + .labels + .iter() + .any(|(key, value)| key.as_ref() == "interface" && value == "nvl4") + ); + } + + #[test] + fn known_numeric_interface_leaf_with_invalid_value_does_not_emit_generic_info_metric() { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl4")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("counters", &[]), + make_path_elem("in-errors", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("N/A")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + + let events = sink.events.lock().expect("lock poisoned"); + assert!( + events.is_empty(), + "known numeric leaf with invalid value must be skipped instead of emitted as generic info" + ); + } + + #[test] + fn 
platform_general_string_leaf_emits_info_metric() { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![make_path_elem("platform-general", &[])], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("state", &[]), + make_path_elem("platform-name", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("gb200-switch-a")), + ..Default::default() + }], + ..Default::default() + }; + + let count = proc.process_notification(¬ification); + assert_eq!(count, 1); + + let events = sink.events.lock().expect("lock poisoned"); + let CollectorEvent::Metric(sample) = &events[0].1 else { + panic!("expected metric event"); + }; + assert_eq!(sample.metric_type, "nvswitch_platform_name"); + assert_eq!(sample.unit, "info"); + assert_eq!(sample.value, 1.0); + assert!( + sample + .labels + .iter() + .all(|(key, _)| key.as_ref() != "leaf_value") + ); + assert!(sample.labels.iter().any(|(key, value)| { + key.as_ref() == "source_path" && value == "platform-general/state/platform-name" + })); + } + #[test] fn emitted_metrics_preserve_switch_position_context() { use std::str::FromStr; diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index d05275d05c..0b4926a5a6 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -27,9 +27,31 @@ use crate::HealthError; use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; -use crate::metrics::sanitize_unit; +use crate::metrics::{MetricLabel, sanitize_unit}; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample, SensorThresholdContext}; +#[derive(Clone, Copy)] +enum 
SensorRangeKind { + Max, + Min, +} + +impl SensorRangeKind { + fn metric_suffix(self) -> &'static str { + match self { + Self::Max => "range_max", + Self::Min => "range_min", + } + } + + fn label_value(self) -> &'static str { + match self { + Self::Max => "reading_range_max", + Self::Min => "reading_range_min", + } + } +} + /// Configuration for the sensor collector. pub struct SensorCollectorConfig { pub data_sink: Option>, @@ -256,6 +278,8 @@ impl SensorCollector { let metric_type = reading_type.to_snake_case().to_string(); let unit = sanitize_unit(&unit); + let range_max = sensor.reading_range_max.flatten(); + let range_min = sensor.reading_range_min.flatten(); let ( upper_fatal, @@ -299,10 +323,10 @@ impl SensorCollector { MetricSample { key: sensor.odata_id().to_string(), name: "hw_sensor".to_string(), - metric_type, - unit, + metric_type: metric_type.clone(), + unit: unit.clone(), value: reading, - labels: attributes, + labels: attributes.clone(), context: Some(SensorThresholdContext { entity_type: entity.entity_type().to_string(), sensor_id: sensor.base.id.clone(), @@ -312,14 +336,88 @@ impl SensorCollector { lower_critical, upper_caution, lower_caution, - range_max: sensor.reading_range_max.flatten(), - range_min: sensor.reading_range_min.flatten(), + range_max, + range_min, bmc_health, }), } .into(), )); + if self.include_sensor_thresholds { + self.emit_sensor_range_metric( + sensor.odata_id().to_string(), + &metric_type, + &unit, + &attributes, + SensorRangeKind::Max, + range_max, + ); + self.emit_sensor_range_metric( + sensor.odata_id().to_string(), + &metric_type, + &unit, + &attributes, + SensorRangeKind::Min, + range_min, + ); + } + 1 } + + fn emit_sensor_range_metric( + &self, + sensor_key: String, + reading_type: &str, + unit: &str, + attributes: &[MetricLabel], + range_kind: SensorRangeKind, + value: Option, + ) { + let Some(value) = value else { return }; + let metric_suffix = range_kind.metric_suffix(); + let mut labels = attributes.to_vec(); + 
labels.push(( + Cow::Borrowed("sensor_range"), + range_kind.label_value().to_string(), + )); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("{sensor_key}/{metric_suffix}"), + name: "hw_sensor".to_string(), + metric_type: format!("{reading_type}_{metric_suffix}"), + unit: unit.to_string(), + value, + labels, + context: None, + } + .into(), + )); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sensor_range_kind_uses_documented_metric_suffixes_and_label_values() { + assert_eq!(SensorRangeKind::Max.metric_suffix(), "range_max"); + assert_eq!(SensorRangeKind::Max.label_value(), "reading_range_max"); + assert_eq!(SensorRangeKind::Min.metric_suffix(), "range_min"); + assert_eq!(SensorRangeKind::Min.label_value(), "reading_range_min"); + } + + #[test] + fn sensor_range_metric_contract_matches_matrix_surface() { + let reading_type = "fan_speed"; + let range_kind = SensorRangeKind::Max; + + assert_eq!( + format!("{reading_type}_{}", range_kind.metric_suffix()), + "fan_speed_range_max" + ); + assert_eq!(range_kind.label_value(), "reading_range_max"); + } } diff --git a/crates/health/src/collectors/telemetry_service.rs b/crates/health/src/collectors/telemetry_service.rs new file mode 100644 index 0000000000..9bb904805b --- /dev/null +++ b/crates/health/src/collectors/telemetry_service.rs @@ -0,0 +1,564 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Redfish TelemetryService MetricReport collection. + +use std::borrow::Cow; +use std::collections::HashSet; +use std::sync::Arc; + +use futures::{StreamExt, stream}; +use nv_redfish::ServiceRoot; +use nv_redfish::core::{Bmc, EntityTypeRef}; +use nv_redfish::schema::metric_report::{MetricReport, MetricValue}; + +use crate::HealthError; +use crate::collectors::{IterationResult, PeriodicCollector}; +use crate::config::TelemetryServiceCollectorConfig as TelemetryServiceCollectorOptions; +use crate::endpoint::BmcEndpoint; +use crate::metrics::MetricLabel; +use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; + +pub struct TelemetryServiceCollectorConfig { + pub data_sink: Option>, + pub options: TelemetryServiceCollectorOptions, +} + +pub struct TelemetryServiceCollector { + bmc: Arc, + event_context: EventContext, + data_sink: Option>, + metric_report_ids: HashSet, + fetch_concurrency: usize, +} + +impl PeriodicCollector for TelemetryServiceCollector { + type Config = TelemetryServiceCollectorConfig; + + fn new_runner( + bmc: Arc, + endpoint: Arc, + config: Self::Config, + ) -> Result { + Ok(Self { + bmc, + event_context: EventContext::from_endpoint( + endpoint.as_ref(), + "redfish_telemetry_service", + ), + data_sink: config.data_sink, + metric_report_ids: config.options.metric_report_ids.into_iter().collect(), + fetch_concurrency: config.options.fetch_concurrency.max(1), + }) + } + + async fn run_iteration(&mut self) -> Result { + self.collect_metric_reports().await + } + + fn collector_type(&self) -> &'static str { + "redfish_telemetry_service" + } + + async fn stop(&mut self) { + self.emit_event(CollectorEvent::CollectorRemoved); + } +} + +impl TelemetryServiceCollector { + fn emit_event(&self, event: CollectorEvent) { + if let Some(data_sink) = &self.data_sink { + data_sink.handle_event(&self.event_context, &event); + } + } + + 
async fn collect_metric_reports(&self) -> Result { + let root = ServiceRoot::new(self.bmc.clone()) + .await + .map_err(|error| HealthError::BmcError(Box::new(error)))?; + + let Some(telemetry_service) = root + .telemetry_service() + .await + .map_err(|error| HealthError::BmcError(Box::new(error)))? + else { + tracing::debug!("BMC endpoint does not expose Redfish TelemetryService"); + return Ok(IterationResult { + refresh_triggered: true, + entity_count: Some(0), + fetch_failures: 0, + }); + }; + + let Some(metric_report_links) = telemetry_service + .metric_report_links() + .await + .map_err(|error| HealthError::BmcError(Box::new(error)))? + else { + tracing::debug!("Redfish TelemetryService has no MetricReports collection"); + return Ok(IterationResult { + refresh_triggered: true, + entity_count: Some(0), + fetch_failures: 0, + }); + }; + + let requested_ids = &self.metric_report_ids; + let fetch_concurrency = self.fetch_concurrency; + let reports = stream::iter(metric_report_links) + .filter(|link| { + let include = requested_ids.is_empty() + || link + .odata_id() + .last_segment() + .is_some_and(|id| requested_ids.contains(id)); + async move { include } + }) + .map(|link| async move { + let report_id = link.odata_id().to_string(); + link.fetch().await.map(|report| (report_id, report)) + }) + .buffer_unordered(fetch_concurrency) + .collect::>() + .await; + + self.emit_event(CollectorEvent::MetricCollectionStart); + + let mut sample_count = 0; + let mut fetch_failures = 0; + for result in reports { + match result { + Ok((report_uri, report)) => { + for sample in metric_samples_from_report(&report, &report_uri) { + sample_count += 1; + self.emit_event(CollectorEvent::Metric(sample.into())); + } + } + Err(error) => { + fetch_failures += 1; + tracing::warn!(?error, "failed to fetch Redfish MetricReport"); + } + } + } + + self.emit_event(CollectorEvent::MetricCollectionEnd); + + Ok(IterationResult { + refresh_triggered: true, + entity_count: Some(sample_count), + 
fetch_failures, + }) + } +} + +fn metric_samples_from_report(report: &MetricReport, report_uri: &str) -> Vec { + let report_id = report.base.id.as_str(); + let report_definition = report + .metric_report_definition + .as_ref() + .map(|reference| reference.odata_id().to_string()); + + report + .metric_values + .as_deref() + .unwrap_or_default() + .iter() + .filter_map(|metric| { + metric_sample_from_value(report_id, report_uri, report_definition.as_deref(), metric) + }) + .collect() +} + +fn metric_sample_from_value( + report_id: &str, + report_uri: &str, + report_definition: Option<&str>, + metric: &MetricValue, +) -> Option { + let raw_value = nested_optional_str(&metric.metric_value)?; + let metric_id = nested_optional_str(&metric.metric_id); + let metric_property = nested_optional_str(&metric.metric_property); + let (value, unit) = metric_value_to_f64(raw_value)?; + let metric_identity = metric_identity(metric_id, metric_property).or_else(|| { + tracing::warn!( + report_id, + report_uri, + "Skipping Redfish MetricReport value without MetricId or MetricProperty" + ); + None + })?; + let metric_type = metric_type(metric_id, metric_property)?; + + let mut labels: Vec = vec![ + (Cow::Borrowed("report_id"), report_id.to_string()), + (Cow::Borrowed("report_uri"), report_uri.to_string()), + ]; + if let Some(metric_id) = metric_id { + labels.push((Cow::Borrowed("metric_id"), metric_id.to_string())); + } + if let Some(metric_property) = metric_property { + labels.push(( + Cow::Borrowed("metric_property"), + metric_property.to_string(), + )); + } + if let Some(report_definition) = report_definition { + labels.push(( + Cow::Borrowed("metric_report_definition"), + report_definition.to_string(), + )); + } + labels.push((Cow::Borrowed("metric_identity"), metric_identity)); + let key = metric_sample_key(report_id, metric_id, metric_property)?; + Some(MetricSample { + key, + name: "redfish_telemetry_service".to_string(), + metric_type, + unit, + value, + labels, + context: 
None, + }) +} + +fn nested_optional_str(value: &Option>) -> Option<&str> { + value.as_ref().and_then(|inner| inner.as_deref()) +} + +fn metric_value_to_f64(raw: &str) -> Option<(f64, String)> { + if raw.eq_ignore_ascii_case("true") { + return Some((1.0, "state".to_string())); + } + if raw.eq_ignore_ascii_case("false") { + return Some((0.0, "state".to_string())); + } + if let Ok(value) = raw.parse::() { + return value.is_finite().then_some((value, "value".to_string())); + } + + Some((1.0, "info".to_string())) +} + +fn metric_identity(metric_id: Option<&str>, metric_property: Option<&str>) -> Option { + let mut parts = Vec::new(); + if let Some(metric_id) = metric_id.and_then(non_empty) { + let token = sanitize_metric_token(metric_id); + if !token.is_empty() { + parts.push(format!("metric_id:{token}")); + } + } + if let Some(metric_property) = metric_property.and_then(non_empty) { + let token = sanitize_metric_token(metric_property); + if !token.is_empty() { + parts.push(format!("metric_property:{token}")); + } + } + (!parts.is_empty()).then(|| parts.join(":")) +} + +fn metric_sample_key( + report_id: &str, + metric_id: Option<&str>, + metric_property: Option<&str>, +) -> Option { + let mut parts = Vec::new(); + if let Some(metric_id) = metric_id.and_then(non_empty) { + parts.push(format!( + "metric_id={}", + escape_metric_key_component(metric_id) + )); + } + if let Some(metric_property) = metric_property.and_then(non_empty) { + parts.push(format!( + "metric_property={}", + escape_metric_key_component(metric_property) + )); + } + + (!parts.is_empty()).then(|| format!("{report_id}:{}", parts.join(":"))) +} + +fn escape_metric_key_component(value: &str) -> String { + let mut escaped = String::with_capacity(value.len()); + for byte in value.bytes() { + match byte { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' 
| b'~' => { + escaped.push(byte as char); + } + _ => { + escaped.push('%'); + escaped.push(hex_digit(byte >> 4)); + escaped.push(hex_digit(byte & 0x0f)); + } + } + } + escaped +} + +fn hex_digit(nibble: u8) -> char { + match nibble { + 0..=9 => (b'0' + nibble) as char, + 10..=15 => (b'A' + nibble - 10) as char, + _ => unreachable!("hex nibble is always <= 15"), + } +} + +fn metric_type(metric_id: Option<&str>, metric_property: Option<&str>) -> Option { + metric_id + .and_then(non_empty) + .or_else(|| metric_property.and_then(last_path_segment)) + .map(sanitize_metric_token) + .filter(|token| !token.is_empty()) +} + +fn non_empty(value: &str) -> Option<&str> { + (!value.is_empty()).then_some(value) +} + +fn last_path_segment(value: &str) -> Option<&str> { + let pointer = value + .split_once('#') + .map(|(_, pointer)| pointer) + .filter(|pointer| !pointer.is_empty()); + let path = pointer.unwrap_or(value).trim_end_matches('/'); + path.rsplit('/').find(|segment| !segment.is_empty()) +} + +fn sanitize_metric_token(value: &str) -> String { + let mut token = String::with_capacity(value.len()); + let mut previous_was_separator = false; + let chars = value.chars().collect::>(); + for (index, ch) in chars.iter().copied().enumerate() { + if ch.is_ascii_alphanumeric() { + let previous = index.checked_sub(1).and_then(|i| chars.get(i)).copied(); + let next = chars.get(index + 1).copied(); + let starts_word = ch.is_ascii_uppercase() + && !previous_was_separator + && previous.is_some_and(|prev| prev.is_ascii_alphanumeric()) + && (previous + .is_some_and(|prev| prev.is_ascii_lowercase() || prev.is_ascii_digit()) + || next.is_some_and(|next| next.is_ascii_lowercase())); + if starts_word { + token.push('_'); + } + token.push(ch.to_ascii_lowercase()); + previous_was_separator = false; + } else if !previous_was_separator { + token.push('_'); + previous_was_separator = true; + } + } + token.trim_matches('_').to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn 
metric_report_values_emit_numeric_and_info_samples() { + let report: MetricReport = serde_json::from_value(serde_json::json!({ + "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "@odata.type": "#MetricReport.v1_3_0.MetricReport", + "Id": "NvidiaNMMetrics_0", + "Name": "NVIDIA NVSwitch metrics", + "MetricReportDefinition": { + "@odata.id": "/redfish/v1/TelemetryService/MetricReportDefinitions/NvidiaNMMetrics" + }, + "MetricValues": [ + { + "MetricId": "PortMalformedPacketErrors", + "MetricValue": "17", + "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/Oem/Nvidia/MalformedPackets" + }, + { + "MetricId": "SwitchFirmwareVersion", + "MetricValue": "1.2.3" + }, + { + "MetricId": "LinkHealthy", + "MetricValue": "true" + } + ] + })) + .expect("MetricReport JSON should parse"); + + let samples = metric_samples_from_report( + &report, + "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + ); + + assert_eq!(samples.len(), 3); + assert_eq!(samples[0].name, "redfish_telemetry_service"); + assert_eq!(samples[0].metric_type, "port_malformed_packet_errors"); + assert_eq!(samples[0].unit, "value"); + assert_eq!(samples[0].value, 17.0); + assert!( + samples[0].key.starts_with( + "NvidiaNMMetrics_0:metric_id=PortMalformedPacketErrors:metric_property=" + ) + ); + assert_eq!(samples[1].metric_type, "switch_firmware_version"); + assert_eq!(samples[1].unit, "info"); + assert_eq!(samples[1].value, 1.0); + assert!( + samples[1] + .labels + .iter() + .all(|(key, _)| key.as_ref() != "metric_value") + ); + assert_eq!(samples[2].metric_type, "link_healthy"); + assert_eq!(samples[2].unit, "state"); + assert_eq!(samples[2].value, 1.0); + } + + #[test] + fn metric_report_keys_use_stable_metric_identity_instead_of_array_index() { + let report: MetricReport = serde_json::from_value(serde_json::json!({ + "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "@odata.type": 
"#MetricReport.v1_3_0.MetricReport", + "Id": "NvidiaNMMetrics_0", + "Name": "NVIDIA NVSwitch metrics", + "MetricValues": [ + { + "MetricId": "PortRcvErrors", + "MetricValue": "1", + "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors" + }, + { + "MetricId": "PortRcvErrors", + "MetricValue": "2", + "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/2/Metrics#/RXErrors" + } + ] + })) + .expect("MetricReport JSON should parse"); + let reversed: MetricReport = serde_json::from_value(serde_json::json!({ + "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "@odata.type": "#MetricReport.v1_3_0.MetricReport", + "Id": "NvidiaNMMetrics_0", + "Name": "NVIDIA NVSwitch metrics", + "MetricValues": [ + { + "MetricId": "PortRcvErrors", + "MetricValue": "2", + "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/2/Metrics#/RXErrors" + }, + { + "MetricId": "PortRcvErrors", + "MetricValue": "1", + "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors" + } + ] + })) + .expect("MetricReport JSON should parse"); + + let original_keys = metric_samples_from_report( + &report, + "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + ) + .into_iter() + .map(|sample| sample.key) + .collect::>(); + let reversed_keys = metric_samples_from_report( + &reversed, + "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + ) + .into_iter() + .map(|sample| sample.key) + .collect::>(); + + assert_eq!(original_keys, reversed_keys); + assert_eq!(original_keys.len(), 2); + assert!( + original_keys + .iter() + .all(|key| !key.ends_with(":0") && !key.ends_with(":1")) + ); + } + + #[test] + fn metric_report_keys_preserve_raw_identity_after_sanitized_aliasing() { + let report: MetricReport = serde_json::from_value(serde_json::json!({ + "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "@odata.type": "#MetricReport.v1_3_0.MetricReport", + "Id": 
"NvidiaNMMetrics_0", + "Name": "NVIDIA NVSwitch metrics", + "MetricValues": [ + { + "MetricId": "Port-RcvErrors", + "MetricValue": "1" + }, + { + "MetricId": "Port_RcvErrors", + "MetricValue": "2" + } + ] + })) + .expect("MetricReport JSON should parse"); + + let samples = metric_samples_from_report( + &report, + "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + ); + + assert_eq!(samples.len(), 2); + assert_eq!(samples[0].metric_type, samples[1].metric_type); + assert_ne!(samples[0].key, samples[1].key); + assert_eq!( + samples + .iter() + .map(|sample| sample.key.as_str()) + .collect::>() + .len(), + 2 + ); + } + + #[test] + fn metric_report_value_without_source_identity_is_skipped() { + let report: MetricReport = serde_json::from_value(serde_json::json!({ + "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "@odata.type": "#MetricReport.v1_3_0.MetricReport", + "Id": "NvidiaNMMetrics_0", + "Name": "NVIDIA NVSwitch metrics", + "MetricValues": [{ "MetricValue": "3" }] + })) + .expect("MetricReport JSON should parse"); + + assert!( + metric_samples_from_report( + &report, + "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0" + ) + .is_empty() + ); + } + + #[test] + fn metric_type_falls_back_to_metric_property_last_segment() { + assert_eq!( + metric_type( + None, + Some("/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors"), + ) + .as_deref(), + Some("rx_errors") + ); + assert_eq!(metric_type(None, None), None); + } +} diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 6812dbfb66..51f6816037 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -137,10 +137,6 @@ pub enum StaticSwitchEndpointRole { Host, } -fn default_static_switch_endpoint_role() -> StaticSwitchEndpointRole { - StaticSwitchEndpointRole::Host -} - #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticSwitchEndpoint { @@ -150,7 +146,6 
@@ pub struct StaticSwitchEndpoint { pub slot_number: Option, #[serde(alias = "compute_tray_index")] pub tray_index: Option, - #[serde(default = "default_static_switch_endpoint_role")] pub endpoint_role: StaticSwitchEndpointRole, #[serde(default)] pub is_primary: bool, @@ -195,13 +190,19 @@ impl StaticBmcEndpoint { )); } - if let Some(switch) = &self.switch - && switch.id.is_none() - && switch.serial.is_none() - { - return Err(format!( - "endpoint_sources.static_bmc_endpoints[{index}].switch requires id or serial" - )); + if let Some(switch) = &self.switch { + if switch.id.is_none() && switch.serial.is_none() { + return Err(format!( + "endpoint_sources.static_bmc_endpoints[{index}].switch requires id or serial" + )); + } + if switch.endpoint_role == StaticSwitchEndpointRole::Host + && switch.nmxt_enabled.is_none() + { + return Err(format!( + "endpoint_sources.static_bmc_endpoints[{index}].switch.nmxt_enabled must be explicit for host switch endpoints" + )); + } } Ok(()) @@ -460,6 +461,9 @@ pub struct CollectorsConfig { /// Entity metrics collector configuration (if present, metrics collector is enabled) pub metrics: Configurable, + /// Redfish TelemetryService MetricReports collector configuration. + pub telemetry_service: Configurable, + /// Firmware collector configuration (if present, firmware collector is enabled) pub firmware: Configurable, @@ -482,6 +486,7 @@ impl Default for CollectorsConfig { discovery: DiscoveryConfig::default(), sensors: Configurable::Enabled(SensorCollectorConfig::default()), metrics: Configurable::Disabled, + telemetry_service: Configurable::Disabled, firmware: Configurable::Disabled, leak_detector: Configurable::Enabled(LeakDetectorCollectorConfig::default()), logs: Configurable::Disabled, @@ -527,6 +532,30 @@ impl Default for MetricsCollectorConfig { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +pub struct TelemetryServiceCollectorConfig { + /// Interval between Redfish TelemetryService MetricReport polls. 
+ #[serde(with = "humantime_serde")] + pub poll_interval: Duration, + + /// Maximum number of MetricReports fetched concurrently per endpoint. + pub fetch_concurrency: usize, + + /// Optional allow-list of MetricReport resource IDs. Empty means all MetricReports. + pub metric_report_ids: Vec, +} + +impl Default for TelemetryServiceCollectorConfig { + fn default() -> Self { + Self { + poll_interval: Duration::from_secs(60), + fetch_concurrency: 4, + metric_report_ids: Vec::new(), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct ProcessorsConfig { @@ -928,6 +957,7 @@ impl Default for NvueGnmiConfig { pub struct NvueGnmiPaths { pub components_enabled: bool, pub interfaces_enabled: bool, + pub platform_general_enabled: bool, } impl Default for NvueGnmiPaths { @@ -935,6 +965,7 @@ impl Default for NvueGnmiPaths { Self { components_enabled: true, interfaces_enabled: true, + platform_general_enabled: false, } } } @@ -1234,6 +1265,7 @@ mod tests { assert!(config.collectors.firmware.is_enabled()); assert!(config.collectors.leak_detector.is_enabled()); assert!(config.collectors.logs.is_enabled()); + assert!(config.collectors.telemetry_service.is_enabled()); assert!(config.collectors.nvue.is_enabled()); assert!(!config.sinks.tracing.is_enabled()); assert!(config.sinks.prometheus.is_enabled()); @@ -1290,6 +1322,14 @@ mod tests { assert_eq!(config.cache_size, 100); assert_eq!(config.endpoint_discovery_interval, Duration::from_secs(300)); + if let Configurable::Enabled(ref telemetry_service) = config.collectors.telemetry_service { + assert_eq!(telemetry_service.poll_interval, Duration::from_secs(60)); + assert_eq!(telemetry_service.fetch_concurrency, 4); + assert!(telemetry_service.metric_report_ids.is_empty()); + } else { + panic!("telemetry service config should be enabled in example config"); + } + if let Configurable::Enabled(ref nvue) = config.collectors.nvue { if let Configurable::Enabled(ref rest) = nvue.rest { 
assert_eq!(rest.poll_interval, Duration::from_secs(60)); @@ -1302,6 +1342,7 @@ mod tests { assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); assert!(gnmi.system_events_enabled); + assert!(gnmi.paths.platform_general_enabled); } else { panic!("nvue gnmi config should be enabled in example config"); } @@ -1574,6 +1615,9 @@ skip_empty_reports = false assert!(rest.paths.sdn_partitions_enabled); assert!(rest.paths.interfaces_enabled); } + if let Configurable::Enabled(ref gnmi) = defaults.gnmi { + assert!(!gnmi.paths.platform_general_enabled); + } } #[test] @@ -1617,6 +1661,42 @@ request_timeout = "45s" assert!(!config.collectors.nvue.is_enabled()); } + #[test] + fn test_telemetry_service_config_parsing() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.telemetry_service] +poll_interval = "45s" +fetch_concurrency = 8 +metric_report_ids = ["NvidiaNMMetrics_0"] +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("failed to parse telemetry service config"); + + if let Configurable::Enabled(ref telemetry_service) = config.collectors.telemetry_service { + assert_eq!(telemetry_service.poll_interval, Duration::from_secs(45)); + assert_eq!(telemetry_service.fetch_concurrency, 8); + assert_eq!(telemetry_service.metric_report_ids, ["NvidiaNMMetrics_0"]); + } else { + panic!("telemetry service config should be enabled"); + } + } + + #[test] + fn test_telemetry_service_config_disabled_by_default() { + let config = Config::default(); + assert!(!config.collectors.telemetry_service.is_enabled()); + } + #[test] fn test_nvue_config_explicit_disable() { let toml_content = r#" @@ -1761,7 +1841,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "cumulus" password = "pass" -switch = { id = 
"fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = true, slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" @@ -1848,7 +1928,7 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 } #[test] - fn test_static_switch_host_accepts_primary_without_nmxt_override() { + fn test_static_switch_host_accepts_primary_with_explicit_nmxt_enabled() { let toml_content = r#" [endpoint_sources.carbide_api] enabled = false @@ -1858,7 +1938,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "admin" password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true, nmxt_enabled = true } "#; let config: Config = Figment::new() @@ -1874,7 +1954,7 @@ switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); assert!(switch.is_primary); - assert_eq!(switch.nmxt_enabled, None); + assert_eq!(switch.nmxt_enabled, Some(true)); } #[test] @@ -1907,6 +1987,61 @@ switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(switch.nmxt_enabled, Some(true)); } + #[test] + fn test_static_switch_endpoint_requires_explicit_role() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.3" +mac = "11:22:33:44:55:88" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-003" } +"#; + + let err = Figment::new() 
+ .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract::() + .expect_err("switch endpoint role must be explicit"); + + assert!( + err.to_string().contains("endpoint_role"), + "error should mention the missing endpoint_role: {err}" + ); + } + + #[test] + fn test_static_switch_host_requires_explicit_nmxt_enabled() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.4" +mac = "11:22:33:44:55:99" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-004", endpoint_role = "host", is_primary = true } +"#; + + let config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract::() + .expect("config parses before validation"); + + let err = config + .validate() + .expect_err("host nmxt_enabled must be explicit"); + assert!( + err.contains("nmxt_enabled"), + "error should mention missing nmxt_enabled: {err}" + ); + } + #[test] fn test_static_machine_endpoint_accepts_placement_and_nvlink_metadata() { let toml_content = r#" @@ -1958,7 +2093,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "cumulus" password = "pass" -switch = { serial = "SN-SW-001", physical_slot_number = 7, compute_tray_index = 3 } +switch = { serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = false, physical_slot_number = 7, compute_tray_index = 3 } "#; let config: Config = Figment::new() @@ -1997,7 +2132,7 @@ mac = "aa:bb:cc:dd:ee:ff" username = "admin" password = "pass" machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" } -switch = { serial = "SN-SW-001" } +switch = { serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = false } "#; let config: Config = Figment::new() diff --git a/crates/health/src/discovery/cleanup.rs b/crates/health/src/discovery/cleanup.rs index 5dba8d0728..a4e8ab5693 
100644 --- a/crates/health/src/discovery/cleanup.rs +++ b/crates/health/src/discovery/cleanup.rs @@ -57,6 +57,8 @@ pub(super) fn stop_removed_bmc_collectors( tracing::info!( removed_count = removed_keys.len(), remaining_sensors = ctx.collectors.len(CollectorKind::Sensor), + remaining_telemetry_service_collectors = + ctx.collectors.len(CollectorKind::TelemetryService), remaining_collectors = ctx.collectors.len(CollectorKind::Logs), remaining_firmware_collectors = ctx.collectors.len(CollectorKind::Firmware), remaining_leak_detector_collectors = ctx.collectors.len(CollectorKind::LeakDetector), diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 9a1948d27d..6345042a20 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -31,6 +31,7 @@ use crate::config::{ LogsCollectorConfig as LogsCollectorOptions, MetricsCollectorConfig as MetricsCollectorOptions, NmxtCollectorConfig as NmxtCollectorOptions, NvueCollectorConfig as NvueCollectorOptions, SensorCollectorConfig as SensorCollectorOptions, + TelemetryServiceCollectorConfig as TelemetryServiceCollectorOptions, }; use crate::limiter::RateLimiter; use crate::metrics::{MetricsManager, operation_duration_buckets_seconds}; @@ -40,6 +41,7 @@ pub(super) enum CollectorKind { Discovery, Sensor, Metrics, + TelemetryService, Logs, Firmware, LeakDetector, @@ -49,10 +51,11 @@ pub(super) enum CollectorKind { } impl CollectorKind { - pub(super) const ALL: [CollectorKind; 9] = [ + pub(super) const ALL: [CollectorKind; 10] = [ CollectorKind::Discovery, CollectorKind::Sensor, CollectorKind::Metrics, + CollectorKind::TelemetryService, CollectorKind::Logs, CollectorKind::Firmware, CollectorKind::LeakDetector, @@ -68,6 +71,9 @@ impl CollectorKind { } CollectorKind::Sensor => "Stopping sensor collector for removed BMC endpoint", CollectorKind::Metrics => "Stopping entity metrics collector for removed BMC endpoint", + CollectorKind::TelemetryService => { 
+ "Stopping Redfish TelemetryService collector for removed BMC endpoint" + } CollectorKind::Logs => "Stopping logs collector for removed BMC endpoint", CollectorKind::Firmware => "Stopping firmware collector for removed BMC endpoint", CollectorKind::LeakDetector => { @@ -86,6 +92,7 @@ pub(super) struct CollectorState { discovery: HashMap, Collector>, sensors: HashMap, Collector>, metrics: HashMap, Collector>, + telemetry_service: HashMap, Collector>, firmware: HashMap, Collector>, leak_detector: HashMap, Collector>, logs: HashMap, Collector>, @@ -101,6 +108,7 @@ impl CollectorState { discovery: HashMap::new(), sensors: HashMap::new(), metrics: HashMap::new(), + telemetry_service: HashMap::new(), firmware: HashMap::new(), leak_detector: HashMap::new(), logs: HashMap::new(), @@ -116,6 +124,7 @@ impl CollectorState { CollectorKind::Discovery => &self.discovery, CollectorKind::Sensor => &self.sensors, CollectorKind::Metrics => &self.metrics, + CollectorKind::TelemetryService => &self.telemetry_service, CollectorKind::Logs => &self.logs, CollectorKind::Firmware => &self.firmware, CollectorKind::LeakDetector => &self.leak_detector, @@ -133,6 +142,7 @@ impl CollectorState { CollectorKind::Discovery => &mut self.discovery, CollectorKind::Sensor => &mut self.sensors, CollectorKind::Metrics => &mut self.metrics, + CollectorKind::TelemetryService => &mut self.telemetry_service, CollectorKind::Logs => &mut self.logs, CollectorKind::Firmware => &mut self.firmware, CollectorKind::LeakDetector => &mut self.leak_detector, @@ -182,6 +192,7 @@ impl CollectorState { .keys() .chain(self.sensors.keys()) .chain(self.metrics.keys()) + .chain(self.telemetry_service.keys()) .chain(self.logs.keys()) .chain(self.firmware.keys()) .chain(self.leak_detector.keys()) @@ -217,6 +228,7 @@ pub struct DiscoveryLoopContext { pub(crate) discovery_config: DiscoveryConfig, pub(crate) sensors_config: Configurable, pub(crate) metrics_config: Configurable, + pub(crate) telemetry_service_config: 
Configurable, pub(crate) logs_config: Configurable, pub(crate) firmware_config: Configurable, pub(crate) leak_detector_config: Configurable, @@ -262,6 +274,7 @@ impl DiscoveryLoopContext { discovery_config: config.collectors.discovery.clone(), sensors_config: config.collectors.sensors.clone(), metrics_config: config.collectors.metrics.clone(), + telemetry_service_config: config.collectors.telemetry_service.clone(), logs_config: config.collectors.logs.clone(), firmware_config: config.collectors.firmware.clone(), leak_detector_config: config.collectors.leak_detector.clone(), diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 107c90a882..e0f8dbed6d 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -29,7 +29,7 @@ use crate::collectors::{ LogsCollectorConfig, MetricsCollector, MetricsCollectorConfig, NmxtCollector, NmxtCollectorConfig, NvueRestCollector, NvueRestCollectorConfig, SensorCollector, SensorCollectorConfig, SseLogCollector, SseLogCollectorConfig, StreamingCollectorStartContext, - spawn_gnmi_collector, + TelemetryServiceCollector, TelemetryServiceCollectorConfig, spawn_gnmi_collector, }; use crate::config::{Configurable, LogCollectionMode, PeriodicLogConfig}; use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; @@ -66,6 +66,10 @@ fn spawn_generic_redfish_collectors( let sensors_enabled = matches!(ctx.sensors_config, Configurable::Enabled(_)); let metrics_enabled = matches!(ctx.metrics_config, Configurable::Enabled(_)); + let telemetry_service_enabled = endpoint + .switch_data() + .is_some_and(|switch| matches!(switch.endpoint_role, SwitchEndpointRole::Bmc)) + && ctx.telemetry_service_config.is_enabled(); if (sensors_enabled || metrics_enabled) && !ctx.collectors.contains(CollectorKind::Discovery, &key) @@ -193,6 +197,56 @@ fn spawn_generic_redfish_collectors( } } + if telemetry_service_enabled + && let Configurable::Enabled(telemetry_service_cfg) = 
&ctx.telemetry_service_config + && !ctx + .collectors + .contains(CollectorKind::TelemetryService, &key) + { + if let Some(data_sink) = data_sink.clone() { + let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( + format!("telemetry_service_collector_{key}"), + metrics_prefix, + )?); + match Collector::start::>( + endpoint_arc.clone(), + bmc.clone(), + TelemetryServiceCollectorConfig { + data_sink: Some(data_sink), + options: telemetry_service_cfg.clone(), + }, + CollectorStartContext { + limiter: ctx.limiter.clone(), + iteration_interval: telemetry_service_cfg.poll_interval, + collector_registry, + metrics_manager: ctx.metrics_manager.clone(), + }, + ) { + Ok(monitor) => { + ctx.collectors.insert( + CollectorKind::TelemetryService, + key.clone().into(), + monitor, + ); + tracing::info!( + endpoint_key = %key, + total_collectors = ctx.collectors.len(CollectorKind::TelemetryService), + "Started Redfish TelemetryService collection for switch BMC endpoint" + ); + } + Err(error) => { + tracing::error!( + ?error, + "Could not start Redfish TelemetryService collector for: {:?}", + endpoint.addr + ); + } + } + } else { + tracing::warn!("Redfish TelemetryService collector requires a data sink, skipping"); + } + } + if let Configurable::Enabled(logs_cfg) = &ctx.logs_config && !ctx.collectors.contains(CollectorKind::Logs, &key) { @@ -695,6 +749,7 @@ mod tests { async fn test_switch_bmc_endpoint_starts_redfish_but_not_switch_host_collectors() { let mut config = Config::default(); config.collectors.sensors = Configurable::Enabled(Default::default()); + config.collectors.telemetry_service = Configurable::Enabled(Default::default()); config.collectors.logs = Configurable::Disabled; config.collectors.firmware = Configurable::Disabled; config.collectors.leak_detector = Configurable::Disabled; @@ -716,10 +771,16 @@ mod tests { )), ); - spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test_switch_bmc_redfish_only") - .expect("spawn should 
succeed"); + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + Some(Arc::new(NoopSink)), + "test_switch_bmc_redfish_only", + ) + .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 1); + assert_eq!(ctx.collectors.len(CollectorKind::TelemetryService), 1); assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); assert_eq!(ctx.collectors.len(CollectorKind::NvueGnmi), 0); diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index cad243113d..53197db084 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -100,7 +100,7 @@ impl StaticEndpointSource { StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, }; - let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); + let nmxt_enabled = switch.nmxt_enabled.unwrap_or(false); Some(EndpointMetadata::Switch(SwitchData { id, @@ -318,7 +318,7 @@ mod tests { tray_index: Some(3), endpoint_role: StaticSwitchEndpointRole::Host, is_primary: true, - nmxt_enabled: None, + nmxt_enabled: Some(true), }), rack_id: None, }]; diff --git a/dev/bin/generate_nvswitch_gb200_matrix.py b/dev/bin/generate_nvswitch_gb200_matrix.py index 00f31c878a..e2e5aadc18 100755 --- a/dev/bin/generate_nvswitch_gb200_matrix.py +++ b/dev/bin/generate_nvswitch_gb200_matrix.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 -"""Generate the GB200 NVSWITCH telemetry source matrix from OMX catalog artifacts. +"""Generate the GB200 NVSWITCH telemetry source matrix. -Input artifacts are intentionally under .omx because the source workbook is not tracked. -The generated CSV and Markdown summary are tracked under docs for MR review. +The source workbook is not tracked. Pass sanitized catalog extraction artifacts with +``--rows-csv`` and ``--coverage-json`` when regenerating review artifacts. 
""" from __future__ import annotations +import argparse import csv import json import re @@ -14,11 +15,9 @@ from pathlib import Path ROOT = Path(__file__).resolve().parents[2] -ROWS_CSV = ROOT / ".omx/artifacts/nvswitch_rows.csv" -COVERAGE_JSON = ROOT / ".omx/artifacts/nvswitch_catalog_coverage_heuristic.json" -OUT_DIR = ROOT / "docs/architecture/health" -OUT_CSV = OUT_DIR / "nvswitch_telemetry_gb200_matrix.csv" -OUT_MD = OUT_DIR / "nvswitch_telemetry_gb200_matrix.md" +DEFAULT_OUT_DIR = ROOT / "docs/architecture/health" +DEFAULT_OUT_CSV = DEFAULT_OUT_DIR / "nvswitch_telemetry_gb200_matrix.csv" +DEFAULT_OUT_MD = DEFAULT_OUT_DIR / "nvswitch_telemetry_gb200_matrix.md" GB200_COLUMNS = [ "Applicable for \nGB200 NVL HMC", @@ -60,6 +59,12 @@ } NA_VALUES = {"", "NA", "N/A", "#N/A", "NONE", "TBD", "N.A."} +GENERIC_INFRA_FAMILIES = { + "Redfish TelemetryService", + "Redfish Fabric/Switch/Port", + "NVOS gNMI", + "NMX-T", +} def clean(value: str | None) -> str: @@ -81,10 +86,10 @@ def snake(metric: str) -> str: return re.sub(r"[^a-z0-9]+", "_", metric.lower()).strip("_") -def load_coverage() -> dict[int, dict[str, str]]: - if not COVERAGE_JSON.exists(): +def load_coverage(coverage_json: Path) -> dict[int, dict[str, str]]: + if not coverage_json.exists(): return {} - data = json.loads(COVERAGE_JSON.read_text()) + data = json.loads(coverage_json.read_text()) out: dict[int, dict[str, str]] = {} for section in ("covered", "partial", "gaps"): for item in data.get(section, []): @@ -129,7 +134,12 @@ def choose_sources(row: dict[str, str], sources: dict[str, str], metric: str = " "PHY-SYMBOL-ERRORS": "nmx_t", } if not sources: - return "BLOCKER source resolution", "", "No catalog source listed for GB200 row", "source-resolution blocker" + return ( + "SOURCE UNLISTED live source resolution", + "", + "No catalog source listed for GB200 row; resolve during live validation", + "source-resolution required before live signoff", + ) availability = clean(row.get(COL_AVAIL, "")).upper() tray = 
yes(row.get("Applicable for\nGB200 NVL NvswitchTray")) @@ -184,9 +194,25 @@ def choose_sources(row: dict[str, str], sources: dict[str, str], metric: str = " return primary, fallback, precedence, "one canonical series unless source-qualified duplicate is justified" -def target_collector(primary: str, sources: dict[str, str]) -> str: - if primary == "BLOCKER source resolution": - return "BLOCKER: source resolution required" +def is_redfish_sensor_range(redfish_path: str) -> bool: + return "/Sensors/" in redfish_path and ( + "ReadingRangeMax" in redfish_path or "ReadingRangeMin" in redfish_path + ) + + +def sensor_range_surface(redfish_path: str) -> str: + if "ReadingRangeMax" in redfish_path: + return "hw_sensor {reading_type}_range_max MetricSample with sensor_range=reading_range_max" + if "ReadingRangeMin" in redfish_path: + return "hw_sensor {reading_type}_range_min MetricSample with sensor_range=reading_range_min" + return "hw_sensor range MetricSample" + + +def target_collector(primary: str, sources: dict[str, str], redfish_path: str) -> str: + if is_redfish_sensor_range(redfish_path): + return "existing SensorsCollector range emission when include_sensor_thresholds=true" + if primary.startswith("SOURCE UNLISTED"): + return "live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted" if primary == "Redfish TelemetryService": return "new NvSwitchTelemetryServiceCollector behind collectors.telemetry_service" if primary == "Redfish Fabric/Switch/Port": @@ -197,16 +223,55 @@ def target_collector(primary: str, sources: dict[str, str]) -> str: return "extend NmxtCollector mapping" if primary == "NVOS CLI": if "nvos_gnmi" in sources: - return "prefer NVOS gNMI equivalent; CLI-only path is blocker if no streamed equivalent exists" - return "BLOCKER: no current NVOS CLI collector; source equivalent required" + return "prefer NVOS gNMI equivalent; live source-equivalence required if no streamed equivalent exists" + return "live 
source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector" if primary == "Onboard DBus": - return "prefer Redfish exposure; otherwise BLOCKER: no current DBus collector" + return "live source-equivalence required; prefer Redfish exposure before adding DBus collector" if primary == "OTLP": - return "BLOCKER: upstream OTLP source contract required" + return "live source-equivalence required; upstream OTLP source contract needed if not exposed elsewhere" return "TBD collector" -def emitted_surface(metric: str, data_type: str, coverage: str) -> str: +def has_generic_infra_source(sources: dict[str, str]) -> bool: + return any( + source_family(source_name, source_value) in GENERIC_INFRA_FAMILIES + for source_name, source_value in sources.items() + ) + + +def branch_coverage( + primary: str, + sources: dict[str, str], + cov_status: str, + cov_reason: str, +) -> tuple[str, str, str]: + if cov_status.startswith("covered"): + return cov_status, "already-covered-regression-required", cov_reason + + if primary.startswith("SOURCE UNLISTED") or not sources: + return ( + "source_resolution_required", + "requires-live-source-resolution", + "Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.", + ) + + if has_generic_infra_source(sources): + return ( + "covered_generic_infra_unvalidated", + "covered-by-generic-infra-requires-live-validation", + "GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.", + ) + + return ( + "source_equivalent_required", + "requires-live-source-equivalent", + "Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.", + ) + + +def emitted_surface(metric: str, data_type: str, coverage: str, redfish_path: str) -> str: + if 
is_redfish_sensor_range(redfish_path): + return sensor_range_surface(redfish_path) existing = { "PORT-RCV-ERRORS": "existing interface_in_errors MetricSample", "PORT-XMIT-CONSTRAINTS-ERRORS": "existing interface_out_errors MetricSample", @@ -225,10 +290,53 @@ def emitted_surface(metric: str, data_type: str, coverage: str) -> str: return f"{base} MetricSample" +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--rows-csv", + required=True, + type=Path, + help="Sanitized NVSWITCH rows extracted from the telemetry catalog workbook.", + ) + parser.add_argument( + "--coverage-json", + required=True, + type=Path, + help="Coverage heuristic JSON for the sanitized NVSWITCH rows.", + ) + parser.add_argument( + "--out-csv", + default=DEFAULT_OUT_CSV, + type=Path, + help="Output CSV path.", + ) + parser.add_argument( + "--out-md", + default=DEFAULT_OUT_MD, + type=Path, + help="Output Markdown summary path.", + ) + return parser.parse_args() + + +def display_path(path: Path) -> str: + try: + return str(path.relative_to(ROOT)) + except ValueError: + return str(path) + + def main() -> None: - coverage = load_coverage() - OUT_DIR.mkdir(parents=True, exist_ok=True) - with ROWS_CSV.open(newline="") as f: + args = parse_args() + rows_csv = args.rows_csv.resolve() + coverage_json = args.coverage_json.resolve() + out_csv = args.out_csv.resolve() + out_md = args.out_md.resolve() + out_dir = out_csv.parent + + coverage = load_coverage(coverage_json) + out_dir.mkdir(parents=True, exist_ok=True) + with rows_csv.open(newline="") as f: rows = list(csv.DictReader(f)) out_rows = [] @@ -245,14 +353,13 @@ def main() -> None: cov = coverage.get(row_no, {}) cov_status = clean(cov.get("coverage", "gap")) or "gap" cov_reason = clean(cov.get("coverage_reason", "")) - if primary.startswith("BLOCKER"): - implementation_status = "blocker-source-resolution" - elif cov_status.startswith("covered"): - implementation_status = 
"already-covered-regression-required" - elif cov_status.startswith("partial"): - implementation_status = "partial-needs-implementation" - else: - implementation_status = "gap-needs-implementation" + redfish_path = clean(row.get(COL_URI_DOMAIN)) or clean(row.get(COL_WILDCARD)) or clean(row.get(COL_REDFISH_GB)) or clean(row.get(COL_MRD)) + branch_cov_status, implementation_status, branch_cov_reason = branch_coverage( + primary, + sources, + cov_status, + cov_reason, + ) out_rows.append({ "catalog_row": row_no, @@ -268,23 +375,24 @@ def main() -> None: "fallback_source": fallback, "source_precedence": precedence, "duplicate_alias_policy": duplicate_policy, - "target_collector": target_collector(primary, sources), - "target_emitted_surface": emitted_surface(metric, row.get(COL_DATA_TYPE, ""), cov_status), - "current_coverage": cov_status, + "target_collector": target_collector(primary, sources, redfish_path), + "target_emitted_surface": emitted_surface(metric, row.get(COL_DATA_TYPE, ""), cov_status, redfish_path), + "current_coverage": branch_cov_status, "implementation_status": implementation_status, - "coverage_reason": cov_reason, - "redfish_or_mrd_path": clean(row.get(COL_URI_DOMAIN)) or clean(row.get(COL_WILDCARD)) or clean(row.get(COL_REDFISH_GB)) or clean(row.get(COL_MRD)), + "coverage_reason": branch_cov_reason, + "redfish_or_mrd_path": redfish_path, "nvos_gnmi_path": clean(row.get(COL_GNMI, "")), "nmx_t_field": clean(row.get(COL_NMXT, "")), "nvos_cli_reference": clean(row.get(COL_CLI_2503, "")) or clean(row.get(COL_CLI_2502, "")), "onboard_dbus_reference": clean(row.get(COL_ONBOARD, "")), - "test_fixture_plan": "required: parser fixture plus metric emission assertion; live GB evidence before review pause", + "test_fixture_plan": "required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation", "live_validation_plan": "validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review", }) 
fieldnames = list(out_rows[0].keys()) if out_rows else [] - with OUT_CSV.open("w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) + out_dir.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") writer.writeheader() writer.writerows(out_rows) @@ -294,13 +402,13 @@ def main() -> None: md = [ "# NVSWITCH telemetry GB200 source matrix", "", - "Generated from `.omx/artifacts/nvswitch_rows.csv` for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`:", + "Generated from sanitized Telemetry Catalog extraction artifacts for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`:", "", "- `Applicable for GB200 NVL HMC`", "- `Applicable for GB200 NVL BMC`", "- `Applicable for GB200 NVL NvswitchTray`", "", - f"CSV matrix: `{OUT_CSV.relative_to(ROOT)}`", + f"CSV matrix: `{display_path(out_csv)}`", "", "## Counts", "", @@ -311,25 +419,44 @@ def main() -> None: ] for key, value in sorted(counts.items()): md.append(f"- {key}: {value}") - md.extend(["", "### Current coverage", ""]) + md.extend(["", "### Branch coverage status", ""]) for key, value in sorted(coverage_counts.items()): md.append(f"- {key}: {value}") md.extend(["", "### Primary source", ""]) for key, value in sorted(primary_counts.items()): md.append(f"- {key}: {value}") md.extend([ + "", + "## GB200 branch implementation coverage", + "", + "The `nvswitch_telemetry_gaps` branch implements common GB+VR-friendly collector infrastructure for the GB200 phase:", + "", + "- Redfish BMC: enabled `nv-redfish` `telemetry-service`, added a switch-BMC-only TelemetryService collector, and emits every numeric/boolean/string `MetricReport` value as `redfish_telemetry_service` samples with report and source-property labels.", + "- BMC proxy: widened TelemetryService ACLs to `MetricReportDefinitions/*` and `MetricReports/*` so live GB200 validation is not 
limited to `NvidiaNMMetrics_0`.", + "- NMX-T HOST: preserves all numeric Prometheus samples instead of dropping unknown metric names; legacy `Effective_BER`, `Symbol_Errors`, and `Link_Down` metric names remain canonical.", + "- NVUE gNMI HOST: subscribes to `components`, `interfaces`, and `platform-general`; known current metrics keep their existing names, and previously unmapped leaves are emitted as source-qualified `nvswitch_*` samples.", + "- Config: `collectors.telemetry_service` is disabled by default, and `collectors.nvue.gnmi.paths.platform_general_enabled` is an explicit opt-in path gate; the example and live-validation configs enable the full GB200 switch collector set.", + "", + "The generic-preservation surfaces are behavior-locked by unit tests before live hardware validation:", + "", + "- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples` covers numeric, string/info, and boolean/state MetricReport values.", + "- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels` cover stable key identity for unknown Prometheus samples with extra labels.", + "- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric` cover previously unmapped interface leaves and platform-general string leaves.", + "", + "Rows that still have no catalog-listed source remain in scope: `CABLE-SNR-MEDIA-LANE-N` and `CABLE-SNR-HOST-LANE-N` are marked `requires-live-source-resolution` and must be checked during live validation. 
The generic Redfish MetricReport, NMX-T, and gNMI preservation paths will expose them if the device emits them; if not, open a source-owner follow-up immediately.", "", "## Execution rules", "", "- Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete.", "- Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale.", - "- Rows marked `blocker-source-resolution` are not deferred; they require immediate source-resolution or escalation.", + "- Generic-preserved metrics must keep bounded identity labels: report id/URI/definition and metric id/property/identity for Redfish MetricReports, raw source metric plus sorted source-label identity for NMX-T, and full gNMI path plus endpoint/entity labels for gNMI. Redfish internal keys must use escaped raw MetricId/MetricProperty identity, and NMX-T generic keys must escape raw port/source/node/label identity, to avoid aliasing. 
Raw string metric values must not be emitted as labels.", + "- Rows marked `requires-live-source-resolution` or `requires-live-source-equivalent` remain in scope; they require live source proof or immediate escalation before GB200 signoff.", "- Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed.", "", ]) - OUT_MD.write_text("\n".join(md)) - print(f"wrote {OUT_CSV}") - print(f"wrote {OUT_MD}") + out_md.write_text("\n".join(md) + "\n") + print(f"wrote {out_csv}") + print(f"wrote {out_md}") print(f"rows {len(out_rows)}") diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md new file mode 100644 index 0000000000..6df5826cdf --- /dev/null +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -0,0 +1,157 @@ +# GB200 NVSWITCH telemetry live-validation runbook + +This branch stops before live hardware validation. After build/test/lint review, run the health service locally against one GB200 NVLink Switch BMC endpoint and one switch HOST/NVOS endpoint. + +## Collectors that must be enabled + +For the GB200 phase, enable all switch telemetry collectors below: + +- BMC endpoint (`switch.endpoint_role = "bmc"`): + - `collectors.sensors` for standard Redfish sensor readings and threshold/range context. + - `collectors.telemetry_service` for Redfish `TelemetryService/MetricReports/*`. +- HOST endpoint (`switch.endpoint_role = "host"`): + - `collectors.nmxt` for NMX-T Prometheus telemetry on port `9352`. + - `collectors.nvue.rest` for existing NVUE health/app/partition/interface diagnostics. + - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general`, plus ON_CHANGE system events. 
+ +The BMC proxy ACL must allow: + +- `GET /redfish/v1/TelemetryService` +- `GET /redfish/v1/TelemetryService/MetricReportDefinitions/*` +- `GET /redfish/v1/TelemetryService/MetricReports/*` + +## Local static config template + +Replace placeholders after the branch is reviewed. Keep real credentials out of git. + +```toml +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[sinks.rack_health_report] +enabled = false + +[sinks.switch_health_report] +enabled = false + +[sinks.power_shelf_health_report] +enabled = false + +[sinks.prometheus] +enabled = true + +[metrics] +endpoint = "127.0.0.1:9009" +prefix = "carbide_hardware_health" + +[[endpoint_sources.static_bmc_endpoints]] +ip = "<SWITCH_BMC_IP>" +port = 443 +mac = "<SWITCH_BMC_MAC>" +username = "<SWITCH_BMC_USERNAME>" +password = "<SWITCH_BMC_PASSWORD>" +switch = { serial = "<SWITCH_SERIAL>", endpoint_role = "bmc", slot_number = <SLOT_NUMBER>, tray_index = <TRAY_INDEX> } + +[[endpoint_sources.static_bmc_endpoints]] +ip = "<SWITCH_HOST_IP>" +port = 443 +mac = "<SWITCH_HOST_MAC>" +username = "<SWITCH_HOST_USERNAME>" +password = "<SWITCH_HOST_PASSWORD>" +switch = { serial = "<SWITCH_SERIAL>", endpoint_role = "host", is_primary = true, nmxt_enabled = true, slot_number = <SLOT_NUMBER>, tray_index = <TRAY_INDEX> } + +[collectors.discovery] +refresh_interval = "5m" +discovery_concurrency = 4 + +[collectors.sensors] +sensor_fetch_interval = "1m" +sensor_fetch_concurrency = 8 +include_sensor_thresholds = true + +[collectors.telemetry_service] +poll_interval = "1m" +fetch_concurrency = 4 +# Empty means all exposed MetricReports. Narrow to ["NvidiaNMMetrics_0"] only if the BMC exposes noisy unrelated reports. 
+metric_report_ids = [] + +[collectors.metrics] +enabled = false + +[collectors.logs] +enabled = false + +[collectors.firmware] +enabled = false + +[collectors.leak_detector] +enabled = false + +[collectors.nmxt] +scrape_interval = "1m" +request_timeout = "30s" + +[collectors.nvue.rest] +poll_interval = "1m" +request_timeout = "30s" + +[collectors.nvue.rest.paths] +system_health_enabled = true +cluster_apps_enabled = true +sdn_partitions_enabled = true +interfaces_enabled = true + +[collectors.nvue.gnmi] +gnmi_port = 9339 +sample_interval = "1m" +request_timeout = "30s" +system_events_enabled = true + +[collectors.nvue.gnmi.paths] +components_enabled = true +interfaces_enabled = true +platform_general_enabled = true +``` + +## Local nv-redfish patch command + +The infra-controller MR must not commit absolute local paths. For local validation against a locally built `nv-redfish` checkout, use Cargo command-line patching. The local `nv-redfish` workspace package version must satisfy the infra-controller dependency (`0.10.x` for this branch); if the companion checkout is on `origin/main` with a development `0.1.0` workspace version, use a matching release tag or a temporary local-only version edit that is not committed. + +```bash +cargo run \ + --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" \ + -p carbide-health --bin forge-hw-health -- \ + /path/to/gb200-switch-local.toml +``` + +If the companion `nv-redfish` checkout changes internal crates, add the matching `patch.crates-io` entries documented in `nvswitch_telemetry_nv_redfish_dependency.md`. + +## Evidence to capture during live validation + +1. `/telemetry` output contains `redfish_telemetry_service` samples for the BMC endpoint. +2. `/telemetry` output contains `switch_nmxt` samples for the HOST endpoint, including any source metric names beyond the three legacy hard-coded metrics. +3. 
`/telemetry` output contains `nvue_gnmi` samples for: + - existing canonical interface metrics (`interface_*`), and + - newly preserved `nvswitch_*` catalog leaf metrics from previously unmapped gNMI leaves. +4. Logs show the TelemetryService, NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected endpoint roles. +5. The two catalog rows with no listed source (`CABLE-SNR-MEDIA-LANE-N`, `CABLE-SNR-HOST-LANE-N`) are checked explicitly in live output. If they do not appear through Redfish MetricReports, NMX-T, or gNMI, open a catalog/source-owner follow-up immediately; keep them open until source-owner resolution. + +## Cardinality and series-shape acceptance checks + +The branch intentionally preserves generic Redfish MetricReport, NMX-T, and gNMI samples so GB200 bring-up does not drop unknown NVSWITCH rows. Before treating live validation as successful, capture the series shape and confirm it is bounded by device structure rather than by scrape churn: + +1. Capture the distinct `(metric name, metric_type, key)` tuples from two consecutive `/telemetry` scrapes after collectors are warm. +2. Confirm the tuple set is stable across those scrapes except for expected hot-plug, link, or error-counter changes. +3. For Redfish MetricReports, confirm labels are limited to report id/URI/definition and metric id/property/identity, and that internal sample keys use escaped raw MetricId/MetricProperty identity so sanitized aliases do not collapse. Raw string values must not appear as metric labels. +4. For NMX-T, confirm unknown metric keys include escaped raw port/source/node identity and stable sorted source-label identity so same metric/port samples with different lane/device labels do not collapse. +5. For gNMI, confirm unknown leaves are keyed by full source path plus endpoint/entity labels and do not create time-varying label names. +6. 
If live GB200 only needs a subset of TelemetryService reports, narrow `metric_report_ids` and consider tightening the BMC proxy ACL before final merge. + +Unit coverage that locks the pre-live behavior: + +- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples`. +- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels`. +- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric`. diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index a2e18c0ac5..0f85f68b4d 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -1,194 +1,194 @@ -catalog_row,guid,metric_param_name,description,category,data_type,gb200_applicability,availability,source_families,primary_source,fallback_source,source_precedence,duplicate_alias_policy,target_collector,target_emitted_surface,current_coverage,implementation_status,coverage_reason,redfish_or_mrd_path,nvos_gnmi_path,nmx_t_field,nvos_cli_reference,onboard_dbus_reference,test_fixture_plan,live_validation_plan -763,NVSWITCH-NET-FW-VER,NET-FW-VER,Switch ASIC Firmware Version,Config,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_net_fw_ver as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId {FirmwareVersion},NA,FW_Version,nv show platform firmware $name {name: {Name: ASIC}} 
{actual-firmware},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/software/HGX_FW_NVSwitch_{InstanceId} xyz.openbmc_project.Software.Version Version,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -764,NVSWITCH-OS-VERSION,OS-VERSION,OS version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_os_version as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show system version {kernel},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -765,NVSWITCH-OS-KERNEL,OS-KERNEL,OS Kernel version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_os_kernel as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show system version {image{build-id}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -766,NVSWITCH-EROT-FW-VERSION,EROT-FW-VERSION,ERoT FW version,Config,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_erot_fw_version as inventory/info event or state metric with bounded 
labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform firmware $name {name: {Name: EROT}} {actual-firmware},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -767,NVSWITCH-BMC-VERSION,BMC-VERSION,BMC firmware version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_bmc_version as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform firmware $name {name: {Name: BMC}} {actual-firmware},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -794,NVSWITCH-LINK-DOWNED-COUNTER,LINK-DOWNED-COUNTER,Total number of times the Port Training state machine has failed the link error recovery process and downed the link.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_downed_counter MetricSample,partial_host,partial-needs-implementation,"NMX-T maps Link_Down to link_down; gNMI code listens for unintentional-link-down-events, not catalog link-downed path",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkDownedCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-downed,Link_Down,nv show interface $InterfaceId 
{InterfaceId: {type: nvl}} {link{counters{link-downed}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -795,NVSWITCH-PORT-MALFORMED-PACKET-ERRORS,PORT-MALFORMED-PACKET-ERRORS,"Total number of packets received on the port that contain malformed packet errors • Data packets: LVer, length, VL • Link packets: operand, length, VL",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_malformed_packet_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{MalformedPackets}}},/interfaces/interface [name]/phy-diag/state/port-malformed-packet-errors,PortMalformedPacketErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-malformed-packet-errors}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -796,NVSWITCH-PORT-NEIGHBOR-MTU-DISCARDS,PORT-NEIGHBOR-MTU-DISCARDS,Number of outbound packets discarded by the port because packet length exceeded the NeighborMTU.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend 
NvueGnmiCollector sample paths/processors,nvswitch_port_neighbor_mtu_discards MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{NeighborMTUDiscards}}},/interfaces/interface [name]/phy-diag/state/port-neighbor-mtu-discards,PortNeighborMTUDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-neighbor-mtu-discards}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -797,NVSWITCH-PORT-RCV-ERRORS,PORT-RCV-ERRORS,"Total number of packets containing an error that were received on the port. These errors include: • Local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine) • Malformed data packet errors (LVer, length, VL) • Malformed link packet errors (operand, length, VL) • Packets discarded due to buffer overrun",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_in_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_in_errors,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXErrors},interfaces/interface [name]/state/counters/in-errors,PortRcvErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-errors}},,required: parser fixture plus metric emission assertion; live GB evidence 
before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -798,NVSWITCH-PORT-XMIT-DISCARDS,PORT-XMIT-DISCARDS,Total number of outbound packets discarded by the port because the port is down or congested.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_discards MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXDiscards}},interfaces/interface[name=*]/state/counters/out-discards,PortXmitDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-drops}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -799,NVSWITCH-PORT-RCV-REMOTE-PHYSICAL-ERRORS,PORT-RCV-REMOTE-PHYSICAL-ERRORS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_remote_physical_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXRemotePhysicalErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-remote-phy-errors,PortRcvRemotePhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -800,NVSWITCH-PORT-RCV-SWITCH-RELAY-ERRORS,PORT-RCV-SWITCH-RELAY-ERRORS,"Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay.This might happen if, for instance, the destination port is congested or there are internal switch errors.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_switch_relay_errors MetricSample,gap,gap-needs-implementation,No 
current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXSwitchRelayErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-switch-relay-errors,PortRcvSwitchRelayErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -801,NVSWITCH-QP1Dropped,QP1Dropped,"Number of QP1 MADs (packets) dropped due to resource limitations (e.g., lack of buffers or receives posted) on the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_qp1dropped MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{QP1Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/qp1-dropped,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{counters{qp1-drops}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -802,NVSWITCH-VL15-DROPPED,VL15-DROPPED,"Number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers) of the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T,NVOS gNMI,Redfish TelemetryService,NVOS gNMI 
then Redfish TelemetryService then NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl15_dropped MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{VL15Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/vl15-dropped,VL15Dropped,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -804,NVSWITCH-SERIAL,SERIAL,Serial Number,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serial as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c /redfish/v1/Chassis/$ChassisId {SerialNumber},NA,sw_serial_number,nv show platform {serial-number},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/NVSwitch1 xyz.openbmc_project.Inventory.Decorator.Asset SerialNumber,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -806,NVSWITCH-NODE-GUID,NODE-GUID,"GUID of the HCA, switch, GPU, or router itself. All ports on the same node shall report the same NodeGUID. 
Provides a means to uniquely identify a node within a subnet and determine co-location of ports.",Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_node_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Node_GUID,nv show ib device $IbDeviceId {IbDeviceId: {type: NVLink*}} {guid},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/HGX_NVSwitch_{InstanceId} xyz.openbmc_project.Common.UUID UUID,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -807,NVSWITCH-PORT-GUID,PORT-GUID,GUID of the port. All ports on the same switch shall report the same NodeGUID.,Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T,NMX-T,Redfish Fabric/Switch/Port,NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Port_GUID,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -834,NVSWITCH-NVLINK-STATUS,@pshima@nvidia.com should be called PORT-PHYSICAL-STATE -Ziv Hillel IL NVLINK-STATUS,NVLink Link status (e.g. 
LinkUp),Status,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI; Onboard DBus,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_pshima_nvidia_com_should_be_called_port_physical_state_ziv_hillel_il_nvlink_status as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId {LinkStatus},interfaces/interface[name=$port_name]/infiniband/state/physical-port-state,phy_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{physical-state}},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/fabrics/HGX_NVLinkFabric_{InstanceId}/Switches/NVSwitch_{InstanceId}/Ports/NVLink_{InstanceId} xyz.openbmc_project.Inventory.Item.Port LinkStatus,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -846,NVSWITCH-LINK-ERROR-RECOVERY-COUNTER,LINK-ERROR-RECOVERY-COUNTER,Total number of times the Port Training state machine has successfully completed the link error recovery process. 
This enrty is applicable for platforms with NVL5.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_error_recovery_counter MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkErrorRecoveryCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-error-recovery,LinkErrorRecoveryCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{error-recovery}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -847,NVSWITCH-PORT-MULTICAST-RCV-PKTS,PORT-MULTICAST-RCV-PKTS,"Total number of multicast packets, including multicast packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,@pshima@nvidia.com spelling is wrong RXMulitcastFrames -> RXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics 
{Networking{RXMulticastFrames}},/interfaces/interface [name]/phy-diag/state/port-multi-cast-rcv-pkts,PortMultiCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-multicast-pkts}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -848,NVSWITCH-PORT-MULTICAST-XMIT-PKTS,PORT-MULTICAST-XMIT-PKTS,Total number of multicast packets transmitted on all VLs from the port. This may include multicast packets with errors.,Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,"@pshima@nvidia.com spelling issue , should be TXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXMulticastFrames}}}",/interfaces/interface [name]/phy-diag/state/port-multi-cast-xmit-pkts,PortMultiCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-multicast-pkts}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -849,NVSWITCH-PORT-RCV-DATA,PORT-RCV-DATA,"Total number of data octets, divided by 4, received on all VLs at the port. 
This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_data MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXBytes},interfaces/interface[name=*]/state/counters/in-octets,PortRcvDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-bytes}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -850,NVSWITCH-PORT-RCV-PKTS,PORT-RCV-PKTS,"Total number of received packets, including packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXFrames}},interfaces/interface[name=*]/state/counters/in-pkts,PortRcvPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint 
review -851,NVSWITCH-PORT-UNICAST-RCV-PKTS,PORT-UNICAST-RCV-PKTS,"Total number of unicast packets, including unicast packets containing errors.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_rcv_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXUnicastFrames}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-rcv-pkts,PortUniCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-unicast-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -852,NVSWITCH-PORT-UNICAST-XMIT-PKTS,PORT-UNICAST-XMIT-PKTS,Total number of unicast packets transmitted on all VLs from the port. 
This may include unicast packets with errors.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXUnicastFrames}}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-xmit-pkts,PortUniCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-unicast-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -853,NVSWITCH-PORT-XMIT-DATA,PORT-XMIT-DATA,"Total number of data octets, divided by 4, transmitted on all VLs from the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors. 
It excludes all link packets.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_data MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {TXBytes}},interfaces/interface[name=*]/state/counters/out-octets,PortXmitDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-bytes}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -854,NVSWITCH-PORT-XMIT-PKTS,PORT-XMIT-PKTS,Total number of packets transmitted on all VLs from the port. 
This may include packets with errors,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_pkts MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXFrames}},interfaces/interface[name=*]/state/counters/out-pkts,PortXmitPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-pkts}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -855,NVSWITCH-PORT-XMIT-WAIT,PORT-XMIT-WAIT,The number of ticks during which the port selected by PortSelect had data to transmit but no data was sent during the entire tick either because of insufficient credits or because of lack of arbitration.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_wait MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{TXWait}}},interfaces/interface[name=*]/infiniband/state/counters/port/xmit-wait,PortXmitWait,nv show interface $InterfaceId 
{InterfaceId: {type: nvl}} {counters{out-wait}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -862,NVSWITCH-CONTACT,CONTACT,UTF-8 encoded string to describe contact person.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_contact as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/contact,NA,TBD,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -863,NVSWITCH-LOCATION,LOCATION,UTF-8 encoded string to describe location of the device.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_location as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/location,NA,nv show platform chassis-location {slot-number},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -864,NVSWITCH-NODE-DESCRIPTION,NODE-DESCRIPTION,UTF-8 encoded string to describe node in text format.,Inventory,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_node_description as inventory/info event or state metric with bounded 
labels,gap,gap-needs-implementation,No exact current collector mapping found,,platform-general/state/platform-name,node_description,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -865,NVSWITCH-LID,LID,Local ID- Link layer address of an end port.,NetworkId,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_lid MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,lid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -866,NVSWITCH-PORT-NUMBER,PORT-NUMBER,Port number,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_number as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Port_Number,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -867,NVSWITCH-PORT-LABEL,PORT-LABEL,Front panel label of the port,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_label as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,port_label,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -868,NVSWITCH-REVISION,REVISION,Switch HW 
revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_revision MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,sw_revision,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -869,NVSWITCH-DEVICE-HARDWARE-REVISION,DEVICE-HARDWARE-REVISION,Device HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_hardware_revision MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,device_hw_rev,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -870,NVSWITCH-CPU_CORE_NUMBER,CPU_CORE_NUMBER,Number of cores,System,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_core_number MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show system cpu {core-count},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -872,NVSWITCH-ASIC-TEMP-CRITICAL,ASIC-TEMP-CRITICAL,"Critical temperature threshold for NVSwitch ASIC.
Above this level, the system will shutdown.",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -873,NVSWITCH-ASIC-TEMP-MAX,ASIC-TEMP-MAX,Max temperature threshold for NVSwitch ASIC.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {max}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -874,NVSWITCH-ASIC-TEMP-STATE,ASIC-TEMP-STATE,NVSwitch ASIC state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_asic_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch 
BMC/HOST after branch build-test-lint review -875,NVSWITCH-ASIC-TEMP-CURRENT,ASIC-TEMP-CURRENT,NVSwitch ASIC current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=ASIC*]/asic/state/asic-temp,Chip_Temp,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -876,NVSWITCH-ASIC-NAME,ASIC-NAME,NVSwitch ASIC current temperature,Platform,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_name MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=ASIC*]/state/name,NA,nv show platform {asic-model},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -879,NVSWITCH-AMBIENT-MNG-TEMP-STATE,AMBIENT-MNG-TEMP-STATE,Ambient temperature located in port side state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_ambient_mng_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature 
$TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -880,NVSWITCH-AMBIENT-MNG-TEMP-CURRENT,AMBIENT-MNG-TEMP-CURRENT,Ambient temperature located in port side,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_ambient_mng_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -881,NVSWITCH-CPU_PACK_TEMP_CRITICAL,CPU_PACK_TEMP_CRITICAL,"Critical temperature threshold for CPU PACK, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -882,NVSWITCH-CPU_PACK_TEMP_MAX,CPU_PACK_TEMP_MAX,Max temperature threshold for CPU PACK,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: 
no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -883,NVSWITCH-CPU_PACK_TEMP_STATE,CPU_PACK_TEMP_STATE,CPU PACK temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -884,NVSWITCH-CPU_PACK_TEMP_CURRENT,CPU_PACK_TEMP_CURRENT,CPU PACK temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_cpu_pack_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -885,NVSWITCH-CPU-UTIL,CPU-UTIL,ComE CPU utilization,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS 
gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_util MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,components/component[name=cpu]/cpu/utilization/state/avg,NA,nv show system cpu {total-utilization},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -886,NVSWITCH-MEM-UTIL,MEM-UTIL,Memory in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_util MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/memory-used,NA,nv show system memory {physical{utilization}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -887,NVSWITCH-MEM-TOTAL-SIZE,MEM-TOTAL-SIZE,Memory total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_total_size MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/memory-total-size,NA,nv show system memory {physical{total}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -888,NVSWITCH-DISK-TOTAL-SIZE,DISK-TOTAL-SIZE,Disk total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_total_size MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/disk-total-size,NA,TBD,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -889,NVSWITCH-DISK-USED,DISK-USED,Disk space in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_used MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,platform-general/state/disk-used,NA,TBD,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -890,NVSWITCH-SODIMM_TEMP_CRITICAL,SODIMM_TEMP_CRITICAL,"Critical temperature threshold for SODIMM temperature, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -891,NVSWITCH-SODIMM_TEMP_MAX,SODIMM_TEMP_MAX,Max temperature threshold for SODIMM temperature,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate
is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {max}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -892,NVSWITCH-SODIMM_TEMP_STATE,SODIMM_TEMP_STATE,SODIMM temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -893,NVSWITCH-SODIMM_TEMP_CURRENT,SODIMM_TEMP_CURRENT,SODIMM temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_sodimm_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -894,FAN-MAX-SPEED,MAX-SPEED,Chassis fan reading range (max),Config,Float,GB200 NVL BMC; GB200 NVL 
NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NVOS CLI,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port then NVOS CLI,one canonical series unless source-qualified duplicate is justified,new NvSwitchRedfishCollector for switch BMC endpoints,nvswitch_max_speed MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,2023.3 /redfish/v1/Chassis/$ChassisId/Sensors/$SensorId {ReadingRangeMax},NA,NA,nv show platform environment fan $FanId {max-speed},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -897,NVSWITCH-PORT-LOGICAL-STATE,PORT-LOGICAL-STATE,Port State. Enumerated as: 0: No State Change; 1: Down (includes failed links) 2: Initialize 3: Armed 4: Active,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_logical_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/logical-port-state,logical_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{logical-state}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -898,NVSWITCH-FEC-MODE-ACTIVE,FEC-MODE-ACTIVE,"FEC mode active: 0: No_FEC 1: Firecode_FEC 2: Standard_RS_FEC - RS(528,514) 3: Standard_LL_RS_FEC - RS(271,257) 6: Interleaved_Standard_RS-FEC - (544,514) 7: Standard_RS-FEC - (544,514)",Status,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_fec_mode_active 
MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Active_FEC,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -899,NVSWITCH-RAW-BER,RAW-BER,Raw BER- calculated by the following: bits 15:8 - raw_ber_magnitude bits 3:0 - raw_ber_coef Raw_BER = raw_ber_coef*10^(-raw_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber,Total_Raw_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{raw-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -900,NVSWITCH-EFFECTIVE-BER,EFFECTIVE-BER,Effective BER- calculated by the following: bits 15:8 - effective_ber_magnitude bits 3:0 - effective_ber_coef Effective_BER = effective_ber_coef*10^(-effective_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_effective_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,NA,/interfaces/interface [name]/phy-diag/state/effective-ber,Effective_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{health{effective-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -901,NVSWITCH-SYMBOL-BER,SYMBOL-BER,Symbol BER- calculated by the following: bits 15:8 - symbol_ber_magnitude bits 3:0 - symbol_ber_coef Symbol_BER = symbol_ber_coef*10^(-symbol_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_symbol_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{BitErrorRate}}},/interfaces/interface [name]/phy-diag/state/symbol-ber,Symbol_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{symbol-ber}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -902,NVSWITCH-ZERO-HIST,ZERO-HIST,First FEC histogram bin with value of 0 while all higher bins are only with 0 value as well.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_zero_hist MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/zero-hist,fc_zero_hist,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{detail{zero-hist}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -903,NVSWITCH-PHY-RAW-ERRORS-LANE0,PHY-RAW-ERRORS-LANE0,This counter provides information on error bits that were identified on lane 0. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-1,Raw_Errors_Lane_0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{phy-raw-errors}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -904,NVSWITCH-PHY-RAW-ERRORS-LANE1,PHY-RAW-ERRORS-LANE1,This counter provides information on error bits that were identified on lane 1. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-2,Raw_Errors_Lane_1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{phy-raw-errors}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -905,NVSWITCH-RAW-BER-LANE0,RAW-BER-LANE0,Raw BER for lane 0. 
same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-1,raw_ber_lane0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{raw-ber}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -906,NVSWITCH-RAW-BER-LANE1,RAW-BER-LANE1,Raw BER for lane 1. same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-2,raw_ber_lane1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{raw-ber}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -907,NVSWITCH-PHY-EFFECTIVE-ERRORS,PHY-EFFECTIVE-ERRORS,This counter provides information on error bits that were not corrected by FEC correction algorithm or that FEC is not active. 
(post FEC pre PLR),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_effective_errors MetricSample,gap,gap-needs-implementation,No current BMC collector traverses Redfish Fabric/Switch/Port/SwitchMetrics resources,2025.1 /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{EffectiveError}}},/interfaces/interface [name]/phy-diag/state/effective-errors,Effective_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-errors}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -908,NVSWITCH-PHY-SYMBOL-ERRORS,PHY-SYMBOL-ERRORS,Total number of minor link errors detected on one or more physical lanes. This counter provides information on error bits that were not corrected by phy correction mechanisms. 
(post FEC & PLR),Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NMX-T,NVOS gNMI,NMX-T then NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,existing switch_nmxt symbol_errors MetricSample,covered_host_nmxt,already-covered-regression-required,NMX-T maps Symbol_Errors to symbol_errors,NA,/interfaces/interface [name]/phy-diag/state/symbol-errors,Symbol_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{nvl{errors{symbol-errors{receive}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -909,NVSWITCH-TIME-SINCE-LASTS-CLEAR,TIME-SINCE-LASTS-CLEAR,The time passed since the last counters clear event in msec- time since the port was raised to up.,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_time_since_lasts_clear MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/time-since-last-clear-min,Time_since_last_clear_Min,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{time-since-last-clear-min}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -910,NVSWITCH-DEVICE-ID,DEVICE-ID,Device ID information as assigned by device manufacturer.,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_id as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact 
current collector mapping found,,NA,Device_ID,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -911,NVSWITCH-FEC-HIST-0,FEC-HIST-0,Value of RS FEC Histogram (Reed Solomon error correction) bin0,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin0,hist0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{0{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -912,NVSWITCH-FEC-HIST-1,FEC-HIST-1,Value of RS FEC Histogram (Reed Solomon error correction) bin1,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin1,hist1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{1{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -913,NVSWITCH-FEC-HIST-2,FEC-HIST-2,Value of RS FEC Histogram (Reed Solomon error correction) bin2,Link-Quality,Integer,GB200 NVL 
NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_2 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin2,hist2,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{2{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -914,NVSWITCH-FEC-HIST-3,FEC-HIST-3,Value of RS FEC Histogram (Reed Solomon error correction) bin3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_3 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin3,hist3,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{3{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -915,NVSWITCH-FEC-HIST-4,FEC-HIST-4,Value of RS FEC Histogram (Reed Solomon error correction) bin4,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_4 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface 
[name]/phy-diag/state/rs-num-corr-err-bin4,hist4,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{4{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -916,NVSWITCH-FEC-HIST-5,FEC-HIST-5,Value of RS FEC Histogram (Reed Solomon error correction) bin5,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_5 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin5,hist5,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{5{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -917,NVSWITCH-FEC-HIST-6,FEC-HIST-6,Value of RS FEC Histogram (Reed Solomon error correction) bin6,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_6 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin6,Hist6,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{6{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-918,NVSWITCH-FEC-HIST-7,FEC-HIST-7,Value of RS FEC Histogram (Reed Solomon error correction) bin7,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_7 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin7,Hist7,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{7{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -919,NVSWITCH-FEC-HIST-8,FEC-HIST-8,Value of RS FEC Histogram (Reed Solomon error correction) bin8,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_8 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin8,Hist8,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{8{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -920,NVSWITCH-FEC-HIST-9,FEC-HIST-9,Value of RS FEC Histogram (Reed Solomon error correction) bin9,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample 
paths/processors,nvswitch_fec_hist_9 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin9,Hist9,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{9{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -921,NVSWITCH-FEC-HIST-10,FEC-HIST-10,Value of RS FEC Histogram (Reed Solomon error correction) bin10,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_10 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin10,Hist10,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{10{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -922,NVSWITCH-FEC-HIST-11,FEC-HIST-11,Value of RS FEC Histogram (Reed Solomon error correction) bin11,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_11 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin11,Hist11,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{11{count}}}}}}},,required: parser fixture plus 
metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -923,NVSWITCH-FEC-HIST-12,FEC-HIST-12,Value of RS FEC Histogram (Reed Solomon error correction) bin12,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_12 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin12,hist12,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{12{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -924,NVSWITCH-FEC-HIST-13,FEC-HIST-13,Value of RS FEC Histogram (Reed Solomon error correction) bin13,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_13 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin13,hist13,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{13{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -925,NVSWITCH-FEC-HIST-14,FEC-HIST-14,Value of RS FEC Histogram (Reed Solomon error correction) bin14,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS 
gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_14 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin14,hist14,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{14{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -926,NVSWITCH-FEC-HIST-15,FEC-HIST-15,Value of RS FEC Histogram (Reed Solomon error correction) bin15,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_15 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin15,hist15,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{15{count}}}}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -931,NVSWITCH-PLR-CODES-LOSS,PLR-CODES-LOSS,Received bandwidth loss due to codes retransmission. 
calculated in resolution of: (plr_rcv_code_err / plr_rcv_codes) * 10^10 BW Loss % = (plr_codes_loss / 10^10 ) *100,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_plr_codes_loss MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,HiRetransmissionRate,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-codes-loss}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -932,NVSWITCH-PORT-BUFFER-OVERRUN-ERRORS,PORT-BUFFER-OVERRUN-ERRORS,Total number of packets received on the port that were discarded due to buffer overrun.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_buffer_overrun_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/counters/port/excessive-buffer-overrun,ExcessiveBufferOverrunErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{buffer-overrun-errors}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -933,NVSWITCH-LINK-SPEED-ACTIVE,LINK-SPEED-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_speed_active 
MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,interfaces/interface[name=$port_name]/infiniband/state/speed,Link_speed_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{speed}}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -934,NVSWITCH-PLR-RCV-CODES,PLR-RCV-CODES,Number of received PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-codes,PlrRcvCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -935,NVSWITCH-PLR-RCV-CODES-ERR,PLR-RCV-CODES-ERR,The total number of rejected PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes_err MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-code-err,PlrRcvCodeErr,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes-err}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-936,NVSWITCH-PLR-RCV-UNCORRECTABLES-CODE,PLR-RCV-UNCORRECTABLES-CODE,The total number of uncorrectable PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_uncorrectables_code MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-uncorrectable-code,PlrRcvUncorrectableCode,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-uncorrectable-code}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -937,NVSWITCH-PLR-XMIT-CODES,PLR-XMIT-CODES,Number of transmitted PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-codes,PlrXmitCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -938,NVSWITCH-PLR-XMIT-RETRYS-CODES,PLR-XMIT-RETRYS-CODES,The total number of PLR codewords retransmitted,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample 
paths/processors,nvswitch_plr_xmit_retrys_codes MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-codes,PlrXmitRetryCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-codes}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -939,NVSWITCH-PLR-XMIT-RETRYS-EVENTS,PLR-XMIT-RETRYS-EVENTS,The total number of retransmitted events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_events MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events,PlrXmitRetryEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -940,NVSWITCH-PLR-SYNC-EVENTS,PLR-SYNC-EVENTS,The number of PLR sync events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_sync_events MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-sync-events,PlrSyncEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-sync-events}}}},NA,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -941,NVSWITCH-PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,The maximum number of retransmitted events in 60 sec window based upon the action of undertaking PLR (physical layer retry),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retry_codes_within_minute MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events-within-t-sec-max,PlrXmitRetryCodesWithinTSecMax,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -942,NVSWITCH-PLR-BW-LOSS-PERCENT,PLR-BW-LOSS-PERCENT,The bandwidth loss (percentage) based upon PLR on the NVLink.,Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_plr_bw_loss_percent MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -943,NVSWITCH-RQ-GENERAL-ERROR,RQ-GENERAL-ERROR,The total number of packets that were dropped since it contained errors. 
Reasons for this include: Dropped due to MPR mismatch.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_rq_general_error MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/rq-general-error,rq_general_error,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{rq-general-error}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -944,NVSWITCH-TIME-TO-LINKS-UP,TIME-TO-LINKS-UP,"Time in msec to link up from disable until phy up state. While the phy manager did not reach phy up state the timer will return 0. The timer resets to 0 in one of the following cases: 1. When moving to disable or rx disable state. 2. When moving from active or phy up to polling state, while working at force mode.",Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_to_links_up MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,time_to_link_up_ext_msec,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -945,NVSWITCH-STATUS-OPCODE,STATUS-OPCODE,Opcode for advanced debug. 
String representation can be found in STATUS-MESSAGE.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_opcode MetricSample,partial_host,partial-needs-implementation,"NVUE REST link_diagnostic emits opcode as label and boolean state, not opcode as metric value",NA,NA,Advanced_Status_Opcode,"nv show interface --view link-diagnostics ""code""",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -946,NVSWITCH-STATUS-MESSAGE,STATUS-MESSAGE,String representation of STATUS-OPCODE. All Messages are terminated by a Null character ‘\0’,Status,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_message as inventory/info event or state metric with bounded labels,partial_host,partial-needs-implementation,"NVUE REST link_diagnostic emits diagnostic_status label, not message metric",NA,NA,Status_Message,"nv show interface --view link-diagnostics ""status""",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -947,NVSWITCH-DOWN-BLAME,DOWN-BLAME,Which receiver caused last link down: 0: Unknown 1: Local_phy 2: Remote_phy,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_down_blame MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,down_blame,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink 
Switch BMC/HOST after branch build-test-lint review -948,NVSWITCH-LOCAL-REASON-OPCODE,LOCAL-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 3: Block_Lock_loss 4: Alignment_loss 5: FEC_sync_loss 6: PLL_lock_loss 7: FIFO_overflow 8: false_SKIP_condition 9: Minor_Error_threshold_exceeded 10: Physical_layer_retransmission_timeout 11: Heartbeat_errors 12: Link_Layer_credit_monitoring_watchdog 13: Link_Layer_integrity_threshold_exceeded 14: Link_Layer_buffer_overrun 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 19: Down_by_verification_GW 20: Received_Remote_Fault 21: Received_TS1 22: Down_by_management_command 23: Cable_was_unplugged 24: Cable_access_issue 25: Thermal_shutdown 26: Current_issue 27: Power_budget 28: Fast_recovery_raw_ber 29: Fast_recovery_effective_ber 30: Fast_recovery_symbol_ber 31: Fast_recovery_credit_watchdog 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_local_reason_opcode MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,local_reason_opcode,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -949,NVSWITCH-REMOTE-REASON-OPCODE,REMOTE-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 4: Alignment_loss 10: Physical_layer_retransmission_timeout 15: Down_by_outband_command_with_healthy_link 16: 
Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 21: Received_TS1 22: Down_by_management_command 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event 38: Reset_no_power_cycle 40: Down_due_to_HW_force_event 41: Down_due_to_thermal_event 42: L1_exit_failure 43: too_many_link_error_recoveries 44: Down_due_to_contain_mode 45: BW_loss_threshold_exceeded 47: Hi_SER 48: down_by_nmx_adminstate_cmd,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_remote_reason_opcode MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,remote_reason_opcode,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -950,NVSWITCH-PHY-RECEIVED-BITS,PHY-RECEIVED-BITS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_received_bits MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/phy-received-bits,phy_received_bits,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{phy-received-bits}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -951,NVSWITCH-PORT-RCV-CONSTRAINT-ERRORS,PORT-RCV-CONSTRAINT-ERRORS,Total number of packets received on the switch physical port that are discarded for the following reasons: • FilterRawInbound is true and packet is raw • PartitionEnforcementInbound is true and packet fails partition key check or IP version check,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_constraint_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/counters/port/rcv-constraints-errors,PortRcvConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-constraint-errors}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-952,NVSWITCH-PORT-XMIT-CONSTRAINTS-ERRORS,PORT-XMIT-CONSTRAINTS-ERRORS,Total number of packets not transmitted from the switch physical port for the following reasons: • FilterRawOutbound is true and packet is raw • PartitionEnforcementOutbound is true and packet fails partition key check or IP version check,Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_out_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_out_errors,NA,/interfaces/interface [name]/state/counters/out-errors,PortXmitConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-errors}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -953,NVSWITCH-PORT-LOCAL-PHYSICAL-ERRORS,PORT-LOCAL-PHYSICAL-ERRORS,"Total number of packets received on the port that contain local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine).",Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_local_physical_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-local-physical-errors,PortLocalPhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-local-physical-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate 
on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -954,NVSWITCH-SYNC-HEADER-ERROR-COUNTER,SYNC-HEADER-ERROR-COUNTER,Count of errored block sync header on one or more lanes.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_sync_header_error_counter MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/sync-header-error-counter,SyncHeaderErrorCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{sync-header-error-counter}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -955,NVSWITCH-PORT-DLID-MAPPING-ERRORS,PORT-DLID-MAPPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to DLID mapping errors.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_dlid_mapping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-dlid-mapping-errors,PortDLIDMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-dlid-mapping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -956,NVSWITCH-LOCAL-LINK-INTEGRITY-ERRORS,LOCAL-LINK-INTEGRITY-ERRORS,The number of times that the count of local physical errors 
exceeded the threshold specified by LocalPhyErrors;,Error,Integer,GB200 NVL HMC; GB200 NVL BMC; GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_local_link_integrity_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name]/infiniband/state/counters/port/local-link-integrity-errors,LocalLinkIntegrityErrors,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -957,NVSWITCH-PORT-VL-MAPPING-ERRORS,PORT-VL-MAPPING-ERRORS,"Packet discards due to VL mapping behavior are not considered errors, so the behavior of this counter is implementation dependent. However, it is recommended that this counter be used to count the total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to VL mapping behavior",Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_vl_mapping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-vl-mapping-errors,PortVLMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-vl-mapping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -958,NVSWITCH-PORT-LOOPING-ERRORS,PORT-LOOPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by 
the switch relay due to looping errors (output port = input port). This applies to switches only.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_looping_errors MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-looping-errors,PortLoopingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-looping-errors}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -959,NVSWITCH-PORT-INACTIVE-DISCARDS,PORT-INACTIVE-DISCARDS,Number of outbound packets discarded by the port because it is not in the active state.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_inactive_discards MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/port-inactive-discards,PortInactiveDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-inactive-discards}}}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -960,NVSWITCH-LINK-WIDTH-ACTIVE,LINK-WIDTH-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector 
sample paths/processors,nvswitch_link_width_active MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/width,Link_width_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{lanes}}",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -961,NVSWITCH-PHY-MANAGER-STATE,PHY-MANAGER-STATE,Show some more info about the PHY state: 0:Disabled 1:Open_port 2:Polling 3:Active_or_Linkup 4:Close_port 5:Phy_up 7:Rx_disable,Status,Text,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_manager_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,NA,/interfaces/interface [name]/phy-diag/state/phy-manager-state,Phy_Manager_State,NA,NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -962,NVSWITCH-MTU,MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mtu MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/mtu,NA,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{mtu}}",NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-963,NVSWITCH-MAX-SUPPORTED-MTU,MAX-SUPPORTED-MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_max_supported_mtu MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/max-supported-mtus,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{max-supported-mtu}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -964,NVSWITCH-SUPPORTED-WIDTH,SUPPORTED-WIDTH,Maximum Transmission Unit,Specs,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_supported_width MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,interfaces/interface[name=$port_name]/infiniband/state/supported-widths,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{supported-lanes}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -965,NVSWITCH-VL-CAPABILITIES,VL-CAPABILITIES,Maximum Transmission Unit,Specs,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl_capabilities as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping 
found,NA,interfaces/interface[name=$port_name]/infiniband/state/vl-capabilities,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{vl-capabilities}},NA,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -966,NVSWITCH-FAN-STATE,FAN-STATE,Fan status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fan_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name=FAN1/1]/state/oper-status,NA,nv show platform environment fan $FanId {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -967,NVSWITCH-FAN-LED,FAN-LED,Fan LED color,Sensor.Other,Text,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_fan_led as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,,nv show platform environment led $LedID {color},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -968,NVSWITCH-CABLE-PART-NUMBER,CABLE-PART-NUMBER,Cable part num,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_part_number as inventory/info event or state metric 
with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Cable_PN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-pn},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -969,NVSWITCH-CABLE-SERIAL-NUMBER,CABLE-SERIAL-NUMBER,Cable Serial num,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_serial_number MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,Na,NA,Cable_SN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-sn},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -970,NVSWITCH-CABLE-TRANSMITTER-TECHNOLOGY,CABLE-TRANSMITTER-TECHNOLOGY,Active/Passive cable,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_transmitter_technology as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_technology,TBD,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -971,NVSWITCH-CABLE-TYPE,CABLE-TYPE,Cable/module type: 0: Unidentified 1: Active_cable - (active copper / optics) 2: Optical_Module - (separated) 3: Passive_copper_cable 4: Cable_unplugged 5: Twisted_pair,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless 
source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_type as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_type,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-type},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -972,NVSWITCH-CABLE-VENDOR,CABLE-VENDOR,Cable vendor: 0: Other 1: Mellanox 2: Known_OUI 3: NVIDIA,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_vendor as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_vendor,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-name},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -973,NVSWITCH-CABLE-LENGTH,CABLE-LENGTH,Cable length in 1m units. For CMIS modules: bits 6:7 represent cable_length_multiplier for calculating cable length 00 - 0.1 multiplier (0.1 to 6.3m) 01- 1 multiplier (1 to 63m) 10 - 10 multiplier (10 to 630m) 11 - 100 multiplier (100 to 6300m) bits 0:5 represent cable_length_value for calculating cable length. 
length is calculated with cable_length_value * cable_length_- multiplier,Specs,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_length MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_length,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-length},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -974,NVSWITCH-CABLE-IDENTIFIER,CABLE-IDENTIFIER,"0: QSFP28 1: QSFP_Plus 2: SFP28_or_SFP_Plus 3: QSA - (QSFP->SFP) 4: Backplane 5: SFP_DD 6: QSFP_DD 7: QSFP_CMIS 8: OSFP 9: C2C 10: DSFP 11: QSFP_Split_Cable identifiers that are CMIS compliant are: 5,6,7,8,10",Specs,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_identifier MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_identifier,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {identifier},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -975,NVSWITCH-CABLE-REV,CABLE-REV,ASCII Vendor revision aligned to right padded with 0h on the left,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rev as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,vendor_rev,nv show platform transceiver 
$TransceiverId {TransceiverId: {status: Inserted}} {vendor-rev},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -976,NVSWITCH-CABLE-FW-VERSION,CABLE-FW-VERSION,module FW version (relevant for optic only),Config,,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_fw_version MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,cable_fw_version,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {firmware},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -977,NVSWITCH-CABLE-RX-POWER-LANE0,CABLE-RX-POWER-LANE0,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane0 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rx_power_lane_0,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-1{rx-power{Power}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -978,NVSWITCH-CABLE-RX-POWER-LANE1,CABLE-RX-POWER-LANE1,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane1 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rx_power_lane_1,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-2{rx-power{Power}}}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -979,NVSWITCH-CABLE-DIAG-SUPPLY-VOLTAGE,CABLE-DIAG-SUPPLY-VOLTAGE,Internally measured supply voltage in 100uV (relevant for optic only),Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_diag_supply_voltage MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Module_Voltage,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {voltage{voltage}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -980,NVSWITCH-CABLE-TEMP,CABLE-TEMP,Module main temperature sensor measured on a unit scale of 1/256 C degrees(relevant for optic only),Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_temp MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Module_Temperature,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {temperature{temperature}},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-981,NVSWITCH-CABLE-TEMP-ALARM,CABLE-TEMP-ALARM,Temperature warning threshold on a unit scale of 1/256 C degrees.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_temp_alarm MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag,Temp_flags,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -982,NVSWITCH-CABLE-VOLTAGE-ALARM,CABLE-VOLTAGE-ALARM,Voltage warning threshold on a unit scale of 100uV.,Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_voltage_alarm MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag,Vcc_flags,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -983,NVSWITCH-CABLE-TX-CDR-LOL,CABLE-TX-CDR-LOL,Bitmask for latched Tx cdr loss of lock flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_cdr_lol as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-cdr-lol,tx_cdr_lol,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -984,NVSWITCH-CABLE-RX-CDR-LOL,CABLE-RX-CDR-LOL,Bitmask for latched Rx cdr loss of lock flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_cdr_lol as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-cdr-lol,rx_cdr_lol,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -985,NVSWITCH-CABLE-TX-LOS,CABLE-TX-LOS,Bitmask for latched Tx loss of signal flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_los as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-los,tx_los,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -986,NVSWITCH-CABLE-RX-LOS,CABLE-RX-LOS,Bitmask for latched Rx loss of signal flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_los as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-los,rx_los,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -987,NVSWITCH-LINK-PARTNER-DESCRIPTION,LINK-PARTNER-DESCRIPTION,Description of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_description as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_description,NA,,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -988,NVSWITCH-LINK-PARTNER-NODE-GUID,LINK-PARTNER-NODE-GUID,GUID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_node_guid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_node_guid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -989,NVSWITCH-LINK-PARTNER-LID,LINK-PARTNER-LID,LID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_lid as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_lid,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -990,NVSWITCH-LINK-PARTNER-PORT-NUM,LINK-PARTNER-PORT-NUM,Port number of the link partner side (port that is connected to the port),Inventory,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_port_num MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,link_partner_port_num,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint 
review -1174,NVSWITCH-CPU-STATE,CPU-STATE,CPU status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_state as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,components/component[name=cpu]/state/oper-status,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1241,NVSWITCH-DRIVE-TEMP-CRITICAL,DRIVE-TEMP-CRITICAL,"Critical temperature threshold for drive, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1242,NVSWITCH-DRIVE-TEMP-MAX,DRIVE-TEMP-MAX,Max temperature threshold for drive,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate 
on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1243,NVSWITCH-DRIVE-TEMP-STATE,DRIVE-TEMP-STATE,Drive Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1244,NVSWITCH-DRIVE-TEMP-CURRENT,DRIVE-TEMP-CURRENT,Drive Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_drive_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1245,NVSWITCH-HSC-VINDC-TEMP-CRITICAL,HSC-VINDC-TEMP-CRITICAL,"Critical temperature threshold for HSC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,nv show platform 
environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {crit},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1246,NVSWITCH-HSC-VINDC-TEMP-MAX,HSC-VINDC-TEMP-MAX,Max temperature threshold for HSC,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_max MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {max},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1247,NVSWITCH-HSC-VINDC-TEMP-STATE,HSC-VINDC-TEMP-STATE,HSC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_hsc_vindc_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {state},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1248,NVSWITCH-HSC-VINDC-TEMP-CURRENT,HSC-VINDC-TEMP-CURRENT,HSC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent 
required,nvswitch_hsc_vindc_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {current},,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1249,NVSWITCH-PDB-CONV-TEMP-CRITICAL,PDB-CONV-TEMP-CRITICAL,"Critical temperature threshold for PDB, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1251,NVSWITCH-PDB-CONV-TEMP-STATE,PDB-CONV-TEMP-STATE,PDB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1252,NVSWITCH-PDB-CONV-TEMP-CURRENT,PDB-CONV-TEMP-CURRENT,PDB Temperature - current 
temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pdb_conv_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1253,NVSWITCH-PMIC-TEMP-CRITICAL,PMIC-TEMP-CRITICAL,"Critical temperature threshold for PMIC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_critical MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {crit}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1255,NVSWITCH-PMIC-TEMP-STATE,PMIC-TEMP-STATE,PMIC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB 
evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1256,NVSWITCH-PMIC-TEMP-CURRENT,PMIC-TEMP-CURRENT,PMIC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_pmic_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1259,NVSWITCH-SWB-ASIC-PCB-TEMP-STATE,SWB-ASIC-PCB-TEMP-STATE,SWB ASIC PCB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_swb_asic_pcb_temp_state MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {state}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1260,NVSWITCH-SWB-ASIC-PCB-TEMP-CURRENT,SWB-ASIC-PCB-TEMP-CURRENT,SWB ASIC PCB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,BLOCKER: no current NVOS CLI collector; source equivalent required,nvswitch_swb_asic_pcb_temp_current MetricSample,gap,gap-needs-implementation,No exact current collector 
mapping found,,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {current}",,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1688,NVSWITCH-LINK-RECOVERY-SUCCESS-CNT,LINK-RECOVERY-SUCCESS-CNT,Successful recovery count in an active link. Counter resets on link flap.,"Status, Event",Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_recovery_success_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,successful_recovery_events,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1689,NVSWITCH-TOTAL-LINK-RECOVERY-SUCCESS-CNT,TOTAL-LINK-RECOVERY-SUCCESS-CNT,Total successful recovery count accumulated across link flaps.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_total_link_recovery_success_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,total_successful_recovery_events,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1690,NVSWITCH-TIME-SINCE-LAST-RECOVERY,TIME-SINCE-LAST-RECOVERY,"Elapsed time since last recovery event, measured in seconds.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_since_last_recovery MetricSample,gap,gap-needs-implementation,No exact current collector 
mapping found,,NA,time_since_last_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1691,NVSWITCH-TIME-BTWN-TWO-RECOVERIES,TIME-BTWN-TWO-RECOVERIES,"Time in msec between two last consecutive recoveries (success or fail) from exit of first to entry of second. When value is OxFFFF, time is more than 1 minute.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_btwn_two_recoveries MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_between_last_2_recoveries,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1692,NVSWITCH-RECOVERY-ATTEMPTS-L1-CNT,RECOVERY-ATTEMPTS-L1-CNT,Number of first level (logical lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l1_cnt MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,last_host_logical_recovery_attempts_count,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1693,NVSWITCH-RECOVERY-ATTEMPTS-L2-CNT,RECOVERY-ATTEMPTS-L2-CNT,Number of second level (Serdes) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l2_cnt MetricSample,gap,gap-needs-implementation,No exact current collector 
mapping found,,NA,last_host_serdes_feq_attempts_count,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1694,NVSWITCH-RECOVERY-CYCLE-DURATION,RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last logical recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_cycle_duration MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_in_last_host_logical_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1695,NVSWITCH-SERDES-RECOVERY-CYCLE-DURATION,SERDES-RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last Serdes recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serdes_recovery_cycle_duration MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,time_in_last_host_serdes_feq_recovery,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1696,NVSWITCH-CONTAIN-DRAIN-XMIT-DISCARD,CONTAIN-DRAIN-XMIT-DISCARD,Number of transmit discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_xmit_discard MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,contain_n_drain_xmit_discards,NA,,required: 
parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1697,NVSWITCH-CONTAIN-DRAIN-RCV-DISCARD,CONTAIN-DRAIN-RCV-DISCARD,Number of receive discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_rcv_discard MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,contain_n_drain_rcv_discards,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1698,NVSWITCH-DEVICE-NUM,DEVICE-NUM,Device number on tray,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_num MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,device_num_on_tray,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1699,NVSWITCH-BOARD-TYPE,BOARD-TYPE,board type,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_board_type as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,board_type,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1700,NVSWITCH-CHASSIS-SLOT-IDX,CHASSIS-SLOT-IDX,chassis slot index,Config,Integer,GB200 NVL 
NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_slot_idx MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,chassis_slot_index,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1701,NVSWITCH-TRAY-IDX,TRAY-IDX,Tray index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_tray_idx MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,tray_index,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1702,NVSWITCH-TOPOLOGY-ID,TOPOLOGY-ID,Topology Id,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_topology_id MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,topology_id,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1703,NVSWITCH-CHASSIS-ID,CHASSIS-ID,Chassis Id,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_id as inventory/info event or state metric with bounded labels,gap,gap-needs-implementation,No exact current collector mapping found,,NA,chassis_id,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch 
build-test-lint review -1704,NVSWITCH-RAW-ERR-LANE-2,RAW-ERR-LANE-2,Raw errors lane 2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_2 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Raw_Errors_Lane_2,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1705,NVSWITCH-RAW-ERR-LANE-3,RAW-ERR-LANE-3,Raw errors lane 3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_3 MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,Raw_Errors_Lane_3,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1706,NVSWITCH-RQ-NUM-WRFE,RQ-NUM-WRFE,RQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_wrfe MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rq_num_wrfe,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1707,NVSWITCH-RQ-NUM-LLE,RQ-NUM-LLE,RQ num LLE,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_lle MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,rq_num_lle,NA,,required: parser fixture plus 
metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1708,NVSWITCH-SQ-NUM-WRFE,SQ-NUM-WRFE,SQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_sq_num_wrfe MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,NA,sq_num_wrfe,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2293,NVSWITCH-CABLE-OPER-STATUS,CABLE-OPER-STATUS,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_oper_status MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/transceiver-diag/state/module-oper-status,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2294,NVSWITCH-CABLE-SNR-MEDIA-LANE-N,CABLE-SNR-MEDIA-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,BLOCKER source resolution,,No catalog source listed for GB200 row,source-resolution blocker,BLOCKER: source resolution required,nvswitch_cable_snr_media_lane_n MetricSample,catalog_no_source_gap,blocker-source-resolution,Catalog row has no source listed,,NA,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2295,NVSWITCH-CABLE-SNR-HOST-LANE-N,CABLE-SNR-HOST-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,BLOCKER source resolution,,No catalog 
source listed for GB200 row,source-resolution blocker,BLOCKER: source resolution required,nvswitch_cable_snr_host_lane_n MetricSample,catalog_no_source_gap,blocker-source-resolution,Catalog row has no source listed,,NA,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2296,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_low_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-lower,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2297,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_low_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-lower,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2298,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless 
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_high_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-upper,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2299,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_high_n MetricSample,gap,gap-needs-implementation,No exact current collector mapping found,,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-upper,NA,NA,,required: parser fixture plus metric emission assertion; live GB evidence before review pause,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +catalog_row,guid,metric_param_name,description,category,data_type,gb200_applicability,availability,source_families,primary_source,fallback_source,source_precedence,duplicate_alias_policy,target_collector,target_emitted_surface,current_coverage,implementation_status,coverage_reason,redfish_or_mrd_path,nvos_gnmi_path,nmx_t_field,nvos_cli_reference,onboard_dbus_reference,test_fixture_plan,live_validation_plan +763,NVSWITCH-NET-FW-VER,NET-FW-VER,Switch ASIC Firmware Version,Config,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_net_fw_ver as 
inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId {FirmwareVersion},NA,FW_Version,nv show platform firmware $name {name: {Name: ASIC}} {actual-firmware},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/software/HGX_FW_NVSwitch_{InstanceId} xyz.openbmc_project.Software.Version Version,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +764,NVSWITCH-OS-VERSION,OS-VERSION,OS version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_os_version as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show system version {kernel},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +765,NVSWITCH-OS-KERNEL,OS-KERNEL,OS Kernel version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI 
collector",nvswitch_os_kernel as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show system version {image{build-id}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +766,NVSWITCH-EROT-FW-VERSION,EROT-FW-VERSION,ERoT FW version,Config,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_erot_fw_version as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform firmware $name {name: {Name: EROT}} {actual-firmware},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +767,NVSWITCH-BMC-VERSION,BMC-VERSION,BMC firmware version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_bmc_version as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not 
collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform firmware $name {name: {Name: BMC}} {actual-firmware},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +794,NVSWITCH-LINK-DOWNED-COUNTER,LINK-DOWNED-COUNTER,Total number of times the Port Training state machine has failed the link error recovery process and downed the link.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_downed_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkDownedCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-downed,Link_Down,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{counters{link-downed}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +795,NVSWITCH-PORT-MALFORMED-PACKET-ERRORS,PORT-MALFORMED-PACKET-ERRORS,"Total number of packets received on the port that contain malformed packet errors • Data packets: LVer, length, VL • Link packets: operand, length, VL",Error,Integer,GB200 NVL 
NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_malformed_packet_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{MalformedPackets}}},/interfaces/interface [name]/phy-diag/state/port-malformed-packet-errors,PortMalformedPacketErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-malformed-packet-errors}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +796,NVSWITCH-PORT-NEIGHBOR-MTU-DISCARDS,PORT-NEIGHBOR-MTU-DISCARDS,Number of outbound packets discarded by the port because packet length exceeded the NeighborMTU.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_neighbor_mtu_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete 
device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{NeighborMTUDiscards}}},/interfaces/interface [name]/phy-diag/state/port-neighbor-mtu-discards,PortNeighborMTUDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-neighbor-mtu-discards}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +797,NVSWITCH-PORT-RCV-ERRORS,PORT-RCV-ERRORS,"Total number of packets containing an error that were received on the port. These errors include: • Local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine) • Malformed data packet errors (LVer, length, VL) • Malformed link packet errors (operand, length, VL) • Packets discarded due to buffer overrun",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_in_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_in_errors,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXErrors},interfaces/interface [name]/state/counters/in-errors,PortRcvErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-errors}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+798,NVSWITCH-PORT-XMIT-DISCARDS,PORT-XMIT-DISCARDS,Total number of outbound packets discarded by the port because the port is down or congested.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXDiscards}},interfaces/interface[name=*]/state/counters/out-discards,PortXmitDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-drops}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +799,NVSWITCH-PORT-RCV-REMOTE-PHYSICAL-ERRORS,PORT-RCV-REMOTE-PHYSICAL-ERRORS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_remote_physical_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXRemotePhysicalErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-remote-phy-errors,PortRcvRemotePhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +800,NVSWITCH-PORT-RCV-SWITCH-RELAY-ERRORS,PORT-RCV-SWITCH-RELAY-ERRORS,"Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay. This might happen if, for instance, the destination port is congested or there are internal switch errors.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_switch_relay_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXSwitchRelayErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-switch-relay-errors,PortRcvSwitchRelayErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +801,NVSWITCH-QP1Dropped,QP1Dropped,"Number of QP1 MADs (packets) dropped due to resource limitations (e.g., lack of buffers or receives posted) on the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_qp1dropped MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{QP1Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/qp1-dropped,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{counters{qp1-drops}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +802,NVSWITCH-VL15-DROPPED,VL15-DROPPED,"Number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers) of the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl15_dropped MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{VL15Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/vl15-dropped,VL15Dropped,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +804,NVSWITCH-SERIAL,SERIAL,Serial Number,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serial as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this 
row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {SerialNumber},NA,sw_serial_number,nv show platform {serial-number},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/NVSwitch1 xyz.openbmc_project.Inventory.Decorator.Asset SerialNumber,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +806,NVSWITCH-NODE-GUID,NODE-GUID,"GUID of the HCA, switch, GPU, or router itself. All ports on the same node shall report the same NodeGUID. Provides a means to uniquely identify a node within a subnet and determine co-location of ports.",Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_node_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Node_GUID,nv show ib device $IbDeviceId {IbDeviceId: {type: NVLink*}} {guid},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/HGX_NVSwitch_{InstanceId} xyz.openbmc_project.Common.UUID UUID,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +807,NVSWITCH-PORT-GUID,PORT-GUID,GUID of the port. 
All ports on the same switch shall report the same NodeGUID.,Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T,NMX-T,Redfish Fabric/Switch/Port,NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Port_GUID,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +834,NVSWITCH-NVLINK-STATUS,NVLINK-STATUS,NVLink Link status (e.g.
LinkUp),Status,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI; Onboard DBus,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvlink_status as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId {LinkStatus},interfaces/interface[name=$port_name]/infiniband/state/physical-port-state,phy_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{physical-state}},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/fabrics/HGX_NVLinkFabric_{InstanceId}/Switches/NVSwitch_{InstanceId}/Ports/NVLink_{InstanceId} xyz.openbmc_project.Inventory.Item.Port LinkStatus,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +846,NVSWITCH-LINK-ERROR-RECOVERY-COUNTER,LINK-ERROR-RECOVERY-COUNTER,Total number of times the Port Training state machine has successfully completed the link error recovery process.
This entry is applicable for platforms with NVL5.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_error_recovery_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkErrorRecoveryCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-error-recovery,LinkErrorRecoveryCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{error-recovery}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +847,NVSWITCH-PORT-MULTICAST-RCV-PKTS,PORT-MULTICAST-RCV-PKTS,"Total number of multicast packets, including multicast packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_rcv_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row;
live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXMulticastFrames}},/interfaces/interface [name]/phy-diag/state/port-multi-cast-rcv-pkts,PortMultiCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-multicast-pkts}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +848,NVSWITCH-PORT-MULTICAST-XMIT-PKTS,PORT-MULTICAST-XMIT-PKTS,Total number of multicast packets transmitted on all VLs from the port. This may include multicast packets with errors.,Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXMulticastFrames}}},/interfaces/interface [name]/phy-diag/state/port-multi-cast-xmit-pkts,PortMultiCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-multicast-pkts}}},,required before review: parser/unit
fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +849,NVSWITCH-PORT-RCV-DATA,PORT-RCV-DATA,"Total number of data octets, divided by 4, received on all VLs at the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_data MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXBytes},interfaces/interface[name=*]/state/counters/in-octets,PortRcvDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-bytes}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +850,NVSWITCH-PORT-RCV-PKTS,PORT-RCV-PKTS,"Total number of received packets, including packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_pkts 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXFrames}},interfaces/interface[name=*]/state/counters/in-pkts,PortRcvPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +851,NVSWITCH-PORT-UNICAST-RCV-PKTS,PORT-UNICAST-RCV-PKTS,"Total number of unicast packets, including unicast packets containing errors.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_rcv_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXUnicastFrames}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-rcv-pkts,PortUniCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-unicast-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch 
build-test-lint review +852,NVSWITCH-PORT-UNICAST-XMIT-PKTS,PORT-UNICAST-XMIT-PKTS,Total number of unicast packets transmitted on all VLs from the port. This may include unicast packets with errors.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXUnicastFrames}}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-xmit-pkts,PortUniCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-unicast-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +853,NVSWITCH-PORT-XMIT-DATA,PORT-XMIT-DATA,"Total number of data octets, divided by 4, transmitted on all VLs from the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors. 
It excludes all link packets.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_data MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {TXBytes}},interfaces/interface[name=*]/state/counters/out-octets,PortXmitDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-bytes}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +854,NVSWITCH-PORT-XMIT-PKTS,PORT-XMIT-PKTS,Total number of packets transmitted on all VLs from the port. 
This may include packets with errors,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXFrames}},interfaces/interface[name=*]/state/counters/out-pkts,PortXmitPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +855,NVSWITCH-PORT-XMIT-WAIT,PORT-XMIT-WAIT,The number of ticks during which the port selected by PortSelect had data to transmit but no data was sent during the entire tick either because of insufficient credits or because of lack of arbitration.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_wait MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can 
emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{TXWait}}},interfaces/interface[name=*]/infiniband/state/counters/port/xmit-wait,PortXmitWait,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-wait}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +862,NVSWITCH-CONTACT,CONTACT,UTF-8 encoded string to describe contact person.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_contact as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,platform-general/state/contact,NA,TBD,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +863,NVSWITCH-LOCATION,LOCATION,UTF-8 encoded string to describe location of the device.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_location as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation 
must confirm the concrete device path/name.",,platform-general/state/location,NA,nv show platform chassis-location {slot-number},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +864,NVSWITCH-NODE-DESCRIPTION,NODE-DESCRIPTION,UTF-8 encoded string to describe node in text format.,Inventory,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_node_description as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,platform-general/state/platform-name,node_description,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +865,NVSWITCH-LID,LID,Local ID- Link layer address of an end port.,NetworkId,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_lid MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,lid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+866,NVSWITCH-PORT-NUMBER,PORT-NUMBER,Port number,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_number as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Port_Number,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +867,NVSWITCH-PORT-LABEL,PORT-LABEL,Front panel label of the port,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_label as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,port_label,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +868,NVSWITCH-REVISION,REVISION,Switch HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_revision MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware 
validation must confirm the concrete device path/name.",NA,NA,sw_revision,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +869,NVSWITCH-DEVICE-HARDWARE-REVISION,DEVICE-HARDWARE-REVISION,Device HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_hardware_revision MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,device_hw_rev,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +870,NVSWITCH-CPU_CORE_NUMBER,CPU_CORE_NUMBER,Number of cores,System,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_core_number MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show system cpu {core-count},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +872,NVSWITCH-ASIC-TEMP-CRITICAL,ASIC-TEMP-CRITICAL,"Critical temperature threshold for NVSwitch ASIC. 
Above this level, the system will shutdown.",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +873,NVSWITCH-ASIC-TEMP-MAX,ASIC-TEMP-MAX,Max temperature threshold for NVSwitch ASIC.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {max}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +874,NVSWITCH-ASIC-TEMP-STATE,ASIC-TEMP-STATE,NVSwitch ASIC state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series 
unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +875,NVSWITCH-ASIC-TEMP-CURRENT,ASIC-TEMP-CURRENT,NVSwitch ASIC current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_temp_current MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=ASIC*]/asic/state/asic-temp,Chip_Temp,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +876,NVSWITCH-ASIC-NAME,ASIC-NAME,NVSwitch ASIC name,Platform,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector 
sample paths/processors,nvswitch_asic_name MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=ASIC*]/state/name,NA,nv show platform {asic-model},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +879,NVSWITCH-AMBIENT-MNG-TEMP-STATE,AMBIENT-MNG-TEMP-STATE,Ambient temperature located in port side state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_ambient_mng_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +880,NVSWITCH-AMBIENT-MNG-TEMP-CURRENT,AMBIENT-MNG-TEMP-CURRENT,Ambient temperature located in port side,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_ambient_mng_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +881,NVSWITCH-CPU_PACK_TEMP_CRITICAL,CPU_PACK_TEMP_CRITICAL,"Critical temperature threshold for CPU PACK, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +882,NVSWITCH-CPU_PACK_TEMP_MAX,CPU_PACK_TEMP_MAX,Max temperature threshold for CPU PACK,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_max 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +883,NVSWITCH-CPU_PACK_TEMP_STATE,CPU_PACK_TEMP_STATE,CPU PACK temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +884,NVSWITCH-CPU_PACK_TEMP_CURRENT,CPU_PACK_TEMP_CURRENT,CPU PACK temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not 
collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +885,NVSWITCH-CPU-UTIL,CPU-UTIL,ComE CPU utilization,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_util MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=cpu]/cpu/utilization/state/avg,NA,nv show system cpu {total-utilization},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +886,NVSWITCH-MEM-UTIL,MEM-UTIL,Memory in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_util MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/memory-used,NA,nv show system memory {physical{utilization}},NA,required before review: parser/unit fixture plus metric 
emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +887,NVSWITCH-MEM-TOTAL-SIZE,MEM-TOTAL-SIZE,Memory total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_total_size MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/memory-total-size,NA,nv show system memory {physical{total}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +888,NVSWITCH-DISK-TOTAL-SIZE,DISK-TOTAL-SIZE,Disk total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_total_size MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/disk-total-size,NA,TBD,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +889,NVSWITCH-DISK-USED,DISK-USED,Disk space in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is 
justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_used MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/disk-used,NA,TBD,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +890,NVSWITCH-SODIMM_TEMP_CRITICAL,SODIMM_TEMP_CRITICAL,"Critical temperature threshold for SODIMM temperature, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +891,NVSWITCH-SODIMM_TEMP_MAX,SODIMM_TEMP_MAX,Max temperature threshold for SODIMM temperature,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_max 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {max}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +892,NVSWITCH-SODIMM_TEMP_STATE,SODIMM_TEMP_STATE,SODIMM temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +893,NVSWITCH-SODIMM_TEMP_CURRENT,SODIMM_TEMP_CURRENT,SODIMM temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected 
directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +894,FAN-MAX-SPEED,MAX-SPEED,Chassis fan reading range (max),Config,Float,GB200 NVL BMC; GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NVOS CLI,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port then NVOS CLI,one canonical series unless source-qualified duplicate is justified,existing SensorsCollector range emission when include_sensor_thresholds=true,hw_sensor {reading_type}_range_max MetricSample with sensor_range=reading_range_max,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.3 /redfish/v1/Chassis/$ChassisId/Sensors/$SensorId {ReadingRangeMax},NA,NA,nv show platform environment fan $FanId {max-speed},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +897,NVSWITCH-PORT-LOGICAL-STATE,PORT-LOGICAL-STATE,Port State. 
Enumerated as: 0: No State Change; 1: Down (includes failed links) 2: Initialize 3: Armed 4: Active,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_logical_state MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/logical-port-state,logical_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{logical-state}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +898,NVSWITCH-FEC-MODE-ACTIVE,FEC-MODE-ACTIVE,"FEC mode active: 0: No_FEC 1: Firecode_FEC 2: Standard_RS_FEC - RS(528,514) 3: Standard_LL_RS_FEC - RS(271,257) 6: Interleaved_Standard_RS-FEC - (544,514) 7: Standard_RS-FEC - (544,514)",Status,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_fec_mode_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Active_FEC,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +899,NVSWITCH-RAW-BER,RAW-BER,Raw BER- calculated by the following: bits 15:8 - 
raw_ber_magnitude bits 3:0 - raw_ber_coef Raw_BER = raw_ber_coef*10^(-raw_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber,Total_Raw_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{raw-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +900,NVSWITCH-EFFECTIVE-BER,EFFECTIVE-BER,Effective BER- calculated by the following: bits 15:8 - effective_ber_magnitude bits 3:0 - effective_ber_coef Effective_BER = effective_ber_coef*10^(-effective_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_effective_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,NA,/interfaces/interface [name]/phy-diag/state/effective-ber,Effective_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 
NVLink Switch BMC/HOST after branch build-test-lint review +901,NVSWITCH-SYMBOL-BER,SYMBOL-BER,Symbol BER- calculated by the following: bits 15:8 - symbol_ber_magnitude bits 3:0 - symbol_ber_coef Symbol_BER = symbol_ber_coef*10^(-symbol_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_symbol_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{BitErrorRate}}},/interfaces/interface [name]/phy-diag/state/symbol-ber,Symbol_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{symbol-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +902,NVSWITCH-ZERO-HIST,ZERO-HIST,First FEC histogram bin with value of 0 while all higher bins are only with 0 value as well.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_zero_hist MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface 
[name]/phy-diag/state/zero-hist,fc_zero_hist,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{zero-hist}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +903,NVSWITCH-PHY-RAW-ERRORS-LANE0,PHY-RAW-ERRORS-LANE0,This counter provides information on error bits that were identified on lane 0. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-1,Raw_Errors_Lane_0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{phy-raw-errors}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +904,NVSWITCH-PHY-RAW-ERRORS-LANE1,PHY-RAW-ERRORS-LANE1,This counter provides information on error bits that were identified on lane 1. 
(pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-2,Raw_Errors_Lane_1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{phy-raw-errors}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +905,NVSWITCH-RAW-BER-LANE0,RAW-BER-LANE0,Raw BER for lane 0. same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-1,raw_ber_lane0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{raw-ber}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +906,NVSWITCH-RAW-BER-LANE1,RAW-BER-LANE1,Raw BER for lane 1. 
same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-2,raw_ber_lane1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{raw-ber}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +907,NVSWITCH-PHY-EFFECTIVE-ERRORS,PHY-EFFECTIVE-ERRORS,This counter provides information on error bits that were not corrected by FEC correction algorithm or that FEC is not active. 
(post FEC pre PLR),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_effective_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2025.1 /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{EffectiveError}}},/interfaces/interface [name]/phy-diag/state/effective-errors,Effective_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-errors}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +908,NVSWITCH-PHY-SYMBOL-ERRORS,PHY-SYMBOL-ERRORS,Total number of minor link errors detected on one or more physical lanes. This counter provides information on error bits that were not corrected by phy correction mechanisms. 
(post FEC & PLR),Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NMX-T,NVOS gNMI,NMX-T then NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,existing switch_nmxt symbol_errors MetricSample,covered_host_nmxt,already-covered-regression-required,NMX-T maps Symbol_Errors to symbol_errors,NA,/interfaces/interface [name]/phy-diag/state/symbol-errors,Symbol_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{nvl{errors{symbol-errors{receive}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +909,NVSWITCH-TIME-SINCE-LASTS-CLEAR,TIME-SINCE-LASTS-CLEAR,The time passed since the last counters clear event in msec- time since the port was raised to up.,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_time_since_lasts_clear MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/time-since-last-clear-min,Time_since_last_clear_Min,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{time-since-last-clear-min}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +910,NVSWITCH-DEVICE-ID,DEVICE-ID,Device ID information as assigned by device manufacturer.,Inventory,Text,GB200 NVL 
NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_id as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Device_ID,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +911,NVSWITCH-FEC-HIST-0,FEC-HIST-0,Value of RS FEC Histogram (Reed Solomon error correction) bin0,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin0,hist0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{0{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +912,NVSWITCH-FEC-HIST-1,FEC-HIST-1,Value of RS FEC Histogram (Reed Solomon error correction) bin1,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin1,hist1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{1{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +913,NVSWITCH-FEC-HIST-2,FEC-HIST-2,Value of RS FEC Histogram (Reed Solomon error correction) bin2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_2 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin2,hist2,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{2{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +914,NVSWITCH-FEC-HIST-3,FEC-HIST-3,Value of RS FEC Histogram (Reed Solomon error correction) bin3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T 
then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_3 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin3,hist3,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{3{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +915,NVSWITCH-FEC-HIST-4,FEC-HIST-4,Value of RS FEC Histogram (Reed Solomon error correction) bin4,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_4 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin4,hist4,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{4{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +916,NVSWITCH-FEC-HIST-5,FEC-HIST-5,Value of RS FEC Histogram (Reed Solomon error correction) bin5,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS 
gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_5 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin5,hist5,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{5{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +917,NVSWITCH-FEC-HIST-6,FEC-HIST-6,Value of RS FEC Histogram (Reed Solomon error correction) bin6,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_6 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin6,Hist6,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{6{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +918,NVSWITCH-FEC-HIST-7,FEC-HIST-7,Value of RS FEC Histogram (Reed Solomon error correction) 
bin7,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_7 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin7,Hist7,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{7{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +919,NVSWITCH-FEC-HIST-8,FEC-HIST-8,Value of RS FEC Histogram (Reed Solomon error correction) bin8,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_8 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin8,Hist8,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{8{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+920,NVSWITCH-FEC-HIST-9,FEC-HIST-9,Value of RS FEC Histogram (Reed Solomon error correction) bin9,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_9 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin9,Hist9,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{9{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +921,NVSWITCH-FEC-HIST-10,FEC-HIST-10,Value of RS FEC Histogram (Reed Solomon error correction) bin10,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_10 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin10,Hist10,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{10{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on 
GB200 NVLink Switch BMC/HOST after branch build-test-lint review +922,NVSWITCH-FEC-HIST-11,FEC-HIST-11,Value of RS FEC Histogram (Reed Solomon error correction) bin11,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_11 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin11,Hist11,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{11{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +923,NVSWITCH-FEC-HIST-12,FEC-HIST-12,Value of RS FEC Histogram (Reed Solomon error correction) bin12,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_12 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin12,hist12,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{12{count}}}}}}},,required before review: parser/unit fixture plus metric 
emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +924,NVSWITCH-FEC-HIST-13,FEC-HIST-13,Value of RS FEC Histogram (Reed Solomon error correction) bin13,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_13 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin13,hist13,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{13{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +925,NVSWITCH-FEC-HIST-14,FEC-HIST-14,Value of RS FEC Histogram (Reed Solomon error correction) bin14,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_14 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin14,hist14,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{health{histogram{rs-fec-corrected-errors{14{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +926,NVSWITCH-FEC-HIST-15,FEC-HIST-15,Value of RS FEC Histogram (Reed Solomon error correction) bin15,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_15 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin15,hist15,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{15{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +931,NVSWITCH-PLR-CODES-LOSS,PLR-CODES-LOSS,Received bandwidth loss due to codes retransmission.
calculated in resolution of: (plr_rcv_code_err / plr_rcv_codes) * 10^10 BW Loss % = (plr_codes_loss / 10^10 ) *100,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_plr_codes_loss MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,HiRetransmissionRate,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-codes-loss}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +932,NVSWITCH-PORT-BUFFER-OVERRUN-ERRORS,PORT-BUFFER-OVERRUN-ERRORS,Total number of packets received on the port that were discarded due to buffer overrun.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_buffer_overrun_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/counters/port/excessive-buffer-overrun,ExcessiveBufferOverrunErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{buffer-overrun-errors}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch 
BMC/HOST after branch build-test-lint review +933,NVSWITCH-LINK-SPEED-ACTIVE,LINK-SPEED-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_speed_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/speed,Link_speed_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{speed}}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +934,NVSWITCH-PLR-RCV-CODES,PLR-RCV-CODES,Number of received PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-codes,PlrRcvCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after 
branch build-test-lint review +935,NVSWITCH-PLR-RCV-CODES-ERR,PLR-RCV-CODES-ERR,The total number of rejected PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes_err MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-code-err,PlrRcvCodeErr,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes-err}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +936,NVSWITCH-PLR-RCV-UNCORRECTABLES-CODE,PLR-RCV-UNCORRECTABLES-CODE,The total number of uncorrectable PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_uncorrectables_code MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-uncorrectable-code,PlrRcvUncorrectableCode,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-uncorrectable-code}}}},NA,required before review: parser/unit fixture plus metric emission 
assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +937,NVSWITCH-PLR-XMIT-CODES,PLR-XMIT-CODES,Number of transmitted PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-codes,PlrXmitCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-codes}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +938,NVSWITCH-PLR-XMIT-RETRYS-CODES,PLR-XMIT-RETRYS-CODES,The total number of PLR codewords retransmitted,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-codes,PlrXmitRetryCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-codes}}}},NA,required before review: parser/unit 
fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +939,NVSWITCH-PLR-XMIT-RETRYS-EVENTS,PLR-XMIT-RETRYS-EVENTS,The total number of retransmitted events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_events MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events,PlrXmitRetryEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +940,NVSWITCH-PLR-SYNC-EVENTS,PLR-SYNC-EVENTS,The number of PLR sync events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_sync_events MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-sync-events,PlrSyncEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-sync-events}}}},NA,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +941,NVSWITCH-PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,The maximum number of retransmitted events in 60 sec window based upon the action of undertaking PLR (physical layer retry),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retry_codes_within_minute MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events-within-t-sec-max,PlrXmitRetryCodesWithinTSecMax,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +942,NVSWITCH-PLR-BW-LOSS-PERCENT,PLR-BW-LOSS-PERCENT,The bandwidth loss (percentage) based upon PLR on the NVLink.,Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_plr_bw_loss_percent MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent 
Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +943,NVSWITCH-RQ-GENERAL-ERROR,RQ-GENERAL-ERROR,The total number of packets that were dropped since it contained errors. Reasons for this include: Dropped due to MPR mismatch.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_rq_general_error MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/rq-general-error,rq_general_error,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{rq-general-error}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +944,NVSWITCH-TIME-TO-LINKS-UP,TIME-TO-LINKS-UP,"Time in msec to link up from disable until phy up state. While the phy manager did not reach phy up state the timer will return 0. The timer resets to 0 in one of the following cases: 1. When moving to disable or rx disable state. 2. 
When moving from active or phy up to polling state, while working at force mode.",Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_to_links_up MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,time_to_link_up_ext_msec,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +945,NVSWITCH-STATUS-OPCODE,STATUS-OPCODE,Opcode for advanced debug. String representation can be found in STATUS-MESSAGE.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,Advanced_Status_Opcode,"nv show interface --view link-diagnostics ""code""",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +946,NVSWITCH-STATUS-MESSAGE,STATUS-MESSAGE,String representation of STATUS-OPCODE. 
All Messages are terminated by a Null character ‘\0’,Status,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_message as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,Status_Message,"nv show interface --view link-diagnostics ""status""",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +947,NVSWITCH-DOWN-BLAME,DOWN-BLAME,Which receiver caused last link down: 0: Unknown 1: Local_phy 2: Remote_phy,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_down_blame MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,down_blame,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +948,NVSWITCH-LOCAL-REASON-OPCODE,LOCAL-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 3: Block_Lock_loss 4: Alignment_loss 5: FEC_sync_loss 6: PLL_lock_loss 7: FIFO_overflow 8: false_SKIP_condition 9: Minor_Error_threshold_exceeded 10: Physical_layer_retransmission_timeout 11: 
Heartbeat_errors 12: Link_Layer_credit_monitoring_watchdog 13: Link_Layer_integrity_threshold_exceeded 14: Link_Layer_buffer_overrun 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 19: Down_by_verification_GW 20: Received_Remote_Fault 21: Received_TS1 22: Down_by_management_command 23: Cable_was_unplugged 24: Cable_access_issue 25: Thermal_shutdown 26: Current_issue 27: Power_budget 28: Fast_recovery_raw_ber 29: Fast_recovery_effective_ber 30: Fast_recovery_symbol_ber 31: Fast_recovery_credit_watchdog 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_local_reason_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,local_reason_opcode,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +949,NVSWITCH-REMOTE-REASON-OPCODE,REMOTE-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 4: Alignment_loss 10: Physical_layer_retransmission_timeout 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 21: Received_TS1 22: Down_by_management_command 
32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event 38: Reset_no_power_cycle 40: Down_due_to_HW_force_event 41: Down_due_to_thermal_event 42: L1_exit_failure 43: too_many_link_error_recoveries 44: Down_due_to_contain_mode 45: BW_loss_threshold_exceeded 47: Hi_SER 48: down_by_nmx_adminstate_cmd,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_remote_reason_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,remote_reason_opcode,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +950,NVSWITCH-PHY-RECEIVED-BITS,PHY-RECEIVED-BITS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_received_bits MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/phy-received-bits,phy_received_bits,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{phy-received-bits}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +951,NVSWITCH-PORT-RCV-CONSTRAINT-ERRORS,PORT-RCV-CONSTRAINT-ERRORS,Total number of packets received on the switch physical port that are discarded for the following reasons: • FilterRawInbound is true and packet is raw • PartitionEnforcementInbound is true and packet fails partition key check or IP version check,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_constraint_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/counters/port/rcv-constraints-errors,PortRcvConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-constraint-errors}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +952,NVSWITCH-PORT-XMIT-CONSTRAINTS-ERRORS,PORT-XMIT-CONSTRAINTS-ERRORS,Total number of packets not transmitted from the switch physical port for the following reasons: • FilterRawOutbound is true and packet is raw • PartitionEnforcementOutbound is true and packet fails partition key check or IP version check,Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_out_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_out_errors,NA,/interfaces/interface [name]/state/counters/out-errors,PortXmitConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-errors}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +953,NVSWITCH-PORT-LOCAL-PHYSICAL-ERRORS,PORT-LOCAL-PHYSICAL-ERRORS,"Total number of packets received on the port that contain local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine).",Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend 
NvueGnmiCollector sample paths/processors,nvswitch_port_local_physical_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-local-physical-errors,PortLocalPhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-local-physical-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +954,NVSWITCH-SYNC-HEADER-ERROR-COUNTER,SYNC-HEADER-ERROR-COUNTER,Count of errored block sync header on one or more lanes.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_sync_header_error_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/sync-header-error-counter,SyncHeaderErrorCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{sync-header-error-counter}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +955,NVSWITCH-PORT-DLID-MAPPING-ERRORS,PORT-DLID-MAPPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch 
relay due to DLID mapping errors.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_dlid_mapping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-dlid-mapping-errors,PortDLIDMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-dlid-mapping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +956,NVSWITCH-LOCAL-LINK-INTEGRITY-ERRORS,LOCAL-LINK-INTEGRITY-ERRORS,The number of times that the count of local physical errors exceeded the threshold specified by LocalPhyErrors;,Error,Integer,GB200 NVL HMC; GB200 NVL BMC; GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_local_link_integrity_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name]/infiniband/state/counters/port/local-link-integrity-errors,LocalLinkIntegrityErrors,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after 
branch build-test-lint review +957,NVSWITCH-PORT-VL-MAPPING-ERRORS,PORT-VL-MAPPING-ERRORS,"Packet discards due to VL mapping behavior are not considered errors, so the behavior of this counter is implementation dependent. However, it is recommended that this counter be used to count the total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to VL mapping behavior",Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_vl_mapping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-vl-mapping-errors,PortVLMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-vl-mapping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +958,NVSWITCH-PORT-LOOPING-ERRORS,PORT-LOOPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to looping errors (output port = input port). 
This applies to switches only.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_looping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-looping-errors,PortLoopingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-looping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +959,NVSWITCH-PORT-INACTIVE-DISCARDS,PORT-INACTIVE-DISCARDS,Number of outbound packets discarded by the port because it is not in the active state.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_inactive_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-inactive-discards,PortInactiveDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-inactive-discards}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST 
after branch build-test-lint review +960,NVSWITCH-LINK-WIDTH-ACTIVE,LINK-WIDTH-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_width_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/width,Link_width_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{lanes}}",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +961,NVSWITCH-PHY-MANAGER-STATE,PHY-MANAGER-STATE,Show some more info about the PHY state: 0:Disabled 1:Open_port 2:Polling 3:Active_or_Linkup 4:Close_port 5:Phy_up 7:Rx_disable,Status,Text,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_manager_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/phy-manager-state,Phy_Manager_State,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on 
GB200 NVLink Switch BMC/HOST after branch build-test-lint review +962,NVSWITCH-MTU,MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mtu MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/mtu,NA,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{mtu}}",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +963,NVSWITCH-MAX-SUPPORTED-MTU,MAX-SUPPORTED-MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_max_supported_mtu MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/max-supported-mtus,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{max-supported-mtu}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +964,NVSWITCH-SUPPORTED-WIDTH,SUPPORTED-WIDTH,Maximum Transmission 
Unit,Specs,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_supported_width MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/supported-widths,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{supported-lanes}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +965,NVSWITCH-VL-CAPABILITIES,VL-CAPABILITIES,Maximum Transmission Unit,Specs,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl_capabilities as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/vl-capabilities,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{vl-capabilities}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +966,NVSWITCH-FAN-STATE,FAN-STATE,Fan status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS 
gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fan_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name=FAN1/1]/state/oper-status,NA,nv show platform environment fan $FanId {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +967,NVSWITCH-FAN-LED,FAN-LED,Fan LED color,Sensor.Other,Text,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_fan_led as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,,nv show platform environment led $LedID {color},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +968,NVSWITCH-CABLE-PART-NUMBER,CABLE-PART-NUMBER,Cable part num,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_part_number as inventory/info event or state metric with bounded 
labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Cable_PN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-pn},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +969,NVSWITCH-CABLE-SERIAL-NUMBER,CABLE-SERIAL-NUMBER,Cabl Serial num,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_serial_number MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",Na,NA,Cable_SN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-sn},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +970,NVSWITCH-CABLE-TRANSMITTER-TECHNOLOGY,CABLE-TRANSMITTER-TECHNOLOGY,Active/Passive cable,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_transmitter_technology as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation 
must confirm the concrete device path/name.",,NA,cable_technology,TBD,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +971,NVSWITCH-CABLE-TYPE,CABLE-TYPE,Cable/module type: 0: Unidentified 1: Active_cable - (active copper / optics) 2: Optical_Module - (separated) 3: Passive_copper_cable 4: Cable_unplugged 5: Twisted_pair,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_type as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_type,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-type},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +972,NVSWITCH-CABLE-VENDOR,CABLE-VENDOR,Cable vendor: 0: Other 1: Mellanox 2: Known_OUI 3: NVIDIA,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_vendor as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_vendor,nv show platform transceiver $TransceiverId 
{TransceiverId: {status: Inserted}} {vendor-name},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +973,NVSWITCH-CABLE-LENGTH,CABLE-LENGTH,Cable length in 1m units. For CMIS modules: bits 6:7 represent cable_length_multiplier for calculating cable length 00 - 0.1 multiplier (0.1 to 6.3m) 01- 1 multiplier (1 to 63m) 10 - 10 multiplier (10 to 630m) 11 - 100 multiplier (100 to 6300m) bits 0:5 represent cable_length_value for calculating cable length. length is calculated with cable_length_value * cable_length_- multiplier,Specs,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_length MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_length,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-length},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +974,NVSWITCH-CABLE-IDENTIFIER,CABLE-IDENTIFIER,"0: QSFP28 1: QSFP_Plus 2: SFP28_or_SFP_Plus 3: QSA - (QSFP->SFP) 4: Backplane 5: SFP_DD 6: QSFP_DD 7: QSFP_CMIS 8: OSFP 9: C2C 10: DSFP 11: QSFP_Split_Cable identifiers that are CMIS compliant are: 5,6,7,8,10",Specs,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_identifier 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_identifier,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {identifier},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +975,NVSWITCH-CABLE-REV,CABLE-REV,ASCII Vendor revision aligned to right padded with 0h on the left,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rev as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,vendor_rev,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-rev},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +976,NVSWITCH-CABLE-FW-VERSION,CABLE-FW-VERSION,module FW version (relevant for optic only),Config,,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_fw_version MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can 
emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_fw_version,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {firmware},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +977,NVSWITCH-CABLE-RX-POWER-LANE0,CABLE-RX-POWER-LANE0,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rx_power_lane_0,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-1{rx-power{Power}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +978,NVSWITCH-CABLE-RX-POWER-LANE1,CABLE-RX-POWER-LANE1,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must 
confirm the concrete device path/name.",,NA,rx_power_lane_1,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-2{rx-power{Power}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +979,NVSWITCH-CABLE-DIAG-SUPPLY-VOLTAGE,CABLE-DIAG-SUPPLY-VOLTAGE,Internally measured supply voltage in 100uV (relevant for optic only),Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_diag_supply_voltage MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Module_Voltage,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {voltage{voltage}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +980,NVSWITCH-CABLE-TEMP,CABLE-TEMP,Module main temperature sensor measured on a unit scale of 1/256 C degrees(relevant for optic only),Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_temp MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Module_Temperature,nv show 
platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {temperature{temperature}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +981,NVSWITCH-CABLE-TEMP-ALARM,CABLE-TEMP-ALARM,Temperature warning threshold on a unit scale of 1/256 C degrees.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_temp_alarm MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag,Temp_flags,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +982,NVSWITCH-CABLE-VOLTAGE-ALARM,CABLE-VOLTAGE-ALARM,Voltage warning threshold on a unit scale of 100uV.,Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_voltage_alarm MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag,Vcc_flags,NA,,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +983,NVSWITCH-CABLE-TX-CDR-LOL,CABLE-TX-CDR-LOL,Bitmask for latched Tx cdr loss of lock flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_cdr_lol as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-cdr-lol,tx_cdr_lol,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +984,NVSWITCH-CABLE-RX-CDR-LOL,CABLE-RX-CDR-LOL,Bitmask for latched Rx cdr loss of lock flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_cdr_lol as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-cdr-lol,rx_cdr_lol,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +985,NVSWITCH-CABLE-TX-LOS,CABLE-TX-LOS,Bitmask for latched Tx loss of signal flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_los as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-los,tx_los,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +986,NVSWITCH-CABLE-RX-LOS,CABLE-RX-LOS,Bitmask for latched Rx loss of signal flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_los as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-los,rx_los,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +987,NVSWITCH-LINK-PARTNER-DESCRIPTION,LINK-PARTNER-DESCRIPTION,Description of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_description as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_description,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +988,NVSWITCH-LINK-PARTNER-NODE-GUID,LINK-PARTNER-NODE-GUID,GUID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate 
is justified,extend NmxtCollector mapping,nvswitch_link_partner_node_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_node_guid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +989,NVSWITCH-LINK-PARTNER-LID,LINK-PARTNER-LID,LID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_lid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_lid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +990,NVSWITCH-LINK-PARTNER-PORT-NUM,LINK-PARTNER-PORT-NUM,Port number of the link partner side (port that is connected to the port),Inventory,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_port_num MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware 
validation must confirm the concrete device path/name.",,NA,link_partner_port_num,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1174,NVSWITCH-CPU-STATE,CPU-STATE,CPU status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name=cpu]/state/oper-status,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1241,NVSWITCH-DRIVE-TEMP-CRITICAL,DRIVE-TEMP-CRITICAL,"Critical temperature threshold for drive, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during 
post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1242,NVSWITCH-DRIVE-TEMP-MAX,DRIVE-TEMP-MAX,Max temperature threshold for drive,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1243,NVSWITCH-DRIVE-TEMP-STATE,DRIVE-TEMP-STATE,Drive Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1244,NVSWITCH-DRIVE-TEMP-CURRENT,DRIVE-TEMP-CURRENT,Drive 
Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1245,NVSWITCH-HSC-VINDC-TEMP-CRITICAL,HSC-VINDC-TEMP-CRITICAL,"Critical temperature threshold for HSC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1246,NVSWITCH-HSC-VINDC-TEMP-MAX,HSC-VINDC-TEMP-MAX,Max temperature threshold for HSC,Config,Integer,GB200 NVL 
NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1247,NVSWITCH-HSC-VINDC-TEMP-STATE,HSC-VINDC-TEMP-STATE,HSC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1248,NVSWITCH-HSC-VINDC-TEMP-CURRENT,HSC-VINDC-TEMP-CURRENT,HSC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1249,NVSWITCH-PDB-CONV-TEMP-CRITICAL,PDB-CONV-TEMP-CRITICAL,"Critical temperature threshold for PDB, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1251,NVSWITCH-PDB-CONV-TEMP-STATE,PDB-CONV-TEMP-STATE,PDB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; 
prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1252,NVSWITCH-PDB-CONV-TEMP-CURRENT,PDB-CONV-TEMP-CURRENT,PDB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1253,NVSWITCH-PMIC-TEMP-CRITICAL,PMIC-TEMP-CRITICAL,"Critical temperature threshold for PMIC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or 
NMX-T before adding CLI collector",nvswitch_pmic_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1255,NVSWITCH-PMIC-TEMP-STATE,PMIC-TEMP-STATE,PMIC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pmic_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1256,NVSWITCH-PMIC-TEMP-CURRENT,PMIC-TEMP-CURRENT,PMIC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pmic_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1259,NVSWITCH-SWB-ASIC-PCB-TEMP-STATE,SWB-ASIC-PCB-TEMP-STATE,SWB ASIC PCB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_swb_asic_pcb_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1260,NVSWITCH-SWB-ASIC-PCB-TEMP-CURRENT,SWB-ASIC-PCB-TEMP-CURRENT,SWB ASIC PCB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_swb_asic_pcb_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1688,NVSWITCH-LINK-RECOVERY-SUCCESS-CNT,LINK-RECOVERY-SUCCESS-CNT,Successful recovery count in an active link. Counter resets on link flap.,"Status, Event",Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_recovery_success_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,successful_recovery_events,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1689,NVSWITCH-TOTAL-LINK-RECOVERY-SUCCESS-CNT,TOTAL-LINK-RECOVERY-SUCCESS-CNT,Total successful recovery count accumulated across link flaps.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_total_link_recovery_success_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the 
concrete device path/name.",,NA,total_successful_recovery_events,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1690,NVSWITCH-TIME-SINCE-LAST-RECOVERY,TIME-SINCE-LAST-RECOVERY,"Elapsed time since last recovery event, measured in seconds.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_since_last_recovery MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_since_last_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1691,NVSWITCH-TIME-BTWN-TWO-RECOVERIES,TIME-BTWN-TWO-RECOVERIES,"Time in msec between two last consecutive recoveries (success or fail) from exit of first to entry of second. 
When value is 0xFFFF, time is more than 1 minute.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_btwn_two_recoveries MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_between_last_2_recoveries,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1692,NVSWITCH-RECOVERY-ATTEMPTS-L1-CNT,RECOVERY-ATTEMPTS-L1-CNT,Number of first level (logical) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l1_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,last_host_logical_recovery_attempts_count,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1693,NVSWITCH-RECOVERY-ATTEMPTS-L2-CNT,RECOVERY-ATTEMPTS-L2-CNT,Number of second level (Serdes) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l2_cnt 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,last_host_serdes_feq_attempts_count,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1694,NVSWITCH-RECOVERY-CYCLE-DURATION,RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last logical recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_cycle_duration MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_in_last_host_logical_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1695,NVSWITCH-SERDES-RECOVERY-CYCLE-DURATION,SERDES-RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last Serdes recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serdes_recovery_cycle_duration MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",,NA,time_in_last_host_serdes_feq_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1696,NVSWITCH-CONTAIN-DRAIN-XMIT-DISCARD,CONTAIN-DRAIN-XMIT-DISCARD,Number of transmit discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_xmit_discard MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,contain_n_drain_xmit_discards,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1697,NVSWITCH-CONTAIN-DRAIN-RCV-DISCARD,CONTAIN-DRAIN-RCV-DISCARD,Number of receive discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_rcv_discard MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,contain_n_drain_rcv_discards,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+1698,NVSWITCH-DEVICE-NUM,DEVICE-NUM,Device number on tray,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_num MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,device_num_on_tray,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1699,NVSWITCH-BOARD-TYPE,BOARD-TYPE,board type,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_board_type as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,board_type,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1700,NVSWITCH-CHASSIS-SLOT-IDX,CHASSIS-SLOT-IDX,chassis slot index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_slot_idx MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the 
concrete device path/name.",,NA,chassis_slot_index,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1701,NVSWITCH-TRAY-IDX,TRAY-IDX,Tray index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_tray_idx MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,tray_index,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1702,NVSWITCH-TOPOLOGY-ID,TOPOLOGY-ID,Topology Id,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_topology_id MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,topology_id,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1703,NVSWITCH-CHASSIS-ID,CHASSIS-ID,Chassis Id,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_id as inventory/info event or state metric with bounded 
labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,chassis_id,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1704,NVSWITCH-RAW-ERR-LANE-2,RAW-ERR-LANE-2,Raw errors lane 2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_2 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Raw_Errors_Lane_2,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1705,NVSWITCH-RAW-ERR-LANE-3,RAW-ERR-LANE-3,Raw errors lane 3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_3 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Raw_Errors_Lane_3,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
+1706,NVSWITCH-RQ-NUM-WRFE,RQ-NUM-WRFE,RQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_wrfe MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rq_num_wrfe,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1707,NVSWITCH-RQ-NUM-LLE,RQ-NUM-LLE,RQ num LLE,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_lle MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rq_num_lle,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +1708,NVSWITCH-SQ-NUM-WRFE,SQ-NUM-WRFE,SQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_sq_num_wrfe MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,sq_num_wrfe,NA,,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2293,NVSWITCH-CABLE-OPER-STATUS,CABLE-OPER-STATUS,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_oper_status MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/transceiver-diag/state/module-oper-status,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2294,NVSWITCH-CABLE-SNR-MEDIA-LANE-N,CABLE-SNR-MEDIA-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,SOURCE UNLISTED live source resolution,,No catalog source listed for GB200 row,source-resolution required before live signoff,live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted,nvswitch_cable_snr_media_lane_n MetricSample,source_resolution_required,requires-live-source-resolution,"Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.",,NA,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2295,NVSWITCH-CABLE-SNR-HOST-LANE-N,CABLE-SNR-HOST-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,SOURCE UNLISTED live source resolution,,No catalog source listed for 
GB200 row,source-resolution required before live signoff,live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted,nvswitch_cable_snr_host_lane_n MetricSample,source_resolution_required,requires-live-source-resolution,"Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.",,NA,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2296,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_low_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-lower,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2297,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_low_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE 
gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-lower,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2298,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_high_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-upper,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +2299,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_high_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-upper,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index 184aad30dc..f27fe9ec94 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -1,6 +1,6 @@ # NVSWITCH telemetry GB200 source matrix -Generated from `.omx/artifacts/nvswitch_rows.csv` for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`: +Generated from sanitized Telemetry Catalog extraction artifacts for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`: - `Applicable for GB200 NVL HMC` - `Applicable for GB200 NVL BMC` @@ -15,29 +15,48 @@ CSV matrix: `docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv` ### Implementation status - already-covered-regression-required: 5 -- blocker-source-resolution: 2 -- gap-needs-implementation: 183 -- partial-needs-implementation: 3 +- covered-by-generic-infra-requires-live-validation: 150 +- requires-live-source-equivalent: 36 +- requires-live-source-resolution: 2 -### Current coverage +### Branch coverage status -- catalog_no_source_gap: 2 +- covered_generic_infra_unvalidated: 150 - covered_host_gnmi: 4 - covered_host_nmxt: 1 -- gap: 183 -- partial_host: 3 +- source_equivalent_required: 36 +- source_resolution_required: 2 ### Primary source -- BLOCKER source resolution: 2 - NMX-T: 57 - NVOS CLI: 36 - NVOS gNMI: 97 - Redfish Fabric/Switch/Port: 1 +- SOURCE UNLISTED live source resolution: 2 + +## GB200 branch implementation coverage + +The `nvswitch_telemetry_gaps` branch implements common GB+VR-friendly collector 
infrastructure for the GB200 phase: + +- Redfish BMC: enabled `nv-redfish` `telemetry-service`, added a switch-BMC-only TelemetryService collector, and emits every numeric/boolean/string `MetricReport` value as `redfish_telemetry_service` samples with report and source-property labels. +- BMC proxy: widened TelemetryService ACLs to `MetricReportDefinitions/*` and `MetricReports/*` so live GB200 validation is not limited to `NvidiaNMMetrics_0`. +- NMX-T HOST: preserves all numeric Prometheus samples instead of dropping unknown metric names; legacy `Effective_BER`, `Symbol_Errors`, and `Link_Down` metric names remain canonical. +- NVUE gNMI HOST: subscribes to `components`, `interfaces`, and `platform-general`; known current metrics keep their existing names, and previously unmapped leaves are emitted as source-qualified `nvswitch_*` samples. +- Config: `collectors.telemetry_service` is disabled by default, and `collectors.nvue.gnmi.paths.platform_general_enabled` is an explicit opt-in path gate; the example and live-validation configs enable the full GB200 switch collector set. + +The generic-preservation surfaces are behavior-locked by unit tests before live hardware validation: + +- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples` covers numeric, string/info, and boolean/state MetricReport values. +- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels` cover stable key identity for unknown Prometheus samples with extra labels. +- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric` cover previously unmapped interface leaves and platform-general string leaves. + +Rows that still have no catalog-listed source remain in scope: `CABLE-SNR-MEDIA-LANE-N` and `CABLE-SNR-HOST-LANE-N` are marked `requires-live-source-resolution` and must be checked during live validation. 
The generic Redfish MetricReport, NMX-T, and gNMI preservation paths will expose them if the device emits them; if not, open a source-owner follow-up immediately. ## Execution rules - Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete. - Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale. -- Rows marked `blocker-source-resolution` are not deferred; they require immediate source-resolution or escalation. +- Generic-preserved metrics must keep bounded identity labels: report id/URI/definition and metric id/property/identity for Redfish MetricReports, raw source metric plus sorted source-label identity for NMX-T, and full gNMI path plus endpoint/entity labels for gNMI. Redfish internal keys must use escaped raw MetricId/MetricProperty identity, and NMX-T generic keys must escape raw port/source/node/label identity, to avoid aliasing. Raw string metric values must not be emitted as labels. +- Rows marked `requires-live-source-resolution` or `requires-live-source-equivalent` remain in scope; they require live source proof or immediate escalation before GB200 signoff. - Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed. diff --git a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md index fb7d2ed36d..2b651d16a9 100644 --- a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md +++ b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md @@ -6,15 +6,15 @@ Generated during the GB200 NVSWITCH telemetry branch setup. - `Cargo.toml` pins `nv-redfish = { version = "0.10.0" }`. 
- `Cargo.lock` resolves `nv-redfish`, `nv-redfish-bmc-http`, `nv-redfish-core`, `nv-redfish-schema`, and `nv-redfish-csdl-compiler` to `0.10.0` from crates.io. -- `crates/health/Cargo.toml` enables standard health features but not `telemetry-service`. +- This branch enables `telemetry-service` in `crates/health/Cargo.toml` for the new Redfish TelemetryService collector. - The GB200 branch has a local `nv-redfish` worktree available for companion development only: - - `/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps` + - `${NV_REDFISH_WORKTREE}` - Branch: `nvswitch_telemetry_gaps` - Base: `origin/main` at `dbd2789c987fd320d263d87524fc25fde305bc7f` ## Refreshed upstream state -- Local `/Users/mkoci/Projects/nv-redfish` was fetched from `origin` on 2026-06-18. +- Local `${NV_REDFISH_SOURCE_CHECKOUT}` was fetched from `origin` on 2026-06-18. - Latest observed public tags: `v0.10.2`, `v0.10.1`, `v0.10.0`. - `v0.10.2` does not appear to contain Fabric/Switch/Port/NVSwitch changes relevant to this work. - `origin/main` includes a `telemetry-service` feature in `redfish/features.toml`. 
@@ -33,18 +33,18 @@ During local development, keep user-local absolute paths out of committed manife ```bash cargo test -p carbide-health --lib --no-run \ - --config 'patch.crates-io.nv-redfish.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/redfish"' + --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" ``` If companion changes touch internal nv-redfish crates, patch the affected packages too: ```bash cargo test -p carbide-health --lib --no-run \ - --config 'patch.crates-io.nv-redfish.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/redfish"' \ - --config 'patch.crates-io.nv-redfish-core.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/core"' \ - --config 'patch.crates-io.nv-redfish-schema.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/schema"' \ - --config 'patch.crates-io.nv-redfish-csdl-compiler.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/csdl-compiler"' \ - --config 'patch.crates-io.nv-redfish-bmc-http.path="/Users/mkoci/.config/superpowers/worktrees/nv-redfish/nvswitch_telemetry_gaps/bmc-http"' + --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" \ + --config "patch.crates-io.nv-redfish-core.path=\"${NV_REDFISH_WORKTREE}/core\"" \ + --config "patch.crates-io.nv-redfish-schema.path=\"${NV_REDFISH_WORKTREE}/schema\"" \ + --config "patch.crates-io.nv-redfish-csdl-compiler.path=\"${NV_REDFISH_WORKTREE}/csdl-compiler\"" \ + --config "patch.crates-io.nv-redfish-bmc-http.path=\"${NV_REDFISH_WORKTREE}/bmc-http\"" ``` ## Final MR strategy @@ -54,3 +54,9 @@ Do not commit local absolute path dependencies. Before final review, use one of 1. A released `nv-redfish` version containing companion support, with `Cargo.toml` and `Cargo.lock` updated accordingly. 2. 
A reviewer-approved git revision dependency if release timing blocks final integration. 3. A documented split where infra-controller names the required `nv-redfish` companion MR and keeps local path overrides out of the final diff. + +## Branch implementation update + +The GB200 branch consumes the typed TelemetryService API already present in `nv-redfish` 0.10.0 (`ServiceRoot::telemetry_service()`, `TelemetryService::metric_report_links()`, and `MetricReportLink::fetch()`). No local `nv-redfish` path dependency is committed. + +Direct Fabric/Switch/Port wrappers are still absent from `nv-redfish` 0.10.x and `origin/main` as inspected. The GB200 branch therefore uses Redfish TelemetryService MetricReports for BMC-side switch telemetry now, while keeping the local companion worktree available if live GB200 evidence proves that a required metric is only available from Fabric/Switch/Port resources and not from MetricReports, NMX-T, or gNMI. diff --git a/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml b/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml index 6bf66e685e..34d2c95fee 100644 --- a/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml +++ b/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml @@ -70,7 +70,9 @@ additional_issuer_cns = [] "GET /redfish/v1/UpdateService/FirmwareInventory/HGX_FW_BMC_0", "GET /redfish/v1/UpdateService/FirmwareInventory/HostBMC_0", "GET /redfish/v1/TelemetryService", + "GET /redfish/v1/TelemetryService/MetricReportDefinitions", + "GET /redfish/v1/TelemetryService/MetricReportDefinitions/*", "GET /redfish/v1/TelemetryService/MetricReports", - "GET /redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", + "GET /redfish/v1/TelemetryService/MetricReports/*", "GET /redfish/v1/TaskService/Tasks/*", ] From ca733f2275e291fe4e6ac5f8234d2d5ffb94d372 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 22 Jun 2026 21:57:47 -0400 Subject: [PATCH 05/25] feat(health): rework GB200 
NVSwitch telemetry to explicit live-validated mappings Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/Cargo.toml | 1 - crates/health/example/config.example.toml | 27 +- crates/health/src/collectors/mod.rs | 2 - crates/health/src/collectors/nmxt.rs | 541 +++--- .../health/src/collectors/nvue/gnmi/client.rs | 47 +- .../collectors/nvue/gnmi/sample_processor.rs | 1660 +++++++++++++---- .../health/src/collectors/nvue/rest/client.rs | 66 + .../src/collectors/nvue/rest/collector.rs | 178 +- .../src/collectors/telemetry_service.rs | 564 ------ crates/health/src/config.rs | 177 +- crates/health/src/discovery/cleanup.rs | 2 - crates/health/src/discovery/context.rs | 15 +- crates/health/src/discovery/spawn.rs | 67 +- crates/health/src/endpoint/sources.rs | 4 +- ...vswitch_telemetry_gb200_live_validation.md | 198 +- .../nvswitch_telemetry_gb200_matrix.csv | 388 ++-- .../health/nvswitch_telemetry_gb200_matrix.md | 119 +- ...vswitch_telemetry_nv_redfish_dependency.md | 18 +- .../files/carbide-bmc-proxy.toml | 4 +- 19 files changed, 2321 insertions(+), 1757 deletions(-) delete mode 100644 crates/health/src/collectors/telemetry_service.rs diff --git a/crates/health/Cargo.toml b/crates/health/Cargo.toml index 1605b2bee8..5423024726 100644 --- a/crates/health/Cargo.toml +++ b/crates/health/Cargo.toml @@ -73,7 +73,6 @@ nv-redfish = { workspace = true, features = [ "processors", "sensors", "storages", - "telemetry-service", "thermal", "update-service", "resource-status", diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index f9f98ed26f..6b33c8e299 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -56,7 +56,7 @@ port = 443 mac = "11:22:33:44:55:77" username = "admin" password = "secret" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, 
nmxt_enabled = true, slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" @@ -146,13 +146,6 @@ include_sensor_thresholds = true fetch_interval = "2m" fetch_concurrency = 4 -[collectors.telemetry_service] -poll_interval = "1m" -fetch_concurrency = 4 -# Empty means collect all Redfish TelemetryService MetricReports exposed by the switch BMC. -# For GB200 live validation this can be narrowed to ["NvidiaNMMetrics_0"] if needed. -metric_report_ids = [] - [collectors.firmware] firmware_refresh_interval = "30m" @@ -196,6 +189,12 @@ logs_state_file = "/tmp/logs_collector_{machine_id}.json" # ============================================================================== # Switch Host Collectors: What data to collect from NVLink Switch Hosts +# +# NMX-T and NVUE (REST + gNMI) are disabled by default in config.rs because +# they are only meaningful for switch host endpoints +# (switch.endpoint_role = "host"). This example enables all three for the +# GB200 NVLink switch-host scenario. Deploy only where the target endpoint is +# a switch host; non-switch hosts should omit these sections entirely. # ============================================================================== [collectors.nmxt] @@ -212,10 +211,11 @@ cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true -# NVUE gNMI streaming collector (switches only, disabled by default). -# Subscribes to gNMI SAMPLE paths and pushes metrics through the DataSink -# pipeline. PrometheusSink serves the /metrics endpoint; OtlpSink (when -# configured separately) pushes to an OTel Collector. +# NVUE gNMI streaming collector. Disabled by default in code; explicitly +# enabled here for the GB200 NVLink switch-host scenario. 
Subscribes to +# gNMI SAMPLE paths (components + interfaces) and pushes metrics through +# the DataSink pipeline. PrometheusSink serves the /metrics endpoint; +# OtlpSink (when configured separately) pushes to an OTel Collector. [collectors.nvue.gnmi] gnmi_port = 9339 sample_interval = "5m" @@ -226,7 +226,8 @@ system_events_enabled = true [collectors.nvue.gnmi.paths] components_enabled = true interfaces_enabled = true -# Defaults to false; enable for GB200 NVSwitch platform-general catalog leaves. +# Switch-level memory and disk utilization from `/platform-general/state` +# (a singleton, not keyed by interface or component name). platform_general_enabled = true # ============================================================================== diff --git a/crates/health/src/collectors/mod.rs b/crates/health/src/collectors/mod.rs index 2d3a699766..6499644edf 100644 --- a/crates/health/src/collectors/mod.rs +++ b/crates/health/src/collectors/mod.rs @@ -25,7 +25,6 @@ mod nmxt; mod nvue; mod runtime; mod sensors; -mod telemetry_service; pub use discovery::{EntityDiscoveryCollector, EntityDiscoveryCollectorConfig}; pub use entity_metrics::{MetricsCollector, MetricsCollectorConfig}; @@ -46,4 +45,3 @@ pub use runtime::{ StreamingCollectorStartContext, open_sse_stream, }; pub use sensors::{SensorCollector, SensorCollectorConfig}; -pub use telemetry_service::{TelemetryServiceCollector, TelemetryServiceCollectorConfig}; diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index ffa24bfee3..d338f7c313 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -17,8 +17,14 @@ //! This module collects metrics from NMX-T telemetry endpoints on NVLink switches if the service is enabled. //! Scrapes HTTP on 9352 (default for NMX-T) - NOT A Redfish collector! -//! Known switch metrics are emitted with existing canonical names; all other -//! 
numeric Prometheus samples are preserved as source-qualified NMX-T metrics. +//! +//! Mapping is an EXPLICIT, catalog-row allowlist over the live NMX-T Prometheus scrape (see +//! `NMXT_METRIC_MAP` and `NMXT_LABEL_MAP`). Each NMX-T source name is either: +//! * a numeric **family** -> emitted as one canonical `switch_nmxt` series (`NMXT_METRIC_MAP`), or +//! * an identity/inventory **label dimension** carried on every series -> re-exported as a +//! canonical label, never as a standalone metric (`NMXT_LABEL_MAP`). +//! +//! Source names not on either allowlist are skipped and counted only (never sanitized into telemetry). use std::borrow::Cow; use std::collections::HashMap; @@ -38,6 +44,110 @@ const NMXT_PORT: u16 = 9352; /// NMX-T endpoint const NMXT_ENDPOINT: &str = "/xcset/nvlink_domain_telemetry"; +/// Producer name for every emitted NMX-T series. Preserved across all mappings so the +/// downstream sink keeps a single `switch_nmxt` family. +const NMXT_PRODUCER: &str = "switch_nmxt"; + +/// Explicit allowlist: live NMX-T Prometheus **family** (numeric series) -> canonical mapping. +/// +/// Tuple is `(nmxt_source_name, metric_type, unit)`. One canonical series per catalog row; the +/// source name is matched verbatim against the scraped line name. Names absent from this table +/// (and from [`NMXT_LABEL_MAP`]) are never exported. Each entry was confirmed live in the GB200 +/// NMX-T scrape (Stage 0). Catalog rows are noted for traceability. 
+const NMXT_METRIC_MAP: &[(&str, &str, &str)] = &[ + // BER / error counters (existing mappings, retained) + ("Effective_BER", "effective_ber", "ratio"), + ("Symbol_Errors", "symbol_errors", "count"), // row 908 PHY-SYMBOL-ERRORS + ("Link_Down", "link_down", "count"), + // Identity / inventory numeric families + ("lid", "lid", "id"), // row 865 LID + ("device_hw_rev", "device_hw_rev", "id"), // row 869 DEVICE-HARDWARE-REVISION + // Status / link-down attribution + ("Advanced_Status_Opcode", "status_opcode", "code"), // row 945 STATUS-OPCODE + ("remote_reason_opcode", "remote_reason_opcode", "code"), // row 949 REMOTE-REASON-OPCODE + ("time_to_link_up_ext_msec", "time_to_link_up", "milliseconds"), // row 944 TIME-TO-LINKS-UP + // Cable optics (numeric families) + ("cable_technology", "cable_transmitter_technology", "code"), // row 970 CABLE-TRANSMITTER-TECHNOLOGY + ("rx_power_lane_0", "cable_rx_power_lane0", "milliwatts"), // row 977 CABLE-RX-POWER-LANE0 + ("rx_power_lane_1", "cable_rx_power_lane1", "milliwatts"), // row 978 CABLE-RX-POWER-LANE1 + ("Module_Voltage", "cable_diag_supply_voltage", "volts"), // row 979 CABLE-DIAG-SUPPLY-VOLTAGE + // Link partner + ("link_partner_lid", "link_partner_lid", "id"), // row 989 LINK-PARTNER-LID + // Recovery counters / timers + ("successful_recovery_events", "link_recovery_success_cnt", "count"), // row 1688 LINK-RECOVERY-SUCCESS-CNT + ("total_successful_recovery_events", "total_link_recovery_success_cnt", "count"), // row 1689 TOTAL-LINK-RECOVERY-SUCCESS-CNT + ("time_since_last_recovery", "time_since_last_recovery", "seconds"), // row 1690 TIME-SINCE-LAST-RECOVERY + ("time_between_last_2_recoveries", "time_btwn_two_recoveries", "seconds"), // row 1691 TIME-BTWN-TWO-RECOVERIES + ("last_host_logical_recovery_attempts_count", "recovery_attempts_l1_cnt", "count"), // row 1692 RECOVERY-ATTEMPTS-L1-CNT + ("last_host_serdes_feq_attempts_count", "recovery_attempts_l2_cnt", "count"), // row 1693 RECOVERY-ATTEMPTS-L2-CNT + 
("time_in_last_host_logical_recovery", "recovery_cycle_duration", "seconds"), // row 1694 RECOVERY-CYCLE-DURATION + ("time_in_last_host_serdes_feq_recovery", "serdes_recovery_cycle_duration", "seconds"), // row 1695 SERDES-RECOVERY-CYCLE-DURATION + // Contain-and-drain discards + ("contain_n_drain_xmit_discards", "contain_drain_xmit_discard", "count"), // row 1696 CONTAIN-DRAIN-XMIT-DISCARD + ("contain_n_drain_rcv_discards", "contain_drain_rcv_discard", "count"), // row 1697 CONTAIN-DRAIN-RCV-DISCARD + // Raw error lanes + ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), // row 1704 RAW-ERR-LANE-2 + ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), // row 1705 RAW-ERR-LANE-3 +]; + +/// Explicit allowlist: live NMX-T Prometheus **label** key -> canonical label name. +/// +/// These catalog rows are identity/inventory dimensions, not standalone metrics. NMX-T carries +/// them as labels on every series, so they are re-exported as canonical labels on each emitted +/// `switch_nmxt` sample (consistent with the existing `node_guid` / `port_num` handling). They are +/// never emitted as their own metric family. Tuple is `(nmxt_label_key, canonical_label_name)`. +/// Catalog rows are noted for traceability. 
+const NMXT_LABEL_MAP: &[(&str, &str)] = &[ + ("FW_Version", "net_fw_ver"), // row 763 NET-FW-VER + ("sw_serial_number", "serial"), // row 804 SERIAL + ("Node_GUID", "node_guid"), // row 806 NODE-GUID + ("port_guid", "port_guid"), // row 807 PORT-GUID + ("Port_Number", "port_num"), // row 866 PORT-NUMBER + ("port_label", "port_label"), // row 867 PORT-LABEL + ("sw_revision", "revision"), // row 868 REVISION + ("Active_FEC", "fec_mode_active"), // row 898 FEC-MODE-ACTIVE + ("Device_ID", "device_id"), // row 910 DEVICE-ID + ("Status_Message", "status_message"), // row 946 STATUS-MESSAGE + ("down_blame", "down_blame"), // row 947 DOWN-BLAME + ("local_reason_opcode", "local_reason_opcode"), // row 948 LOCAL-REASON-OPCODE + ("Cable_PN", "cable_part_number"), // row 968 CABLE-PART-NUMBER + ("Cable_SN", "cable_serial_number"), // row 969 CABLE-SERIAL-NUMBER + ("cable_type", "cable_type"), // row 971 CABLE-TYPE + ("cable_vendor", "cable_vendor"), // row 972 CABLE-VENDOR + ("cable_length", "cable_length"), // row 973 CABLE-LENGTH + ("cable_identifier", "cable_identifier"), // row 974 CABLE-IDENTIFIER + ("vendor_rev", "cable_rev"), // row 975 CABLE-REV + ("cable_fw_version", "cable_fw_version"), // row 976 CABLE-FW-VERSION + ("Module_Temperature", "cable_temp"), // row 980 CABLE-TEMP + ("link_partner_description", "link_partner_description"), // row 987 LINK-PARTNER-DESCRIPTION + ("link_partner_node_guid", "link_partner_node_guid"), // row 988 LINK-PARTNER-NODE-GUID + ("link_partner_port_num", "link_partner_port_num"), // row 990 LINK-PARTNER-PORT-NUM + ("device_num_on_tray", "device_num"), // row 1698 DEVICE-NUM + ("board_type", "board_type"), // row 1699 BOARD-TYPE + ("chassis_slot_index", "chassis_slot_idx"), // row 1700 CHASSIS-SLOT-IDX + ("tray_index", "tray_idx"), // row 1701 TRAY-IDX + ("topology_id", "topology_id"), // row 1702 TOPOLOGY-ID + ("chassis_id", "chassis_id"), // row 1703 CHASSIS-ID +]; + +/// Look up a live NMX-T family name in the explicit allowlist, 
returning `(metric_type, unit)`. +fn lookup_nmxt_metric(name: &str) -> Option<(&'static str, &'static str)> { + NMXT_METRIC_MAP + .iter() + .find(|(source, _, _)| *source == name) + .map(|(_, metric_type, unit)| (*metric_type, *unit)) +} + +/// Look up a live NMX-T label key in the explicit allowlist, returning the canonical label name. +/// Test-only helper; production re-exports labels by iterating `NMXT_LABEL_MAP` directly in `build_labels`. +#[cfg(test)] +fn lookup_nmxt_label(key: &str) -> Option<&'static str> { + NMXT_LABEL_MAP + .iter() + .find(|(source, _)| *source == key) + .map(|(_, canonical)| *canonical) +} + /// Prometheus text -> NmxtMetricSample #[derive(Debug, Clone)] struct NmxtMetricSample { @@ -208,6 +318,30 @@ impl NmxtCollector { } } + /// Build the canonical label set for one emitted `switch_nmxt` series. + /// + /// Always carries `switch_id` / `switch_ip`. Identity and inventory dimensions are re-exported + /// from the scraped sample only when their NMX-T label key is on the explicit + /// [`NMXT_LABEL_MAP`] allowlist; their canonical names come from that map. Label keys not on + /// the allowlist are dropped (never sanitized into exported labels). 
+ fn build_labels( + &self, + switch_ip: &str, + sample_labels: &HashMap, + ) -> Vec<(Cow<'static, str>, String)> { + let mut labels: Vec<(Cow<'static, str>, String)> = Vec::with_capacity(2 + NMXT_LABEL_MAP.len()); + labels.push((Cow::Borrowed("switch_id"), self.switch_id.clone())); + labels.push((Cow::Borrowed("switch_ip"), switch_ip.to_string())); + + for (source_key, canonical) in NMXT_LABEL_MAP { + if let Some(value) = sample_labels.get(*source_key) { + labels.push((Cow::Borrowed(*canonical), value.clone())); + } + } + + labels + } + async fn scrape_iteration(&self) -> Result<(), HealthError> { let switch_ip = self.endpoint.addr.ip.to_string(); @@ -215,51 +349,42 @@ impl NmxtCollector { self.emit_event(CollectorEvent::MetricCollectionStart); + // Count of scraped families not on the explicit allowlist. These are skipped (never + // sanitized into telemetry) and only reported diagnostically. + let mut unmapped_families = 0u64; + for sample in metrics { let NmxtMetricSample { name, - labels: mut sample_labels, + labels: sample_labels, value, } = sample; - let port_num = sample_labels.remove("Port_Number").unwrap_or_default(); - let node_guid = sample_labels.remove("Node_GUID").unwrap_or_default(); - let known_legacy_metric = matches!( - name.as_str(), - "Effective_BER" | "Symbol_Errors" | "Link_Down" - ); - let metric_type = match name.as_str() { - "Effective_BER" => "effective_ber".to_string(), - "Symbol_Errors" => "symbol_errors".to_string(), - "Link_Down" => "link_down".to_string(), - _ => sanitize_metric_token(&name), + // Explicit family allowlist: an unknown source name is dropped and counted only. 
+ let Some((metric_type, unit)) = lookup_nmxt_metric(&name) else { + unmapped_families += 1; + continue; }; - let metric_key = if known_legacy_metric { - legacy_metric_key(&metric_type, &port_num) - } else { - generic_metric_key(&metric_type, &name, &port_num, &node_guid, &sample_labels) - }; + // Port number anchors the per-series key; sourced from the explicit label dimension. + let port_num = sample_labels + .get("Port_Number") + .cloned() + .unwrap_or_default(); - let mut labels = vec![ - (Cow::Borrowed("switch_id"), self.switch_id.clone()), - (Cow::Borrowed("switch_ip"), switch_ip.clone()), - (Cow::Borrowed("node_guid"), node_guid), - (Cow::Borrowed("port_num"), port_num), - ]; - if !known_legacy_metric { - labels.push((Cow::Borrowed("source_metric"), name)); - } - for (label_name, label_value) in sample_labels { - labels.push((Cow::Owned(sanitize_label_name(&label_name)), label_value)); - } + let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); + metric_key.push_str(metric_type); + metric_key.push(':'); + metric_key.push_str(&port_num); + + let labels = self.build_labels(&switch_ip, &sample_labels); self.emit_event(CollectorEvent::Metric( MetricSample { key: metric_key, - name: "switch_nmxt".to_string(), - metric_type, - unit: "count".to_string(), + name: NMXT_PRODUCER.to_string(), + metric_type: metric_type.to_string(), + unit: unit.to_string(), value, labels, context: None, @@ -268,126 +393,18 @@ impl NmxtCollector { )); } - self.emit_event(CollectorEvent::MetricCollectionEnd); - - Ok(()) - } -} - -fn legacy_metric_key(metric_type: &str, port_num: &str) -> String { - let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); - metric_key.push_str(metric_type); - metric_key.push(':'); - metric_key.push_str(port_num); - metric_key -} - -fn generic_metric_key( - metric_type: &str, - source_metric: &str, - port_num: &str, - node_guid: &str, - sample_labels: &HashMap, -) -> String { - let mut metric_key = 
metric_type.to_string(); - - append_metric_key_identity(&mut metric_key, "port_num", port_num); - append_metric_key_identity(&mut metric_key, "source_metric", source_metric); - append_metric_key_identity(&mut metric_key, "node_guid", node_guid); - - let mut identity_labels = sample_labels - .iter() - .map(|(label_name, label_value)| (sanitize_label_name(label_name), label_name, label_value)) - .collect::>(); - identity_labels.sort_by( - |(left_sanitized, left_name, left_value), (right_sanitized, right_name, right_value)| { - left_sanitized - .cmp(right_sanitized) - .then_with(|| left_name.cmp(right_name)) - .then_with(|| left_value.cmp(right_value)) - }, - ); - - for (_, label_name, label_value) in identity_labels { - append_metric_key_identity(&mut metric_key, "label_name", label_name); - append_metric_key_identity(&mut metric_key, "label_value", label_value); - } - - metric_key -} - -fn append_metric_key_identity( - metric_key: &mut String, - component_name: &str, - component_value: &str, -) { - if component_value.is_empty() { - return; - } - metric_key.push(':'); - metric_key.push_str(&escape_metric_key_component(component_name)); - metric_key.push('='); - metric_key.push_str(&escape_metric_key_component(component_value)); -} - -fn escape_metric_key_component(value: &str) -> String { - let mut escaped = String::with_capacity(value.len()); - for byte in value.bytes() { - match byte { - b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' 
| b'~' => { - escaped.push(byte as char); - } - _ => { - escaped.push('%'); - escaped.push(hex_digit(byte >> 4)); - escaped.push(hex_digit(byte & 0x0f)); - } + if unmapped_families > 0 { + tracing::debug!( + switch_id = %self.switch_id, + count = unmapped_families, + "skipped NMX-T families not on explicit allowlist" + ); } - } - escaped -} -fn hex_digit(nibble: u8) -> char { - match nibble { - 0..=9 => (b'0' + nibble) as char, - 10..=15 => (b'A' + nibble - 10) as char, - _ => unreachable!("hex nibble is always <= 15"), - } -} - -fn sanitize_metric_token(value: &str) -> String { - let mut token = String::with_capacity(value.len()); - let mut previous_was_separator = false; - let chars = value.chars().collect::>(); - for (index, ch) in chars.iter().copied().enumerate() { - if ch.is_ascii_alphanumeric() { - let previous = index.checked_sub(1).and_then(|i| chars.get(i)).copied(); - let next = chars.get(index + 1).copied(); - let starts_word = ch.is_ascii_uppercase() - && !previous_was_separator - && previous.is_some_and(|prev| prev.is_ascii_alphanumeric()) - && (previous - .is_some_and(|prev| prev.is_ascii_lowercase() || prev.is_ascii_digit()) - || next.is_some_and(|next| next.is_ascii_lowercase())); - if starts_word { - token.push('_'); - } - token.push(ch.to_ascii_lowercase()); - previous_was_separator = false; - } else if !previous_was_separator { - token.push('_'); - previous_was_separator = true; - } - } - token.trim_matches('_').to_string() -} + self.emit_event(CollectorEvent::MetricCollectionEnd); -fn sanitize_label_name(value: &str) -> String { - let mut label = sanitize_metric_token(value); - if label.chars().next().is_some_and(|ch| ch.is_ascii_digit()) { - label.insert(0, '_'); + Ok(()) } - label } #[cfg(test)] @@ -433,90 +450,170 @@ Link_Down{Port_Number="1"} 5 assert_eq!(samples.len(), 4); } - #[test] - fn unknown_nmxt_metric_names_are_sanitized_instead_of_dropped() { - assert_eq!( - sanitize_metric_token("PortMalformedPacketErrors"), - 
"port_malformed_packet_errors" - ); - assert_eq!(sanitize_label_name("Lane-Number"), "lane_number"); - assert_eq!(sanitize_label_name("8b10b"), "_8b10b"); - } + /// Representative live NMX-T `lid` series carrying the full identity/inventory label set. + /// Mirrors the Stage-0 GB200 scrape (`nmxt-prometheus.txt`). + const SAMPLE_LID_LINE: &str = r#"lid{Device_ID="GB100", port_label="GPUP10", logical_state="ACT", device_num_on_tray="2", board_type="3", chassis_slot_index="27", tray_index="17", topology_id="128", chassis_id="1820325172739", Active_FEC="Int_KP4_FEC_PLR", link_partner_description="MF0;sw06:N5400_LD/U1", link_partner_node_guid="0x2c5eab0300b6a900", link_partner_port_num="71", cable_vendor="Other", down_blame="Unknown", local_reason_opcode="No_link_down_indication", Node_GUID="0xe1d04a69816f16bc", node_description="GB100 Nvidia Technologies", Port_Number="11", FW_Version="36.2014.1866", Cable_PN="NA", Cable_SN="NA", cable_type="850 nm VCSEL", cable_length="NA", cable_identifier="Backplane", vendor_rev="NA", cable_fw_version="N/A", Module_Temperature="0C", Status_Message="No issue was observed", port_guid="0xe1d04a69816f16c6", sw_serial_number="MT123", sw_revision="A1", remote_reason_opcode="4"} 3093 1781993954087"#; + // Catalog row -> NMX-T family -> (metric_type, unit). One row per explicit family mapping. 
#[test] - fn generic_metric_key_includes_sorted_extra_label_identity() { - let labels = HashMap::from([ - ("Lane-Number".to_string(), "3".to_string()), - ("Device".to_string(), "nvswitch0".to_string()), - ]); - - assert_eq!( - generic_metric_key( - "port_malformed_packet_errors", - "PortMalformedPacketErrors", - "4", - "0x8e2161c8803caf64", - &labels, + fn test_nmxt_metric_map_locks_type_and_unit() { + let expected: &[(&str, &str, &str)] = &[ + ("Effective_BER", "effective_ber", "ratio"), + ("Symbol_Errors", "symbol_errors", "count"), + ("Link_Down", "link_down", "count"), + ("lid", "lid", "id"), + ("device_hw_rev", "device_hw_rev", "id"), + ("Advanced_Status_Opcode", "status_opcode", "code"), + ("remote_reason_opcode", "remote_reason_opcode", "code"), + ("time_to_link_up_ext_msec", "time_to_link_up", "milliseconds"), + ("cable_technology", "cable_transmitter_technology", "code"), + ("rx_power_lane_0", "cable_rx_power_lane0", "milliwatts"), + ("rx_power_lane_1", "cable_rx_power_lane1", "milliwatts"), + ("Module_Voltage", "cable_diag_supply_voltage", "volts"), + ("link_partner_lid", "link_partner_lid", "id"), + ("successful_recovery_events", "link_recovery_success_cnt", "count"), + ( + "total_successful_recovery_events", + "total_link_recovery_success_cnt", + "count", ), - "port_malformed_packet_errors:port_num=4:source_metric=PortMalformedPacketErrors:node_guid=0x8e2161c8803caf64:label_name=Device:label_value=nvswitch0:label_name=Lane-Number:label_value=3" - ); + ("time_since_last_recovery", "time_since_last_recovery", "seconds"), + ("time_between_last_2_recoveries", "time_btwn_two_recoveries", "seconds"), + ( + "last_host_logical_recovery_attempts_count", + "recovery_attempts_l1_cnt", + "count", + ), + ( + "last_host_serdes_feq_attempts_count", + "recovery_attempts_l2_cnt", + "count", + ), + ("time_in_last_host_logical_recovery", "recovery_cycle_duration", "seconds"), + ( + "time_in_last_host_serdes_feq_recovery", + "serdes_recovery_cycle_duration", + "seconds", + 
), + ("contain_n_drain_xmit_discards", "contain_drain_xmit_discard", "count"), + ("contain_n_drain_rcv_discards", "contain_drain_rcv_discard", "count"), + ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), + ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), + ]; + + for (source, metric_type, unit) in expected { + assert_eq!( + lookup_nmxt_metric(source), + Some((*metric_type, *unit)), + "family `{source}` must map to ({metric_type}, {unit})" + ); + } + // The allowlist must contain exactly these explicit families (no extras, no generic). + assert_eq!(NMXT_METRIC_MAP.len(), expected.len()); } + // Catalog identity/inventory row -> NMX-T label key -> canonical label name. #[test] - fn generic_metric_key_includes_raw_source_metric_to_avoid_sanitized_name_aliasing() { - let labels = HashMap::new(); - - assert_ne!( - generic_metric_key("rx_errors", "RxErrors", "1", "", &labels), - generic_metric_key("rx_errors", "rx-errors", "1", "", &labels), - ); + fn test_nmxt_label_map_locks_canonical_names() { + let expected: &[(&str, &str)] = &[ + ("FW_Version", "net_fw_ver"), + ("sw_serial_number", "serial"), + ("Node_GUID", "node_guid"), + ("port_guid", "port_guid"), + ("Port_Number", "port_num"), + ("port_label", "port_label"), + ("sw_revision", "revision"), + ("Active_FEC", "fec_mode_active"), + ("Device_ID", "device_id"), + ("Status_Message", "status_message"), + ("down_blame", "down_blame"), + ("local_reason_opcode", "local_reason_opcode"), + ("Cable_PN", "cable_part_number"), + ("Cable_SN", "cable_serial_number"), + ("cable_type", "cable_type"), + ("cable_vendor", "cable_vendor"), + ("cable_length", "cable_length"), + ("cable_identifier", "cable_identifier"), + ("vendor_rev", "cable_rev"), + ("cable_fw_version", "cable_fw_version"), + ("Module_Temperature", "cable_temp"), + ("link_partner_description", "link_partner_description"), + ("link_partner_node_guid", "link_partner_node_guid"), + ("link_partner_port_num", "link_partner_port_num"), + ("device_num_on_tray", 
"device_num"), + ("board_type", "board_type"), + ("chassis_slot_index", "chassis_slot_idx"), + ("tray_index", "tray_idx"), + ("topology_id", "topology_id"), + ("chassis_id", "chassis_id"), + ]; + + for (key, canonical) in expected { + assert_eq!( + lookup_nmxt_label(key), + Some(*canonical), + "label `{key}` must map to canonical `{canonical}`" + ); + } + assert_eq!(NMXT_LABEL_MAP.len(), expected.len()); } + // Unknown NMX-T source names are not on either allowlist (never sanitized into telemetry). #[test] - fn generic_metric_key_escapes_identity_delimiters_to_avoid_aliasing() { - let labels_with_delimiter_value = HashMap::from([("b".to_string(), "c:d=e".to_string())]); - let labels_split_by_delimiters = HashMap::from([ - ("b".to_string(), "c".to_string()), - ("d".to_string(), "e".to_string()), - ]); - - assert_ne!( - generic_metric_key( - "rx_errors", - "RxErrors", - "1", - "", - &labels_with_delimiter_value - ), - generic_metric_key( - "rx_errors", - "RxErrors", - "1", - "", - &labels_split_by_delimiters - ) - ); - - assert_ne!( - generic_metric_key( - "rx_errors", - "RxErrors:node_guid=x", - "1", - "", - &HashMap::new() - ), - generic_metric_key("rx_errors", "RxErrors", "1", "x", &HashMap::new()) - ); + fn test_unknown_nmxt_sources_not_allowlisted() { + // Live-but-blocked families and arbitrary unknowns: all must be rejected. + for unknown in [ + "HiRetransmissionRate", // row 931, not live + "rq_num_wrfe", // row 1706, not live + "rq_num_lle", // row 1707, not live + "sq_num_wrfe", // row 1708, not live + "Chip_Temp", // threshold blocker, not an NMX-T explicit mapping + "totally_made_up_metric", + ] { + assert!( + lookup_nmxt_metric(unknown).is_none(), + "`{unknown}` must not be an allowlisted family" + ); + assert!( + lookup_nmxt_label(unknown).is_none(), + "`{unknown}` must not be an allowlisted label" + ); + } } + // End-to-end: a live family line yields one canonical key and re-exported allowlisted labels. 
#[test] - fn generic_metric_key_distinguishes_same_port_samples_by_extra_labels() { - let first = HashMap::from([("Lane".to_string(), "0".to_string())]); - let second = HashMap::from([("Lane".to_string(), "1".to_string())]); + fn test_label_map_reexports_identity_dims_from_live_series() { + let sample = parse_prometheus_line(SAMPLE_LID_LINE).expect("parse lid line"); + assert_eq!(sample.name, "lid"); + + // Resolve canonical labels exactly as build_labels would (allowlist-gated). + let mut canonical = HashMap::new(); + for (source_key, canonical_name) in NMXT_LABEL_MAP { + if let Some(value) = sample.labels.get(*source_key) { + canonical.insert(*canonical_name, value.clone()); + } + } - assert_ne!( - generic_metric_key("rx_errors", "RxErrors", "1", "", &first), - generic_metric_key("rx_errors", "RxErrors", "1", "", &second) - ); + // Identity/inventory rows are present as labels with their canonical names. + assert_eq!(canonical.get("node_guid"), Some(&"0xe1d04a69816f16bc".to_string())); // 806 + assert_eq!(canonical.get("port_guid"), Some(&"0xe1d04a69816f16c6".to_string())); // 807 + assert_eq!(canonical.get("port_num"), Some(&"11".to_string())); // 866 + assert_eq!(canonical.get("port_label"), Some(&"GPUP10".to_string())); // 867 + assert_eq!(canonical.get("net_fw_ver"), Some(&"36.2014.1866".to_string())); // 763 + assert_eq!(canonical.get("serial"), Some(&"MT123".to_string())); // 804 + assert_eq!(canonical.get("revision"), Some(&"A1".to_string())); // 868 + assert_eq!(canonical.get("device_id"), Some(&"GB100".to_string())); // 910 + assert_eq!(canonical.get("fec_mode_active"), Some(&"Int_KP4_FEC_PLR".to_string())); // 898 + assert_eq!(canonical.get("cable_part_number"), Some(&"NA".to_string())); // 968 + assert_eq!(canonical.get("cable_temp"), Some(&"0C".to_string())); // 980 + assert_eq!(canonical.get("chassis_id"), Some(&"1820325172739".to_string())); // 1703 + assert_eq!( + canonical.get("link_partner_node_guid"), + Some(&"0x2c5eab0300b6a900".to_string()) + 
); // 988 + + // node_description is present on the series but NOT allowlisted -> not re-exported. + assert!(!canonical.contains_key("node_description")); } } diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 01544d0b08..1db5050f44 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -63,11 +63,19 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { }); } if paths_config.platform_general_enabled { + // switch-level singleton: `/platform-general/state` carries the memory + // and disk utilization leaves (no interface/component name key). paths.push(Path { - elem: vec![PathElem { - name: "platform-general".into(), - key: Default::default(), - }], + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "state".into(), + key: Default::default(), + }, + ], ..Default::default() }); } @@ -448,9 +456,9 @@ mod tests { } #[test] - fn test_nvue_subscribe_paths_defaults_do_not_enable_platform_general() { + fn test_nvue_subscribe_paths_all_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); - assert_eq!(paths.len(), 2); + assert_eq!(paths.len(), 3); assert_eq!(paths[0].elem.len(), 2); assert_eq!(paths[0].elem[0].name, "components"); @@ -459,18 +467,10 @@ mod tests { assert_eq!(paths[1].elem.len(), 2); assert_eq!(paths[1].elem[0].name, "interfaces"); assert_eq!(paths[1].elem[1].name, "interface"); - } - #[test] - fn test_nvue_subscribe_paths_all_enabled() { - let paths = nvue_subscribe_paths(&NvueGnmiPaths { - components_enabled: true, - interfaces_enabled: true, - platform_general_enabled: true, - }); - assert_eq!(paths.len(), 3); - assert_eq!(paths[2].elem.len(), 1); + assert_eq!(paths[2].elem.len(), 2); assert_eq!(paths[2].elem[0].name, "platform-general"); + assert_eq!(paths[2].elem[1].name, "state"); } #[test] @@ -486,6 +486,19 @@ mod tests { 
assert_eq!(paths[0].elem[1].name, "interface"); } + #[test] + fn test_nvue_subscribe_paths_platform_general_only() { + let paths = nvue_subscribe_paths(&NvueGnmiPaths { + components_enabled: false, + interfaces_enabled: false, + platform_general_enabled: true, + }); + assert_eq!(paths.len(), 1); + assert_eq!(paths[0].elem.len(), 2); + assert_eq!(paths[0].elem[0].name, "platform-general"); + assert_eq!(paths[0].elem[1].name, "state"); + } + #[test] fn test_nvue_subscribe_paths_none_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths { @@ -522,7 +535,7 @@ mod tests { let prefix = sub_list.prefix.expect("prefix must be set"); assert_eq!(prefix.target, "nvos", "target must be nvos"); - assert_eq!(sub_list.subscription.len(), 2); + assert_eq!(sub_list.subscription.len(), 3); for sub in &sub_list.subscription { assert_eq!( sub.mode, diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 2d34f0dc45..e24d9be779 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -94,20 +94,15 @@ impl GnmiSampleProcessor { if let Some(iface) = find_elem_key_ref(&combined, "interface", "name") { entities.insert(("interface", iface)); - if !self.process_interface_metric(&combined, iface, val) { - self.emit_generic_leaf_metric(&combined, "interface", iface, val); - } + self.process_interface_metric(&combined, iface, val); } else if let Some(comp) = find_elem_key_ref(&combined, "component", "name") { entities.insert(("component", comp)); - if !self.process_component_metric(&combined, comp, val) { - self.emit_generic_leaf_metric(&combined, "component", comp, val); - } - } else if combined - .first() - .is_some_and(|elem| elem.name == "platform-general") - { - entities.insert(("platform", "platform-general")); - self.emit_generic_leaf_metric(&combined, "platform", "platform-general", val); + 
self.process_component_metric(&combined, comp, val); + } else if combined.iter().any(|e| e.name == "platform-general") { + // switch-level singleton: no interface/component name key. Count + // it as a single entity so monitored_entities stays accurate. + entities.insert(("platform-general", "")); + self.process_platform_general_metric(&combined, val); } } @@ -119,182 +114,147 @@ impl GnmiSampleProcessor { elems: &[&PathElem], iface_name: &str, val: &proto::TypedValue, - ) -> bool { + ) { + // Explicit per-leaf canonical mappings for `/interfaces/interface`. Each + // arm is an allowlisted GB200 NVOS gNMI leaf proven live in the Stage-0 + // probe. Unknown leaves fall through and are never exported. if leaf_matches(elems, &["state", "oper-status"]) { let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); - self.emit_data_metric( - "interface_oper_status", - iface_name, - v, - "state", - "interface_name", - iface_name, - ); - true - } else if leaf_matches(elems, &["state", "counters", "in-errors"]) { - self.emit_numeric_metric_if_valid( - "interface_in_errors", - "count", - "interface_name", - iface_name, - elems, - val, - ); - true - } else if leaf_matches(elems, &["state", "counters", "out-errors"]) { - self.emit_numeric_metric_if_valid( - "interface_out_errors", - "count", - "interface_name", - iface_name, - elems, - val, - ); - true - } else if leaf_matches(elems, &["phy-diag", "state", "effective-ber"]) { - self.emit_numeric_metric_if_valid( - "interface_effective_ber", - "ratio", - "interface_name", - iface_name, - elems, - val, - ); - true - } else if leaf_matches(elems, &["phy-diag", "state", "symbol-ber"]) { - self.emit_numeric_metric_if_valid( - "interface_symbol_ber", - "ratio", - "interface_name", - iface_name, - elems, - val, - ); - true - } else if leaf_matches( - elems, - &["phy-diag", "state", "unintentional-link-down-events"], - ) { - self.emit_numeric_metric_if_valid( - "interface_link_down_events", - "count", - "interface_name", - 
iface_name, - elems, - val, - ); - true - } else { - false + self.emit_iface("interface_oper_status", iface_name, v, "state"); + } else if let Some(metric_type) = numeric_interface_leaf(elems) { + // numeric counters, gauges, and BER ratios share the same numeric + // coercion; the matched leaf decides the canonical metric_type/unit. + match typed_value_to_f64(val) { + Some(v) => self.emit_iface(metric_type.name, iface_name, v, metric_type.unit), + None => debug_unmapped_value(elems, val, metric_type.name), + } + } else if leaf_matches(elems, &["infiniband", "state", "physical-port-state"]) { + let v = physical_port_state_to_f64(typed_value_to_string(val).as_deref()); + self.emit_iface("interface_physical_port_state", iface_name, v, "state"); + } else if leaf_matches(elems, &["infiniband", "state", "logical-port-state"]) { + let v = logical_port_state_to_f64(typed_value_to_string(val).as_deref()); + self.emit_iface("interface_logical_port_state", iface_name, v, "state"); + } else if leaf_matches(elems, &["infiniband", "state", "speed"]) { + // NVOS types speed as a string/enum, but live GB200 emits bare + // numeric Gbps ("400", "100", "0"). Parse via the string path and + // normalize to Gbps; unparseable forms (e.g. "hdr") emit nothing. 
+ match link_speed_to_gbps(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_link_speed_active", iface_name, v, "gbps"), + None => debug_unmapped_value(elems, val, "interface_link_speed_active"), + } + } else if leaf_matches(elems, &["infiniband", "state", "width"]) { + match link_width_to_f64(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_link_width_active", iface_name, v, "lanes"), + None => debug_unmapped_value(elems, val, "interface_link_width_active"), + } + } else if leaf_matches(elems, &["infiniband", "state", "supported-widths"]) { + match link_width_to_f64(typed_value_to_string(val).as_deref()) { + Some(v) => self.emit_iface("interface_supported_width", iface_name, v, "lanes"), + None => debug_unmapped_value(elems, val, "interface_supported_width"), + } } } + /// emit a `/interfaces/interface` canonical series keyed on `interface_name` + fn emit_iface(&self, metric_type: &str, iface_name: &str, value: f64, unit: &str) { + self.emit_data_metric( + metric_type, + iface_name, + value, + unit, + "interface_name", + iface_name, + ); + } + fn process_component_metric( &self, elems: &[&PathElem], comp_name: &str, val: &proto::TypedValue, - ) -> bool { + ) { + // Explicit per-leaf canonical mappings for `/components/component`. The + // `component_name` label (e.g. "ASIC1", "FAN1/1", "cpu") distinguishes + // catalog rows that share a leaf (FAN-STATE and CPU-STATE both resolve + // to `state/oper-status`). Unknown leaves are never exported. 
if leaf_matches(elems, &["healthz", "state", "status"]) { let v = component_health_to_f64(typed_value_to_string(val).as_deref()); - self.emit_data_metric( - "component_health_status", - comp_name, - v, - "state", - "component_name", - comp_name, - ); - true - } else if leaf_matches(elems, &["state", "temperature", "instant"]) { - self.emit_numeric_metric_if_valid( - "component_temperature_celsius", - "celsius", - "component_name", - comp_name, - elems, - val, - ); - true - } else { - false + self.emit_comp("component_health_status", comp_name, v, "state"); + } else if leaf_matches(elems, &["state", "temperature", "instant"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_comp("component_temperature_celsius", comp_name, v, "celsius"); + } else if leaf_matches(elems, &["state", "oper-status"]) { + // FAN-STATE (row 966) and CPU-STATE (row 1174) share this leaf. + let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); + self.emit_comp("component_oper_status", comp_name, v, "state"); + } else if leaf_matches(elems, &["asic", "state", "asic-temp"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_comp("component_asic_temperature_celsius", comp_name, v, "celsius"); + } else if leaf_matches(elems, &["cpu", "utilization", "state", "avg"]) + && let Some(v) = typed_value_to_f64(val) + { + self.emit_comp("component_cpu_utilization", comp_name, v, "percent"); } } - fn emit_generic_leaf_metric( - &self, - elems: &[&PathElem], - entity_label_name: &'static str, - entity_label_value: &str, - val: &proto::TypedValue, - ) { - let Some(sink) = &self.data_sink else { return }; - let Some(leaf_name) = elems.last().map(|elem| elem.name.as_str()) else { - return; - }; - let Some((value, unit)) = typed_value_to_metric_value(val) else { + /// emit a `/components/component` canonical series keyed on `component_name` + fn emit_comp(&self, metric_type: &str, comp_name: &str, value: f64, unit: &str) { + self.emit_data_metric( + metric_type, + comp_name, + 
value, + unit, + "component_name", + comp_name, + ); + } + + fn process_platform_general_metric(&self, elems: &[&PathElem], val: &proto::TypedValue) { + // Explicit per-leaf canonical mappings for `/platform-general/state`. + // This is a switch-level singleton: only the four numeric memory/disk + // leaves proven live in the Stage-0 probe are mapped; every other + // platform-general leaf (contact, location, platform-name, ...) falls + // through and is never exported. + let metric_type = if leaf_matches(elems, &["state", "memory-used"]) { + "platform_memory_used" + } else if leaf_matches(elems, &["state", "memory-total-size"]) { + "platform_memory_total" + } else if leaf_matches(elems, &["state", "disk-total-size"]) { + "platform_disk_total" + } else if leaf_matches(elems, &["state", "disk-used"]) { + "platform_disk_used" + } else { return; }; - let metric_type = catalog_metric_type_for_leaf(leaf_name) - .map(str::to_string) - .unwrap_or_else(|| { - let leaf = sanitize_metric_token(leaf_name); - format!("nvswitch_{leaf}") - }); - let path = path_string(elems); - - let key = format!("{metric_type}:{entity_label_value}:{path}"); - let labels = vec![ - ( - Cow::Borrowed(entity_label_name), - entity_label_value.to_string(), - ), - (Cow::Borrowed("source_path"), path), - ]; + + match typed_value_to_f64(val) { + Some(v) => self.emit_switch(metric_type, v, "bytes"), + None => debug_unmapped_value(elems, val, metric_type), + } + } + + /// emit a switch-level singleton series. Unlike interface/component series + /// there is no per-entity name; endpoint identity is added by PrometheusSink + /// from EventContext. 
+ fn emit_switch(&self, metric_type: &str, value: f64, unit: &str) { + let Some(sink) = &self.data_sink else { return }; + sink.handle_event( &self.event_context, &CollectorEvent::Metric(Box::new(MetricSample { - key, + key: metric_type.to_string(), name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), - metric_type, - unit, + metric_type: metric_type.to_string(), + unit: unit.to_string(), value, - labels, + labels: Vec::new(), context: None, })), ); } - fn emit_numeric_metric_if_valid( - &self, - metric_type: &str, - unit: &str, - entity_label_name: &'static str, - entity_label_value: &str, - elems: &[&PathElem], - val: &proto::TypedValue, - ) { - if let Some(value) = typed_value_to_f64(val).filter(|value| value.is_finite()) { - self.emit_data_metric( - metric_type, - entity_label_value, - value, - unit, - entity_label_name, - entity_label_value, - ); - return; - } - - tracing::warn!( - metric_type, - source_path = %path_string(elems), - entity_label_name, - entity_label_value, - "nvue_gnmi SAMPLE: skipping known numeric leaf with invalid value" - ); - } - fn emit_data_metric( &self, metric_type: &str, @@ -333,103 +293,6 @@ impl GnmiSampleProcessor { } } -fn typed_value_to_metric_value(value: &proto::TypedValue) -> Option<(f64, String)> { - if let Some(value) = typed_value_to_f64(value) { - return value.is_finite().then_some((value, "value".to_string())); - } - let raw = typed_value_to_string(value)?; - if raw.eq_ignore_ascii_case("up") - || raw.eq_ignore_ascii_case("healthy") - || raw.eq_ignore_ascii_case("true") - { - return Some((1.0, "state".to_string())); - } - if raw.eq_ignore_ascii_case("down") - || raw.eq_ignore_ascii_case("unhealthy") - || raw.eq_ignore_ascii_case("false") - { - return Some((0.0, "state".to_string())); - } - Some((1.0, "info".to_string())) -} - -fn path_string(elems: &[&PathElem]) -> String { - elems - .iter() - .map(|elem| { - if elem.key.is_empty() { - elem.name.clone() - } else { - let mut keys = elem - .key - .iter() - .map(|(key, value)| 
format!("{key}={value}")) - .collect::>(); - keys.sort(); - format!("{}[{}]", elem.name, keys.join(",")) - } - }) - .collect::>() - .join("/") -} - -fn catalog_metric_type_for_leaf(leaf_name: &str) -> Option<&'static str> { - match leaf_name { - "link-downed" => Some("nvswitch_link_downed_counter"), - "port-malformed-packet-errors" => Some("nvswitch_port_malformed_packet_errors"), - "port-neighbor-mtu-discards" => Some("nvswitch_port_neighbor_mtu_discards"), - "out-discards" => Some("nvswitch_port_xmit_discards"), - "rcv-remote-phy-errors" => Some("nvswitch_port_rcv_remote_physical_errors"), - "rcv-switch-relay-errors" => Some("nvswitch_port_rcv_switch_relay_errors"), - "qp1-dropped" => Some("nvswitch_qp1dropped"), - "vl15-dropped" => Some("nvswitch_vl15_dropped"), - "physical-port-state" => Some("nvswitch_nvlink_status"), - "link-error-recovery" => Some("nvswitch_link_error_recovery_counter"), - "port-multi-cast-rcv-pkts" => Some("nvswitch_port_multicast_rcv_pkts"), - "port-multi-cast-xmit-pkts" => Some("nvswitch_port_multicast_xmit_pkts"), - "in-octets" => Some("nvswitch_port_rcv_data"), - "in-pkts" => Some("nvswitch_port_rcv_pkts"), - "port-uni-cast-rcv-pkts" => Some("nvswitch_port_unicast_rcv_pkts"), - "port-uni-cast-xmit-pkts" => Some("nvswitch_port_unicast_xmit_pkts"), - "out-octets" => Some("nvswitch_port_xmit_data"), - "out-pkts" => Some("nvswitch_port_xmit_pkts"), - "xmit-wait" => Some("nvswitch_port_xmit_wait"), - "raw-ber" => Some("nvswitch_raw_ber"), - "zero-hist" => Some("nvswitch_zero_hist"), - "raw-errors-ch-1" => Some("nvswitch_phy_raw_errors_lane0"), - "raw-errors-ch-2" => Some("nvswitch_phy_raw_errors_lane1"), - "raw-ber-ch-1" => Some("nvswitch_raw_ber_lane0"), - "raw-ber-ch-2" => Some("nvswitch_raw_ber_lane1"), - "effective-errors" => Some("nvswitch_phy_effective_errors"), - "time-since-last-clear-min" => Some("nvswitch_time_since_lasts_clear"), - "excessive-buffer-overrun" => Some("nvswitch_port_buffer_overrun_errors"), - "speed" => 
Some("nvswitch_link_speed_active"), - "width" => Some("nvswitch_link_width_active"), - "mtu" => Some("nvswitch_mtu"), - "max-supported-mtus" => Some("nvswitch_max_supported_mtu"), - "supported-widths" => Some("nvswitch_supported_width"), - "vl-capabilities" => Some("nvswitch_vl_capabilities"), - "local-link-integrity-errors" => Some("nvswitch_local_link_integrity_errors"), - "module-oper-status" => Some("nvswitch_cable_oper_status"), - _ => None, - } -} - -fn sanitize_metric_token(value: &str) -> String { - let mut token = String::with_capacity(value.len()); - let mut previous_was_separator = false; - for ch in value.chars() { - if ch.is_ascii_alphanumeric() { - token.push(ch.to_ascii_lowercase()); - previous_was_separator = false; - } else if !previous_was_separator { - token.push('_'); - previous_was_separator = true; - } - } - token.trim_matches('_').to_string() -} - fn find_elem_key_ref<'a>( elems: &[&'a PathElem], elem_name: &str, @@ -452,13 +315,426 @@ fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { .all(|(elem, name)| elem.name == *name) } +/// canonical (`metric_type`, `unit`) for an allowlisted numeric interface leaf +struct NumericLeaf { + name: &'static str, + unit: &'static str, +} + +/// Table-driven dispatch for numeric `/interfaces/interface` leaves. Every entry +/// is an explicit GB200 catalog mapping proven live in the Stage-0 probe; the +/// expected leaf path tail is matched against the live gNMI tree. Leaves not in +/// this table are never exported as metrics. 
+fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { + // (leaf path tail, metric_type, unit) + const TABLE: &[(&[&str], &str, &str)] = &[ + // OpenConfig interface counters (`/state/counters/*`) + ( + &["state", "counters", "in-errors"], + "interface_in_errors", + "count", + ), + ( + &["state", "counters", "out-errors"], + "interface_out_errors", + "count", + ), + ( + &["state", "counters", "out-discards"], + "interface_out_discards", + "count", + ), + ( + &["state", "counters", "in-octets"], + "interface_in_octets", + "bytes", + ), + ( + &["state", "counters", "out-octets"], + "interface_out_octets", + "bytes", + ), + ( + &["state", "counters", "in-pkts"], + "interface_in_packets", + "count", + ), + ( + &["state", "counters", "out-pkts"], + "interface_out_packets", + "count", + ), + // InfiniBand port counters (`/infiniband/state/counters/port/*`) + ( + &["infiniband", "state", "counters", "port", "link-downed"], + "interface_link_downed", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "link-error-recovery", + ], + "interface_link_error_recovery", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-remote-phy-errors", + ], + "interface_rcv_remote_physical_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-switch-relay-errors", + ], + "interface_rcv_switch_relay_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-constraints-errors", + ], + "interface_rcv_constraint_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "local-link-integrity-errors", + ], + "interface_local_link_integrity_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "excessive-buffer-overrun", + ], + "interface_port_buffer_overrun_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "qp1-dropped"], + "interface_qp1_dropped", + "count", + ), + ( + 
&["infiniband", "state", "counters", "port", "vl15-dropped"], + "interface_vl15_dropped", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "xmit-wait"], + "interface_port_xmit_wait", + "count", + ), + // NOTE: `infiniband/state/speed` is intentionally NOT in this numeric + // table. NVOS types it as a string/enum and the live GB200 form is a + // bare Gbps numeric; it is handled by a dedicated `link_speed_to_gbps` + // arm in `process_interface_metric` that emits unit `gbps`. + (&["infiniband", "state", "mtu"], "interface_mtu", "bytes"), + ( + &["infiniband", "state", "max-supported-mtus"], + "interface_max_supported_mtu", + "bytes", + ), + // phy-diag counters and ratios (`/phy-diag/state/*`) + (&["phy-diag", "state", "raw-ber"], "interface_raw_ber", "ratio"), + ( + &["phy-diag", "state", "effective-ber"], + "interface_effective_ber", + "ratio", + ), + (&["phy-diag", "state", "symbol-ber"], "interface_symbol_ber", "ratio"), + (&["phy-diag", "state", "raw-ber-ch-1"], "interface_raw_ber_lane0", "ratio"), + (&["phy-diag", "state", "raw-ber-ch-2"], "interface_raw_ber_lane1", "ratio"), + ( + &["phy-diag", "state", "raw-errors-ch-1"], + "interface_phy_raw_errors_lane0", + "count", + ), + ( + &["phy-diag", "state", "raw-errors-ch-2"], + "interface_phy_raw_errors_lane1", + "count", + ), + ( + &["phy-diag", "state", "effective-errors"], + "interface_phy_effective_errors", + "count", + ), + (&["phy-diag", "state", "zero-hist"], "interface_zero_hist", "count"), + ( + &["phy-diag", "state", "phy-received-bits"], + "interface_phy_received_bits", + "count", + ), + ( + &["phy-diag", "state", "port-malformed-packet-errors"], + "interface_port_malformed_packet_errors", + "count", + ), + ( + &["phy-diag", "state", "port-neighbor-mtu-discards"], + "interface_port_neighbor_mtu_discards", + "count", + ), + ( + &["phy-diag", "state", "port-multi-cast-rcv-pkts"], + "interface_port_multicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", 
"port-multi-cast-xmit-pkts"], + "interface_port_multicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-rcv-pkts"], + "interface_port_unicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-xmit-pkts"], + "interface_port_unicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-local-physical-errors"], + "interface_port_local_physical_errors", + "count", + ), + ( + &["phy-diag", "state", "sync-header-error-counter"], + "interface_sync_header_error_counter", + "count", + ), + ( + &["phy-diag", "state", "port-dlid-mapping-errors"], + "interface_port_dlid_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-vl-mapping-errors"], + "interface_port_vl_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-looping-errors"], + "interface_port_looping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-inactive-discards"], + "interface_port_inactive_discards", + "count", + ), + ( + &["phy-diag", "state", "rq-general-error"], + "interface_rq_general_error", + "count", + ), + (&["phy-diag", "state", "plr-rcv-codes"], "interface_plr_rcv_codes", "count"), + ( + &["phy-diag", "state", "plr-rcv-code-err"], + "interface_plr_rcv_codes_err", + "count", + ), + ( + &["phy-diag", "state", "plr-rcv-uncorrectable-code"], + "interface_plr_rcv_uncorrectables_code", + "count", + ), + (&["phy-diag", "state", "plr-xmit-codes"], "interface_plr_xmit_codes", "count"), + ( + &["phy-diag", "state", "plr-xmit-retry-codes"], + "interface_plr_xmit_retrys_codes", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-events"], + "interface_plr_xmit_retrys_events", + "count", + ), + ( + &["phy-diag", "state", "plr-sync-events"], + "interface_plr_sync_events", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-events-within-t-sec-max"], + "interface_plr_xmit_retry_codes_within_minute", + "count", + ), + // existing pre-branch mapping retained (leaf out of GB200 row set but + // 
restored upstream; kept so the canonical series is not dropped) + ( + &["phy-diag", "state", "unintentional-link-down-events"], + "interface_link_down_events", + "count", + ), + ]; + + // FEC histogram bins 0..=15 -> interface_fec_hist_{n} (rows 911..926) + if let Some(leaf) = elems.last().map(|e| e.name.as_str()) + && let Some(bin) = leaf.strip_prefix("rs-num-corr-err-bin") + && let Ok(n) = bin.parse::() + && n <= 15 + && leaf_matches(elems, &["phy-diag", "state", leaf]) + { + return Some(NumericLeaf { + name: FEC_HIST_NAMES[n], + unit: "count", + }); + } + + TABLE.iter().find_map(|&(tail, name, unit)| { + leaf_matches(elems, tail).then_some(NumericLeaf { name, unit }) + }) +} + +/// Stable, leaked-free metric_type names for FEC histogram bins 0..=15. The +/// catalog defines exactly 16 bins (FEC-HIST-0 .. FEC-HIST-15). +const FEC_HIST_NAMES: [&str; 16] = [ + "interface_fec_hist_0", + "interface_fec_hist_1", + "interface_fec_hist_2", + "interface_fec_hist_3", + "interface_fec_hist_4", + "interface_fec_hist_5", + "interface_fec_hist_6", + "interface_fec_hist_7", + "interface_fec_hist_8", + "interface_fec_hist_9", + "interface_fec_hist_10", + "interface_fec_hist_11", + "interface_fec_hist_12", + "interface_fec_hist_13", + "interface_fec_hist_14", + "interface_fec_hist_15", +]; + fn oper_status_to_f64(status: Option<&str>) -> f64 { match status { Some(s) if s.eq_ignore_ascii_case("up") => 1.0, + Some(s) if s.eq_ignore_ascii_case("active") => 1.0, + _ => 0.0, + } +} + +/// InfiniBand physical port state enum -> numeric code. Values observed live on +/// GB200: `LINK_UP`, `POLLING`, `PORT_CONFIGURATION_TRAINING`. 1.0 == link up. +fn physical_port_state_to_f64(state: Option<&str>) -> f64 { + match state { + Some(s) if s.eq_ignore_ascii_case("link_up") => 1.0, + Some(s) if s.eq_ignore_ascii_case("polling") => 2.0, + Some(s) if s.eq_ignore_ascii_case("port_configuration_training") => 3.0, + _ => 0.0, + } +} + +/// InfiniBand logical port state enum -> numeric code. 
Values observed live on +/// GB200: `ACTIVE`, `DOWN`. 1.0 == active. +fn logical_port_state_to_f64(state: Option<&str>) -> f64 { + match state { + Some(s) if s.eq_ignore_ascii_case("active") => 1.0, _ => 0.0, } } +/// IB link width -> active lane count. Handles both the single live form +/// ("2X") and the comma-composite the NVOS schema allows for supported-widths +/// ("1X,2X,4X"); each token is parsed as `X` and the maximum lane count is +/// returned. Returns None when no token matches the `X` shape so unknown +/// widths are not exported. +fn link_width_to_f64(width: Option<&str>) -> Option { + let w = width?; + w.split(',') + .filter_map(|tok| { + tok.trim() + .strip_suffix(['X', 'x']) + .and_then(|digits| digits.parse::().ok()) + }) + .reduce(f64::max) +} + +/// IB link speed -> Gbps. NVOS types speed as a string/enum, but the live GB200 +/// capture emits bare numeric Gbps ("400" pairs with ib-speed=SPEED_NDR). We +/// accept the bare numeric (authoritative for this hardware) plus the defensive +/// suffix forms the schema permits, and normalize everything to Gbps: +/// - bare numeric ("400", "2.5") -> that value +/// - "G"/"G" (trailing G, case-insensitive) -> n +/// - "Mb/s" or "M" -> n/1000 +/// - anything else (e.g. "hdr") -> None (not exported) +fn link_speed_to_gbps(speed: Option<&str>) -> Option { + let s = speed?.trim(); + if s.is_empty() { + return None; + } + // Mb/s forms first ("M" alone is ambiguous with a stray suffix, but the + // longest match wins so "Mb/s" is checked before the bare "M"). 
+ if let Some(mbps) = s + .strip_suffix("Mb/s") + .or_else(|| s.strip_suffix("MB/s")) + .or_else(|| s.strip_suffix("Mbps")) + .or_else(|| s.strip_suffix('M')) + .or_else(|| s.strip_suffix('m')) + { + return mbps.trim().parse::().ok().map(|v| v / 1000.0); + } + // "G" Gbps suffix + if let Some(gbps) = s.strip_suffix(['G', 'g']) { + return gbps.trim().parse::().ok(); + } + // bare numeric Gbps (live GB200 form) + s.parse::().ok() +} + +/// Log (at debug) an interface leaf that matched a known mapping arm but whose +/// value could not be coerced, so the silent drop is observable. Nothing is +/// emitted for the metric in this case. +fn debug_unmapped_value(elems: &[&PathElem], val: &proto::TypedValue, metric_type: &str) { + tracing::debug!( + leaf = %leaf_path(elems), + raw = ?typed_value_to_string(val), + metric_type, + "nvue_gnmi SAMPLE: matched leaf but value coercion returned None; dropping" + ); +} + +/// Render the gNMI element tail as a slash path for diagnostics, e.g. +/// "infiniband/state/speed". 
+fn leaf_path(elems: &[&PathElem]) -> String { + elems + .iter() + .map(|e| e.name.as_str()) + .collect::>() + .join("/") +} + fn component_health_to_f64(status: Option<&str>) -> f64 { match status { Some(s) if s.eq_ignore_ascii_case("healthy") => 1.0, @@ -642,138 +918,6 @@ mod tests { assert_eq!(count, 1); } - #[test] - fn unmapped_interface_leaf_emits_catalog_metric_sample() { - let sink = Arc::new(CapturingSink::default()); - let mut proc = test_processor(); - proc.data_sink = Some(sink.clone()); - let notification = proto::Notification { - timestamp: 0, - prefix: Some(proto::Path { - elem: vec![ - make_path_elem("interfaces", &[]), - make_path_elem("interface", &[("name", "nvl4")]), - ], - ..Default::default() - }), - update: vec![proto::Update { - path: Some(proto::Path { - elem: vec![ - make_path_elem("phy-diag", &[]), - make_path_elem("state", &[]), - make_path_elem("port-malformed-packet-errors", &[]), - ], - ..Default::default() - }), - val: Some(make_typed_value_uint(9)), - ..Default::default() - }], - ..Default::default() - }; - - let count = proc.process_notification(¬ification); - assert_eq!(count, 1); - - let events = sink.events.lock().expect("lock poisoned"); - assert_eq!(events.len(), 1); - let CollectorEvent::Metric(sample) = &events[0].1 else { - panic!("expected metric event"); - }; - assert_eq!(sample.metric_type, "nvswitch_port_malformed_packet_errors"); - assert_eq!(sample.value, 9.0); - assert!( - sample - .labels - .iter() - .any(|(key, value)| key.as_ref() == "interface" && value == "nvl4") - ); - } - - #[test] - fn known_numeric_interface_leaf_with_invalid_value_does_not_emit_generic_info_metric() { - let sink = Arc::new(CapturingSink::default()); - let mut proc = test_processor(); - proc.data_sink = Some(sink.clone()); - let notification = proto::Notification { - timestamp: 0, - prefix: Some(proto::Path { - elem: vec![ - make_path_elem("interfaces", &[]), - make_path_elem("interface", &[("name", "nvl4")]), - ], - ..Default::default() - 
}), - update: vec![proto::Update { - path: Some(proto::Path { - elem: vec![ - make_path_elem("state", &[]), - make_path_elem("counters", &[]), - make_path_elem("in-errors", &[]), - ], - ..Default::default() - }), - val: Some(make_typed_value_string("N/A")), - ..Default::default() - }], - ..Default::default() - }; - - let count = proc.process_notification(¬ification); - assert_eq!(count, 1); - - let events = sink.events.lock().expect("lock poisoned"); - assert!( - events.is_empty(), - "known numeric leaf with invalid value must be skipped instead of emitted as generic info" - ); - } - - #[test] - fn platform_general_string_leaf_emits_info_metric() { - let sink = Arc::new(CapturingSink::default()); - let mut proc = test_processor(); - proc.data_sink = Some(sink.clone()); - let notification = proto::Notification { - timestamp: 0, - prefix: Some(proto::Path { - elem: vec![make_path_elem("platform-general", &[])], - ..Default::default() - }), - update: vec![proto::Update { - path: Some(proto::Path { - elem: vec![ - make_path_elem("state", &[]), - make_path_elem("platform-name", &[]), - ], - ..Default::default() - }), - val: Some(make_typed_value_string("gb200-switch-a")), - ..Default::default() - }], - ..Default::default() - }; - - let count = proc.process_notification(¬ification); - assert_eq!(count, 1); - - let events = sink.events.lock().expect("lock poisoned"); - let CollectorEvent::Metric(sample) = &events[0].1 else { - panic!("expected metric event"); - }; - assert_eq!(sample.metric_type, "nvswitch_platform_name"); - assert_eq!(sample.unit, "info"); - assert_eq!(sample.value, 1.0); - assert!( - sample - .labels - .iter() - .all(|(key, _)| key.as_ref() != "leaf_value") - ); - assert!(sample.labels.iter().any(|(key, value)| { - key.as_ref() == "source_path" && value == "platform-general/state/platform-name" - })); - } - #[test] fn emitted_metrics_preserve_switch_position_context() { use std::str::FromStr; @@ -1176,20 +1320,593 @@ mod tests { 
assert_eq!(metrics.stream_errors_total.get(), 0.0); } - #[test] - fn test_process_subscribe_response_update_increments_notification_counter() { - let proc = test_processor(); - let metrics = test_stream_metrics(); - let resp = proto::SubscribeResponse { - response: Some(proto::subscribe_response::Response::Update( - proto::Notification { - timestamp: 0, - prefix: Some(proto::Path { - elem: vec![ - make_path_elem("interfaces", &[]), - make_path_elem("interface", &[("name", "nvl0")]), - ], - ..Default::default() + // ---- explicit GB200 mapping coverage ------------------------------------ + + /// Drive a single `/interfaces/interface[name=acp0]/` update and + /// return the one captured `MetricSample`, asserting the producer-level + /// invariants (stream `name`, `collector_type`, `interface_name` label). + fn run_interface_leaf( + tail: &[&str], + val: proto::TypedValue, + ) -> (MetricSample, EventContext) { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + // shared producer invariants for every interface mapping + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!( + sample.labels, + 
vec![(Cow::Borrowed("interface_name"), "acp0".to_string())] + ); + (*sample, ctx) + } + + /// Same as `run_interface_leaf` but for `/components/component[name=...]`. + fn run_component_leaf( + comp_name: &str, + tail: &[&str], + val: proto::TypedValue, + ) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", comp_name)]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed("component_name"), comp_name.to_string())] + ); + *sample + } + + #[test] + fn test_interface_numeric_leaf_table_mappings() { + // (leaf tail, expected metric_type, expected unit) + let cases: &[(&[&str], &str, &str)] = &[ + (&["state", "counters", "in-errors"], "interface_in_errors", "count"), + (&["state", "counters", "out-errors"], "interface_out_errors", "count"), + (&["state", "counters", "out-discards"], "interface_out_discards", "count"), + (&["state", "counters", "in-octets"], "interface_in_octets", "bytes"), + (&["state", "counters", "out-octets"], "interface_out_octets", "bytes"), + (&["state", "counters", "in-pkts"], "interface_in_packets", "count"), + (&["state", 
"counters", "out-pkts"], "interface_out_packets", "count"), + ( + &["infiniband", "state", "counters", "port", "link-downed"], + "interface_link_downed", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "link-error-recovery"], + "interface_link_error_recovery", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "rcv-remote-phy-errors"], + "interface_rcv_remote_physical_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "rcv-switch-relay-errors"], + "interface_rcv_switch_relay_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "rcv-constraints-errors"], + "interface_rcv_constraint_errors", + "count", + ), + ( + &[ + "infiniband", + "state", + "counters", + "port", + "local-link-integrity-errors", + ], + "interface_local_link_integrity_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "excessive-buffer-overrun"], + "interface_port_buffer_overrun_errors", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "qp1-dropped"], + "interface_qp1_dropped", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "vl15-dropped"], + "interface_vl15_dropped", + "count", + ), + ( + &["infiniband", "state", "counters", "port", "xmit-wait"], + "interface_port_xmit_wait", + "count", + ), + (&["infiniband", "state", "mtu"], "interface_mtu", "bytes"), + ( + &["infiniband", "state", "max-supported-mtus"], + "interface_max_supported_mtu", + "bytes", + ), + (&["phy-diag", "state", "raw-ber"], "interface_raw_ber", "ratio"), + (&["phy-diag", "state", "effective-ber"], "interface_effective_ber", "ratio"), + (&["phy-diag", "state", "symbol-ber"], "interface_symbol_ber", "ratio"), + (&["phy-diag", "state", "raw-ber-ch-1"], "interface_raw_ber_lane0", "ratio"), + (&["phy-diag", "state", "raw-ber-ch-2"], "interface_raw_ber_lane1", "ratio"), + ( + &["phy-diag", "state", "raw-errors-ch-1"], + "interface_phy_raw_errors_lane0", + "count", + ), + ( + &["phy-diag", 
"state", "raw-errors-ch-2"], + "interface_phy_raw_errors_lane1", + "count", + ), + ( + &["phy-diag", "state", "effective-errors"], + "interface_phy_effective_errors", + "count", + ), + (&["phy-diag", "state", "zero-hist"], "interface_zero_hist", "count"), + ( + &["phy-diag", "state", "phy-received-bits"], + "interface_phy_received_bits", + "count", + ), + ( + &["phy-diag", "state", "port-malformed-packet-errors"], + "interface_port_malformed_packet_errors", + "count", + ), + ( + &["phy-diag", "state", "port-neighbor-mtu-discards"], + "interface_port_neighbor_mtu_discards", + "count", + ), + ( + &["phy-diag", "state", "port-multi-cast-rcv-pkts"], + "interface_port_multicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", "port-multi-cast-xmit-pkts"], + "interface_port_multicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-rcv-pkts"], + "interface_port_unicast_rcv_packets", + "count", + ), + ( + &["phy-diag", "state", "port-uni-cast-xmit-pkts"], + "interface_port_unicast_xmit_packets", + "count", + ), + ( + &["phy-diag", "state", "port-local-physical-errors"], + "interface_port_local_physical_errors", + "count", + ), + ( + &["phy-diag", "state", "sync-header-error-counter"], + "interface_sync_header_error_counter", + "count", + ), + ( + &["phy-diag", "state", "port-dlid-mapping-errors"], + "interface_port_dlid_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-vl-mapping-errors"], + "interface_port_vl_mapping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-looping-errors"], + "interface_port_looping_errors", + "count", + ), + ( + &["phy-diag", "state", "port-inactive-discards"], + "interface_port_inactive_discards", + "count", + ), + ( + &["phy-diag", "state", "rq-general-error"], + "interface_rq_general_error", + "count", + ), + (&["phy-diag", "state", "plr-rcv-codes"], "interface_plr_rcv_codes", "count"), + ( + &["phy-diag", "state", "plr-rcv-code-err"], + "interface_plr_rcv_codes_err", + 
"count", + ), + ( + &["phy-diag", "state", "plr-rcv-uncorrectable-code"], + "interface_plr_rcv_uncorrectables_code", + "count", + ), + (&["phy-diag", "state", "plr-xmit-codes"], "interface_plr_xmit_codes", "count"), + ( + &["phy-diag", "state", "plr-xmit-retry-codes"], + "interface_plr_xmit_retrys_codes", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-events"], + "interface_plr_xmit_retrys_events", + "count", + ), + ( + &["phy-diag", "state", "plr-sync-events"], + "interface_plr_sync_events", + "count", + ), + ( + &["phy-diag", "state", "plr-xmit-retry-events-within-t-sec-max"], + "interface_plr_xmit_retry_codes_within_minute", + "count", + ), + ]; + + for (tail, expected_name, expected_unit) in cases { + let (sample, _) = run_interface_leaf(tail, make_typed_value_uint(7)); + assert_eq!( + &sample.metric_type, expected_name, + "metric_type mismatch for leaf {tail:?}" + ); + assert_eq!( + &sample.unit, expected_unit, + "unit mismatch for leaf {tail:?}" + ); + assert_eq!(sample.value, 7.0, "value mismatch for leaf {tail:?}"); + } + } + + #[test] + fn test_interface_fec_histogram_bins() { + for n in 0u8..=15 { + let leaf = format!("rs-num-corr-err-bin{n}"); + let (sample, _) = + run_interface_leaf(&["phy-diag", "state", &leaf], make_typed_value_uint(11)); + assert_eq!(sample.metric_type, format!("interface_fec_hist_{n}")); + assert_eq!(sample.unit, "count"); + assert_eq!(sample.value, 11.0); + } + } + + #[test] + fn test_interface_ber_parses_scientific_notation() { + // live BER values arrive as scientific-notation strings, e.g. 
"15E-255" + let (sample, _) = run_interface_leaf( + &["phy-diag", "state", "raw-ber"], + make_typed_value_string("1E-12"), + ); + assert_eq!(sample.metric_type, "interface_raw_ber"); + assert_eq!(sample.unit, "ratio"); + assert!((sample.value - 1e-12).abs() < f64::EPSILON); + } + + #[test] + fn test_interface_physical_port_state_enum() { + for (raw, expected) in [ + ("LINK_UP", 1.0), + ("POLLING", 2.0), + ("PORT_CONFIGURATION_TRAINING", 3.0), + ("SOMETHING_ELSE", 0.0), + ] { + let (sample, _) = run_interface_leaf( + &["infiniband", "state", "physical-port-state"], + make_typed_value_string(raw), + ); + assert_eq!(sample.metric_type, "interface_physical_port_state"); + assert_eq!(sample.unit, "state"); + assert_eq!(sample.value, expected, "physical-port-state {raw}"); + } + } + + #[test] + fn test_interface_logical_port_state_enum() { + for (raw, expected) in [("ACTIVE", 1.0), ("DOWN", 0.0)] { + let (sample, _) = run_interface_leaf( + &["infiniband", "state", "logical-port-state"], + make_typed_value_string(raw), + ); + assert_eq!(sample.metric_type, "interface_logical_port_state"); + assert_eq!(sample.unit, "state"); + assert_eq!(sample.value, expected, "logical-port-state {raw}"); + } + } + + #[test] + fn test_interface_link_width_enum() { + let (active, _) = run_interface_leaf( + &["infiniband", "state", "width"], + make_typed_value_string("2X"), + ); + assert_eq!(active.metric_type, "interface_link_width_active"); + assert_eq!(active.unit, "lanes"); + assert_eq!(active.value, 2.0); + + let (supported, _) = run_interface_leaf( + &["infiniband", "state", "supported-widths"], + make_typed_value_string("4X"), + ); + assert_eq!(supported.metric_type, "interface_supported_width"); + assert_eq!(supported.unit, "lanes"); + assert_eq!(supported.value, 4.0); + } + + #[test] + fn test_component_explicit_leaf_mappings() { + // ASIC-TEMP-CURRENT (row 875) + let asic = run_component_leaf("ASIC1", &["asic", "state", "asic-temp"], make_typed_value_uint(46)); + 
assert_eq!(asic.metric_type, "component_asic_temperature_celsius"); + assert_eq!(asic.unit, "celsius"); + assert_eq!(asic.value, 46.0); + + // CPU-UTIL (row 885) + let cpu = run_component_leaf( + "cpu", + &["cpu", "utilization", "state", "avg"], + make_typed_value_uint(24), + ); + assert_eq!(cpu.metric_type, "component_cpu_utilization"); + assert_eq!(cpu.unit, "percent"); + assert_eq!(cpu.value, 24.0); + } + + #[test] + fn test_component_oper_status_shared_leaf_fan_and_cpu() { + // FAN-STATE (row 966) and CPU-STATE (row 1174) share state/oper-status; + // the component_name label is the only discriminator. + let fan = run_component_leaf( + "FAN1/1", + &["state", "oper-status"], + make_typed_value_string("ACTIVE"), + ); + assert_eq!(fan.metric_type, "component_oper_status"); + assert_eq!(fan.unit, "state"); + assert_eq!(fan.value, 1.0); + + let cpu = run_component_leaf( + "cpu", + &["state", "oper-status"], + make_typed_value_string("ACTIVE"), + ); + assert_eq!(cpu.metric_type, "component_oper_status"); + assert_eq!(cpu.value, 1.0); + } + + #[test] + fn test_unknown_interface_leaf_is_not_exported() { + // a live but unmapped leaf (phy-manager-state is flagged, not mapped) + // must never produce a MetricSample. 
+ let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("phy-diag", &[]), + make_path_elem("state", &[]), + make_path_elem("phy-manager-state", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("SUBFSM_ACTIVE_E")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unmapped leaf must not emit a metric" + ); + } + + #[test] + fn test_link_width_to_f64_helper() { + assert_eq!(link_width_to_f64(Some("1X")), Some(1.0)); + assert_eq!(link_width_to_f64(Some("2X")), Some(2.0)); + assert_eq!(link_width_to_f64(Some("4x")), Some(4.0)); + // comma-composite supported-widths -> max lane count + assert_eq!(link_width_to_f64(Some("1X,2X,4X")), Some(4.0)); + assert_eq!(link_width_to_f64(Some("1X, 2X")), Some(2.0)); + // partially-unrecognized composites still yield the max of the valid lanes + assert_eq!(link_width_to_f64(Some("2X,foo")), Some(2.0)); + assert_eq!(link_width_to_f64(Some("VL0-VL7")), None); + assert_eq!(link_width_to_f64(Some("")), None); + assert_eq!(link_width_to_f64(None), None); + } + + #[test] + fn test_link_speed_to_gbps_helper() { + // live GB200: bare numerics are already Gbps + assert_eq!(link_speed_to_gbps(Some("400")), Some(400.0)); + assert_eq!(link_speed_to_gbps(Some("100")), Some(100.0)); + assert_eq!(link_speed_to_gbps(Some("0")), Some(0.0)); + assert_eq!(link_speed_to_gbps(Some("2.5")), Some(2.5)); + // defensive: trailing "G"/"g" suffix (NVOS schema enum form) + assert_eq!(link_speed_to_gbps(Some("400G")), Some(400.0)); + 
assert_eq!(link_speed_to_gbps(Some("2.5g")), Some(2.5)); + // defensive: Mb/s and M suffix -> divide by 1000 + assert_eq!(link_speed_to_gbps(Some("1000Mb/s")), Some(1.0)); + assert_eq!(link_speed_to_gbps(Some("1000M")), Some(1.0)); + // unrecognized -> None + assert_eq!(link_speed_to_gbps(Some("hdr")), None); + assert_eq!(link_speed_to_gbps(Some("")), None); + assert_eq!(link_speed_to_gbps(None), None); + } + + #[test] + fn test_interface_link_speed_active_gbps() { + // bare numerics (live GB200 form) pass through as Gbps + for (raw, expected) in [("400", 400.0), ("100", 100.0), ("0", 0.0)] { + let (sample, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string(raw), + ); + assert_eq!(sample.metric_type, "interface_link_speed_active"); + assert_eq!(sample.unit, "gbps", "speed unit must be gbps for {raw}"); + assert_eq!(sample.value, expected, "speed {raw}"); + } + + // defensive suffix forms + let (g_suffix, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("400G"), + ); + assert_eq!(g_suffix.unit, "gbps"); + assert_eq!(g_suffix.value, 400.0); + + let (g_frac, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("2.5G"), + ); + assert_eq!(g_frac.value, 2.5); + + let (mb, _) = run_interface_leaf( + &["infiniband", "state", "speed"], + make_typed_value_string("1000Mb/s"), + ); + assert_eq!(mb.unit, "gbps"); + assert_eq!(mb.value, 1.0); + } + + #[test] + fn test_interface_link_speed_unparseable_is_not_exported() { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("infiniband", &[]), + 
make_path_elem("state", &[]), + make_path_elem("speed", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("hdr")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unparseable speed must not emit a metric" + ); + } + + #[test] + fn test_oper_status_active_is_up() { + assert_eq!(oper_status_to_f64(Some("ACTIVE")), 1.0); + assert_eq!(oper_status_to_f64(Some("active")), 1.0); + assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); + } + + #[test] + fn test_process_subscribe_response_update_increments_notification_counter() { + let proc = test_processor(); + let metrics = test_stream_metrics(); + let resp = proto::SubscribeResponse { + response: Some(proto::subscribe_response::Response::Update( + proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "nvl0")]), + ], + ..Default::default() }), update: vec![proto::Update { path: Some(proto::Path { @@ -1214,4 +1931,139 @@ mod tests { assert_eq!(metrics.monitored_entities.get(), 1.0); assert_eq!(metrics.stream_errors_total.get(), 0.0); } + + // ---- /platform-general switch-level singleton coverage ----------------- + + /// Drive a single `/platform-general/` update and return the one + /// captured `MetricSample`, asserting the producer-level invariants (stream + /// `name`, `collector_type`, and that the switch-level singleton carries no + /// per-entity name label). 
+ fn run_platform_general_leaf(tail: &[&str], val: proto::TypedValue) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + assert!( + sample.labels.is_empty(), + "switch-level singleton must not carry a per-entity name label" + ); + *sample + } + + #[test] + fn test_platform_general_numeric_leaf_mappings() { + // (leaf tail, raw bytes value, expected metric_type, expected value) + // values are the authoritative live GB200 Stage-0 capture. 
+ let cases: &[(&[&str], u64, &str)] = &[ + (&["state", "memory-used"], 3_856_510_976, "platform_memory_used"), + ( + &["state", "memory-total-size"], + 16_151_990_272, + "platform_memory_total", + ), + ( + &["state", "disk-total-size"], + 77_780_082_688, + "platform_disk_total", + ), + (&["state", "disk-used"], 22_848_192_512, "platform_disk_used"), + ]; + for (tail, raw, metric_type) in cases { + let sample = run_platform_general_leaf(tail, make_typed_value_uint(*raw)); + assert_eq!(sample.metric_type, *metric_type, "leaf {tail:?}"); + assert_eq!(sample.unit, "bytes", "leaf {tail:?} unit must be bytes"); + assert_eq!(sample.value, *raw as f64, "leaf {tail:?} value"); + } + } + + #[test] + fn test_platform_general_non_numeric_value_is_not_exported() { + // A numeric leaf whose value cannot be coerced to f64 emits nothing. + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem("memory-used", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("not-a-number")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(¬ification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "non-numeric platform-general value must not emit a metric" + ); + } + + #[test] + fn test_platform_general_string_leaf_is_not_exported() { + // String leaves at the same level (contact, location, platform-name) + // are out of scope: they must fall through unmapped and emit nothing. 
+ let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem("platform-name", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("MQM9700")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(¬ification); + // the platform-general entity is still counted, but nothing is emitted + assert_eq!(count, 1); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "unmapped platform-general string leaf must not emit a metric" + ); + } } diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index d9f53dd5a3..906964da9d 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -33,6 +33,7 @@ const NVUE_SYSTEM_HEALTH: &str = "/nvue_v1/system/health"; const NVUE_CLUSTER_APPS: &str = "/nvue_v1/cluster/apps"; const NVUE_SDN_PARTITIONS: &str = "/nvue_v1/sdn/partition"; const NVUE_INTERFACES: &str = "/nvue_v1/interface"; +const NVUE_PLATFORM_ENVIRONMENT_FAN: &str = "/nvue_v1/platform/environment/fan"; #[derive(Clone)] pub struct UsernamePassword { @@ -125,6 +126,16 @@ impl RestClient { self.do_get(url, &[]).await.map(Some) } + pub async fn get_platform_environment_fan( + &self, + ) -> Result, HealthError> { + if !self.paths.platform_environment_fan_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT_FAN)?; + self.do_get(url, &[]).await.map(Some) + } + pub async fn get_interfaces(&self) -> Result, HealthError> { if !self.paths.interfaces_enabled { return Ok(None); @@ -287,6 +298,17 @@ pub struct SdnPartition { pub num_gpus: Option, } +pub type 
FanEnvironmentResponse = HashMap; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct FanData { + /// Fan maximum speed in RPM, reported by NVUE as a string (e.g. "33000"). + /// Other per-fan fields (current-speed, min-speed, direction, state) are + /// intentionally not captured — only max-speed is in scope. + #[serde(rename = "max-speed")] + pub max_speed: Option, +} + pub type InterfacesResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] @@ -520,6 +542,47 @@ mod tests { assert!(eth0.link.speed.is_none()); } + #[test] + fn test_parse_platform_environment_fan() { + let json = r#"{ + "FAN1/1": { + "current-speed": "10096", + "direction": "F2B", + "max-speed": "33000", + "min-speed": "6000", + "state": "ok" + }, + "FAN1/2": { + "current-speed": "9800", + "direction": "F2B", + "max-speed": "33000", + "min-speed": "6000", + "state": "ok" + } + }"#; + + let resp: FanEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 2); + assert_eq!(resp["FAN1/1"].max_speed.as_deref(), Some("33000")); + assert_eq!(resp["FAN1/2"].max_speed.as_deref(), Some("33000")); + } + + #[test] + fn test_parse_platform_environment_fan_missing_max_speed() { + let json = r#"{ + "FAN1/1": { + "current-speed": "10096", + "direction": "F2B", + "min-speed": "6000", + "state": "ok" + } + }"#; + + let resp: FanEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 1); + assert!(resp["FAN1/1"].max_speed.is_none()); + } + #[test] fn test_parse_empty_responses() { let empty_map: ClusterAppsResponse = serde_json::from_str("{}").unwrap(); @@ -530,5 +593,8 @@ mod tests { let empty_interfaces: InterfacesResponse = serde_json::from_str("{}").unwrap(); assert!(empty_interfaces.is_empty()); + + let empty_fans: FanEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_fans.is_empty()); } } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 
2165a5f9d2..0e75c55aca 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -62,6 +62,13 @@ fn diagnostic_opcode_to_f64(code: &str) -> f64 { } } +/// NVUE reports fan max-speed as a string (e.g. "33000"). Parse it to RPM as +/// f64; return `None` when the field is absent or unparseable so callers emit +/// nothing rather than fabricating a value. +fn fan_max_speed_to_f64(max_speed: Option<&str>) -> Option { + max_speed.and_then(|s| s.trim().parse::().ok()) +} + pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, pub data_sink: Option>, @@ -246,6 +253,34 @@ impl PeriodicCollector for NvueRestCollector { } } + match self.client.get_platform_environment_fan().await { + Ok(Some(fans)) => { + for (fan_name, fan) in &fans { + // Only emit when max-speed parses; absent/garbage → nothing. + if let Some(value) = fan_max_speed_to_f64(fan.max_speed.as_deref()) { + self.emit_metric( + "fan_max_speed", + Some(fan_name), + value, + "rpm", + vec![(Cow::Borrowed("fan_name"), fan_name.clone())], + ); + entity_count += 1; + } + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment fan" + ); + } + } + if saw_auth_failure { tracing::warn!( switch_id = %self.switch_id, @@ -391,6 +426,146 @@ mod tests { assert_eq!(diagnostic_opcode_to_f64("57"), 1.0); } + #[test] + fn test_fan_max_speed_parsing() { + assert_eq!(fan_max_speed_to_f64(Some("33000")), Some(33000.0)); + assert_eq!(fan_max_speed_to_f64(Some(" 33000 ")), Some(33000.0)); + assert_eq!(fan_max_speed_to_f64(Some("6000")), Some(6000.0)); + assert_eq!(fan_max_speed_to_f64(Some("not-a-number")), None); + assert_eq!(fan_max_speed_to_f64(Some("")), None); + assert_eq!(fan_max_speed_to_f64(None), None); + } + + /// Drives the same parse + emit logic `run_iteration` uses for the + /// 
platform/environment/fan endpoint against a captured sink, asserting the + /// emitted MAX-SPEED sample shape. Table-driven over representative payloads. + #[test] + fn test_fan_max_speed_emit() { + use crate::collectors::nvue::rest::client::FanEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + struct Case { + name: &'static str, + json: &'static str, + // (fan_name, expected_value) pairs that MUST be emitted. + expected: &'static [(&'static str, f64)], + // Fan names that MUST NOT produce a sample. + absent: &'static [&'static str], + } + + let cases = [ + Case { + name: "two healthy fans emit max-speed", + json: r#"{ + "FAN1/1": {"current-speed": "10096", "direction": "F2B", "max-speed": "33000", "min-speed": "6000", "state": "ok"}, + "FAN1/2": {"current-speed": "9800", "direction": "F2B", "max-speed": "33000", "min-speed": "6000", "state": "ok"} + }"#, + expected: &[("FAN1/1", 33000.0), ("FAN1/2", 33000.0)], + absent: &[], + }, + Case { + name: "missing max-speed emits nothing", + json: r#"{ + "FAN1/1": {"current-speed": "10096", "min-speed": "6000", "state": "ok"} + }"#, + expected: &[], + absent: &["FAN1/1"], + }, + Case { + name: "garbage max-speed emits nothing", + json: r#"{ + "FAN1/1": {"max-speed": "bogus", "state": "ok"} + }"#, + expected: &[], + absent: &["FAN1/1"], + }, + ]; + + for case in cases { + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let fans: FanEnvironmentResponse = + serde_json::from_str(case.json).expect("fan json parses"); + // Mirror run_iteration's emit loop 
exactly. + for (fan_name, fan) in &fans { + if let Some(value) = fan_max_speed_to_f64(fan.max_speed.as_deref()) { + collector.emit_metric( + "fan_max_speed", + Some(fan_name), + value, + "rpm", + vec![(Cow::Borrowed("fan_name"), fan_name.clone())], + ); + } + } + + let samples = sink.samples.lock().unwrap(); + assert_eq!( + samples.len(), + case.expected.len(), + "case '{}': unexpected emitted sample count", + case.name + ); + + for (fan_name, expected_value) in case.expected { + let sample = samples + .iter() + .find(|s| { + s.labels + .iter() + .any(|(k, v)| k == "fan_name" && v == fan_name) + }) + .unwrap_or_else(|| { + panic!("case '{}': no sample for fan {fan_name}", case.name) + }); + + assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); + assert_eq!(sample.metric_type, "fan_max_speed", "case '{}'", case.name); + assert_eq!(sample.unit, "rpm", "case '{}'", case.name); + assert_eq!(sample.value, *expected_value, "case '{}'", case.name); + assert_eq!( + sample.key, + format!("fan_max_speed:{fan_name}"), + "case '{}'", + case.name + ); + assert_eq!(sample.labels.len(), 1, "case '{}'", case.name); + assert_eq!(sample.labels[0].0, "fan_name", "case '{}'", case.name); + assert_eq!(sample.labels[0].1, *fan_name, "case '{}'", case.name); + } + + for fan_name in case.absent { + assert!( + !samples.iter().any(|s| s + .labels + .iter() + .any(|(k, v)| k == "fan_name" && v == fan_name)), + "case '{}': fan {fan_name} should not emit a sample", + case.name + ); + } + } + } + struct ScriptedProvider { calls: AtomicUsize, // Each call pops the front of this queue; an empty queue yields an @@ -442,6 +617,7 @@ mod tests { cluster_apps_enabled: false, sdn_partitions_enabled: false, interfaces_enabled: false, + platform_environment_fan_enabled: false, } } @@ -496,7 +672,7 @@ mod tests { assert!(collector.client.has_credentials()); assert_eq!( result.fetch_failures, 0, - "all four paths disabled → no HTTP, no failures" + "all paths disabled → no HTTP, no failures" ); 
// Subsequent iterations reuse the already-installed credentials. collector diff --git a/crates/health/src/collectors/telemetry_service.rs b/crates/health/src/collectors/telemetry_service.rs deleted file mode 100644 index 9bb904805b..0000000000 --- a/crates/health/src/collectors/telemetry_service.rs +++ /dev/null @@ -1,564 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//! Redfish TelemetryService MetricReport collection. 
- -use std::borrow::Cow; -use std::collections::HashSet; -use std::sync::Arc; - -use futures::{StreamExt, stream}; -use nv_redfish::ServiceRoot; -use nv_redfish::core::{Bmc, EntityTypeRef}; -use nv_redfish::schema::metric_report::{MetricReport, MetricValue}; - -use crate::HealthError; -use crate::collectors::{IterationResult, PeriodicCollector}; -use crate::config::TelemetryServiceCollectorConfig as TelemetryServiceCollectorOptions; -use crate::endpoint::BmcEndpoint; -use crate::metrics::MetricLabel; -use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; - -pub struct TelemetryServiceCollectorConfig { - pub data_sink: Option>, - pub options: TelemetryServiceCollectorOptions, -} - -pub struct TelemetryServiceCollector { - bmc: Arc, - event_context: EventContext, - data_sink: Option>, - metric_report_ids: HashSet, - fetch_concurrency: usize, -} - -impl PeriodicCollector for TelemetryServiceCollector { - type Config = TelemetryServiceCollectorConfig; - - fn new_runner( - bmc: Arc, - endpoint: Arc, - config: Self::Config, - ) -> Result { - Ok(Self { - bmc, - event_context: EventContext::from_endpoint( - endpoint.as_ref(), - "redfish_telemetry_service", - ), - data_sink: config.data_sink, - metric_report_ids: config.options.metric_report_ids.into_iter().collect(), - fetch_concurrency: config.options.fetch_concurrency.max(1), - }) - } - - async fn run_iteration(&mut self) -> Result { - self.collect_metric_reports().await - } - - fn collector_type(&self) -> &'static str { - "redfish_telemetry_service" - } - - async fn stop(&mut self) { - self.emit_event(CollectorEvent::CollectorRemoved); - } -} - -impl TelemetryServiceCollector { - fn emit_event(&self, event: CollectorEvent) { - if let Some(data_sink) = &self.data_sink { - data_sink.handle_event(&self.event_context, &event); - } - } - - async fn collect_metric_reports(&self) -> Result { - let root = ServiceRoot::new(self.bmc.clone()) - .await - .map_err(|error| HealthError::BmcError(Box::new(error)))?; 
- - let Some(telemetry_service) = root - .telemetry_service() - .await - .map_err(|error| HealthError::BmcError(Box::new(error)))? - else { - tracing::debug!("BMC endpoint does not expose Redfish TelemetryService"); - return Ok(IterationResult { - refresh_triggered: true, - entity_count: Some(0), - fetch_failures: 0, - }); - }; - - let Some(metric_report_links) = telemetry_service - .metric_report_links() - .await - .map_err(|error| HealthError::BmcError(Box::new(error)))? - else { - tracing::debug!("Redfish TelemetryService has no MetricReports collection"); - return Ok(IterationResult { - refresh_triggered: true, - entity_count: Some(0), - fetch_failures: 0, - }); - }; - - let requested_ids = &self.metric_report_ids; - let fetch_concurrency = self.fetch_concurrency; - let reports = stream::iter(metric_report_links) - .filter(|link| { - let include = requested_ids.is_empty() - || link - .odata_id() - .last_segment() - .is_some_and(|id| requested_ids.contains(id)); - async move { include } - }) - .map(|link| async move { - let report_id = link.odata_id().to_string(); - link.fetch().await.map(|report| (report_id, report)) - }) - .buffer_unordered(fetch_concurrency) - .collect::>() - .await; - - self.emit_event(CollectorEvent::MetricCollectionStart); - - let mut sample_count = 0; - let mut fetch_failures = 0; - for result in reports { - match result { - Ok((report_uri, report)) => { - for sample in metric_samples_from_report(&report, &report_uri) { - sample_count += 1; - self.emit_event(CollectorEvent::Metric(sample.into())); - } - } - Err(error) => { - fetch_failures += 1; - tracing::warn!(?error, "failed to fetch Redfish MetricReport"); - } - } - } - - self.emit_event(CollectorEvent::MetricCollectionEnd); - - Ok(IterationResult { - refresh_triggered: true, - entity_count: Some(sample_count), - fetch_failures, - }) - } -} - -fn metric_samples_from_report(report: &MetricReport, report_uri: &str) -> Vec { - let report_id = report.base.id.as_str(); - let 
report_definition = report - .metric_report_definition - .as_ref() - .map(|reference| reference.odata_id().to_string()); - - report - .metric_values - .as_deref() - .unwrap_or_default() - .iter() - .filter_map(|metric| { - metric_sample_from_value(report_id, report_uri, report_definition.as_deref(), metric) - }) - .collect() -} - -fn metric_sample_from_value( - report_id: &str, - report_uri: &str, - report_definition: Option<&str>, - metric: &MetricValue, -) -> Option { - let raw_value = nested_optional_str(&metric.metric_value)?; - let metric_id = nested_optional_str(&metric.metric_id); - let metric_property = nested_optional_str(&metric.metric_property); - let (value, unit) = metric_value_to_f64(raw_value)?; - let metric_identity = metric_identity(metric_id, metric_property).or_else(|| { - tracing::warn!( - report_id, - report_uri, - "Skipping Redfish MetricReport value without MetricId or MetricProperty" - ); - None - })?; - let metric_type = metric_type(metric_id, metric_property)?; - - let mut labels: Vec = vec![ - (Cow::Borrowed("report_id"), report_id.to_string()), - (Cow::Borrowed("report_uri"), report_uri.to_string()), - ]; - if let Some(metric_id) = metric_id { - labels.push((Cow::Borrowed("metric_id"), metric_id.to_string())); - } - if let Some(metric_property) = metric_property { - labels.push(( - Cow::Borrowed("metric_property"), - metric_property.to_string(), - )); - } - if let Some(report_definition) = report_definition { - labels.push(( - Cow::Borrowed("metric_report_definition"), - report_definition.to_string(), - )); - } - labels.push((Cow::Borrowed("metric_identity"), metric_identity)); - let key = metric_sample_key(report_id, metric_id, metric_property)?; - Some(MetricSample { - key, - name: "redfish_telemetry_service".to_string(), - metric_type, - unit, - value, - labels, - context: None, - }) -} - -fn nested_optional_str(value: &Option>) -> Option<&str> { - value.as_ref().and_then(|inner| inner.as_deref()) -} - -fn metric_value_to_f64(raw: 
&str) -> Option<(f64, String)> { - if raw.eq_ignore_ascii_case("true") { - return Some((1.0, "state".to_string())); - } - if raw.eq_ignore_ascii_case("false") { - return Some((0.0, "state".to_string())); - } - if let Ok(value) = raw.parse::() { - return value.is_finite().then_some((value, "value".to_string())); - } - - Some((1.0, "info".to_string())) -} - -fn metric_identity(metric_id: Option<&str>, metric_property: Option<&str>) -> Option { - let mut parts = Vec::new(); - if let Some(metric_id) = metric_id.and_then(non_empty) { - let token = sanitize_metric_token(metric_id); - if !token.is_empty() { - parts.push(format!("metric_id:{token}")); - } - } - if let Some(metric_property) = metric_property.and_then(non_empty) { - let token = sanitize_metric_token(metric_property); - if !token.is_empty() { - parts.push(format!("metric_property:{token}")); - } - } - (!parts.is_empty()).then(|| parts.join(":")) -} - -fn metric_sample_key( - report_id: &str, - metric_id: Option<&str>, - metric_property: Option<&str>, -) -> Option { - let mut parts = Vec::new(); - if let Some(metric_id) = metric_id.and_then(non_empty) { - parts.push(format!( - "metric_id={}", - escape_metric_key_component(metric_id) - )); - } - if let Some(metric_property) = metric_property.and_then(non_empty) { - parts.push(format!( - "metric_property={}", - escape_metric_key_component(metric_property) - )); - } - - (!parts.is_empty()).then(|| format!("{report_id}:{}", parts.join(":"))) -} - -fn escape_metric_key_component(value: &str) -> String { - let mut escaped = String::with_capacity(value.len()); - for byte in value.bytes() { - match byte { - b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' 
| b'~' => { - escaped.push(byte as char); - } - _ => { - escaped.push('%'); - escaped.push(hex_digit(byte >> 4)); - escaped.push(hex_digit(byte & 0x0f)); - } - } - } - escaped -} - -fn hex_digit(nibble: u8) -> char { - match nibble { - 0..=9 => (b'0' + nibble) as char, - 10..=15 => (b'A' + nibble - 10) as char, - _ => unreachable!("hex nibble is always <= 15"), - } -} - -fn metric_type(metric_id: Option<&str>, metric_property: Option<&str>) -> Option { - metric_id - .and_then(non_empty) - .or_else(|| metric_property.and_then(last_path_segment)) - .map(sanitize_metric_token) - .filter(|token| !token.is_empty()) -} - -fn non_empty(value: &str) -> Option<&str> { - (!value.is_empty()).then_some(value) -} - -fn last_path_segment(value: &str) -> Option<&str> { - let pointer = value - .split_once('#') - .map(|(_, pointer)| pointer) - .filter(|pointer| !pointer.is_empty()); - let path = pointer.unwrap_or(value).trim_end_matches('/'); - path.rsplit('/').find(|segment| !segment.is_empty()) -} - -fn sanitize_metric_token(value: &str) -> String { - let mut token = String::with_capacity(value.len()); - let mut previous_was_separator = false; - let chars = value.chars().collect::>(); - for (index, ch) in chars.iter().copied().enumerate() { - if ch.is_ascii_alphanumeric() { - let previous = index.checked_sub(1).and_then(|i| chars.get(i)).copied(); - let next = chars.get(index + 1).copied(); - let starts_word = ch.is_ascii_uppercase() - && !previous_was_separator - && previous.is_some_and(|prev| prev.is_ascii_alphanumeric()) - && (previous - .is_some_and(|prev| prev.is_ascii_lowercase() || prev.is_ascii_digit()) - || next.is_some_and(|next| next.is_ascii_lowercase())); - if starts_word { - token.push('_'); - } - token.push(ch.to_ascii_lowercase()); - previous_was_separator = false; - } else if !previous_was_separator { - token.push('_'); - previous_was_separator = true; - } - } - token.trim_matches('_').to_string() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn 
metric_report_values_emit_numeric_and_info_samples() { - let report: MetricReport = serde_json::from_value(serde_json::json!({ - "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - "@odata.type": "#MetricReport.v1_3_0.MetricReport", - "Id": "NvidiaNMMetrics_0", - "Name": "NVIDIA NVSwitch metrics", - "MetricReportDefinition": { - "@odata.id": "/redfish/v1/TelemetryService/MetricReportDefinitions/NvidiaNMMetrics" - }, - "MetricValues": [ - { - "MetricId": "PortMalformedPacketErrors", - "MetricValue": "17", - "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/Oem/Nvidia/MalformedPackets" - }, - { - "MetricId": "SwitchFirmwareVersion", - "MetricValue": "1.2.3" - }, - { - "MetricId": "LinkHealthy", - "MetricValue": "true" - } - ] - })) - .expect("MetricReport JSON should parse"); - - let samples = metric_samples_from_report( - &report, - "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - ); - - assert_eq!(samples.len(), 3); - assert_eq!(samples[0].name, "redfish_telemetry_service"); - assert_eq!(samples[0].metric_type, "port_malformed_packet_errors"); - assert_eq!(samples[0].unit, "value"); - assert_eq!(samples[0].value, 17.0); - assert!( - samples[0].key.starts_with( - "NvidiaNMMetrics_0:metric_id=PortMalformedPacketErrors:metric_property=" - ) - ); - assert_eq!(samples[1].metric_type, "switch_firmware_version"); - assert_eq!(samples[1].unit, "info"); - assert_eq!(samples[1].value, 1.0); - assert!( - samples[1] - .labels - .iter() - .all(|(key, _)| key.as_ref() != "metric_value") - ); - assert_eq!(samples[2].metric_type, "link_healthy"); - assert_eq!(samples[2].unit, "state"); - assert_eq!(samples[2].value, 1.0); - } - - #[test] - fn metric_report_keys_use_stable_metric_identity_instead_of_array_index() { - let report: MetricReport = serde_json::from_value(serde_json::json!({ - "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - "@odata.type": 
"#MetricReport.v1_3_0.MetricReport", - "Id": "NvidiaNMMetrics_0", - "Name": "NVIDIA NVSwitch metrics", - "MetricValues": [ - { - "MetricId": "PortRcvErrors", - "MetricValue": "1", - "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors" - }, - { - "MetricId": "PortRcvErrors", - "MetricValue": "2", - "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/2/Metrics#/RXErrors" - } - ] - })) - .expect("MetricReport JSON should parse"); - let reversed: MetricReport = serde_json::from_value(serde_json::json!({ - "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - "@odata.type": "#MetricReport.v1_3_0.MetricReport", - "Id": "NvidiaNMMetrics_0", - "Name": "NVIDIA NVSwitch metrics", - "MetricValues": [ - { - "MetricId": "PortRcvErrors", - "MetricValue": "2", - "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/2/Metrics#/RXErrors" - }, - { - "MetricId": "PortRcvErrors", - "MetricValue": "1", - "MetricProperty": "/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors" - } - ] - })) - .expect("MetricReport JSON should parse"); - - let original_keys = metric_samples_from_report( - &report, - "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - ) - .into_iter() - .map(|sample| sample.key) - .collect::>(); - let reversed_keys = metric_samples_from_report( - &reversed, - "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - ) - .into_iter() - .map(|sample| sample.key) - .collect::>(); - - assert_eq!(original_keys, reversed_keys); - assert_eq!(original_keys.len(), 2); - assert!( - original_keys - .iter() - .all(|key| !key.ends_with(":0") && !key.ends_with(":1")) - ); - } - - #[test] - fn metric_report_keys_preserve_raw_identity_after_sanitized_aliasing() { - let report: MetricReport = serde_json::from_value(serde_json::json!({ - "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - "@odata.type": "#MetricReport.v1_3_0.MetricReport", - "Id": 
"NvidiaNMMetrics_0", - "Name": "NVIDIA NVSwitch metrics", - "MetricValues": [ - { - "MetricId": "Port-RcvErrors", - "MetricValue": "1" - }, - { - "MetricId": "Port_RcvErrors", - "MetricValue": "2" - } - ] - })) - .expect("MetricReport JSON should parse"); - - let samples = metric_samples_from_report( - &report, - "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - ); - - assert_eq!(samples.len(), 2); - assert_eq!(samples[0].metric_type, samples[1].metric_type); - assert_ne!(samples[0].key, samples[1].key); - assert_eq!( - samples - .iter() - .map(|sample| sample.key.as_str()) - .collect::>() - .len(), - 2 - ); - } - - #[test] - fn metric_report_value_without_source_identity_is_skipped() { - let report: MetricReport = serde_json::from_value(serde_json::json!({ - "@odata.id": "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", - "@odata.type": "#MetricReport.v1_3_0.MetricReport", - "Id": "NvidiaNMMetrics_0", - "Name": "NVIDIA NVSwitch metrics", - "MetricValues": [{ "MetricValue": "3" }] - })) - .expect("MetricReport JSON should parse"); - - assert!( - metric_samples_from_report( - &report, - "/redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0" - ) - .is_empty() - ); - } - - #[test] - fn metric_type_falls_back_to_metric_property_last_segment() { - assert_eq!( - metric_type( - None, - Some("/redfish/v1/Fabrics/NVLink/Switches/0/Ports/1/Metrics#/RXErrors"), - ) - .as_deref(), - Some("rx_errors") - ); - assert_eq!(metric_type(None, None), None); - } -} diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 51f6816037..1e1b43d530 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -137,6 +137,10 @@ pub enum StaticSwitchEndpointRole { Host, } +fn default_static_switch_endpoint_role() -> StaticSwitchEndpointRole { + StaticSwitchEndpointRole::Host +} + #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticSwitchEndpoint { @@ -146,6 +150,7 
@@ pub struct StaticSwitchEndpoint { pub slot_number: Option, #[serde(alias = "compute_tray_index")] pub tray_index: Option, + #[serde(default = "default_static_switch_endpoint_role")] pub endpoint_role: StaticSwitchEndpointRole, #[serde(default)] pub is_primary: bool, @@ -190,19 +195,13 @@ impl StaticBmcEndpoint { )); } - if let Some(switch) = &self.switch { - if switch.id.is_none() && switch.serial.is_none() { - return Err(format!( - "endpoint_sources.static_bmc_endpoints[{index}].switch requires id or serial" - )); - } - if switch.endpoint_role == StaticSwitchEndpointRole::Host - && switch.nmxt_enabled.is_none() - { - return Err(format!( - "endpoint_sources.static_bmc_endpoints[{index}].switch.nmxt_enabled must be explicit for host switch endpoints" - )); - } + if let Some(switch) = &self.switch + && switch.id.is_none() + && switch.serial.is_none() + { + return Err(format!( + "endpoint_sources.static_bmc_endpoints[{index}].switch requires id or serial" + )); } Ok(()) @@ -461,9 +460,6 @@ pub struct CollectorsConfig { /// Entity metrics collector configuration (if present, metrics collector is enabled) pub metrics: Configurable, - /// Redfish TelemetryService MetricReports collector configuration. - pub telemetry_service: Configurable, - /// Firmware collector configuration (if present, firmware collector is enabled) pub firmware: Configurable, @@ -486,7 +482,6 @@ impl Default for CollectorsConfig { discovery: DiscoveryConfig::default(), sensors: Configurable::Enabled(SensorCollectorConfig::default()), metrics: Configurable::Disabled, - telemetry_service: Configurable::Disabled, firmware: Configurable::Disabled, leak_detector: Configurable::Enabled(LeakDetectorCollectorConfig::default()), logs: Configurable::Disabled, @@ -532,30 +527,6 @@ impl Default for MetricsCollectorConfig { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(default)] -pub struct TelemetryServiceCollectorConfig { - /// Interval between Redfish TelemetryService MetricReport polls. 
- #[serde(with = "humantime_serde")] - pub poll_interval: Duration, - - /// Maximum number of MetricReports fetched concurrently per endpoint. - pub fetch_concurrency: usize, - - /// Optional allow-list of MetricReport resource IDs. Empty means all MetricReports. - pub metric_report_ids: Vec, -} - -impl Default for TelemetryServiceCollectorConfig { - fn default() -> Self { - Self { - poll_interval: Duration::from_secs(60), - fetch_concurrency: 4, - metric_report_ids: Vec::new(), - } - } -} - #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct ProcessorsConfig { @@ -957,6 +928,9 @@ impl Default for NvueGnmiConfig { pub struct NvueGnmiPaths { pub components_enabled: bool, pub interfaces_enabled: bool, + /// Subscribe to `/platform-general/state` for switch-level memory and disk + /// utilization. This is a singleton resource (not keyed by interface or + /// component name). pub platform_general_enabled: bool, } @@ -965,7 +939,7 @@ impl Default for NvueGnmiPaths { Self { components_enabled: true, interfaces_enabled: true, - platform_general_enabled: false, + platform_general_enabled: true, } } } @@ -1000,6 +974,7 @@ impl Default for NvueRestConfig { /// - cluster_apps_enabled: Poll `/nvue_v1/cluster/apps`. /// - sdn_partitions_enabled: Poll `/nvue_v1/sdn/partition` (including per-partition details) /// - interfaces_enabled: Poll `/nvue_v1/interface`. +/// - platform_environment_fan_enabled: Poll `/nvue_v1/platform/environment/fan`. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct NvueRestPaths { @@ -1007,6 +982,7 @@ pub struct NvueRestPaths { pub cluster_apps_enabled: bool, pub sdn_partitions_enabled: bool, pub interfaces_enabled: bool, + pub platform_environment_fan_enabled: bool, } impl Default for NvueRestPaths { @@ -1016,6 +992,7 @@ impl Default for NvueRestPaths { cluster_apps_enabled: true, sdn_partitions_enabled: true, interfaces_enabled: true, + platform_environment_fan_enabled: true, } } } @@ -1265,7 +1242,6 @@ mod tests { assert!(config.collectors.firmware.is_enabled()); assert!(config.collectors.leak_detector.is_enabled()); assert!(config.collectors.logs.is_enabled()); - assert!(config.collectors.telemetry_service.is_enabled()); assert!(config.collectors.nvue.is_enabled()); assert!(!config.sinks.tracing.is_enabled()); assert!(config.sinks.prometheus.is_enabled()); @@ -1322,14 +1298,6 @@ mod tests { assert_eq!(config.cache_size, 100); assert_eq!(config.endpoint_discovery_interval, Duration::from_secs(300)); - if let Configurable::Enabled(ref telemetry_service) = config.collectors.telemetry_service { - assert_eq!(telemetry_service.poll_interval, Duration::from_secs(60)); - assert_eq!(telemetry_service.fetch_concurrency, 4); - assert!(telemetry_service.metric_report_ids.is_empty()); - } else { - panic!("telemetry service config should be enabled in example config"); - } - if let Configurable::Enabled(ref nvue) = config.collectors.nvue { if let Configurable::Enabled(ref rest) = nvue.rest { assert_eq!(rest.poll_interval, Duration::from_secs(60)); @@ -1342,7 +1310,6 @@ mod tests { assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); assert!(gnmi.system_events_enabled); - assert!(gnmi.paths.platform_general_enabled); } else { panic!("nvue gnmi config should be enabled in example config"); } @@ -1615,9 +1582,6 @@ skip_empty_reports = false assert!(rest.paths.sdn_partitions_enabled); 
assert!(rest.paths.interfaces_enabled); } - if let Configurable::Enabled(ref gnmi) = defaults.gnmi { - assert!(!gnmi.paths.platform_general_enabled); - } } #[test] @@ -1661,42 +1625,6 @@ request_timeout = "45s" assert!(!config.collectors.nvue.is_enabled()); } - #[test] - fn test_telemetry_service_config_parsing() { - let toml_content = r#" -[endpoint_sources.carbide_api] -enabled = false - -[sinks.health_report] -enabled = false - -[collectors.telemetry_service] -poll_interval = "45s" -fetch_concurrency = 8 -metric_report_ids = ["NvidiaNMMetrics_0"] -"#; - - let config: Config = Figment::new() - .merge(Serialized::defaults(Config::default())) - .merge(Toml::string(toml_content)) - .extract() - .expect("failed to parse telemetry service config"); - - if let Configurable::Enabled(ref telemetry_service) = config.collectors.telemetry_service { - assert_eq!(telemetry_service.poll_interval, Duration::from_secs(45)); - assert_eq!(telemetry_service.fetch_concurrency, 8); - assert_eq!(telemetry_service.metric_report_ids, ["NvidiaNMMetrics_0"]); - } else { - panic!("telemetry service config should be enabled"); - } - } - - #[test] - fn test_telemetry_service_config_disabled_by_default() { - let config = Config::default(); - assert!(!config.collectors.telemetry_service.is_enabled()); - } - #[test] fn test_nvue_config_explicit_disable() { let toml_content = r#" @@ -1841,7 +1769,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "cumulus" password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = true, slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" @@ -1928,7 +1856,7 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 } #[test] - fn 
test_static_switch_host_accepts_primary_with_explicit_nmxt_enabled() { + fn test_static_switch_host_accepts_primary_without_nmxt_override() { let toml_content = r#" [endpoint_sources.carbide_api] enabled = false @@ -1938,7 +1866,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "admin" password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true, nmxt_enabled = true } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true } "#; let config: Config = Figment::new() @@ -1954,7 +1882,7 @@ switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); assert!(switch.is_primary); - assert_eq!(switch.nmxt_enabled, Some(true)); + assert_eq!(switch.nmxt_enabled, None); } #[test] @@ -1987,61 +1915,6 @@ switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", assert_eq!(switch.nmxt_enabled, Some(true)); } - #[test] - fn test_static_switch_endpoint_requires_explicit_role() { - let toml_content = r#" -[endpoint_sources.carbide_api] -enabled = false - -[[endpoint_sources.static_bmc_endpoints]] -ip = "10.0.1.3" -mac = "11:22:33:44:55:88" -username = "admin" -password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-003" } -"#; - - let err = Figment::new() - .merge(Serialized::defaults(Config::default())) - .merge(Toml::string(toml_content)) - .extract::() - .expect_err("switch endpoint role must be explicit"); - - assert!( - err.to_string().contains("endpoint_role"), - "error should mention the missing endpoint_role: {err}" - ); - } - - #[test] - fn test_static_switch_host_requires_explicit_nmxt_enabled() { - let toml_content = r#" -[endpoint_sources.carbide_api] -enabled = false - -[[endpoint_sources.static_bmc_endpoints]] 
-ip = "10.0.1.4" -mac = "11:22:33:44:55:99" -username = "admin" -password = "pass" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-004", endpoint_role = "host", is_primary = true } -"#; - - let config = Figment::new() - .merge(Serialized::defaults(Config::default())) - .merge(Toml::string(toml_content)) - .extract::() - .expect("config parses before validation"); - - let err = config - .validate() - .expect_err("host nmxt_enabled must be explicit"); - assert!( - err.contains("nmxt_enabled"), - "error should mention missing nmxt_enabled: {err}" - ); - } - #[test] fn test_static_machine_endpoint_accepts_placement_and_nvlink_metadata() { let toml_content = r#" @@ -2093,7 +1966,7 @@ ip = "10.0.1.1" mac = "11:22:33:44:55:66" username = "cumulus" password = "pass" -switch = { serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = false, physical_slot_number = 7, compute_tray_index = 3 } +switch = { serial = "SN-SW-001", physical_slot_number = 7, compute_tray_index = 3 } "#; let config: Config = Figment::new() @@ -2132,7 +2005,7 @@ mac = "aa:bb:cc:dd:ee:ff" username = "admin" password = "pass" machine = { id = "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" } -switch = { serial = "SN-SW-001", endpoint_role = "host", nmxt_enabled = false } +switch = { serial = "SN-SW-001" } "#; let config: Config = Figment::new() diff --git a/crates/health/src/discovery/cleanup.rs b/crates/health/src/discovery/cleanup.rs index a4e8ab5693..5dba8d0728 100644 --- a/crates/health/src/discovery/cleanup.rs +++ b/crates/health/src/discovery/cleanup.rs @@ -57,8 +57,6 @@ pub(super) fn stop_removed_bmc_collectors( tracing::info!( removed_count = removed_keys.len(), remaining_sensors = ctx.collectors.len(CollectorKind::Sensor), - remaining_telemetry_service_collectors = - ctx.collectors.len(CollectorKind::TelemetryService), remaining_collectors = ctx.collectors.len(CollectorKind::Logs), remaining_firmware_collectors = 
ctx.collectors.len(CollectorKind::Firmware), remaining_leak_detector_collectors = ctx.collectors.len(CollectorKind::LeakDetector), diff --git a/crates/health/src/discovery/context.rs b/crates/health/src/discovery/context.rs index 6345042a20..9a1948d27d 100644 --- a/crates/health/src/discovery/context.rs +++ b/crates/health/src/discovery/context.rs @@ -31,7 +31,6 @@ use crate::config::{ LogsCollectorConfig as LogsCollectorOptions, MetricsCollectorConfig as MetricsCollectorOptions, NmxtCollectorConfig as NmxtCollectorOptions, NvueCollectorConfig as NvueCollectorOptions, SensorCollectorConfig as SensorCollectorOptions, - TelemetryServiceCollectorConfig as TelemetryServiceCollectorOptions, }; use crate::limiter::RateLimiter; use crate::metrics::{MetricsManager, operation_duration_buckets_seconds}; @@ -41,7 +40,6 @@ pub(super) enum CollectorKind { Discovery, Sensor, Metrics, - TelemetryService, Logs, Firmware, LeakDetector, @@ -51,11 +49,10 @@ pub(super) enum CollectorKind { } impl CollectorKind { - pub(super) const ALL: [CollectorKind; 10] = [ + pub(super) const ALL: [CollectorKind; 9] = [ CollectorKind::Discovery, CollectorKind::Sensor, CollectorKind::Metrics, - CollectorKind::TelemetryService, CollectorKind::Logs, CollectorKind::Firmware, CollectorKind::LeakDetector, @@ -71,9 +68,6 @@ impl CollectorKind { } CollectorKind::Sensor => "Stopping sensor collector for removed BMC endpoint", CollectorKind::Metrics => "Stopping entity metrics collector for removed BMC endpoint", - CollectorKind::TelemetryService => { - "Stopping Redfish TelemetryService collector for removed BMC endpoint" - } CollectorKind::Logs => "Stopping logs collector for removed BMC endpoint", CollectorKind::Firmware => "Stopping firmware collector for removed BMC endpoint", CollectorKind::LeakDetector => { @@ -92,7 +86,6 @@ pub(super) struct CollectorState { discovery: HashMap, Collector>, sensors: HashMap, Collector>, metrics: HashMap, Collector>, - telemetry_service: HashMap, Collector>, firmware: 
HashMap, Collector>, leak_detector: HashMap, Collector>, logs: HashMap, Collector>, @@ -108,7 +101,6 @@ impl CollectorState { discovery: HashMap::new(), sensors: HashMap::new(), metrics: HashMap::new(), - telemetry_service: HashMap::new(), firmware: HashMap::new(), leak_detector: HashMap::new(), logs: HashMap::new(), @@ -124,7 +116,6 @@ impl CollectorState { CollectorKind::Discovery => &self.discovery, CollectorKind::Sensor => &self.sensors, CollectorKind::Metrics => &self.metrics, - CollectorKind::TelemetryService => &self.telemetry_service, CollectorKind::Logs => &self.logs, CollectorKind::Firmware => &self.firmware, CollectorKind::LeakDetector => &self.leak_detector, @@ -142,7 +133,6 @@ impl CollectorState { CollectorKind::Discovery => &mut self.discovery, CollectorKind::Sensor => &mut self.sensors, CollectorKind::Metrics => &mut self.metrics, - CollectorKind::TelemetryService => &mut self.telemetry_service, CollectorKind::Logs => &mut self.logs, CollectorKind::Firmware => &mut self.firmware, CollectorKind::LeakDetector => &mut self.leak_detector, @@ -192,7 +182,6 @@ impl CollectorState { .keys() .chain(self.sensors.keys()) .chain(self.metrics.keys()) - .chain(self.telemetry_service.keys()) .chain(self.logs.keys()) .chain(self.firmware.keys()) .chain(self.leak_detector.keys()) @@ -228,7 +217,6 @@ pub struct DiscoveryLoopContext { pub(crate) discovery_config: DiscoveryConfig, pub(crate) sensors_config: Configurable, pub(crate) metrics_config: Configurable, - pub(crate) telemetry_service_config: Configurable, pub(crate) logs_config: Configurable, pub(crate) firmware_config: Configurable, pub(crate) leak_detector_config: Configurable, @@ -274,7 +262,6 @@ impl DiscoveryLoopContext { discovery_config: config.collectors.discovery.clone(), sensors_config: config.collectors.sensors.clone(), metrics_config: config.collectors.metrics.clone(), - telemetry_service_config: config.collectors.telemetry_service.clone(), logs_config: config.collectors.logs.clone(), 
firmware_config: config.collectors.firmware.clone(), leak_detector_config: config.collectors.leak_detector.clone(), diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index e0f8dbed6d..107c90a882 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -29,7 +29,7 @@ use crate::collectors::{ LogsCollectorConfig, MetricsCollector, MetricsCollectorConfig, NmxtCollector, NmxtCollectorConfig, NvueRestCollector, NvueRestCollectorConfig, SensorCollector, SensorCollectorConfig, SseLogCollector, SseLogCollectorConfig, StreamingCollectorStartContext, - TelemetryServiceCollector, TelemetryServiceCollectorConfig, spawn_gnmi_collector, + spawn_gnmi_collector, }; use crate::config::{Configurable, LogCollectionMode, PeriodicLogConfig}; use crate::endpoint::{BmcEndpoint, EndpointMetadata, SwitchEndpointRole}; @@ -66,10 +66,6 @@ fn spawn_generic_redfish_collectors( let sensors_enabled = matches!(ctx.sensors_config, Configurable::Enabled(_)); let metrics_enabled = matches!(ctx.metrics_config, Configurable::Enabled(_)); - let telemetry_service_enabled = endpoint - .switch_data() - .is_some_and(|switch| matches!(switch.endpoint_role, SwitchEndpointRole::Bmc)) - && ctx.telemetry_service_config.is_enabled(); if (sensors_enabled || metrics_enabled) && !ctx.collectors.contains(CollectorKind::Discovery, &key) @@ -197,56 +193,6 @@ fn spawn_generic_redfish_collectors( } } - if telemetry_service_enabled - && let Configurable::Enabled(telemetry_service_cfg) = &ctx.telemetry_service_config - && !ctx - .collectors - .contains(CollectorKind::TelemetryService, &key) - { - if let Some(data_sink) = data_sink.clone() { - let collector_registry = Arc::new(ctx.metrics_manager.create_collector_registry( - format!("telemetry_service_collector_{key}"), - metrics_prefix, - )?); - match Collector::start::>( - endpoint_arc.clone(), - bmc.clone(), - TelemetryServiceCollectorConfig { - data_sink: Some(data_sink), - options: 
telemetry_service_cfg.clone(), - }, - CollectorStartContext { - limiter: ctx.limiter.clone(), - iteration_interval: telemetry_service_cfg.poll_interval, - collector_registry, - metrics_manager: ctx.metrics_manager.clone(), - }, - ) { - Ok(monitor) => { - ctx.collectors.insert( - CollectorKind::TelemetryService, - key.clone().into(), - monitor, - ); - tracing::info!( - endpoint_key = %key, - total_collectors = ctx.collectors.len(CollectorKind::TelemetryService), - "Started Redfish TelemetryService collection for switch BMC endpoint" - ); - } - Err(error) => { - tracing::error!( - ?error, - "Could not start Redfish TelemetryService collector for: {:?}", - endpoint.addr - ); - } - } - } else { - tracing::warn!("Redfish TelemetryService collector requires a data sink, skipping"); - } - } - if let Configurable::Enabled(logs_cfg) = &ctx.logs_config && !ctx.collectors.contains(CollectorKind::Logs, &key) { @@ -749,7 +695,6 @@ mod tests { async fn test_switch_bmc_endpoint_starts_redfish_but_not_switch_host_collectors() { let mut config = Config::default(); config.collectors.sensors = Configurable::Enabled(Default::default()); - config.collectors.telemetry_service = Configurable::Enabled(Default::default()); config.collectors.logs = Configurable::Disabled; config.collectors.firmware = Configurable::Disabled; config.collectors.leak_detector = Configurable::Disabled; @@ -771,16 +716,10 @@ mod tests { )), ); - spawn_collectors_for_endpoint( - &mut ctx, - &endpoint, - Some(Arc::new(NoopSink)), - "test_switch_bmc_redfish_only", - ) - .expect("spawn should succeed"); + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test_switch_bmc_redfish_only") + .expect("spawn should succeed"); assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 1); - assert_eq!(ctx.collectors.len(CollectorKind::TelemetryService), 1); assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); 
assert_eq!(ctx.collectors.len(CollectorKind::NvueGnmi), 0); diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index 53197db084..cad243113d 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -100,7 +100,7 @@ impl StaticEndpointSource { StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, }; - let nmxt_enabled = switch.nmxt_enabled.unwrap_or(false); + let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); Some(EndpointMetadata::Switch(SwitchData { id, @@ -318,7 +318,7 @@ mod tests { tray_index: Some(3), endpoint_role: StaticSwitchEndpointRole::Host, is_primary: true, - nmxt_enabled: Some(true), + nmxt_enabled: None, }), rack_id: None, }]; diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index 6df5826cdf..4c18a94154 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -1,5 +1,14 @@ # GB200 NVSWITCH telemetry live-validation runbook +> **Implementation note.** GB200 telemetry is collected via **explicit catalog-row +> allowlists** over the live host surfaces: NMX-T (`switch_nmxt`), NVOS gNMI +> (`nvue_gnmi`, explicit per-leaf), NVUE REST (`fan_max_speed` from +> `/platform/environment/fan`), and standard Redfish sensors (`hw_sensor`). There is +> **no** standalone Redfish `TelemetryService` collector and **no** generic/sanitized +> source preservation — both were evaluated against the live GB200 BMC and removed. +> Unknown gNMI/NMX-T sources are dropped and debug-logged, never emitted. nv-redfish is +> consumed at the released `0.10.0` (no local patch). + This branch stops before live hardware validation. 
After build/test/lint review, run the health service locally against one GB200 NVLink Switch BMC endpoint and one switch HOST/NVOS endpoint. ## Collectors that must be enabled @@ -7,18 +16,13 @@ This branch stops before live hardware validation. After build/test/lint review, For the GB200 phase, enable all switch telemetry collectors below: - BMC endpoint (`switch.endpoint_role = "bmc"`): - - `collectors.sensors` for standard Redfish sensor readings and threshold/range context. - - `collectors.telemetry_service` for Redfish `TelemetryService/MetricReports/*`. + - `collectors.sensors` for standard Redfish sensor readings and threshold/range context (the temp/thermal `hw_sensor` series plus `*_range_max`/`*_range_min`). - HOST endpoint (`switch.endpoint_role = "host"`): - `collectors.nmxt` for NMX-T Prometheus telemetry on port `9352`. - - `collectors.nvue.rest` for existing NVUE health/app/partition/interface diagnostics. - - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general`, plus ON_CHANGE system events. - -The BMC proxy ACL must allow: + - `collectors.nvue.rest` for NVUE health/app/partition/interface diagnostics and `fan_max_speed` from `/platform/environment/fan`. + - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general` (memory/disk), plus ON_CHANGE system events. -- `GET /redfish/v1/TelemetryService` -- `GET /redfish/v1/TelemetryService/MetricReportDefinitions/*` -- `GET /redfish/v1/TelemetryService/MetricReports/*` +No TelemetryService proxy ACL changes are required — collection uses the standard Redfish sensor paths plus the host NMX-T/gNMI/NVUE endpoints. ## Local static config template @@ -72,12 +76,6 @@ sensor_fetch_interval = "1m" sensor_fetch_concurrency = 8 include_sensor_thresholds = true -[collectors.telemetry_service] -poll_interval = "1m" -fetch_concurrency = 4 -# Empty means all exposed MetricReports. 
Narrow to ["NvidiaNMMetrics_0"] only if the BMC exposes noisy unrelated reports. -metric_report_ids = [] - [collectors.metrics] enabled = false @@ -103,6 +101,7 @@ system_health_enabled = true cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true +platform_environment_fan_enabled = true # MAX-SPEED via /nvue_v1/platform/environment/fan [collectors.nvue.gnmi] gnmi_port = 9339 @@ -116,42 +115,151 @@ interfaces_enabled = true platform_general_enabled = true ``` -## Local nv-redfish patch command +## Run the local health service -The infra-controller MR must not commit absolute local paths. For local validation against a locally built `nv-redfish` checkout, use Cargo command-line patching. The local `nv-redfish` workspace package version must satisfy the infra-controller dependency (`0.10.x` for this branch); if the companion checkout is on `origin/main` with a development `0.1.0` workspace version, use a matching release tag or a temporary local-only version edit that is not committed. +nv-redfish is consumed at the released `0.10.0` — no local patch or companion checkout is needed. ```bash -cargo run \ - --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" \ - -p carbide-health --bin forge-hw-health -- \ - /path/to/gb200-switch-local.toml +cargo run -p carbide-health --bin forge-hw-health -- /path/to/gb200-switch-local.toml ``` -If the companion `nv-redfish` checkout changes internal crates, add the matching `patch.crates-io` entries documented in `nvswitch_telemetry_nv_redfish_dependency.md`. - ## Evidence to capture during live validation -1. `/telemetry` output contains `redfish_telemetry_service` samples for the BMC endpoint. -2. `/telemetry` output contains `switch_nmxt` samples for the HOST endpoint, including any source metric names beyond the three legacy hard-coded metrics. -3. 
`/telemetry` output contains `nvue_gnmi` samples for: - - existing canonical interface metrics (`interface_*`), and - - newly preserved `nvswitch_*` catalog leaf metrics from previously unmapped gNMI leaves. -4. Logs show the TelemetryService, NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected endpoint roles. +1. `/telemetry` output contains `hw_sensor` samples for the BMC endpoint (temp/thermal readings; plus `*_range_max`/`*_range_min` where the sensor exposes ranges). +2. `/telemetry` output contains `switch_nmxt` samples for the HOST endpoint — only the explicit `NMXT_METRIC_MAP` families with the allowlisted identity labels (no sanitized/unknown source names). +3. `/telemetry` output contains `nvue_gnmi` samples for the HOST endpoint: canonical `interface_*` (incl. `interface_link_speed_active` in gbps), `component_*`, and `platform_memory_used/total` + `platform_disk_total/used`. +4. `/telemetry` output contains the NVUE REST `fan_max_speed` sample (HOST). Logs show the NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected roles; matched-but-uncoercible leaves are debug-logged, not emitted. 5. The two catalog rows with no listed source (`CABLE-SNR-MEDIA-LANE-N`, `CABLE-SNR-HOST-LANE-N`) are checked explicitly in live output. If they do not appear through Redfish MetricReports, NMX-T, or gNMI, open a catalog/source-owner follow-up immediately; keep them open until source-owner resolution. -## Cardinality and series-shape acceptance checks - -The branch intentionally preserves generic Redfish MetricReport, NMX-T, and gNMI samples so GB200 bring-up does not drop unknown NVSWITCH rows. Before treating live validation as successful, capture the series shape and confirm it is bounded by device structure rather than by scrape churn: - -1. Capture the distinct `(metric name, metric_type, key)` tuples from two consecutive `/telemetry` scrapes after collectors are warm. -2. 
Confirm the tuple set is stable across those scrapes except for expected hot-plug, link, or error-counter changes. -3. For Redfish MetricReports, confirm labels are limited to report id/URI/definition and metric id/property/identity, and that internal sample keys use escaped raw MetricId/MetricProperty identity so sanitized aliases do not collapse. Raw string values must not appear as metric labels. -4. For NMX-T, confirm unknown metric keys include escaped raw port/source/node identity and stable sorted source-label identity so same metric/port samples with different lane/device labels do not collapse. -5. For gNMI, confirm unknown leaves are keyed by full source path plus endpoint/entity labels and do not create time-varying label names. -6. If live GB200 only needs a subset of TelemetryService reports, narrow `metric_report_ids` and consider tightening the BMC proxy ACL before final merge. - -Unit coverage that locks the pre-live behavior: - -- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples`. -- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels`. -- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric`. +## Series-shape acceptance checks + +Only explicit catalog-row mappings are emitted; unknown sources are dropped (debug-logged), never sanitized into metrics. Before treating live validation as successful: + +1. Capture the distinct `(name, metric_type, key)` tuples from two consecutive `/telemetry` scrapes after collectors are warm. +2. Confirm the tuple set is stable across scrapes except for expected link/error-counter changes. +3. Confirm every emitted series is one of the known families: `hw_sensor`, `switch_nmxt`, `nvue_gnmi` (`interface_*`/`component_*`/`platform_*`), or `fan_max_speed`. No `nvswitch_*`, `source_metric`, or `redfish_telemetry_service` series may appear. +4. 
Confirm NMX-T identity labels are the allowlisted `NMXT_LABEL_MAP` set (bounded per port); no raw/unknown source names as labels. + +Unit coverage that locks this behavior: + +- NMX-T: `test_nmxt_metric_map_locks_type_and_unit`, `test_unknown_nmxt_sources_not_allowlisted`. +- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_string_leaf_is_not_exported` (string leaves emit nothing). +- NVUE REST: `test_fan_max_speed_emit`. + +## Blocker escalations (Stage 0) + +Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. +44 rows are escalated below (21 config-threshold, 17 absent-from-live-probe, 6 string-valued). No +row is deferred — each has an explicit disposition and a named resolution path. + +### Group A — Config-threshold rows (21 rows, BLOCKER-THRESHOLD) + +These catalog entries represent threshold/limit/alarm-state values configured on the device, not +streamed telemetry counters. They are not exposed as live gNMI leaves and cannot be implemented +without a new data source. + +**Resolution:** Source owner (NVOS gNMI / Redfish sensor threshold team) must confirm whether +a future gNMI path or Redfish sensor `ThresholdHigh`/`ThresholdLow`/`ReadingRangeMax` field +can expose these. Until confirmed, they are out-of-scope for this branch. 
+ +| Row | Metric | +|------|-------------------------| +| 872 | ASIC-TEMP-CRITICAL | +| 873 | ASIC-TEMP-MAX | +| 874 | ASIC-TEMP-STATE | +| 879 | AMBIENT-MNG-TEMP-STATE | +| 881 | CPU_PACK_TEMP_CRITICAL | +| 882 | CPU_PACK_TEMP_MAX | +| 883 | CPU_PACK_TEMP_STATE | +| 890 | SODIMM_TEMP_CRITICAL | +| 891 | SODIMM_TEMP_MAX | +| 892 | SODIMM_TEMP_STATE | +| 1241 | DRIVE-TEMP-CRITICAL | +| 1242 | DRIVE-TEMP-MAX | +| 1243 | DRIVE-TEMP-STATE | +| 1245 | HSC-VINDC-TEMP-CRITICAL | +| 1246 | HSC-VINDC-TEMP-MAX | +| 1247 | HSC-VINDC-TEMP-STATE | +| 1249 | PDB-CONV-TEMP-CRITICAL | +| 1251 | PDB-CONV-TEMP-STATE | +| 1253 | PMIC-TEMP-CRITICAL | +| 1255 | PMIC-TEMP-STATE | +| 1259 | SWB-ASIC-PCB-TEMP-STATE | + +### Group B — Cable/transceiver alarm leaves (11 rows, ABSENT-BLOCKER) + +**CAVEAT: likely empty due to uncabled test rig.** These gNMI leaves exist in the NVOS schema +and are in the explicit allowlist, but returned no data during Stage 0 probing. The probe +switch had no cables attached, which is the most probable cause — transceiver alarm flags only +populate when a transceiver is inserted. + +**Resolution:** Re-probe on a cabled production switch before treating these as permanent +blockers. If leaves remain absent on a cabled switch, escalate to NVOS gNMI owner with the +exact NVOS version and transceiver module type.
+ +| Row | Metric | gNMI leaf (not live) | +|------|---------------------------------|----------------------------------------------------------------------------------------------------| +| 981 | CABLE-TEMP-ALARM | `/components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag` | +| 982 | CABLE-VOLTAGE-ALARM | `/components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag` | +| 983 | CABLE-TX-CDR-LOL | `/components/component/transceiver/physical-channels/channel/channel-diag/tx-cdr-lol` | +| 984 | CABLE-RX-CDR-LOL | `/components/component/transceiver/physical-channels/channel/channel-diag/rx-cdr-lol` | +| 985 | CABLE-TX-LOS | `/components/component/transceiver/physical-channels/channel/channel-diag/tx-los` | +| 986 | CABLE-RX-LOS | `/components/component/transceiver/physical-channels/channel/channel-diag/rx-los` | +| 2293 | CABLE-OPER-STATUS | `/components/component/transceiver/transceiver-diag/state/module-oper-status` | +| 2296 | NVSWITCH-CABLE-RX-POWER-LANE-LOW-N | `/components/component/transceiver/thresholds/threshold/state/input-power-lower` | +| 2297 | NVSWITCH-CABLE-TX-POWER-LANE-LOW-N | `/components/component/transceiver/thresholds/threshold/state/output-power-lower` | +| 2298 | NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N | `/components/component/transceiver/thresholds/threshold/state/input-power-upper` | +| 2299 | NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N | `/components/component/transceiver/thresholds/threshold/state/output-power-upper` | + +Note: Group B lists 11 rows, not the 9 in the plan's "cable/transceiver alarm leaves" figure: +7 alarm/status leaves (981–986, 2293) plus the 4 power threshold rows (2296–2299). The +power threshold rows overlap in root cause and are included here because they share +the same uncabled-rig caveat and re-probe condition. + +### Group C — NMX-T RDMA queue counters (3 rows, ABSENT-BLOCKER) + +NMX-T fields were not present in the live scrape output.
These are RDMA queue error counters +that may only appear under active RDMA workloads or specific firmware versions. + +**Resolution:** Escalate to NMX-T / RDMA owner with the NMX-T version from the test rig. +Re-probe under active RDMA traffic if possible. + +| Row | Metric | NMX-T field not live | +|------|-------------|----------------------| +| 1706 | RQ-NUM-WRFE | `rq_num_wrfe` | +| 1707 | RQ-NUM-LLE | `rq_num_lle` | +| 1708 | SQ-NUM-WRFE | `sq_num_wrfe` | + +### Group D — Single-field ABSENT-BLOCKERs + +**OS-KERNEL (row 765):** Catalog source is NVOS CLI only (`nv show system version {build-id}`). +No gNMI leaf or NMX-T field matched. Implementing this row requires either a new CLI collector +(not in scope for this branch) or a new NVOS gNMI exposure. Escalate to NVOS owner. + +**TIME-SINCE-LASTS-CLEAR (row 909):** gNMI leaf +`/interfaces/interface/phy-diag/state/time-since-last-clear-min` is in the NVOS schema but +returned no data. Escalate to NVOS gNMI owner with NVOS version; confirm whether this leaf +requires a specific counter-clear event to populate. + +**PLR-CODES-LOSS (row 931):** NMX-T field `HiRetransmissionRate` is not present in the live +scrape. This may be a naming discrepancy or a field absent in the installed NMX-T version. +Escalate to NMX-T owner with the NMX-T version string from the test rig. + +### Group E — String-valued rows (6 rows, BLOCKER-STRING) + +These catalog rows are present live but carry string values with no numeric encoding, so they +cannot be emitted as numeric `MetricSample`s (the producer is numeric-only). They are not silently +dropped — they are escalated pending a string/label export path (or enum-coding for the FSM-style ones). 
+ +| Row | Metric | Live source / value | +|-----|-------------------|-----------------------------------------------------------------------------| +| 862 | CONTACT | gNMI `/platform-general/state/contact` (empty on rig) | +| 863 | LOCATION | gNMI `/platform-general/state/location` (empty on rig) | +| 864 | NODE-DESCRIPTION | gNMI `/platform-general/state/platform-name` ("x86_64-nvidia_n5400_ld-r0") | +| 876 | ASIC-NAME | gNMI `/components/component/state/name` (e.g. "ASIC1") | +| 961 | PHY-MANAGER-STATE | gNMI `/interfaces/interface/phy-diag/state/phy-manager-state` (FSM enum) | +| 965 | VL-CAPABILITIES | gNMI `/interfaces/interface/infiniband/state/vl-capabilities` ("VL0-VL7") | + +**Resolution:** add a string/label export path (e.g. an info-style series or label) for the +descriptive rows, or enum-code the FSM-style ones (`PHY-MANAGER-STATE`) like the existing +`physical_port_state` converter. Tracked as a follow-up (#11). diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 0f85f68b4d..93dce6af3a 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -1,194 +1,194 @@ -catalog_row,guid,metric_param_name,description,category,data_type,gb200_applicability,availability,source_families,primary_source,fallback_source,source_precedence,duplicate_alias_policy,target_collector,target_emitted_surface,current_coverage,implementation_status,coverage_reason,redfish_or_mrd_path,nvos_gnmi_path,nmx_t_field,nvos_cli_reference,onboard_dbus_reference,test_fixture_plan,live_validation_plan -763,NVSWITCH-NET-FW-VER,NET-FW-VER,Switch ASIC Firmware Version,Config,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is 
justified,extend NmxtCollector mapping,nvswitch_net_fw_ver as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId {FirmwareVersion},NA,FW_Version,nv show platform firmware $name {name: {Name: ASIC}} {actual-firmware},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/software/HGX_FW_NVSwitch_{InstanceId} xyz.openbmc_project.Software.Version Version,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -764,NVSWITCH-OS-VERSION,OS-VERSION,OS version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_os_version as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show system version {kernel},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -765,NVSWITCH-OS-KERNEL,OS-KERNEL,OS Kernel version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer 
Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_os_kernel as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show system version {image{build-id}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -766,NVSWITCH-EROT-FW-VERSION,EROT-FW-VERSION,ERoT FW version,Config,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_erot_fw_version as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform firmware $name {name: {Name: EROT}} {actual-firmware},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -767,NVSWITCH-BMC-VERSION,BMC-VERSION,BMC firmware version,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_bmc_version as inventory/info event or state metric with bounded 
labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform firmware $name {name: {Name: BMC}} {actual-firmware},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -794,NVSWITCH-LINK-DOWNED-COUNTER,LINK-DOWNED-COUNTER,Total number of times the Port Training state machine has failed the link error recovery process and downed the link.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_downed_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkDownedCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-downed,Link_Down,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{counters{link-downed}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -795,NVSWITCH-PORT-MALFORMED-PACKET-ERRORS,PORT-MALFORMED-PACKET-ERRORS,"Total number of packets received on the port that contain malformed packet errors • Data 
packets: LVer, length, VL • Link packets: operand, length, VL",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_malformed_packet_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{MalformedPackets}}},/interfaces/interface [name]/phy-diag/state/port-malformed-packet-errors,PortMalformedPacketErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-malformed-packet-errors}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -796,NVSWITCH-PORT-NEIGHBOR-MTU-DISCARDS,PORT-NEIGHBOR-MTU-DISCARDS,Number of outbound packets discarded by the port because packet length exceeded the NeighborMTU.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_neighbor_mtu_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE 
gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{NeighborMTUDiscards}}},/interfaces/interface [name]/phy-diag/state/port-neighbor-mtu-discards,PortNeighborMTUDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-neighbor-mtu-discards}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -797,NVSWITCH-PORT-RCV-ERRORS,PORT-RCV-ERRORS,"Total number of packets containing an error that were received on the port. These errors include: • Local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine) • Malformed data packet errors (LVer, length, VL) • Malformed link packet errors (operand, length, VL) • Packets discarded due to buffer overrun",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_in_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_in_errors,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXErrors},interfaces/interface [name]/state/counters/in-errors,PortRcvErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-errors}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink 
Switch BMC/HOST after branch build-test-lint review -798,NVSWITCH-PORT-XMIT-DISCARDS,PORT-XMIT-DISCARDS,Total number of outbound packets discarded by the port because the port is down or congested.,Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXDiscards}},interfaces/interface[name=*]/state/counters/out-discards,PortXmitDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-drops}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -799,NVSWITCH-PORT-RCV-REMOTE-PHYSICAL-ERRORS,PORT-RCV-REMOTE-PHYSICAL-ERRORS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_remote_physical_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXRemotePhysicalErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-remote-phy-errors,PortRcvRemotePhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -800,NVSWITCH-PORT-RCV-SWITCH-RELAY-ERRORS,PORT-RCV-SWITCH-RELAY-ERRORS,"Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay. This might happen if, for instance, the destination port is congested or there are internal switch errors.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless
source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_switch_relay_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{RXSwitchRelayErrors}}},interfaces/interface[name=*]/infiniband/state/counters/port/rcv-switch-relay-errors,PortRcvSwitchRelayErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-remote-physical-errors}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -801,NVSWITCH-QP1Dropped,QP1Dropped,"Number of QP1 MADs (packets) dropped due to resource limitations (e.g., lack of buffers or receives posted) on the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_qp1dropped MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{QP1Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/qp1-dropped,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{counters{qp1-drops}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -802,NVSWITCH-VL15-DROPPED,VL15-DROPPED,"Number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers) of the port.",Error,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl15_dropped MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{VL15Dropped}}},interfaces/interface[name=*]/infiniband/state/counters/port/vl15-dropped,VL15Dropped,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -804,NVSWITCH-SERIAL,SERIAL,Serial Number,Inventory,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serial as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this 
row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {SerialNumber},NA,sw_serial_number,nv show platform {serial-number},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/NVSwitch1 xyz.openbmc_project.Inventory.Decorator.Asset SerialNumber,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -806,NVSWITCH-NODE-GUID,NODE-GUID,"GUID of the HCA, switch, GPU, or router itself. All ports on the same node shall report the same NodeGUID. Provides a means to uniquely identify a node within a subnet and determine co-location of ports.",Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T; NVOS CLI; Onboard DBus,NMX-T,NVOS CLI,NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_node_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Node_GUID,nv show ib device $IbDeviceId {IbDeviceId: {type: NVLink*}} {guid},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/chassis/HGX_NVSwitch_{InstanceId} xyz.openbmc_project.Common.UUID UUID,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -807,NVSWITCH-PORT-GUID,PORT-GUID,GUID of the port. 
All ports on the same switch shall report the same NodeGUID.,Inventory,Text,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NMX-T,NMX-T,Redfish Fabric/Switch/Port,NMX-T then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Chassis/$ChassisId {UUID},NA,Port_GUID,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -834,NVSWITCH-NVLINK-STATUS,NVLINK-STATUS,NVLink Link status (e.g. 
LinkUp),Status,Text,GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI; Onboard DBus,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Onboard DBus then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvlink_status as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2021.1c /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId {LinkStatus},interfaces/interface[name=$port_name]/infiniband/state/physical-port-state,phy_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{physical-state}},busctl get-property xyz.openbmc_project.GpuMgr /xyz/openbmc_project/inventory/system/fabrics/HGX_NVLinkFabric_{InstanceId}/Switches/NVSwitch_{InstanceId}/Ports/NVLink_{InstanceId} xyz.openbmc_project.Inventory.Item.Port LinkStatus,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -846,NVSWITCH-LINK-ERROR-RECOVERY-COUNTER,LINK-ERROR-RECOVERY-COUNTER,Total number of times the Port Training state machine has successfully completed the link error recovery process. 
This entry is applicable for platforms with NVL5.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_error_recovery_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{LinkErrorRecoveryCount}}},interfaces/interface[name=*]/infiniband/state/counters/port/link-error-recovery,LinkErrorRecoveryCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{error-recovery}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -847,NVSWITCH-PORT-MULTICAST-RCV-PKTS,PORT-MULTICAST-RCV-PKTS,"Total number of multicast packets, including multicast packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_rcv_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; 
live hardware validation must confirm the concrete device path/name.",@pshima@nvidia.com spelling is wrong RXMulitcastFrames -> RXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXMulticastFrames}},/interfaces/interface [name]/phy-diag/state/port-multi-cast-rcv-pkts,PortMultiCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-multicast-pkts}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -848,NVSWITCH-PORT-MULTICAST-XMIT-PKTS,PORT-MULTICAST-XMIT-PKTS,Total number of multicast packets transmitted on all VLs from the port. This may include multicast packets with errors.,Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_multicast_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.","@pshima@nvidia.com spelling issue , should be TXMulticastFrames _Assigned to Pradeep Kumar Shima US_ -Rajat Jain IN 2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXMulticastFrames}}}",/interfaces/interface [name]/phy-diag/state/port-multi-cast-xmit-pkts,PortMultiCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-multicast-pkts}}},,required before review: parser/unit 
fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -849,NVSWITCH-PORT-RCV-DATA,PORT-RCV-DATA,"Total number of data octets, divided by 4, received on all VLs at the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_data MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {RXBytes},interfaces/interface[name=*]/state/counters/in-octets,PortRcvDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-bytes}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -850,NVSWITCH-PORT-RCV-PKTS,PORT-RCV-PKTS,"Total number of received packets, including packets containing errors.",Status,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_pkts 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXFrames}},interfaces/interface[name=*]/state/counters/in-pkts,PortRcvPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -851,NVSWITCH-PORT-UNICAST-RCV-PKTS,PORT-UNICAST-RCV-PKTS,"Total number of unicast packets, including unicast packets containing errors.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_rcv_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{RXUnicastFrames}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-rcv-pkts,PortUniCastRcvPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{in-unicast-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch 
build-test-lint review -852,NVSWITCH-PORT-UNICAST-XMIT-PKTS,PORT-UNICAST-XMIT-PKTS,Total number of unicast packets transmitted on all VLs from the port. This may include unicast packets with errors.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_unicast_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXUnicastFrames}}},/interfaces/interface [name]/phy-diag/state/port-uni-cast-xmit-pkts,PortUniCastXmitPkts,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-unicast-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -853,NVSWITCH-PORT-XMIT-DATA,PORT-XMIT-DATA,"Total number of data octets, divided by 4, transmitted on all VLs from the port. This includes all octets between (and not including) the start of packet delimiter and the VCRC, and may include packets containing errors. 
It excludes all link packets.",Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_data MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {TXBytes}},interfaces/interface[name=*]/state/counters/out-octets,PortXmitDataExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-bytes}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -854,NVSWITCH-PORT-XMIT-PKTS,PORT-XMIT-PKTS,Total number of packets transmitted on all VLs from the port. 
This may include packets with errors,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_pkts MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Networking{TXFrames}},interfaces/interface[name=*]/state/counters/out-pkts,PortXmitPktsExtended,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-pkts}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -855,NVSWITCH-PORT-XMIT-WAIT,PORT-XMIT-WAIT,The number of ticks during which the port selected by PortSelect had data to transmit but no data was sent during the entire tick either because of insufficient credits or because of lack of arbitration.,Performance,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_xmit_wait MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can 
emit this row; live hardware validation must confirm the concrete device path/name.",2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{TXWait}}},interfaces/interface[name=*]/infiniband/state/counters/port/xmit-wait,PortXmitWait,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-wait}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -862,NVSWITCH-CONTACT,CONTACT,UTF-8 encoded string to describe contact person.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_contact as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,platform-general/state/contact,NA,TBD,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -863,NVSWITCH-LOCATION,LOCATION,UTF-8 encoded string to describe location of the device.,Platform,Text,GB200 NVL NvswitchTray,Available IB,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_location as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation 
must confirm the concrete device path/name.",,platform-general/state/location,NA,nv show platform chassis-location {slot-number},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -864,NVSWITCH-NODE-DESCRIPTION,NODE-DESCRIPTION,UTF-8 encoded string to describe node in text format.,Inventory,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_node_description as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,platform-general/state/platform-name,node_description,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -865,NVSWITCH-LID,LID,Local ID- Link layer address of an end port.,NetworkId,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_lid MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,lid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-866,NVSWITCH-PORT-NUMBER,PORT-NUMBER,Port number,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_number as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Port_Number,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -867,NVSWITCH-PORT-LABEL,PORT-LABEL,Front panel label of the port,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_port_label as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,port_label,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -868,NVSWITCH-REVISION,REVISION,Switch HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_revision MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware 
validation must confirm the concrete device path/name.",NA,NA,sw_revision,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -869,NVSWITCH-DEVICE-HARDWARE-REVISION,DEVICE-HARDWARE-REVISION,Device HW revision,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_hardware_revision MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,device_hw_rev,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -870,NVSWITCH-CPU_CORE_NUMBER,CPU_CORE_NUMBER,Number of cores,System,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_core_number MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show system cpu {core-count},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -872,NVSWITCH-ASIC-TEMP-CRITICAL,ASIC-TEMP-CRITICAL,"Critical temperature threshold for NVSwitch ASIC. 
Above this level, the system will shutdown.",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -873,NVSWITCH-ASIC-TEMP-MAX,ASIC-TEMP-MAX,Max temperature threshold for NVSwitch ASIC.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {max}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -874,NVSWITCH-ASIC-TEMP-STATE,ASIC-TEMP-STATE,NVSwitch ASIC state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series 
unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_asic_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -875,NVSWITCH-ASIC-TEMP-CURRENT,ASIC-TEMP-CURRENT,NVSwitch ASIC current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_asic_temp_current MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=ASIC*]/asic/state/asic-temp,Chip_Temp,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""ASIC*""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -876,NVSWITCH-ASIC-NAME,ASIC-NAME,NVSwitch ASIC name,Platform,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector 
sample paths/processors,nvswitch_asic_name MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=ASIC*]/state/name,NA,nv show platform {asic-model},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -879,NVSWITCH-AMBIENT-MNG-TEMP-STATE,AMBIENT-MNG-TEMP-STATE,Ambient temperature located in port side state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_ambient_mng_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -880,NVSWITCH-AMBIENT-MNG-TEMP-CURRENT,AMBIENT-MNG-TEMP-CURRENT,Ambient temperature located in port side,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_ambient_mng_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Ambient-MNG-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -881,NVSWITCH-CPU_PACK_TEMP_CRITICAL,CPU_PACK_TEMP_CRITICAL,"Critical temperature threshold for CPU PACK, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -882,NVSWITCH-CPU_PACK_TEMP_MAX,CPU_PACK_TEMP_MAX,Max temperature threshold for CPU PACK,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_max 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -883,NVSWITCH-CPU_PACK_TEMP_STATE,CPU_PACK_TEMP_STATE,CPU PACK temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -884,NVSWITCH-CPU_PACK_TEMP_CURRENT,CPU_PACK_TEMP_CURRENT,CPU PACK temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_cpu_pack_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not 
collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: CPU-Pack-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -885,NVSWITCH-CPU-UTIL,CPU-UTIL,ComE CPU utilization,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_util MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,components/component[name=cpu]/cpu/utilization/state/avg,NA,nv show system cpu {total-utilization},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -886,NVSWITCH-MEM-UTIL,MEM-UTIL,Memory in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_util MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/memory-used,NA,nv show system memory {physical{utilization}},NA,required before review: parser/unit fixture plus metric 
emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -887,NVSWITCH-MEM-TOTAL-SIZE,MEM-TOTAL-SIZE,Memory total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mem_total_size MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/memory-total-size,NA,nv show system memory {physical{total}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -888,NVSWITCH-DISK-TOTAL-SIZE,DISK-TOTAL-SIZE,Disk total size,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_total_size MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/disk-total-size,NA,TBD,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -889,NVSWITCH-DISK-USED,DISK-USED,Disk space in use,System,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is 
justified,extend NvueGnmiCollector sample paths/processors,nvswitch_disk_used MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,platform-general/state/disk-used,NA,TBD,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -890,NVSWITCH-SODIMM_TEMP_CRITICAL,SODIMM_TEMP_CRITICAL,"Critical temperature threshold for SODIMM temperature, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -891,NVSWITCH-SODIMM_TEMP_MAX,SODIMM_TEMP_MAX,Max temperature threshold for SODIMM temperature,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_max 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {max}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -892,NVSWITCH-SODIMM_TEMP_STATE,SODIMM_TEMP_STATE,SODIMM temperature state - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -893,NVSWITCH-SODIMM_TEMP_CURRENT,SODIMM_TEMP_CURRENT,SODIMM temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_sodimm_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected 
directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SODIMM-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -894,FAN-MAX-SPEED,MAX-SPEED,Chassis fan reading range (max),Config,Float,GB200 NVL BMC; GB200 NVL NvswitchTray,Available OOB,Redfish Fabric/Switch/Port; NVOS CLI,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port,Redfish Fabric/Switch/Port then NVOS CLI,one canonical series unless source-qualified duplicate is justified,existing SensorsCollector range emission when include_sensor_thresholds=true,hw_sensor {reading_type}_range_max MetricSample with sensor_range=reading_range_max,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2023.3 /redfish/v1/Chassis/$ChassisId/Sensors/$SensorId {ReadingRangeMax},NA,NA,nv show platform environment fan $FanId {max-speed},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -897,NVSWITCH-PORT-LOGICAL-STATE,PORT-LOGICAL-STATE,Port State. 
Enumerated as: 0: No State Change; 1: Down (includes failed links) 2: Initialize 3: Armed 4: Active,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_logical_state MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/logical-port-state,logical_state,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{logical-state}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -898,NVSWITCH-FEC-MODE-ACTIVE,FEC-MODE-ACTIVE,"FEC mode active: 0: No_FEC 1: Firecode_FEC 2: Standard_RS_FEC - RS(528,514) 3: Standard_LL_RS_FEC - RS(271,257) 6: Interleaved_Standard_RS-FEC - (544,514) 7: Standard_RS-FEC - (544,514)",Status,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_fec_mode_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Active_FEC,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -899,NVSWITCH-RAW-BER,RAW-BER,Raw BER- calculated by the following: bits 15:8 - 
raw_ber_magnitude bits 3:0 - raw_ber_coef Raw_BER = raw_ber_coef*10^(-raw_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber,Total_Raw_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{raw-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -900,NVSWITCH-EFFECTIVE-BER,EFFECTIVE-BER,Effective BER- calculated by the following: bits 15:8 - effective_ber_magnitude bits 3:0 - effective_ber_coef Effective_BER = effective_ber_coef*10^(-effective_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_effective_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,NA,/interfaces/interface [name]/phy-diag/state/effective-ber,Effective_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 
NVLink Switch BMC/HOST after branch build-test-lint review -901,NVSWITCH-SYMBOL-BER,SYMBOL-BER,Symbol BER- calculated by the following: bits 15:8 - symbol_ber_magnitude bits 3:0 - symbol_ber_coef Symbol_BER = symbol_ber_coef*10^(-symbol_ber_magnitude),Link-Quality,Float,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; Redfish TelemetryService; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,Redfish TelemetryService,NVOS gNMI then Redfish TelemetryService then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_symbol_ber MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_effective_ber/interface_symbol_ber,2023.0b /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{BitErrorRate}}},/interfaces/interface [name]/phy-diag/state/symbol-ber,Symbol_BER,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{symbol-ber}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -902,NVSWITCH-ZERO-HIST,ZERO-HIST,First FEC histogram bin with value of 0 while all higher bins are only with 0 value as well.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_zero_hist MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface 
[name]/phy-diag/state/zero-hist,fc_zero_hist,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{zero-hist}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -903,NVSWITCH-PHY-RAW-ERRORS-LANE0,PHY-RAW-ERRORS-LANE0,This counter provides information on error bits that were identified on lane 0. (pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-1,Raw_Errors_Lane_0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{phy-raw-errors}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -904,NVSWITCH-PHY-RAW-ERRORS-LANE1,PHY-RAW-ERRORS-LANE1,This counter provides information on error bits that were identified on lane 1. 
(pre FEC & PLR),Link-Quality,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_raw_errors_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-errors-ch-2,Raw_Errors_Lane_1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{phy-raw-errors}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -905,NVSWITCH-RAW-BER-LANE0,RAW-BER-LANE0,Raw BER for lane 0. same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-1,raw_ber_lane0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{0{raw-ber}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -906,NVSWITCH-RAW-BER-LANE1,RAW-BER-LANE1,Raw BER for lane 1. 
same calculation as RAW-BER.,Link-Quality,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_raw_ber_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/raw-ber-ch-2,raw_ber_lane1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{lane{1{raw-ber}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -907,NVSWITCH-PHY-EFFECTIVE-ERRORS,PHY-EFFECTIVE-ERRORS,This counter provides information on error bits that were not corrected by FEC correction algorithm or that FEC is not active. 
(post FEC pre PLR),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,Redfish Fabric/Switch/Port; NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI then Redfish Fabric/Switch/Port,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_effective_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",2025.1 /redfish/v1/Fabrics/$FabricId/Switches/$SwitchId/Ports/$PortId/Metrics {Oem{Nvidia{EffectiveError}}},/interfaces/interface [name]/phy-diag/state/effective-errors,Effective_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{effective-errors}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -908,NVSWITCH-PHY-SYMBOL-ERRORS,PHY-SYMBOL-ERRORS,Total number of minor link errors detected on one or more physical lanes. This counter provides information on error bits that were not corrected by phy correction mechanisms. 
(post FEC & PLR),Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NMX-T,NVOS gNMI,NMX-T then NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,existing switch_nmxt symbol_errors MetricSample,covered_host_nmxt,already-covered-regression-required,NMX-T maps Symbol_Errors to symbol_errors,NA,/interfaces/interface [name]/phy-diag/state/symbol-errors,Symbol_Errors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{nvl{errors{symbol-errors{receive}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -909,NVSWITCH-TIME-SINCE-LASTS-CLEAR,TIME-SINCE-LASTS-CLEAR,The time passed since the last counters clear event in msec- time since the port was raised to up.,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_time_since_lasts_clear MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/time-since-last-clear-min,Time_since_last_clear_Min,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{time-since-last-clear-min}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -910,NVSWITCH-DEVICE-ID,DEVICE-ID,Device ID information as assigned by device manufacturer.,Inventory,Text,GB200 NVL 
NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_id as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Device_ID,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -911,NVSWITCH-FEC-HIST-0,FEC-HIST-0,Value of RS FEC Histogram (Reed Solomon error correction) bin0,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin0,hist0,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{0{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -912,NVSWITCH-FEC-HIST-1,FEC-HIST-1,Value of RS FEC Histogram (Reed Solomon error correction) bin1,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin1,hist1,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{1{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -913,NVSWITCH-FEC-HIST-2,FEC-HIST-2,Value of RS FEC Histogram (Reed Solomon error correction) bin2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_2 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin2,hist2,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{2{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -914,NVSWITCH-FEC-HIST-3,FEC-HIST-3,Value of RS FEC Histogram (Reed Solomon error correction) bin3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T 
then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_3 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin3,hist3,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{3{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -915,NVSWITCH-FEC-HIST-4,FEC-HIST-4,Value of RS FEC Histogram (Reed Solomon error correction) bin4,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_4 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin4,hist4,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{4{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -916,NVSWITCH-FEC-HIST-5,FEC-HIST-5,Value of RS FEC Histogram (Reed Solomon error correction) bin5,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS 
gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_5 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin5,hist5,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{5{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -917,NVSWITCH-FEC-HIST-6,FEC-HIST-6,Value of RS FEC Histogram (Reed Solomon error correction) bin6,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_6 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin6,Hist6,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{6{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -918,NVSWITCH-FEC-HIST-7,FEC-HIST-7,Value of RS FEC Histogram (Reed Solomon error correction) 
bin7,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_7 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin7,Hist7,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{7{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -919,NVSWITCH-FEC-HIST-8,FEC-HIST-8,Value of RS FEC Histogram (Reed Solomon error correction) bin8,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_8 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin8,Hist8,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{8{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-920,NVSWITCH-FEC-HIST-9,FEC-HIST-9,Value of RS FEC Histogram (Reed Solomon error correction) bin9,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_9 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin9,Hist9,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{9{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -921,NVSWITCH-FEC-HIST-10,FEC-HIST-10,Value of RS FEC Histogram (Reed Solomon error correction) bin10,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_10 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin10,Hist10,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{10{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on 
GB200 NVLink Switch BMC/HOST after branch build-test-lint review -922,NVSWITCH-FEC-HIST-11,FEC-HIST-11,Value of RS FEC Histogram (Reed Solomon error correction) bin11,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_11 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin11,Hist11,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{11{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -923,NVSWITCH-FEC-HIST-12,FEC-HIST-12,Value of RS FEC Histogram (Reed Solomon error correction) bin12,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_12 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin12,hist12,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{12{count}}}}}}},,required before review: parser/unit fixture plus metric 
emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -924,NVSWITCH-FEC-HIST-13,FEC-HIST-13,Value of RS FEC Histogram (Reed Solomon error correction) bin13,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_13 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin13,hist13,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{13{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -925,NVSWITCH-FEC-HIST-14,FEC-HIST-14,Value of RS FEC Histogram (Reed Solomon error correction) bin14,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_14 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin14,hist14,nv show interface $InterfaceId {InterfaceId: {type: nvl}} 
{link{phy{health{histogram{rs-fec-corrected-errors{14{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -926,NVSWITCH-FEC-HIST-15,FEC-HIST-15,Value of RS FEC Histogram (Reed Solomon error correction) bin15,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fec_hist_15 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/interfaces/interface [name]/phy-diag/state/rs-num-corr-err-bin15,hist15,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{histogram{rs-fec-corrected-errors{15{count}}}}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -931,NVSWITCH-PLR-CODES-LOSS,PLR-CODES-LOSS,Recieved bandwidth loss due to codes retransmission. 
calculated in resolution of: (plr_rcv_code_err / plr_rcv_codes) * 10^10 BW Loss % = (plr_codes_loss / 10^10 ) *100,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_plr_codes_loss MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,HiRetransmissionRate,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-codes-loss}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -932,NVSWITCH-PORT-BUFFER-OVERRUN-ERRORS,PORT-BUFFER-OVERRUN-ERRORS,Total number of packets received on the port that were discarded due to buffer overrun.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_buffer_overrun_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/counters/port/excessive-buffer-overrun,ExcessiveBufferOverrunErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{buffer-overrun-errors}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch 
BMC/HOST after branch build-test-lint review -933,NVSWITCH-LINK-SPEED-ACTIVE,LINK-SPEED-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_speed_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,interfaces/interface[name=$port_name]/infiniband/state/speed,Link_speed_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{speed}}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -934,NVSWITCH-PLR-RCV-CODES,PLR-RCV-CODES,Number of received PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-codes,PlrRcvCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after 
branch build-test-lint review -935,NVSWITCH-PLR-RCV-CODES-ERR,PLR-RCV-CODES-ERR,The total number of rejected PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_codes_err MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-code-err,PlrRcvCodeErr,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-codes-err}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -936,NVSWITCH-PLR-RCV-UNCORRECTABLES-CODE,PLR-RCV-UNCORRECTABLES-CODE,The total number of uncorrectable PLR codewords received,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_rcv_uncorrectables_code MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-rcv-uncorrectable-code,PlrRcvUncorrectableCode,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-rcv-uncorrectable-code}}}},NA,required before review: parser/unit fixture plus metric emission 
assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -937,NVSWITCH-PLR-XMIT-CODES,PLR-XMIT-CODES,Number of transmitted PLR codewords,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-codes,PlrXmitCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-codes}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -938,NVSWITCH-PLR-XMIT-RETRYS-CODES,PLR-XMIT-RETRYS-CODES,The total number of PLR codewords retransmitted,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_codes MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-codes,PlrXmitRetryCodes,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-codes}}}},NA,required before review: parser/unit 
fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -939,NVSWITCH-PLR-XMIT-RETRYS-EVENTS,PLR-XMIT-RETRYS-EVENTS,The total number of retransmitted events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retrys_events MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events,PlrXmitRetryEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -940,NVSWITCH-PLR-SYNC-EVENTS,PLR-SYNC-EVENTS,The number of PLR sync events,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_sync_events MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-sync-events,PlrSyncEvents,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-sync-events}}}},NA,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -941,NVSWITCH-PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,The maximum number of retransmitted events in 60 sec window based upon the action of undertaking PLR (physical layer retry),Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_plr_xmit_retry_codes_within_minute MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/plr-xmit-retry-events-within-t-sec-max,PlrXmitRetryCodesWithinTSecMax,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -942,NVSWITCH-PLR-BW-LOSS-PERCENT,PLR-BW-LOSS-PERCENT,The bandwidth loss (percentage) based upon PLR on the NVLink.,Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_plr_bw_loss_percent MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent 
Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{plr-xmit-retry-events-within-t-sec-max}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -943,NVSWITCH-RQ-GENERAL-ERROR,RQ-GENERAL-ERROR,The total number of packets that were dropped since it contained errors. Reasons for this include: Dropped due to MPR mismatch.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_rq_general_error MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/rq-general-error,rq_general_error,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{rq-general-error}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -944,NVSWITCH-TIME-TO-LINKS-UP,TIME-TO-LINKS-UP,"Time in msec to link up from disable until phy up state. While the phy manager did not reach phy up state the timer will return 0. The timer resets to 0 in one of the following cases: 1. When moving to disable or rx disable state. 2. 
When moving from active or phy up to polling state, while working at force mode.",Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_to_links_up MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,time_to_link_up_ext_msec,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -945,NVSWITCH-STATUS-OPCODE,STATUS-OPCODE,Opcode for advanced debug. String representation can be found in STATUS-MESSAGE.,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,Advanced_Status_Opcode,"nv show interface --view link-diagnostics ""code""",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -946,NVSWITCH-STATUS-MESSAGE,STATUS-MESSAGE,String represntation of STATUS-OPCODE. 
All Messages are terminated by a Null character ‘\0’,Status,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_status_message as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,Status_Message,"nv show interface --view link-diagnostics ""status""",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -947,NVSWITCH-DOWN-BLAME,DOWN-BLAME,Which receiver caused last link down: 0: Unknown 1: Local_phy 2: Remote_phy,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_down_blame MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,down_blame,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -948,NVSWITCH-LOCAL-REASON-OPCODE,LOCAL-REASON-OPCODE,Opcde of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 3: Block_Lock_loss 4: Alignment_loss 5: FEC_sync_loss 6: PLL_lock_loss 7: FIFO_overflow 8: false_SKIP_condition 9: Minor_Error_threshold_exceeded 10: Physical_layer_retransmission_timeout 11: 
Heartbeat_errors 12: Link_Layer_credit_monitoring_watchdog 13: Link_Layer_integrity_threshold_exceeded 14: Link_Layer_buffer_overrun 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 19: Down_by_verification_GW 20: Received_Remote_Fault 21: Received_TS1 22: Down_by_management_command 23: Cable_was_unplugged 24: Cable_access_issue 25: Thermal_shutdown 26: Current_issue 27: Power_budget 28: Fast_recovery_raw_ber 29: Fast_recovery_effective_ber 30: Fast_recovery_symbol_ber 31: Fast_recovery_credit_watchdog 32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_local_reason_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,local_reason_opcode,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -949,NVSWITCH-REMOTE-REASON-OPCODE,REMOTE-REASON-OPCODE,Opcode of link down reason: 0: No_link_down_indication 1: Unknown_reason 2: Hi_SER_or_Hi_BER 4: Alignment_loss 10: Physical_layer_retransmission_timeout 15: Down_by_outband_command_with_healthy_link 16: Down_by_outband_command_for_link_with_hi_ber 17: Down_by_inband_command_with_healthy_link 18: Down_by_inband_command_for_link_with_hi_ber 21: Received_TS1 22: Down_by_management_command 
32: Timeout 33: Peer_side_down_to_disable_state 34: Peer_side_down_to_disable_and_port_lock 35: Peer_side_down_due_to_thermal_event 36: Peer_side_down_due_to_force_event 37: Peer_side_down_due_to_reset_event 38: Reset_no_power_cycle 40: Down_due_to_HW_force_event 41: Down_due_to_thermal_event 42: L1_exit_failure 43: too_many_link_error_recoveries 44: Down_due_to_contain_mode 45: BW_loss_threshold_exceeded 47: Hi_SER 48: down_by_nmx_adminstate_cmd,Link-Quality,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_remote_reason_opcode MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,NA,remote_reason_opcode,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -950,NVSWITCH-PHY-RECEIVED-BITS,PHY-RECEIVED-BITS,"Total number of packets marked with the EBP delimiter received on the port. EBP is a special kind of packet that indicates the end of a burst of packets. A burst is a sequence of packets sent in rapid succession. The use of EBP helps in flow control. 
By knowing the end of a burst, the receiving side can manage its buffers efficiently and ensure that packets are processed in order without dropping any due to buffer overruns.",Performance,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_received_bits MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/phy-received-bits,phy_received_bits,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{health{phy-received-bits}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -951,NVSWITCH-PORT-RCV-CONSTRAINT-ERRORS,PORT-RCV-CONSTRAINT-ERRORS,Total number of packets received on the switch physical port that are discarded for the following reasons: • FilterRawInbound is true and packet is raw • PartitionEnforcementInbound is true and packet fails partition key check or IP version check,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_rcv_constraint_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/counters/port/rcv-constraints-errors,PortRcvConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{link{port-rcv-constraint-errors}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -952,NVSWITCH-PORT-XMIT-CONSTRAINTS-ERRORS,PORT-XMIT-CONSTRAINTS-ERRORS,Total number of packets not transmitted from the switch physical port for the following reasons: • FilterRawOutbound is true and packet is raw • PartitionEnforcementOutbound is true and packet fails partition key check or IP version check,Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,existing interface_out_errors MetricSample,covered_host_gnmi,already-covered-regression-required,NVUE gNMI sample processor emits interface_out_errors,NA,/interfaces/interface [name]/state/counters/out-errors,PortXmitConstraintErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {counters{out-errors}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -953,NVSWITCH-PORT-LOCAL-PHYSICAL-ERRORS,PORT-LOCAL-PHYSICAL-ERRORS,"Total number of packets received on the port that contain local physical errors (ICRC, VCRC, LPCRC, and all physical errors that cause entry into the BAD PACKET or BAD PACKET DISCARD states of the packet receiver state machine).",Error,,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend 
NvueGnmiCollector sample paths/processors,nvswitch_port_local_physical_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-local-physical-errors,PortLocalPhysicalErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-local-physical-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -954,NVSWITCH-SYNC-HEADER-ERROR-COUNTER,SYNC-HEADER-ERROR-COUNTER,Count of errored block sync header on one or more lanes.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_sync_header_error_counter MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/sync-header-error-counter,SyncHeaderErrorCounter,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{sync-header-error-counter}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -955,NVSWITCH-PORT-DLID-MAPPING-ERRORS,PORT-DLID-MAPPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch 
relay due to DLID mapping errors.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_dlid_mapping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-dlid-mapping-errors,PortDLIDMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-dlid-mapping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -956,NVSWITCH-LOCAL-LINK-INTEGRITY-ERRORS,LOCAL-LINK-INTEGRITY-ERRORS,The number of times that the count of local physical errors exceeded the threshold specified by LocalPhyErrors;,Error,Integer,GB200 NVL HMC; GB200 NVL BMC; GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_local_link_integrity_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name]/infiniband/state/counters/port/local-link-integrity-errors,LocalLinkIntegrityErrors,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after 
branch build-test-lint review -957,NVSWITCH-PORT-VL-MAPPING-ERRORS,PORT-VL-MAPPING-ERRORS,"Packet discards due to VL mapping behavior are not considered errors, so the behavior of this counter is implementation dependent. However, it is recommended that this counter be used to count the total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to VL mapping behavior",Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_vl_mapping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-vl-mapping-errors,PortVLMappingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-vl-mapping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -958,NVSWITCH-PORT-LOOPING-ERRORS,PORT-LOOPING-ERRORS,Total number of packets received on the port that were discarded because they could not be forwarded by the switch relay due to looping errors (output port = input port). 
This applies to switches only.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_looping_errors MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-looping-errors,PortLoopingErrors,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-looping-errors}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -959,NVSWITCH-PORT-INACTIVE-DISCARDS,PORT-INACTIVE-DISCARDS,Number of outbound packets discarded by the port because it is not in the active state.,Error,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_port_inactive_discards MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/port-inactive-discards,PortInactiveDiscards,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{phy{detail{port-inactive-discards}}}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST 
after branch build-test-lint review -960,NVSWITCH-LINK-WIDTH-ACTIVE,LINK-WIDTH-ACTIVE,link active width: Bit 0: 1x Bit 1: 2x Bit 2: 4x,Status,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T; NVOS CLI,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_link_width_active MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/width,Link_width_active,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{lanes}}",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -961,NVSWITCH-PHY-MANAGER-STATE,PHY-MANAGER-STATE,Show some more info about the PHY state: 0:Disabled 1:Open_port 2:Polling 3:Active_or_Linkup 4:Close_port 5:Phy_up 7:Rx_disable,Status,Text,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_phy_manager_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,/interfaces/interface [name]/phy-diag/state/phy-manager-state,Phy_Manager_State,NA,NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on 
GB200 NVLink Switch BMC/HOST after branch build-test-lint review -962,NVSWITCH-MTU,MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_mtu MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/mtu,NA,"nv show interface $InterfaceId {InterfaceId: {type: nvl, state: up}} {link{mtu}}",NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -963,NVSWITCH-MAX-SUPPORTED-MTU,MAX-SUPPORTED-MTU,Maximum Transmission Unit,Specs,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_max_supported_mtu MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/max-supported-mtus,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{max-supported-mtu}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -964,NVSWITCH-SUPPORTED-WIDTH,SUPPORTED-WIDTH,Maximum Transmission 
Unit,Specs,Float,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_supported_width MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/supported-widths,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{supported-lanes}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -965,NVSWITCH-VL-CAPABILITIES,VL-CAPABILITIES,Maximum Transmission Unit,Specs,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_vl_capabilities as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",NA,interfaces/interface[name=$port_name]/infiniband/state/vl-capabilities,NA,nv show interface $InterfaceId {InterfaceId: {type: nvl}} {link{vl-capabilities}},NA,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -966,NVSWITCH-FAN-STATE,FAN-STATE,Fan status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NVOS CLI,NVOS 
gNMI,NVOS CLI,NVOS gNMI then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_fan_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name=FAN1/1]/state/oper-status,NA,nv show platform environment fan $FanId {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -967,NVSWITCH-FAN-LED,FAN-LED,Fan LED color,Sensor.Other,Text,GB200 NVL NvswitchTray,Available,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_fan_led as inventory/info event or state metric with bounded labels,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,,nv show platform environment led $LedID {color},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -968,NVSWITCH-CABLE-PART-NUMBER,CABLE-PART-NUMBER,Cable part num,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_part_number as inventory/info event or state metric with bounded 
labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Cable_PN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-pn},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -969,NVSWITCH-CABLE-SERIAL-NUMBER,CABLE-SERIAL-NUMBER,Cable Serial num,Inventory,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_serial_number MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",Na,NA,Cable_SN,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-sn},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -970,NVSWITCH-CABLE-TRANSMITTER-TECHNOLOGY,CABLE-TRANSMITTER-TECHNOLOGY,Active/Passive cable,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_transmitter_technology as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation 
must confirm the concrete device path/name.",,NA,cable_technology,TBD,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -971,NVSWITCH-CABLE-TYPE,CABLE-TYPE,Cable/module type: 0: Unidentified 1: Active_cable - (active copper / optics) 2: Optical_Module - (separated) 3: Passive_copper_cable 4: Cable_unplugged 5: Twisted_pair,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_type as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_type,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-type},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -972,NVSWITCH-CABLE-VENDOR,CABLE-VENDOR,Cable vendor: 0: Other 1: Mellanox 2: Known_OUI 3: NVIDIA,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_vendor as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_vendor,nv show platform transceiver $TransceiverId 
{TransceiverId: {status: Inserted}} {vendor-name},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -973,NVSWITCH-CABLE-LENGTH,CABLE-LENGTH,Cable length in 1m units. For CMIS modules: bits 6:7 represent cable_length_multiplier for calculating cable length 00 - 0.1 multiplier (0.1 to 6.3m) 01- 1 multiplier (1 to 63m) 10 - 10 multiplier (10 to 630m) 11 - 100 multiplier (100 to 6300m) bits 0:5 represent cable_length_value for calculating cable length. length is calculated with cable_length_value * cable_length_- multiplier,Specs,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_length MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_length,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {cable-length},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -974,NVSWITCH-CABLE-IDENTIFIER,CABLE-IDENTIFIER,"0: QSFP28 1: QSFP_Plus 2: SFP28_or_SFP_Plus 3: QSA - (QSFP->SFP) 4: Backplane 5: SFP_DD 6: QSFP_DD 7: QSFP_CMIS 8: OSFP 9: C2C 10: DSFP 11: QSFP_Split_Cable identifiers that are CMIS compliant are: 5,6,7,8,10",Specs,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_identifier 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_identifier,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {identifier},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -975,NVSWITCH-CABLE-REV,CABLE-REV,ASCII Vendor revision aligned to right padded with 0h on the left,Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rev as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,vendor_rev,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {vendor-rev},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -976,NVSWITCH-CABLE-FW-VERSION,CABLE-FW-VERSION,module FW version (relevant for optic only),Config,,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_fw_version MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can 
emit this row; live hardware validation must confirm the concrete device path/name.",,NA,cable_fw_version,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {firmware},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -977,NVSWITCH-CABLE-RX-POWER-LANE0,CABLE-RX-POWER-LANE0,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane0 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rx_power_lane_0,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-1{rx-power{Power}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -978,NVSWITCH-CABLE-RX-POWER-LANE1,CABLE-RX-POWER-LANE1,module internally measured Rx input optical power for lane 1 in uW / dBm (relevant for optic only),Sensor.Power,Float,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_rx_power_lane1 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must 
confirm the concrete device path/name.",,NA,rx_power_lane_1,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {channel{channel-2{rx-power{Power}}}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -979,NVSWITCH-CABLE-DIAG-SUPPLY-VOLTAGE,CABLE-DIAG-SUPPLY-VOLTAGE,Internally measured supply voltage in 100uV (relevant for optic only),Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_diag_supply_voltage MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Module_Voltage,nv show platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {voltage{voltage}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -980,NVSWITCH-CABLE-TEMP,CABLE-TEMP,Module main temperature sensor measured on a unit scale of 1/256 C degrees(relevant for optic only),Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T; NVOS CLI,NMX-T,NVOS CLI,NMX-T then NVOS CLI,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_cable_temp MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Module_Temperature,nv show 
platform transceiver $TransceiverId {TransceiverId: {status: Inserted}} {temperature{temperature}},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -981,NVSWITCH-CABLE-TEMP-ALARM,CABLE-TEMP-ALARM,Temperature warning threshold on a unit scale of 1/256 C degrees.,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_temp_alarm MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag,Temp_flags,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -982,NVSWITCH-CABLE-VOLTAGE-ALARM,CABLE-VOLTAGE-ALARM,Voltage warning threshold on a unit scale of 100uV.,Sensor.Voltage,Integer,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_voltage_alarm MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag,Vcc_flags,NA,,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -983,NVSWITCH-CABLE-TX-CDR-LOL,CABLE-TX-CDR-LOL,Bitmask for latched Tx cdr loss of lock flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_cdr_lol as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-cdr-lol,tx_cdr_lol,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -984,NVSWITCH-CABLE-RX-CDR-LOL,CABLE-RX-CDR-LOL,Bitmask for latched Rx cdr loss of lock flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_cdr_lol as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-cdr-lol,rx_cdr_lol,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -985,NVSWITCH-CABLE-TX-LOS,CABLE-TX-LOS,Bitmask for latched Tx loss of signal flag per lane. Bit 0 - lane 0 ... Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_tx_los as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/tx-los,tx_los,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -986,NVSWITCH-CABLE-RX-LOS,CABLE-RX-LOS,Bitmask for latched Rx loss of signal flag per lane. Bit 0 - lane 0 ... 
Bit 7 - lane 7,Config,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI; NMX-T,NVOS gNMI,NMX-T,NVOS gNMI then NMX-T,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_rx_los as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name]/transceiver/physical-channels/channel[channel]/channel-diag/rx-los,rx_los,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -987,NVSWITCH-LINK-PARTNER-DESCRIPTION,LINK-PARTNER-DESCRIPTION,Description of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_description as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_description,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -988,NVSWITCH-LINK-PARTNER-NODE-GUID,LINK-PARTNER-NODE-GUID,GUID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate 
is justified,extend NmxtCollector mapping,nvswitch_link_partner_node_guid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_node_guid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -989,NVSWITCH-LINK-PARTNER-LID,LINK-PARTNER-LID,LID of the link partner side (port that is connected to the port),Inventory,Text,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_lid as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,link_partner_lid,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -990,NVSWITCH-LINK-PARTNER-PORT-NUM,LINK-PARTNER-PORT-NUM,Port number of the link partner side (port that is connected to the port),Inventory,Integer,GB200 NVL NvswitchTray,Available,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_partner_port_num MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware 
validation must confirm the concrete device path/name.",,NA,link_partner_port_num,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1174,NVSWITCH-CPU-STATE,CPU-STATE,CPU status,Status,Text,GB200 NVL NvswitchTray,Available,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cpu_state as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,components/component[name=cpu]/state/oper-status,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1241,NVSWITCH-DRIVE-TEMP-CRITICAL,DRIVE-TEMP-CRITICAL,"Critical temperature threshold for drive, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during 
post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1242,NVSWITCH-DRIVE-TEMP-MAX,DRIVE-TEMP-MAX,Max temperature threshold for drive,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1243,NVSWITCH-DRIVE-TEMP-STATE,DRIVE-TEMP-STATE,Drive Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1244,NVSWITCH-DRIVE-TEMP-CURRENT,DRIVE-TEMP-CURRENT,Drive 
Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_drive_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: Drive-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1245,NVSWITCH-HSC-VINDC-TEMP-CRITICAL,HSC-VINDC-TEMP-CRITICAL,"Critical temperature threshold for HSC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {crit},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1246,NVSWITCH-HSC-VINDC-TEMP-MAX,HSC-VINDC-TEMP-MAX,Max temperature threshold for HSC,Config,Integer,GB200 NVL 
NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_max MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {max},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1247,NVSWITCH-HSC-VINDC-TEMP-STATE,HSC-VINDC-TEMP-STATE,HSC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {state},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1248,NVSWITCH-HSC-VINDC-TEMP-CURRENT,HSC-VINDC-TEMP-CURRENT,HSC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified 
duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_hsc_vindc_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,nv show platform environment temperature $TemparatureId {TemparatureId: {Name: HSC-VinDC-Temp}} {current},,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1249,NVSWITCH-PDB-CONV-TEMP-CRITICAL,PDB-CONV-TEMP-CRITICAL,"Critical temperature threshold for PDB, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1251,NVSWITCH-PDB-CONV-TEMP-STATE,PDB-CONV-TEMP-STATE,PDB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; 
prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1252,NVSWITCH-PDB-CONV-TEMP-CURRENT,PDB-CONV-TEMP-CURRENT,PDB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pdb_conv_temp_current MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PDB-Conv-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1253,NVSWITCH-PMIC-TEMP-CRITICAL,PMIC-TEMP-CRITICAL,"Critical temperature threshold for PMIC, above this level the system will shutdown",Config,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or 
NMX-T before adding CLI collector",nvswitch_pmic_temp_critical MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",NA,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {crit}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1255,NVSWITCH-PMIC-TEMP-STATE,PMIC-TEMP-STATE,PMIC Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pmic_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1256,NVSWITCH-PMIC-TEMP-CURRENT,PMIC-TEMP-CURRENT,PMIC Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,NVOS CLI,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_pmic_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""PMIC-*-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1259,NVSWITCH-SWB-ASIC-PCB-TEMP-STATE,SWB-ASIC-PCB-TEMP-STATE,SWB ASIC PCB Temperature - o.k./not o.k.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_swb_asic_pcb_temp_state MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {state}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1260,NVSWITCH-SWB-ASIC-PCB-TEMP-CURRENT,SWB-ASIC-PCB-TEMP-CURRENT,SWB ASIC PCB Temperature - current temperature,Sensor.Thermal,Integer,GB200 NVL NvswitchTray,Available OOB,NVOS CLI,NVOS CLI,,NVOS CLI,one canonical series unless source-qualified duplicate is justified,"live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector",nvswitch_swb_asic_pcb_temp_current 
MetricSample,source_equivalent_required,requires-live-source-equivalent,"Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or gNMI exposure before signoff.",,NA,NA,"nv show platform environment temperature $TemparatureId {TemparatureId: {Name: ""SWB-ASIC*-PCB-Temp""}} {current}",,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1688,NVSWITCH-LINK-RECOVERY-SUCCESS-CNT,LINK-RECOVERY-SUCCESS-CNT,Successful recovery count in an active link. Counter resets on link flap.,"Status, Event",Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_link_recovery_success_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,successful_recovery_events,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1689,NVSWITCH-TOTAL-LINK-RECOVERY-SUCCESS-CNT,TOTAL-LINK-RECOVERY-SUCCESS-CNT,Total successful recovery count accumulated across link flaps.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_total_link_recovery_success_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the 
concrete device path/name.",,NA,total_successful_recovery_events,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1690,NVSWITCH-TIME-SINCE-LAST-RECOVERY,TIME-SINCE-LAST-RECOVERY,"Elapsed time since last recovery event, measured in seconds.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_since_last_recovery MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_since_last_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1691,NVSWITCH-TIME-BTWN-TWO-RECOVERIES,TIME-BTWN-TWO-RECOVERIES,"Time in msec between two last consecutive recoveries (success or fail) from exit of first to entry of second. 
When value is OxFFFF, time is more than 1 minute.",Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_time_btwn_two_recoveries MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_between_last_2_recoveries,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1692,NVSWITCH-RECOVERY-ATTEMPTS-L1-CNT,RECOVERY-ATTEMPTS-L1-CNT,Number of first level (logical lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l1_cnt MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,last_host_logical_recovery_attempts_count,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1693,NVSWITCH-RECOVERY-ATTEMPTS-L2-CNT,RECOVERY-ATTEMPTS-L2-CNT,Number of second level (Serdes) lock attempts made during the last recovery.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_attempts_l2_cnt 
MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,last_host_serdes_feq_attempts_count,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1694,NVSWITCH-RECOVERY-CYCLE-DURATION,RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last logical recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_recovery_cycle_duration MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,time_in_last_host_logical_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1695,NVSWITCH-SERDES-RECOVERY-CYCLE-DURATION,SERDES-RECOVERY-CYCLE-DURATION,Duration (in milliseconds) of the last Serdes recovery cycle.,Status,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_serdes_recovery_cycle_duration MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",,NA,time_in_last_host_serdes_feq_recovery,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1696,NVSWITCH-CONTAIN-DRAIN-XMIT-DISCARD,CONTAIN-DRAIN-XMIT-DISCARD,Number of transmit discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_xmit_discard MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,contain_n_drain_xmit_discards,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1697,NVSWITCH-CONTAIN-DRAIN-RCV-DISCARD,CONTAIN-DRAIN-RCV-DISCARD,Number of receive discards related to the contain and drain mechanism on NVLink ports,Performance,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_contain_drain_rcv_discard MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,contain_n_drain_rcv_discards,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-1698,NVSWITCH-DEVICE-NUM,DEVICE-NUM,Device number on tray,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_device_num MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,device_num_on_tray,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1699,NVSWITCH-BOARD-TYPE,BOARD-TYPE,board type,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_board_type as inventory/info event or state metric with bounded labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,board_type,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1700,NVSWITCH-CHASSIS-SLOT-IDX,CHASSIS-SLOT-IDX,chassis slot index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_slot_idx MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the 
concrete device path/name.",,NA,chassis_slot_index,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1701,NVSWITCH-TRAY-IDX,TRAY-IDX,Tray index,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_tray_idx MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,tray_index,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1702,NVSWITCH-TOPOLOGY-ID,TOPOLOGY-ID,Topology Id,Config,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_topology_id MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,topology_id,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1703,NVSWITCH-CHASSIS-ID,CHASSIS-ID,Chassis Id,Config,Text,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_chassis_id as inventory/info event or state metric with bounded 
labels,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,chassis_id,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1704,NVSWITCH-RAW-ERR-LANE-2,RAW-ERR-LANE-2,Raw errors lane 2,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_2 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Raw_Errors_Lane_2,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1705,NVSWITCH-RAW-ERR-LANE-3,RAW-ERR-LANE-3,Raw errors lane 3,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_raw_err_lane_3 MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,Raw_Errors_Lane_3,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review 
-1706,NVSWITCH-RQ-NUM-WRFE,RQ-NUM-WRFE,RQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_wrfe MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rq_num_wrfe,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1707,NVSWITCH-RQ-NUM-LLE,RQ-NUM-LLE,RQ num LLE,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_rq_num_lle MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,rq_num_lle,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -1708,NVSWITCH-SQ-NUM-WRFE,SQ-NUM-WRFE,SQ num wrfe,Link-Quality,Integer,GB200 NVL NvswitchTray,Available OOB,NMX-T,NMX-T,,NMX-T,one canonical series unless source-qualified duplicate is justified,extend NmxtCollector mapping,nvswitch_sq_num_wrfe MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,NA,sq_num_wrfe,NA,,required 
before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2293,NVSWITCH-CABLE-OPER-STATUS,CABLE-OPER-STATUS,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_cable_oper_status MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/transceiver-diag/state/module-oper-status,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2294,NVSWITCH-CABLE-SNR-MEDIA-LANE-N,CABLE-SNR-MEDIA-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,SOURCE UNLISTED live source resolution,,No catalog source listed for GB200 row,source-resolution required before live signoff,live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted,nvswitch_cable_snr_media_lane_n MetricSample,source_resolution_required,requires-live-source-resolution,"Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.",,NA,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2295,NVSWITCH-CABLE-SNR-HOST-LANE-N,CABLE-SNR-HOST-LANE-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,,SOURCE UNLISTED live source resolution,,No catalog source listed for 
GB200 row,source-resolution required before live signoff,live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted,nvswitch_cable_snr_host_lane_n MetricSample,source_resolution_required,requires-live-source-resolution,"Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.",,NA,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2296,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_low_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-lower,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2297,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_low_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE 
gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-lower,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2298,NVSWITCH-NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_rx_power_lane_high_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/input-power-upper,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review -2299,NVSWITCH-NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,,Link-Quality,Float,GB200 NVL NvswitchTray,Needs Review,NVOS gNMI,NVOS gNMI,,NVOS gNMI,one canonical series unless source-qualified duplicate is justified,extend NvueGnmiCollector sample paths/processors,nvswitch_nvswitch_cable_tx_power_lane_high_n MetricSample,covered_generic_infra_unvalidated,covered-by-generic-infra-requires-live-validation,"GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device 
path/name.",,/components/component[name]/transceiver/thresholds/threshold[severity=CRITICAL]/state/output-power-upper,NA,NA,,required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation,validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review +catalog_row,metric_param_name,corrected_primary_source,final_status,disposition,match_detail +763,NET-FW-VER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +764,OS-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" +765,OS-KERNEL,source-equivalent required; no new CLI collector by default,ABSENT-BLOCKER,blocker,no live token match (CLI-only) +766,EROT-FW-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" +767,BMC-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" +794,LINK-DOWNED-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +795,PORT-MALFORMED-PACKET-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +796,PORT-NEIGHBOR-MTU-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +797,PORT-RCV-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +798,PORT-XMIT-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +799,PORT-RCV-REMOTE-PHYSICAL-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +800,PORT-RCV-SWITCH-RELAY-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +801,QP1Dropped,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +802,VL15-DROPPED,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +804,SERIAL,NMX-T explicit 
allowlist,PRESENT,implemented,nmxt label dim +806,NODE-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +807,PORT-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +834,@pshima@nvidia.com should be called PORT-PHYSICAL-STATE -Ziv Hillel IL NVLINK-STATUS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +846,LINK-ERROR-RECOVERY-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +847,PORT-MULTICAST-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +848,PORT-MULTICAST-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +849,PORT-RCV-DATA,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +850,PORT-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +851,PORT-UNICAST-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +852,PORT-UNICAST-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +853,PORT-XMIT-DATA,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +854,PORT-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +855,PORT-XMIT-WAIT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +862,CONTACT,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +863,LOCATION,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +864,NODE-DESCRIPTION,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +865,LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +866,PORT-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +867,PORT-LABEL,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +868,REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +869,DEVICE-HARDWARE-REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +870,CPU_CORE_NUMBER,source-equivalent 
required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['core-to-phy-link-width-enabled', 'core-to-phy-link-proto-enabled'] nmxt~['Port_Number', 'sw_serial_number']" +872,ASIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +875,ASIC-TEMP-CURRENT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +876,ASIC-NAME,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +879,AMBIENT-MNG-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +880,AMBIENT-MNG-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +881,CPU_PACK_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" +882,CPU_PACK_TEMP_MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" +883,CPU_PACK_TEMP_STATE,source-equivalent required; no new CLI collector by 
default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" +884,CPU_PACK_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" +885,CPU-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +886,MEM-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +887,MEM-TOTAL-SIZE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +888,DISK-TOTAL-SIZE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +889,DISK-USED,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +890,SODIMM_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +891,SODIMM_TEMP_MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +892,SODIMM_TEMP_STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +893,SODIMM_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +894,MAX-SPEED,BMC Redfish live resource only,IMPLEMENTED,implemented,nvue rest /platform/environment/fan .max-speed +897,PORT-LOGICAL-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +898,FEC-MODE-ACTIVE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +899,RAW-BER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +900,EFFECTIVE-BER,NVOS gNMI explicit 
allowlist,PRESENT,implemented,gnmi exact +901,SYMBOL-BER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +902,ZERO-HIST,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +903,PHY-RAW-ERRORS-LANE0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +904,PHY-RAW-ERRORS-LANE1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +905,RAW-BER-LANE0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +906,RAW-BER-LANE1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +907,PHY-EFFECTIVE-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +908,PHY-SYMBOL-ERRORS,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +909,TIME-SINCE-LASTS-CLEAR,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /interfaces/interface/phy-diag/state/time-since-last-clear-min +910,DEVICE-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +911,FEC-HIST-0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +912,FEC-HIST-1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +913,FEC-HIST-2,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +914,FEC-HIST-3,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +915,FEC-HIST-4,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +916,FEC-HIST-5,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +917,FEC-HIST-6,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +918,FEC-HIST-7,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +919,FEC-HIST-8,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +920,FEC-HIST-9,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +921,FEC-HIST-10,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +922,FEC-HIST-11,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +923,FEC-HIST-12,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +924,FEC-HIST-13,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 
+925,FEC-HIST-14,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +926,FEC-HIST-15,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +931,PLR-CODES-LOSS,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: HiRetransmissionRate +932,PORT-BUFFER-OVERRUN-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +933,LINK-SPEED-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +934,PLR-RCV-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +935,PLR-RCV-CODES-ERR,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +936,PLR-RCV-UNCORRECTABLES-CODE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +937,PLR-XMIT-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +938,PLR-XMIT-RETRYS-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +939,PLR-XMIT-RETRYS-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +940,PLR-SYNC-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +941,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +942,PLR-BW-LOSS-PERCENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['plr-xmit-retry-events', 'plr-rcv-code-err'] nmxt~['PlrRcvUncorrectableCode', 'PlrXmitRetryEvents']" +943,RQ-GENERAL-ERROR,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +944,TIME-TO-LINKS-UP,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +945,STATUS-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +946,STATUS-MESSAGE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +947,DOWN-BLAME,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +948,LOCAL-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +949,REMOTE-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +950,PHY-RECEIVED-BITS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 
+951,PORT-RCV-CONSTRAINT-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +952,PORT-XMIT-CONSTRAINTS-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +953,PORT-LOCAL-PHYSICAL-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +954,SYNC-HEADER-ERROR-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +955,PORT-DLID-MAPPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +956,LOCAL-LINK-INTEGRITY-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +957,PORT-VL-MAPPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +958,PORT-LOOPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +959,PORT-INACTIVE-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +960,LINK-WIDTH-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +962,MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +963,MAX-SUPPORTED-MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +964,SUPPORTED-WIDTH,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +965,VL-CAPABILITIES,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +966,FAN-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +967,FAN-LED,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['pd-link-speed-enabled', 'phy-hst-link-speed-enabled'] nmxt~[]" +968,CABLE-PART-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +969,CABLE-SERIAL-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +970,CABLE-TRANSMITTER-TECHNOLOGY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +971,CABLE-TYPE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +972,CABLE-VENDOR,NMX-T 
explicit allowlist,PRESENT,implemented,nmxt label dim +973,CABLE-LENGTH,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +974,CABLE-IDENTIFIER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +975,CABLE-REV,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +976,CABLE-FW-VERSION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +977,CABLE-RX-POWER-LANE0,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +978,CABLE-RX-POWER-LANE1,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +979,CABLE-DIAG-SUPPLY-VOLTAGE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +980,CABLE-TEMP,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +981,CABLE-TEMP-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag +982,CABLE-VOLTAGE-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag +983,CABLE-TX-CDR-LOL,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/tx-cdr-lol +984,CABLE-RX-CDR-LOL,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/rx-cdr-lol +985,CABLE-TX-LOS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/tx-los +986,CABLE-RX-LOS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/rx-los +987,LINK-PARTNER-DESCRIPTION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +988,LINK-PARTNER-NODE-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +989,LINK-PARTNER-LID,NMX-T explicit 
allowlist,PRESENT,implemented,nmxt family +990,LINK-PARTNER-PORT-NUM,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1174,CPU-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact +1241,DRIVE-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1242,DRIVE-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1243,DRIVE-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1244,DRIVE-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1245,HSC-VINDC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1246,HSC-VINDC-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1247,HSC-VINDC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1248,HSC-VINDC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1249,PDB-CONV-TEMP-CRITICAL,source-equivalent 
required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1251,PDB-CONV-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1252,PDB-CONV-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1253,PMIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1255,PMIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1256,PMIC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1259,SWB-ASIC-PCB-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1260,SWB-ASIC-PCB-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1688,LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1689,TOTAL-LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1690,TIME-SINCE-LAST-RECOVERY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 
+1691,TIME-BTWN-TWO-RECOVERIES,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1692,RECOVERY-ATTEMPTS-L1-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1693,RECOVERY-ATTEMPTS-L2-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1694,RECOVERY-CYCLE-DURATION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1695,SERDES-RECOVERY-CYCLE-DURATION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1696,CONTAIN-DRAIN-XMIT-DISCARD,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1697,CONTAIN-DRAIN-RCV-DISCARD,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1698,DEVICE-NUM,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1699,BOARD-TYPE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1700,CHASSIS-SLOT-IDX,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1701,TRAY-IDX,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1702,TOPOLOGY-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1703,CHASSIS-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +1704,RAW-ERR-LANE-2,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1705,RAW-ERR-LANE-3,NMX-T explicit allowlist,PRESENT,implemented,nmxt family +1706,RQ-NUM-WRFE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: rq_num_wrfe +1707,RQ-NUM-LLE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: rq_num_lle +1708,SQ-NUM-WRFE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: sq_num_wrfe +2293,CABLE-OPER-STATUS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/transceiver-diag/state/module-oper-status +2294,CABLE-SNR-MEDIA-LANE-N,live source resolution required,RESOLVED-LIVE,implemented,"gnmi~['cable-proto-cap-ext'] nmxt~['tx_power_lane_3', 'rx_power_lane_5']" +2295,CABLE-SNR-HOST-LANE-N,live source resolution required,RESOLVED-LIVE,implemented,"gnmi~['cable-proto-cap-ext', 'hostname'] 
nmxt~['tx_power_lane_3', 'rx_power_lane_5']" +2296,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-lower +2297,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/output-power-lower +2298,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-upper +2299,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/output-power-upper diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index f27fe9ec94..28db12522b 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -1,62 +1,75 @@ # NVSWITCH telemetry GB200 source matrix -Generated from sanitized Telemetry Catalog extraction artifacts for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`: - -- `Applicable for GB200 NVL HMC` -- `Applicable for GB200 NVL BMC` -- `Applicable for GB200 NVL NvswitchTray` +Generated from Stage 0 live-probe results (`nvswitch-stage0-live-coverage-20260620.md`) via +`catalog-coverage-final.csv`. Supersedes the pre-live-validation matrix generated from the raw +catalog extraction. CSV matrix: `docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv` +Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_status`, +`disposition`, `match_detail`. 
+ ## Counts - Total GB200-applicable NVSWITCH rows: 193 -### Implementation status - -- already-covered-regression-required: 5 -- covered-by-generic-infra-requires-live-validation: 150 -- requires-live-source-equivalent: 36 -- requires-live-source-resolution: 2 - -### Branch coverage status - -- covered_generic_infra_unvalidated: 150 -- covered_host_gnmi: 4 -- covered_host_nmxt: 1 -- source_equivalent_required: 36 -- source_resolution_required: 2 - -### Primary source - -- NMX-T: 57 -- NVOS CLI: 36 -- NVOS gNMI: 97 -- Redfish Fabric/Switch/Port: 1 -- SOURCE UNLISTED live source resolution: 2 - -## GB200 branch implementation coverage - -The `nvswitch_telemetry_gaps` branch implements common GB+VR-friendly collector infrastructure for the GB200 phase: - -- Redfish BMC: enabled `nv-redfish` `telemetry-service`, added a switch-BMC-only TelemetryService collector, and emits every numeric/boolean/string `MetricReport` value as `redfish_telemetry_service` samples with report and source-property labels. -- BMC proxy: widened TelemetryService ACLs to `MetricReportDefinitions/*` and `MetricReports/*` so live GB200 validation is not limited to `NvidiaNMMetrics_0`. -- NMX-T HOST: preserves all numeric Prometheus samples instead of dropping unknown metric names; legacy `Effective_BER`, `Symbol_Errors`, and `Link_Down` metric names remain canonical. -- NVUE gNMI HOST: subscribes to `components`, `interfaces`, and `platform-general`; known current metrics keep their existing names, and previously unmapped leaves are emitted as source-qualified `nvswitch_*` samples. -- Config: `collectors.telemetry_service` is disabled by default, and `collectors.nvue.gnmi.paths.platform_general_enabled` is an explicit opt-in path gate; the example and live-validation configs enable the full GB200 switch collector set. 
- -The generic-preservation surfaces are behavior-locked by unit tests before live hardware validation: - -- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples` covers numeric, string/info, and boolean/state MetricReport values. -- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels` cover stable key identity for unknown Prometheus samples with extra labels. -- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric` cover previously unmapped interface leaves and platform-general string leaves. - -Rows that still have no catalog-listed source remain in scope: `CABLE-SNR-MEDIA-LANE-N` and `CABLE-SNR-HOST-LANE-N` are marked `requires-live-source-resolution` and must be checked during live validation. The generic Redfish MetricReport, NMX-T, and gNMI preservation paths will expose them if the device emits them; if not, open a source-owner follow-up immediately. - -## Execution rules - -- Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete. -- Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale. -- Generic-preserved metrics must keep bounded identity labels: report id/URI/definition and metric id/property/identity for Redfish MetricReports, raw source metric plus sorted source-label identity for NMX-T, and full gNMI path plus endpoint/entity labels for gNMI. Redfish internal keys must use escaped raw MetricId/MetricProperty identity, and NMX-T generic keys must escape raw port/source/node/label identity, to avoid aliasing. Raw string metric values must not be emitted as labels. 
-- Rows marked `requires-live-source-resolution` or `requires-live-source-equivalent` remain in scope; they require live source proof or immediate escalation before GB200 signoff. -- Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed. +### Disposition (post-live-probe) + +| Disposition | Count | Meaning | +|----------------|-------|-----------------------------------------------------------------------| +| implemented | 149 | PRESENT/RESOLVED-LIVE allowlist hit, or IMPLEMENTED (MAX-SPEED via NVUE REST) | +| blocker | 44 | ABSENT-BLOCKER (leaf not live), BLOCKER-THRESHOLD (config-only), or BLOCKER-STRING (string-valued) | + +### final_status breakdown + +| final_status | Count | +|-------------------|-------| +| PRESENT | 132 | +| RESOLVED-LIVE | 16 | +| IMPLEMENTED | 1 | +| ABSENT-BLOCKER | 17 | +| BLOCKER-THRESHOLD | 21 | +| BLOCKER-STRING | 6 | + +## Blocker escalations + +See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the +full annotated list of 44 rows, grouped by root cause, with resolution path and re-probe +conditions. + +## Notes on implemented rows + +- **PRESENT** rows have an explicit gNMI or NMX-T allowlist mapping confirmed live by the Stage 0 + probe. No further work required before merge. +- **RESOLVED-LIVE** rows have no direct catalog-listed source but a live token match was found in + gNMI or NMX-T output. Match tokens are recorded in `match_detail`. These are accepted as + covered; if live validation on a production rig disputes a mapping, re-escalate immediately. +- **IMPLEMENTED — MAX-SPEED (row 894):** sourced from NVUE REST `/nvue_v1/platform/environment/fan` + `.max-speed` (not Redfish — confirmed live). The 4 `/platform-general` memory/disk rows + (`886/887/888/889`) are PRESENT via the new gNMI `platform-general` subscribe path. + +## Notes on blocker rows + +No row is marked "deferred." 
Every blocker has an explicit escalation disposition: + +- **BLOCKER-THRESHOLD (21 rows):** The catalog entry represents a threshold/limit/alarm-state + value, not a streamed telemetry counter. These are configuration parameters unavailable as live + gNMI leaves. Source owner must confirm whether a future gNMI path or Redfish sensor threshold + can expose them; until confirmed they cannot be implemented without a new data source. +- **BLOCKER-STRING (6 rows):** string-valued catalog rows with no numeric encoding — `CONTACT`, + `LOCATION`, `NODE-DESCRIPTION` (platform), `ASIC-NAME`, `PHY-MANAGER-STATE`, `VL-CAPABILITIES`. + Present live but cannot be emitted as numeric metrics; need a string/label export path (tracked + as #11), or enum-coding for the FSM-style ones. Not silently dropped — escalated. +- **ABSENT-BLOCKER — cable/transceiver alarm leaves (9 rows: 981-986, 2293, 2296-2299):** gNMI + leaves exist in the schema but returned no data on the test rig. Likely empty due to an uncabled + switch. Re-probe on a cabled switch before treating as a permanent blocker. +- **ABSENT-BLOCKER — TIME-SINCE-LASTS-CLEAR (row 909):** gNMI leaf + `/interfaces/interface/phy-diag/state/time-since-last-clear-min` not live. Escalate to NVOS + gNMI owner for NVOS version confirmation. +- **ABSENT-BLOCKER — PLR-CODES-LOSS (row 931):** NMX-T field `HiRetransmissionRate` not live. + Escalate to NMX-T owner. +- **ABSENT-BLOCKER — NMX-T RDMA queue counters (rows 1706-1708):** RQ-NUM-WRFE, RQ-NUM-LLE, + SQ-NUM-WRFE — NMX-T fields `rq_num_wrfe`, `rq_num_lle`, `sq_num_wrfe` not live. Escalate to + NMX-T/RDMA owner. +- **ABSENT-BLOCKER — OS-KERNEL (row 765):** CLI-only, no gNMI or NMX-T token match. Requires a + new CLI collector or NVOS gNMI exposure; escalate to NVOS owner. 
diff --git a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md index 2b651d16a9..96890c3164 100644 --- a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md +++ b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md @@ -1,12 +1,22 @@ # NVSWITCH telemetry nv-redfish dependency notes +> **Superseded (2026-06-18).** The standalone Redfish `TelemetryService` MetricReports +> collector described below was **removed**: live GB200 BMC probes show +> `/redfish/v1/TelemetryService` MetricReports are absent/404, `SwitchMetrics` are +> empty, histograms are empty, and `Ports` are absent. The corrected direction uses +> explicit, catalog-row allowlist mappings over the live BMC sensor/thermal surface +> and the live host NVOS gNMI / NMX-T surfaces. This file is retained for the +> nv-redfish dependency history only; the `telemetry-service` feature, the +> `[collectors.telemetry_service]` config, and the collector itself are no longer +> present in this branch. + Generated during the GB200 NVSWITCH telemetry branch setup. ## Current infra-controller dependency state - `Cargo.toml` pins `nv-redfish = { version = "0.10.0" }`. - `Cargo.lock` resolves `nv-redfish`, `nv-redfish-bmc-http`, `nv-redfish-core`, `nv-redfish-schema`, and `nv-redfish-csdl-compiler` to `0.10.0` from crates.io. -- This branch enables `telemetry-service` in `crates/health/Cargo.toml` for the new Redfish TelemetryService collector. +- ~~This branch enables `telemetry-service` in `crates/health/Cargo.toml` for the new Redfish TelemetryService collector.~~ (Reverted: the `telemetry-service` feature and collector were removed; see the superseded banner above.) - The GB200 branch has a local `nv-redfish` worktree available for companion development only: - `${NV_REDFISH_WORKTREE}` - Branch: `nvswitch_telemetry_gaps` @@ -23,7 +33,7 @@ Generated during the GB200 NVSWITCH telemetry branch setup. 
## Dependency conclusion -TelemetryService MetricReports can be wired in infra-controller by enabling `telemetry-service` and consuming the typed `TelemetryService` APIs already available in nv-redfish 0.10.x. +Historical note: TelemetryService MetricReports *could* in principle be wired in infra-controller by enabling `telemetry-service` and consuming the typed `TelemetryService` APIs available in nv-redfish 0.10.x. This was attempted and then **reverted** — live GB200 BMC exposes no usable MetricReports, so no TelemetryService collector is wired in this branch. Redfish Fabric/Switch/Port support needs companion `nv-redfish` work if GB200 live hardware or the catalog requires those paths. The companion work should add standard DMTF schema XMLs and feature entries for Fabric, Switch, Port, SwitchMetrics, PortMetrics, Endpoint, and Zone families, plus ergonomic ServiceRoot/Fabric/Switch navigation wrappers and mock tests. @@ -57,6 +67,6 @@ Do not commit local absolute path dependencies. Before final review, use one of ## Branch implementation update -The GB200 branch consumes the typed TelemetryService API already present in `nv-redfish` 0.10.0 (`ServiceRoot::telemetry_service()`, `TelemetryService::metric_report_links()`, and `MetricReportLink::fetch()`). No local `nv-redfish` path dependency is committed. +~~The GB200 branch consumes the typed TelemetryService API already present in `nv-redfish` 0.10.0 (`ServiceRoot::telemetry_service()`, `TelemetryService::metric_report_links()`, and `MetricReportLink::fetch()`).~~ **Reverted.** The branch no longer consumes the TelemetryService API; the collector was removed after live GB200 probes returned no MetricReports. No local `nv-redfish` path dependency is committed. -Direct Fabric/Switch/Port wrappers are still absent from `nv-redfish` 0.10.x and `origin/main` as inspected. 
The GB200 branch therefore uses Redfish TelemetryService MetricReports for BMC-side switch telemetry now, while keeping the local companion worktree available if live GB200 evidence proves that a required metric is only available from Fabric/Switch/Port resources and not from MetricReports, NMX-T, or gNMI. +Direct Fabric/Switch/Port wrappers are still absent from `nv-redfish` 0.10.x and `origin/main` as inspected. BMC-side switch telemetry is now sourced from the live BMC sensor/thermal surface (not TelemetryService MetricReports), with the local companion worktree kept available if live GB200 evidence later proves a required metric is only available from Fabric/Switch/Port resources and not from the BMC sensor surface, NMX-T, or gNMI. diff --git a/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml b/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml index 34d2c95fee..6bf66e685e 100644 --- a/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml +++ b/helm/charts/nico-bmc-proxy/files/carbide-bmc-proxy.toml @@ -70,9 +70,7 @@ additional_issuer_cns = [] "GET /redfish/v1/UpdateService/FirmwareInventory/HGX_FW_BMC_0", "GET /redfish/v1/UpdateService/FirmwareInventory/HostBMC_0", "GET /redfish/v1/TelemetryService", - "GET /redfish/v1/TelemetryService/MetricReportDefinitions", - "GET /redfish/v1/TelemetryService/MetricReportDefinitions/*", "GET /redfish/v1/TelemetryService/MetricReports", - "GET /redfish/v1/TelemetryService/MetricReports/*", + "GET /redfish/v1/TelemetryService/MetricReports/NvidiaNMMetrics_0", "GET /redfish/v1/TaskService/Tasks/*", ] From bef554b9e2916de77d7eb360ab3c30ff12082b60 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 22 Jun 2026 22:06:36 -0400 Subject: [PATCH 06/25] feat(health): reclaim 4 NVSwitch cable fault rows via NMX-T Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 11 ++++ ...vswitch_telemetry_gb200_live_validation.md | 59 
+++++++++---------- .../nvswitch_telemetry_gb200_matrix.csv | 8 +-- .../health/nvswitch_telemetry_gb200_matrix.md | 20 ++++--- 4 files changed, 56 insertions(+), 42 deletions(-) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index d338f7c313..812681c952 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -88,6 +88,13 @@ const NMXT_METRIC_MAP: &[(&str, &str, &str)] = &[ // Raw error lanes ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), // row 1704 RAW-ERR-LANE-2 ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), // row 1705 RAW-ERR-LANE-3 + // Cable/transceiver fault flags (0/1). Re-sourced from NMX-T: NVLink ports on + // the N5400_LD are not modeled as gNMI transceiver components, so the catalog's + // gNMI transceiver-diag path is absent live; NMX-T exposes these per active link. + ("tx_cdr_lol", "cable_tx_cdr_lol", "state"), // row 983 CABLE-TX-CDR-LOL + ("rx_cdr_lol", "cable_rx_cdr_lol", "state"), // row 984 CABLE-RX-CDR-LOL + ("tx_los", "cable_tx_los", "state"), // row 985 CABLE-TX-LOS + ("rx_los", "cable_rx_los", "state"), // row 986 CABLE-RX-LOS ]; /// Explicit allowlist: live NMX-T Prometheus **label** key -> canonical label name. 
@@ -499,6 +506,10 @@ Link_Down{Port_Number="1"} 5 ("contain_n_drain_rcv_discards", "contain_drain_rcv_discard", "count"), ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), + ("tx_cdr_lol", "cable_tx_cdr_lol", "state"), + ("rx_cdr_lol", "cable_rx_cdr_lol", "state"), + ("tx_los", "cable_tx_los", "state"), + ("rx_los", "cable_rx_los", "state"), ]; for (source, metric_type, unit) in expected { diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index 4c18a94154..630e704bcc 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -149,7 +149,7 @@ Unit coverage that locks this behavior: ## Blocker escalations (Stage 0) Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. -44 rows are escalated below (21 config-threshold, 17 absent-from-live-probe, 6 string-valued). No +40 rows are escalated below (21 config-threshold, 13 absent-from-live-probe, 6 string-valued). No row is deferred — each has an explicit disposition and a named resolution path. ### Group A — Config-threshold rows (21 rows, BLOCKER-THRESHOLD) @@ -186,35 +186,34 @@ can expose these. Until confirmed, they are out-of-scope for this branch. | 1255 | PMIC-TEMP-STATE | | 1259 | SWB-ASIC-PCB-TEMP-STATE | -### Group B — Cable/transceiver alarm leaves (9 rows, ABSENT-BLOCKER) - -**CAVEAT: likely empty due to uncabled test rig.** These gNMI leaves exist in the NVOS schema -and are in the explicit allowlist, but returned no data during Stage 0 probing. The probe -switch had no cables attached, which is the most probable cause — transceiver alarm flags only -populate when a transceiver is inserted. - -**Resolution:** Re-probe on a cabled production switch before treating these as permanent -blockers. 
If leaves remain absent on a cabled switch, escalate to NVOS gNMI owner with the -exact NVOS version and transceiver module type. - -| Row | Metric | gNMI leaf (not live) | -|------|---------------------------------|----------------------------------------------------------------------------------------------------| -| 981 | CABLE-TEMP-ALARM | `/components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag` | -| 982 | CABLE-VOLTAGE-ALARM | `/components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag` | -| 983 | CABLE-TX-CDR-LOL | `/components/component/transceiver/physical-channels/channel/channel-diag/tx-cdr-lol` | -| 984 | CABLE-RX-CDR-LOL | `/components/component/transceiver/physical-channels/channel/channel-diag/rx-cdr-lol` | -| 985 | CABLE-TX-LOS | `/components/component/transceiver/physical-channels/channel/channel-diag/tx-los` | -| 986 | CABLE-RX-LOS | `/components/component/transceiver/physical-channels/channel/channel-diag/rx-los` | -| 2293 | CABLE-OPER-STATUS | `/components/component/transceiver/transceiver-diag/state/module-oper-status` | -| 2296 | NVSWITCH-CABLE-RX-POWER-LANE-LOW-N | `/components/component/transceiver/thresholds/threshold/state/input-power-lower` | -| 2297 | NVSWITCH-CABLE-TX-POWER-LANE-LOW-N | `/components/component/transceiver/thresholds/threshold/state/output-power-lower` | -| 2298 | NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N | `/components/component/transceiver/thresholds/threshold/state/input-power-upper` | -| 2299 | NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N | `/components/component/transceiver/thresholds/threshold/state/output-power-upper` | - -Note: rows 2296–2299 are four rows, bringing Group B to 11 entries — the "9 cable/transceiver -alarm leaves" figure from the plan refers to the 9 alarm/status leaves (981–986, 2293); the -4 power threshold rows (2296–2299) overlap in root cause and are included here as they share -the same uncabled-rig caveat and re-probe condition. 
+### Group B — Cable/transceiver leaves (7 rows, ABSENT-BLOCKER) + +**Root cause (NOT an uncabled rig).** The N5400_LD NVLink switch enumerates **no gNMI transceiver +components** — the live component tree has only `ASIC`/`CPU`/`FAN`/`SWITCH` types and no +`/components/component/transceiver/*` subtree, even though 64+ ports are active NDR/XDR backplane +links (re-probed live 2026-06-23). The catalog mapped these rows to an openconfig transceiver-diag +path this platform does not expose; NVLink backplane cables are not modeled as openconfig +transceivers. + +**Re-sourced to NMX-T (now implemented):** 4 fault-flag rows have live NMX-T families (value 0 = no +fault on the active links) and were moved into `NMXT_METRIC_MAP` — 983 CABLE-TX-CDR-LOL +(`tx_cdr_lol`), 984 CABLE-RX-CDR-LOL (`rx_cdr_lol`), 985 CABLE-TX-LOS (`tx_los`), 986 CABLE-RX-LOS +(`rx_los`). They are no longer blockers. + +**Resolution (remaining 7):** no NMX-T or gNMI source exists for the alarm/threshold/oper-status +rows below. Escalate to the NVOS gNMI / NMX-T owner: is there any source (gNMI/NMX-T/Redfish/CLI) +for NVLink cable optical alarms, module oper-status, and per-lane power thresholds on N5400_LD, or +are these rows N/A for NVLink backplane switches? 
+ +| Row | Metric | Catalog source (absent live) | +|------|-------------------------------------|----------------------------------------------------------------| +| 981 | CABLE-TEMP-ALARM | gNMI transceiver `temp-high-alarm-flag` (no transceiver component) | +| 982 | CABLE-VOLTAGE-ALARM | gNMI transceiver `vcc-high-alarm-flag` (no transceiver component) | +| 2293 | CABLE-OPER-STATUS | gNMI transceiver `module-oper-status` (no transceiver component) | +| 2296 | NVSWITCH-CABLE-RX-POWER-LANE-LOW-N | gNMI transceiver thresholds `input-power-lower` (absent) | +| 2297 | NVSWITCH-CABLE-TX-POWER-LANE-LOW-N | gNMI transceiver thresholds `output-power-lower` (absent) | +| 2298 | NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N | gNMI transceiver thresholds `input-power-upper` (absent) | +| 2299 | NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N | gNMI transceiver thresholds `output-power-upper` (absent) | ### Group C — NMX-T RDMA queue counters (3 rows, ABSENT-BLOCKER) diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 93dce6af3a..23e2ea2823 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -139,10 +139,10 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 980,CABLE-TEMP,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 981,CABLE-TEMP-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag 982,CABLE-VOLTAGE-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag -983,CABLE-TX-CDR-LOL,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/tx-cdr-lol 
-984,CABLE-RX-CDR-LOL,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/rx-cdr-lol -985,CABLE-TX-LOS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/tx-los -986,CABLE-RX-LOS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/channel/channel-diag/rx-los +983,CABLE-TX-CDR-LOL,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) +984,CABLE-RX-CDR-LOL,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) +985,CABLE-TX-LOS,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) +986,CABLE-RX-LOS,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) 987,LINK-PARTNER-DESCRIPTION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 988,LINK-PARTNER-NODE-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 989,LINK-PARTNER-LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index 28db12522b..b94762f042 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -17,24 +17,24 @@ Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_ | Disposition | Count | Meaning | |----------------|-------|-----------------------------------------------------------------------| -| implemented | 149 | PRESENT/RESOLVED-LIVE allowlist hit, or IMPLEMENTED (MAX-SPEED via NVUE REST) | -| blocker | 44 | 
ABSENT-BLOCKER (leaf not live), BLOCKER-THRESHOLD (config-only), or BLOCKER-STRING (string-valued) | +| implemented | 153 | PRESENT/RESOLVED-LIVE allowlist hit, or IMPLEMENTED (MAX-SPEED via NVUE REST) | +| blocker | 40 | ABSENT-BLOCKER (leaf not live), BLOCKER-THRESHOLD (config-only), or BLOCKER-STRING (string-valued) | ### final_status breakdown | final_status | Count | |-------------------|-------| -| PRESENT | 132 | +| PRESENT | 136 | | RESOLVED-LIVE | 16 | | IMPLEMENTED | 1 | -| ABSENT-BLOCKER | 17 | +| ABSENT-BLOCKER | 13 | | BLOCKER-THRESHOLD | 21 | | BLOCKER-STRING | 6 | ## Blocker escalations See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the -full annotated list of 44 rows, grouped by root cause, with resolution path and re-probe +full annotated list of 40 rows, grouped by root cause, with resolution path and re-probe conditions. ## Notes on implemented rows @@ -60,9 +60,13 @@ No row is marked "deferred." Every blocker has an explicit escalation dispositio `LOCATION`, `NODE-DESCRIPTION` (platform), `ASIC-NAME`, `PHY-MANAGER-STATE`, `VL-CAPABILITIES`. Present live but cannot be emitted as numeric metrics; need a string/label export path (tracked as #11), or enum-coding for the FSM-style ones. Not silently dropped — escalated. -- **ABSENT-BLOCKER — cable/transceiver alarm leaves (9 rows: 981-986, 2293, 2296-2299):** gNMI - leaves exist in the schema but returned no data on the test rig. Likely empty due to an uncabled - switch. Re-probe on a cabled switch before treating as a permanent blocker. +- **ABSENT-BLOCKER — cable/transceiver leaves (7 rows: 981, 982, 2293, 2296-2299):** the catalog's + gNMI transceiver-diag path is absent live — the N5400_LD NVLink switch enumerates **no gNMI + transceiver components** (confirmed live; 64+ active backplane links, so *not* an uncabled rig). 
+ The 4 fault-flag rows (983-986: CABLE-TX/RX-CDR-LOL, CABLE-TX/RX-LOS) were **re-sourced to NMX-T** + (live flag families) and are now implemented. The remaining 7 (temp/vcc alarm flags, module + oper-status, RX/TX power-lane LOW/HIGH thresholds) have no NMX-T or gNMI source; escalate to the + NVOS gNMI / NMX-T owner re: NVLink cable optical telemetry. - **ABSENT-BLOCKER — TIME-SINCE-LASTS-CLEAR (row 909):** gNMI leaf `/interfaces/interface/phy-diag/state/time-since-last-clear-min` not live. Escalate to NVOS gNMI owner for NVOS version confirmation. From 7b01512966414cc1666d8050ebcb9c9ce01e2154 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 22 Jun 2026 22:21:39 -0400 Subject: [PATCH 07/25] feat(health): implement 6 string-valued NVSwitch catalog rows Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- .../collectors/nvue/gnmi/sample_processor.rs | 367 +++++++++++++++++- ...vswitch_telemetry_gb200_live_validation.md | 26 +- .../nvswitch_telemetry_gb200_matrix.csv | 12 +- .../health/nvswitch_telemetry_gb200_matrix.md | 9 +- 4 files changed, 369 insertions(+), 45 deletions(-) diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index e24d9be779..8540e3a700 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -152,6 +152,24 @@ impl GnmiSampleProcessor { Some(v) => self.emit_iface("interface_supported_width", iface_name, v, "lanes"), None => debug_unmapped_value(elems, val, "interface_supported_width"), } + } else if leaf_matches(elems, &["phy-diag", "state", "phy-manager-state"]) { + // PHY-MANAGER-STATE (row 961): a dynamic PHY FSM string. Enum-code it + // rather than carry it as an info label (the value changes over time). 
+ let v = phy_manager_state_to_f64(typed_value_to_string(val).as_deref()); + self.emit_iface("interface_phy_manager_state", iface_name, v, "state"); + } else if leaf_matches(elems, &["infiniband", "state", "vl-capabilities"]) { + // VL-CAPABILITIES (row 965): a stable capability string (e.g. + // "VL0-VL7"). Surface it as an info-metric: a constant 1.0 sample + // whose information lives in the `vl_capabilities` label. Empty + // strings carry no information and emit nothing. + if let Some(caps) = typed_value_to_string(val).filter(|s| !s.is_empty()) { + self.emit_iface_info( + "interface_vl_capabilities_info", + iface_name, + "vl_capabilities", + &caps, + ); + } } } @@ -167,6 +185,42 @@ impl GnmiSampleProcessor { ); } + /// emit a per-interface info-metric: a constant `1.0` sample whose + /// information is carried by an extra string label alongside the + /// `interface_name` label. Used for stable interface capability strings. + fn emit_iface_info( + &self, + metric_type: &str, + iface_name: &str, + info_label_name: &'static str, + info_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + let mut key = String::with_capacity(metric_type.len() + 1 + iface_name.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(iface_name); + + let labels = vec![ + (Cow::Borrowed("interface_name"), iface_name.to_string()), + (Cow::Borrowed(info_label_name), info_label_value.to_string()), + ]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "info".to_string(), + value: 1.0, + labels, + context: None, + })), + ); + } + fn process_component_metric( &self, elems: &[&PathElem], @@ -197,6 +251,9 @@ impl GnmiSampleProcessor { { self.emit_comp("component_cpu_utilization", comp_name, v, "percent"); } + // ASIC-NAME (row 876): `state/name` is intentionally not emitted; the + // same value is already 
surfaced as the `component_name` label on every + // component metric, so a dedicated series would be redundant. } /// emit a `/components/component` canonical series keyed on `component_name` @@ -213,10 +270,32 @@ impl GnmiSampleProcessor { fn process_platform_general_metric(&self, elems: &[&PathElem], val: &proto::TypedValue) { // Explicit per-leaf canonical mappings for `/platform-general/state`. - // This is a switch-level singleton: only the four numeric memory/disk - // leaves proven live in the Stage-0 probe are mapped; every other - // platform-general leaf (contact, location, platform-name, ...) falls - // through and is never exported. + // This is a switch-level singleton: the four numeric memory/disk leaves + // are numeric gauges; contact/location/platform-name are stable strings + // surfaced as switch-level info-metrics. Every other platform-general + // leaf falls through and is never exported. + // + // String info-metrics first (CONTACT 862, LOCATION 863, + // NODE-DESCRIPTION 864): each emits a constant 1.0 sample whose + // information is carried by a single string label. Empty strings carry + // no information and emit nothing (CONTACT/LOCATION are empty on the + // GB200 rig, so only NODE-DESCRIPTION emits live). 
+ let info: Option<(&str, &'static str)> = if leaf_matches(elems, &["state", "contact"]) { + Some(("platform_contact_info", "contact")) + } else if leaf_matches(elems, &["state", "location"]) { + Some(("platform_location_info", "location")) + } else if leaf_matches(elems, &["state", "platform-name"]) { + Some(("platform_node_description_info", "node_description")) + } else { + None + }; + if let Some((metric_type, info_label_name)) = info { + if let Some(s) = typed_value_to_string(val).filter(|s| !s.is_empty()) { + self.emit_switch_info(metric_type, info_label_name, &s); + } + return; + } + let metric_type = if leaf_matches(elems, &["state", "memory-used"]) { "platform_memory_used" } else if leaf_matches(elems, &["state", "memory-total-size"]) { @@ -255,6 +334,36 @@ impl GnmiSampleProcessor { ); } + /// emit a switch-level singleton info-metric: a constant `1.0` sample whose + /// information is carried by a single string label. Like `emit_switch`, + /// endpoint identity is added by PrometheusSink from EventContext. + fn emit_switch_info( + &self, + metric_type: &str, + info_label_name: &'static str, + info_label_value: &str, + ) { + let Some(sink) = &self.data_sink else { return }; + + let labels = vec![( + Cow::Borrowed(info_label_name), + info_label_value.to_string(), + )]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key: metric_type.to_string(), + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "info".to_string(), + value: 1.0, + labels, + context: None, + })), + ); + } + fn emit_data_metric( &self, metric_type: &str, @@ -656,6 +765,24 @@ fn physical_port_state_to_f64(state: Option<&str>) -> f64 { } } +/// PHY manager FSM state string -> numeric code. The PHY manager reports a +/// dynamic FSM label (e.g. "Active_or_Linkup", "Disabled"); a substring match +/// is used because the exact tokens vary. 
1.0 == the PHY is active or linked +/// up, 0.0 otherwise (including empty/None). Mirrors `physical_port_state_to_f64`. +fn phy_manager_state_to_f64(state: Option<&str>) -> f64 { + match state { + Some(s) => { + let lower = s.to_ascii_lowercase(); + if lower.contains("active") || lower.contains("linkup") { + 1.0 + } else { + 0.0 + } + } + None => 0.0, + } +} + /// InfiniBand logical port state enum -> numeric code. Values observed live on /// GB200: `ACTIVE`, `DOWN`. 1.0 == active. fn logical_port_state_to_f64(state: Option<&str>) -> f64 { @@ -1360,12 +1487,15 @@ mod tests { let CollectorEvent::Metric(sample) = event else { panic!("expected a Metric event"); }; - // shared producer invariants for every interface mapping + // shared producer invariants for every interface mapping. The + // `interface_name` label is always present as the first (entity) label; + // info-metrics may carry additional info labels after it, so assert the + // first label rather than the exact set. assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); assert_eq!( - sample.labels, - vec![(Cow::Borrowed("interface_name"), "acp0".to_string())] + sample.labels.first(), + Some(&(Cow::Borrowed("interface_name"), "acp0".to_string())) ); (*sample, ctx) } @@ -1683,6 +1813,97 @@ mod tests { } } + #[test] + fn test_phy_manager_state_to_f64_helper() { + // substring match, case-insensitive: active/linkup => 1.0 + assert_eq!(phy_manager_state_to_f64(Some("Active_or_Linkup")), 1.0); + assert_eq!(phy_manager_state_to_f64(Some("LINKUP")), 1.0); + assert_eq!(phy_manager_state_to_f64(Some("active")), 1.0); + // anything else => 0.0 + assert_eq!(phy_manager_state_to_f64(Some("Disabled")), 0.0); + assert_eq!(phy_manager_state_to_f64(Some("")), 0.0); + assert_eq!(phy_manager_state_to_f64(None), 0.0); + } + + #[test] + fn test_interface_phy_manager_state_enum() { + // PHY-MANAGER-STATE (row 961): dynamic FSM string enum-coded to 1/0. 
+ for (raw, expected) in [ + ("Active_or_Linkup", 1.0), + ("LINKUP", 1.0), + ("Disabled", 0.0), + ("", 0.0), + ] { + let (sample, _) = run_interface_leaf( + &["phy-diag", "state", "phy-manager-state"], + make_typed_value_string(raw), + ); + assert_eq!(sample.metric_type, "interface_phy_manager_state"); + assert_eq!(sample.unit, "state"); + assert_eq!(sample.value, expected, "phy-manager-state {raw:?}"); + } + } + + #[test] + fn test_interface_vl_capabilities_info() { + // VL-CAPABILITIES (row 965): non-empty string -> one info sample whose + // information is carried by the `vl_capabilities` label alongside + // `interface_name`. The shared invariant assert in `run_interface_leaf` + // only checks the first (interface_name) label, so assert the full set + // explicitly here. + let (sample, _) = run_interface_leaf( + &["infiniband", "state", "vl-capabilities"], + make_typed_value_string("VL0-VL7"), + ); + assert_eq!(sample.metric_type, "interface_vl_capabilities_info"); + assert_eq!(sample.unit, "info"); + assert_eq!(sample.value, 1.0); + assert_eq!( + sample.labels, + vec![ + (Cow::Borrowed("interface_name"), "acp0".to_string()), + (Cow::Borrowed("vl_capabilities"), "VL0-VL7".to_string()), + ] + ); + } + + #[test] + fn test_interface_vl_capabilities_empty_is_not_exported() { + // An empty vl-capabilities string carries no information and emits nothing. 
+ let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: Some(proto::Path { + elem: vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ], + ..Default::default() + }), + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("infiniband", &[]), + make_path_elem("state", &[]), + make_path_elem("vl-capabilities", &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(&notification); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty vl-capabilities must not emit a metric" + ); + } + #[test] fn test_interface_link_width_enum() { let (active, _) = run_interface_leaf( @@ -1745,8 +1966,9 @@ mod tests { #[test] fn test_unknown_interface_leaf_is_not_exported() { - // a live but unmapped leaf (phy-manager-state is flagged, not mapped) - // must never produce a MetricSample. + // a live but unmapped leaf (e.g. ip-address, which is not in any + // canonical mapping arm or the numeric table) must never produce a + // MetricSample.
let sink = Arc::new(CapturingSink::default()); let mut proc = test_processor(); proc.data_sink = Some(sink.clone()); @@ -1762,13 +1984,12 @@ mod tests { update: vec![proto::Update { path: Some(proto::Path { elem: vec![ - make_path_elem("phy-diag", &[]), make_path_elem("state", &[]), - make_path_elem("phy-manager-state", &[]), + make_path_elem("ip-address", &[]), ], ..Default::default() }), - val: Some(make_typed_value_string("SUBFSM_ACTIVE_E")), + val: Some(make_typed_value_string("10.0.0.1")), ..Default::default() }], ..Default::default() @@ -2034,9 +2255,10 @@ mod tests { } #[test] - fn test_platform_general_string_leaf_is_not_exported() { - // String leaves at the same level (contact, location, platform-name) - // are out of scope: they must fall through unmapped and emit nothing. + fn test_platform_general_unmapped_string_leaf_is_not_exported() { + // A platform-general string leaf that is not one of the mapped info + // leaves (contact/location/platform-name) must fall through and emit + // nothing, while still being counted as the platform-general entity. let sink = Arc::new(CapturingSink::default()); let mut proc = test_processor(); proc.data_sink = Some(sink.clone()); @@ -2048,7 +2270,7 @@ mod tests { elem: vec![ make_path_elem("platform-general", &[]), make_path_elem("state", &[]), - make_path_elem("platform-name", &[]), + make_path_elem("product-name", &[]), ], ..Default::default() }), @@ -2066,4 +2288,117 @@ mod tests { "unmapped platform-general string leaf must not emit a metric" ); } + + #[test] + fn test_platform_general_empty_info_string_is_not_exported() { + // CONTACT/LOCATION are empty on the GB200 rig; an empty info string + // carries no information and must emit nothing. 
+ for leaf in ["contact", "location", "platform-name"] { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: vec![ + make_path_elem("platform-general", &[]), + make_path_elem("state", &[]), + make_path_elem(leaf, &[]), + ], + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(&notification); + assert_eq!(count, 1, "platform-general entity is still counted for {leaf}"); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty info string must not emit a metric for {leaf}" + ); + } + } + + #[test] + fn test_platform_general_node_description_info() { + // NODE-DESCRIPTION (row 864): a non-empty platform-name emits a single + // switch-level info-metric carrying the raw string as `node_description`. + let sample = run_platform_general_leaf_info( + &["state", "platform-name"], + "x86_64-nvidia_n5400_ld-r0", + ); + assert_eq!(sample.metric_type, "platform_node_description_info"); + assert_eq!(sample.unit, "info"); + assert_eq!(sample.value, 1.0); + assert_eq!( + sample.labels, + vec![( + Cow::Borrowed("node_description"), + "x86_64-nvidia_n5400_ld-r0".to_string() + )] + ); + } + + #[test] + fn test_platform_general_contact_and_location_info() { + // CONTACT (862) / LOCATION (863): non-empty strings emit their info + // series with the matching single label.
+ for (leaf, metric_type, label, raw) in [ + ("contact", "platform_contact_info", "contact", "noc@example.com"), + ("location", "platform_location_info", "location", "rack-7"), + ] { + let sample = run_platform_general_leaf_info(&["state", leaf], raw); + assert_eq!(sample.metric_type, metric_type, "leaf {leaf}"); + assert_eq!(sample.unit, "info", "leaf {leaf}"); + assert_eq!(sample.value, 1.0, "leaf {leaf}"); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed(label), raw.to_string())], + "leaf {leaf}" + ); + } + } + + /// Drive a single `/platform-general/` string update and return the + /// one captured info `MetricSample`. Unlike `run_platform_general_leaf`, the + /// switch-level info series carries a single string label (no per-entity + /// name), so the empty-labels invariant does not apply. + fn run_platform_general_leaf_info(tail: &[&str], raw: &str) -> MetricSample { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(make_typed_value_string(raw)), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(&notification); + + let events = sink.events.lock().expect("lock poisoned"); + assert_eq!(events.len(), 1, "expected exactly one emitted metric"); + let (ctx, event) = events[0].clone(); + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + assert_eq!(sample.name, NVUE_GNMI_SAMPLE_STREAM_ID); + assert_eq!(ctx.collector_type, NVUE_GNMI_SAMPLE_STREAM_ID); + *sample + } } diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md
b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index 630e704bcc..a124395a95 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -149,7 +149,7 @@ Unit coverage that locks this behavior: ## Blocker escalations (Stage 0) Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. -40 rows are escalated below (21 config-threshold, 13 absent-from-live-probe, 6 string-valued). No +34 rows are escalated below (21 config-threshold, 13 absent-from-live-probe). No row is deferred — each has an explicit disposition and a named resolution path. ### Group A — Config-threshold rows (21 rows, BLOCKER-THRESHOLD) @@ -244,21 +244,11 @@ requires a specific counter-clear event to populate. scrape. This may be a naming discrepancy or a field absent in the installed NMX-T version. Escalate to NMX-T owner with the NMX-T version string from the test rig. -### Group E — String-valued rows (6 rows, BLOCKER-STRING) +### String-valued rows — RESOLVED (6 rows, now implemented) -These catalog rows are present live but carry string values with no numeric encoding, so they -cannot be emitted as numeric `MetricSample`s (the producer is numeric-only). They are not silently -dropped — they are escalated pending a string/label export path (or enum-coding for the FSM-style ones). - -| Row | Metric | Live source / value | -|-----|-------------------|-----------------------------------------------------------------------------| -| 862 | CONTACT | gNMI `/platform-general/state/contact` (empty on rig) | -| 863 | LOCATION | gNMI `/platform-general/state/location` (empty on rig) | -| 864 | NODE-DESCRIPTION | gNMI `/platform-general/state/platform-name` ("x86_64-nvidia_n5400_ld-r0") | -| 876 | ASIC-NAME | gNMI `/components/component/state/name` (e.g. 
"ASIC1") | -| 961 | PHY-MANAGER-STATE | gNMI `/interfaces/interface/phy-diag/state/phy-manager-state` (FSM enum) | -| 965 | VL-CAPABILITIES | gNMI `/interfaces/interface/infiniband/state/vl-capabilities` ("VL0-VL7") | - -**Resolution:** add a string/label export path (e.g. an info-style series or label) for the -descriptive rows, or enum-code the FSM-style ones (`PHY-MANAGER-STATE`) like the existing -`physical_port_state` converter. Tracked as a follow-up (#11). +These 6 catalog rows are string-valued and were previously escalated; they are now implemented: +- `961 PHY-MANAGER-STATE` — enum-coded to `interface_phy_manager_state` (active/linkup = 1, else 0). +- `965 VL-CAPABILITIES`, `862 CONTACT`, `863 LOCATION`, `864 NODE-DESCRIPTION` — emitted as + info-metrics (value 1 with the string carried in a label; skipped when empty, so `CONTACT`/`LOCATION` + emit only when configured). +- `876 ASIC-NAME` — covered by the existing `component_name` label on every component metric (not re-emitted). 
diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 23e2ea2823..7b1338475d 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -27,9 +27,9 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 853,PORT-XMIT-DATA,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 854,PORT-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 855,PORT-XMIT-WAIT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -862,CONTACT,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) -863,LOCATION,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) -864,NODE-DESCRIPTION,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +862,CONTACT,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_contact_info (emits when set) +863,LOCATION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_location_info (emits when set) +864,NODE-DESCRIPTION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_node_description_info 865,LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 866,PORT-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 867,PORT-LABEL,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim @@ -40,7 +40,7 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" 874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by 
default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" 875,ASIC-TEMP-CURRENT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -876,ASIC-NAME,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +876,ASIC-NAME,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,covered by component_name label on component metrics 879,AMBIENT-MNG-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" 880,AMBIENT-MNG-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" 881,CPU_PACK_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" @@ -117,11 +117,11 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 958,PORT-LOOPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 959,PORT-INACTIVE-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 960,LINK-WIDTH-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi enum-coded interface_phy_manager_state 962,MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 963,MAX-SUPPORTED-MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 964,SUPPORTED-WIDTH,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -965,VL-CAPABILITIES,NVOS gNMI explicit 
allowlist,BLOCKER-STRING,blocker,present but string-valued; needs string/label export (#11) +965,VL-CAPABILITIES,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric interface_vl_capabilities_info 966,FAN-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 967,FAN-LED,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['pd-link-speed-enabled', 'phy-hst-link-speed-enabled'] nmxt~[]" 968,CABLE-PART-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index b94762f042..0c36cfbb11 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -17,8 +17,8 @@ Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_ | Disposition | Count | Meaning | |----------------|-------|-----------------------------------------------------------------------| -| implemented | 153 | PRESENT/RESOLVED-LIVE allowlist hit, or IMPLEMENTED (MAX-SPEED via NVUE REST) | -| blocker | 40 | ABSENT-BLOCKER (leaf not live), BLOCKER-THRESHOLD (config-only), or BLOCKER-STRING (string-valued) | +| implemented | 159 | PRESENT/RESOLVED-LIVE allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded), or covered by an existing label | +| blocker | 34 | ABSENT-BLOCKER (leaf not live) or BLOCKER-THRESHOLD (config-only) | ### final_status breakdown @@ -26,15 +26,14 @@ Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_ |-------------------|-------| | PRESENT | 136 | | RESOLVED-LIVE | 16 | -| IMPLEMENTED | 1 | +| IMPLEMENTED | 7 | | ABSENT-BLOCKER | 13 | | BLOCKER-THRESHOLD | 21 | -| BLOCKER-STRING | 6 | ## Blocker escalations See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the -full annotated list of 40 rows, grouped 
by root cause, with resolution path and re-probe +full annotated list of 34 rows, grouped by root cause, with resolution path and re-probe conditions. ## Notes on implemented rows From f4882fd9232e62328cc2c3058c68d9bca4cd6302 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 22 Jun 2026 22:47:20 -0400 Subject: [PATCH 08/25] feat(health): implement 21 temp-threshold + 8 temp-current rows via NVUE REST Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 4 + .../health/src/collectors/nvue/rest/client.rs | 60 +++++ .../src/collectors/nvue/rest/collector.rs | 238 ++++++++++++++++++ crates/health/src/config.rs | 3 + ...vswitch_telemetry_gb200_live_validation.md | 42 +--- .../nvswitch_telemetry_gb200_matrix.csv | 58 ++--- .../health/nvswitch_telemetry_gb200_matrix.md | 28 +-- 7 files changed, 353 insertions(+), 80 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 6b33c8e299..ef1b607aa7 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -210,6 +210,10 @@ system_health_enabled = true cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true +# Per-sensor temperatures from `/nvue_v1/platform/environment/temperature` +# (current/max/crit in celsius + sensor state). Emits only the fields each +# sensor actually reports. +platform_environment_temperature_enabled = true # NVUE gNMI streaming collector. Disabled by default in code; explicitly # enabled here for the GB200 NVLink switch-host scenario. 
Subscribes to diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 906964da9d..4f2daaed2f 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -34,6 +34,7 @@ const NVUE_CLUSTER_APPS: &str = "/nvue_v1/cluster/apps"; const NVUE_SDN_PARTITIONS: &str = "/nvue_v1/sdn/partition"; const NVUE_INTERFACES: &str = "/nvue_v1/interface"; const NVUE_PLATFORM_ENVIRONMENT_FAN: &str = "/nvue_v1/platform/environment/fan"; +const NVUE_PLATFORM_ENVIRONMENT_TEMPERATURE: &str = "/nvue_v1/platform/environment/temperature"; #[derive(Clone)] pub struct UsernamePassword { @@ -136,6 +137,16 @@ impl RestClient { self.do_get(url, &[]).await.map(Some) } + pub async fn get_platform_environment_temperature( + &self, + ) -> Result<Option<TemperatureEnvironmentResponse>, HealthError> { + if !self.paths.platform_environment_temperature_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT_TEMPERATURE)?; + self.do_get(url, &[]).await.map(Some) + } + pub async fn get_interfaces(&self) -> Result<Option<InterfacesResponse>, HealthError> { if !self.paths.interfaces_enabled { return Ok(None); @@ -309,6 +320,22 @@ pub struct FanData { pub max_speed: Option<String>, } +pub type TemperatureEnvironmentResponse = HashMap<String, TempData>; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct TempData { + /// Current temperature in degrees Celsius, reported by NVUE as a string + /// (e.g. "43.00"). Each per-sensor field is optional — NVUE reports only a + /// subset for many sensors (e.g. ambient sensors expose only current+state). + pub current: Option<String>, + /// Maximum (warning) threshold in degrees Celsius, as a string (e.g. "105.00"). + pub max: Option<String>, + /// Critical threshold in degrees Celsius, as a string (e.g. "120.00"). + pub crit: Option<String>, + /// Sensor state as a string (e.g. "ok").
+ pub state: Option<String>, +} + pub type InterfacesResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] @@ -583,6 +610,36 @@ mod tests { assert!(resp["FAN1/1"].max_speed.is_none()); } + #[test] + fn test_parse_platform_environment_temperature() { + let json = r#"{ + "ASIC1": {"crit": "120.00", "current": "43.00", "max": "105.00", "state": "ok"}, + "Ambient-MNG-Temp": {"current": "27.00", "state": "ok"}, + "PDB-Conv-1-Temp": {"crit": "115.00", "current": "38.00", "state": "ok"} + }"#; + + let resp: TemperatureEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp.len(), 3); + + let asic1 = &resp["ASIC1"]; + assert_eq!(asic1.current.as_deref(), Some("43.00")); + assert_eq!(asic1.max.as_deref(), Some("105.00")); + assert_eq!(asic1.crit.as_deref(), Some("120.00")); + assert_eq!(asic1.state.as_deref(), Some("ok")); + + // Ambient sensor reports only current + state. + let ambient = &resp["Ambient-MNG-Temp"]; + assert_eq!(ambient.current.as_deref(), Some("27.00")); + assert!(ambient.max.is_none()); + assert!(ambient.crit.is_none()); + assert_eq!(ambient.state.as_deref(), Some("ok")); + + // PDB sensor has crit + current + state but no max.
+ let pdb = &resp["PDB-Conv-1-Temp"]; + assert_eq!(pdb.crit.as_deref(), Some("115.00")); + assert!(pdb.max.is_none()); + } + #[test] fn test_parse_empty_responses() { let empty_map: ClusterAppsResponse = serde_json::from_str("{}").unwrap(); @@ -596,5 +653,8 @@ mod tests { let empty_fans: FanEnvironmentResponse = serde_json::from_str("{}").unwrap(); assert!(empty_fans.is_empty()); + + let empty_temps: TemperatureEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_temps.is_empty()); } } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 0e75c55aca..b05e59a535 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -69,6 +69,27 @@ fn fan_max_speed_to_f64(max_speed: Option<&str>) -> Option<f64> { max_speed.and_then(|s| s.trim().parse::<f64>().ok()) } +/// NVUE reports temperatures (current/max/crit) as strings in degrees Celsius +/// (e.g. "105.00"). Parse to f64; return `None` when the field is absent or +/// unparseable so callers emit nothing rather than fabricating a value. Shares +/// the same trim-then-parse contract as `fan_max_speed_to_f64`. +fn temp_to_f64(value: Option<&str>) -> Option<f64> { + value.and_then(|s| s.trim().parse::<f64>().ok()) +} + +/// Map a temperature sensor's string `state` to a numeric gauge: "ok" +/// (case-insensitive) => 1.0, any other value (including empty) => 0.0, absent => None +/// (so callers emit nothing rather than fabricating a value).
+fn temp_state_to_f64(state: Option<&str>) -> Option { + state.map(|s| { + if s.trim().eq_ignore_ascii_case("ok") { + 1.0 + } else { + 0.0 + } + }) +} + pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, pub data_sink: Option>, @@ -281,6 +302,68 @@ impl PeriodicCollector for NvueRestCollector { } } + match self.client.get_platform_environment_temperature().await { + Ok(Some(temps)) => { + for (sensor_name, temp) in &temps { + // Each field is optional; emit only the ones present/parseable + // rather than fabricating absent thresholds. + let sensor_label = + || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; + + if let Some(value) = temp_to_f64(temp.current.as_deref()) { + self.emit_metric( + "platform_temperature", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + if let Some(value) = temp_to_f64(temp.max.as_deref()) { + self.emit_metric( + "platform_temperature_max", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + if let Some(value) = temp_to_f64(temp.crit.as_deref()) { + self.emit_metric( + "platform_temperature_critical", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + entity_count += 1; + } + if let Some(value) = temp_state_to_f64(temp.state.as_deref()) { + self.emit_metric( + "platform_temperature_state", + Some(sensor_name), + value, + "state", + sensor_label(), + ); + entity_count += 1; + } + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment temperature" + ); + } + } + if saw_auth_failure { tracing::warn!( switch_id = %self.switch_id, @@ -436,6 +519,26 @@ mod tests { assert_eq!(fan_max_speed_to_f64(None), None); } + #[test] + fn test_temp_to_f64_parsing() { + assert_eq!(temp_to_f64(Some("105.00")), Some(105.0)); + assert_eq!(temp_to_f64(Some(" 43 ")), 
Some(43.0)); + assert_eq!(temp_to_f64(Some("120.00")), Some(120.0)); + assert_eq!(temp_to_f64(Some("x")), None); + assert_eq!(temp_to_f64(Some("")), None); + assert_eq!(temp_to_f64(None), None); + } + + #[test] + fn test_temp_state_to_f64_mapping() { + assert_eq!(temp_state_to_f64(Some("ok")), Some(1.0)); + assert_eq!(temp_state_to_f64(Some("OK")), Some(1.0)); + assert_eq!(temp_state_to_f64(Some(" ok ")), Some(1.0)); + assert_eq!(temp_state_to_f64(Some("warning")), Some(0.0)); + assert_eq!(temp_state_to_f64(Some("")), Some(0.0)); + assert_eq!(temp_state_to_f64(None), None); + } + /// Drives the same parse + emit logic `run_iteration` uses for the /// platform/environment/fan endpoint against a captured sink, asserting the /// emitted MAX-SPEED sample shape. Table-driven over representative payloads. @@ -566,6 +669,140 @@ mod tests { } } + /// Drives the same parse + emit logic `run_iteration` uses for the + /// platform/environment/temperature endpoint against a captured sink. A + /// fully-populated sensor (ASIC1) emits all four series; a sparse sensor + /// (Ambient-MNG-Temp, only current + state) emits exactly two and must NOT + /// fabricate the absent max/critical thresholds. 
+ #[test] + fn test_platform_temperature_emit() { + use crate::collectors::nvue::rest::client::TemperatureEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let json = r#"{ + "ASIC1": {"crit": "120.00", "current": "43.00", "max": "105.00", "state": "ok"}, + "Ambient-MNG-Temp": {"current": "27.00", "state": "ok"} + }"#; + + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let temps: TemperatureEnvironmentResponse = + serde_json::from_str(json).expect("temperature json parses"); + // Mirror run_iteration's emit loop exactly. 
+ for (sensor_name, temp) in &temps { + let sensor_label = || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; + if let Some(value) = temp_to_f64(temp.current.as_deref()) { + collector.emit_metric( + "platform_temperature", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(value) = temp_to_f64(temp.max.as_deref()) { + collector.emit_metric( + "platform_temperature_max", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(value) = temp_to_f64(temp.crit.as_deref()) { + collector.emit_metric( + "platform_temperature_critical", + Some(sensor_name), + value, + "celsius", + sensor_label(), + ); + } + if let Some(value) = temp_state_to_f64(temp.state.as_deref()) { + collector.emit_metric( + "platform_temperature_state", + Some(sensor_name), + value, + "state", + sensor_label(), + ); + } + } + + let samples = sink.samples.lock().unwrap(); + // ASIC1: 4 series; Ambient-MNG-Temp: 2 series (current + state) = 6 total. + assert_eq!(samples.len(), 6, "unexpected emitted sample count"); + + // Helper: find a sample by metric_type + sensor label. + let find = |metric_type: &str, sensor: &str| { + samples.iter().find(|s| { + s.metric_type == metric_type + && s.labels + .iter() + .any(|(k, v)| k == "sensor" && v == sensor) + }) + }; + + // ASIC1: all four series present with correct name/unit/value/label/key. 
+ let expected_asic1: &[(&str, &str, f64)] = &[ + ("platform_temperature", "celsius", 43.0), + ("platform_temperature_max", "celsius", 105.0), + ("platform_temperature_critical", "celsius", 120.0), + ("platform_temperature_state", "state", 1.0), + ]; + for (metric_type, unit, value) in expected_asic1 { + let sample = find(metric_type, "ASIC1") + .unwrap_or_else(|| panic!("no ASIC1 sample for {metric_type}")); + assert_eq!(sample.name, COLLECTOR_NAME); + assert_eq!(&sample.metric_type, metric_type); + assert_eq!(&sample.unit, unit); + assert_eq!(sample.value, *value, "value for {metric_type}"); + assert_eq!(sample.key, format!("{metric_type}:ASIC1")); + assert_eq!(sample.labels.len(), 1); + assert_eq!(sample.labels[0].0, "sensor"); + assert_eq!(sample.labels[0].1, "ASIC1"); + } + + // Ambient-MNG-Temp: only current + state emitted. + let ambient_current = find("platform_temperature", "Ambient-MNG-Temp") + .expect("ambient current sample"); + assert_eq!(ambient_current.value, 27.0); + assert_eq!(ambient_current.unit, "celsius"); + assert!( + find("platform_temperature_state", "Ambient-MNG-Temp").is_some(), + "ambient state sample expected" + ); + + // A sensor missing max/crit must NOT emit those series. 
+ assert!( + find("platform_temperature_max", "Ambient-MNG-Temp").is_none(), + "ambient sensor without max must not emit platform_temperature_max" + ); + assert!( + find("platform_temperature_critical", "Ambient-MNG-Temp").is_none(), + "ambient sensor without crit must not emit platform_temperature_critical" + ); + } + struct ScriptedProvider { calls: AtomicUsize, // Each call pops the front of this queue; an empty queue yields an @@ -618,6 +855,7 @@ mod tests { sdn_partitions_enabled: false, interfaces_enabled: false, platform_environment_fan_enabled: false, + platform_environment_temperature_enabled: false, } } diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 1e1b43d530..a28be67684 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -975,6 +975,7 @@ impl Default for NvueRestConfig { /// - sdn_partitions_enabled: Poll `/nvue_v1/sdn/partition` (including per-partition details) /// - interfaces_enabled: Poll `/nvue_v1/interface`. /// - platform_environment_fan_enabled: Poll `/nvue_v1/platform/environment/fan`. +/// - platform_environment_temperature_enabled: Poll `/nvue_v1/platform/environment/temperature`. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct NvueRestPaths { @@ -983,6 +984,7 @@ pub struct NvueRestPaths { pub sdn_partitions_enabled: bool, pub interfaces_enabled: bool, pub platform_environment_fan_enabled: bool, + pub platform_environment_temperature_enabled: bool, } impl Default for NvueRestPaths { @@ -993,6 +995,7 @@ impl Default for NvueRestPaths { sdn_partitions_enabled: true, interfaces_enabled: true, platform_environment_fan_enabled: true, + platform_environment_temperature_enabled: true, } } } diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index a124395a95..b0b50eb70e 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -149,42 +149,16 @@ Unit coverage that locks this behavior: ## Blocker escalations (Stage 0) Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. -34 rows are escalated below (21 config-threshold, 13 absent-from-live-probe). No +13 rows are escalated below — all ABSENT-BLOCKER (no live source on this platform). No row is deferred — each has an explicit disposition and a named resolution path. -### Group A — Config-threshold rows (21 rows, BLOCKER-THRESHOLD) - -These catalog entries represent threshold/limit/alarm-state values configured on the device, not -streamed telemetry counters. They are not exposed as live gNMI leaves and cannot be implemented -without a new data source. - -**Resolution:** Source owner (NVOS gNMI / Redfish sensor threshold team) must confirm whether -a future gNMI path or Redfish sensor `ThresholdHigh`/`ThresholdLow`/`ReadingRangeMax` field -can expose these. Until confirmed, they are out-of-scope for this branch. 
- -| Row | Metric | -|------|-------------------------| -| 872 | ASIC-TEMP-CRITICAL | -| 873 | ASIC-TEMP-MAX | -| 874 | ASIC-TEMP-STATE | -| 879 | AMBIENT-MNG-TEMP-STATE | -| 881 | CPU_PACK_TEMP_CRITICAL | -| 882 | CPU_PACK_TEMP_MAX | -| 883 | CPU_PACK_TEMP_STATE | -| 890 | SODIMM_TEMP_CRITICAL | -| 891 | SODIMM_TEMP_MAX | -| 892 | SODIMM_TEMP_STATE | -| 1241 | DRIVE-TEMP-CRITICAL | -| 1242 | DRIVE-TEMP-MAX | -| 1243 | DRIVE-TEMP-STATE | -| 1245 | HSC-VINDC-TEMP-CRITICAL | -| 1246 | HSC-VINDC-TEMP-MAX | -| 1247 | HSC-VINDC-TEMP-STATE | -| 1249 | PDB-CONV-TEMP-CRITICAL | -| 1251 | PDB-CONV-TEMP-STATE | -| 1253 | PMIC-TEMP-CRITICAL | -| 1255 | PMIC-TEMP-STATE | -| 1259 | SWB-ASIC-PCB-TEMP-STATE | +### Temperature threshold rows — RESOLVED (21 rows, formerly BLOCKER-THRESHOLD) + +The 21 temperature `*-CRITICAL` / `*-MAX` / `*-STATE` rows (ASIC / CPU-Pack / SODIMM / Drive / +HSC-VinDC / PDB-Conv / PMIC / SWB-ASIC-PCB / Ambient-MNG) are now implemented from NVUE REST +`/nvue_v1/platform/environment/temperature` (`.crit` / `.max` / `.state` per sensor; only the fields +a sensor actually exposes are emitted). The 8 `*-TEMP-CURRENT` rows were re-sourced from `.current` +on the same endpoint, correcting an earlier spurious gNMI token match. No longer escalated. 
### Group B — Cable/transceiver leaves (7 rows, ABSENT-BLOCKER) diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 7b1338475d..5b7aa96fbc 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -36,26 +36,26 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 868,REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 869,DEVICE-HARDWARE-REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 870,CPU_CORE_NUMBER,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['core-to-phy-link-width-enabled', 'core-to-phy-link-proto-enabled'] nmxt~['Port_Number', 'sw_serial_number']" -872,ASIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +872,ASIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature 
(.crit/.max/.state) 875,ASIC-TEMP-CURRENT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 876,ASIC-NAME,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,covered by component_name label on component metrics -879,AMBIENT-MNG-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -880,AMBIENT-MNG-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -881,CPU_PACK_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" -882,CPU_PACK_TEMP_MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" -883,CPU_PACK_TEMP_STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" -884,CPU_PACK_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'PortMalformedPacketErrors']" +879,AMBIENT-MNG-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +880,AMBIENT-MNG-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) +881,CPU_PACK_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature 
(.crit/.max/.state) +882,CPU_PACK_TEMP_MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +883,CPU_PACK_TEMP_STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +884,CPU_PACK_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) 885,CPU-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 886,MEM-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 887,MEM-TOTAL-SIZE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 888,DISK-TOTAL-SIZE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 889,DISK-USED,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -890,SODIMM_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -891,SODIMM_TEMP_MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -892,SODIMM_TEMP_STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -893,SODIMM_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +890,SODIMM_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) 
+891,SODIMM_TEMP_MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +892,SODIMM_TEMP_STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +893,SODIMM_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) 894,MAX-SPEED,BMC Redfish live resource only,IMPLEMENTED,implemented,nvue rest /platform/environment/fan .max-speed 897,PORT-LOGICAL-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 898,FEC-MODE-ACTIVE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim @@ -148,22 +148,22 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 989,LINK-PARTNER-LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 990,LINK-PARTNER-PORT-NUM,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 1174,CPU-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -1241,DRIVE-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1242,DRIVE-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1243,DRIVE-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1244,DRIVE-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 
'last_host_logical_recovery_attempts_count']" -1245,HSC-VINDC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1246,HSC-VINDC-TEMP-MAX,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1247,HSC-VINDC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1248,HSC-VINDC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1249,PDB-CONV-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1251,PDB-CONV-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1252,PDB-CONV-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1253,PMIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1255,PMIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 
'last_host_logical_recovery_attempts_count']" -1256,PMIC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1259,SWB-ASIC-PCB-TEMP-STATE,source-equivalent required; no new CLI collector by default,BLOCKER-THRESHOLD,blocker,"gnmi~['asic-temp', 'psi-fsm-state'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" -1260,SWB-ASIC-PCB-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['asic-temp', 'ambient-temperature'] nmxt~['Chip_Temp', 'last_host_logical_recovery_attempts_count']" +1241,DRIVE-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1242,DRIVE-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1243,DRIVE-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1244,DRIVE-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) +1245,HSC-VINDC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1246,HSC-VINDC-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1247,HSC-VINDC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1248,HSC-VINDC-TEMP-CURRENT,source-equivalent required; no 
new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) +1249,PDB-CONV-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1251,PDB-CONV-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1252,PDB-CONV-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) +1253,PMIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1255,PMIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1256,PMIC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) +1259,SWB-ASIC-PCB-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) +1260,SWB-ASIC-PCB-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) 1688,LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 1689,TOTAL-LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 1690,TIME-SINCE-LAST-RECOVERY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index 
0c36cfbb11..01d2b88906 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -17,23 +17,22 @@ Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_ | Disposition | Count | Meaning | |----------------|-------|-----------------------------------------------------------------------| -| implemented | 159 | PRESENT/RESOLVED-LIVE allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded), or covered by an existing label | -| blocker | 34 | ABSENT-BLOCKER (leaf not live) or BLOCKER-THRESHOLD (config-only) | +| implemented | 180 | PRESENT/RESOLVED-LIVE allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded), or covered by an existing label | +| blocker | 13 | ABSENT-BLOCKER — leaf/family not live on this platform | ### final_status breakdown | final_status | Count | |-------------------|-------| | PRESENT | 136 | -| RESOLVED-LIVE | 16 | -| IMPLEMENTED | 7 | +| RESOLVED-LIVE | 8 | +| IMPLEMENTED | 36 | | ABSENT-BLOCKER | 13 | -| BLOCKER-THRESHOLD | 21 | ## Blocker escalations See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the -full annotated list of 34 rows, grouped by root cause, with resolution path and re-probe +full annotated list of 13 rows, grouped by root cause, with resolution path and re-probe conditions. ## Notes on implemented rows @@ -43,22 +42,17 @@ conditions. - **RESOLVED-LIVE** rows have no direct catalog-listed source but a live token match was found in gNMI or NMX-T output. Match tokens are recorded in `match_detail`. These are accepted as covered; if live validation on a production rig disputes a mapping, re-escalate immediately. -- **IMPLEMENTED — MAX-SPEED (row 894):** sourced from NVUE REST `/nvue_v1/platform/environment/fan` - `.max-speed` (not Redfish — confirmed live). 
The 4 `/platform-general` memory/disk rows - (`886/887/888/889`) are PRESENT via the new gNMI `platform-general` subscribe path. +- **IMPLEMENTED** rows are sourced beyond the plain gNMI/NMX-T allowlist: + - NVUE REST `/nvue_v1/platform/environment/{fan,temperature}` → MAX-SPEED (894); the 21 temp + `*-CRITICAL/MAX/STATE` rows (`.crit`/`.max`/`.state`) and the 8 `*-TEMP-CURRENT` rows + (`.current`), emitted per sensor as `platform_temperature{,_max,_critical,_state}` with a `sensor` label. + - gNMI `platform-general` subscribe path → the 4 memory/disk rows (`886-889`). + - String rows → `interface_phy_manager_state` (enum-coded), `*_info` info-metrics, and the existing `component_name` label (`ASIC-NAME`). ## Notes on blocker rows No row is marked "deferred." Every blocker has an explicit escalation disposition: -- **BLOCKER-THRESHOLD (21 rows):** The catalog entry represents a threshold/limit/alarm-state - value, not a streamed telemetry counter. These are configuration parameters unavailable as live - gNMI leaves. Source owner must confirm whether a future gNMI path or Redfish sensor threshold - can expose them; until confirmed they cannot be implemented without a new data source. -- **BLOCKER-STRING (6 rows):** string-valued catalog rows with no numeric encoding — `CONTACT`, - `LOCATION`, `NODE-DESCRIPTION` (platform), `ASIC-NAME`, `PHY-MANAGER-STATE`, `VL-CAPABILITIES`. - Present live but cannot be emitted as numeric metrics; need a string/label export path (tracked - as #11), or enum-coding for the FSM-style ones. Not silently dropped — escalated. - **ABSENT-BLOCKER — cable/transceiver leaves (7 rows: 981, 982, 2293, 2296-2299):** the catalog's gNMI transceiver-diag path is absent live — the N5400_LD NVLink switch enumerates **no gNMI transceiver components** (confirmed live; 64+ active backplane links, so *not* an uncabled rig). 
From 2e3209b64d4656f9f5a69056d95555a10e745315 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 22 Jun 2026 23:10:11 -0400 Subject: [PATCH 09/25] feat(health): reclaim 5 NVSwitch catalog rows via live gNMI/NVUE-REST sources Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 4 + .../health/src/collectors/nvue/gnmi/client.rs | 30 +++- .../collectors/nvue/gnmi/sample_processor.rs | 96 +++++++++++ .../health/src/collectors/nvue/rest/client.rs | 53 ++++++ .../src/collectors/nvue/rest/collector.rs | 153 ++++++++++++++++++ crates/health/src/config.rs | 4 + ...vswitch_telemetry_gb200_live_validation.md | 61 +++++-- .../nvswitch_telemetry_gb200_matrix.csv | 16 +- .../health/nvswitch_telemetry_gb200_matrix.md | 35 ++-- 9 files changed, 420 insertions(+), 32 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index ef1b607aa7..c742b9971b 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -214,6 +214,10 @@ interfaces_enabled = true # (current/max/crit in celsius + sensor state). Emits only the fields each # sensor actually reports. platform_environment_temperature_enabled = true +# Aggregate fan LED state from the `/nvue_v1/platform/environment` parent +# summary (the `FAN_STATUS` entry). Switch-level: "green"/"ok" => 1.0, any +# other state => 0.0. +platform_environment_status_enabled = true # NVUE gNMI streaming collector. Disabled by default in code; explicitly # enabled here for the GB200 NVLink switch-host scenario. 
Subscribes to diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 1db5050f44..0b7abbb27e 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -31,7 +31,7 @@ use crate::HealthError; use crate::config::NvueGnmiPaths; pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { - let mut paths = Vec::with_capacity(3); + let mut paths = Vec::with_capacity(4); if paths_config.components_enabled { paths.push(Path { elem: vec![ @@ -78,6 +78,21 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { ], ..Default::default() }); + // sibling singleton: `/platform-general/versions` carries the OS/BMC/EROT + // firmware version leaves (also no interface/component name key). + paths.push(Path { + elem: vec![ + PathElem { + name: "platform-general".into(), + key: Default::default(), + }, + PathElem { + name: "versions".into(), + key: Default::default(), + }, + ], + ..Default::default() + }); } paths } @@ -458,7 +473,7 @@ mod tests { #[test] fn test_nvue_subscribe_paths_all_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); - assert_eq!(paths.len(), 3); + assert_eq!(paths.len(), 4); assert_eq!(paths[0].elem.len(), 2); assert_eq!(paths[0].elem[0].name, "components"); @@ -471,6 +486,10 @@ mod tests { assert_eq!(paths[2].elem.len(), 2); assert_eq!(paths[2].elem[0].name, "platform-general"); assert_eq!(paths[2].elem[1].name, "state"); + + assert_eq!(paths[3].elem.len(), 2); + assert_eq!(paths[3].elem[0].name, "platform-general"); + assert_eq!(paths[3].elem[1].name, "versions"); } #[test] @@ -493,10 +512,13 @@ mod tests { interfaces_enabled: false, platform_general_enabled: true, }); - assert_eq!(paths.len(), 1); + assert_eq!(paths.len(), 2); assert_eq!(paths[0].elem.len(), 2); assert_eq!(paths[0].elem[0].name, "platform-general"); assert_eq!(paths[0].elem[1].name, "state"); + assert_eq!(paths[1].elem.len(), 2); 
+ assert_eq!(paths[1].elem[0].name, "platform-general"); + assert_eq!(paths[1].elem[1].name, "versions"); } #[test] @@ -535,7 +557,7 @@ mod tests { let prefix = sub_list.prefix.expect("prefix must be set"); assert_eq!(prefix.target, "nvos", "target must be nvos"); - assert_eq!(sub_list.subscription.len(), 3); + assert_eq!(sub_list.subscription.len(), 4); for sub in &sub_list.subscription { assert_eq!( sub.mode, diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 8540e3a700..77f3be627f 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -280,12 +280,23 @@ impl GnmiSampleProcessor { // information is carried by a single string label. Empty strings carry // no information and emit nothing (CONTACT/LOCATION are empty on the // GB200 rig, so only NODE-DESCRIPTION emits live). + // + // The firmware version info-metrics (OS-VERSION 868, BMC-VERSION 869, + // EROT-FW-VERSION 870) live under the sibling `/platform-general/versions` + // subtree rather than `/state`; they follow the same info-metric contract + // (constant 1.0 sample, single string label, empty strings emit nothing). 
let info: Option<(&str, &'static str)> = if leaf_matches(elems, &["state", "contact"]) { Some(("platform_contact_info", "contact")) } else if leaf_matches(elems, &["state", "location"]) { Some(("platform_location_info", "location")) } else if leaf_matches(elems, &["state", "platform-name"]) { Some(("platform_node_description_info", "node_description")) + } else if leaf_matches(elems, &["versions", "state", "nos-version"]) { + Some(("platform_os_version_info", "os_version")) + } else if leaf_matches(elems, &["versions", "state", "fw-version-bmc"]) { + Some(("platform_bmc_version_info", "bmc_version")) + } else if leaf_matches(elems, &["versions", "state", "fw-version-erot"]) { + Some(("platform_erot_version_info", "erot_version")) } else { None }; @@ -698,6 +709,11 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { "interface_plr_xmit_retry_codes_within_minute", "count", ), + ( + &["phy-diag", "state", "plr-bw-loss-percent"], + "interface_plr_bw_loss_percent", + "percent", + ), // existing pre-branch mapping retained (leaf out of GB200 row set but // restored upstream; kept so the canonical series is not dropped) ( @@ -1742,6 +1758,11 @@ mod tests { "interface_plr_xmit_retry_codes_within_minute", "count", ), + ( + &["phy-diag", "state", "plr-bw-loss-percent"], + "interface_plr_bw_loss_percent", + "percent", + ), ]; for (tail, expected_name, expected_unit) in cases { @@ -2364,6 +2385,81 @@ mod tests { } } + #[test] + fn test_platform_general_version_info_metrics() { + // OS-VERSION (764) / BMC-VERSION (767) / EROT-FW-VERSION (766): non-empty + // version strings under `/platform-general/versions/state` each emit one + // switch-level info-metric carrying the raw version in a single label. + // Values are the authoritative live GB200 Stage-0 capture.
+ for (tail, metric_type, label, raw) in [ + ( + ["versions", "state", "nos-version"], + "platform_os_version_info", + "os_version", + "nvos-25.02.2553", + ), + ( + ["versions", "state", "fw-version-bmc"], + "platform_bmc_version_info", + "bmc_version", + "88.0002.1336", + ), + ( + ["versions", "state", "fw-version-erot"], + "platform_erot_version_info", + "erot_version", + "01.04.0026.0000_n04", + ), + ] { + let sample = run_platform_general_leaf_info(&tail, raw); + assert_eq!(sample.metric_type, metric_type, "leaf {tail:?}"); + assert_eq!(sample.unit, "info", "leaf {tail:?}"); + assert_eq!(sample.value, 1.0, "leaf {tail:?}"); + assert_eq!( + sample.labels, + vec![(Cow::Borrowed(label), raw.to_string())], + "leaf {tail:?}" + ); + } + } + + #[test] + fn test_platform_general_empty_version_string_is_not_exported() { + // An empty version string carries no information and must emit nothing, + // while still being counted as the platform-general entity. + for tail in [ + ["versions", "state", "nos-version"], + ["versions", "state", "fw-version-bmc"], + ["versions", "state", "fw-version-erot"], + ] { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + let mut elems = vec![make_path_elem("platform-general", &[])]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(make_typed_value_string("")), + ..Default::default() + }], + ..Default::default() + }; + let count = proc.process_notification(&notification); + assert_eq!(count, 1, "platform-general entity is still counted for {tail:?}"); + assert_eq!( + sink.events.lock().expect("lock poisoned").len(), + 0, + "empty version string must not emit a metric for {tail:?}" + ); + } + } + /// Drive a single `/platform-general/` string update and return the /// one
captured info `MetricSample`. Unlike `run_platform_general_leaf`, the /// switch-level info series carries a single string label (no per-entity diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 4f2daaed2f..951e65d279 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -35,6 +35,7 @@ const NVUE_SDN_PARTITIONS: &str = "/nvue_v1/sdn/partition"; const NVUE_INTERFACES: &str = "/nvue_v1/interface"; const NVUE_PLATFORM_ENVIRONMENT_FAN: &str = "/nvue_v1/platform/environment/fan"; const NVUE_PLATFORM_ENVIRONMENT_TEMPERATURE: &str = "/nvue_v1/platform/environment/temperature"; +const NVUE_PLATFORM_ENVIRONMENT: &str = "/nvue_v1/platform/environment"; #[derive(Clone)] pub struct UsernamePassword { @@ -147,6 +148,16 @@ impl RestClient { self.do_get(url, &[]).await.map(Some) } + pub async fn get_platform_environment( + &self, + ) -> Result<Option<PlatformEnvironmentResponse>, HealthError> { + if !self.paths.platform_environment_status_enabled { + return Ok(None); + } + let url = self.join_path(NVUE_PLATFORM_ENVIRONMENT)?; + self.do_get(url, &[]).await.map(Some) + } + pub async fn get_interfaces(&self) -> Result<Option<InterfacesResponse>, HealthError> { if !self.paths.interfaces_enabled { return Ok(None); @@ -336,6 +347,20 @@ pub struct TempData { pub state: Option<String>, } +/// Parent `/nvue_v1/platform/environment` summary. Keys are aggregate status +/// entries (e.g. `FAN_STATUS`) as well as the `fan`/`temperature` subtrees. +/// Only the LED-style summary entries carry a top-level `state`; the nested +/// subtree objects have a different shape and deserialize with `state` absent +/// (serde ignores unknown keys, including the LED `type` discriminator we do +/// not consume), so they are harmlessly skipped by callers. +pub type PlatformEnvironmentResponse = HashMap<String, EnvItem>; + +#[derive(Debug, Clone, Deserialize, Default)] +pub struct EnvItem { + /// Aggregate status string (e.g. "green"/"amber" for `FAN_STATUS`).
+ pub state: Option<String>, +} + pub type InterfacesResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] @@ -640,6 +665,31 @@ mod tests { assert!(pdb.max.is_none()); } + #[test] + fn test_parse_platform_environment_fan_status() { + // Parent summary carries the aggregate `FAN_STATUS` LED entry alongside + // nested `fan`/`temperature` subtree objects of a different shape. The + // LED entry parses into `state`; the nested objects parse with `state` + // absent (serde ignores unknown keys) and are skipped by callers. + let json = r#"{ + "FAN_STATUS": {"state": "green", "type": "led"}, + "PSU_STATUS": {"state": "amber", "type": "led"}, + "fan": { + "FAN1/1": {"current-speed": "10096", "max-speed": "33000", "state": "ok"} + }, + "temperature": { + "ASIC1": {"current": "43.00", "state": "ok"} + } + }"#; + + let resp: PlatformEnvironmentResponse = serde_json::from_str(json).unwrap(); + assert_eq!(resp["FAN_STATUS"].state.as_deref(), Some("green")); + assert_eq!(resp["PSU_STATUS"].state.as_deref(), Some("amber")); + // nested subtree objects have no top-level state -> None.
+ assert!(resp["fan"].state.is_none()); + assert!(resp["temperature"].state.is_none()); + } + #[test] fn test_parse_empty_responses() { let empty_map: ClusterAppsResponse = serde_json::from_str("{}").unwrap(); @@ -656,5 +706,8 @@ mod tests { let empty_temps: TemperatureEnvironmentResponse = serde_json::from_str("{}").unwrap(); assert!(empty_temps.is_empty()); + + let empty_env: PlatformEnvironmentResponse = serde_json::from_str("{}").unwrap(); + assert!(empty_env.is_empty()); } } diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index b05e59a535..35eb011586 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -90,6 +90,22 @@ fn temp_state_to_f64(state: Option<&str>) -> Option<f64> { }) } +/// Map the aggregate `FAN_STATUS` LED state from the platform/environment parent +/// summary to a numeric gauge: "green"/"ok" (case-insensitive) => 1.0, any other +/// non-empty value (e.g. "amber"/"red") => 0.0, absent/empty => None (so callers +/// emit nothing rather than fabricating a value). +fn fan_led_to_f64(state: Option<&str>) -> Option<f64> { + let s = state?.trim(); + if s.is_empty() { + return None; + } + if s.eq_ignore_ascii_case("green") || s.eq_ignore_ascii_case("ok") { + Some(1.0) + } else { + Some(0.0) + } +} + pub struct NvueRestCollectorConfig { pub rest_config: NvueRestConfig, pub data_sink: Option>, @@ -364,6 +380,29 @@ impl PeriodicCollector for NvueRestCollector { } } + match self.client.get_platform_environment().await { + Ok(Some(env)) => { + // Switch-level aggregate FAN_STATUS LED; emit only when present + // and the state maps to a value, absent → nothing.
+ if let Some(value) = + env.get("FAN_STATUS").and_then(|s| fan_led_to_f64(s.state.as_deref())) + { + self.emit_metric("fan_led", None, value, "state", vec![]); + entity_count += 1; + } + } + Ok(None) => {} + Err(e) => { + fetch_failures += 1; + saw_auth_failure |= is_auth_error(&e); + tracing::warn!( + error = ?e, + switch_id = %self.switch_id, + "nvue_rest: failed to collect platform environment status" + ); + } + } + if saw_auth_failure { tracing::warn!( switch_id = %self.switch_id, @@ -539,6 +578,23 @@ mod tests { assert_eq!(temp_state_to_f64(None), None); } + #[test] + fn test_fan_led_to_f64_mapping() { + // green/ok (case-insensitive) => 1.0 + assert_eq!(fan_led_to_f64(Some("green")), Some(1.0)); + assert_eq!(fan_led_to_f64(Some("GREEN")), Some(1.0)); + assert_eq!(fan_led_to_f64(Some(" green ")), Some(1.0)); + assert_eq!(fan_led_to_f64(Some("ok")), Some(1.0)); + assert_eq!(fan_led_to_f64(Some("OK")), Some(1.0)); + // any other non-empty value => 0.0 + assert_eq!(fan_led_to_f64(Some("amber")), Some(0.0)); + assert_eq!(fan_led_to_f64(Some("red")), Some(0.0)); + // absent/empty => None (emit nothing) + assert_eq!(fan_led_to_f64(Some("")), None); + assert_eq!(fan_led_to_f64(Some(" ")), None); + assert_eq!(fan_led_to_f64(None), None); + } + /// Drives the same parse + emit logic `run_iteration` uses for the /// platform/environment/fan endpoint against a captured sink, asserting the /// emitted MAX-SPEED sample shape. Table-driven over representative payloads. @@ -803,6 +859,102 @@ mod tests { ); } + /// Drives the same parse + emit logic `run_iteration` uses for the + /// platform/environment parent summary against a captured sink, asserting the + /// emitted switch-level `fan_led` sample shape. "green"/"ok" => 1.0, + /// "amber" => 0.0, and an absent `FAN_STATUS` emits nothing. 
+ #[test] + fn test_fan_led_emit() { + use crate::collectors::nvue::rest::client::PlatformEnvironmentResponse; + + struct CapturingSink { + samples: StdMutex<Vec<MetricSample>>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + struct Case { + name: &'static str, + json: &'static str, + // expected emitted fan_led value, or None when nothing must emit. + expected: Option<f64>, + } + + let cases = [ + Case { + name: "green LED emits 1.0", + json: r#"{"FAN_STATUS": {"state": "green", "type": "led"}}"#, + expected: Some(1.0), + }, + Case { + name: "ok LED emits 1.0", + json: r#"{"FAN_STATUS": {"state": "ok", "type": "led"}}"#, + expected: Some(1.0), + }, + Case { + name: "amber LED emits 0.0", + json: r#"{"FAN_STATUS": {"state": "amber", "type": "led"}}"#, + expected: Some(0.0), + }, + Case { + name: "absent FAN_STATUS emits nothing", + json: r#"{"PSU_STATUS": {"state": "green", "type": "led"}}"#, + expected: None, + }, + ]; + + for case in cases { + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let mut collector = collector_with_provider(ScriptedProvider::new(vec![])); + collector.data_sink = Some(sink.clone()); + + let env: PlatformEnvironmentResponse = + serde_json::from_str(case.json).expect("env json parses"); + // Mirror run_iteration's emit logic exactly.
+ if let Some(value) = + env.get("FAN_STATUS").and_then(|s| fan_led_to_f64(s.state.as_deref())) + { + collector.emit_metric("fan_led", None, value, "state", vec![]); + } + + let samples = sink.samples.lock().unwrap(); + match case.expected { + Some(expected_value) => { + assert_eq!(samples.len(), 1, "case '{}': expected one sample", case.name); + let sample = &samples[0]; + assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); + assert_eq!(sample.metric_type, "fan_led", "case '{}'", case.name); + assert_eq!(sample.unit, "state", "case '{}'", case.name); + assert_eq!(sample.value, expected_value, "case '{}'", case.name); + assert_eq!(sample.key, "fan_led", "case '{}'", case.name); + assert!( + sample.labels.is_empty(), + "case '{}': fan_led is switch-level, no per-entity label", + case.name + ); + } + None => assert_eq!( + samples.len(), + 0, + "case '{}': absent FAN_STATUS must not emit a sample", + case.name + ), + } + } + } + struct ScriptedProvider { calls: AtomicUsize, // Each call pops the front of this queue; an empty queue yields an @@ -856,6 +1008,7 @@ mod tests { interfaces_enabled: false, platform_environment_fan_enabled: false, platform_environment_temperature_enabled: false, + platform_environment_status_enabled: false, } } diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index a28be67684..a68af7a322 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -976,6 +976,8 @@ impl Default for NvueRestConfig { /// - interfaces_enabled: Poll `/nvue_v1/interface`. /// - platform_environment_fan_enabled: Poll `/nvue_v1/platform/environment/fan`. /// - platform_environment_temperature_enabled: Poll `/nvue_v1/platform/environment/temperature`. +/// - platform_environment_status_enabled: Poll `/nvue_v1/platform/environment` parent +/// summary for the aggregate `FAN_STATUS` LED state. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct NvueRestPaths { @@ -985,6 +987,7 @@ pub struct NvueRestPaths { pub interfaces_enabled: bool, pub platform_environment_fan_enabled: bool, pub platform_environment_temperature_enabled: bool, + pub platform_environment_status_enabled: bool, } impl Default for NvueRestPaths { @@ -996,6 +999,7 @@ impl Default for NvueRestPaths { interfaces_enabled: true, platform_environment_fan_enabled: true, platform_environment_temperature_enabled: true, + platform_environment_status_enabled: true, } } } diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index b0b50eb70e..3593849938 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -3,7 +3,8 @@ > **Implementation note.** GB200 telemetry is collected via **explicit catalog-row > allowlists** over the live host surfaces: NMX-T (`switch_nmxt`), NVOS gNMI > (`nvue_gnmi`, explicit per-leaf), NVUE REST (`fan_max_speed` from -> `/platform/environment/fan`), and standard Redfish sensors (`hw_sensor`). There is +> `/platform/environment/fan`, `fan_led` from `/platform/environment`), and standard +> Redfish sensors (`hw_sensor`). There is > **no** standalone Redfish `TelemetryService` collector and **no** generic/sanitized > source preservation — both were evaluated against the live GB200 BMC and removed. > Unknown gNMI/NMX-T sources are dropped and debug-logged, never emitted. nv-redfish is @@ -19,8 +20,8 @@ For the GB200 phase, enable all switch telemetry collectors below: - `collectors.sensors` for standard Redfish sensor readings and threshold/range context (the temp/thermal `hw_sensor` series plus `*_range_max`/`*_range_min`). - HOST endpoint (`switch.endpoint_role = "host"`): - `collectors.nmxt` for NMX-T Prometheus telemetry on port `9352`. 
- - `collectors.nvue.rest` for NVUE health/app/partition/interface diagnostics and `fan_max_speed` from `/platform/environment/fan`. - - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general` (memory/disk), plus ON_CHANGE system events. + - `collectors.nvue.rest` for NVUE health/app/partition/interface diagnostics, `fan_max_speed` from `/platform/environment/fan`, and `fan_led` (aggregate `FAN_STATUS`) from `/platform/environment`. + - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general` (memory/disk via `/state`, OS/BMC/EROT firmware versions via `/versions`), plus ON_CHANGE system events. No TelemetryService proxy ACL changes are required — collection uses the standard Redfish sensor paths plus the host NMX-T/gNMI/NVUE endpoints. @@ -127,9 +128,9 @@ cargo run -p carbide-health --bin forge-hw-health -- /path/to/gb200-switch-local 1. `/telemetry` output contains `hw_sensor` samples for the BMC endpoint (temp/thermal readings; plus `*_range_max`/`*_range_min` where the sensor exposes ranges). 2. `/telemetry` output contains `switch_nmxt` samples for the HOST endpoint — only the explicit `NMXT_METRIC_MAP` families with the allowlisted identity labels (no sanitized/unknown source names). -3. `/telemetry` output contains `nvue_gnmi` samples for the HOST endpoint: canonical `interface_*` (incl. `interface_link_speed_active` in gbps), `component_*`, and `platform_memory_used/total` + `platform_disk_total/used`. -4. `/telemetry` output contains the NVUE REST `fan_max_speed` sample (HOST). Logs show the NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected roles; matched-but-uncoercible leaves are debug-logged, not emitted. -5. The two catalog rows with no listed source (`CABLE-SNR-MEDIA-LANE-N`, `CABLE-SNR-HOST-LANE-N`) are checked explicitly in live output. 
If they do not appear through Redfish MetricReports, NMX-T, or gNMI, open a catalog/source-owner follow-up immediately; keep them open until source-owner resolution. +3. `/telemetry` output contains `nvue_gnmi` samples for the HOST endpoint: canonical `interface_*` (incl. `interface_link_speed_active` in gbps and `interface_plr_bw_loss_percent`), `component_*`, `platform_memory_used/total` + `platform_disk_total/used`, and the switch-level `platform_{os,bmc,erot}_version_info` info-metrics (value 1.0, version carried in the label). +4. `/telemetry` output contains the NVUE REST `fan_max_speed` and `fan_led` samples (HOST). Logs show the NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected roles; matched-but-uncoercible leaves are debug-logged, not emitted. +5. The two catalog rows with no listed source (`CABLE-SNR-MEDIA-LANE-N` row 2294, `CABLE-SNR-HOST-LANE-N` row 2295) are checked explicitly in live output. NMX-T exposes `rx_power_lane_0/1` (rows 977/978) but **no SNR family**, so neither row is emitted today (an earlier rescue pass spuriously token-matched `rx_power_lane_5`/`cable-proto-cap-ext` — corrected to ABSENT-BLOCKER). If they do not appear through Redfish MetricReports, NMX-T, or gNMI, open a catalog/source-owner follow-up immediately; keep them open until source-owner resolution. ## Series-shape acceptance checks @@ -137,20 +138,28 @@ Only explicit catalog-row mappings are emitted; unknown sources are dropped (deb 1. Capture the distinct `(name, metric_type, key)` tuples from two consecutive `/telemetry` scrapes after collectors are warm. 2. Confirm the tuple set is stable across scrapes except for expected link/error-counter changes. -3. Confirm every emitted series is one of the known families: `hw_sensor`, `switch_nmxt`, `nvue_gnmi` (`interface_*`/`component_*`/`platform_*`), or `fan_max_speed`. No `nvswitch_*`, `source_metric`, or `redfish_telemetry_service` series may appear. +3. 
Confirm every emitted series is one of the known families: `hw_sensor`, `switch_nmxt`, `nvue_gnmi` (`interface_*`/`component_*`/`platform_*`, incl. `platform_{os,bmc,erot}_version_info`), `fan_max_speed`, or `fan_led`. No `nvswitch_*`, `source_metric`, or `redfish_telemetry_service` series may appear. 4. Confirm NMX-T identity labels are the allowlisted `NMXT_LABEL_MAP` set (bounded per port); no raw/unknown source names as labels. Unit coverage that locks this behavior: - NMX-T: `test_nmxt_metric_map_locks_type_and_unit`, `test_unknown_nmxt_sources_not_allowlisted`. -- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_string_leaf_is_not_exported` (string leaves emit nothing). -- NVUE REST: `test_fan_max_speed_emit`. +- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_string_leaf_is_not_exported` (string leaves emit nothing), `test_interface_numeric_leaf_table_mappings` (locks `interface_plr_bw_loss_percent` type/unit), `test_platform_general_version_info_metrics` + `test_platform_general_empty_version_string_is_not_exported` (OS/BMC/EROT version info-metrics), `test_nvue_subscribe_paths_all_enabled` (the `/platform-general/versions` subscribe path is added). +- NVUE REST: `test_fan_max_speed_emit`, `test_fan_led_emit` (green/ok=1, amber=0, absent FAN_STATUS emits nothing) + `test_parse_platform_environment_fan_status`. ## Blocker escalations (Stage 0) Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. -13 rows are escalated below — all ABSENT-BLOCKER (no live source on this platform). No -row is deferred — each has an explicit disposition and a named resolution path. +**16 rows remain ABSENT-BLOCKER** (no live source on this platform) — these are the escalations +in Groups B–D plus the rescue-match audit below. 
No row is deferred — each has an explicit +disposition and a named resolution path. + +The remaining subsections here (temperature, string-valued, and firmware/PLR/fan-LED, all marked +**RESOLVED**) are *not* escalations; they are kept for provenance, recording rows that earlier +passes had escalated but that are now implemented, so the trail from the Stage-0 blocker set down +to the final 16 is auditable. A post-implementation audit on 2026-06-23 moved 3 rows *into* the +blocker set — 870 CPU_CORE_NUMBER, 2294/2295 CABLE-SNR-MEDIA/HOST-LANE-N — that an earlier pass +had token-matched but no lane actually emits (see "Rescue-match audit" below). ### Temperature threshold rows — RESOLVED (21 rows, formerly BLOCKER-THRESHOLD) @@ -226,3 +235,33 @@ These 6 catalog rows are string-valued and were previously escalated; they are n info-metrics (value 1 with the string carried in a label; skipped when empty, so `CONTACT`/`LOCATION` emit only when configured). - `876 ASIC-NAME` — covered by the existing `component_name` label on every component metric (not re-emitted). + +### Firmware-version / PLR / fan-LED rows — RESOLVED (5 rows, now implemented) + +These 5 catalog rows were previously rescue-matched by token but not emitted by any lane; each +now has an explicit, unit-tested emit path: +- `764 OS-VERSION`, `767 BMC-VERSION`, `766 EROT-FW-VERSION` — gNMI now also subscribes + `/platform-general/versions` (sibling of `/state`); the `versions/state/{nos-version, + fw-version-bmc,fw-version-erot}` leaves emit switch-level info-metrics + `platform_{os,bmc,erot}_version_info` (value 1.0, raw version carried in the + `{os,bmc,erot}_version` label; empty strings emit nothing). +- `942 PLR-BW-LOSS-PERCENT` — gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` + added to the numeric interface-leaf allowlist as `interface_plr_bw_loss_percent` (unit `percent`). 
+- `967 FAN-LED` — re-sourced from NVUE REST: the `/nvue_v1/platform/environment` parent summary's + aggregate `FAN_STATUS.state` LED is emitted as switch-level `fan_led` (green/ok = 1.0, any other + state = 0.0, absent = nothing), gated on `platform_environment_status_enabled` (default true). + The catalog's CLI LED path (`nv show platform environment led`) is not used. + +### Rescue-match audit — 3 rows re-classified to ABSENT-BLOCKER (2026-06-23) + +A verification pass over the 8 `RESOLVED-LIVE` rows found 3 that an earlier token-rescue had marked +`implemented` but that **no collector lane actually emits**. They are now ABSENT-BLOCKER: +- **`870 CPU_CORE_NUMBER`** — catalog source is NVOS CLI only (`nv show system cpu`). The rescue + token `core-to-phy-link-width-enabled` is a gNMI link-width *config knob*, not a CPU core count. + No gNMI/NMX-T emit arm exists. Resolution: new CLI collector or NVOS gNMI exposure; escalate to + NVOS owner (same path as `765 OS-KERNEL`). +- **`2294 CABLE-SNR-MEDIA-LANE-N` / `2295 CABLE-SNR-HOST-LANE-N`** — catalog lists no source. NMX-T + has `rx_power_lane_0/1` (power, rows 977/978) but no per-lane **SNR** family; the rescue tokens + (`rx_power_lane_5`, `cable-proto-cap-ext`) do not exist as live SNR sources. No emit arm exists. + Resolution: source-owner follow-up (see "Evidence to capture" step 5); keep open until an NVLink + per-lane SNR source is identified or the rows are declared N/A for NVLink backplane switches. 
diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 5b7aa96fbc..3ebc62e162 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -1,9 +1,9 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition,match_detail 763,NET-FW-VER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -764,OS-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" +764,OS-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/nos-version -> platform_os_version_info 765,OS-KERNEL,source-equivalent required; no new CLI collector by default,ABSENT-BLOCKER,blocker,no live token match (CLI-only) -766,EROT-FW-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" -767,BMC-VERSION,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['fw-version-bmc', 'nos-version'] nmxt~['cable_fw_version', 'FW_Version']" +766,EROT-FW-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/fw-version-erot -> platform_erot_version_info +767,BMC-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/fw-version-bmc -> platform_bmc_version_info 794,LINK-DOWNED-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 795,PORT-MALFORMED-PACKET-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 796,PORT-NEIGHBOR-MTU-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact @@ -35,7 +35,7 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 
867,PORT-LABEL,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 868,REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 869,DEVICE-HARDWARE-REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -870,CPU_CORE_NUMBER,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['core-to-phy-link-width-enabled', 'core-to-phy-link-proto-enabled'] nmxt~['Port_Number', 'sw_serial_number']" +870,CPU_CORE_NUMBER,NVOS CLI only (nv show system cpu); no live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (gnmi core-to-phy-link-width-enabled is a link knob not a CPU core count); not emitted 872,ASIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) 873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) 874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) @@ -98,7 +98,7 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 939,PLR-XMIT-RETRYS-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 940,PLR-SYNC-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 941,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -942,PLR-BW-LOSS-PERCENT,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['plr-xmit-retry-events', 'plr-rcv-code-err'] nmxt~['PlrRcvUncorrectableCode', 'PlrXmitRetryEvents']" +942,PLR-BW-LOSS-PERCENT,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi interfaces/interface/phy-diag/state/plr-bw-loss-percent -> interface_plr_bw_loss_percent (percent) 943,RQ-GENERAL-ERROR,NVOS gNMI explicit 
allowlist,PRESENT,implemented,gnmi exact 944,TIME-TO-LINKS-UP,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 945,STATUS-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family @@ -123,7 +123,7 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 964,SUPPORTED-WIDTH,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 965,VL-CAPABILITIES,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric interface_vl_capabilities_info 966,FAN-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -967,FAN-LED,source-equivalent required; no new CLI collector by default,RESOLVED-LIVE,implemented,"gnmi~['pd-link-speed-enabled', 'phy-hst-link-speed-enabled'] nmxt~[]" +967,FAN-LED,NVUE REST explicit mapping,IMPLEMENTED,implemented,nvue-rest /nvue_v1/platform/environment FAN_STATUS.state -> fan_led (green/ok=1 else 0) 968,CABLE-PART-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 969,CABLE-SERIAL-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 970,CABLE-TRANSMITTER-TECHNOLOGY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family @@ -186,8 +186,8 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 1707,RQ-NUM-LLE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: rq_num_lle 1708,SQ-NUM-WRFE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: sq_num_wrfe 2293,CABLE-OPER-STATUS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/transceiver-diag/state/module-oper-status -2294,CABLE-SNR-MEDIA-LANE-N,live source resolution required,RESOLVED-LIVE,implemented,"gnmi~['cable-proto-cap-ext'] nmxt~['tx_power_lane_3', 'rx_power_lane_5']" -2295,CABLE-SNR-HOST-LANE-N,live source resolution required,RESOLVED-LIVE,implemented,"gnmi~['cable-proto-cap-ext', 'hostname'] nmxt~['tx_power_lane_3', 'rx_power_lane_5']" +2294,CABLE-SNR-MEDIA-LANE-N,no source listed in catalog; no 
live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (NMX-T has rx_power_lane_0/1 but no SNR field); not emitted; source-owner follow-up open +2295,CABLE-SNR-HOST-LANE-N,no source listed in catalog; no live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (NMX-T has rx_power_lane_0/1 but no SNR field); not emitted; source-owner follow-up open 2296,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-lower 2297,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/output-power-lower 2298,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-upper diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index 01d2b88906..82180cb90a 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -17,31 +17,39 @@ Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_ | Disposition | Count | Meaning | |----------------|-------|-----------------------------------------------------------------------| -| implemented | 180 | PRESENT/RESOLVED-LIVE allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded), or covered by an existing label | -| blocker | 13 | ABSENT-BLOCKER — leaf/family not live on this platform | +| implemented | 177 | PRESENT allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded / discovered live source), or covered by an existing label | +| blocker | 16 | ABSENT-BLOCKER — leaf/family not live on this platform | ### final_status breakdown | final_status | Count | |-------------------|-------| 
| PRESENT | 136 | -| RESOLVED-LIVE | 8 | -| IMPLEMENTED | 36 | -| ABSENT-BLOCKER | 13 | +| IMPLEMENTED | 41 | +| ABSENT-BLOCKER | 16 | ## Blocker escalations See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the -full annotated list of 13 rows, grouped by root cause, with resolution path and re-probe +full annotated list of 16 rows, grouped by root cause, with resolution path and re-probe conditions. ## Notes on implemented rows - **PRESENT** rows have an explicit gNMI or NMX-T allowlist mapping confirmed live by the Stage 0 probe. No further work required before merge. -- **RESOLVED-LIVE** rows have no direct catalog-listed source but a live token match was found in - gNMI or NMX-T output. Match tokens are recorded in `match_detail`. These are accepted as - covered; if live validation on a production rig disputes a mapping, re-escalate immediately. +- **IMPLEMENTED via discovered live sources (5 rows)** — these had no direct catalog-listed source + originally (the catalog marked them CLI-only / "resolution required"), but each now has an + explicit, unit-tested emit path; `match_detail` records the concrete live leaf/endpoint: + - 764 OS-VERSION, 767 BMC-VERSION, 766 EROT-FW-VERSION → gNMI `platform-general/versions/state/` + `{nos-version,fw-version-bmc,fw-version-erot}` info-metrics (`platform_os/bmc/erot_version_info`). + - 942 PLR-BW-LOSS-PERCENT → gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` + (`interface_plr_bw_loss_percent`, percent). + - 967 FAN-LED → NVUE REST `/nvue_v1/platform/environment` `FAN_STATUS.state` (`fan_led`, + green/ok=1 else 0). + - Audit note: an earlier pass token-matched 3 further rows (870 CPU_CORE_NUMBER, 2294 + CABLE-SNR-MEDIA-LANE-N, 2295 CABLE-SNR-HOST-LANE-N) on spurious substrings; on verification no + lane emits them, so they were re-classified to ABSENT-BLOCKER (see "Notes on blocker rows"). 
- **IMPLEMENTED** rows are sourced beyond the plain gNMI/NMX-T allowlist: - NVUE REST `/nvue_v1/platform/environment/{fan,temperature}` → MAX-SPEED (894); the 21 temp `*-CRITICAL/MAX/STATE` rows (`.crit`/`.max`/`.state`) and the 8 `*-TEMP-CURRENT` rows @@ -70,3 +78,12 @@ No row is marked "deferred." Every blocker has an explicit escalation dispositio NMX-T/RDMA owner. - **ABSENT-BLOCKER — OS-KERNEL (row 765):** CLI-only, no gNMI or NMX-T token match. Requires a new CLI collector or NVOS gNMI exposure; escalate to NVOS owner. +- **ABSENT-BLOCKER — CPU_CORE_NUMBER (row 870):** CLI-only (`nv show system cpu`); the catalog + lists no gNMI/NMX-T source. A prior pass spuriously token-matched the gNMI link knob + `core-to-phy-link-width-enabled` (a link-width config flag, not a CPU core count); no lane emits + it. Requires a new CLI collector or NVOS gNMI exposure; escalate to NVOS owner. +- **ABSENT-BLOCKER — CABLE-SNR-MEDIA-LANE-N / CABLE-SNR-HOST-LANE-N (rows 2294, 2295):** catalog + lists *no source* for either row. NMX-T exposes `rx_power_lane_0/1` (rows 977/978) but **no SNR + family**; a prior pass spuriously token-matched `rx_power_lane_5`/`cable-proto-cap-ext`. No lane + emits these. Source-owner follow-up is open (see live-validation runbook step 5) — keep open + until an NVLink per-lane SNR source is identified or the rows are declared N/A. 
From f96a1cca4208275980ba204b79bc92736ff4b388 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:59:01 -0400 Subject: [PATCH 10/25] feat(health): exclude high-cardinality free-text labels from the Prometheus sink Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 21 +++---------- crates/health/src/sink/prometheus.rs | 38 +++++++++++++++++++++-- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index c742b9971b..ffbcd4852f 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -189,12 +189,6 @@ logs_state_file = "/tmp/logs_collector_{machine_id}.json" # ============================================================================== # Switch Host Collectors: What data to collect from NVLink Switch Hosts -# -# NMX-T and NVUE (REST + gNMI) are disabled by default in config.rs because -# they are only meaningful for switch host endpoints -# (switch.endpoint_role = "host"). This example enables all three for the -# GB200 NVLink switch-host scenario. Deploy only where the target endpoint is -# a switch host; non-switch hosts should omit these sections entirely. # ============================================================================== [collectors.nmxt] @@ -210,25 +204,18 @@ system_health_enabled = true cluster_apps_enabled = true sdn_partitions_enabled = true interfaces_enabled = true -# Per-sensor temperatures from `/nvue_v1/platform/environment/temperature` -# (current/max/crit in celsius + sensor state). Emits only the fields each -# sensor actually reports. platform_environment_temperature_enabled = true -# Aggregate fan LED state from the `/nvue_v1/platform/environment` parent -# summary (the `FAN_STATUS` entry). Switch-level: "green"/"ok" => 1.0, any -# other state => 0.0. 
platform_environment_status_enabled = true -# NVUE gNMI streaming collector. Disabled by default in code; explicitly -# enabled here for the GB200 NVLink switch-host scenario. Subscribes to +# NVUE gNMI streaming collector, disabled by default. Subscribes to # gNMI SAMPLE paths (components + interfaces) and pushes metrics through -# the DataSink pipeline. PrometheusSink serves the /metrics endpoint; -# OtlpSink (when configured separately) pushes to an OTel Collector. +# the configured sinks. gNMI ON_CHANGE targets system-events [collectors.nvue.gnmi] +# periodic SAMPLE gnmi_port = 9339 sample_interval = "5m" request_timeout = "30s" -# gNMI ON_CHANGE subscription for system events +# streaming ON_CHANGE system_events_enabled = true [collectors.nvue.gnmi.paths] diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index 3822b24d31..3e154b546c 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -22,7 +22,18 @@ use dashmap::DashMap; use super::{CollectorEvent, DataSink, EventContext, MetricSample}; use crate::HealthError; -use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricsManager}; +use crate::metrics::{CollectorRegistry, GaugeMetrics, GaugeReading, MetricLabel, MetricsManager}; + +/// High-cardinality / free-text labels kept for OTLPSink but excluded from PrometheusSink +const PROMETHEUS_EXCLUDED_LABELS: &[&str] = &["status_message"]; + +fn filter_prometheus_labels(labels: &[MetricLabel]) -> Vec<MetricLabel> { + labels + .iter() + .filter(|(key, _)| !PROMETHEUS_EXCLUDED_LABELS.contains(&key.as_ref())) + .cloned() + .collect() +} pub struct PrometheusSink { collector_registry: Arc<CollectorRegistry>, @@ -204,7 +215,7 @@ impl DataSink for PrometheusSink { sample.unit.clone(), sample.value, ) - .with_labels(sample.labels.clone()), + .with_labels(filter_prometheus_labels(&sample.labels)), ); } Err(error) => { @@ -333,4 +344,27 @@ mod tests { assert_eq!(label_value("switch_slot_number"), Some("7")); 
assert_eq!(label_value("switch_tray_index"), Some("3")); } + + // status_message is excluded from Prometheus series; other labels (e.g. port_num) are retained. + #[test] + fn test_filter_prometheus_labels_drops_status_message() { + let labels: Vec<MetricLabel> = vec![ + ( + std::borrow::Cow::Borrowed("status_message"), + "No issue was observed".to_string(), + ), + (std::borrow::Cow::Borrowed("port_num"), "11".to_string()), + ]; + + let filtered = filter_prometheus_labels(&labels); + + assert!( + !filtered.iter().any(|(k, _)| k == "status_message"), + "status_message must be excluded from Prometheus series" + ); + assert!( + filtered.iter().any(|(k, v)| k == "port_num" && v == "11"), + "non-excluded labels must be retained" + ); + } } From 4582d0eb3b01d467bcb46c3052ef1bec46131c5f Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:59:01 -0400 Subject: [PATCH 11/25] refactor(health): struct allowlists, StateSet enum metrics, NMX-T label cardinality fixes Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 835 +++++++++-- .../collectors/nvue/gnmi/sample_processor.rs | 1262 +++++++++++------ .../src/collectors/nvue/rest/collector.rs | 384 +++-- 3 files changed, 1802 insertions(+), 679 deletions(-) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 812681c952..0c84baaa39 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -16,7 +16,7 @@ */ //! This module collects metrics from NMX-T telemetry endpoints on NVLink switches if the service is enabled. -//! Scrapes HTTP on 9352 (default for NMX-T) - NOT A Redfish collector! +//! Scrapes HTTP on 9352 (default for NMX-T) //! //! Mapping is an EXPLICIT, catalog-row allowlist over the live NMX-T Prometheus scrape (see //! `NMXT_METRIC_MAP` and `NMXT_LABEL_MAP`). Each NMX-T source name is either: @@ -27,7 +27,7 @@ //! 
Source names not on either allowlist are skipped and counted only (never sanitized into telemetry). use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use nv_redfish::core::Bmc; @@ -38,124 +38,339 @@ use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; -/// default NMX-T port const NMXT_PORT: u16 = 9352; -/// NMX-T endpoint const NMXT_ENDPOINT: &str = "/xcset/nvlink_domain_telemetry"; /// Producer name for every emitted NMX-T series. Preserved across all mappings so the /// downstream sink keeps a single `switch_nmxt` family. const NMXT_PRODUCER: &str = "switch_nmxt"; -/// Explicit allowlist: live NMX-T Prometheus **family** (numeric series) -> canonical mapping. -/// -/// Tuple is `(nmxt_source_name, metric_type, unit)`. One canonical series per catalog row; the -/// source name is matched verbatim against the scraped line name. Names absent from this table -/// (and from [`NMXT_LABEL_MAP`]) are never exported. Each entry was confirmed live in the GB200 -/// NMX-T scrape (Stage 0). Catalog rows are noted for traceability. -const NMXT_METRIC_MAP: &[(&str, &str, &str)] = &[ - // BER / error counters (existing mappings, retained) - ("Effective_BER", "effective_ber", "ratio"), - ("Symbol_Errors", "symbol_errors", "count"), // row 908 PHY-SYMBOL-ERRORS - ("Link_Down", "link_down", "count"), +/// One NMX-T numeric family -> canonical `switch_nmxt` series; `source` matched verbatim. +#[derive(Debug, PartialEq)] +struct NmxtMetric { + source: &'static str, + metric_type: &'static str, + unit: &'static str, +} + +/// One NMX-T identity/inventory label -> canonical label re-exported on every series. +#[derive(Debug, PartialEq)] +struct NmxtLabel { + source: &'static str, + canonical: &'static str, +} + +/// Explicit family allowlist. 
Names absent here (and from [`NMXT_LABEL_MAP`]) are never exported; +/// each entry was confirmed live in the GB200 NMX-T scrape (Stage 0). Trailing comments name the +/// catalog telemetry parameter. +const NMXT_METRIC_MAP: &[NmxtMetric] = &[ + // BER / error counters + NmxtMetric { + source: "Effective_BER", + metric_type: "effective_ber", + unit: "ratio", + }, + NmxtMetric { + source: "Symbol_Errors", + metric_type: "symbol_errors", + unit: "count", + }, // PHY-SYMBOL-ERRORS + NmxtMetric { + source: "Link_Down", + metric_type: "link_down", + unit: "count", + }, // Identity / inventory numeric families - ("lid", "lid", "id"), // row 865 LID - ("device_hw_rev", "device_hw_rev", "id"), // row 869 DEVICE-HARDWARE-REVISION + NmxtMetric { + source: "lid", + metric_type: "lid", + unit: "id", + }, // LID + NmxtMetric { + source: "device_hw_rev", + metric_type: "device_hw_rev", + unit: "id", + }, // DEVICE-HARDWARE-REVISION // Status / link-down attribution - ("Advanced_Status_Opcode", "status_opcode", "code"), // row 945 STATUS-OPCODE - ("remote_reason_opcode", "remote_reason_opcode", "code"), // row 949 REMOTE-REASON-OPCODE - ("time_to_link_up_ext_msec", "time_to_link_up", "milliseconds"), // row 944 TIME-TO-LINKS-UP + NmxtMetric { + source: "Advanced_Status_Opcode", + metric_type: "status_opcode", + unit: "code", + }, // STATUS-OPCODE + NmxtMetric { + source: "remote_reason_opcode", + metric_type: "remote_reason_opcode", + unit: "code", + }, // REMOTE-REASON-OPCODE + NmxtMetric { + source: "time_to_link_up_ext_msec", + metric_type: "time_to_link_up", + unit: "milliseconds", + }, // TIME-TO-LINKS-UP // Cable optics (numeric families) - ("cable_technology", "cable_transmitter_technology", "code"), // row 970 CABLE-TRANSMITTER-TECHNOLOGY - ("rx_power_lane_0", "cable_rx_power_lane0", "milliwatts"), // row 977 CABLE-RX-POWER-LANE0 - ("rx_power_lane_1", "cable_rx_power_lane1", "milliwatts"), // row 978 CABLE-RX-POWER-LANE1 - ("Module_Voltage", "cable_diag_supply_voltage", 
"volts"), // row 979 CABLE-DIAG-SUPPLY-VOLTAGE + NmxtMetric { + source: "cable_technology", + metric_type: "cable_transmitter_technology", + unit: "code", + }, // CABLE-TRANSMITTER-TECHNOLOGY + NmxtMetric { + source: "rx_power_lane_0", + metric_type: "cable_rx_power_lane0", + unit: "milliwatts", + }, // CABLE-RX-POWER-LANE0 + NmxtMetric { + source: "rx_power_lane_1", + metric_type: "cable_rx_power_lane1", + unit: "milliwatts", + }, // CABLE-RX-POWER-LANE1 + NmxtMetric { + source: "Module_Voltage", + metric_type: "cable_diag_supply_voltage", + unit: "volts", + }, // CABLE-DIAG-SUPPLY-VOLTAGE // Link partner - ("link_partner_lid", "link_partner_lid", "id"), // row 989 LINK-PARTNER-LID + NmxtMetric { + source: "link_partner_lid", + metric_type: "link_partner_lid", + unit: "id", + }, // LINK-PARTNER-LID // Recovery counters / timers - ("successful_recovery_events", "link_recovery_success_cnt", "count"), // row 1688 LINK-RECOVERY-SUCCESS-CNT - ("total_successful_recovery_events", "total_link_recovery_success_cnt", "count"), // row 1689 TOTAL-LINK-RECOVERY-SUCCESS-CNT - ("time_since_last_recovery", "time_since_last_recovery", "seconds"), // row 1690 TIME-SINCE-LAST-RECOVERY - ("time_between_last_2_recoveries", "time_btwn_two_recoveries", "seconds"), // row 1691 TIME-BTWN-TWO-RECOVERIES - ("last_host_logical_recovery_attempts_count", "recovery_attempts_l1_cnt", "count"), // row 1692 RECOVERY-ATTEMPTS-L1-CNT - ("last_host_serdes_feq_attempts_count", "recovery_attempts_l2_cnt", "count"), // row 1693 RECOVERY-ATTEMPTS-L2-CNT - ("time_in_last_host_logical_recovery", "recovery_cycle_duration", "seconds"), // row 1694 RECOVERY-CYCLE-DURATION - ("time_in_last_host_serdes_feq_recovery", "serdes_recovery_cycle_duration", "seconds"), // row 1695 SERDES-RECOVERY-CYCLE-DURATION + NmxtMetric { + source: "successful_recovery_events", + metric_type: "link_recovery_success_cnt", + unit: "count", + }, // LINK-RECOVERY-SUCCESS-CNT + NmxtMetric { + source: 
"total_successful_recovery_events", + metric_type: "total_link_recovery_success_cnt", + unit: "count", + }, // TOTAL-LINK-RECOVERY-SUCCESS-CNT + NmxtMetric { + source: "time_since_last_recovery", + metric_type: "time_since_last_recovery", + unit: "seconds", + }, // TIME-SINCE-LAST-RECOVERY + NmxtMetric { + source: "time_between_last_2_recoveries", + metric_type: "time_btwn_two_recoveries", + unit: "seconds", + }, // TIME-BTWN-TWO-RECOVERIES + NmxtMetric { + source: "last_host_logical_recovery_attempts_count", + metric_type: "recovery_attempts_l1_cnt", + unit: "count", + }, // RECOVERY-ATTEMPTS-L1-CNT + NmxtMetric { + source: "last_host_serdes_feq_attempts_count", + metric_type: "recovery_attempts_l2_cnt", + unit: "count", + }, // RECOVERY-ATTEMPTS-L2-CNT + NmxtMetric { + source: "time_in_last_host_logical_recovery", + metric_type: "recovery_cycle_duration", + unit: "seconds", + }, // RECOVERY-CYCLE-DURATION + NmxtMetric { + source: "time_in_last_host_serdes_feq_recovery", + metric_type: "serdes_recovery_cycle_duration", + unit: "seconds", + }, // SERDES-RECOVERY-CYCLE-DURATION // Contain-and-drain discards - ("contain_n_drain_xmit_discards", "contain_drain_xmit_discard", "count"), // row 1696 CONTAIN-DRAIN-XMIT-DISCARD - ("contain_n_drain_rcv_discards", "contain_drain_rcv_discard", "count"), // row 1697 CONTAIN-DRAIN-RCV-DISCARD + NmxtMetric { + source: "contain_n_drain_xmit_discards", + metric_type: "contain_drain_xmit_discard", + unit: "count", + }, // CONTAIN-DRAIN-XMIT-DISCARD + NmxtMetric { + source: "contain_n_drain_rcv_discards", + metric_type: "contain_drain_rcv_discard", + unit: "count", + }, // CONTAIN-DRAIN-RCV-DISCARD // Raw error lanes - ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), // row 1704 RAW-ERR-LANE-2 - ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), // row 1705 RAW-ERR-LANE-3 + NmxtMetric { + source: "Raw_Errors_Lane_2", + metric_type: "raw_err_lane_2", + unit: "count", + }, // RAW-ERR-LANE-2 + NmxtMetric { + source: "Raw_Errors_Lane_3", 
+ metric_type: "raw_err_lane_3", + unit: "count", + }, // RAW-ERR-LANE-3 // Cable/transceiver fault flags (0/1). Re-sourced from NMX-T: NVLink ports on // the N5400_LD are not modeled as gNMI transceiver components, so the catalog's // gNMI transceiver-diag path is absent live; NMX-T exposes these per active link. - ("tx_cdr_lol", "cable_tx_cdr_lol", "state"), // row 983 CABLE-TX-CDR-LOL - ("rx_cdr_lol", "cable_rx_cdr_lol", "state"), // row 984 CABLE-RX-CDR-LOL - ("tx_los", "cable_tx_los", "state"), // row 985 CABLE-TX-LOS - ("rx_los", "cable_rx_los", "state"), // row 986 CABLE-RX-LOS + NmxtMetric { + source: "tx_cdr_lol", + metric_type: "cable_tx_cdr_lol", + unit: "state", + }, // CABLE-TX-CDR-LOL + NmxtMetric { + source: "rx_cdr_lol", + metric_type: "cable_rx_cdr_lol", + unit: "state", + }, // CABLE-RX-CDR-LOL + NmxtMetric { + source: "tx_los", + metric_type: "cable_tx_los", + unit: "state", + }, // CABLE-TX-LOS + NmxtMetric { + source: "rx_los", + metric_type: "cable_rx_los", + unit: "state", + }, // CABLE-RX-LOS ]; -/// Explicit allowlist: live NMX-T Prometheus **label** key -> canonical label name. -/// -/// These catalog rows are identity/inventory dimensions, not standalone metrics. NMX-T carries -/// them as labels on every series, so they are re-exported as canonical labels on each emitted -/// `switch_nmxt` sample (consistent with the existing `node_guid` / `port_num` handling). They are -/// never emitted as their own metric family. Tuple is `(nmxt_label_key, canonical_label_name)`. -/// Catalog rows are noted for traceability. 
-const NMXT_LABEL_MAP: &[(&str, &str)] = &[ - ("FW_Version", "net_fw_ver"), // row 763 NET-FW-VER - ("sw_serial_number", "serial"), // row 804 SERIAL - ("Node_GUID", "node_guid"), // row 806 NODE-GUID - ("port_guid", "port_guid"), // row 807 PORT-GUID - ("Port_Number", "port_num"), // row 866 PORT-NUMBER - ("port_label", "port_label"), // row 867 PORT-LABEL - ("sw_revision", "revision"), // row 868 REVISION - ("Active_FEC", "fec_mode_active"), // row 898 FEC-MODE-ACTIVE - ("Device_ID", "device_id"), // row 910 DEVICE-ID - ("Status_Message", "status_message"), // row 946 STATUS-MESSAGE - ("down_blame", "down_blame"), // row 947 DOWN-BLAME - ("local_reason_opcode", "local_reason_opcode"), // row 948 LOCAL-REASON-OPCODE - ("Cable_PN", "cable_part_number"), // row 968 CABLE-PART-NUMBER - ("Cable_SN", "cable_serial_number"), // row 969 CABLE-SERIAL-NUMBER - ("cable_type", "cable_type"), // row 971 CABLE-TYPE - ("cable_vendor", "cable_vendor"), // row 972 CABLE-VENDOR - ("cable_length", "cable_length"), // row 973 CABLE-LENGTH - ("cable_identifier", "cable_identifier"), // row 974 CABLE-IDENTIFIER - ("vendor_rev", "cable_rev"), // row 975 CABLE-REV - ("cable_fw_version", "cable_fw_version"), // row 976 CABLE-FW-VERSION - ("Module_Temperature", "cable_temp"), // row 980 CABLE-TEMP - ("link_partner_description", "link_partner_description"), // row 987 LINK-PARTNER-DESCRIPTION - ("link_partner_node_guid", "link_partner_node_guid"), // row 988 LINK-PARTNER-NODE-GUID - ("link_partner_port_num", "link_partner_port_num"), // row 990 LINK-PARTNER-PORT-NUM - ("device_num_on_tray", "device_num"), // row 1698 DEVICE-NUM - ("board_type", "board_type"), // row 1699 BOARD-TYPE - ("chassis_slot_index", "chassis_slot_idx"), // row 1700 CHASSIS-SLOT-IDX - ("tray_index", "tray_idx"), // row 1701 TRAY-IDX - ("topology_id", "topology_id"), // row 1702 TOPOLOGY-ID - ("chassis_id", "chassis_id"), // row 1703 CHASSIS-ID +/// Explicit label allowlist. 
These are identity/inventory dimensions, never standalone metrics: +/// re-exported as canonical labels on every emitted `switch_nmxt` sample. Trailing comments name +/// the catalog telemetry parameter. +const NMXT_LABEL_MAP: &[NmxtLabel] = &[ + NmxtLabel { + source: "FW_Version", + canonical: "net_fw_ver", + }, // NET-FW-VER + NmxtLabel { + source: "sw_serial_number", + canonical: "serial", + }, // SERIAL + NmxtLabel { + source: "Node_GUID", + canonical: "node_guid", + }, // NODE-GUID + NmxtLabel { + source: "port_guid", + canonical: "port_guid", + }, // PORT-GUID + NmxtLabel { + source: "Port_Number", + canonical: "port_num", + }, // PORT-NUMBER + NmxtLabel { + source: "port_label", + canonical: "port_label", + }, // PORT-LABEL + NmxtLabel { + source: "sw_revision", + canonical: "revision", + }, // REVISION + NmxtLabel { + source: "Active_FEC", + canonical: "fec_mode_active", + }, // FEC-MODE-ACTIVE + NmxtLabel { + source: "Device_ID", + canonical: "device_id", + }, // DEVICE-ID + NmxtLabel { + source: "Status_Message", + canonical: "status_message", + }, // STATUS-MESSAGE + NmxtLabel { + source: "local_reason_opcode", + canonical: "local_reason_opcode", + }, // LOCAL-REASON-OPCODE + NmxtLabel { + source: "Cable_PN", + canonical: "cable_part_number", + }, // CABLE-PART-NUMBER + NmxtLabel { + source: "Cable_SN", + canonical: "cable_serial_number", + }, // CABLE-SERIAL-NUMBER + NmxtLabel { + source: "cable_type", + canonical: "cable_type", + }, // CABLE-TYPE + NmxtLabel { + source: "cable_vendor", + canonical: "cable_vendor", + }, // CABLE-VENDOR + NmxtLabel { + source: "cable_length", + canonical: "cable_length", + }, // CABLE-LENGTH + NmxtLabel { + source: "cable_identifier", + canonical: "cable_identifier", + }, // CABLE-IDENTIFIER + NmxtLabel { + source: "vendor_rev", + canonical: "cable_rev", + }, // CABLE-REV + NmxtLabel { + source: "cable_fw_version", + canonical: "cable_fw_version", + }, // CABLE-FW-VERSION + NmxtLabel { + source: 
"link_partner_description", + canonical: "link_partner_description", + }, // LINK-PARTNER-DESCRIPTION + NmxtLabel { + source: "link_partner_node_guid", + canonical: "link_partner_node_guid", + }, // LINK-PARTNER-NODE-GUID + NmxtLabel { + source: "link_partner_port_num", + canonical: "link_partner_port_num", + }, // LINK-PARTNER-PORT-NUM + NmxtLabel { + source: "device_num_on_tray", + canonical: "device_num", + }, // DEVICE-NUM + NmxtLabel { + source: "board_type", + canonical: "board_type", + }, // BOARD-TYPE + NmxtLabel { + source: "chassis_slot_index", + canonical: "chassis_slot_idx", + }, // CHASSIS-SLOT-IDX + NmxtLabel { + source: "tray_index", + canonical: "tray_idx", + }, // TRAY-IDX + NmxtLabel { + source: "topology_id", + canonical: "topology_id", + }, // TOPOLOGY-ID + NmxtLabel { + source: "chassis_id", + canonical: "chassis_id", + }, // CHASSIS-ID ]; -/// Look up a live NMX-T family name in the explicit allowlist, returning `(metric_type, unit)`. -fn lookup_nmxt_metric(name: &str) -> Option<(&'static str, &'static str)> { - NMXT_METRIC_MAP - .iter() - .find(|(source, _, _)| *source == name) - .map(|(_, metric_type, unit)| (*metric_type, *unit)) +fn lookup_nmxt_metric(name: &str) -> Option<&'static NmxtMetric> { + NMXT_METRIC_MAP.iter().find(|m| m.source == name) +} + +/// `Module_Temperature` arrives only as a label value (e.g. `"0C"`), never its own numeric line, +/// so it is parsed here and re-emitted as a gauge. Returns `None` on empty/unparseable (e.g. `"N/A"`). +fn cable_temp_to_celsius(raw: &str) -> Option<f64> { + let trimmed = raw.trim(); + let digits = trimmed.strip_suffix(['C', 'c']).unwrap_or(trimmed).trim(); + digits.parse::<f64>().ok() +} + +/// Closed 3-state enum for `down_blame`, emitted as a StateSet (one 0/1 series per state). +const DOWN_BLAME_STATES: &[&str] = &["unknown", "local_phy", "remote_phy"]; + +/// Maps a raw `down_blame` value to its canonical state, case-insensitively; unknown/empty -> "unknown". 
+fn down_blame_to_state(raw: &str) -> &'static str { + match raw.trim().to_ascii_lowercase().as_str() { + "local_phy" => "local_phy", + "remote_phy" => "remote_phy", + _ => "unknown", + } } -/// Look up a live NMX-T label key in the explicit allowlist, returning the canonical label name. -/// Test-only helper; production re-exports labels by iterating `NMXT_LABEL_MAP` directly in `build_labels`. +/// Test-only; production iterates `NMXT_LABEL_MAP` directly in `build_labels`. #[cfg(test)] -fn lookup_nmxt_label(key: &str) -> Option<&'static str> { - NMXT_LABEL_MAP - .iter() - .find(|(source, _)| *source == key) - .map(|(_, canonical)| *canonical) +fn lookup_nmxt_label(key: &str) -> Option<&'static NmxtLabel> { + NMXT_LABEL_MAP.iter().find(|l| l.source == key) } -/// Prometheus text -> NmxtMetricSample #[derive(Debug, Clone)] struct NmxtMetricSample { name: String, @@ -163,7 +378,6 @@ struct NmxtMetricSample { value: f64, } -/// Parse Prometheus text format metrics from NMX-T endpoint fn parse_prometheus_metrics(body: &str) -> Vec<NmxtMetricSample> { let mut samples = Vec::new(); @@ -181,15 +395,12 @@ fn parse_prometheus_metrics(body: &str) -> Vec<NmxtMetricSample> { samples } -/// Parse a single text line fn parse_prometheus_line(line: &str) -> Option<NmxtMetricSample> { - // find labels let (name_part, rest) = if let Some(brace_pos) = line.find('{') { let name = &line[..brace_pos]; let rest = &line[brace_pos..]; (name, rest) } else { - // no labels let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let name = parts[0]; @@ -226,7 +437,6 @@ fn parse_prometheus_line(line: &str) -> Option<NmxtMetricSample> { }) } -/// scrape nmxt metrics from a single switch async fn scrape_switch_nmxt_metrics( http_client: &reqwest::Client, switch_ip: &str, @@ -260,7 +470,6 @@ pub struct NmxtCollectorConfig { pub data_sink: Option<Arc<dyn DataSink>>, } -/// NMX-T collector for a single switch/endpoint pub struct NmxtCollector { endpoint: Arc<BmcEndpoint>, switch_id: String, @@ -325,24 +534,22 @@ impl NmxtCollector { } } - /// Build the canonical label set 
for one emitted `switch_nmxt` series. - /// - /// Always carries `switch_id` / `switch_ip`. Identity and inventory dimensions are re-exported - /// from the scraped sample only when their NMX-T label key is on the explicit - /// [`NMXT_LABEL_MAP`] allowlist; their canonical names come from that map. Label keys not on - /// the allowlist are dropped (never sanitized into exported labels). + /// Canonical label set for one `switch_nmxt` series. Always carries `switch_id` / `switch_ip`; + /// scraped dimensions are re-exported only when their key is on [`NMXT_LABEL_MAP`], everything + /// else is dropped (never sanitized into exported labels). fn build_labels( &self, switch_ip: &str, sample_labels: &HashMap<String, String>, ) -> Vec<(Cow<'static, str>, String)> { - let mut labels: Vec<(Cow<'static, str>, String)> = Vec::with_capacity(2 + NMXT_LABEL_MAP.len()); + let mut labels: Vec<(Cow<'static, str>, String)> = + Vec::with_capacity(2 + NMXT_LABEL_MAP.len()); labels.push((Cow::Borrowed("switch_id"), self.switch_id.clone())); labels.push((Cow::Borrowed("switch_ip"), switch_ip.to_string())); - for (source_key, canonical) in NMXT_LABEL_MAP { - if let Some(value) = sample_labels.get(*source_key) { - labels.push((Cow::Borrowed(*canonical), value.clone())); + for label in NMXT_LABEL_MAP { + if let Some(value) = sample_labels.get(label.source) { + labels.push((Cow::Borrowed(label.canonical), value.clone())); } } @@ -356,9 +563,12 @@ impl NmxtCollector { self.emit_event(CollectorEvent::MetricCollectionStart); - // Count of scraped families not on the explicit allowlist. These are skipped (never - // sanitized into telemetry) and only reported diagnostically. + // Scraped families off the allowlist: skipped (never sanitized) and only counted. let mut unmapped_families = 0u64; + // Ports already emitted a cable temperature this iteration (one series per port). 
+ let mut cable_temp_ports: HashSet<String> = HashSet::new(); + // Ports already emitted a down_blame StateSet this iteration (one set per port). + let mut down_blame_ports: HashSet<String> = HashSet::new(); for sample in metrics { let NmxtMetricSample { @@ -367,13 +577,68 @@ impl NmxtCollector { value, } = sample; - // Explicit family allowlist: an unknown source name is dropped and counted only. - let Some((metric_type, unit)) = lookup_nmxt_metric(&name) else { + // `Module_Temperature` rides as a label on lines whose family may not be allowlisted, + // so emit it before the family check, once per port. + if let Some(celsius) = sample_labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)) + { + let port_num = sample_labels + .get("Port_Number") + .cloned() + .unwrap_or_default(); + if cable_temp_ports.insert(port_num.clone()) { + let labels = self.build_labels(&switch_ip, &sample_labels); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("cable_temperature_celsius:{}", port_num), + name: NMXT_PRODUCER.to_string(), + metric_type: "cable_temperature_celsius".to_string(), + unit: "celsius".to_string(), + value: celsius, + labels, + context: None, + } + .into(), + )); + } + } + + // `down_blame` is a closed enum riding as a label; emit it as a per-port StateSet + // (one 0/1 series per state) before the family check, once per port. 
+ if let Some(raw) = sample_labels.get("down_blame") { + let port_num = sample_labels + .get("Port_Number") + .cloned() + .unwrap_or_default(); + if down_blame_ports.insert(port_num.clone()) { + let current = down_blame_to_state(raw); + for state in DOWN_BLAME_STATES { + let mut labels = self.build_labels(&switch_ip, &sample_labels); + labels.push((Cow::Borrowed("state"), (*state).to_string())); + self.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("down_blame:{}:{}", port_num, state), + name: NMXT_PRODUCER.to_string(), + metric_type: "down_blame".to_string(), + unit: "state".to_string(), + value: if *state == current { 1.0 } else { 0.0 }, + labels, + context: None, + } + .into(), + )); + } + } + } + + let Some(metric) = lookup_nmxt_metric(&name) else { unmapped_families += 1; continue; }; + let (metric_type, unit) = (metric.metric_type, metric.unit); - // Port number anchors the per-series key; sourced from the explicit label dimension. + // Port number anchors the per-series key. let port_num = sample_labels .get("Port_Number") .cloned() @@ -457,11 +722,9 @@ Link_Down{Port_Number="1"} 5 assert_eq!(samples.len(), 4); } - /// Representative live NMX-T `lid` series carrying the full identity/inventory label set. - /// Mirrors the Stage-0 GB200 scrape (`nmxt-prometheus.txt`). + /// Live NMX-T `lid` series from the Stage-0 GB200 scrape (`nmxt-prometheus.txt`). 
const SAMPLE_LID_LINE: &str = r#"lid{Device_ID="GB100", port_label="GPUP10", logical_state="ACT", device_num_on_tray="2", board_type="3", chassis_slot_index="27", tray_index="17", topology_id="128", chassis_id="1820325172739", Active_FEC="Int_KP4_FEC_PLR", link_partner_description="MF0;sw06:N5400_LD/U1", link_partner_node_guid="0x2c5eab0300b6a900", link_partner_port_num="71", cable_vendor="Other", down_blame="Unknown", local_reason_opcode="No_link_down_indication", Node_GUID="0xe1d04a69816f16bc", node_description="GB100 Nvidia Technologies", Port_Number="11", FW_Version="36.2014.1866", Cable_PN="NA", Cable_SN="NA", cable_type="850 nm VCSEL", cable_length="NA", cable_identifier="Backplane", vendor_rev="NA", cable_fw_version="N/A", Module_Temperature="0C", Status_Message="No issue was observed", port_guid="0xe1d04a69816f16c6", sw_serial_number="MT123", sw_revision="A1", remote_reason_opcode="4"} 3093 1781993954087"#; - // Catalog row -> NMX-T family -> (metric_type, unit). One row per explicit family mapping. 
#[test] fn test_nmxt_metric_map_locks_type_and_unit() { let expected: &[(&str, &str, &str)] = &[ @@ -472,20 +735,36 @@ Link_Down{Port_Number="1"} 5 ("device_hw_rev", "device_hw_rev", "id"), ("Advanced_Status_Opcode", "status_opcode", "code"), ("remote_reason_opcode", "remote_reason_opcode", "code"), - ("time_to_link_up_ext_msec", "time_to_link_up", "milliseconds"), + ( + "time_to_link_up_ext_msec", + "time_to_link_up", + "milliseconds", + ), ("cable_technology", "cable_transmitter_technology", "code"), ("rx_power_lane_0", "cable_rx_power_lane0", "milliwatts"), ("rx_power_lane_1", "cable_rx_power_lane1", "milliwatts"), ("Module_Voltage", "cable_diag_supply_voltage", "volts"), ("link_partner_lid", "link_partner_lid", "id"), - ("successful_recovery_events", "link_recovery_success_cnt", "count"), + ( + "successful_recovery_events", + "link_recovery_success_cnt", + "count", + ), ( "total_successful_recovery_events", "total_link_recovery_success_cnt", "count", ), - ("time_since_last_recovery", "time_since_last_recovery", "seconds"), - ("time_between_last_2_recoveries", "time_btwn_two_recoveries", "seconds"), + ( + "time_since_last_recovery", + "time_since_last_recovery", + "seconds", + ), + ( + "time_between_last_2_recoveries", + "time_btwn_two_recoveries", + "seconds", + ), ( "last_host_logical_recovery_attempts_count", "recovery_attempts_l1_cnt", @@ -496,14 +775,26 @@ Link_Down{Port_Number="1"} 5 "recovery_attempts_l2_cnt", "count", ), - ("time_in_last_host_logical_recovery", "recovery_cycle_duration", "seconds"), + ( + "time_in_last_host_logical_recovery", + "recovery_cycle_duration", + "seconds", + ), ( "time_in_last_host_serdes_feq_recovery", "serdes_recovery_cycle_duration", "seconds", ), - ("contain_n_drain_xmit_discards", "contain_drain_xmit_discard", "count"), - ("contain_n_drain_rcv_discards", "contain_drain_rcv_discard", "count"), + ( + "contain_n_drain_xmit_discards", + "contain_drain_xmit_discard", + "count", + ), + ( + "contain_n_drain_rcv_discards", + 
"contain_drain_rcv_discard", + "count", + ), ("Raw_Errors_Lane_2", "raw_err_lane_2", "count"), ("Raw_Errors_Lane_3", "raw_err_lane_3", "count"), ("tx_cdr_lol", "cable_tx_cdr_lol", "state"), @@ -513,9 +804,11 @@ Link_Down{Port_Number="1"} 5 ]; for (source, metric_type, unit) in expected { + let m = lookup_nmxt_metric(source) + .unwrap_or_else(|| panic!("family `{source}` must be allowlisted")); assert_eq!( - lookup_nmxt_metric(source), - Some((*metric_type, *unit)), + (m.metric_type, m.unit), + (*metric_type, *unit), "family `{source}` must map to ({metric_type}, {unit})" ); } @@ -523,7 +816,6 @@ Link_Down{Port_Number="1"} 5 assert_eq!(NMXT_METRIC_MAP.len(), expected.len()); } - // Catalog identity/inventory row -> NMX-T label key -> canonical label name. #[test] fn test_nmxt_label_map_locks_canonical_names() { let expected: &[(&str, &str)] = &[ @@ -537,7 +829,6 @@ Link_Down{Port_Number="1"} 5 ("Active_FEC", "fec_mode_active"), ("Device_ID", "device_id"), ("Status_Message", "status_message"), - ("down_blame", "down_blame"), ("local_reason_opcode", "local_reason_opcode"), ("Cable_PN", "cable_part_number"), ("Cable_SN", "cable_serial_number"), @@ -547,7 +838,6 @@ Link_Down{Port_Number="1"} 5 ("cable_identifier", "cable_identifier"), ("vendor_rev", "cable_rev"), ("cable_fw_version", "cable_fw_version"), - ("Module_Temperature", "cable_temp"), ("link_partner_description", "link_partner_description"), ("link_partner_node_guid", "link_partner_node_guid"), ("link_partner_port_num", "link_partner_port_num"), @@ -561,7 +851,7 @@ Link_Down{Port_Number="1"} 5 for (key, canonical) in expected { assert_eq!( - lookup_nmxt_label(key), + lookup_nmxt_label(key).map(|l| l.canonical), Some(*canonical), "label `{key}` must map to canonical `{canonical}`" ); @@ -600,31 +890,282 @@ Link_Down{Port_Number="1"} 5 // Resolve canonical labels exactly as build_labels would (allowlist-gated). 
let mut canonical = HashMap::new(); - for (source_key, canonical_name) in NMXT_LABEL_MAP { - if let Some(value) = sample.labels.get(*source_key) { - canonical.insert(*canonical_name, value.clone()); + for label in NMXT_LABEL_MAP { + if let Some(value) = sample.labels.get(label.source) { + canonical.insert(label.canonical, value.clone()); } } - // Identity/inventory rows are present as labels with their canonical names. - assert_eq!(canonical.get("node_guid"), Some(&"0xe1d04a69816f16bc".to_string())); // 806 - assert_eq!(canonical.get("port_guid"), Some(&"0xe1d04a69816f16c6".to_string())); // 807 - assert_eq!(canonical.get("port_num"), Some(&"11".to_string())); // 866 - assert_eq!(canonical.get("port_label"), Some(&"GPUP10".to_string())); // 867 - assert_eq!(canonical.get("net_fw_ver"), Some(&"36.2014.1866".to_string())); // 763 - assert_eq!(canonical.get("serial"), Some(&"MT123".to_string())); // 804 - assert_eq!(canonical.get("revision"), Some(&"A1".to_string())); // 868 - assert_eq!(canonical.get("device_id"), Some(&"GB100".to_string())); // 910 - assert_eq!(canonical.get("fec_mode_active"), Some(&"Int_KP4_FEC_PLR".to_string())); // 898 - assert_eq!(canonical.get("cable_part_number"), Some(&"NA".to_string())); // 968 - assert_eq!(canonical.get("cable_temp"), Some(&"0C".to_string())); // 980 - assert_eq!(canonical.get("chassis_id"), Some(&"1820325172739".to_string())); // 1703 + assert_eq!( + canonical.get("node_guid"), + Some(&"0xe1d04a69816f16bc".to_string()) + ); + assert_eq!( + canonical.get("port_guid"), + Some(&"0xe1d04a69816f16c6".to_string()) + ); + assert_eq!(canonical.get("port_num"), Some(&"11".to_string())); + assert_eq!(canonical.get("port_label"), Some(&"GPUP10".to_string())); + assert_eq!( + canonical.get("net_fw_ver"), + Some(&"36.2014.1866".to_string()) + ); + assert_eq!(canonical.get("serial"), Some(&"MT123".to_string())); + assert_eq!(canonical.get("revision"), Some(&"A1".to_string())); + assert_eq!(canonical.get("device_id"), 
Some(&"GB100".to_string())); + assert_eq!( + canonical.get("fec_mode_active"), + Some(&"Int_KP4_FEC_PLR".to_string()) + ); + assert_eq!(canonical.get("cable_part_number"), Some(&"NA".to_string())); + // Module_Temperature is no longer a re-exported label; it becomes a numeric metric. + assert!(!canonical.contains_key("cable_temp")); + assert_eq!( + sample + .labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)), + Some(0.0) + ); + assert_eq!( + canonical.get("chassis_id"), + Some(&"1820325172739".to_string()) + ); assert_eq!( canonical.get("link_partner_node_guid"), Some(&"0x2c5eab0300b6a900".to_string()) - ); // 988 + ); // node_description is present on the series but NOT allowlisted -> not re-exported. assert!(!canonical.contains_key("node_description")); } + + #[test] + fn test_down_blame_to_state() { + assert_eq!(down_blame_to_state("Unknown"), "unknown"); + assert_eq!(down_blame_to_state("Local_phy"), "local_phy"); + assert_eq!(down_blame_to_state("Remote_phy"), "remote_phy"); + // Case-insensitive. + assert_eq!(down_blame_to_state("LOCAL_PHY"), "local_phy"); + assert_eq!(down_blame_to_state("remote_PHY"), "remote_phy"); + // Unrecognized / empty -> "unknown". + assert_eq!(down_blame_to_state("garbage"), "unknown"); + assert_eq!(down_blame_to_state(""), "unknown"); + } + + // Two scraped lines for the same port both carry down_blame="Remote_phy": exactly three + // down_blame series (one per state) are emitted for that port, remote_phy=1 the rest=0, + // unit "state", and down_blame is NOT a plain identity label on the emitted series. 
+ #[test] + fn test_down_blame_state_set_once_per_port() { + use std::sync::Mutex as StdMutex; + + use crate::endpoint::test_support::{mac, test_endpoint}; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let endpoint = Arc::new(test_endpoint(mac("00:11:22:33:44:55"))); + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let collector = NmxtCollector { + endpoint: endpoint.clone(), + switch_id: "test-switch".to_string(), + http_client: reqwest::Client::new(), + event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), + data_sink: Some(sink.clone()), + }; + + // Two distinct families on the SAME port, both carrying down_blame. + let lines = [ + r#"lid{Port_Number="11", down_blame="Remote_phy"} 3093"#, + r#"Effective_BER{Port_Number="11", down_blame="Remote_phy"} 0"#, + ]; + let switch_ip = endpoint.addr.ip.to_string(); + let mut down_blame_ports: HashSet = HashSet::new(); + for line in lines { + let sample = parse_prometheus_line(line).expect("parse line"); + if let Some(raw) = sample.labels.get("down_blame") { + let port_num = sample + .labels + .get("Port_Number") + .cloned() + .unwrap_or_default(); + if down_blame_ports.insert(port_num.clone()) { + let current = down_blame_to_state(raw); + for state in DOWN_BLAME_STATES { + let mut labels = collector.build_labels(&switch_ip, &sample.labels); + labels.push((Cow::Borrowed("state"), (*state).to_string())); + collector.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("down_blame:{}:{}", port_num, state), + name: NMXT_PRODUCER.to_string(), + metric_type: "down_blame".to_string(), + unit: "state".to_string(), + value: if *state == current { 1.0 } else { 
0.0 }, + labels, + context: None, + } + .into(), + )); + } + } + } + } + + let samples = sink.samples.lock().unwrap(); + let blame_series: Vec<&MetricSample> = samples + .iter() + .filter(|s| s.metric_type == "down_blame") + .collect(); + assert_eq!( + blame_series.len(), + 3, + "exactly one series per state per port per scrape" + ); + + for s in &blame_series { + assert_eq!(s.name, NMXT_PRODUCER); + assert_eq!(s.unit, "state"); + let state = s + .labels + .iter() + .find(|(k, _)| k == "state") + .map(|(_, v)| v.as_str()) + .expect("state label present"); + let expected = if state == "remote_phy" { 1.0 } else { 0.0 }; + assert_eq!(s.value, expected, "state `{state}` value"); + // down_blame must not survive as a plain identity label. + assert!( + !s.labels.iter().any(|(k, _)| k == "down_blame"), + "down_blame must not be a re-exported identity label" + ); + } + } + + #[test] + fn test_cable_temp_to_celsius() { + assert_eq!(cable_temp_to_celsius("0C"), Some(0.0)); + assert_eq!(cable_temp_to_celsius("37C"), Some(37.0)); + assert_eq!(cable_temp_to_celsius("37.5C"), Some(37.5)); + assert_eq!(cable_temp_to_celsius("N/A"), None); + assert_eq!(cable_temp_to_celsius(""), None); + assert_eq!(cable_temp_to_celsius("NA"), None); + } + + // Two scraped lines for the same port both carry Module_Temperature: exactly one + // cable_temperature_celsius series is emitted, with the parsed value and no cable_temp label. 
+ #[test] + fn test_cable_temperature_emit_once_per_port() { + use std::sync::Mutex as StdMutex; + + use crate::endpoint::test_support::{mac, test_endpoint}; + + struct CapturingSink { + samples: StdMutex>, + } + + impl DataSink for CapturingSink { + fn sink_type(&self) -> &'static str { + "capturing_sink" + } + + fn handle_event(&self, _context: &EventContext, event: &CollectorEvent) { + if let CollectorEvent::Metric(sample) = event { + self.samples.lock().unwrap().push((**sample).clone()); + } + } + } + + let endpoint = Arc::new(test_endpoint(mac("00:11:22:33:44:55"))); + let sink = Arc::new(CapturingSink { + samples: StdMutex::new(Vec::new()), + }); + let collector = NmxtCollector { + endpoint: endpoint.clone(), + switch_id: "test-switch".to_string(), + http_client: reqwest::Client::new(), + event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), + data_sink: Some(sink.clone()), + }; + + // Two distinct families on the SAME port, both carrying Module_Temperature. + let lines = [ + r#"lid{Port_Number="11", Module_Temperature="37.5C"} 3093"#, + r#"Effective_BER{Port_Number="11", Module_Temperature="37.5C"} 0"#, + ]; + let switch_ip = endpoint.addr.ip.to_string(); + let mut cable_temp_ports: HashSet = HashSet::new(); + for line in lines { + let sample = parse_prometheus_line(line).expect("parse line"); + if let Some(celsius) = sample + .labels + .get("Module_Temperature") + .and_then(|raw| cable_temp_to_celsius(raw)) + { + let port_num = sample + .labels + .get("Port_Number") + .cloned() + .unwrap_or_default(); + if cable_temp_ports.insert(port_num.clone()) { + let labels = collector.build_labels(&switch_ip, &sample.labels); + collector.emit_event(CollectorEvent::Metric( + MetricSample { + key: format!("cable_temperature_celsius:{}", port_num), + name: NMXT_PRODUCER.to_string(), + metric_type: "cable_temperature_celsius".to_string(), + unit: "celsius".to_string(), + value: celsius, + labels, + context: None, + } + .into(), + )); + } + } + } + + let 
samples = sink.samples.lock().unwrap(); + let temp_series: Vec<&MetricSample> = samples + .iter() + .filter(|s| s.metric_type == "cable_temperature_celsius") + .collect(); + assert_eq!( + temp_series.len(), + 1, + "exactly one series per port per scrape" + ); + + let series = temp_series[0]; + assert_eq!(series.name, NMXT_PRODUCER); + assert_eq!(series.unit, "celsius"); + assert_eq!(series.value, 37.5); + assert_eq!(series.key, "cable_temperature_celsius:11"); + assert!( + !series.labels.iter().any(|(k, _)| k == "cable_temp"), + "identity labels must no longer include cable_temp" + ); + assert!( + series + .labels + .iter() + .any(|(k, v)| k == "port_num" && v == "11"), + "identity labels still carry port_num" + ); + } } diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 77f3be627f..944ae601c5 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -27,7 +27,6 @@ use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; pub(crate) const NVUE_GNMI_SAMPLE_STREAM_ID: &str = "nvue_gnmi"; -/// process NVUE gNMI SAMPLE notifications and emit them as `CollectorEvent::Metric` pub(crate) struct GnmiSampleProcessor { pub(crate) data_sink: Option>, pub(crate) event_context: EventContext, @@ -99,8 +98,7 @@ impl GnmiSampleProcessor { entities.insert(("component", comp)); self.process_component_metric(&combined, comp, val); } else if combined.iter().any(|e| e.name == "platform-general") { - // switch-level singleton: no interface/component name key. Count - // it as a single entity so monitored_entities stays accurate. + // switch-level singleton: no name key, counted as one entity. 
entities.insert(("platform-general", "")); self.process_platform_general_metric(&combined, val); } @@ -115,29 +113,43 @@ impl GnmiSampleProcessor { iface_name: &str, val: &proto::TypedValue, ) { - // Explicit per-leaf canonical mappings for `/interfaces/interface`. Each - // arm is an allowlisted GB200 NVOS gNMI leaf proven live in the Stage-0 - // probe. Unknown leaves fall through and are never exported. + // Allowlisted `/interfaces/interface` leaves (live in the Stage-0 probe); + // unknown leaves fall through and are never exported. if leaf_matches(elems, &["state", "oper-status"]) { - let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); - self.emit_iface("interface_oper_status", iface_name, v, "state"); + let current = oper_status_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_oper_status", + "interface_name", + iface_name, + current, + OPER_STATUS_STATES, + ); } else if let Some(metric_type) = numeric_interface_leaf(elems) { - // numeric counters, gauges, and BER ratios share the same numeric - // coercion; the matched leaf decides the canonical metric_type/unit. 
match typed_value_to_f64(val) { Some(v) => self.emit_iface(metric_type.name, iface_name, v, metric_type.unit), None => debug_unmapped_value(elems, val, metric_type.name), } } else if leaf_matches(elems, &["infiniband", "state", "physical-port-state"]) { - let v = physical_port_state_to_f64(typed_value_to_string(val).as_deref()); - self.emit_iface("interface_physical_port_state", iface_name, v, "state"); + let current = physical_port_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_physical_port_state", + "interface_name", + iface_name, + current, + PHYSICAL_PORT_STATES, + ); } else if leaf_matches(elems, &["infiniband", "state", "logical-port-state"]) { - let v = logical_port_state_to_f64(typed_value_to_string(val).as_deref()); - self.emit_iface("interface_logical_port_state", iface_name, v, "state"); + let current = logical_port_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_logical_port_state", + "interface_name", + iface_name, + current, + LOGICAL_PORT_STATES, + ); } else if leaf_matches(elems, &["infiniband", "state", "speed"]) { - // NVOS types speed as a string/enum, but live GB200 emits bare - // numeric Gbps ("400", "100", "0"). Parse via the string path and - // normalize to Gbps; unparseable forms (e.g. "hdr") emit nothing. + // NVOS types speed as a string/enum but live GB200 emits bare numeric Gbps; + // unparseable forms (e.g. "hdr") emit nothing. match link_speed_to_gbps(typed_value_to_string(val).as_deref()) { Some(v) => self.emit_iface("interface_link_speed_active", iface_name, v, "gbps"), None => debug_unmapped_value(elems, val, "interface_link_speed_active"), @@ -153,15 +165,17 @@ impl GnmiSampleProcessor { None => debug_unmapped_value(elems, val, "interface_supported_width"), } } else if leaf_matches(elems, &["phy-diag", "state", "phy-manager-state"]) { - // PHY-MANAGER-STATE (row 961): a dynamic PHY FSM string. 
Enum-code it - // rather than carry it as an info label (the value changes over time). - let v = phy_manager_state_to_f64(typed_value_to_string(val).as_deref()); - self.emit_iface("interface_phy_manager_state", iface_name, v, "state"); + // dynamic PHY FSM string: emit as a StateSet, not an info label. + let current = phy_manager_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "interface_phy_manager_state", + "interface_name", + iface_name, + current, + PHY_MANAGER_STATES, + ); } else if leaf_matches(elems, &["infiniband", "state", "vl-capabilities"]) { - // VL-CAPABILITIES (row 965): a stable capability string (e.g. - // "VL0-VL7"). Surface it as an info-metric: a constant 1.0 sample - // whose information lives in the `vl_capabilities` label. Empty - // strings carry no information and emit nothing. + // stable capability string surfaced as an info-metric; empty emits nothing. if let Some(caps) = typed_value_to_string(val).filter(|s| !s.is_empty()) { self.emit_iface_info( "interface_vl_capabilities_info", @@ -173,7 +187,6 @@ impl GnmiSampleProcessor { } } - /// emit a `/interfaces/interface` canonical series keyed on `interface_name` fn emit_iface(&self, metric_type: &str, iface_name: &str, value: f64, unit: &str) { self.emit_data_metric( metric_type, @@ -185,9 +198,7 @@ impl GnmiSampleProcessor { ); } - /// emit a per-interface info-metric: a constant `1.0` sample whose - /// information is carried by an extra string label alongside the - /// `interface_name` label. Used for stable interface capability strings. + /// per-interface info-metric: constant `1.0` sample with a string label beside `interface_name`. fn emit_iface_info( &self, metric_type: &str, @@ -227,25 +238,41 @@ impl GnmiSampleProcessor { comp_name: &str, val: &proto::TypedValue, ) { - // Explicit per-leaf canonical mappings for `/components/component`. The - // `component_name` label (e.g. 
"ASIC1", "FAN1/1", "cpu") distinguishes - // catalog rows that share a leaf (FAN-STATE and CPU-STATE both resolve + // Allowlisted `/components/component` leaves; the `component_name` label + // distinguishes rows that share a leaf (FAN-STATE and CPU-STATE both resolve // to `state/oper-status`). Unknown leaves are never exported. if leaf_matches(elems, &["healthz", "state", "status"]) { - let v = component_health_to_f64(typed_value_to_string(val).as_deref()); - self.emit_comp("component_health_status", comp_name, v, "state"); + let current = component_health_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "component_health_status", + "component_name", + comp_name, + current, + COMPONENT_HEALTH_STATES, + ); } else if leaf_matches(elems, &["state", "temperature", "instant"]) && let Some(v) = typed_value_to_f64(val) { self.emit_comp("component_temperature_celsius", comp_name, v, "celsius"); } else if leaf_matches(elems, &["state", "oper-status"]) { // FAN-STATE (row 966) and CPU-STATE (row 1174) share this leaf. - let v = oper_status_to_f64(typed_value_to_string(val).as_deref()); - self.emit_comp("component_oper_status", comp_name, v, "state"); + let current = oper_status_to_state(typed_value_to_string(val).as_deref()); + self.emit_state_set( + "component_oper_status", + "component_name", + comp_name, + current, + OPER_STATUS_STATES, + ); } else if leaf_matches(elems, &["asic", "state", "asic-temp"]) && let Some(v) = typed_value_to_f64(val) { - self.emit_comp("component_asic_temperature_celsius", comp_name, v, "celsius"); + self.emit_comp( + "component_asic_temperature_celsius", + comp_name, + v, + "celsius", + ); } else if leaf_matches(elems, &["cpu", "utilization", "state", "avg"]) && let Some(v) = typed_value_to_f64(val) { @@ -256,7 +283,6 @@ impl GnmiSampleProcessor { // component metric, so a dedicated series would be redundant. 
} - /// emit a `/components/component` canonical series keyed on `component_name` fn emit_comp(&self, metric_type: &str, comp_name: &str, value: f64, unit: &str) { self.emit_data_metric( metric_type, @@ -325,9 +351,7 @@ impl GnmiSampleProcessor { } } - /// emit a switch-level singleton series. Unlike interface/component series - /// there is no per-entity name; endpoint identity is added by PrometheusSink - /// from EventContext. + /// switch-level singleton series: no per-entity name, endpoint identity added by PrometheusSink. fn emit_switch(&self, metric_type: &str, value: f64, unit: &str) { let Some(sink) = &self.data_sink else { return }; @@ -345,9 +369,7 @@ impl GnmiSampleProcessor { ); } - /// emit a switch-level singleton info-metric: a constant `1.0` sample whose - /// information is carried by a single string label. Like `emit_switch`, - /// endpoint identity is added by PrometheusSink from EventContext. + /// switch-level info-metric: constant `1.0` sample carrying a single string label. fn emit_switch_info( &self, metric_type: &str, @@ -356,10 +378,7 @@ impl GnmiSampleProcessor { ) { let Some(sink) = &self.data_sink else { return }; - let labels = vec![( - Cow::Borrowed(info_label_name), - info_label_value.to_string(), - )]; + let labels = vec![(Cow::Borrowed(info_label_name), info_label_value.to_string())]; sink.handle_event( &self.event_context, @@ -391,8 +410,7 @@ impl GnmiSampleProcessor { key.push(':'); key.push_str(entity_id); - // only the domain-specific entity label; endpoint identity (ip, mac, - // serial_number, collector_type) is added by PrometheusSink from EventContext + // only the entity label; endpoint identity is added by PrometheusSink from EventContext. let labels = vec![( Cow::Borrowed(entity_label_name), entity_label_value.to_string(), @@ -411,6 +429,48 @@ impl GnmiSampleProcessor { })), ); } + + /// OpenMetrics StateSet: one `0.0`/`1.0` series per state (current == 1.0), with a `state` + /// label. 
The fan-out works for both sinks since OTLP has no native StateSet type. Unit "state". + fn emit_state_set( + &self, + metric_type: &str, + entity_label_name: &'static str, + entity_id: &str, + current_state: &str, + all_states: &[&'static str], + ) { + let Some(sink) = &self.data_sink else { return }; + + for state in all_states { + let mut key = + String::with_capacity(metric_type.len() + 1 + entity_id.len() + 1 + state.len()); + key.push_str(metric_type); + key.push(':'); + key.push_str(entity_id); + key.push(':'); + key.push_str(state); + + // only the entity + state labels; endpoint identity is added by PrometheusSink. + let labels = vec![ + (Cow::Borrowed(entity_label_name), entity_id.to_string()), + (Cow::Borrowed("state"), state.to_string()), + ]; + + sink.handle_event( + &self.event_context, + &CollectorEvent::Metric(Box::new(MetricSample { + key, + name: NVUE_GNMI_SAMPLE_STREAM_ID.to_string(), + metric_type: metric_type.to_string(), + unit: "state".to_string(), + value: if *state == current_state { 1.0 } else { 0.0 }, + labels, + context: None, + })), + ); + } + } } fn find_elem_key_ref<'a>( @@ -435,7 +495,14 @@ fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { .all(|(elem, name)| elem.name == *name) } -/// canonical (`metric_type`, `unit`) for an allowlisted numeric interface leaf +/// One numeric `/interfaces/interface` leaf mapping: path tail -> metric_type + unit. +struct NumericLeafMapping { + tail: &'static [&'static str], + name: &'static str, + unit: &'static str, +} + +/// A resolved numeric leaf: the metric_type + unit to emit. struct NumericLeaf { name: &'static str, unit: &'static str, @@ -446,281 +513,316 @@ struct NumericLeaf { /// expected leaf path tail is matched against the live gNMI tree. Leaves not in /// this table are never exported as metrics. 
fn numeric_interface_leaf(elems: &[&PathElem]) -> Option<NumericLeaf> { - // (leaf path tail, metric_type, unit) - const TABLE: &[(&[&str], &str, &str)] = &[ + const TABLE: &[NumericLeafMapping] = &[ // OpenConfig interface counters (`/state/counters/*`) - ( - &["state", "counters", "in-errors"], - "interface_in_errors", - "count", - ), - ( - &["state", "counters", "out-errors"], - "interface_out_errors", - "count", - ), - ( - &["state", "counters", "out-discards"], - "interface_out_discards", - "count", - ), - ( - &["state", "counters", "in-octets"], - "interface_in_octets", - "bytes", - ), - ( - &["state", "counters", "out-octets"], - "interface_out_octets", - "bytes", - ), - ( - &["state", "counters", "in-pkts"], - "interface_in_packets", - "count", - ), - ( - &["state", "counters", "out-pkts"], - "interface_out_packets", - "count", - ), + NumericLeafMapping { + tail: &["state", "counters", "in-errors"], + name: "interface_in_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-errors"], + name: "interface_out_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-discards"], + name: "interface_out_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "in-octets"], + name: "interface_in_octets", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-octets"], + name: "interface_out_octets", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["state", "counters", "in-pkts"], + name: "interface_in_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["state", "counters", "out-pkts"], + name: "interface_out_packets", + unit: "count", + }, // InfiniBand port counters (`/infiniband/state/counters/port/*`) - ( - &["infiniband", "state", "counters", "port", "link-downed"], - "interface_link_downed", - "count", - ), - ( - &[ + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "link-downed"], + name:
"interface_link_downed", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "link-error-recovery", ], - "interface_link_error_recovery", - "count", - ), - ( - &[ + name: "interface_link_error_recovery", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "rcv-remote-phy-errors", ], - "interface_rcv_remote_physical_errors", - "count", - ), - ( - &[ + name: "interface_rcv_remote_physical_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "rcv-switch-relay-errors", ], - "interface_rcv_switch_relay_errors", - "count", - ), - ( - &[ + name: "interface_rcv_switch_relay_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "rcv-constraints-errors", ], - "interface_rcv_constraint_errors", - "count", - ), - ( - &[ + name: "interface_rcv_constraint_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "local-link-integrity-errors", ], - "interface_local_link_integrity_errors", - "count", - ), - ( - &[ + name: "interface_local_link_integrity_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &[ "infiniband", "state", "counters", "port", "excessive-buffer-overrun", ], - "interface_port_buffer_overrun_errors", - "count", - ), - ( - &["infiniband", "state", "counters", "port", "qp1-dropped"], - "interface_qp1_dropped", - "count", - ), - ( - &["infiniband", "state", "counters", "port", "vl15-dropped"], - "interface_vl15_dropped", - "count", - ), - ( - &["infiniband", "state", "counters", "port", "xmit-wait"], - "interface_port_xmit_wait", - "count", - ), + name: "interface_port_buffer_overrun_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "qp1-dropped"], + name: "interface_qp1_dropped", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", 
"state", "counters", "port", "vl15-dropped"], + name: "interface_vl15_dropped", + unit: "count", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "counters", "port", "xmit-wait"], + name: "interface_port_xmit_wait", + unit: "count", + }, // NOTE: `infiniband/state/speed` is intentionally NOT in this numeric // table. NVOS types it as a string/enum and the live GB200 form is a // bare Gbps numeric; it is handled by a dedicated `link_speed_to_gbps` // arm in `process_interface_metric` that emits unit `gbps`. - (&["infiniband", "state", "mtu"], "interface_mtu", "bytes"), - ( - &["infiniband", "state", "max-supported-mtus"], - "interface_max_supported_mtu", - "bytes", - ), + NumericLeafMapping { + tail: &["infiniband", "state", "mtu"], + name: "interface_mtu", + unit: "bytes", + }, + NumericLeafMapping { + tail: &["infiniband", "state", "max-supported-mtus"], + name: "interface_max_supported_mtu", + unit: "bytes", + }, // phy-diag counters and ratios (`/phy-diag/state/*`) - (&["phy-diag", "state", "raw-ber"], "interface_raw_ber", "ratio"), - ( - &["phy-diag", "state", "effective-ber"], - "interface_effective_ber", - "ratio", - ), - (&["phy-diag", "state", "symbol-ber"], "interface_symbol_ber", "ratio"), - (&["phy-diag", "state", "raw-ber-ch-1"], "interface_raw_ber_lane0", "ratio"), - (&["phy-diag", "state", "raw-ber-ch-2"], "interface_raw_ber_lane1", "ratio"), - ( - &["phy-diag", "state", "raw-errors-ch-1"], - "interface_phy_raw_errors_lane0", - "count", - ), - ( - &["phy-diag", "state", "raw-errors-ch-2"], - "interface_phy_raw_errors_lane1", - "count", - ), - ( - &["phy-diag", "state", "effective-errors"], - "interface_phy_effective_errors", - "count", - ), - (&["phy-diag", "state", "zero-hist"], "interface_zero_hist", "count"), - ( - &["phy-diag", "state", "phy-received-bits"], - "interface_phy_received_bits", - "count", - ), - ( - &["phy-diag", "state", "port-malformed-packet-errors"], - "interface_port_malformed_packet_errors", - "count", - ), - ( - 
&["phy-diag", "state", "port-neighbor-mtu-discards"], - "interface_port_neighbor_mtu_discards", - "count", - ), - ( - &["phy-diag", "state", "port-multi-cast-rcv-pkts"], - "interface_port_multicast_rcv_packets", - "count", - ), - ( - &["phy-diag", "state", "port-multi-cast-xmit-pkts"], - "interface_port_multicast_xmit_packets", - "count", - ), - ( - &["phy-diag", "state", "port-uni-cast-rcv-pkts"], - "interface_port_unicast_rcv_packets", - "count", - ), - ( - &["phy-diag", "state", "port-uni-cast-xmit-pkts"], - "interface_port_unicast_xmit_packets", - "count", - ), - ( - &["phy-diag", "state", "port-local-physical-errors"], - "interface_port_local_physical_errors", - "count", - ), - ( - &["phy-diag", "state", "sync-header-error-counter"], - "interface_sync_header_error_counter", - "count", - ), - ( - &["phy-diag", "state", "port-dlid-mapping-errors"], - "interface_port_dlid_mapping_errors", - "count", - ), - ( - &["phy-diag", "state", "port-vl-mapping-errors"], - "interface_port_vl_mapping_errors", - "count", - ), - ( - &["phy-diag", "state", "port-looping-errors"], - "interface_port_looping_errors", - "count", - ), - ( - &["phy-diag", "state", "port-inactive-discards"], - "interface_port_inactive_discards", - "count", - ), - ( - &["phy-diag", "state", "rq-general-error"], - "interface_rq_general_error", - "count", - ), - (&["phy-diag", "state", "plr-rcv-codes"], "interface_plr_rcv_codes", "count"), - ( - &["phy-diag", "state", "plr-rcv-code-err"], - "interface_plr_rcv_codes_err", - "count", - ), - ( - &["phy-diag", "state", "plr-rcv-uncorrectable-code"], - "interface_plr_rcv_uncorrectables_code", - "count", - ), - (&["phy-diag", "state", "plr-xmit-codes"], "interface_plr_xmit_codes", "count"), - ( - &["phy-diag", "state", "plr-xmit-retry-codes"], - "interface_plr_xmit_retrys_codes", - "count", - ), - ( - &["phy-diag", "state", "plr-xmit-retry-events"], - "interface_plr_xmit_retrys_events", - "count", - ), - ( - &["phy-diag", "state", "plr-sync-events"], - 
"interface_plr_sync_events", - "count", - ), - ( - &["phy-diag", "state", "plr-xmit-retry-events-within-t-sec-max"], - "interface_plr_xmit_retry_codes_within_minute", - "count", - ), - ( - &["phy-diag", "state", "plr-bw-loss-percent"], - "interface_plr_bw_loss_percent", - "percent", - ), + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber"], + name: "interface_raw_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "effective-ber"], + name: "interface_effective_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "symbol-ber"], + name: "interface_symbol_ber", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber-ch-1"], + name: "interface_raw_ber_lane0", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-ber-ch-2"], + name: "interface_raw_ber_lane1", + unit: "ratio", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-errors-ch-1"], + name: "interface_phy_raw_errors_lane0", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "raw-errors-ch-2"], + name: "interface_phy_raw_errors_lane1", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "effective-errors"], + name: "interface_phy_effective_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "zero-hist"], + name: "interface_zero_hist", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "phy-received-bits"], + name: "interface_phy_received_bits", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-malformed-packet-errors"], + name: "interface_port_malformed_packet_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-neighbor-mtu-discards"], + name: "interface_port_neighbor_mtu_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-multi-cast-rcv-pkts"], + name: 
"interface_port_multicast_rcv_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-multi-cast-xmit-pkts"], + name: "interface_port_multicast_xmit_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-uni-cast-rcv-pkts"], + name: "interface_port_unicast_rcv_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-uni-cast-xmit-pkts"], + name: "interface_port_unicast_xmit_packets", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-local-physical-errors"], + name: "interface_port_local_physical_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "sync-header-error-counter"], + name: "interface_sync_header_error_counter", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-dlid-mapping-errors"], + name: "interface_port_dlid_mapping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-vl-mapping-errors"], + name: "interface_port_vl_mapping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-looping-errors"], + name: "interface_port_looping_errors", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "port-inactive-discards"], + name: "interface_port_inactive_discards", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "rq-general-error"], + name: "interface_rq_general_error", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-codes"], + name: "interface_plr_rcv_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-code-err"], + name: "interface_plr_rcv_codes_err", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-rcv-uncorrectable-code"], + name: "interface_plr_rcv_uncorrectables_code", + unit: "count", + }, + NumericLeafMapping { + tail: 
&["phy-diag", "state", "plr-xmit-codes"], + name: "interface_plr_xmit_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-xmit-retry-codes"], + name: "interface_plr_xmit_retrys_codes", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-xmit-retry-events"], + name: "interface_plr_xmit_retrys_events", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-sync-events"], + name: "interface_plr_sync_events", + unit: "count", + }, + NumericLeafMapping { + tail: &[ + "phy-diag", + "state", + "plr-xmit-retry-events-within-t-sec-max", + ], + name: "interface_plr_xmit_retry_codes_within_minute", + unit: "count", + }, + NumericLeafMapping { + tail: &["phy-diag", "state", "plr-bw-loss-percent"], + name: "interface_plr_bw_loss_percent", + unit: "percent", + }, // existing pre-branch mapping retained (leaf out of GB200 row set but // restored upstream; kept so the canonical series is not dropped) - ( - &["phy-diag", "state", "unintentional-link-down-events"], - "interface_link_down_events", - "count", - ), + NumericLeafMapping { + tail: &["phy-diag", "state", "unintentional-link-down-events"], + name: "interface_link_down_events", + unit: "count", + }, ]; // FEC histogram bins 0..=15 -> interface_fec_hist_{n} (rows 911..926) @@ -736,8 +838,11 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option<NumericLeaf> { }); } - TABLE.iter().find_map(|&(tail, name, unit)| { - leaf_matches(elems, tail).then_some(NumericLeaf { name, unit }) + TABLE.iter().find_map(|m| { + leaf_matches(elems, m.tail).then_some(NumericLeaf { + name: m.name, + unit: m.unit, + }) }) } @@ -762,49 +867,60 @@ const FEC_HIST_NAMES: [&str; 16] = [ "interface_fec_hist_15", ]; -fn oper_status_to_f64(status: Option<&str>) -> f64 { +const OPER_STATUS_STATES: &[&str] = &["up", "down"]; + +/// oper-status string -> current StateSet state. "up" when the source reads +/// "up" or "active" (case-insensitive), else "down".
Used for both +/// `interface_oper_status` and `component_oper_status`. +fn oper_status_to_state(status: Option<&str>) -> &'static str { match status { - Some(s) if s.eq_ignore_ascii_case("up") => 1.0, - Some(s) if s.eq_ignore_ascii_case("active") => 1.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("up") || s.eq_ignore_ascii_case("active") => "up", + _ => "down", } } -/// InfiniBand physical port state enum -> numeric code. Values observed live on -/// GB200: `LINK_UP`, `POLLING`, `PORT_CONFIGURATION_TRAINING`. 1.0 == link up. -fn physical_port_state_to_f64(state: Option<&str>) -> f64 { +const PHYSICAL_PORT_STATES: &[&str] = &["up", "down"]; + +/// InfiniBand physical port state enum -> current StateSet state. Values +/// observed live on GB200: `LINK_UP`, `POLLING`, `PORT_CONFIGURATION_TRAINING`. +/// Binary: "up" only when the link is up; polling/training/everything-else is +/// "down". +fn physical_port_to_state(state: Option<&str>) -> &'static str { match state { - Some(s) if s.eq_ignore_ascii_case("link_up") => 1.0, - Some(s) if s.eq_ignore_ascii_case("polling") => 2.0, - Some(s) if s.eq_ignore_ascii_case("port_configuration_training") => 3.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("link_up") => "up", + _ => "down", } } -/// PHY manager FSM state string -> numeric code. The PHY manager reports a -/// dynamic FSM label (e.g. "Active_or_Linkup", "Disabled"); a substring match -/// is used because the exact tokens vary. 1.0 == the PHY is active or linked -/// up, 0.0 otherwise (including empty/None). Mirrors `physical_port_state_to_f64`. -fn phy_manager_state_to_f64(state: Option<&str>) -> f64 { +const PHY_MANAGER_STATES: &[&str] = &["up", "down"]; + +/// PHY manager FSM state string -> current StateSet state. The PHY manager +/// reports a dynamic FSM label (e.g. 
"Active_or_Linkup", "Disabled"), so we +/// match the `active`/`linkup` tokens on word boundaries -- a bare substring +/// check would also match "Inactive"/"Deactivated" and falsely report a down +/// PHY as up. +fn phy_manager_to_state(state: Option<&str>) -> &'static str { match state { - Some(s) => { - let lower = s.to_ascii_lowercase(); - if lower.contains("active") || lower.contains("linkup") { - 1.0 - } else { - 0.0 - } + Some(s) + if s.split(|c: char| !c.is_ascii_alphanumeric()).any(|tok| { + tok.eq_ignore_ascii_case("active") || tok.eq_ignore_ascii_case("linkup") + }) => + { + "up" } - None => 0.0, + _ => "down", } } -/// InfiniBand logical port state enum -> numeric code. Values observed live on -/// GB200: `ACTIVE`, `DOWN`. 1.0 == active. -fn logical_port_state_to_f64(state: Option<&str>) -> f64 { +const LOGICAL_PORT_STATES: &[&str] = &["active", "down"]; + +/// InfiniBand logical port state enum -> current StateSet state. Values +/// observed live on GB200: `ACTIVE`, `DOWN`. "active" when the source reads +/// "active" (case-insensitive), else "down". +fn logical_port_to_state(state: Option<&str>) -> &'static str { match state { - Some(s) if s.eq_ignore_ascii_case("active") => 1.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("active") => "active", + _ => "down", } } @@ -878,11 +994,15 @@ fn leaf_path(elems: &[&PathElem]) -> String { .join("/") } -fn component_health_to_f64(status: Option<&str>) -> f64 { +const COMPONENT_HEALTH_STATES: &[&str] = &["healthy", "unhealthy", "unknown"]; + +/// component healthz status -> current StateSet state. "healthy"/"unhealthy" +/// by case-insensitive match, anything else (including absent) "unknown". 
+fn component_health_to_state(status: Option<&str>) -> &'static str { match status { - Some(s) if s.eq_ignore_ascii_case("healthy") => 1.0, - Some(s) if s.eq_ignore_ascii_case("unhealthy") => 2.0, - _ => 0.0, + Some(s) if s.eq_ignore_ascii_case("healthy") => "healthy", + Some(s) if s.eq_ignore_ascii_case("unhealthy") => "unhealthy", + _ => "unknown", } } @@ -962,18 +1082,21 @@ mod tests { #[test] fn test_oper_status_mapping() { - assert_eq!(oper_status_to_f64(Some("UP")), 1.0); - assert_eq!(oper_status_to_f64(Some("up")), 1.0); - assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); - assert_eq!(oper_status_to_f64(None), 0.0); + assert_eq!(oper_status_to_state(Some("UP")), "up"); + assert_eq!(oper_status_to_state(Some("up")), "up"); + assert_eq!(oper_status_to_state(Some("DOWN")), "down"); + assert_eq!(oper_status_to_state(None), "down"); } #[test] fn test_component_health_mapping() { - assert_eq!(component_health_to_f64(Some("healthy")), 1.0); - assert_eq!(component_health_to_f64(Some("HEALTHY")), 1.0); - assert_eq!(component_health_to_f64(Some("unhealthy")), 2.0); - assert_eq!(component_health_to_f64(None), 0.0); + assert_eq!(component_health_to_state(Some("healthy")), "healthy"); + assert_eq!(component_health_to_state(Some("HEALTHY")), "healthy"); + assert_eq!(component_health_to_state(Some("unhealthy")), "unhealthy"); + assert_eq!(component_health_to_state(Some("UNHEALTHY")), "unhealthy"); + // unrecognized / absent => "unknown" + assert_eq!(component_health_to_state(Some("weird")), "unknown"); + assert_eq!(component_health_to_state(None), "unknown"); } fn make_path_elem(name: &str, keys: &[(&str, &str)]) -> PathElem { @@ -1121,13 +1244,16 @@ mod tests { assert_eq!(count, 1); let events = sink.events.lock().expect("lock poisoned"); - assert_eq!(events.len(), 1); - let (context, event) = &events[0]; - assert_eq!(context.switch_id(), Some(switch_id)); - assert_eq!(context.switch_slot_number(), Some(7)); - assert_eq!(context.switch_tray_index(), Some(3)); - 
assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); - assert!(matches!(event, CollectorEvent::Metric(_))); + // oper-status is a StateSet: one 0/1 series per state ("up"/"down"). + assert_eq!(events.len(), OPER_STATUS_STATES.len()); + // every emitted series preserves the switch-position context. + for (context, event) in events.iter() { + assert_eq!(context.switch_id(), Some(switch_id)); + assert_eq!(context.switch_slot_number(), Some(7)); + assert_eq!(context.switch_tray_index(), Some(3)); + assert_eq!(context.rack_id().map(RackId::as_str), Some("RACK_2")); + assert!(matches!(event, CollectorEvent::Metric(_))); + } } #[test] @@ -1468,10 +1594,7 @@ mod tests { /// Drive a single `/interfaces/interface[name=acp0]/` update and /// return the one captured `MetricSample`, asserting the producer-level /// invariants (stream `name`, `collector_type`, `interface_name` label). - fn run_interface_leaf( - tail: &[&str], - val: proto::TypedValue, - ) -> (MetricSample, EventContext) { + fn run_interface_leaf(tail: &[&str], val: proto::TypedValue) -> (MetricSample, EventContext) { let sink = Arc::new(CapturingSink::default()); let mut proc = test_processor(); proc.data_sink = Some(sink.clone()); @@ -1517,11 +1640,7 @@ mod tests { } /// Same as `run_interface_leaf` but for `/components/component[name=...]`. - fn run_component_leaf( - comp_name: &str, - tail: &[&str], - val: proto::TypedValue, - ) -> MetricSample { + fn run_component_leaf(comp_name: &str, tail: &[&str], val: proto::TypedValue) -> MetricSample { let sink = Arc::new(CapturingSink::default()); let mut proc = test_processor(); proc.data_sink = Some(sink.clone()); @@ -1562,39 +1681,216 @@ mod tests { *sample } + /// Drive a single `/interfaces/interface[name=acp0]/` update and + /// return ALL captured `MetricSample`s. Used for StateSet leaves, which + /// fan a single source value out into one 0/1 series per possible state. 
+ fn run_interface_leaf_all(tail: &[&str], val: proto::TypedValue) -> Vec<MetricSample> { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("interfaces", &[]), + make_path_elem("interface", &[("name", "acp0")]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(&notification); + + sink.events + .lock() + .expect("lock poisoned") + .iter() + .map(|(_, event)| { + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + (**sample).clone() + }) + .collect() + } + + /// Same as `run_interface_leaf_all` but for `/components/component[name=...]`. + fn run_component_leaf_all( + comp_name: &str, + tail: &[&str], + val: proto::TypedValue, + ) -> Vec<MetricSample> { + let sink = Arc::new(CapturingSink::default()); + let mut proc = test_processor(); + proc.data_sink = Some(sink.clone()); + + let mut elems = vec![ + make_path_elem("components", &[]), + make_path_elem("component", &[("name", comp_name)]), + ]; + elems.extend(tail.iter().map(|n| make_path_elem(n, &[]))); + + let notification = proto::Notification { + timestamp: 0, + prefix: None, + update: vec![proto::Update { + path: Some(proto::Path { + elem: elems, + ..Default::default() + }), + val: Some(val), + ..Default::default() + }], + ..Default::default() + }; + proc.process_notification(&notification); + + sink.events + .lock() + .expect("lock poisoned") + .iter() + .map(|(_, event)| { + let CollectorEvent::Metric(sample) = event else { + panic!("expected a Metric event"); + }; + (**sample).clone() + }) + .collect() + } + + /// Assert OpenMetrics StateSet semantics over a captured fan-out: exactly + /// one
0/1 series per `all_states` entry, each with unit "state", the named + /// entity label present, and a `state` label; the series whose `state` + /// label equals `current` has value 1.0 and every other series is 0.0. + fn assert_state_set( + samples: &[MetricSample], + metric_type: &str, + entity_label: &str, + entity_id: &str, + all_states: &[&str], + current: &str, + ) { + assert_eq!( + samples.len(), + all_states.len(), + "{metric_type}: expected one series per state" + ); + for state in all_states { + let sample = samples + .iter() + .find(|s| s.labels.iter().any(|(k, v)| k == "state" && v == state)) + .unwrap_or_else(|| panic!("{metric_type}: missing series for state {state}")); + assert_eq!(sample.metric_type, metric_type, "state {state}"); + assert_eq!(sample.unit, "state", "state {state}"); + assert_eq!( + sample.value, + if *state == current { 1.0 } else { 0.0 }, + "{metric_type} state {state}: value (current={current})" + ); + assert!( + sample + .labels + .iter() + .any(|(k, v)| k == entity_label && v == entity_id), + "{metric_type} state {state}: missing entity label {entity_label}={entity_id}" + ); + } + } + #[test] fn test_interface_numeric_leaf_table_mappings() { // (leaf tail, expected metric_type, expected unit) let cases: &[(&[&str], &str, &str)] = &[ - (&["state", "counters", "in-errors"], "interface_in_errors", "count"), - (&["state", "counters", "out-errors"], "interface_out_errors", "count"), - (&["state", "counters", "out-discards"], "interface_out_discards", "count"), - (&["state", "counters", "in-octets"], "interface_in_octets", "bytes"), - (&["state", "counters", "out-octets"], "interface_out_octets", "bytes"), - (&["state", "counters", "in-pkts"], "interface_in_packets", "count"), - (&["state", "counters", "out-pkts"], "interface_out_packets", "count"), + ( + &["state", "counters", "in-errors"], + "interface_in_errors", + "count", + ), + ( + &["state", "counters", "out-errors"], + "interface_out_errors", + "count", + ), + ( + &["state", 
"counters", "out-discards"], + "interface_out_discards", + "count", + ), + ( + &["state", "counters", "in-octets"], + "interface_in_octets", + "bytes", + ), + ( + &["state", "counters", "out-octets"], + "interface_out_octets", + "bytes", + ), + ( + &["state", "counters", "in-pkts"], + "interface_in_packets", + "count", + ), + ( + &["state", "counters", "out-pkts"], + "interface_out_packets", + "count", + ), ( &["infiniband", "state", "counters", "port", "link-downed"], "interface_link_downed", "count", ), ( - &["infiniband", "state", "counters", "port", "link-error-recovery"], + &[ + "infiniband", + "state", + "counters", + "port", + "link-error-recovery", + ], "interface_link_error_recovery", "count", ), ( - &["infiniband", "state", "counters", "port", "rcv-remote-phy-errors"], + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-remote-phy-errors", + ], "interface_rcv_remote_physical_errors", "count", ), ( - &["infiniband", "state", "counters", "port", "rcv-switch-relay-errors"], + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-switch-relay-errors", + ], "interface_rcv_switch_relay_errors", "count", ), ( - &["infiniband", "state", "counters", "port", "rcv-constraints-errors"], + &[ + "infiniband", + "state", + "counters", + "port", + "rcv-constraints-errors", + ], "interface_rcv_constraint_errors", "count", ), @@ -1610,7 +1906,13 @@ mod tests { "count", ), ( - &["infiniband", "state", "counters", "port", "excessive-buffer-overrun"], + &[ + "infiniband", + "state", + "counters", + "port", + "excessive-buffer-overrun", + ], "interface_port_buffer_overrun_errors", "count", ), @@ -1635,11 +1937,31 @@ mod tests { "interface_max_supported_mtu", "bytes", ), - (&["phy-diag", "state", "raw-ber"], "interface_raw_ber", "ratio"), - (&["phy-diag", "state", "effective-ber"], "interface_effective_ber", "ratio"), - (&["phy-diag", "state", "symbol-ber"], "interface_symbol_ber", "ratio"), - (&["phy-diag", "state", "raw-ber-ch-1"], "interface_raw_ber_lane0", 
"ratio"), - (&["phy-diag", "state", "raw-ber-ch-2"], "interface_raw_ber_lane1", "ratio"), + ( + &["phy-diag", "state", "raw-ber"], + "interface_raw_ber", + "ratio", + ), + ( + &["phy-diag", "state", "effective-ber"], + "interface_effective_ber", + "ratio", + ), + ( + &["phy-diag", "state", "symbol-ber"], + "interface_symbol_ber", + "ratio", + ), + ( + &["phy-diag", "state", "raw-ber-ch-1"], + "interface_raw_ber_lane0", + "ratio", + ), + ( + &["phy-diag", "state", "raw-ber-ch-2"], + "interface_raw_ber_lane1", + "ratio", + ), ( &["phy-diag", "state", "raw-errors-ch-1"], "interface_phy_raw_errors_lane0", @@ -1655,7 +1977,11 @@ mod tests { "interface_phy_effective_errors", "count", ), - (&["phy-diag", "state", "zero-hist"], "interface_zero_hist", "count"), + ( + &["phy-diag", "state", "zero-hist"], + "interface_zero_hist", + "count", + ), ( &["phy-diag", "state", "phy-received-bits"], "interface_phy_received_bits", @@ -1726,7 +2052,11 @@ mod tests { "interface_rq_general_error", "count", ), - (&["phy-diag", "state", "plr-rcv-codes"], "interface_plr_rcv_codes", "count"), + ( + &["phy-diag", "state", "plr-rcv-codes"], + "interface_plr_rcv_codes", + "count", + ), ( &["phy-diag", "state", "plr-rcv-code-err"], "interface_plr_rcv_codes_err", @@ -1737,7 +2067,11 @@ mod tests { "interface_plr_rcv_uncorrectables_code", "count", ), - (&["phy-diag", "state", "plr-xmit-codes"], "interface_plr_xmit_codes", "count"), + ( + &["phy-diag", "state", "plr-xmit-codes"], + "interface_plr_xmit_codes", + "count", + ), ( &["phy-diag", "state", "plr-xmit-retry-codes"], "interface_plr_xmit_retrys_codes", @@ -1754,7 +2088,11 @@ mod tests { "count", ), ( - &["phy-diag", "state", "plr-xmit-retry-events-within-t-sec-max"], + &[ + "phy-diag", + "state", + "plr-xmit-retry-events-within-t-sec-max", + ], "interface_plr_xmit_retry_codes_within_minute", "count", ), @@ -1805,63 +2143,104 @@ mod tests { #[test] fn test_interface_physical_port_state_enum() { - for (raw, expected) in [ - ("LINK_UP", 1.0), - 
("POLLING", 2.0), - ("PORT_CONFIGURATION_TRAINING", 3.0), - ("SOMETHING_ELSE", 0.0), + // Binary StateSet: only LINK_UP is "up"; polling/training/anything-else + // is "down" (regression: ordinal codes 2/3 collapsed to "down"). + for (raw, current) in [ + ("LINK_UP", "up"), + ("POLLING", "down"), + ("PORT_CONFIGURATION_TRAINING", "down"), + ("SOMETHING_ELSE", "down"), ] { - let (sample, _) = run_interface_leaf( + let samples = run_interface_leaf_all( &["infiniband", "state", "physical-port-state"], make_typed_value_string(raw), ); - assert_eq!(sample.metric_type, "interface_physical_port_state"); - assert_eq!(sample.unit, "state"); - assert_eq!(sample.value, expected, "physical-port-state {raw}"); + assert_state_set( + &samples, + "interface_physical_port_state", + "interface_name", + "acp0", + PHYSICAL_PORT_STATES, + current, + ); } } #[test] fn test_interface_logical_port_state_enum() { - for (raw, expected) in [("ACTIVE", 1.0), ("DOWN", 0.0)] { - let (sample, _) = run_interface_leaf( + for (raw, current) in [("ACTIVE", "active"), ("DOWN", "down")] { + let samples = run_interface_leaf_all( &["infiniband", "state", "logical-port-state"], make_typed_value_string(raw), ); - assert_eq!(sample.metric_type, "interface_logical_port_state"); - assert_eq!(sample.unit, "state"); - assert_eq!(sample.value, expected, "logical-port-state {raw}"); + assert_state_set( + &samples, + "interface_logical_port_state", + "interface_name", + "acp0", + LOGICAL_PORT_STATES, + current, + ); } } #[test] - fn test_phy_manager_state_to_f64_helper() { - // substring match, case-insensitive: active/linkup => 1.0 - assert_eq!(phy_manager_state_to_f64(Some("Active_or_Linkup")), 1.0); - assert_eq!(phy_manager_state_to_f64(Some("LINKUP")), 1.0); - assert_eq!(phy_manager_state_to_f64(Some("active")), 1.0); - // anything else => 0.0 - assert_eq!(phy_manager_state_to_f64(Some("Disabled")), 0.0); - assert_eq!(phy_manager_state_to_f64(Some("")), 0.0); - assert_eq!(phy_manager_state_to_f64(None), 0.0); 
+ fn test_phy_manager_to_state_helper() { + // token match, case-insensitive: active/linkup => "up" + assert_eq!(phy_manager_to_state(Some("Active_or_Linkup")), "up"); + assert_eq!(phy_manager_to_state(Some("LINKUP")), "up"); + assert_eq!(phy_manager_to_state(Some("active")), "up"); + // anything else => "down" + assert_eq!(phy_manager_to_state(Some("Disabled")), "down"); + assert_eq!(phy_manager_to_state(Some("")), "down"); + assert_eq!(phy_manager_to_state(None), "down"); + // regression: "active" is a substring of these down-states but must NOT + // match as up -- word-boundary token match, not substring. + assert_eq!(phy_manager_to_state(Some("Inactive")), "down"); + assert_eq!(phy_manager_to_state(Some("Deactivated")), "down"); } #[test] fn test_interface_phy_manager_state_enum() { - // PHY-MANAGER-STATE (row 961): dynamic FSM string enum-coded to 1/0. - for (raw, expected) in [ - ("Active_or_Linkup", 1.0), - ("LINKUP", 1.0), - ("Disabled", 0.0), - ("", 0.0), + // PHY-MANAGER-STATE (row 961): dynamic FSM string emitted as a StateSet. + for (raw, current) in [ + ("Active_or_Linkup", "up"), + ("LINKUP", "up"), + ("Disabled", "down"), + ("", "down"), + // regression for the substring bug: these contain "active" as a + // substring but are down-states. 
+ ("Inactive", "down"), + ("Deactivated", "down"), ] { - let (sample, _) = run_interface_leaf( + let samples = run_interface_leaf_all( &["phy-diag", "state", "phy-manager-state"], make_typed_value_string(raw), ); - assert_eq!(sample.metric_type, "interface_phy_manager_state"); - assert_eq!(sample.unit, "state"); - assert_eq!(sample.value, expected, "phy-manager-state {raw:?}"); + assert_state_set( + &samples, + "interface_phy_manager_state", + "interface_name", + "acp0", + PHY_MANAGER_STATES, + current, + ); + } + } + + #[test] + fn test_interface_oper_status_state_set() { + for (raw, current) in [("UP", "up"), ("active", "up"), ("DOWN", "down")] { + let samples = + run_interface_leaf_all(&["state", "oper-status"], make_typed_value_string(raw)); + assert_state_set( + &samples, + "interface_oper_status", + "interface_name", + "acp0", + OPER_STATUS_STATES, + current, + ); } } @@ -1947,7 +2326,11 @@ mod tests { #[test] fn test_component_explicit_leaf_mappings() { // ASIC-TEMP-CURRENT (row 875) - let asic = run_component_leaf("ASIC1", &["asic", "state", "asic-temp"], make_typed_value_uint(46)); + let asic = run_component_leaf( + "ASIC1", + &["asic", "state", "asic-temp"], + make_typed_value_uint(46), + ); assert_eq!(asic.metric_type, "component_asic_temperature_celsius"); assert_eq!(asic.unit, "celsius"); assert_eq!(asic.value, 46.0); @@ -1966,23 +2349,59 @@ mod tests { #[test] fn test_component_oper_status_shared_leaf_fan_and_cpu() { // FAN-STATE (row 966) and CPU-STATE (row 1174) share state/oper-status; - // the component_name label is the only discriminator. - let fan = run_component_leaf( + // the component_name label is the only discriminator. Emitted as a + // StateSet (one 0/1 series per state). 
+ let fan = run_component_leaf_all( "FAN1/1", &["state", "oper-status"], make_typed_value_string("ACTIVE"), ); - assert_eq!(fan.metric_type, "component_oper_status"); - assert_eq!(fan.unit, "state"); - assert_eq!(fan.value, 1.0); + assert_state_set( + &fan, + "component_oper_status", + "component_name", + "FAN1/1", + OPER_STATUS_STATES, + "up", + ); - let cpu = run_component_leaf( + let cpu = run_component_leaf_all( "cpu", &["state", "oper-status"], - make_typed_value_string("ACTIVE"), + make_typed_value_string("DOWN"), + ); + assert_state_set( + &cpu, + "component_oper_status", + "component_name", + "cpu", + OPER_STATUS_STATES, + "down", ); - assert_eq!(cpu.metric_type, "component_oper_status"); - assert_eq!(cpu.value, 1.0); + } + + #[test] + fn test_component_health_status_state_set() { + // healthz status emitted as a 3-state StateSet; unrecognized => unknown. + for (raw, current) in [ + ("healthy", "healthy"), + ("unhealthy", "unhealthy"), + ("something_weird", "unknown"), + ] { + let samples = run_component_leaf_all( + "ASIC1", + &["healthz", "state", "status"], + make_typed_value_string(raw), + ); + assert_state_set( + &samples, + "component_health_status", + "component_name", + "ASIC1", + COMPONENT_HEALTH_STATES, + current, + ); + } } #[test] @@ -2130,9 +2549,9 @@ mod tests { #[test] fn test_oper_status_active_is_up() { - assert_eq!(oper_status_to_f64(Some("ACTIVE")), 1.0); - assert_eq!(oper_status_to_f64(Some("active")), 1.0); - assert_eq!(oper_status_to_f64(Some("DOWN")), 0.0); + assert_eq!(oper_status_to_state(Some("ACTIVE")), "up"); + assert_eq!(oper_status_to_state(Some("active")), "up"); + assert_eq!(oper_status_to_state(Some("DOWN")), "down"); } #[test] @@ -2223,7 +2642,11 @@ mod tests { // (leaf tail, raw bytes value, expected metric_type, expected value) // values are the authoritative live GB200 Stage-0 capture. 
let cases: &[(&[&str], u64, &str)] = &[ - (&["state", "memory-used"], 3_856_510_976, "platform_memory_used"), + ( + &["state", "memory-used"], + 3_856_510_976, + "platform_memory_used", + ), ( &["state", "memory-total-size"], 16_151_990_272, @@ -2234,7 +2657,11 @@ mod tests { 77_780_082_688, "platform_disk_total", ), - (&["state", "disk-used"], 22_848_192_512, "platform_disk_used"), + ( + &["state", "disk-used"], + 22_848_192_512, + "platform_disk_used", + ), ]; for (tail, raw, metric_type) in cases { let sample = run_platform_general_leaf(tail, make_typed_value_uint(*raw)); @@ -2336,7 +2763,10 @@ mod tests { ..Default::default() }; let count = proc.process_notification(&notification); - assert_eq!(count, 1, "platform-general entity is still counted for {leaf}"); + assert_eq!( + count, 1, + "platform-general entity is still counted for {leaf}" + ); assert_eq!( sink.events.lock().expect("lock poisoned").len(), 0, @@ -2370,7 +2800,12 @@ mod tests { // CONTACT (862) / LOCATION (863): non-empty strings emit their info // series with the matching single label.
for (leaf, metric_type, label, raw) in [ - ("contact", "platform_contact_info", "contact", "noc@example.com"), + ( + "contact", + "platform_contact_info", + "contact", + "noc@example.com", + ), ("location", "platform_location_info", "location", "rack-7"), ] { let sample = run_platform_general_leaf_info(&["state", leaf], raw); @@ -2451,7 +2886,10 @@ mod tests { ..Default::default() }; let count = proc.process_notification(&notification); - assert_eq!(count, 1, "platform-general entity is still counted for {tail:?}"); + assert_eq!( + count, 1, + "platform-general entity is still counted for {tail:?}" + ); assert_eq!( sink.events.lock().expect("lock poisoned").len(), 0, diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 35eb011586..403bec0f97 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -28,29 +28,44 @@ use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; const COLLECTOR_NAME: &str = "nvue_rest"; -fn system_health_to_f64(status: Option<&str>) -> f64 { +const SYSTEM_HEALTH_STATES: &[&str] = &["ok", "not_ok", "unknown"]; + +/// "OK" => "ok", "Not OK" => "not_ok"; anything else (including absent) => "unknown". +fn system_health_to_state(status: Option<&str>) -> &'static str { match status { - Some("OK") => 1.0, - Some("Not OK") => 2.0, - _ => 0.0, + Some("OK") => "ok", + Some("Not OK") => "not_ok", + _ => "unknown", } } -fn partition_health_to_f64(status: Option<&str>) -> f64 { +const PARTITION_HEALTH_STATES: &[&str] = &[ + "healthy", + "degraded_bandwidth", + "degraded", + "unhealthy", + "unknown", +]; + +/// The four known states map to themselves; anything else (including absent) => "unknown".
+fn partition_health_to_state(status: Option<&str>) -> &'static str { match status { - Some("healthy") => 1.0, - Some("degraded_bandwidth") => 2.0, - Some("degraded") => 3.0, - Some("unhealthy") => 4.0, - _ => 0.0, + Some("healthy") => "healthy", + Some("degraded_bandwidth") => "degraded_bandwidth", + Some("degraded") => "degraded", + Some("unhealthy") => "unhealthy", + _ => "unknown", } } -fn app_status_to_f64(status: Option<&str>) -> f64 { +const APP_STATUS_STATES: &[&str] = &["ok", "not_ok", "unknown"]; + +/// "ok" => "ok", "not ok" => "not_ok"; anything else (including absent) => "unknown". +fn app_status_to_state(status: Option<&str>) -> &'static str { match status { - Some("ok") => 1.0, - Some("not ok") => 2.0, - _ => 0.0, + Some("ok") => "ok", + Some("not ok") => "not_ok", + _ => "unknown", } } @@ -77,32 +92,36 @@ fn temp_to_f64(value: Option<&str>) -> Option<f64> { value.and_then(|s| s.trim().parse::<f64>().ok()) } -/// Map a temperature sensor's string `state` to a numeric gauge: "ok" -/// (case-insensitive) => 1.0, any other non-empty value => 0.0, absent => None -/// (so callers emit nothing rather than fabricating a value). -fn temp_state_to_f64(state: Option<&str>) -> Option<f64> { +const TEMP_STATE_STATES: &[&str] = &["ok", "not_ok"]; + +/// Map a temperature sensor's string `state` to a StateSet state: "ok" +/// (case-insensitive) => "ok", any other present value => "not_ok", absent => +/// None (so callers emit nothing rather than fabricating an all-zero StateSet). +fn temp_state_to_state(state: Option<&str>) -> Option<&'static str> { state.map(|s| { if s.trim().eq_ignore_ascii_case("ok") { - 1.0 + "ok" } else { - 0.0 + "not_ok" } }) } +const FAN_LED_STATES: &[&str] = &["ok", "not_ok"]; + /// Map the aggregate `FAN_STATUS` LED state from the platform/environment parent -/// summary to a numeric gauge: "green"/"ok" (case-insensitive) => 1.0, any other -/// non-empty value (e.g. "amber"/"red") => 0.0, absent/empty => None (so callers -/// emit nothing rather than fabricating a value).
-fn fan_led_to_f64(state: Option<&str>) -> Option<f64> { +/// summary to a StateSet state: "green"/"ok" (case-insensitive) => "ok", any +/// other non-empty value (e.g. "amber"/"red") => "not_ok", absent/empty => None +/// (so callers emit nothing rather than fabricating an all-zero StateSet). +fn fan_led_to_state(state: Option<&str>) -> Option<&'static str> { let s = state?.trim(); if s.is_empty() { return None; } if s.eq_ignore_ascii_case("green") || s.eq_ignore_ascii_case("ok") { - Some(1.0) + Some("ok") } else { - Some(0.0) + Some("not_ok") } } @@ -179,8 +198,8 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_system_health().await { Ok(Some(health)) => { - let value = system_health_to_f64(health.status.as_deref()); - self.emit_metric("system_health", None, value, "state", vec![]); + let current = system_health_to_state(health.status.as_deref()); + self.emit_state_set("system_health", None, current, SYSTEM_HEALTH_STATES, vec![]); entity_count += 1; } Ok(None) => {} @@ -198,12 +217,12 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_cluster_apps().await { Ok(Some(apps)) => { for (name, app) in &apps { - let value = app_status_to_f64(app.status.as_deref()); - self.emit_metric( + let current = app_status_to_state(app.status.as_deref()); + self.emit_state_set( "cluster_app", Some(name), - value, - "state", + current, + APP_STATUS_STATES, vec![(Cow::Borrowed("app_name"), name.clone())], ); entity_count += 1; @@ -225,18 +244,18 @@ impl PeriodicCollector for NvueRestCollector { Ok(Some(partitions)) => { for (part_id, partition) in &partitions { let part_name = partition.name.as_deref().unwrap_or(part_id); - let health_value = partition_health_to_f64(partition.health.as_deref()); + let health_state = partition_health_to_state(partition.health.as_deref()); let gpu_count = partition.num_gpus.unwrap_or(0) as f64; let partition_labels = vec![ (Cow::Borrowed("partition_id"), part_id.clone()), (Cow::Borrowed("partition_name"),
part_name.to_string()), ]; - self.emit_metric( + self.emit_state_set( "partition_health", Some(part_id), - health_value, - "state", + health_state, + PARTITION_HEALTH_STATES, partition_labels.clone(), ); self.emit_metric( @@ -323,8 +342,7 @@ impl PeriodicCollector for NvueRestCollector { for (sensor_name, temp) in &temps { // Each field is optional; emit only the ones present/parseable // rather than fabricating absent thresholds. - let sensor_label = - || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; + let sensor_label = || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; if let Some(value) = temp_to_f64(temp.current.as_deref()) { self.emit_metric( @@ -356,12 +374,14 @@ impl PeriodicCollector for NvueRestCollector { ); entity_count += 1; } - if let Some(value) = temp_state_to_f64(temp.state.as_deref()) { - self.emit_metric( + // Absent `state` => emit nothing (never fabricate an + // all-zero StateSet); present => one 0/1 series per state. + if let Some(current) = temp_state_to_state(temp.state.as_deref()) { + self.emit_state_set( "platform_temperature_state", Some(sensor_name), - value, - "state", + current, + TEMP_STATE_STATES, sensor_label(), ); entity_count += 1; @@ -384,10 +404,11 @@ impl PeriodicCollector for NvueRestCollector { Ok(Some(env)) => { // Switch-level aggregate FAN_STATUS LED; emit only when present // and the state maps to a value, absent → nothing. - if let Some(value) = - env.get("FAN_STATUS").and_then(|s| fan_led_to_f64(s.state.as_deref())) + if let Some(current) = env + .get("FAN_STATUS") + .and_then(|s| fan_led_to_state(s.state.as_deref())) { - self.emit_metric("fan_led", None, value, "state", vec![]); + self.emit_state_set("fan_led", None, current, FAN_LED_STATES, vec![]); entity_count += 1; } } @@ -498,6 +519,41 @@ impl NvueRestCollector { .into(), )); } + + /// emit an OpenMetrics StateSet: one `0.0`/`1.0` series per possible state, + /// with the current state's series == 1.0 and an added `state` label. 
The + /// existing per-entity `labels` are carried onto every series; `key_base` + /// is the per-entity key qualifier (it is suffixed with the state name so + /// each series gets a unique key). Unit is always "state". + fn emit_state_set( + &self, + metric_type: &str, + key_base: Option<&str>, + current_state: &str, + all_states: &[&str], + labels: Vec<(Cow<'static, str>, String)>, + ) { + for state in all_states { + let mut series_labels = labels.clone(); + series_labels.push((Cow::Borrowed("state"), state.to_string())); + + // suffix the state onto the per-entity qualifier so each series key + // is unique (switch-level series have no entity qualifier, so the + // state name alone disambiguates them). + let qualifier = match key_base { + Some(base) => format!("{base}:{state}"), + None => (*state).to_string(), + }; + + self.emit_metric( + metric_type, + Some(&qualifier), + if *state == current_state { 1.0 } else { 0.0 }, + "state", + series_labels, + ); + } + } } #[cfg(test)] @@ -514,30 +570,74 @@ mod tests { use crate::bmc::BoxFuture; use crate::config::NvueRestPaths; + /// Assert OpenMetrics StateSet semantics over a captured fan-out: exactly + /// one 0/1 series per `all_states` entry, each with unit "state" and a + /// `state` label; the series whose `state` label equals `current` has value + /// 1.0 and every other series is 0.0. `entity` (if any) is asserted present + /// on every series. 
+ fn assert_state_set( + samples: &[MetricSample], + metric_type: &str, + entity: Option<(&str, &str)>, + all_states: &[&str], + current: &str, + ) { + let series: Vec<&MetricSample> = samples + .iter() + .filter(|s| s.metric_type == metric_type) + .collect(); + assert_eq!( + series.len(), + all_states.len(), + "{metric_type}: expected one series per state" + ); + for state in all_states { + let sample = series + .iter() + .find(|s| s.labels.iter().any(|(k, v)| k == "state" && v == state)) + .unwrap_or_else(|| panic!("{metric_type}: missing series for state {state}")); + assert_eq!(sample.unit, "state", "state {state}"); + assert_eq!( + sample.value, + if *state == current { 1.0 } else { 0.0 }, + "{metric_type} state {state}: value (current={current})" + ); + if let Some((label, value)) = entity { + assert!( + sample.labels.iter().any(|(k, v)| k == label && v == value), + "{metric_type} state {state}: missing entity label {label}={value}" + ); + } + } + } + #[test] fn test_system_health_mapping() { - assert_eq!(system_health_to_f64(Some("OK")), 1.0); - assert_eq!(system_health_to_f64(Some("Not OK")), 2.0); - assert_eq!(system_health_to_f64(None), 0.0); - assert_eq!(system_health_to_f64(Some("unknown_value")), 0.0); + assert_eq!(system_health_to_state(Some("OK")), "ok"); + assert_eq!(system_health_to_state(Some("Not OK")), "not_ok"); + assert_eq!(system_health_to_state(None), "unknown"); + assert_eq!(system_health_to_state(Some("unknown_value")), "unknown"); } #[test] fn test_partition_health_mapping() { - assert_eq!(partition_health_to_f64(Some("unknown")), 0.0); - assert_eq!(partition_health_to_f64(Some("healthy")), 1.0); - assert_eq!(partition_health_to_f64(Some("degraded_bandwidth")), 2.0); - assert_eq!(partition_health_to_f64(Some("degraded")), 3.0); - assert_eq!(partition_health_to_f64(Some("unhealthy")), 4.0); - assert_eq!(partition_health_to_f64(None), 0.0); + assert_eq!(partition_health_to_state(Some("unknown")), "unknown"); + 
assert_eq!(partition_health_to_state(Some("healthy")), "healthy"); + assert_eq!( + partition_health_to_state(Some("degraded_bandwidth")), + "degraded_bandwidth" + ); + assert_eq!(partition_health_to_state(Some("degraded")), "degraded"); + assert_eq!(partition_health_to_state(Some("unhealthy")), "unhealthy"); + assert_eq!(partition_health_to_state(None), "unknown"); } #[test] fn test_app_status_mapping() { - assert_eq!(app_status_to_f64(Some("ok")), 1.0); - assert_eq!(app_status_to_f64(Some("not ok")), 2.0); - assert_eq!(app_status_to_f64(None), 0.0); - assert_eq!(app_status_to_f64(Some("other")), 0.0); + assert_eq!(app_status_to_state(Some("ok")), "ok"); + assert_eq!(app_status_to_state(Some("not ok")), "not_ok"); + assert_eq!(app_status_to_state(None), "unknown"); + assert_eq!(app_status_to_state(Some("other")), "unknown"); } #[test] @@ -569,30 +669,31 @@ mod tests { } #[test] - fn test_temp_state_to_f64_mapping() { - assert_eq!(temp_state_to_f64(Some("ok")), Some(1.0)); - assert_eq!(temp_state_to_f64(Some("OK")), Some(1.0)); - assert_eq!(temp_state_to_f64(Some(" ok ")), Some(1.0)); - assert_eq!(temp_state_to_f64(Some("warning")), Some(0.0)); - assert_eq!(temp_state_to_f64(Some("")), Some(0.0)); - assert_eq!(temp_state_to_f64(None), None); + fn test_temp_state_to_state_mapping() { + assert_eq!(temp_state_to_state(Some("ok")), Some("ok")); + assert_eq!(temp_state_to_state(Some("OK")), Some("ok")); + assert_eq!(temp_state_to_state(Some(" ok ")), Some("ok")); + assert_eq!(temp_state_to_state(Some("warning")), Some("not_ok")); + assert_eq!(temp_state_to_state(Some("")), Some("not_ok")); + // absent => None (emit nothing, never fabricate) + assert_eq!(temp_state_to_state(None), None); } #[test] - fn test_fan_led_to_f64_mapping() { - // green/ok (case-insensitive) => 1.0 - assert_eq!(fan_led_to_f64(Some("green")), Some(1.0)); - assert_eq!(fan_led_to_f64(Some("GREEN")), Some(1.0)); - assert_eq!(fan_led_to_f64(Some(" green ")), Some(1.0)); - 
assert_eq!(fan_led_to_f64(Some("ok")), Some(1.0)); - assert_eq!(fan_led_to_f64(Some("OK")), Some(1.0)); - // any other non-empty value => 0.0 - assert_eq!(fan_led_to_f64(Some("amber")), Some(0.0)); - assert_eq!(fan_led_to_f64(Some("red")), Some(0.0)); + fn test_fan_led_to_state_mapping() { + // green/ok (case-insensitive) => "ok" + assert_eq!(fan_led_to_state(Some("green")), Some("ok")); + assert_eq!(fan_led_to_state(Some("GREEN")), Some("ok")); + assert_eq!(fan_led_to_state(Some(" green ")), Some("ok")); + assert_eq!(fan_led_to_state(Some("ok")), Some("ok")); + assert_eq!(fan_led_to_state(Some("OK")), Some("ok")); + // any other non-empty value => "not_ok" + assert_eq!(fan_led_to_state(Some("amber")), Some("not_ok")); + assert_eq!(fan_led_to_state(Some("red")), Some("not_ok")); // absent/empty => None (emit nothing) - assert_eq!(fan_led_to_f64(Some("")), None); - assert_eq!(fan_led_to_f64(Some(" ")), None); - assert_eq!(fan_led_to_f64(None), None); + assert_eq!(fan_led_to_state(Some("")), None); + assert_eq!(fan_led_to_state(Some(" ")), None); + assert_eq!(fan_led_to_state(None), None); } /// Drives the same parse + emit logic `run_iteration` uses for the @@ -793,37 +894,36 @@ mod tests { sensor_label(), ); } - if let Some(value) = temp_state_to_f64(temp.state.as_deref()) { - collector.emit_metric( + if let Some(current) = temp_state_to_state(temp.state.as_deref()) { + collector.emit_state_set( "platform_temperature_state", Some(sensor_name), - value, - "state", + current, + TEMP_STATE_STATES, sensor_label(), ); } } let samples = sink.samples.lock().unwrap(); - // ASIC1: 4 series; Ambient-MNG-Temp: 2 series (current + state) = 6 total. - assert_eq!(samples.len(), 6, "unexpected emitted sample count"); + // ASIC1: current + max + crit (3) + state StateSet (2) = 5. + // Ambient-MNG-Temp: current (1) + state StateSet (2) = 3. Total 8. + assert_eq!(samples.len(), 8, "unexpected emitted sample count"); // Helper: find a sample by metric_type + sensor label. 
let find = |metric_type: &str, sensor: &str| { samples.iter().find(|s| { s.metric_type == metric_type - && s.labels - .iter() - .any(|(k, v)| k == "sensor" && v == sensor) + && s.labels.iter().any(|(k, v)| k == "sensor" && v == sensor) }) }; - // ASIC1: all four series present with correct name/unit/value/label/key. + // ASIC1: the three scalar temperature series present with correct + // name/unit/value/label/key. let expected_asic1: &[(&str, &str, f64)] = &[ ("platform_temperature", "celsius", 43.0), ("platform_temperature_max", "celsius", 105.0), ("platform_temperature_critical", "celsius", 120.0), - ("platform_temperature_state", "state", 1.0), ]; for (metric_type, unit, value) in expected_asic1 { let sample = find(metric_type, "ASIC1") @@ -838,14 +938,44 @@ mod tests { assert_eq!(sample.labels[0].1, "ASIC1"); } - // Ambient-MNG-Temp: only current + state emitted. - let ambient_current = find("platform_temperature", "Ambient-MNG-Temp") - .expect("ambient current sample"); + // ASIC1 state="ok" => StateSet: ok=1, not_ok=0; sensor label preserved. + let asic1_state: Vec<MetricSample> = samples + .iter() + .filter(|s| { + s.metric_type == "platform_temperature_state" + && s.labels.iter().any(|(k, v)| k == "sensor" && v == "ASIC1") + }) + .cloned() + .collect(); + assert_state_set( + &asic1_state, + "platform_temperature_state", + Some(("sensor", "ASIC1")), + TEMP_STATE_STATES, + "ok", + ); + + // Ambient-MNG-Temp: only current + state StateSet emitted.
+ let ambient_current = + find("platform_temperature", "Ambient-MNG-Temp").expect("ambient current sample"); assert_eq!(ambient_current.value, 27.0); assert_eq!(ambient_current.unit, "celsius"); - assert!( - find("platform_temperature_state", "Ambient-MNG-Temp").is_some(), - "ambient state sample expected" + let ambient_state: Vec<MetricSample> = samples + .iter() + .filter(|s| { + s.metric_type == "platform_temperature_state" + && s.labels + .iter() + .any(|(k, v)| k == "sensor" && v == "Ambient-MNG-Temp") + }) + .cloned() + .collect(); + assert_state_set( + &ambient_state, + "platform_temperature_state", + Some(("sensor", "Ambient-MNG-Temp")), + TEMP_STATE_STATES, + "ok", ); // A sensor missing max/crit must NOT emit those series. @@ -886,25 +1016,25 @@ mod tests { struct Case { name: &'static str, json: &'static str, - // expected emitted fan_led value, or None when nothing must emit. - expected: Option<f64>, + // expected current StateSet state, or None when nothing must emit. + expected: Option<&'static str>, } let cases = [ Case { - name: "green LED emits 1.0", + name: "green LED => ok", json: r#"{"FAN_STATUS": {"state": "green", "type": "led"}}"#, - expected: Some(1.0), + expected: Some("ok"), }, Case { - name: "ok LED emits 1.0", + name: "ok LED => ok", json: r#"{"FAN_STATUS": {"state": "ok", "type": "led"}}"#, - expected: Some(1.0), + expected: Some("ok"), }, Case { - name: "amber LED emits 0.0", + name: "amber LED => not_ok", json: r#"{"FAN_STATUS": {"state": "amber", "type": "led"}}"#, - expected: Some(0.0), + expected: Some("not_ok"), }, Case { name: "absent FAN_STATUS emits nothing", @@ -923,27 +1053,41 @@ mod tests { let env: PlatformEnvironmentResponse = serde_json::from_str(case.json).expect("env json parses"); // Mirror run_iteration's emit logic exactly.
- if let Some(value) = - env.get("FAN_STATUS").and_then(|s| fan_led_to_f64(s.state.as_deref())) + if let Some(current) = env + .get("FAN_STATUS") + .and_then(|s| fan_led_to_state(s.state.as_deref())) { - collector.emit_metric("fan_led", None, value, "state", vec![]); + collector.emit_state_set("fan_led", None, current, FAN_LED_STATES, vec![]); } let samples = sink.samples.lock().unwrap(); match case.expected { - Some(expected_value) => { - assert_eq!(samples.len(), 1, "case '{}': expected one sample", case.name); - let sample = &samples[0]; - assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); - assert_eq!(sample.metric_type, "fan_led", "case '{}'", case.name); - assert_eq!(sample.unit, "state", "case '{}'", case.name); - assert_eq!(sample.value, expected_value, "case '{}'", case.name); - assert_eq!(sample.key, "fan_led", "case '{}'", case.name); - assert!( - sample.labels.is_empty(), - "case '{}': fan_led is switch-level, no per-entity label", - case.name - ); + Some(current) => { + // switch-level StateSet: no per-entity label, but a `state` + // label per series; series keys are unique per state. + assert_state_set(&samples, "fan_led", None, FAN_LED_STATES, current); + for sample in samples.iter() { + assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); + let state = sample + .labels + .iter() + .find(|(k, _)| k == "state") + .map(|(_, v)| v.clone()) + .expect("state label present"); + assert_eq!( + sample.key, + format!("fan_led:{state}"), + "case '{}'", + case.name + ); + // switch-level: the only label is `state`. 
+ assert_eq!( + sample.labels.len(), + 1, + "case '{}': fan_led is switch-level (only the state label)", + case.name + ); + } } None => assert_eq!( samples.len(), From 2d2fd691e3dc297c554f6914fa7cb08ae7f44b4b Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:59:01 -0400 Subject: [PATCH 12/25] docs(health): reconcile GB200 matrix + runbook for StateSet/representation changes Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- ...vswitch_telemetry_gb200_live_validation.md | 100 +++++++++++++++++- .../nvswitch_telemetry_gb200_matrix.csv | 10 +- .../health/nvswitch_telemetry_gb200_matrix.md | 8 +- 3 files changed, 106 insertions(+), 12 deletions(-) diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md index 3593849938..6f90d293d3 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md @@ -144,8 +144,8 @@ Only explicit catalog-row mappings are emitted; unknown sources are dropped (deb Unit coverage that locks this behavior: - NMX-T: `test_nmxt_metric_map_locks_type_and_unit`, `test_unknown_nmxt_sources_not_allowlisted`. -- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_string_leaf_is_not_exported` (string leaves emit nothing), `test_interface_numeric_leaf_table_mappings` (locks `interface_plr_bw_loss_percent` type/unit), `test_platform_general_version_info_metrics` + `test_platform_general_empty_version_string_is_not_exported` (OS/BMC/EROT version info-metrics), `test_nvue_subscribe_paths_all_enabled` (the `/platform-general/versions` subscribe path is added). -- NVUE REST: `test_fan_max_speed_emit`, `test_fan_led_emit` (green/ok=1, amber=0, absent FAN_STATUS emits nothing) + `test_parse_platform_environment_fan_status`. 
+- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_unmapped_string_leaf_is_not_exported` (unmapped string leaves emit nothing), `test_interface_numeric_leaf_table_mappings` (locks `interface_plr_bw_loss_percent` type/unit), `test_platform_general_version_info_metrics` + `test_platform_general_empty_version_string_is_not_exported` (OS/BMC/EROT version info-metrics), `test_nvue_subscribe_paths_all_enabled` (the `/platform-general/versions` subscribe path is added). StateSet shape (per-state 0/1 series with a `state` label, unit `state`): `test_interface_oper_status_state_set`, `test_interface_physical_port_state_enum` (polling/training => up=0/down=1), `test_interface_logical_port_state_enum`, `test_interface_phy_manager_state_enum` + `test_phy_manager_to_state_helper` (Inactive/Deactivated => up=0/down=1 substring regression), `test_component_oper_status_shared_leaf_fan_and_cpu`, `test_component_health_status_state_set` (unrecognized => unknown=1). +- NVUE REST: `test_fan_max_speed_emit`, `test_fan_led_emit` (StateSet: green/ok => ok=1, amber => not_ok=1, absent FAN_STATUS emits nothing) + `test_parse_platform_environment_fan_status`. StateSet shape also locked by `test_system_health_mapping`, `test_partition_health_mapping`, `test_app_status_mapping`, `test_temp_state_to_state_mapping`, `test_fan_led_to_state_mapping`, and `test_platform_temperature_emit` (absent sensor `state` emits no StateSet). ## Blocker escalations (Stage 0) @@ -230,7 +230,9 @@ Escalate to NMX-T owner with the NMX-T version string from the test rig. ### String-valued rows — RESOLVED (6 rows, now implemented) These 6 catalog rows are string-valued and were previously escalated; they are now implemented: -- `961 PHY-MANAGER-STATE` — enum-coded to `interface_phy_manager_state` (active/linkup = 1, else 0). 
+- `961 PHY-MANAGER-STATE` — emitted as a StateSet `interface_phy_manager_state` (one 0/1 series per + state with a `state` label; `up` when an `active`/`linkup` token matches on a word boundary, else + `down`). - `965 VL-CAPABILITIES`, `862 CONTACT`, `863 LOCATION`, `864 NODE-DESCRIPTION` — emitted as info-metrics (value 1 with the string carried in a label; skipped when empty, so `CONTACT`/`LOCATION` emit only when configured). @@ -248,8 +250,9 @@ now has an explicit, unit-tested emit path: - `942 PLR-BW-LOSS-PERCENT` — gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` added to the numeric interface-leaf allowlist as `interface_plr_bw_loss_percent` (unit `percent`). - `967 FAN-LED` — re-sourced from NVUE REST: the `/nvue_v1/platform/environment` parent summary's - aggregate `FAN_STATUS.state` LED is emitted as switch-level `fan_led` (green/ok = 1.0, any other - state = 0.0, absent = nothing), gated on `platform_environment_status_enabled` (default true). + aggregate `FAN_STATUS.state` LED is emitted as switch-level `fan_led`, a StateSet (per-state 0/1 + series with a `state` label: green/ok => `ok`, any other state => `not_ok`; absent = nothing), + gated on `platform_environment_status_enabled` (default true). The catalog's CLI LED path (`nv show platform environment led`) is not used. ### Rescue-match audit — 3 rows re-classified to ABSENT-BLOCKER (2026-06-23) @@ -265,3 +268,90 @@ A verification pass over the 8 `RESOLVED-LIVE` rows found 3 that an earlier toke (`rx_power_lane_5`, `cable-proto-cap-ext`) do not exist as live SNR sources. No emit arm exists. Resolution: source-owner follow-up (see "Evidence to capture" step 5); keep open until an NVLink per-lane SNR source is identified or the rows are declared N/A for NVLink backplane switches. + +## NMX-T field representation validation (DEFERRED — requires live GB200 rig) + +These NMX-T fields had their representation changed during review (label → metric / StateSet, or +per-sink routing). 
The chosen representations are **derived from source/catalog analysis, not yet +confirmed on live hardware**, and must be validated on a real GB200 NVLink switch before merge is +considered fully signed off. + +**Why this needs live verification (critical caveat).** The target firmware NVOS **25.02.2553** +ships **NMX-T 1.3.4**, which **predates** the NMX-T Prometheus metric-vs-label renderer fix +(**NVBug 6131830**, fixed in NMX-T 4.20.4 / 4.21.4 / 5.06.12, telemetry commit `3dd5d388`). On the +pre-fix renderer, `/xcset/nvlink_domain_telemetry` may render the same field as a string label on +one endpoint and a numeric gauge on another (`/metrics`), or render empty — and a `;lookup=` xcset +suffix can flip string↔numeric. So the *actual* on-wire form of these fields on 25.02.2553 must be +captured, not assumed. Our scraper is robust to either form (unmapped strings fall back to +`unknown` / are dropped, never fabricated), so collection is safe regardless — but the +representation decisions below should be re-confirmed against reality. 
+ +### Capture commands (run on / against the live switch host) + +```bash +# What NMX-T actually renders for the fields in question, on BOTH endpoints: +curl -s http://<switch-host>:9352/metrics | grep -E 'local_reason_opcode|remote_reason_opcode|down_blame|fec_mode_active|Active_FEC|Module_Temperature|Status_Message' +curl -s http://<switch-host>:9352/xcset/nvlink_domain_telemetry | grep -E 'local_reason_opcode|remote_reason_opcode|down_blame|Active_FEC|Module_Temperature|Status_Message' | head +curl -s http://<switch-host>:9352/management/xcset/nvlink_domain_telemetry | head +curl -s 'http://<switch-host>:9352/management/schema?schema_id=all' > nmxt-schema.json # value-space / lookup tables +# Distinct observed values per field (empirical floor for the value space): +curl -s http://<switch-host>:9352/xcset/nvlink_domain_telemetry \ + | grep -oE '(down_blame|local_reason_opcode|remote_reason_opcode|Active_FEC|Status_Message|Module_Temperature)="[^"]*"' \ + | sort -u +``` + +Source-of-truth references for the value spaces (use to confirm closed-enum membership): +NVOS `nvos/src/nvos-swss/orchagent/portsorch.h` (link-down reason opcode map `0..49`, read from +SAI `SAI_PORT_ATTR_LINK_DOWN_{LOCAL,REMOTE}_REASON`); `Telemetry_Catalog_v4.0_Telemetry_APIs.csv`; +NMX-T producer `gitlab-master.nvidia.com/telemetry/nmx-telemetry`. + +### Per-field acceptance checks + +1. **`cable_temperature_celsius`** (was `cable_temp` label → numeric gauge, one series/port). Confirm + the `Module_Temperature` label is present in the scrape; confirm our exporter emits exactly one + `cable_temperature_celsius` series per port with the parsed value (e.g. `"37C"→37`). On a rig with + **optical** modules (not just the passive backplane, which reads `0C`), confirm the value varies + over time **and** that no `cable_temp` *label* reappears (the churn fix). +2. **`down_blame`** (was label → StateSet `unknown`/`local_phy`/`remote_phy`).
Confirm the live + string values are within that closed set; any value mapping to `unknown` that *isn't* literally + "Unknown" is an unmapped source token → record it and extend the mapping. Confirm exactly 3 + series per port, exactly one `=1`. Best signal: induce/observe a link-down and confirm the active + state flips (`local_phy`/`remote_phy`). +3. **`status_message`** (kept as label, **Prometheus-excluded, OTLP-only**). Confirm it is **absent** + from our Prometheus `/telemetry` scrape and **present** as a data-point attribute in the OTLP + export. Capture the distinct-value count over a soak window to confirm it is bounded (a finite + decode of the opcode table), not unbounded free text. If it proves unbounded/noisy in OTLP, + reconsider dropping or moving to the events/logs path. +4. **`local_reason_opcode` vs `remote_reason_opcode`** (left as-is: local = string label, remote = + numeric `code` metric). Confirm on 25.02.2553 which form each actually renders. If **both** render + numeric (post-fix backport) — or if `nmxt-schema.json` exposes a stable numeric↔string map — then + numeric-ifying `local_reason_opcode` into a `code` metric (consistent with `remote_reason_opcode`) + becomes worthwhile. Until then the local=string/remote=numeric asymmetry is a documented NMX-T + 1.3.4 source artifact, **not** our bug — do NOT hardcode a `0..49` reverse map (it differs across + versions, e.g. `0..37` in older customer docs). +5. **`fec_mode_active` (`Active_FEC`)** (left as a label). Capture distinct values incl. aliases + (`Int_KP4_FEC_PLR` etc. vs the catalog canonical set `No_FEC` / `Firecode_FEC` / `Standard_RS_FEC` + / `Standard_LL_RS_FEC` / `Interleaved_Standard_RS-FEC` / `Standard_RS-FEC`). Only convert to a + StateSet if the alias→canonical normalization map can be sourced authoritatively; otherwise the + low-churn label is fine. 
+ +### Cardinality observation (do this while validating) + +Scrape our `/telemetry` endpoint twice ~1 minute apart after collectors are warm; diff the distinct +`(metric_type, label-set)` tuples. Expectation: stable except expected counter movement. Then induce +a link event and re-diff — confirm the only new series are the intended StateSet flips +(`down_blame`, port/oper state), **not** a fan-out of new label-value combinations. Record per-field +distinct-value counts so future representation decisions have an empirical basis. + +### Follow-on goal (AFTER live validation): representation true-up + +Once the live-hardware validation above is complete, perform a deliberate **true-up** pass over the +full NVSWITCH catalog coverage to confirm we are filling the gaps in the *best* way, not merely a +working way: +- Re-confirm each chosen source and representation against observed live data. +- Revisit label-vs-metric-vs-StateSet decisions with **real cardinality numbers** (not estimates). +- Re-examine the 16 ABSENT-BLOCKER rows for newly-available sources — especially cable optical + telemetry, the RDMA queue counters under active load, OS-KERNEL, and TIME-SINCE-LAST-CLEAR. +- Reconcile the matrix to reality. + +This is sequenced strictly **after** hardware validation: validate what we built, then optimize. 
diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv index 3ebc62e162..87e8e01038 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv @@ -102,8 +102,8 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 943,RQ-GENERAL-ERROR,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 944,TIME-TO-LINKS-UP,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 945,STATUS-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -946,STATUS-MESSAGE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -947,DOWN-BLAME,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +946,STATUS-MESSAGE,NMX-T explicit allowlist,PRESENT,implemented,OTLP-only label status_message; excluded from Prometheus series to bound cardinality +947,DOWN-BLAME,NMX-T explicit allowlist,PRESENT,implemented,StateSet down_blame (state label unknown/local_phy/remote_phy 0/1 per state) one series per port 948,LOCAL-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 949,REMOTE-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 950,PHY-RECEIVED-BITS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact @@ -117,13 +117,13 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 958,PORT-LOOPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 959,PORT-INACTIVE-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 960,LINK-WIDTH-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi enum-coded interface_phy_manager_state +961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi interface_phy_manager_state StateSet (state label, 0/1 per state) 962,MTU,NVOS gNMI 
explicit allowlist,PRESENT,implemented,gnmi exact 963,MAX-SUPPORTED-MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 964,SUPPORTED-WIDTH,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 965,VL-CAPABILITIES,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric interface_vl_capabilities_info 966,FAN-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -967,FAN-LED,NVUE REST explicit mapping,IMPLEMENTED,implemented,nvue-rest /nvue_v1/platform/environment FAN_STATUS.state -> fan_led (green/ok=1 else 0) +967,FAN-LED,NVUE REST explicit mapping,IMPLEMENTED,implemented,nvue-rest /nvue_v1/platform/environment FAN_STATUS.state -> fan_led StateSet (state label, 0/1 per state: green/ok=ok else not_ok) 968,CABLE-PART-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 969,CABLE-SERIAL-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim 970,CABLE-TRANSMITTER-TECHNOLOGY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family @@ -136,7 +136,7 @@ catalog_row,metric_param_name,corrected_primary_source,final_status,disposition, 977,CABLE-RX-POWER-LANE0,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 978,CABLE-RX-POWER-LANE1,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 979,CABLE-DIAG-SUPPLY-VOLTAGE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -980,CABLE-TEMP,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim +980,CABLE-TEMP,NMX-T explicit allowlist,PRESENT,implemented,numeric metric cable_temperature_celsius (celsius) parsed from NMX-T Module_Temperature label; one series per port 981,CABLE-TEMP-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag 982,CABLE-VOLTAGE-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag 
983,CABLE-TX-CDR-LOL,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md index 82180cb90a..1ae2614da0 100644 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md @@ -46,7 +46,7 @@ conditions. - 942 PLR-BW-LOSS-PERCENT → gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` (`interface_plr_bw_loss_percent`, percent). - 967 FAN-LED → NVUE REST `/nvue_v1/platform/environment` `FAN_STATUS.state` (`fan_led`, - green/ok=1 else 0). + StateSet: `state` label, 0/1 per state — green/ok => ok, else not_ok). - Audit note: an earlier pass token-matched 3 further rows (870 CPU_CORE_NUMBER, 2294 CABLE-SNR-MEDIA-LANE-N, 2295 CABLE-SNR-HOST-LANE-N) on spurious substrings; on verification no lane emits them, so they were re-classified to ABSENT-BLOCKER (see "Notes on blocker rows"). @@ -55,7 +55,11 @@ conditions. `*-CRITICAL/MAX/STATE` rows (`.crit`/`.max`/`.state`) and the 8 `*-TEMP-CURRENT` rows (`.current`), emitted per sensor as `platform_temperature{,_max,_critical,_state}` with a `sensor` label. - gNMI `platform-general` subscribe path → the 4 memory/disk rows (`886-889`). - - String rows → `interface_phy_manager_state` (enum-coded), `*_info` info-metrics, and the existing `component_name` label (`ASIC-NAME`). + - String rows → `interface_phy_manager_state` (StateSet: `state` label, 0/1 per state), `*_info` info-metrics, and the existing `component_name` label (`ASIC-NAME`). + - 947 DOWN-BLAME → NMX-T `down_blame` emitted as a StateSet `down_blame` (`state` label: + unknown/local_phy/remote_phy, 0/1 per state), one series per port (no longer a re-exported label). 
+ - 946 STATUS-MESSAGE → NMX-T `status_message` is free-text; emitted as an OTLP-only data-point + attribute (`status_message`) and excluded from Prometheus series to bound cardinality. ## Notes on blocker rows From ef8f1733a44bff8114253494dd320c4dd7ebc075 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:24:22 -0700 Subject: [PATCH 13/25] feat(health): OTLP metrics export full Prometheus-style names + switch_serial label Compose OTLP metric name as {prefix}_{name}_{metric_type}_{unit} to match the Prometheus sink, and promote switch_serial/switch_id onto datapoint attributes so Grafana switch dashboards resolve identically across export paths. Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/otlp/convert.rs | 104 +++++++++++++++++++++--- crates/health/src/otlp/metrics_drain.rs | 5 +- crates/health/src/sink/otlp.rs | 1 + 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index e83a979049..84b6c0c576 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -235,6 +235,7 @@ pub fn build_export_request(batch: &[(EventContext, CollectorEvent)]) -> ExportL /// every sample maps to an OTLP `Gauge` point; Sum/Histogram is a follow-up. 
pub fn build_metrics_export_request( batch: &[(EventContext, MetricSample)], + metric_name_prefix: &str, ) -> ExportMetricsServiceRequest { let observed_nanos = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) @@ -244,19 +245,40 @@ pub fn build_metrics_export_request( let mut by_endpoint: HashMap, Vec)> = HashMap::new(); for (context, sample) in batch { + let mut attributes: Vec = sample + .labels + .iter() + .map(|(k, v)| kv(k, v.clone())) + .collect(); + + // promote switch identity onto the datapoint so dashboards filtering on + // `switch_serial`/`switch_id` (underscore label form) match; these otherwise + // only exist as OTLP *resource* attributes (`switch.serial`/`switch.id`). + if !attributes.iter().any(|attr| attr.key == "switch_serial") { + if let Some(serial) = context.switch_serial() { + attributes.push(kv("switch_serial", serial.to_string())); + } + } + if !attributes.iter().any(|attr| attr.key == "switch_id") { + if let Some(switch_id) = context.switch_id() { + attributes.push(kv("switch_id", switch_id.to_string())); + } + } + let data_point = NumberDataPoint { - attributes: sample - .labels - .iter() - .map(|(k, v)| kv(k, v.clone())) - .collect(), + attributes, time_unix_nano: observed_nanos, value: Some(number_data_point::Value::AsDouble(sample.value)), ..Default::default() }; let otlp_metric = OtlpMetric { - name: sample.metric_type.clone(), + // match the Prometheus sink's full series name exactly so Grafana queries + // resolve identically across both export paths. 
+ name: format!( + "{}_{}_{}_{}", + metric_name_prefix, sample.name, sample.metric_type, sample.unit + ), description: String::new(), unit: sample.unit.clone(), data: Some(metric::Data::Gauge(OtlpGauge { @@ -640,10 +662,13 @@ mod tests { context: None, }; - let request = build_metrics_export_request(&[ - (rest_ctx, sample("nvue_rest")), - (gnmi_ctx, sample("nvue_gnmi")), - ]); + let request = build_metrics_export_request( + &[ + (rest_ctx, sample("nvue_rest")), + (gnmi_ctx, sample("nvue_gnmi")), + ], + "carbide_hardware_health", + ); let collector_types: std::collections::HashSet<_> = request .resource_metrics @@ -658,7 +683,7 @@ mod tests { } #[test] - fn metric_export_name_uses_metric_type() { + fn metric_export_name_uses_full_prometheus_series_name() { let ctx = test_context(); let sample = MetricSample { key: "asic0/oper_status".to_string(), @@ -670,11 +695,64 @@ mod tests { context: None, }; - let request = build_metrics_export_request(&[(ctx, sample)]); + let request = build_metrics_export_request(&[(ctx, sample)], "carbide_hardware_health"); let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; assert_eq!(metrics.len(), 1); - assert_eq!(metrics[0].name, "interface_oper_status"); + assert_eq!( + metrics[0].name, + "carbide_hardware_health_nvue_gnmi_interface_oper_status_state" + ); assert_eq!(metrics[0].unit, "state"); } + + #[test] + fn switch_nmxt_metric_carries_full_name_and_switch_serial_label() { + let switch_id = test_switch_id("switch-nmxt"); + let switch_id_attr = switch_id.to_string(); + let context = EventContext { + endpoint_key: "11:22:33:44:55:66".to_string(), + addr: BmcAddr { + ip: IpAddr::V4(Ipv4Addr::new(10, 0, 1, 1)), + port: Some(443), + mac: MacAddress::from_str("11:22:33:44:55:66").expect("valid mac"), + }, + collector_type: "nvue_gnmi", + metadata: Some(EndpointMetadata::Switch(SwitchData { + id: Some(switch_id), + serial: "SN-SWITCH-001".to_string(), + slot_number: Some(7), + tray_index: Some(3), + endpoint_role: 
SwitchEndpointRole::Host, + is_primary: true, + nmxt_enabled: true, + })), + rack_id: Some(RackId::new("RACK_2")), + }; + let sample = MetricSample { + key: "effective_ber".to_string(), + name: "switch_nmxt".to_string(), + metric_type: "effective_ber".to_string(), + unit: "ratio".to_string(), + value: 0.5, + labels: vec![], + context: None, + }; + + let request = build_metrics_export_request(&[(context, sample)], "carbide_hardware_health"); + let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; + + assert_eq!(metrics.len(), 1); + assert_eq!( + metrics[0].name, + "carbide_hardware_health_switch_nmxt_effective_ber_ratio" + ); + + let metric::Data::Gauge(gauge) = metrics[0].data.as_ref().expect("metric data") else { + panic!("expected gauge data"); + }; + let attrs = &gauge.data_points[0].attributes; + assert_eq!(attr_value(attrs, "switch_serial"), Some("SN-SWITCH-001")); + assert_eq!(attr_value(attrs, "switch_id"), Some(switch_id_attr.as_str())); + } } diff --git a/crates/health/src/otlp/metrics_drain.rs b/crates/health/src/otlp/metrics_drain.rs index bb04dac56a..f8e944f4b7 100644 --- a/crates/health/src/otlp/metrics_drain.rs +++ b/crates/health/src/otlp/metrics_drain.rs @@ -31,6 +31,7 @@ pub(crate) struct OtlpMetricsDrainTask { endpoint: String, batch_size: usize, flush_interval: Duration, + metric_name_prefix: String, } impl OtlpMetricsDrainTask { @@ -39,12 +40,14 @@ impl OtlpMetricsDrainTask { endpoint: String, batch_size: usize, flush_interval: Duration, + metric_name_prefix: String, ) -> Self { Self { queue, endpoint, batch_size, flush_interval, + metric_name_prefix, } } @@ -133,7 +136,7 @@ impl OtlpMetricsDrainTask { return; } - let request = build_metrics_export_request(batch); + let request = build_metrics_export_request(batch, &self.metric_name_prefix); batch.clear(); let point_count = request diff --git a/crates/health/src/sink/otlp.rs b/crates/health/src/sink/otlp.rs index e3f6bdeb25..dc636f9023 100644 --- a/crates/health/src/sink/otlp.rs 
+++ b/crates/health/src/sink/otlp.rs @@ -125,6 +125,7 @@ impl OtlpSink { config.endpoint.clone(), config.batch_size, config.flush_interval, + prefix.to_string(), ); handle.spawn(metrics_drain.run()); From a08b956ed5fe60cca5cdcfe20411cbdd6a51cd7c Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:24:22 -0700 Subject: [PATCH 14/25] fix(health): NMX-T client accept self-signed certs (fixes builder error) The NMX-T collector built its reqwest client without danger_accept_invalid_certs, unlike the sibling NVUE REST collector. On minimal runtime images this fails at client build time (native-root-CA load) and the switch serves a self-signed cert anyway, so NMX-T never collected. Match the NVUE REST self-signed handling. Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 0c84baaa39..61f4776272 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -495,6 +495,11 @@ impl PeriodicCollector for NmxtCollector { let http_client = reqwest::Client::builder() .timeout(request_timeout) + // NMX-T switch endpoints serve a self-signed cert (same as the NVUE REST + // collector). Accepting invalid certs also avoids a native-root-CA load + // failure at client build time on minimal runtime images without + // ca-certificates, which otherwise surfaces as "builder error". 
+ .danger_accept_invalid_certs(true) .build() .map_err(|e| { HealthError::GenericError(format!("Failed to create HTTP client: {}", e)) From 84a4de36e8c124a48db5d60f0893a14200a276ed Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 25 Jun 2026 17:50:06 -0700 Subject: [PATCH 15/25] fix(health): gNMI TLS uses tonic native custom verifier (skip-verify) tonic 0.14 auto-injects a strict system-root TLS verifier for https:// URIs (Endpoint::from) and layers its own TlsConnector over any custom connector (channel/service/connector.rs). That silently negated the hand-rolled hyper-rustls skip-verify connector, so tonic strictly verified and rejected NVOS's self-signed gNMI cert -- the channel died right after the server Certificate message (opaque 'transport error', no HTTP/2 frames). Use Endpoint::tls_config_with_verifier(ClientTlsConfig::new(), accept_any_cert_verifier()) so the AcceptAnyCertVerifier is applied in tonic's own TLS layer; drop the hand-rolled connector. tls.rs now exposes accept_any_cert_verifier() instead of self_signed_tls_config(). Validated on gb-nvl-124-switch06: gNMI SAMPLE+ON_CHANGE streams connect and 86 carbide_hardware_health_nvue_gnmi_* metric families flow via the OtlpSink into VictoriaMetrics.
Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- .../health/src/collectors/nvue/gnmi/client.rs | 29 +++++++++++++------ crates/health/src/collectors/nvue/tls.rs | 19 ++++++------ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 0b7abbb27e..4de235ce87 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -18,7 +18,7 @@ use std::time::Duration; use tonic::metadata::MetadataMap; -use tonic::transport::{Channel, Endpoint}; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; use tonic::{Extensions, Request}; use super::proto::g_nmi_client::GNmiClient as TonicGnmiClient; @@ -141,19 +141,30 @@ impl GnmiClient { )) })?; + // tonic 0.14 auto-injects a strict WebPKI/system-root TLS verifier when an + // Endpoint is built from an `https://` URI and layers its own TlsConnector + // over any custom connector (see tonic transport channel/service/connector.rs). + // That silently negated a hand-rolled hyper-rustls skip-verify connector and + // made tonic strictly reject the switch's self-signed NVOS gNMI cert (SAN does + // not cover the management IP). Use tonic's native custom-verifier hook so the + // skip-verify verifier is the one tonic actually applies. ClientTlsConfig::new() + // must NOT set any roots here (mixing roots + custom verifier is an error). let endpoint = Endpoint::from(uri) + .tls_config_with_verifier( + ClientTlsConfig::new(), + crate::collectors::nvue::tls::accept_any_cert_verifier(), + ) + .map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: invalid gNMI TLS config: {e}", + self.switch_id + )) + })? 
.connect_timeout(self.request_timeout) .timeout(self.request_timeout); - let tls_config = crate::collectors::nvue::tls::self_signed_tls_config(); - let connector = hyper_rustls::HttpsConnectorBuilder::new() - .with_tls_config(tls_config) - .https_only() - .enable_http2() - .build(); - let channel = endpoint - .connect_with_connector(connector) + .connect() .await .map_err(|e| { HealthError::GnmiError(format!( diff --git a/crates/health/src/collectors/nvue/tls.rs b/crates/health/src/collectors/nvue/tls.rs index a715e644c0..6137828e67 100644 --- a/crates/health/src/collectors/nvue/tls.rs +++ b/crates/health/src/collectors/nvue/tls.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified, ServerCertVerifier}; use rustls::pki_types::{CertificateDer, ServerName, UnixTime}; -use rustls::{ClientConfig, DigitallySignedStruct, SignatureScheme}; +use rustls::{DigitallySignedStruct, SignatureScheme}; // ! dangerous cert verifier that accepts any server certificate without validation. // ! only enable in test environments where you cannot replace NVOS self-signed certificates. @@ -63,12 +63,13 @@ impl ServerCertVerifier for AcceptAnyCertVerifier { } } -/// build a rustls ClientConfig that dangerously skips server certificate verification. -pub fn self_signed_tls_config() -> ClientConfig { - ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) - .with_safe_default_protocol_versions() - .expect("default protocol versions are valid") - .dangerous() - .with_custom_certificate_verifier(Arc::new(AcceptAnyCertVerifier)) - .with_no_client_auth() +/// Dangerous rustls verifier that accepts any server certificate without validation. +/// +/// Passed to tonic's `Endpoint::tls_config_with_verifier` so tonic's own TLS layer +/// skips verification. NVOS gNMI presents a self-signed cert whose SAN may not cover +/// the management IP being dialed; a strict verifier rejects it. 
Do not hand-roll a +/// separate `hyper_rustls` connector for this — tonic 0.14 layers its own (strict) +/// TLS over any custom connector for `https://` URIs, which silently negates it. +pub fn accept_any_cert_verifier() -> Arc { + Arc::new(AcceptAnyCertVerifier) } From 57eb8b70f0158272aab58140b35373f1bb49b5b9 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Fri, 26 Jun 2026 09:55:00 -0400 Subject: [PATCH 16/25] chore(health): remove temp docs from repo Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- ...vswitch_telemetry_gb200_live_validation.md | 357 ------------------ .../nvswitch_telemetry_gb200_matrix.csv | 194 ---------- .../health/nvswitch_telemetry_gb200_matrix.md | 93 ----- ...vswitch_telemetry_nv_redfish_dependency.md | 72 ---- 4 files changed, 716 deletions(-) delete mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md delete mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv delete mode 100644 docs/architecture/health/nvswitch_telemetry_gb200_matrix.md delete mode 100644 docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md b/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md deleted file mode 100644 index 6f90d293d3..0000000000 --- a/docs/architecture/health/nvswitch_telemetry_gb200_live_validation.md +++ /dev/null @@ -1,357 +0,0 @@ -# GB200 NVSWITCH telemetry live-validation runbook - -> **Implementation note.** GB200 telemetry is collected via **explicit catalog-row -> allowlists** over the live host surfaces: NMX-T (`switch_nmxt`), NVOS gNMI -> (`nvue_gnmi`, explicit per-leaf), NVUE REST (`fan_max_speed` from -> `/platform/environment/fan`, `fan_led` from `/platform/environment`), and standard -> Redfish sensors (`hw_sensor`). 
There is -> **no** standalone Redfish `TelemetryService` collector and **no** generic/sanitized -> source preservation — both were evaluated against the live GB200 BMC and removed. -> Unknown gNMI/NMX-T sources are dropped and debug-logged, never emitted. nv-redfish is -> consumed at the released `0.10.0` (no local patch). - -This branch stops before live hardware validation. After build/test/lint review, run the health service locally against one GB200 NVLink Switch BMC endpoint and one switch HOST/NVOS endpoint. - -## Collectors that must be enabled - -For the GB200 phase, enable all switch telemetry collectors below: - -- BMC endpoint (`switch.endpoint_role = "bmc"`): - - `collectors.sensors` for standard Redfish sensor readings and threshold/range context (the temp/thermal `hw_sensor` series plus `*_range_max`/`*_range_min`). -- HOST endpoint (`switch.endpoint_role = "host"`): - - `collectors.nmxt` for NMX-T Prometheus telemetry on port `9352`. - - `collectors.nvue.rest` for NVUE health/app/partition/interface diagnostics, `fan_max_speed` from `/platform/environment/fan`, and `fan_led` (aggregate `FAN_STATUS`) from `/platform/environment`. - - `collectors.nvue.gnmi` for SAMPLE telemetry from `components`, `interfaces`, and `platform-general` (memory/disk via `/state`, OS/BMC/EROT firmware versions via `/versions`), plus ON_CHANGE system events. - -No TelemetryService proxy ACL changes are required — collection uses the standard Redfish sensor paths plus the host NMX-T/gNMI/NVUE endpoints. - -## Local static config template - -Replace placeholders after the branch is reviewed. Keep real credentials out of git. 
- -```toml -[endpoint_sources.carbide_api] -enabled = false - -[sinks.health_report] -enabled = false - -[sinks.rack_health_report] -enabled = false - -[sinks.switch_health_report] -enabled = false - -[sinks.power_shelf_health_report] -enabled = false - -[sinks.prometheus] -enabled = true - -[metrics] -endpoint = "127.0.0.1:9009" -prefix = "carbide_hardware_health" - -[[endpoint_sources.static_bmc_endpoints]] -ip = "" -port = 443 -mac = "" -username = "" -password = "" -switch = { serial = "", endpoint_role = "bmc", slot_number = , tray_index = } - -[[endpoint_sources.static_bmc_endpoints]] -ip = "" -port = 443 -mac = "" -username = "" -password = "" -switch = { serial = "", endpoint_role = "host", is_primary = true, nmxt_enabled = true, slot_number = , tray_index = } - -[collectors.discovery] -refresh_interval = "5m" -discovery_concurrency = 4 - -[collectors.sensors] -sensor_fetch_interval = "1m" -sensor_fetch_concurrency = 8 -include_sensor_thresholds = true - -[collectors.metrics] -enabled = false - -[collectors.logs] -enabled = false - -[collectors.firmware] -enabled = false - -[collectors.leak_detector] -enabled = false - -[collectors.nmxt] -scrape_interval = "1m" -request_timeout = "30s" - -[collectors.nvue.rest] -poll_interval = "1m" -request_timeout = "30s" - -[collectors.nvue.rest.paths] -system_health_enabled = true -cluster_apps_enabled = true -sdn_partitions_enabled = true -interfaces_enabled = true -platform_environment_fan_enabled = true # MAX-SPEED via /nvue_v1/platform/environment/fan - -[collectors.nvue.gnmi] -gnmi_port = 9339 -sample_interval = "1m" -request_timeout = "30s" -system_events_enabled = true - -[collectors.nvue.gnmi.paths] -components_enabled = true -interfaces_enabled = true -platform_general_enabled = true -``` - -## Run the local health service - -nv-redfish is consumed at the released `0.10.0` — no local patch or companion checkout is needed. 
- -```bash -cargo run -p carbide-health --bin forge-hw-health -- /path/to/gb200-switch-local.toml -``` - -## Evidence to capture during live validation - -1. `/telemetry` output contains `hw_sensor` samples for the BMC endpoint (temp/thermal readings; plus `*_range_max`/`*_range_min` where the sensor exposes ranges). -2. `/telemetry` output contains `switch_nmxt` samples for the HOST endpoint — only the explicit `NMXT_METRIC_MAP` families with the allowlisted identity labels (no sanitized/unknown source names). -3. `/telemetry` output contains `nvue_gnmi` samples for the HOST endpoint: canonical `interface_*` (incl. `interface_link_speed_active` in gbps and `interface_plr_bw_loss_percent`), `component_*`, `platform_memory_used/total` + `platform_disk_total/used`, and the switch-level `platform_{os,bmc,erot}_version_info` info-metrics (value 1.0, version carried in the label). -4. `/telemetry` output contains the NVUE REST `fan_max_speed` and `fan_led` samples (HOST). Logs show the NMX-T, NVUE REST, and NVUE gNMI collectors started for the expected roles; matched-but-uncoercible leaves are debug-logged, not emitted. -5. The two catalog rows with no listed source (`CABLE-SNR-MEDIA-LANE-N` row 2294, `CABLE-SNR-HOST-LANE-N` row 2295) are checked explicitly in live output. NMX-T exposes `rx_power_lane_0/1` (rows 977/978) but **no SNR family**, so neither row is emitted today (an earlier rescue pass spuriously token-matched `rx_power_lane_5`/`cable-proto-cap-ext` — corrected to ABSENT-BLOCKER). If they do not appear through Redfish MetricReports, NMX-T, or gNMI, open a catalog/source-owner follow-up immediately; keep them open until source-owner resolution. - -## Series-shape acceptance checks - -Only explicit catalog-row mappings are emitted; unknown sources are dropped (debug-logged), never sanitized into metrics. Before treating live validation as successful: - -1. 
Capture the distinct `(name, metric_type, key)` tuples from two consecutive `/telemetry` scrapes after collectors are warm. -2. Confirm the tuple set is stable across scrapes except for expected link/error-counter changes. -3. Confirm every emitted series is one of the known families: `hw_sensor`, `switch_nmxt`, `nvue_gnmi` (`interface_*`/`component_*`/`platform_*`, incl. `platform_{os,bmc,erot}_version_info`), `fan_max_speed`, or `fan_led`. No `nvswitch_*`, `source_metric`, or `redfish_telemetry_service` series may appear. -4. Confirm NMX-T identity labels are the allowlisted `NMXT_LABEL_MAP` set (bounded per port); no raw/unknown source names as labels. - -Unit coverage that locks this behavior: - -- NMX-T: `test_nmxt_metric_map_locks_type_and_unit`, `test_unknown_nmxt_sources_not_allowlisted`. -- NVUE gNMI: `test_interface_link_speed_active_gbps`, `test_platform_general_numeric_leaf_mappings`, `test_platform_general_unmapped_string_leaf_is_not_exported` (unmapped string leaves emit nothing), `test_interface_numeric_leaf_table_mappings` (locks `interface_plr_bw_loss_percent` type/unit), `test_platform_general_version_info_metrics` + `test_platform_general_empty_version_string_is_not_exported` (OS/BMC/EROT version info-metrics), `test_nvue_subscribe_paths_all_enabled` (the `/platform-general/versions` subscribe path is added). StateSet shape (per-state 0/1 series with a `state` label, unit `state`): `test_interface_oper_status_state_set`, `test_interface_physical_port_state_enum` (polling/training => up=0/down=1), `test_interface_logical_port_state_enum`, `test_interface_phy_manager_state_enum` + `test_phy_manager_to_state_helper` (Inactive/Deactivated => up=0/down=1 substring regression), `test_component_oper_status_shared_leaf_fan_and_cpu`, `test_component_health_status_state_set` (unrecognized => unknown=1). 
-- NVUE REST: `test_fan_max_speed_emit`, `test_fan_led_emit` (StateSet: green/ok => ok=1, amber => not_ok=1, absent FAN_STATUS emits nothing) + `test_parse_platform_environment_fan_status`. StateSet shape also locked by `test_system_health_mapping`, `test_partition_health_mapping`, `test_app_status_mapping`, `test_temp_state_to_state_mapping`, `test_fan_led_to_state_mapping`, and `test_platform_temperature_emit` (absent sensor `state` emits no StateSet). - -## Blocker escalations (Stage 0) - -Stage 0 live probe (2026-06-20) classified all 193 GB200-applicable NVSWITCH catalog rows. -**16 rows remain ABSENT-BLOCKER** (no live source on this platform) — these are the escalations -in Groups B–D plus the rescue-match audit below. No row is deferred — each has an explicit -disposition and a named resolution path. - -The remaining subsections here (temperature, string-valued, and firmware/PLR/fan-LED, all marked -**RESOLVED**) are *not* escalations; they are kept for provenance, recording rows that earlier -passes had escalated but that are now implemented, so the trail from the Stage-0 blocker set down -to the final 16 is auditable. A post-implementation audit on 2026-06-23 moved 3 rows *into* the -blocker set — 870 CPU_CORE_NUMBER, 2294/2295 CABLE-SNR-MEDIA/HOST-LANE-N — that an earlier pass -had token-matched but no lane actually emits (see "Rescue-match audit" below). - -### Temperature threshold rows — RESOLVED (21 rows, formerly BLOCKER-THRESHOLD) - -The 21 temperature `*-CRITICAL` / `*-MAX` / `*-STATE` rows (ASIC / CPU-Pack / SODIMM / Drive / -HSC-VinDC / PDB-Conv / PMIC / SWB-ASIC-PCB / Ambient-MNG) are now implemented from NVUE REST -`/nvue_v1/platform/environment/temperature` (`.crit` / `.max` / `.state` per sensor; only the fields -a sensor actually exposes are emitted). The 8 `*-TEMP-CURRENT` rows were re-sourced from `.current` -on the same endpoint, correcting an earlier spurious gNMI token match. No longer escalated. 
- -### Group B — Cable/transceiver leaves (7 rows, ABSENT-BLOCKER) - -**Root cause (NOT an uncabled rig).** The N5400_LD NVLink switch enumerates **no gNMI transceiver -components** — the live component tree has only `ASIC`/`CPU`/`FAN`/`SWITCH` types and no -`/components/component/transceiver/*` subtree, even though 64+ ports are active NDR/XDR backplane -links (re-probed live 2026-06-23). The catalog mapped these rows to an openconfig transceiver-diag -path this platform does not expose; NVLink backplane cables are not modeled as openconfig -transceivers. - -**Re-sourced to NMX-T (now implemented):** 4 fault-flag rows have live NMX-T families (value 0 = no -fault on the active links) and were moved into `NMXT_METRIC_MAP` — 983 CABLE-TX-CDR-LOL -(`tx_cdr_lol`), 984 CABLE-RX-CDR-LOL (`rx_cdr_lol`), 985 CABLE-TX-LOS (`tx_los`), 986 CABLE-RX-LOS -(`rx_los`). They are no longer blockers. - -**Resolution (remaining 7):** no NMX-T or gNMI source exists for the alarm/threshold/oper-status -rows below. Escalate to the NVOS gNMI / NMX-T owner: is there any source (gNMI/NMX-T/Redfish/CLI) -for NVLink cable optical alarms, module oper-status, and per-lane power thresholds on N5400_LD, or -are these rows N/A for NVLink backplane switches? 
- -| Row | Metric | Catalog source (absent live) | -|------|-------------------------------------|----------------------------------------------------------------| -| 981 | CABLE-TEMP-ALARM | gNMI transceiver `temp-high-alarm-flag` (no transceiver component) | -| 982 | CABLE-VOLTAGE-ALARM | gNMI transceiver `vcc-high-alarm-flag` (no transceiver component) | -| 2293 | CABLE-OPER-STATUS | gNMI transceiver `module-oper-status` (no transceiver component) | -| 2296 | NVSWITCH-CABLE-RX-POWER-LANE-LOW-N | gNMI transceiver thresholds `input-power-lower` (absent) | -| 2297 | NVSWITCH-CABLE-TX-POWER-LANE-LOW-N | gNMI transceiver thresholds `output-power-lower` (absent) | -| 2298 | NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N | gNMI transceiver thresholds `input-power-upper` (absent) | -| 2299 | NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N | gNMI transceiver thresholds `output-power-upper` (absent) | - -### Group C — NMX-T RDMA queue counters (3 rows, ABSENT-BLOCKER) - -NMX-T fields were not present in the live scrape output. These are RDMA queue error counters -that may only appear under active RDMA workloads or specific firmware versions. - -**Resolution:** Escalate to NMX-T / RDMA owner with the NMX-T version from the test rig. -Re-probe under active RDMA traffic if possible. - -| Row | Metric | NMX-T field not live | -|------|-------------|----------------------| -| 1706 | RQ-NUM-WRFE | `rq_num_wrfe` | -| 1707 | RQ-NUM-LLE | `rq_num_lle` | -| 1708 | SQ-NUM-WRFE | `sq_num_wrfe` | - -### Group D — Single-field ABSENT-BLOCKERs - -**OS-KERNEL (row 765):** Catalog source is NVOS CLI only (`nv show system version {build-id}`). -No gNMI leaf or NMX-T field matched. Implementing this row requires either a new CLI collector -(not in scope for this branch) or a new NVOS gNMI exposure. Escalate to NVOS owner. - -**TIME-SINCE-LASTS-CLEAR (row 909):** gNMI leaf -`/interfaces/interface/phy-diag/state/time-since-last-clear-min` is in the NVOS schema but -returned no data. 
Escalate to NVOS gNMI owner with NVOS version; confirm whether this leaf -requires a specific counter-clear event to populate. - -**PLR-CODES-LOSS (row 931):** NMX-T field `HiRetransmissionRate` is not present in the live -scrape. This may be a naming discrepancy or a field absent in the installed NMX-T version. -Escalate to NMX-T owner with the NMX-T version string from the test rig. - -### String-valued rows — RESOLVED (6 rows, now implemented) - -These 6 catalog rows are string-valued and were previously escalated; they are now implemented: -- `961 PHY-MANAGER-STATE` — emitted as a StateSet `interface_phy_manager_state` (one 0/1 series per - state with a `state` label; `up` when an `active`/`linkup` token matches on a word boundary, else - `down`). -- `965 VL-CAPABILITIES`, `862 CONTACT`, `863 LOCATION`, `864 NODE-DESCRIPTION` — emitted as - info-metrics (value 1 with the string carried in a label; skipped when empty, so `CONTACT`/`LOCATION` - emit only when configured). -- `876 ASIC-NAME` — covered by the existing `component_name` label on every component metric (not re-emitted). - -### Firmware-version / PLR / fan-LED rows — RESOLVED (5 rows, now implemented) - -These 5 catalog rows were previously rescue-matched by token but not emitted by any lane; each -now has an explicit, unit-tested emit path: -- `764 OS-VERSION`, `767 BMC-VERSION`, `766 EROT-FW-VERSION` — gNMI now also subscribes - `/platform-general/versions` (sibling of `/state`); the `versions/state/{nos-version, - fw-version-bmc,fw-version-erot}` leaves emit switch-level info-metrics - `platform_{os,bmc,erot}_version_info` (value 1.0, raw version carried in the - `{os,bmc,erot}_version` label; empty strings emit nothing). -- `942 PLR-BW-LOSS-PERCENT` — gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` - added to the numeric interface-leaf allowlist as `interface_plr_bw_loss_percent` (unit `percent`). 
-- `967 FAN-LED` — re-sourced from NVUE REST: the `/nvue_v1/platform/environment` parent summary's - aggregate `FAN_STATUS.state` LED is emitted as switch-level `fan_led`, a StateSet (per-state 0/1 - series with a `state` label: green/ok => `ok`, any other state => `not_ok`; absent = nothing), - gated on `platform_environment_status_enabled` (default true). - The catalog's CLI LED path (`nv show platform environment led`) is not used. - -### Rescue-match audit — 3 rows re-classified to ABSENT-BLOCKER (2026-06-23) - -A verification pass over the 8 `RESOLVED-LIVE` rows found 3 that an earlier token-rescue had marked -`implemented` but that **no collector lane actually emits**. They are now ABSENT-BLOCKER: -- **`870 CPU_CORE_NUMBER`** — catalog source is NVOS CLI only (`nv show system cpu`). The rescue - token `core-to-phy-link-width-enabled` is a gNMI link-width *config knob*, not a CPU core count. - No gNMI/NMX-T emit arm exists. Resolution: new CLI collector or NVOS gNMI exposure; escalate to - NVOS owner (same path as `765 OS-KERNEL`). -- **`2294 CABLE-SNR-MEDIA-LANE-N` / `2295 CABLE-SNR-HOST-LANE-N`** — catalog lists no source. NMX-T - has `rx_power_lane_0/1` (power, rows 977/978) but no per-lane **SNR** family; the rescue tokens - (`rx_power_lane_5`, `cable-proto-cap-ext`) do not exist as live SNR sources. No emit arm exists. - Resolution: source-owner follow-up (see "Evidence to capture" step 5); keep open until an NVLink - per-lane SNR source is identified or the rows are declared N/A for NVLink backplane switches. - -## NMX-T field representation validation (DEFERRED — requires live GB200 rig) - -These NMX-T fields had their representation changed during review (label → metric / StateSet, or -per-sink routing). The chosen representations are **derived from source/catalog analysis, not yet -confirmed on live hardware**, and must be validated on a real GB200 NVLink switch before merge is -considered fully signed off. 
- -**Why this needs live verification (critical caveat).** The target firmware NVOS **25.02.2553** -ships **NMX-T 1.3.4**, which **predates** the NMX-T Prometheus metric-vs-label renderer fix -(**NVBug 6131830**, fixed in NMX-T 4.20.4 / 4.21.4 / 5.06.12, telemetry commit `3dd5d388`). On the -pre-fix renderer, `/xcset/nvlink_domain_telemetry` may render the same field as a string label on -one endpoint and a numeric gauge on another (`/metrics`), or render empty — and a `;lookup=` xcset -suffix can flip string↔numeric. So the *actual* on-wire form of these fields on 25.02.2553 must be -captured, not assumed. Our scraper is robust to either form (unmapped strings fall back to -`unknown` / are dropped, never fabricated), so collection is safe regardless — but the -representation decisions below should be re-confirmed against reality. - -### Capture commands (run on / against the live switch host) - -```bash -# What NMX-T actually renders for the fields in question, on BOTH endpoints (set SWITCH_HOST to the switch address first): -curl -s "http://${SWITCH_HOST}:9352/metrics" | grep -E 'local_reason_opcode|remote_reason_opcode|down_blame|fec_mode_active|Active_FEC|Module_Temperature|Status_Message' -curl -s "http://${SWITCH_HOST}:9352/xcset/nvlink_domain_telemetry" | grep -E 'local_reason_opcode|remote_reason_opcode|down_blame|Active_FEC|Module_Temperature|Status_Message' | head -curl -s "http://${SWITCH_HOST}:9352/management/xcset/nvlink_domain_telemetry" | head -curl -s "http://${SWITCH_HOST}:9352/management/schema?schema_id=all" > nmxt-schema.json # value-space / lookup tables -# Distinct observed values per field (empirical floor for the value space): -curl -s "http://${SWITCH_HOST}:9352/xcset/nvlink_domain_telemetry" \ - | grep -oE '(down_blame|local_reason_opcode|remote_reason_opcode|Active_FEC|Status_Message|Module_Temperature)="[^"]*"' \ - | sort -u -``` - -Source-of-truth references for the value spaces (use to confirm closed-enum membership): -NVOS `nvos/src/nvos-swss/orchagent/portsorch.h` (link-down reason opcode map `0..49`, read from -SAI
`SAI_PORT_ATTR_LINK_DOWN_{LOCAL,REMOTE}_REASON`); `Telemetry_Catalog_v4.0_Telemetry_APIs.csv`; -NMX-T producer `gitlab-master.nvidia.com/telemetry/nmx-telemetry`. - -### Per-field acceptance checks - -1. **`cable_temperature_celsius`** (was `cable_temp` label → numeric gauge, one series/port). Confirm - the `Module_Temperature` label is present in the scrape; confirm our exporter emits exactly one - `cable_temperature_celsius` series per port with the parsed value (e.g. `"37C"→37`). On a rig with - **optical** modules (not just the passive backplane, which reads `0C`), confirm the value varies - over time **and** that no `cable_temp` *label* reappears (the churn fix). -2. **`down_blame`** (was label → StateSet `unknown`/`local_phy`/`remote_phy`). Confirm the live - string values are within that closed set; any value mapping to `unknown` that *isn't* literally - "Unknown" is an unmapped source token → record it and extend the mapping. Confirm exactly 3 - series per port, exactly one `=1`. Best signal: induce/observe a link-down and confirm the active - state flips (`local_phy`/`remote_phy`). -3. **`status_message`** (kept as label, **Prometheus-excluded, OTLP-only**). Confirm it is **absent** - from our Prometheus `/telemetry` scrape and **present** as a data-point attribute in the OTLP - export. Capture the distinct-value count over a soak window to confirm it is bounded (a finite - decode of the opcode table), not unbounded free text. If it proves unbounded/noisy in OTLP, - reconsider dropping or moving to the events/logs path. -4. **`local_reason_opcode` vs `remote_reason_opcode`** (left as-is: local = string label, remote = - numeric `code` metric). Confirm on 25.02.2553 which form each actually renders. If **both** render - numeric (post-fix backport) — or if `nmxt-schema.json` exposes a stable numeric↔string map — then - numeric-ifying `local_reason_opcode` into a `code` metric (consistent with `remote_reason_opcode`) - becomes worthwhile. 
Until then the local=string/remote=numeric asymmetry is a documented NMX-T - 1.3.4 source artifact, **not** our bug — do NOT hardcode a `0..49` reverse map (it differs across - versions, e.g. `0..37` in older customer docs). -5. **`fec_mode_active` (`Active_FEC`)** (left as a label). Capture distinct values incl. aliases - (`Int_KP4_FEC_PLR` etc. vs the catalog canonical set `No_FEC` / `Firecode_FEC` / `Standard_RS_FEC` - / `Standard_LL_RS_FEC` / `Interleaved_Standard_RS-FEC` / `Standard_RS-FEC`). Only convert to a - StateSet if the alias→canonical normalization map can be sourced authoritatively; otherwise the - low-churn label is fine. - -### Cardinality observation (do this while validating) - -Scrape our `/telemetry` endpoint twice ~1 minute apart after collectors are warm; diff the distinct -`(metric_type, label-set)` tuples. Expectation: stable except expected counter movement. Then induce -a link event and re-diff — confirm the only new series are the intended StateSet flips -(`down_blame`, port/oper state), **not** a fan-out of new label-value combinations. Record per-field -distinct-value counts so future representation decisions have an empirical basis. - -### Follow-on goal (AFTER live validation): representation true-up - -Once the live-hardware validation above is complete, perform a deliberate **true-up** pass over the -full NVSWITCH catalog coverage to confirm we are filling the gaps in the *best* way, not merely a -working way: -- Re-confirm each chosen source and representation against observed live data. -- Revisit label-vs-metric-vs-StateSet decisions with **real cardinality numbers** (not estimates). -- Re-examine the 16 ABSENT-BLOCKER rows for newly-available sources — especially cable optical - telemetry, the RDMA queue counters under active load, OS-KERNEL, and TIME-SINCE-LASTS-CLEAR. -- Reconcile the matrix to reality. - -This is sequenced strictly **after** hardware validation: validate what we built, then optimize.
diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv deleted file mode 100644 index 87e8e01038..0000000000 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv +++ /dev/null @@ -1,194 +0,0 @@ -catalog_row,metric_param_name,corrected_primary_source,final_status,disposition,match_detail -763,NET-FW-VER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -764,OS-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/nos-version -> platform_os_version_info -765,OS-KERNEL,source-equivalent required; no new CLI collector by default,ABSENT-BLOCKER,blocker,no live token match (CLI-only) -766,EROT-FW-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/fw-version-erot -> platform_erot_version_info -767,BMC-VERSION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi platform-general/versions/state/fw-version-bmc -> platform_bmc_version_info -794,LINK-DOWNED-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -795,PORT-MALFORMED-PACKET-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -796,PORT-NEIGHBOR-MTU-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -797,PORT-RCV-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -798,PORT-XMIT-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -799,PORT-RCV-REMOTE-PHYSICAL-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -800,PORT-RCV-SWITCH-RELAY-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -801,QP1Dropped,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -802,VL15-DROPPED,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -804,SERIAL,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -806,NODE-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -807,PORT-GUID,NMX-T 
explicit allowlist,PRESENT,implemented,nmxt label dim -834,NVLINK-STATUS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -846,LINK-ERROR-RECOVERY-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -847,PORT-MULTICAST-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -848,PORT-MULTICAST-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -849,PORT-RCV-DATA,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -850,PORT-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -851,PORT-UNICAST-RCV-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -852,PORT-UNICAST-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -853,PORT-XMIT-DATA,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -854,PORT-XMIT-PKTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -855,PORT-XMIT-WAIT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -862,CONTACT,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_contact_info (emits when set) -863,LOCATION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_location_info (emits when set) -864,NODE-DESCRIPTION,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric platform_node_description_info -865,LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -866,PORT-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -867,PORT-LABEL,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -868,REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -869,DEVICE-HARDWARE-REVISION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -870,CPU_CORE_NUMBER,NVOS CLI only (nv show system cpu); no live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (gnmi core-to-phy-link-width-enabled is a link knob not a CPU core count);
not emitted -872,ASIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -873,ASIC-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -874,ASIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -875,ASIC-TEMP-CURRENT,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -876,ASIC-NAME,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,covered by component_name label on component metrics -879,AMBIENT-MNG-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -880,AMBIENT-MNG-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -881,CPU_PACK_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -882,CPU_PACK_TEMP_MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -883,CPU_PACK_TEMP_STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -884,CPU_PACK_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -885,CPU-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -886,MEM-UTIL,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -887,MEM-TOTAL-SIZE,NVOS gNMI explicit 
allowlist,PRESENT,implemented,gnmi exact -888,DISK-TOTAL-SIZE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -889,DISK-USED,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -890,SODIMM_TEMP_CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -891,SODIMM_TEMP_MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -892,SODIMM_TEMP_STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -893,SODIMM_TEMP_CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -894,MAX-SPEED,BMC Redfish live resource only,IMPLEMENTED,implemented,nvue rest /platform/environment/fan .max-speed -897,PORT-LOGICAL-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -898,FEC-MODE-ACTIVE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -899,RAW-BER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -900,EFFECTIVE-BER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -901,SYMBOL-BER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -902,ZERO-HIST,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -903,PHY-RAW-ERRORS-LANE0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -904,PHY-RAW-ERRORS-LANE1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -905,RAW-BER-LANE0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -906,RAW-BER-LANE1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -907,PHY-EFFECTIVE-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -908,PHY-SYMBOL-ERRORS,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 
-909,TIME-SINCE-LASTS-CLEAR,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /interfaces/interface/phy-diag/state/time-since-last-clear-min -910,DEVICE-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -911,FEC-HIST-0,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -912,FEC-HIST-1,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -913,FEC-HIST-2,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -914,FEC-HIST-3,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -915,FEC-HIST-4,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -916,FEC-HIST-5,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -917,FEC-HIST-6,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -918,FEC-HIST-7,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -919,FEC-HIST-8,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -920,FEC-HIST-9,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -921,FEC-HIST-10,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -922,FEC-HIST-11,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -923,FEC-HIST-12,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -924,FEC-HIST-13,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -925,FEC-HIST-14,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -926,FEC-HIST-15,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -931,PLR-CODES-LOSS,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: HiRetransmissionRate -932,PORT-BUFFER-OVERRUN-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -933,LINK-SPEED-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -934,PLR-RCV-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -935,PLR-RCV-CODES-ERR,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -936,PLR-RCV-UNCORRECTABLES-CODE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact 
-937,PLR-XMIT-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -938,PLR-XMIT-RETRYS-CODES,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -939,PLR-XMIT-RETRYS-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -940,PLR-SYNC-EVENTS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -941,PLR-XMIT-RETRY-CODES-WITHIN-MINUTE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -942,PLR-BW-LOSS-PERCENT,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi interfaces/interface/phy-diag/state/plr-bw-loss-percent -> interface_plr_bw_loss_percent (percent) -943,RQ-GENERAL-ERROR,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -944,TIME-TO-LINKS-UP,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -945,STATUS-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -946,STATUS-MESSAGE,NMX-T explicit allowlist,PRESENT,implemented,OTLP-only label status_message; excluded from Prometheus series to bound cardinality -947,DOWN-BLAME,NMX-T explicit allowlist,PRESENT,implemented,StateSet down_blame (state label unknown/local_phy/remote_phy 0/1 per state) one series per port -948,LOCAL-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -949,REMOTE-REASON-OPCODE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -950,PHY-RECEIVED-BITS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -951,PORT-RCV-CONSTRAINT-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -952,PORT-XMIT-CONSTRAINTS-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -953,PORT-LOCAL-PHYSICAL-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -954,SYNC-HEADER-ERROR-COUNTER,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -955,PORT-DLID-MAPPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -956,LOCAL-LINK-INTEGRITY-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -957,PORT-VL-MAPPING-ERRORS,NVOS 
gNMI explicit allowlist,PRESENT,implemented,gnmi exact -958,PORT-LOOPING-ERRORS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -959,PORT-INACTIVE-DISCARDS,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -960,LINK-WIDTH-ACTIVE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -961,PHY-MANAGER-STATE,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi interface_phy_manager_state StateSet (state label, 0/1 per state) -962,MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -963,MAX-SUPPORTED-MTU,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -964,SUPPORTED-WIDTH,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -965,VL-CAPABILITIES,NVOS gNMI explicit allowlist,IMPLEMENTED,implemented,gnmi info metric interface_vl_capabilities_info -966,FAN-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -967,FAN-LED,NVUE REST explicit mapping,IMPLEMENTED,implemented,nvue-rest /nvue_v1/platform/environment FAN_STATUS.state -> fan_led StateSet (state label, 0/1 per state: green/ok=ok else not_ok) -968,CABLE-PART-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -969,CABLE-SERIAL-NUMBER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -970,CABLE-TRANSMITTER-TECHNOLOGY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -971,CABLE-TYPE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -972,CABLE-VENDOR,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -973,CABLE-LENGTH,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -974,CABLE-IDENTIFIER,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -975,CABLE-REV,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -976,CABLE-FW-VERSION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -977,CABLE-RX-POWER-LANE0,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -978,CABLE-RX-POWER-LANE1,NMX-T explicit allowlist,PRESENT,implemented,nmxt family 
-979,CABLE-DIAG-SUPPLY-VOLTAGE,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -980,CABLE-TEMP,NMX-T explicit allowlist,PRESENT,implemented,numeric metric cable_temperature_celsius (celsius) parsed from NMX-T Module_Temperature label; one series per port -981,CABLE-TEMP-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/temp-high-alarm-flag -982,CABLE-VOLTAGE-ALARM,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/physical-channels/transceiver-diag/state/vcc-high-alarm-flag -983,CABLE-TX-CDR-LOL,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) -984,CABLE-RX-CDR-LOL,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) -985,CABLE-TX-LOS,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) -986,CABLE-RX-LOS,NMX-T explicit allowlist,PRESENT,implemented,nmxt cable fault flag (re-sourced; gNMI transceiver path absent on NVLink) -987,LINK-PARTNER-DESCRIPTION,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -988,LINK-PARTNER-NODE-GUID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -989,LINK-PARTNER-LID,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -990,LINK-PARTNER-PORT-NUM,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1174,CPU-STATE,NVOS gNMI explicit allowlist,PRESENT,implemented,gnmi exact -1241,DRIVE-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1242,DRIVE-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) 
-1243,DRIVE-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1244,DRIVE-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -1245,HSC-VINDC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1246,HSC-VINDC-TEMP-MAX,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1247,HSC-VINDC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1248,HSC-VINDC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -1249,PDB-CONV-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1251,PDB-CONV-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1252,PDB-CONV-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -1253,PMIC-TEMP-CRITICAL,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1255,PMIC-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) 
-1256,PMIC-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -1259,SWB-ASIC-PCB-TEMP-STATE,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature (.crit/.max/.state) -1260,SWB-ASIC-PCB-TEMP-CURRENT,source-equivalent required; no new CLI collector by default,IMPLEMENTED,implemented,nvue rest /platform/environment/temperature .current (corrected from gNMI rescue) -1688,LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1689,TOTAL-LINK-RECOVERY-SUCCESS-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1690,TIME-SINCE-LAST-RECOVERY,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1691,TIME-BTWN-TWO-RECOVERIES,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1692,RECOVERY-ATTEMPTS-L1-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1693,RECOVERY-ATTEMPTS-L2-CNT,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1694,RECOVERY-CYCLE-DURATION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1695,SERDES-RECOVERY-CYCLE-DURATION,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1696,CONTAIN-DRAIN-XMIT-DISCARD,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1697,CONTAIN-DRAIN-RCV-DISCARD,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1698,DEVICE-NUM,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1699,BOARD-TYPE,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1700,CHASSIS-SLOT-IDX,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1701,TRAY-IDX,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1702,TOPOLOGY-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1703,CHASSIS-ID,NMX-T explicit allowlist,PRESENT,implemented,nmxt label dim -1704,RAW-ERR-LANE-2,NMX-T explicit allowlist,PRESENT,implemented,nmxt 
family -1705,RAW-ERR-LANE-3,NMX-T explicit allowlist,PRESENT,implemented,nmxt family -1706,RQ-NUM-WRFE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: rq_num_wrfe -1707,RQ-NUM-LLE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: rq_num_lle -1708,SQ-NUM-WRFE,NMX-T explicit allowlist,ABSENT-BLOCKER,blocker,nmxt not live: sq_num_wrfe -2293,CABLE-OPER-STATUS,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/transceiver-diag/state/module-oper-status -2294,CABLE-SNR-MEDIA-LANE-N,no source listed in catalog; no live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (NMX-T has rx_power_lane_0/1 but no SNR field); not emitted; source-owner follow-up open -2295,CABLE-SNR-HOST-LANE-N,no source listed in catalog; no live gNMI/NMX-T source,ABSENT-BLOCKER,blocker,false token match (NMX-T has rx_power_lane_0/1 but no SNR field); not emitted; source-owner follow-up open -2296,NVSWITCH-CABLE-RX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-lower -2297,NVSWITCH-CABLE-TX-POWER-LANE-LOW-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/output-power-lower -2298,NVSWITCH-CABLE-RX-POWER-LANE-HIGH-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/input-power-upper -2299,NVSWITCH-CABLE-TX-POWER-LANE-HIGH-N,NVOS gNMI explicit allowlist,ABSENT-BLOCKER,blocker,gnmi leaf not live: /components/component/transceiver/thresholds/threshold/state/output-power-upper diff --git a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md b/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md deleted file mode 100644 index 1ae2614da0..0000000000 --- a/docs/architecture/health/nvswitch_telemetry_gb200_matrix.md +++ 
/dev/null @@ -1,93 +0,0 @@ -# NVSWITCH telemetry GB200 source matrix - -Generated from Stage 0 live-probe results (`nvswitch-stage0-live-coverage-20260620.md`) via -`catalog-coverage-final.csv`. Supersedes the pre-live-validation matrix generated from the raw -catalog extraction. - -CSV matrix: `docs/architecture/health/nvswitch_telemetry_gb200_matrix.csv` - -Columns: `catalog_row`, `metric_param_name`, `corrected_primary_source`, `final_status`, -`disposition`, `match_detail`. - -## Counts - -- Total GB200-applicable NVSWITCH rows: 193 - -### Disposition (post-live-probe) - -| Disposition | Count | Meaning | -|----------------|-------|-----------------------------------------------------------------------| -| implemented | 177 | PRESENT allowlist hit, IMPLEMENTED (NVUE REST / info / enum-coded / discovered live source), or covered by an existing label | -| blocker | 16 | ABSENT-BLOCKER — leaf/family not live on this platform | - -### final_status breakdown - -| final_status | Count | -|-------------------|-------| -| PRESENT | 136 | -| IMPLEMENTED | 41 | -| ABSENT-BLOCKER | 16 | - -## Blocker escalations - -See `nvswitch_telemetry_gb200_live_validation.md` section "Blocker escalations (Stage 0)" for the -full annotated list of 16 rows, grouped by root cause, with resolution path and re-probe -conditions. - -## Notes on implemented rows - -- **PRESENT** rows have an explicit gNMI or NMX-T allowlist mapping confirmed live by the Stage 0 - probe. No further work required before merge. -- **IMPLEMENTED via discovered live sources (5 rows)** — these had no direct catalog-listed source - originally (the catalog marked them CLI-only / "resolution required"), but each now has an - explicit, unit-tested emit path; `match_detail` records the concrete live leaf/endpoint: - - 764 OS-VERSION, 767 BMC-VERSION, 766 EROT-FW-VERSION → gNMI `platform-general/versions/state/` - `{nos-version,fw-version-bmc,fw-version-erot}` info-metrics (`platform_os/bmc/erot_version_info`). 
- - 942 PLR-BW-LOSS-PERCENT → gNMI `interfaces/interface/phy-diag/state/plr-bw-loss-percent` - (`interface_plr_bw_loss_percent`, percent). - - 967 FAN-LED → NVUE REST `/nvue_v1/platform/environment` `FAN_STATUS.state` (`fan_led`, - StateSet: `state` label, 0/1 per state — green/ok => ok, else not_ok). - - Audit note: an earlier pass token-matched 3 further rows (870 CPU_CORE_NUMBER, 2294 - CABLE-SNR-MEDIA-LANE-N, 2295 CABLE-SNR-HOST-LANE-N) on spurious substrings; on verification no - lane emits them, so they were re-classified to ABSENT-BLOCKER (see "Notes on blocker rows"). -- **IMPLEMENTED** rows are sourced beyond the plain gNMI/NMX-T allowlist: - - NVUE REST `/nvue_v1/platform/environment/{fan,temperature}` → MAX-SPEED (894); the 21 temp - `*-CRITICAL/MAX/STATE` rows (`.crit`/`.max`/`.state`) and the 8 `*-TEMP-CURRENT` rows - (`.current`), emitted per sensor as `platform_temperature{,_max,_critical,_state}` with a `sensor` label. - - gNMI `platform-general` subscribe path → the 4 memory/disk rows (`886-889`). - - String rows → `interface_phy_manager_state` (StateSet: `state` label, 0/1 per state), `*_info` info-metrics, and the existing `component_name` label (`ASIC-NAME`). - - 947 DOWN-BLAME → NMX-T `down_blame` emitted as a StateSet `down_blame` (`state` label: - unknown/local_phy/remote_phy, 0/1 per state), one series per port (no longer a re-exported label). - - 946 STATUS-MESSAGE → NMX-T `status_message` is free-text; emitted as an OTLP-only data-point - attribute (`status_message`) and excluded from Prometheus series to bound cardinality. - -## Notes on blocker rows - -No row is marked "deferred." Every blocker has an explicit escalation disposition: - -- **ABSENT-BLOCKER — cable/transceiver leaves (7 rows: 981, 982, 2293, 2296-2299):** the catalog's - gNMI transceiver-diag path is absent live — the N5400_LD NVLink switch enumerates **no gNMI - transceiver components** (confirmed live; 64+ active backplane links, so *not* an uncabled rig). 
- The 4 fault-flag rows (983-986: CABLE-TX/RX-CDR-LOL, CABLE-TX/RX-LOS) were **re-sourced to NMX-T** - (live flag families) and are now implemented. The remaining 7 (temp/vcc alarm flags, module - oper-status, RX/TX power-lane LOW/HIGH thresholds) have no NMX-T or gNMI source; escalate to the - NVOS gNMI / NMX-T owner re: NVLink cable optical telemetry. -- **ABSENT-BLOCKER — TIME-SINCE-LASTS-CLEAR (row 909):** gNMI leaf - `/interfaces/interface/phy-diag/state/time-since-last-clear-min` not live. Escalate to NVOS - gNMI owner for NVOS version confirmation. -- **ABSENT-BLOCKER — PLR-CODES-LOSS (row 931):** NMX-T field `HiRetransmissionRate` not live. - Escalate to NMX-T owner. -- **ABSENT-BLOCKER — NMX-T RDMA queue counters (rows 1706-1708):** RQ-NUM-WRFE, RQ-NUM-LLE, - SQ-NUM-WRFE — NMX-T fields `rq_num_wrfe`, `rq_num_lle`, `sq_num_wrfe` not live. Escalate to - NMX-T/RDMA owner. -- **ABSENT-BLOCKER — OS-KERNEL (row 765):** CLI-only, no gNMI or NMX-T token match. Requires a - new CLI collector or NVOS gNMI exposure; escalate to NVOS owner. -- **ABSENT-BLOCKER — CPU_CORE_NUMBER (row 870):** CLI-only (`nv show system cpu`); the catalog - lists no gNMI/NMX-T source. A prior pass spuriously token-matched the gNMI link knob - `core-to-phy-link-width-enabled` (a link-width config flag, not a CPU core count); no lane emits - it. Requires a new CLI collector or NVOS gNMI exposure; escalate to NVOS owner. -- **ABSENT-BLOCKER — CABLE-SNR-MEDIA-LANE-N / CABLE-SNR-HOST-LANE-N (rows 2294, 2295):** catalog - lists *no source* for either row. NMX-T exposes `rx_power_lane_0/1` (rows 977/978) but **no SNR - family**; a prior pass spuriously token-matched `rx_power_lane_5`/`cable-proto-cap-ext`. No lane - emits these. Source-owner follow-up is open (see live-validation runbook step 5) — keep open - until an NVLink per-lane SNR source is identified or the rows are declared N/A. 
diff --git a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md b/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md deleted file mode 100644 index 96890c3164..0000000000 --- a/docs/architecture/health/nvswitch_telemetry_nv_redfish_dependency.md +++ /dev/null @@ -1,72 +0,0 @@ -# NVSWITCH telemetry nv-redfish dependency notes - -> **Superseded (2026-06-18).** The standalone Redfish `TelemetryService` MetricReports -> collector described below was **removed**: live GB200 BMC probes show -> `/redfish/v1/TelemetryService` MetricReports are absent/404, `SwitchMetrics` are -> empty, histograms are empty, and `Ports` are absent. The corrected direction uses -> explicit, catalog-row allowlist mappings over the live BMC sensor/thermal surface -> and the live host NVOS gNMI / NMX-T surfaces. This file is retained for the -> nv-redfish dependency history only; the `telemetry-service` feature, the -> `[collectors.telemetry_service]` config, and the collector itself are no longer -> present in this branch. - -Generated during the GB200 NVSWITCH telemetry branch setup. - -## Current infra-controller dependency state - -- `Cargo.toml` pins `nv-redfish = { version = "0.10.0" }`. -- `Cargo.lock` resolves `nv-redfish`, `nv-redfish-bmc-http`, `nv-redfish-core`, `nv-redfish-schema`, and `nv-redfish-csdl-compiler` to `0.10.0` from crates.io. -- ~~This branch enables `telemetry-service` in `crates/health/Cargo.toml` for the new Redfish TelemetryService collector.~~ (Reverted: the `telemetry-service` feature and collector were removed; see the superseded banner above.) -- The GB200 branch has a local `nv-redfish` worktree available for companion development only: - - `${NV_REDFISH_WORKTREE}` - - Branch: `nvswitch_telemetry_gaps` - - Base: `origin/main` at `dbd2789c987fd320d263d87524fc25fde305bc7f` - -## Refreshed upstream state - -- Local `${NV_REDFISH_SOURCE_CHECKOUT}` was fetched from `origin` on 2026-06-18. 
-- Latest observed public tags: `v0.10.2`, `v0.10.1`, `v0.10.0`. -- `v0.10.2` does not appear to contain Fabric/Switch/Port/NVSwitch changes relevant to this work. -- `origin/main` includes a `telemetry-service` feature in `redfish/features.toml`. -- `origin/main` exposes `ServiceRoot::telemetry_service()` behind the `telemetry-service` feature. -- Neither `origin/main` nor `v0.10.2` has a `fabrics` feature or generated/wrapper hits for Fabric, Switch, Port, SwitchMetrics, or PortMetrics in the inspected source. - -## Dependency conclusion - -Historical note: TelemetryService MetricReports *could* in principle be wired in infra-controller by enabling `telemetry-service` and consuming the typed `TelemetryService` APIs available in nv-redfish 0.10.x. This was attempted and then **reverted** — live GB200 BMC exposes no usable MetricReports, so no TelemetryService collector is wired in this branch. - -Redfish Fabric/Switch/Port support needs companion `nv-redfish` work if GB200 live hardware or the catalog requires those paths. The companion work should add standard DMTF schema XMLs and feature entries for Fabric, Switch, Port, SwitchMetrics, PortMetrics, Endpoint, and Zone families, plus ergonomic ServiceRoot/Fabric/Switch navigation wrappers and mock tests. - -## Local development strategy - -During local development, keep user-local absolute paths out of committed manifests. 
Use Cargo local patching via command-line `--config` for experiments against the companion `nv-redfish` worktree, for example: - -```bash -cargo test -p carbide-health --lib --no-run \ - --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" -``` - -If companion changes touch internal nv-redfish crates, patch the affected packages too: - -```bash -cargo test -p carbide-health --lib --no-run \ - --config "patch.crates-io.nv-redfish.path=\"${NV_REDFISH_WORKTREE}/redfish\"" \ - --config "patch.crates-io.nv-redfish-core.path=\"${NV_REDFISH_WORKTREE}/core\"" \ - --config "patch.crates-io.nv-redfish-schema.path=\"${NV_REDFISH_WORKTREE}/schema\"" \ - --config "patch.crates-io.nv-redfish-csdl-compiler.path=\"${NV_REDFISH_WORKTREE}/csdl-compiler\"" \ - --config "patch.crates-io.nv-redfish-bmc-http.path=\"${NV_REDFISH_WORKTREE}/bmc-http\"" -``` - -## Final MR strategy - -Do not commit local absolute path dependencies. Before final review, use one of these acceptable states: - -1. A released `nv-redfish` version containing companion support, with `Cargo.toml` and `Cargo.lock` updated accordingly. -2. A reviewer-approved git revision dependency if release timing blocks final integration. -3. A documented split where infra-controller names the required `nv-redfish` companion MR and keeps local path overrides out of the final diff. - -## Branch implementation update - -~~The GB200 branch consumes the typed TelemetryService API already present in `nv-redfish` 0.10.0 (`ServiceRoot::telemetry_service()`, `TelemetryService::metric_report_links()`, and `MetricReportLink::fetch()`).~~ **Reverted.** The branch no longer consumes the TelemetryService API; the collector was removed after live GB200 probes returned no MetricReports. No local `nv-redfish` path dependency is committed. - -Direct Fabric/Switch/Port wrappers are still absent from `nv-redfish` 0.10.x and `origin/main` as inspected. 
BMC-side switch telemetry is now sourced from the live BMC sensor/thermal surface (not TelemetryService MetricReports), with the local companion worktree kept available if live GB200 evidence later proves a required metric is only available from Fabric/Switch/Port resources and not from the BMC sensor surface, NMX-T, or gNMI. From f96a515d5aca24dd2927522bc8074b6d5c43a972 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Fri, 26 Jun 2026 16:47:52 -0400 Subject: [PATCH 17/25] fix(health): prevent empty labels from propagating. Update example config Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 8 ++- crates/health/src/collectors/nmxt.rs | 68 ++++++++++++++--------- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 56ed50ea78..769787919e 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -226,10 +226,12 @@ platform_environment_temperature_enabled = true platform_environment_status_enabled = true # NVUE gNMI streaming collector, disabled by default. Subscribes to -# gNMI SAMPLE paths (components + interfaces) and pushes metrics through -# the configured sinks. gNMI ON_CHANGE targets system-events +# gNMI SAMPLE paths (components + interfaces, plus platform_general when +# platform_general_enabled is true) and pushes metrics through the configured +# sinks. 
gNMI ON_CHANGE targets system-events [collectors.nvue.gnmi] -# periodic SAMPLE +# periodic SAMPLE (components, interfaces, and platform_general when +# platform_general_enabled is true) gnmi_port = 9339 sample_interval = "5m" request_timeout = "30s" diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 61f4776272..bd02cdb5a7 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -365,6 +365,13 @@ fn down_blame_to_state(raw: &str) -> &'static str { } } +fn required_port_num(sample_labels: &HashMap) -> Option<&str> { + sample_labels + .get("Port_Number") + .map(String::as_str) + .filter(|port_num| !port_num.is_empty()) +} + /// Test-only; production iterates `NMXT_LABEL_MAP` directly in `build_labels`. #[cfg(test)] fn lookup_nmxt_label(key: &str) -> Option<&'static NmxtLabel> { @@ -588,11 +595,10 @@ impl NmxtCollector { .get("Module_Temperature") .and_then(|raw| cable_temp_to_celsius(raw)) { - let port_num = sample_labels - .get("Port_Number") - .cloned() - .unwrap_or_default(); - if cable_temp_ports.insert(port_num.clone()) { + let Some(port_num) = required_port_num(&sample_labels) else { + continue; + }; + if cable_temp_ports.insert(port_num.to_string()) { let labels = self.build_labels(&switch_ip, &sample_labels); self.emit_event(CollectorEvent::Metric( MetricSample { @@ -612,11 +618,10 @@ impl NmxtCollector { // `down_blame` is a closed enum riding as a label; emit it as a per-port StateSet // (one 0/1 series per state) before the family check, once per port. 
if let Some(raw) = sample_labels.get("down_blame") { - let port_num = sample_labels - .get("Port_Number") - .cloned() - .unwrap_or_default(); - if down_blame_ports.insert(port_num.clone()) { + let Some(port_num) = required_port_num(&sample_labels) else { + continue; + }; + if down_blame_ports.insert(port_num.to_string()) { let current = down_blame_to_state(raw); for state in DOWN_BLAME_STATES { let mut labels = self.build_labels(&switch_ip, &sample_labels); @@ -644,15 +649,14 @@ impl NmxtCollector { let (metric_type, unit) = (metric.metric_type, metric.unit); // Port number anchors the per-series key. - let port_num = sample_labels - .get("Port_Number") - .cloned() - .unwrap_or_default(); + let Some(port_num) = required_port_num(&sample_labels) else { + continue; + }; let mut metric_key = String::with_capacity(metric_type.len() + 1 + port_num.len()); metric_key.push_str(metric_type); metric_key.push(':'); - metric_key.push_str(&port_num); + metric_key.push_str(port_num); let labels = self.build_labels(&switch_ip, &sample_labels); @@ -727,6 +731,20 @@ Link_Down{Port_Number="1"} 5 assert_eq!(samples.len(), 4); } + #[test] + fn test_required_port_num_requires_present_non_empty_label() { + let missing = HashMap::new(); + assert_eq!(required_port_num(&missing), None); + + let mut empty = HashMap::new(); + empty.insert("Port_Number".to_string(), String::new()); + assert_eq!(required_port_num(&empty), None); + + let mut present = HashMap::new(); + present.insert("Port_Number".to_string(), "11".to_string()); + assert_eq!(required_port_num(&present), Some("11")); + } + /// Live NMX-T `lid` series from the Stage-0 GB200 scrape (`nmxt-prometheus.txt`). 
const SAMPLE_LID_LINE: &str = r#"lid{Device_ID="GB100", port_label="GPUP10", logical_state="ACT", device_num_on_tray="2", board_type="3", chassis_slot_index="27", tray_index="17", topology_id="128", chassis_id="1820325172739", Active_FEC="Int_KP4_FEC_PLR", link_partner_description="MF0;sw06:N5400_LD/U1", link_partner_node_guid="0x2c5eab0300b6a900", link_partner_port_num="71", cable_vendor="Other", down_blame="Unknown", local_reason_opcode="No_link_down_indication", Node_GUID="0xe1d04a69816f16bc", node_description="GB100 Nvidia Technologies", Port_Number="11", FW_Version="36.2014.1866", Cable_PN="NA", Cable_SN="NA", cable_type="850 nm VCSEL", cable_length="NA", cable_identifier="Backplane", vendor_rev="NA", cable_fw_version="N/A", Module_Temperature="0C", Status_Message="No issue was observed", port_guid="0xe1d04a69816f16c6", sw_serial_number="MT123", sw_revision="A1", remote_reason_opcode="4"} 3093 1781993954087"#; @@ -1005,12 +1023,10 @@ Link_Down{Port_Number="1"} 5 for line in lines { let sample = parse_prometheus_line(line).expect("parse line"); if let Some(raw) = sample.labels.get("down_blame") { - let port_num = sample - .labels - .get("Port_Number") - .cloned() - .unwrap_or_default(); - if down_blame_ports.insert(port_num.clone()) { + let Some(port_num) = required_port_num(&sample.labels) else { + continue; + }; + if down_blame_ports.insert(port_num.to_string()) { let current = down_blame_to_state(raw); for state in DOWN_BLAME_STATES { let mut labels = collector.build_labels(&switch_ip, &sample.labels); @@ -1122,12 +1138,10 @@ Link_Down{Port_Number="1"} 5 .get("Module_Temperature") .and_then(|raw| cable_temp_to_celsius(raw)) { - let port_num = sample - .labels - .get("Port_Number") - .cloned() - .unwrap_or_default(); - if cable_temp_ports.insert(port_num.clone()) { + let Some(port_num) = required_port_num(&sample.labels) else { + continue; + }; + if cable_temp_ports.insert(port_num.to_string()) { let labels = collector.build_labels(&switch_ip, 
&sample.labels); collector.emit_event(CollectorEvent::Metric( MetricSample { From 7bf26d6c25fa3ecc87d20ce1c581c07b51c4f656 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:40:55 -0400 Subject: [PATCH 18/25] fix(health): default to strict TLS verification. add optional flag in config for dev Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 3 + .../health/src/collectors/nvue/gnmi/client.rs | 119 ++++++++++++------ .../src/collectors/nvue/gnmi/subscriber.rs | 4 + crates/health/src/config.rs | 61 +++++++++ 4 files changed, 151 insertions(+), 36 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 769787919e..5314a0b22f 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -235,6 +235,9 @@ platform_environment_status_enabled = true gnmi_port = 9339 sample_interval = "5m" request_timeout = "30s" +# Keep strict TLS certificate and hostname verification by default. Set true only +# for lab/self-signed NVOS gNMI endpoints where that dangerous bypass is required. 
+dangerously_skip_tls_verification = false # streaming ON_CHANGE system_events_enabled = true diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 4de235ce87..6d48970f9b 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -105,6 +105,35 @@ pub struct GnmiClient { username: Option, password: Option, request_timeout: Duration, + dangerously_skip_tls_verification: bool, +} + +fn configure_tls_endpoint( + endpoint: Endpoint, + switch_id: &str, + dangerously_skip_tls_verification: bool, +) -> Result { + if !dangerously_skip_tls_verification { + return Ok(endpoint); + } + + // tonic 0.14 auto-injects a strict WebPKI/system-root TLS verifier when an + // Endpoint is built from an `https://` URI and layers its own TlsConnector + // over any custom connector (see tonic transport channel/service/connector.rs). + // That silently negated a hand-rolled hyper-rustls skip-verify connector and + // made tonic strictly reject the switch's self-signed NVOS gNMI cert (SAN does + // not cover the management IP). When the dangerous opt-in is enabled, use + // tonic's native custom-verifier hook so the skip-verify verifier is the one + // tonic actually applies. ClientTlsConfig::new() must NOT set any roots here + // (mixing roots + custom verifier is an error). 
+ endpoint + .tls_config_with_verifier( + ClientTlsConfig::new(), + crate::collectors::nvue::tls::accept_any_cert_verifier(), + ) + .map_err(|e| { + HealthError::GnmiError(format!("switch {switch_id}: invalid gNMI TLS config: {e}")) + }) } impl GnmiClient { @@ -115,6 +144,7 @@ impl GnmiClient { username: Option, password: Option, request_timeout: Duration, + dangerously_skip_tls_verification: bool, ) -> Self { Self { switch_id, @@ -123,6 +153,7 @@ impl GnmiClient { username, password, request_timeout, + dangerously_skip_tls_verification, } } @@ -141,43 +172,34 @@ impl GnmiClient { )) })?; - // tonic 0.14 auto-injects a strict WebPKI/system-root TLS verifier when an - // Endpoint is built from an `https://` URI and layers its own TlsConnector - // over any custom connector (see tonic transport channel/service/connector.rs). - // That silently negated a hand-rolled hyper-rustls skip-verify connector and - // made tonic strictly reject the switch's self-signed NVOS gNMI cert (SAN does - // not cover the management IP). Use tonic's native custom-verifier hook so the - // skip-verify verifier is the one tonic actually applies. ClientTlsConfig::new() - // must NOT set any roots here (mixing roots + custom verifier is an error). - let endpoint = Endpoint::from(uri) - .tls_config_with_verifier( - ClientTlsConfig::new(), - crate::collectors::nvue::tls::accept_any_cert_verifier(), - ) - .map_err(|e| { - HealthError::GnmiError(format!( - "switch {}: invalid gNMI TLS config: {e}", - self.switch_id - )) - })? - .connect_timeout(self.request_timeout) - .timeout(self.request_timeout); - - let channel = endpoint - .connect() - .await - .map_err(|e| { - HealthError::GnmiError(format!( - "switch {}: connection failed to {target}: {e}", - self.switch_id - )) - })?; + let endpoint = configure_tls_endpoint( + Endpoint::from(uri), + &self.switch_id, + self.dangerously_skip_tls_verification, + )? 
+ .connect_timeout(self.request_timeout) + .timeout(self.request_timeout); + + let channel = endpoint.connect().await.map_err(|e| { + HealthError::GnmiError(format!( + "switch {}: connection failed to {target}: {e}", + self.switch_id + )) + })?; - tracing::debug!( - switch_id = %self.switch_id, - target = %target, - "gNMI TLS channel established (skip-verify)" - ); + if self.dangerously_skip_tls_verification { + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established with certificate verification disabled" + ); + } else { + tracing::debug!( + switch_id = %self.switch_id, + target = %target, + "gNMI TLS channel established" + ); + } Ok(TonicGnmiClient::new(channel)) } @@ -481,6 +503,31 @@ mod tests { assert_eq!(typed_value_to_f64(&val), None); } + #[test] + fn test_gnmi_client_stores_dangerous_tls_skip_flag() { + let strict = GnmiClient::new( + "switch-1".to_string(), + "10.0.0.9", + 9339, + None, + None, + Duration::from_secs(30), + false, + ); + assert!(!strict.dangerously_skip_tls_verification); + + let dangerous = GnmiClient::new( + "switch-1".to_string(), + "10.0.0.9", + 9339, + None, + None, + Duration::from_secs(30), + true, + ); + assert!(dangerous.dangerously_skip_tls_verification); + } + #[test] fn test_nvue_subscribe_paths_all_enabled() { let paths = nvue_subscribe_paths(&NvueGnmiPaths::default()); diff --git a/crates/health/src/collectors/nvue/gnmi/subscriber.rs b/crates/health/src/collectors/nvue/gnmi/subscriber.rs index 7c4f8e4bfb..156aabdb4f 100644 --- a/crates/health/src/collectors/nvue/gnmi/subscriber.rs +++ b/crates/health/src/collectors/nvue/gnmi/subscriber.rs @@ -187,6 +187,7 @@ struct GnmiClientProvider { switch_ip: String, port: u16, request_timeout: Duration, + dangerously_skip_tls_verification: bool, credentials: Arc, } @@ -215,6 +216,7 @@ impl GnmiClientProvider { credentials.username, credentials.password, self.request_timeout, + self.dangerously_skip_tls_verification, ), generation, )) @@ -459,6 
+461,7 @@ pub fn spawn_gnmi_collector( switch_ip, port: gnmi_config.gnmi_port, request_timeout: gnmi_config.request_timeout, + dangerously_skip_tls_verification: gnmi_config.dangerously_skip_tls_verification, credentials: Arc::new(GnmiCredentialCache::new( credential_provider, endpoint.addr.clone(), @@ -846,6 +849,7 @@ mod tests { switch_ip: addr.ip.to_string(), port: 9339, request_timeout: Duration::from_secs(1), + dangerously_skip_tls_verification: false, credentials: Arc::new(GnmiCredentialCache::new(provider, addr)), } } diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index f6be64275f..e15403a0e3 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -958,6 +958,11 @@ pub struct NvueGnmiConfig { #[serde(with = "humantime_serde")] pub request_timeout: Duration, + /// Dangerously disable TLS certificate and hostname verification for NVUE gNMI. + /// + /// Defaults to false so strict TLS verification remains the default. + pub dangerously_skip_tls_verification: bool, + /// Enable gNMI ON_CHANGE subscription for live system-event messages. 
#[serde(alias = "system_events_subscription_enabled", alias = "events_enabled")] pub system_events_enabled: bool, @@ -972,6 +977,7 @@ impl Default for NvueGnmiConfig { gnmi_port: 9339, sample_interval: Duration::from_secs(300), request_timeout: Duration::from_secs(30), + dangerously_skip_tls_verification: false, system_events_enabled: true, paths: NvueGnmiPaths::default(), } @@ -1371,6 +1377,7 @@ mod tests { assert_eq!(gnmi.gnmi_port, 9339); assert_eq!(gnmi.sample_interval, Duration::from_secs(300)); assert_eq!(gnmi.request_timeout, Duration::from_secs(30)); + assert!(!gnmi.dangerously_skip_tls_verification); assert!(gnmi.system_events_enabled); } else { panic!("nvue gnmi config should be enabled in example config"); @@ -1882,6 +1889,60 @@ system_events_enabled = false } } + #[test] + fn test_nvue_gnmi_dangerous_tls_skip_defaults_false_and_parses_true() { + let omitted = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(omitted)) + .extract() + .expect("failed to parse omitted tls flag"); + + let Configurable::Enabled(nvue) = config.collectors.nvue else { + panic!("nvue config should be enabled"); + }; + let Configurable::Enabled(gnmi) = nvue.gnmi else { + panic!("gnmi config should be enabled"); + }; + assert!(!gnmi.dangerously_skip_tls_verification); + + let enabled = r#" +[endpoint_sources.carbide_api] +enabled = false + +[sinks.health_report] +enabled = false + +[collectors.nvue.gnmi] +gnmi_port = 9339 +dangerously_skip_tls_verification = true +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(enabled)) + .extract() + .expect("failed to parse enabled tls flag"); + + let Configurable::Enabled(nvue) = config.collectors.nvue else { + panic!("nvue config should be enabled"); + }; + let 
Configurable::Enabled(gnmi) = nvue.gnmi else { + panic!("gnmi config should be enabled"); + }; + assert!(gnmi.dangerously_skip_tls_verification); + } + #[test] fn test_static_endpoint_with_switch_serial() { let toml_content = r#" From 3151bfcf57a099c003ca52132b83dea0a93e4ac9 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 01:16:52 +0200 Subject: [PATCH 19/25] lint(health): fix Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/otlp/convert.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index c74afc9cfc..b0c2cd0497 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -256,15 +256,15 @@ pub fn build_metrics_export_request( // promote switch identity onto the datapoint so dashboards filtering on // `switch_serial`/`switch_id` (underscore label form) match; these otherwise // only exist as OTLP *resource* attributes (`switch.serial`/`switch.id`). 
- if !attributes.iter().any(|attr| attr.key == "switch_serial") { - if let Some(serial) = context.switch_serial() { - attributes.push(kv("switch_serial", serial.to_string())); - } + if !attributes.iter().any(|attr| attr.key == "switch_serial") + && let Some(serial) = context.switch_serial() + { + attributes.push(kv("switch_serial", serial.to_string())); } - if !attributes.iter().any(|attr| attr.key == "switch_id") { - if let Some(switch_id) = context.switch_id() { - attributes.push(kv("switch_id", switch_id.to_string())); - } + if !attributes.iter().any(|attr| attr.key == "switch_id") + && let Some(switch_id) = context.switch_id() + { + attributes.push(kv("switch_id", switch_id.to_string())); } let data_point = NumberDataPoint { @@ -804,6 +804,9 @@ mod tests { }; let attrs = &gauge.data_points[0].attributes; assert_eq!(attr_value(attrs, "switch_serial"), Some("SN-SWITCH-001")); - assert_eq!(attr_value(attrs, "switch_id"), Some(switch_id_attr.as_str())); + assert_eq!( + attr_value(attrs, "switch_id"), + Some(switch_id_attr.as_str()) + ); } } From 52ceca58a42608d0d5a553b71e4e15d8f5732b78 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 02:25:59 +0200 Subject: [PATCH 20/25] chore(health): remove leftover GB200 NVSWITCH matrix generator The generated matrix/validation docs were already dropped in 3b0a075c61 (chore(health): remove temp docs from repo), but the one-shot generator script was missed. It has no callers, its required inputs are not in the repo, and its outputs are no longer tracked, so it cannot run from a clean checkout. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- dev/bin/generate_nvswitch_gb200_matrix.py | 464 ---------------------- 1 file changed, 464 deletions(-) delete mode 100755 dev/bin/generate_nvswitch_gb200_matrix.py diff --git a/dev/bin/generate_nvswitch_gb200_matrix.py b/dev/bin/generate_nvswitch_gb200_matrix.py deleted file mode 100755 index e2e5aadc18..0000000000 --- a/dev/bin/generate_nvswitch_gb200_matrix.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python3 -"""Generate the GB200 NVSWITCH telemetry source matrix. - -The source workbook is not tracked. Pass sanitized catalog extraction artifacts with -``--rows-csv`` and ``--coverage-json`` when regenerating review artifacts. -""" - -from __future__ import annotations - -import argparse -import csv -import json -import re -from collections import Counter -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[2] -DEFAULT_OUT_DIR = ROOT / "docs/architecture/health" -DEFAULT_OUT_CSV = DEFAULT_OUT_DIR / "nvswitch_telemetry_gb200_matrix.csv" -DEFAULT_OUT_MD = DEFAULT_OUT_DIR / "nvswitch_telemetry_gb200_matrix.md" - -GB200_COLUMNS = [ - "Applicable for \nGB200 NVL HMC", - "Applicable for \nGB200 NVL BMC", - "Applicable for\nGB200 NVL NvswitchTray", -] - -COL_METRIC = "Metric (ParamName)" -COL_GUID = "Telemetry GUID (Device+ParamName)" -COL_DEVICE = "Device \n(CompClass)" -COL_CATEGORY = "Category\n(ParamClass)" -COL_DATA_TYPE = "Data\nType" -COL_DESC = "Description" -COL_AVAIL = "Availability\n(IB/OOB/BOTH/NONE)" -COL_WILDCARD = "OOB API - Wildcards\n(Redfish URI and Field. N/A for NvSwitch Tray)" -COL_URI_DOMAIN = "URI Search Domain" -COL_URI_MATCH = "URI Match Criteria for Search Domain" -COL_OTLP = "OTLP" -COL_ONBOARD = "Onboard API (dbus path etc. within HMC/BMC)" -COL_NMXT = "Hi @zhillel@nvidia.com, IIUC these interfaces will be applicable even if there is single or no compute node at all correct in the rack ? 
so no need to say its \"applicable for multi node\" ?\n_Assigned to Ziv Hillel IL_\n-Pradeep Kumar Shima US\nNMX-T(applicable for MultiNode)" -COL_GNMI = "NVOS gNMI(applicable for MultiNode)" -COL_CLI_2502 = "Format of this column:\nline-1: nvos cli command with any placeholder for Id starting with \"$\"\nline-2 (Optional): Search criteria/filter for finding the applicable IDs for the placeholder in column. If this line isn't present, we'll look at all available Ids (interfaces, fans etc.)\nline-3: Property to check enclosed in curly braces. For example, {voltage}. For nested properties, curly braces can be used. E.g. {link{counters}}\n-Afsana Chowdhury US\nNVOS CLI v25.02.4282 (applicable for MultiNode)" -COL_CLI_2503 = "NVOS CLI v25.03.XXXX (applicable for MultiNode)" -COL_REDFISH_GB = "OOB API on GH200 NVL/GB200 NVL/GB300 NVL/MGX-4U-NVL16/Vera Rubin NVL72\n(Redfish URI and Field. N/A for NvSwitch Tray)" -COL_REDFISH_DGX = "Candidate to get rid of\n-Afsana Chowdhury US\nCheck with Jim and Joe about partners' usage\n-Afsana Chowdhury US\nOOB API on GH200/C2/DGX Station GB300\n(Redfish URI and Field. 
N/A for NvSwitch Tray)" -COL_MRD = "MRD URI on Hopper-HGX-8-GPU/Blackwell-HGX-8-GPU/GH200/GB200/HGX B300 NVL8/GB300/MGX-4U-NVL16\n(N/A for NvSwitch Tray)" - -SOURCE_COLUMNS = { - "redfish_gb": COL_REDFISH_GB, - "redfish_dgx_or_c2": COL_REDFISH_DGX, - "redfish_wildcard": COL_WILDCARD, - "mrd": COL_MRD, - "nvos_gnmi": COL_GNMI, - "nmx_t": COL_NMXT, - "nvos_cli_2503": COL_CLI_2503, - "nvos_cli_2502": COL_CLI_2502, - "onboard_dbus": COL_ONBOARD, - "otlp": COL_OTLP, -} - -NA_VALUES = {"", "NA", "N/A", "#N/A", "NONE", "TBD", "N.A."} -GENERIC_INFRA_FAMILIES = { - "Redfish TelemetryService", - "Redfish Fabric/Switch/Port", - "NVOS gNMI", - "NMX-T", -} - - -def clean(value: str | None) -> str: - if value is None: - return "" - return re.sub(r"\s+", " ", value.replace("\xa0", " ")).strip() - - -def has_value(value: str | None) -> bool: - c = clean(value) - return bool(c) and c.upper() not in NA_VALUES - - -def yes(value: str | None) -> bool: - return clean(value).lower() == "yes" - - -def snake(metric: str) -> str: - return re.sub(r"[^a-z0-9]+", "_", metric.lower()).strip("_") - - -def load_coverage(coverage_json: Path) -> dict[int, dict[str, str]]: - if not coverage_json.exists(): - return {} - data = json.loads(coverage_json.read_text()) - out: dict[int, dict[str, str]] = {} - for section in ("covered", "partial", "gaps"): - for item in data.get(section, []): - out[int(item["row"])] = item - return out - - -def extract_sources(row: dict[str, str]) -> dict[str, str]: - sources = {} - for name, col in SOURCE_COLUMNS.items(): - val = row.get(col, "") - if has_value(val): - sources[name] = clean(val) - return sources - - -def source_family(source_name: str, value: str) -> str: - text = f"{source_name} {value}".lower() - if "telemetryservice" in text or "metricreport" in text or source_name == "mrd": - return "Redfish TelemetryService" - if source_name.startswith("redfish"): - return "Redfish Fabric/Switch/Port" - if source_name == "nvos_gnmi": - return "NVOS gNMI" - if 
source_name == "nmx_t": - return "NMX-T" - if source_name.startswith("nvos_cli"): - return "NVOS CLI" - if source_name == "onboard_dbus": - return "Onboard DBus" - if source_name == "otlp": - return "OTLP" - return source_name - - -def choose_sources(row: dict[str, str], sources: dict[str, str], metric: str = "") -> tuple[str, str, str, str]: - existing_primary = { - "PORT-RCV-ERRORS": "nvos_gnmi", - "PORT-XMIT-CONSTRAINTS-ERRORS": "nvos_gnmi", - "EFFECTIVE-BER": "nvos_gnmi", - "SYMBOL-BER": "nvos_gnmi", - "PHY-SYMBOL-ERRORS": "nmx_t", - } - if not sources: - return ( - "SOURCE UNLISTED live source resolution", - "", - "No catalog source listed for GB200 row; resolve during live validation", - "source-resolution required before live signoff", - ) - - availability = clean(row.get(COL_AVAIL, "")).upper() - tray = yes(row.get("Applicable for\nGB200 NVL NvswitchTray")) - hmc_or_bmc = yes(row.get("Applicable for \nGB200 NVL HMC")) or yes(row.get("Applicable for \nGB200 NVL BMC")) - tray_only = tray and not hmc_or_bmc - - ordered_names = [] - if any(k in sources for k in ("mrd",)): - ordered_names.append("mrd") - if any(k in sources for k in ("redfish_gb", "redfish_wildcard", "redfish_dgx_or_c2")) and not tray_only: - ordered_names.extend(["redfish_wildcard", "redfish_gb", "redfish_dgx_or_c2"]) - if "nvos_gnmi" in sources: - if "IB" in availability or tray_only: - ordered_names.insert(0, "nvos_gnmi") - else: - ordered_names.append("nvos_gnmi") - if "nmx_t" in sources: - ordered_names.append("nmx_t") - if "nvos_cli_2503" in sources: - ordered_names.append("nvos_cli_2503") - if "nvos_cli_2502" in sources: - ordered_names.append("nvos_cli_2502") - if "onboard_dbus" in sources: - ordered_names.append("onboard_dbus") - if "otlp" in sources: - ordered_names.append("otlp") - - seen = set() - available_ordered = [] - for name in ordered_names: - if name in sources and name not in seen: - seen.add(name) - available_ordered.append(name) - for name in sources: - if name not in 
seen: - available_ordered.append(name) - - if metric in existing_primary and existing_primary[metric] in available_ordered: - available_ordered.remove(existing_primary[metric]) - available_ordered.insert(0, existing_primary[metric]) - - primary_name = available_ordered[0] - fallback_name = available_ordered[1] if len(available_ordered) > 1 else "" - primary = source_family(primary_name, sources[primary_name]) - fallback = source_family(fallback_name, sources[fallback_name]) if fallback_name else "" - precedence_parts = [] - for name in available_ordered: - family = source_family(name, sources[name]) - if family not in precedence_parts: - precedence_parts.append(family) - precedence = " then ".join(precedence_parts) - return primary, fallback, precedence, "one canonical series unless source-qualified duplicate is justified" - - -def is_redfish_sensor_range(redfish_path: str) -> bool: - return "/Sensors/" in redfish_path and ( - "ReadingRangeMax" in redfish_path or "ReadingRangeMin" in redfish_path - ) - - -def sensor_range_surface(redfish_path: str) -> str: - if "ReadingRangeMax" in redfish_path: - return "hw_sensor {reading_type}_range_max MetricSample with sensor_range=reading_range_max" - if "ReadingRangeMin" in redfish_path: - return "hw_sensor {reading_type}_range_min MetricSample with sensor_range=reading_range_min" - return "hw_sensor range MetricSample" - - -def target_collector(primary: str, sources: dict[str, str], redfish_path: str) -> str: - if is_redfish_sensor_range(redfish_path): - return "existing SensorsCollector range emission when include_sensor_thresholds=true" - if primary.startswith("SOURCE UNLISTED"): - return "live source resolution required; generic Redfish/NMX-T/gNMI collectors will expose the row if emitted" - if primary == "Redfish TelemetryService": - return "new NvSwitchTelemetryServiceCollector behind collectors.telemetry_service" - if primary == "Redfish Fabric/Switch/Port": - return "new NvSwitchRedfishCollector for switch BMC 
endpoints" - if primary == "NVOS gNMI": - return "extend NvueGnmiCollector sample paths/processors" - if primary == "NMX-T": - return "extend NmxtCollector mapping" - if primary == "NVOS CLI": - if "nvos_gnmi" in sources: - return "prefer NVOS gNMI equivalent; live source-equivalence required if no streamed equivalent exists" - return "live source-equivalence required; prefer Redfish TelemetryService, NVOS gNMI, or NMX-T before adding CLI collector" - if primary == "Onboard DBus": - return "live source-equivalence required; prefer Redfish exposure before adding DBus collector" - if primary == "OTLP": - return "live source-equivalence required; upstream OTLP source contract needed if not exposed elsewhere" - return "TBD collector" - - -def has_generic_infra_source(sources: dict[str, str]) -> bool: - return any( - source_family(source_name, source_value) in GENERIC_INFRA_FAMILIES - for source_name, source_value in sources.items() - ) - - -def branch_coverage( - primary: str, - sources: dict[str, str], - cov_status: str, - cov_reason: str, -) -> tuple[str, str, str]: - if cov_status.startswith("covered"): - return cov_status, "already-covered-regression-required", cov_reason - - if primary.startswith("SOURCE UNLISTED") or not sources: - return ( - "source_resolution_required", - "requires-live-source-resolution", - "Catalog row has no source path/name; live validation must identify a Redfish, NMX-T, or gNMI source if the device emits it.", - ) - - if has_generic_infra_source(sources): - return ( - "covered_generic_infra_unvalidated", - "covered-by-generic-infra-requires-live-validation", - "GB200 branch generic Redfish MetricReport, NMX-T, and NVUE gNMI preservation can emit this row; live hardware validation must confirm the concrete device path/name.", - ) - - return ( - "source_equivalent_required", - "requires-live-source-equivalent", - "Catalog lists only source families that are not collected directly; live validation must find an equivalent Redfish, NMX-T, or 
gNMI exposure before signoff.", - ) - - -def emitted_surface(metric: str, data_type: str, coverage: str, redfish_path: str) -> str: - if is_redfish_sensor_range(redfish_path): - return sensor_range_surface(redfish_path) - existing = { - "PORT-RCV-ERRORS": "existing interface_in_errors MetricSample", - "PORT-XMIT-CONSTRAINTS-ERRORS": "existing interface_out_errors MetricSample", - "EFFECTIVE-BER": "existing interface_effective_ber MetricSample", - "SYMBOL-BER": "existing interface_symbol_ber MetricSample", - "PHY-SYMBOL-ERRORS": "existing switch_nmxt symbol_errors MetricSample", - } - if metric in existing and coverage.startswith("covered"): - return existing[metric] - dtype = clean(data_type).lower() - base = f"nvswitch_{snake(metric)}" - if "text" in dtype or "string" in dtype: - return f"{base} as inventory/info event or state metric with bounded labels" - if "bool" in dtype or "enum" in dtype or "status" in dtype: - return f"{base} as numeric state MetricSample" - return f"{base} MetricSample" - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--rows-csv", - required=True, - type=Path, - help="Sanitized NVSWITCH rows extracted from the telemetry catalog workbook.", - ) - parser.add_argument( - "--coverage-json", - required=True, - type=Path, - help="Coverage heuristic JSON for the sanitized NVSWITCH rows.", - ) - parser.add_argument( - "--out-csv", - default=DEFAULT_OUT_CSV, - type=Path, - help="Output CSV path.", - ) - parser.add_argument( - "--out-md", - default=DEFAULT_OUT_MD, - type=Path, - help="Output Markdown summary path.", - ) - return parser.parse_args() - - -def display_path(path: Path) -> str: - try: - return str(path.relative_to(ROOT)) - except ValueError: - return str(path) - - -def main() -> None: - args = parse_args() - rows_csv = args.rows_csv.resolve() - coverage_json = args.coverage_json.resolve() - out_csv = args.out_csv.resolve() - out_md = args.out_md.resolve() - 
out_dir = out_csv.parent - - coverage = load_coverage(coverage_json) - out_dir.mkdir(parents=True, exist_ok=True) - with rows_csv.open(newline="") as f: - rows = list(csv.DictReader(f)) - - out_rows = [] - for row in rows: - if "nvswitch" not in clean(row.get(COL_DEVICE, "")).lower(): - continue - applicable_cols = [col for col in GB200_COLUMNS if yes(row.get(col))] - if not applicable_cols: - continue - row_no = int(row["__ods_row_number"]) - metric = clean(row.get(COL_METRIC, "")) - sources = extract_sources(row) - primary, fallback, precedence, duplicate_policy = choose_sources(row, sources, metric) - cov = coverage.get(row_no, {}) - cov_status = clean(cov.get("coverage", "gap")) or "gap" - cov_reason = clean(cov.get("coverage_reason", "")) - redfish_path = clean(row.get(COL_URI_DOMAIN)) or clean(row.get(COL_WILDCARD)) or clean(row.get(COL_REDFISH_GB)) or clean(row.get(COL_MRD)) - branch_cov_status, implementation_status, branch_cov_reason = branch_coverage( - primary, - sources, - cov_status, - cov_reason, - ) - - out_rows.append({ - "catalog_row": row_no, - "guid": clean(row.get(COL_GUID, "")), - "metric_param_name": metric, - "description": clean(row.get(COL_DESC, "")), - "category": clean(row.get(COL_CATEGORY, "")), - "data_type": clean(row.get(COL_DATA_TYPE, "")), - "gb200_applicability": "; ".join(col.replace("Applicable for", "").replace("\n", " ").strip() for col in applicable_cols), - "availability": clean(row.get(COL_AVAIL, "")), - "source_families": "; ".join(dict.fromkeys(source_family(k, v) for k, v in sources.items())), - "primary_source": primary, - "fallback_source": fallback, - "source_precedence": precedence, - "duplicate_alias_policy": duplicate_policy, - "target_collector": target_collector(primary, sources, redfish_path), - "target_emitted_surface": emitted_surface(metric, row.get(COL_DATA_TYPE, ""), cov_status, redfish_path), - "current_coverage": branch_cov_status, - "implementation_status": implementation_status, - "coverage_reason": 
branch_cov_reason, - "redfish_or_mrd_path": redfish_path, - "nvos_gnmi_path": clean(row.get(COL_GNMI, "")), - "nmx_t_field": clean(row.get(COL_NMXT, "")), - "nvos_cli_reference": clean(row.get(COL_CLI_2503, "")) or clean(row.get(COL_CLI_2502, "")), - "onboard_dbus_reference": clean(row.get(COL_ONBOARD, "")), - "test_fixture_plan": "required before review: parser/unit fixture plus metric emission assertion; live GB evidence during post-review validation", - "live_validation_plan": "validate on GB200 NVLink Switch BMC/HOST after branch build-test-lint review", - }) - - fieldnames = list(out_rows[0].keys()) if out_rows else [] - out_dir.mkdir(parents=True, exist_ok=True) - with out_csv.open("w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") - writer.writeheader() - writer.writerows(out_rows) - - counts = Counter(r["implementation_status"] for r in out_rows) - primary_counts = Counter(r["primary_source"] for r in out_rows) - coverage_counts = Counter(r["current_coverage"] for r in out_rows) - md = [ - "# NVSWITCH telemetry GB200 source matrix", - "", - "Generated from sanitized Telemetry Catalog extraction artifacts for rows where `Device (CompClass)` is NVSWITCH and one of the GB200 columns is `Yes`:", - "", - "- `Applicable for GB200 NVL HMC`", - "- `Applicable for GB200 NVL BMC`", - "- `Applicable for GB200 NVL NvswitchTray`", - "", - f"CSV matrix: `{display_path(out_csv)}`", - "", - "## Counts", - "", - f"- Total GB200-applicable NVSWITCH rows: {len(out_rows)}", - "", - "### Implementation status", - "", - ] - for key, value in sorted(counts.items()): - md.append(f"- {key}: {value}") - md.extend(["", "### Branch coverage status", ""]) - for key, value in sorted(coverage_counts.items()): - md.append(f"- {key}: {value}") - md.extend(["", "### Primary source", ""]) - for key, value in sorted(primary_counts.items()): - md.append(f"- {key}: {value}") - md.extend([ - "", - "## GB200 branch implementation coverage", - "", - 
"The `nvswitch_telemetry_gaps` branch implements common GB+VR-friendly collector infrastructure for the GB200 phase:", - "", - "- Redfish BMC: enabled `nv-redfish` `telemetry-service`, added a switch-BMC-only TelemetryService collector, and emits every numeric/boolean/string `MetricReport` value as `redfish_telemetry_service` samples with report and source-property labels.", - "- BMC proxy: widened TelemetryService ACLs to `MetricReportDefinitions/*` and `MetricReports/*` so live GB200 validation is not limited to `NvidiaNMMetrics_0`.", - "- NMX-T HOST: preserves all numeric Prometheus samples instead of dropping unknown metric names; legacy `Effective_BER`, `Symbol_Errors`, and `Link_Down` metric names remain canonical.", - "- NVUE gNMI HOST: subscribes to `components`, `interfaces`, and `platform-general`; known current metrics keep their existing names, and previously unmapped leaves are emitted as source-qualified `nvswitch_*` samples.", - "- Config: `collectors.telemetry_service` is disabled by default, and `collectors.nvue.gnmi.paths.platform_general_enabled` is an explicit opt-in path gate; the example and live-validation configs enable the full GB200 switch collector set.", - "", - "The generic-preservation surfaces are behavior-locked by unit tests before live hardware validation:", - "", - "- Redfish TelemetryService: `metric_report_values_emit_numeric_and_info_samples` covers numeric, string/info, and boolean/state MetricReport values.", - "- NMX-T: `generic_metric_key_includes_sorted_extra_label_identity` and `generic_metric_key_distinguishes_same_port_samples_by_extra_labels` cover stable key identity for unknown Prometheus samples with extra labels.", - "- NVUE gNMI: `unmapped_interface_leaf_emits_catalog_metric_sample` and `platform_general_string_leaf_emits_info_metric` cover previously unmapped interface leaves and platform-general string leaves.", - "", - "Rows that still have no catalog-listed source remain in scope: `CABLE-SNR-MEDIA-LANE-N` and 
`CABLE-SNR-HOST-LANE-N` are marked `requires-live-source-resolution` and must be checked during live validation. The generic Redfish MetricReport, NMX-T, and gNMI preservation paths will expose them if the device emits them; if not, open a source-owner follow-up immediately.", - "", - "## Execution rules", - "", - "- Every row must keep `primary_source`, `fallback_source`, `source_precedence`, and `duplicate_alias_policy` populated before implementation is marked complete.", - "- Default duplicate policy is one canonical series per catalog row; source-qualified duplicates require source-path proof and consumer-safety rationale.", - "- Generic-preserved metrics must keep bounded identity labels: report id/URI/definition and metric id/property/identity for Redfish MetricReports, raw source metric plus sorted source-label identity for NMX-T, and full gNMI path plus endpoint/entity labels for gNMI. Redfish internal keys must use escaped raw MetricId/MetricProperty identity, and NMX-T generic keys must escape raw port/source/node/label identity, to avoid aliasing. 
Raw string metric values must not be emitted as labels.", - "- Rows marked `requires-live-source-resolution` or `requires-live-source-equivalent` remain in scope; they require live source proof or immediate escalation before GB200 signoff.", - "- Live GB200 validation happens after the branch is built, tested, linted, pushed, and reviewed.", - "", - ]) - out_md.write_text("\n".join(md) + "\n") - print(f"wrote {out_csv}") - print(f"wrote {out_md}") - print(f"rows {len(out_rows)}") - - -if __name__ == "__main__": - main() From 42781fbc009a82e0faf5cda0f70010e48446580b Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 02:30:25 +0200 Subject: [PATCH 21/25] chore(health): fix comment copy Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 5314a0b22f..470a2bc9de 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -225,7 +225,7 @@ interfaces_enabled = true platform_environment_temperature_enabled = true platform_environment_status_enabled = true -# NVUE gNMI streaming collector, disabled by default. Subscribes to +# NVUE gNMI streaming collector which subscribes to # gNMI SAMPLE paths (components + interfaces, plus platform_general when # platform_general_enabled is true) and pushes metrics through the configured # sinks. gNMI ON_CHANGE targets system-events From ba3f114c0e182ff62e719a78b8ef97f87f66a0d3 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 03:47:22 +0200 Subject: [PATCH 22/25] fix(health): nmxt cleanup. 
Fix wasteful label rebuilds Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 77 ++++++++-------------------- 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index bd02cdb5a7..9344720efc 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -38,15 +38,15 @@ use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; use crate::endpoint::{BmcEndpoint, EndpointMetadata}; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; +/// default NMX-T port const NMXT_PORT: u16 = 9352; +/// NMX-T endpoint const NMXT_ENDPOINT: &str = "/xcset/nvlink_domain_telemetry"; -/// Producer name for every emitted NMX-T series. Preserved across all mappings so the -/// downstream sink keeps a single `switch_nmxt` family. -const NMXT_PRODUCER: &str = "switch_nmxt"; +/// MetricSample name for NMX-T metrics +const NMXT_METRIC_NAME: &str = "switch_nmxt"; -/// One NMX-T numeric family -> canonical `switch_nmxt` series; `source` matched verbatim. #[derive(Debug, PartialEq)] struct NmxtMetric { source: &'static str, @@ -54,18 +54,13 @@ struct NmxtMetric { unit: &'static str, } -/// One NMX-T identity/inventory label -> canonical label re-exported on every series. #[derive(Debug, PartialEq)] struct NmxtLabel { source: &'static str, canonical: &'static str, } -/// Explicit family allowlist. Names absent here (and from [`NMXT_LABEL_MAP`]) are never exported; -/// each entry was confirmed live in the GB200 NMX-T scrape (Stage 0). Trailing comments name the -/// catalog telemetry parameter. 
const NMXT_METRIC_MAP: &[NmxtMetric] = &[ - // BER / error counters NmxtMetric { source: "Effective_BER", metric_type: "effective_ber", @@ -81,7 +76,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "link_down", unit: "count", }, - // Identity / inventory numeric families NmxtMetric { source: "lid", metric_type: "lid", @@ -92,7 +86,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "device_hw_rev", unit: "id", }, // DEVICE-HARDWARE-REVISION - // Status / link-down attribution NmxtMetric { source: "Advanced_Status_Opcode", metric_type: "status_opcode", @@ -108,7 +101,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "time_to_link_up", unit: "milliseconds", }, // TIME-TO-LINKS-UP - // Cable optics (numeric families) NmxtMetric { source: "cable_technology", metric_type: "cable_transmitter_technology", @@ -129,13 +121,11 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "cable_diag_supply_voltage", unit: "volts", }, // CABLE-DIAG-SUPPLY-VOLTAGE - // Link partner NmxtMetric { source: "link_partner_lid", metric_type: "link_partner_lid", unit: "id", }, // LINK-PARTNER-LID - // Recovery counters / timers NmxtMetric { source: "successful_recovery_events", metric_type: "link_recovery_success_cnt", @@ -176,7 +166,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "serdes_recovery_cycle_duration", unit: "seconds", }, // SERDES-RECOVERY-CYCLE-DURATION - // Contain-and-drain discards NmxtMetric { source: "contain_n_drain_xmit_discards", metric_type: "contain_drain_xmit_discard", @@ -187,7 +176,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "contain_drain_rcv_discard", unit: "count", }, // CONTAIN-DRAIN-RCV-DISCARD - // Raw error lanes NmxtMetric { source: "Raw_Errors_Lane_2", metric_type: "raw_err_lane_2", @@ -198,9 +186,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ metric_type: "raw_err_lane_3", unit: "count", }, // RAW-ERR-LANE-3 - // Cable/transceiver fault flags (0/1). 
Re-sourced from NMX-T: NVLink ports on - // the N5400_LD are not modeled as gNMI transceiver components, so the catalog's - // gNMI transceiver-diag path is absent live; NMX-T exposes these per active link. NmxtMetric { source: "tx_cdr_lol", metric_type: "cable_tx_cdr_lol", @@ -223,9 +208,6 @@ const NMXT_METRIC_MAP: &[NmxtMetric] = &[ }, // CABLE-RX-LOS ]; -/// Explicit label allowlist. These are identity/inventory dimensions, never standalone metrics: -/// re-exported as canonical labels on every emitted `switch_nmxt` sample. Trailing comments name -/// the catalog telemetry parameter. const NMXT_LABEL_MAP: &[NmxtLabel] = &[ NmxtLabel { source: "FW_Version", @@ -345,15 +327,15 @@ fn lookup_nmxt_metric(name: &str) -> Option<&'static NmxtMetric> { NMXT_METRIC_MAP.iter().find(|m| m.source == name) } -/// `Module_Temperature` arrives only as a label value (e.g. `"0C"`), never its own numeric line, -/// so it is parsed here and re-emitted as a gauge. Returns `None` on empty/unparseable (e.g. `"N/A"`). +/// Parses the `Module_Temperature` label value (e.g. `"0C"`), which never arrives as its own +/// numeric line, so it can be emitted as a gauge; returns `None` if empty or unparseable. fn cable_temp_to_celsius(raw: &str) -> Option { let trimmed = raw.trim(); let digits = trimmed.strip_suffix(['C', 'c']).unwrap_or(trimmed).trim(); digits.parse::().ok() } -/// Closed 3-state enum for `down_blame`, emitted as a StateSet (one 0/1 series per state). +/// Enum for `down_blame`, emitted as a StateSet (one 0/1 series per state). const DOWN_BLAME_STATES: &[&str] = &["unknown", "local_phy", "remote_phy"]; /// Maps a raw `down_blame` value to its canonical state, case-insensitively; unknown/empty -> "unknown". @@ -372,7 +354,6 @@ fn required_port_num(sample_labels: &HashMap) -> Option<&str> { .filter(|port_num| !port_num.is_empty()) } -/// Test-only; production iterates `NMXT_LABEL_MAP` directly in `build_labels`.
#[cfg(test)] fn lookup_nmxt_label(key: &str) -> Option<&'static NmxtLabel> { NMXT_LABEL_MAP.iter().find(|l| l.source == key) @@ -546,9 +527,7 @@ impl NmxtCollector { } } - /// Canonical label set for one `switch_nmxt` series. Always carries `switch_id` / `switch_ip`; - /// scraped dimensions are re-exported only when their key is on [`NMXT_LABEL_MAP`], everything - /// else is dropped (never sanitized into exported labels). + /// Builds label set for one `switch_nmxt` series fn build_labels( &self, switch_ip: &str, @@ -575,8 +554,6 @@ impl NmxtCollector { self.emit_event(CollectorEvent::MetricCollectionStart); - // Scraped families off the allowlist: skipped (never sanitized) and only counted. - let mut unmapped_families = 0u64; // Ports already emitted a cable temperature this iteration (one series per port). let mut cable_temp_ports: HashSet = HashSet::new(); // Ports already emitted a down_blame StateSet this iteration (one set per port). @@ -589,8 +566,8 @@ impl NmxtCollector { value, } = sample; - // `Module_Temperature` rides as a label on lines whose family may not be allowlisted, - // so emit it before the family check, once per port. + // `Module_Temperature` rides as a label on lines whose map entry may not be + // collected. Emit before the map check, once per port. if let Some(celsius) = sample_labels .get("Module_Temperature") .and_then(|raw| cable_temp_to_celsius(raw)) @@ -603,7 +580,7 @@ impl NmxtCollector { self.emit_event(CollectorEvent::Metric( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), - name: NMXT_PRODUCER.to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: "cable_temperature_celsius".to_string(), unit: "celsius".to_string(), value: celsius, @@ -615,21 +592,21 @@ impl NmxtCollector { } } - // `down_blame` is a closed enum riding as a label; emit it as a per-port StateSet - // (one 0/1 series per state) before the family check, once per port. 
+ // `down_blame` is an enum riding as a label; emit per port as a StateSet if let Some(raw) = sample_labels.get("down_blame") { let Some(port_num) = required_port_num(&sample_labels) else { continue; }; if down_blame_ports.insert(port_num.to_string()) { let current = down_blame_to_state(raw); + let base_labels = self.build_labels(&switch_ip, &sample_labels); for state in DOWN_BLAME_STATES { - let mut labels = self.build_labels(&switch_ip, &sample_labels); + let mut labels = base_labels.clone(); labels.push((Cow::Borrowed("state"), (*state).to_string())); self.emit_event(CollectorEvent::Metric( MetricSample { key: format!("down_blame:{}:{}", port_num, state), - name: NMXT_PRODUCER.to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: "down_blame".to_string(), unit: "state".to_string(), value: if *state == current { 1.0 } else { 0.0 }, @@ -642,11 +619,7 @@ impl NmxtCollector { } } - let Some(metric) = lookup_nmxt_metric(&name) else { - unmapped_families += 1; - continue; - }; - let (metric_type, unit) = (metric.metric_type, metric.unit); + let (metric_type, unit) = (metrics.metric_type, metric.unit); // Port number anchors the per-series key. 
let Some(port_num) = required_port_num(&sample_labels) else { @@ -663,7 +636,7 @@ impl NmxtCollector { self.emit_event(CollectorEvent::Metric( MetricSample { key: metric_key, - name: NMXT_PRODUCER.to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: metric_type.to_string(), unit: unit.to_string(), value, @@ -674,14 +647,6 @@ impl NmxtCollector { )); } - if unmapped_families > 0 { - tracing::debug!( - switch_id = %self.switch_id, - count = unmapped_families, - "skipped NMX-T families not on explicit allowlist" - ); - } - self.emit_event(CollectorEvent::MetricCollectionEnd); Ok(()) @@ -1034,7 +999,7 @@ Link_Down{Port_Number="1"} 5 collector.emit_event(CollectorEvent::Metric( MetricSample { key: format!("down_blame:{}:{}", port_num, state), - name: NMXT_PRODUCER.to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: "down_blame".to_string(), unit: "state".to_string(), value: if *state == current { 1.0 } else { 0.0 }, @@ -1060,7 +1025,7 @@ Link_Down{Port_Number="1"} 5 ); for s in &blame_series { - assert_eq!(s.name, NMXT_PRODUCER); + assert_eq!(s.name, "switch_nmxt"); assert_eq!(s.unit, "state"); let state = s .labels @@ -1146,7 +1111,7 @@ Link_Down{Port_Number="1"} 5 collector.emit_event(CollectorEvent::Metric( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), - name: NMXT_PRODUCER.to_string(), + name: NMXT_METRIC_NAME.to_string(), metric_type: "cable_temperature_celsius".to_string(), unit: "celsius".to_string(), value: celsius, @@ -1171,7 +1136,7 @@ Link_Down{Port_Number="1"} 5 ); let series = temp_series[0]; - assert_eq!(series.name, NMXT_PRODUCER); + assert_eq!(series.name, "switch_nmxt"); assert_eq!(series.unit, "celsius"); assert_eq!(series.value, 37.5); assert_eq!(series.key, "cable_temperature_celsius:11"); From 0524fd7830a81a864a314f8674d2dbd4dc1af287 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 16:30:36 +0200 Subject: [PATCH 23/25] chore(health): comment cleanup. 
fixing labels Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- .../health/src/collectors/nvue/gnmi/client.rs | 19 +--- .../collectors/nvue/gnmi/sample_processor.rs | 107 +++++------------- .../health/src/collectors/nvue/rest/client.rs | 23 ++-- .../src/collectors/nvue/rest/collector.rs | 85 +++++--------- crates/health/src/collectors/nvue/tls.rs | 8 +- crates/health/src/config.rs | 3 - crates/health/src/otlp/convert.rs | 17 ++- 7 files changed, 85 insertions(+), 177 deletions(-) diff --git a/crates/health/src/collectors/nvue/gnmi/client.rs b/crates/health/src/collectors/nvue/gnmi/client.rs index 6d48970f9b..44816caa75 100644 --- a/crates/health/src/collectors/nvue/gnmi/client.rs +++ b/crates/health/src/collectors/nvue/gnmi/client.rs @@ -63,8 +63,8 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { }); } if paths_config.platform_general_enabled { - // switch-level singleton: `/platform-general/state` carries the memory - // and disk utilization leaves (no interface/component name key). + // `/platform-general/state` carries the memory and disk + // utilization leaves paths.push(Path { elem: vec![ PathElem { @@ -78,8 +78,8 @@ pub fn nvue_subscribe_paths(paths_config: &NvueGnmiPaths) -> Vec { ], ..Default::default() }); - // sibling singleton: `/platform-general/versions` carries the OS/BMC/EROT - // firmware version leaves (also no interface/component name key). + // `/platform-general/versions` carries the OS/BMC/EROT + // firmware version leaves paths.push(Path { elem: vec![ PathElem { @@ -117,15 +117,8 @@ fn configure_tls_endpoint( return Ok(endpoint); } - // tonic 0.14 auto-injects a strict WebPKI/system-root TLS verifier when an - // Endpoint is built from an `https://` URI and layers its own TlsConnector - // over any custom connector (see tonic transport channel/service/connector.rs). 
- // That silently negated a hand-rolled hyper-rustls skip-verify connector and - // made tonic strictly reject the switch's self-signed NVOS gNMI cert (SAN does - // not cover the management IP). When the dangerous opt-in is enabled, use - // tonic's native custom-verifier hook so the skip-verify verifier is the one - // tonic actually applies. ClientTlsConfig::new() must NOT set any roots here - // (mixing roots + custom verifier is an error). + // Use tonic's verifier hook (https endpoints get a strict verifier + // otherwise). No roots on ClientTlsConfig — roots + verifier is an error. endpoint .tls_config_with_verifier( ClientTlsConfig::new(), diff --git a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs index 944ae601c5..94f7b80950 100644 --- a/crates/health/src/collectors/nvue/gnmi/sample_processor.rs +++ b/crates/health/src/collectors/nvue/gnmi/sample_processor.rs @@ -113,8 +113,6 @@ impl GnmiSampleProcessor { iface_name: &str, val: &proto::TypedValue, ) { - // Allowlisted `/interfaces/interface` leaves (live in the Stage-0 probe); - // unknown leaves fall through and are never exported. if leaf_matches(elems, &["state", "oper-status"]) { let current = oper_status_to_state(typed_value_to_string(val).as_deref()); self.emit_state_set( @@ -148,8 +146,6 @@ impl GnmiSampleProcessor { LOGICAL_PORT_STATES, ); } else if leaf_matches(elems, &["infiniband", "state", "speed"]) { - // NVOS types speed as a string/enum but live GB200 emits bare numeric Gbps; - // unparseable forms (e.g. "hdr") emit nothing. 
match link_speed_to_gbps(typed_value_to_string(val).as_deref()) { Some(v) => self.emit_iface("interface_link_speed_active", iface_name, v, "gbps"), None => debug_unmapped_value(elems, val, "interface_link_speed_active"), @@ -165,7 +161,6 @@ impl GnmiSampleProcessor { None => debug_unmapped_value(elems, val, "interface_supported_width"), } } else if leaf_matches(elems, &["phy-diag", "state", "phy-manager-state"]) { - // dynamic PHY FSM string: emit as a StateSet, not an info label. let current = phy_manager_to_state(typed_value_to_string(val).as_deref()); self.emit_state_set( "interface_phy_manager_state", @@ -174,16 +169,15 @@ impl GnmiSampleProcessor { current, PHY_MANAGER_STATES, ); - } else if leaf_matches(elems, &["infiniband", "state", "vl-capabilities"]) { - // stable capability string surfaced as an info-metric; empty emits nothing. - if let Some(caps) = typed_value_to_string(val).filter(|s| !s.is_empty()) { - self.emit_iface_info( - "interface_vl_capabilities_info", - iface_name, - "vl_capabilities", - &caps, - ); - } + } else if leaf_matches(elems, &["infiniband", "state", "vl-capabilities"]) + && let Some(caps) = typed_value_to_string(val).filter(|s| !s.is_empty()) + { + self.emit_iface_info( + "interface_vl_capabilities_info", + iface_name, + "vl_capabilities", + &caps, + ); } } @@ -238,9 +232,9 @@ impl GnmiSampleProcessor { comp_name: &str, val: &proto::TypedValue, ) { - // Allowlisted `/components/component` leaves; the `component_name` label - // distinguishes rows that share a leaf (FAN-STATE and CPU-STATE both resolve - // to `state/oper-status`). Unknown leaves are never exported. + // `/components/component` leaves: the `component_name` label + // distinguishes rows that share a leaf (e.g. 
FAN-STATE and CPU-STATE both resolve + // to `state/oper-status`) if leaf_matches(elems, &["healthz", "state", "status"]) { let current = component_health_to_state(typed_value_to_string(val).as_deref()); self.emit_state_set( @@ -278,9 +272,6 @@ impl GnmiSampleProcessor { { self.emit_comp("component_cpu_utilization", comp_name, v, "percent"); } - // ASIC-NAME (row 876): `state/name` is intentionally not emitted; the - // same value is already surfaced as the `component_name` label on every - // component metric, so a dedicated series would be redundant. } fn emit_comp(&self, metric_type: &str, comp_name: &str, value: f64, unit: &str) { @@ -295,22 +286,6 @@ impl GnmiSampleProcessor { } fn process_platform_general_metric(&self, elems: &[&PathElem], val: &proto::TypedValue) { - // Explicit per-leaf canonical mappings for `/platform-general/state`. - // This is a switch-level singleton: the four numeric memory/disk leaves - // are numeric gauges; contact/location/platform-name are stable strings - // surfaced as switch-level info-metrics. Every other platform-general - // leaf falls through and is never exported. - // - // String info-metrics first (CONTACT 862, LOCATION 863, - // NODE-DESCRIPTION 864): each emits a constant 1.0 sample whose - // information is carried by a single string label. Empty strings carry - // no information and emit nothing (CONTACT/LOCATION are empty on the - // GB200 rig, so only NODE-DESCRIPTION emits live). - // - // The firmware version info-metrics (OS-VERSION 868, BMC-VERSION 869, - // EROT-FW-VERSION 870) live under the sibling `/platform-general/versions` - // subtree rather than `/state`; they follow the same info-metric contract - // (constant 1.0 sample, single string label, empty strings emit nothing). 
let info: Option<(&str, &'static str)> = if leaf_matches(elems, &["state", "contact"]) { Some(("platform_contact_info", "contact")) } else if leaf_matches(elems, &["state", "location"]) { @@ -410,7 +385,6 @@ impl GnmiSampleProcessor { key.push(':'); key.push_str(entity_id); - // only the entity label; endpoint identity is added by PrometheusSink from EventContext. let labels = vec![( Cow::Borrowed(entity_label_name), entity_label_value.to_string(), @@ -431,7 +405,7 @@ impl GnmiSampleProcessor { } /// OpenMetrics StateSet: one `0.0`/`1.0` series per state (current == 1.0), with a `state` - /// label. The fan-out works for both sinks since OTLP has no native StateSet type. Unit "state". + /// label. fn emit_state_set( &self, metric_type: &str, @@ -451,7 +425,6 @@ impl GnmiSampleProcessor { key.push(':'); key.push_str(state); - // only the entity + state labels; endpoint identity is added by PrometheusSink. let labels = vec![ (Cow::Borrowed(entity_label_name), entity_id.to_string()), (Cow::Borrowed("state"), state.to_string()), @@ -495,23 +468,19 @@ fn leaf_matches(elems: &[&PathElem], expected: &[&str]) -> bool { .all(|(elem, name)| elem.name == *name) } -/// One numeric `/interfaces/interface` leaf mapping: path tail -> metric_type + unit. struct NumericLeafMapping { tail: &'static [&'static str], name: &'static str, unit: &'static str, } -/// A resolved numeric leaf: the metric_type + unit to emit. struct NumericLeaf { name: &'static str, unit: &'static str, } -/// Table-driven dispatch for numeric `/interfaces/interface` leaves. Every entry -/// is an explicit GB200 catalog mapping proven live in the Stage-0 probe; the -/// expected leaf path tail is matched against the live gNMI tree. Leaves not in -/// this table are never exported as metrics. +/// Table-driven dispatch for numeric `/interfaces/interface` leaves. The +/// expected leaf path tail is matched against the live gNMI tree. 
fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { const TABLE: &[NumericLeafMapping] = &[ // OpenConfig interface counters (`/state/counters/*`) @@ -637,10 +606,6 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { name: "interface_port_xmit_wait", unit: "count", }, - // NOTE: `infiniband/state/speed` is intentionally NOT in this numeric - // table. NVOS types it as a string/enum and the live GB200 form is a - // bare Gbps numeric; it is handled by a dedicated `link_speed_to_gbps` - // arm in `process_interface_metric` that emits unit `gbps`. NumericLeafMapping { tail: &["infiniband", "state", "mtu"], name: "interface_mtu", @@ -816,8 +781,6 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { name: "interface_plr_bw_loss_percent", unit: "percent", }, - // existing pre-branch mapping retained (leaf out of GB200 row set but - // restored upstream; kept so the canonical series is not dropped) NumericLeafMapping { tail: &["phy-diag", "state", "unintentional-link-down-events"], name: "interface_link_down_events", @@ -825,7 +788,7 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { }, ]; - // FEC histogram bins 0..=15 -> interface_fec_hist_{n} (rows 911..926) + // FEC histogram bins 0..=15 -> interface_fec_hist_{n} if let Some(leaf) = elems.last().map(|e| e.name.as_str()) && let Some(bin) = leaf.strip_prefix("rs-num-corr-err-bin") && let Ok(n) = bin.parse::() @@ -846,8 +809,7 @@ fn numeric_interface_leaf(elems: &[&PathElem]) -> Option { }) } -/// Stable, leaked-free metric_type names for FEC histogram bins 0..=15. The -/// catalog defines exactly 16 bins (FEC-HIST-0 .. FEC-HIST-15). +/// FEC histogram bins 0..=15 const FEC_HIST_NAMES: [&str; 16] = [ "interface_fec_hist_0", "interface_fec_hist_1", @@ -870,7 +832,7 @@ const FEC_HIST_NAMES: [&str; 16] = [ const OPER_STATUS_STATES: &[&str] = &["up", "down"]; /// oper-status string -> current StateSet state. "up" when the source reads -/// "up" or "active" (case-insensitive), else "down". 
Used for both +/// "up" or "active" else "down". Applies to /// `interface_oper_status` and `component_oper_status`. fn oper_status_to_state(status: Option<&str>) -> &'static str { match status { @@ -896,9 +858,7 @@ const PHY_MANAGER_STATES: &[&str] = &["up", "down"]; /// PHY manager FSM state string -> current StateSet state. The PHY manager /// reports a dynamic FSM label (e.g. "Active_or_Linkup", "Disabled"), so we -/// match the `active`/`linkup` tokens on word boundaries -- a bare substring -/// check would also match "Inactive"/"Deactivated" and falsely report a down -/// PHY as up. +/// match the `active`/`linkup` tokens fn phy_manager_to_state(state: Option<&str>) -> &'static str { match state { Some(s) @@ -914,9 +874,9 @@ fn phy_manager_to_state(state: Option<&str>) -> &'static str { const LOGICAL_PORT_STATES: &[&str] = &["active", "down"]; -/// InfiniBand logical port state enum -> current StateSet state. Values -/// observed live on GB200: `ACTIVE`, `DOWN`. "active" when the source reads -/// "active" (case-insensitive), else "down". +/// InfiniBand logical port state enum -> current StateSet state. +/// (e.g. `ACTIVE`, `DOWN`). "active" when the source reads +/// "active", else "down". fn logical_port_to_state(state: Option<&str>) -> &'static str { match state { Some(s) if s.eq_ignore_ascii_case("active") => "active", @@ -940,27 +900,18 @@ fn link_width_to_f64(width: Option<&str>) -> Option { .reduce(f64::max) } -/// IB link speed -> Gbps. NVOS types speed as a string/enum, but the live GB200 -/// capture emits bare numeric Gbps ("400" pairs with ib-speed=SPEED_NDR). We -/// accept the bare numeric (authoritative for this hardware) plus the defensive -/// suffix forms the schema permits, and normalize everything to Gbps: -/// - bare numeric ("400", "2.5") -> that value -/// - "G"/"G" (trailing G, case-insensitive) -> n -/// - "Mb/s" or "M" -> n/1000 -/// - anything else (e.g. "hdr") -> None (not exported) +/// IB link speed -> Gbps. 
GB200 emits bare numeric Gbps; we also accept the +/// suffix forms the schema permits. fn link_speed_to_gbps(speed: Option<&str>) -> Option { let s = speed?.trim(); if s.is_empty() { return None; } - // Mb/s forms first ("M" alone is ambiguous with a stray suffix, but the - // longest match wins so "Mb/s" is checked before the bare "M"). + // handle Mbit suffix if let Some(mbps) = s .strip_suffix("Mb/s") - .or_else(|| s.strip_suffix("MB/s")) .or_else(|| s.strip_suffix("Mbps")) .or_else(|| s.strip_suffix('M')) - .or_else(|| s.strip_suffix('m')) { return mbps.trim().parse::().ok().map(|v| v / 1000.0); } @@ -968,13 +919,11 @@ fn link_speed_to_gbps(speed: Option<&str>) -> Option { if let Some(gbps) = s.strip_suffix(['G', 'g']) { return gbps.trim().parse::().ok(); } - // bare numeric Gbps (live GB200 form) + // base case numeric implicit Gbps s.parse::().ok() } -/// Log (at debug) an interface leaf that matched a known mapping arm but whose -/// value could not be coerced, so the silent drop is observable. Nothing is -/// emitted for the metric in this case. +/// Log when an interface leaf that matched a known mapping but value wasn't caught. fn debug_unmapped_value(elems: &[&PathElem], val: &proto::TypedValue, metric_type: &str) { tracing::debug!( leaf = %leaf_path(elems), @@ -997,7 +946,7 @@ fn leaf_path(elems: &[&PathElem]) -> String { const COMPONENT_HEALTH_STATES: &[&str] = &["healthy", "unhealthy", "unknown"]; /// component healthz status -> current StateSet state. "healthy"/"unhealthy" -/// by case-insensitive match, anything else (including absent) "unknown". +/// else "unknown". 
fn component_health_to_state(status: Option<&str>) -> &'static str { match status { Some(s) if s.eq_ignore_ascii_case("healthy") => "healthy", diff --git a/crates/health/src/collectors/nvue/rest/client.rs b/crates/health/src/collectors/nvue/rest/client.rs index 951e65d279..d133b13c5e 100644 --- a/crates/health/src/collectors/nvue/rest/client.rs +++ b/crates/health/src/collectors/nvue/rest/client.rs @@ -324,9 +324,7 @@ pub type FanEnvironmentResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] pub struct FanData { - /// Fan maximum speed in RPM, reported by NVUE as a string (e.g. "33000"). - /// Other per-fan fields (current-speed, min-speed, direction, state) are - /// intentionally not captured — only max-speed is in scope. + /// Fan maximum speed in RPM, scraped as string (e.g. "33000") #[serde(rename = "max-speed")] pub max_speed: Option, } @@ -335,24 +333,19 @@ pub type TemperatureEnvironmentResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] pub struct TempData { - /// Current temperature in degrees Celsius, reported by NVUE as a string - /// (e.g. "43.00"). Each per-sensor field is optional — NVUE reports only a - /// subset for many sensors (e.g. ambient sensors expose only current+state). + /// Current temperature Celsius, scraped as string (e.g. "43.00"). + /// Field is optional per sensor pub current: Option, - /// Maximum (warning) threshold in degrees Celsius, as a string (e.g. "105.00"). + /// Maximum (warning) threshold in Celsius as string (e.g. "105.00"). pub max: Option, - /// Critical threshold in degrees Celsius, as a string (e.g. "120.00"). + /// Critical threshold in Celsius as a string (e.g. "120.00"). pub crit: Option, - /// Sensor state as a string (e.g. "ok"). + /// Sensor state as string (e.g. "ok"). pub state: Option, } -/// Parent `/nvue_v1/platform/environment` summary. Keys are aggregate status -/// entries (e.g. `FAN_STATUS`) as well as the `fan`/`temperature` subtrees. 
-/// Only the LED-style summary entries carry a top-level `state`; the nested -/// subtree objects have a different shape and deserialize with `state` absent -/// (serde ignores unknown keys, including the LED `type` discriminator we do -/// not consume), so they are harmlessly skipped by callers. +/// `/nvue_v1/platform/environment` summary. Keys are aggregate status +/// entries (e.g. `FAN_STATUS`) as well as the `fan`/`temperature` subtrees pub type PlatformEnvironmentResponse = HashMap; #[derive(Debug, Clone, Deserialize, Default)] diff --git a/crates/health/src/collectors/nvue/rest/collector.rs b/crates/health/src/collectors/nvue/rest/collector.rs index 403bec0f97..3d20042853 100644 --- a/crates/health/src/collectors/nvue/rest/collector.rs +++ b/crates/health/src/collectors/nvue/rest/collector.rs @@ -30,7 +30,6 @@ const COLLECTOR_NAME: &str = "nvue_rest"; const SYSTEM_HEALTH_STATES: &[&str] = &["ok", "not_ok", "unknown"]; -/// anything else (including absent) => "unknown". fn system_health_to_state(status: Option<&str>) -> &'static str { match status { Some("OK") => "ok", @@ -47,7 +46,6 @@ const PARTITION_HEALTH_STATES: &[&str] = &[ "unknown", ]; -/// The four known states map to themselves; anything else (including absent) => "unknown". fn partition_health_to_state(status: Option<&str>) -> &'static str { match status { Some("healthy") => "healthy", @@ -60,7 +58,6 @@ fn partition_health_to_state(status: Option<&str>) -> &'static str { const APP_STATUS_STATES: &[&str] = &["ok", "not_ok", "unknown"]; -/// anything else (including absent) => "unknown". fn app_status_to_state(status: Option<&str>) -> &'static str { match status { Some("ok") => "ok", @@ -69,7 +66,7 @@ fn app_status_to_state(status: Option<&str>) -> &'static str { } } -/// code "0" means no issue; any other opcode indicates a problem +/// "0" -> no issue. 
Any other opcode indicates a problem fn diagnostic_opcode_to_f64(code: &str) -> f64 { match code { "0" => 0.0, @@ -77,26 +74,22 @@ fn diagnostic_opcode_to_f64(code: &str) -> f64 { } } -/// NVUE reports fan max-speed as a string (e.g. "33000"). Parse it to RPM as -/// f64; return `None` when the field is absent or unparseable so callers emit -/// nothing rather than fabricating a value. +/// NVUE reports fan max-speed as a string (e.g. "33000"). Parse it to RPM. +/// Returns None when the field is absent or unparseable. fn fan_max_speed_to_f64(max_speed: Option<&str>) -> Option { max_speed.and_then(|s| s.trim().parse::().ok()) } -/// NVUE reports temperatures (current/max/crit) as strings in degrees Celsius -/// (e.g. "105.00"). Parse to f64; return `None` when the field is absent or -/// unparseable so callers emit nothing rather than fabricating a value. Shares -/// the same trim-then-parse contract as `fan_max_speed_to_f64`. +/// NVUE reports temps (current/max/crit) as Celsius strings (e.g. "105.00"). +/// Parse to f64. Returns None when the field is absent or unparseable. fn temp_to_f64(value: Option<&str>) -> Option { value.and_then(|s| s.trim().parse::().ok()) } const TEMP_STATE_STATES: &[&str] = &["ok", "not_ok"]; -/// Map a temperature sensor's string `state` to a StateSet state: "ok" -/// (case-insensitive) => "ok", any other present value => "not_ok", absent => -/// None (so callers emit nothing rather than fabricating an all-zero StateSet). +/// Sensor `state` -> StateSet: "ok" (case-insensitive) => "ok", other present +/// => "not_ok", absent => None. 
fn temp_state_to_state(state: Option<&str>) -> Option<&'static str> { state.map(|s| { if s.trim().eq_ignore_ascii_case("ok") { @@ -109,10 +102,8 @@ fn temp_state_to_state(state: Option<&str>) -> Option<&'static str> { const FAN_LED_STATES: &[&str] = &["ok", "not_ok"]; -/// Map the aggregate `FAN_STATUS` LED state from the platform/environment parent -/// summary to a StateSet state: "green"/"ok" (case-insensitive) => "ok", any -/// other non-empty value (e.g. "amber"/"red") => "not_ok", absent/empty => None -/// (so callers emit nothing rather than fabricating an all-zero StateSet). +/// `FAN_STATUS` LED -> StateSet: "green"/"ok" (case-insensitive) => "ok", +/// other non-empty => "not_ok", absent/empty => None. fn fan_led_to_state(state: Option<&str>) -> Option<&'static str> { let s = state?.trim(); if s.is_empty() { @@ -312,7 +303,7 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_platform_environment_fan().await { Ok(Some(fans)) => { for (fan_name, fan) in &fans { - // Only emit when max-speed parses; absent/garbage → nothing. + // Only emit when max-speed parses. Absent or garbage emits nothing. if let Some(value) = fan_max_speed_to_f64(fan.max_speed.as_deref()) { self.emit_metric( "fan_max_speed", @@ -340,8 +331,7 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_platform_environment_temperature().await { Ok(Some(temps)) => { for (sensor_name, temp) in &temps { - // Each field is optional; emit only the ones present/parseable - // rather than fabricating absent thresholds. + // Each field is optional. Emit only those present and parseable. let sensor_label = || vec![(Cow::Borrowed("sensor"), sensor_name.clone())]; if let Some(value) = temp_to_f64(temp.current.as_deref()) { @@ -374,8 +364,7 @@ impl PeriodicCollector for NvueRestCollector { ); entity_count += 1; } - // Absent `state` => emit nothing (never fabricate an - // all-zero StateSet); present => one 0/1 series per state. + // Absent state emits nothing. 
Present state emits one 0/1 series per state. if let Some(current) = temp_state_to_state(temp.state.as_deref()) { self.emit_state_set( "platform_temperature_state", @@ -402,8 +391,7 @@ impl PeriodicCollector for NvueRestCollector { match self.client.get_platform_environment().await { Ok(Some(env)) => { - // Switch-level aggregate FAN_STATUS LED; emit only when present - // and the state maps to a value, absent → nothing. + // Switch-level FAN_STATUS LED. Emit only when present and mappable. if let Some(current) = env .get("FAN_STATUS") .and_then(|s| fan_led_to_state(s.state.as_deref())) @@ -520,11 +508,9 @@ impl NvueRestCollector { )); } - /// emit an OpenMetrics StateSet: one `0.0`/`1.0` series per possible state, - /// with the current state's series == 1.0 and an added `state` label. The - /// existing per-entity `labels` are carried onto every series; `key_base` - /// is the per-entity key qualifier (it is suffixed with the state name so - /// each series gets a unique key). Unit is always "state". + /// Emit an OpenMetrics StateSet: one 0/1 series per state (current => 1.0), + /// each carrying `labels` plus a `state` label. `key_base` is suffixed with + /// the state name for a unique per-series key. Unit is always "state". fn emit_state_set( &self, metric_type: &str, @@ -537,9 +523,8 @@ impl NvueRestCollector { let mut series_labels = labels.clone(); series_labels.push((Cow::Borrowed("state"), state.to_string())); - // suffix the state onto the per-entity qualifier so each series key - // is unique (switch-level series have no entity qualifier, so the - // state name alone disambiguates them). + // suffix state onto the qualifier for a unique per-series key + // (switch-level series use the state name alone). 
let qualifier = match key_base { Some(base) => format!("{base}:{state}"), None => (*state).to_string(), @@ -570,10 +555,8 @@ mod tests { use crate::bmc::BoxFuture; use crate::config::NvueRestPaths; - /// Assert OpenMetrics StateSet semantics over a captured fan-out: exactly - /// one 0/1 series per `all_states` entry, each with unit "state" and a - /// `state` label; the series whose `state` label equals `current` has value - /// 1.0 and every other series is 0.0. `entity` (if any) is asserted present + /// Assert StateSet semantics: one 0/1 series per state (current => 1.0), + /// each with unit "state" and a `state` label. `entity` (if set) is present /// on every series. fn assert_state_set( samples: &[MetricSample], @@ -696,9 +679,8 @@ mod tests { assert_eq!(fan_led_to_state(None), None); } - /// Drives the same parse + emit logic `run_iteration` uses for the - /// platform/environment/fan endpoint against a captured sink, asserting the - /// emitted MAX-SPEED sample shape. Table-driven over representative payloads. + /// Drives run_iteration's fan parse + emit logic against a captured sink, + /// asserting max-speed sample shape. Table-driven. #[test] fn test_fan_max_speed_emit() { use crate::collectors::nvue::rest::client::FanEnvironmentResponse; @@ -826,11 +808,9 @@ mod tests { } } - /// Drives the same parse + emit logic `run_iteration` uses for the - /// platform/environment/temperature endpoint against a captured sink. A - /// fully-populated sensor (ASIC1) emits all four series; a sparse sensor - /// (Ambient-MNG-Temp, only current + state) emits exactly two and must NOT - /// fabricate the absent max/critical thresholds. + /// Drives run_iteration's temperature parse + emit logic against a captured + /// sink. A full sensor (ASIC1) emits all four series. A sparse sensor + /// (current + state only) emits two and must NOT fabricate absent max/crit. 
#[test] fn test_platform_temperature_emit() { use crate::collectors::nvue::rest::client::TemperatureEnvironmentResponse; @@ -938,7 +918,7 @@ mod tests { assert_eq!(sample.labels[0].1, "ASIC1"); } - // ASIC1 state="ok" => StateSet: ok=1, not_ok=0; sensor label preserved. + // ASIC1 state="ok" => StateSet: ok=1, not_ok=0. Sensor label preserved. let asic1_state: Vec = samples .iter() .filter(|s| { @@ -989,10 +969,8 @@ mod tests { ); } - /// Drives the same parse + emit logic `run_iteration` uses for the - /// platform/environment parent summary against a captured sink, asserting the - /// emitted switch-level `fan_led` sample shape. "green"/"ok" => 1.0, - /// "amber" => 0.0, and an absent `FAN_STATUS` emits nothing. + /// Drives run_iteration's fan_led parse + emit logic against a captured sink. + /// "green"/"ok" => 1.0, "amber" => 0.0, absent FAN_STATUS emits nothing. #[test] fn test_fan_led_emit() { use crate::collectors::nvue::rest::client::PlatformEnvironmentResponse; @@ -1064,7 +1042,7 @@ mod tests { match case.expected { Some(current) => { // switch-level StateSet: no per-entity label, but a `state` - // label per series; series keys are unique per state. + // label per series. Series keys are unique per state. assert_state_set(&samples, "fan_led", None, FAN_LED_STATES, current); for sample in samples.iter() { assert_eq!(sample.name, COLLECTOR_NAME, "case '{}'", case.name); @@ -1101,9 +1079,8 @@ mod tests { struct ScriptedProvider { calls: AtomicUsize, - // Each call pops the front of this queue; an empty queue yields an - // error. `HealthError` is not `Clone`, so we store and consume by - // value rather than indexing + `.cloned()`. + // Each call pops the front. An empty queue yields an error. HealthError + // isn't Clone, so we consume by value. 
responses: StdMutex>>, } diff --git a/crates/health/src/collectors/nvue/tls.rs b/crates/health/src/collectors/nvue/tls.rs index 6137828e67..4c43d07138 100644 --- a/crates/health/src/collectors/nvue/tls.rs +++ b/crates/health/src/collectors/nvue/tls.rs @@ -63,13 +63,7 @@ impl ServerCertVerifier for AcceptAnyCertVerifier { } } -/// Dangerous rustls verifier that accepts any server certificate without validation. -/// -/// Passed to tonic's `Endpoint::tls_config_with_verifier` so tonic's own TLS layer -/// skips verification. NVOS gNMI presents a self-signed cert whose SAN may not cover -/// the management IP being dialed; a strict verifier rejects it. Do not hand-roll a -/// separate `hyper_rustls` connector for this — tonic 0.14 layers its own (strict) -/// TLS over any custom connector for `https://` URIs, which silently negates it. +/// Dangerous rustls verifier that accepts any server certificate without validation pub fn accept_any_cert_verifier() -> Arc { Arc::new(AcceptAnyCertVerifier) } diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index e15403a0e3..2363505973 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -989,9 +989,6 @@ impl Default for NvueGnmiConfig { pub struct NvueGnmiPaths { pub components_enabled: bool, pub interfaces_enabled: bool, - /// Subscribe to `/platform-general/state` for switch-level memory and disk - /// utilization. This is a singleton resource (not keyed by interface or - /// component name). 
pub platform_general_enabled: bool, } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index b0c2cd0497..1ce1ff9af7 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -92,7 +92,7 @@ fn resource_attributes(context: &EventContext) -> Vec { attrs.push(kv("switch.id", switch_id.to_string())); } if let Some(serial) = context.switch_serial() { - attrs.push(kv("switch.serial", serial.to_string())); + attrs.push(kv("switch.serial_number", serial.to_string())); } if let Some(role) = context.switch_endpoint_role() { let endpoint_role = match role { @@ -253,9 +253,11 @@ pub fn build_metrics_export_request( .map(|(k, v)| kv(k, v.clone())) .collect(); - // promote switch identity onto the datapoint so dashboards filtering on - // `switch_serial`/`switch_id` (underscore label form) match; these otherwise - // only exist as OTLP *resource* attributes (`switch.serial`/`switch.id`). + // Promote switch identity onto the datapoint so it is queryable as a + // per-series label. As an OTLP resource attribute alone it lands on + // target_info, not the series. These datapoint labels use the underscore + // form (Prometheus label names cannot contain dots); the dotted + // switch.serial_number / switch.id live on the resource attributes. 
if !attributes.iter().any(|attr| attr.key == "switch_serial") && let Some(serial) = context.switch_serial() { @@ -492,7 +494,10 @@ mod tests { attr_value(&attrs, "switch.id"), Some(switch_id_attr.as_str()) ); - assert_eq!(attr_value(&attrs, "switch.serial"), Some("SN-SWITCH-001")); + assert_eq!( + attr_value(&attrs, "switch.serial_number"), + Some("SN-SWITCH-001") + ); assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("host")); assert_eq!(attr_bool_value(&attrs, "switch.is_primary"), Some(true)); assert_eq!(attr_int_value(&attrs, "switch.slot_number"), Some(7)); @@ -540,7 +545,7 @@ mod tests { Some(switch_id_attr.as_str()) ); assert_eq!( - attr_value(&attrs, "switch.serial"), + attr_value(&attrs, "switch.serial_number"), Some("SN-SWITCH-BMC-001") ); assert_eq!(attr_value(&attrs, "switch.endpoint_role"), Some("bmc")); From e293294fef3c4bdb0ae26dfb5da53c695e1d23c1 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sat, 27 Jun 2026 19:35:46 +0200 Subject: [PATCH 24/25] fix(health): added back allowlist guard Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index 9344720efc..ff518df228 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -619,7 +619,10 @@ impl NmxtCollector { } } - let (metric_type, unit) = (metrics.metric_type, metric.unit); + let Some(metric) = lookup_nmxt_metric(&name) else { + continue; + }; + let (metric_type, unit) = (metric.metric_type, metric.unit); // Port number anchors the per-series key. 
let Some(port_num) = required_port_num(&sample_labels) else { From 961fd8c54385a1ff944432bef307d8e014f6ce65 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sun, 28 Jun 2026 02:54:36 +0200 Subject: [PATCH 25/25] fix(health): remove label dupes Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/collectors/nmxt.rs | 28 +++++------------- crates/health/src/otlp/convert.rs | 44 +++++++++++++++------------- 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/crates/health/src/collectors/nmxt.rs b/crates/health/src/collectors/nmxt.rs index ff518df228..486424a255 100644 --- a/crates/health/src/collectors/nmxt.rs +++ b/crates/health/src/collectors/nmxt.rs @@ -35,7 +35,7 @@ use nv_redfish::core::Bmc; use crate::HealthError; use crate::collectors::{IterationResult, PeriodicCollector}; use crate::config::NmxtCollectorConfig as NmxtCollectorOptions; -use crate::endpoint::{BmcEndpoint, EndpointMetadata}; +use crate::endpoint::BmcEndpoint; use crate::sink::{CollectorEvent, DataSink, EventContext, MetricSample}; /// default NMX-T port @@ -460,7 +460,6 @@ pub struct NmxtCollectorConfig { pub struct NmxtCollector { endpoint: Arc, - switch_id: String, http_client: reqwest::Client, event_context: EventContext, data_sink: Option>, @@ -474,10 +473,6 @@ impl PeriodicCollector for NmxtCollector { endpoint: Arc, config: Self::Config, ) -> Result { - let switch_id = match &endpoint.metadata { - Some(EndpointMetadata::Switch(s)) => s.serial.clone(), - _ => endpoint.addr.mac.to_string(), - }; let event_context = EventContext::from_endpoint(endpoint.as_ref(), "nmxt"); let request_timeout = config.nmxt_config.request_timeout; @@ -495,7 +490,6 @@ impl PeriodicCollector for NmxtCollector { Ok(Self { endpoint, - switch_id, http_client, event_context, data_sink: config.data_sink, @@ -530,13 +524,9 @@ impl NmxtCollector { /// Builds label set for one `switch_nmxt` series fn build_labels( &self, - switch_ip: &str, 
sample_labels: &HashMap, ) -> Vec<(Cow<'static, str>, String)> { - let mut labels: Vec<(Cow<'static, str>, String)> = - Vec::with_capacity(2 + NMXT_LABEL_MAP.len()); - labels.push((Cow::Borrowed("switch_id"), self.switch_id.clone())); - labels.push((Cow::Borrowed("switch_ip"), switch_ip.to_string())); + let mut labels: Vec<(Cow<'static, str>, String)> = Vec::with_capacity(NMXT_LABEL_MAP.len()); for label in NMXT_LABEL_MAP { if let Some(value) = sample_labels.get(label.source) { @@ -576,7 +566,7 @@ impl NmxtCollector { continue; }; if cable_temp_ports.insert(port_num.to_string()) { - let labels = self.build_labels(&switch_ip, &sample_labels); + let labels = self.build_labels(&sample_labels); self.emit_event(CollectorEvent::Metric( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), @@ -599,7 +589,7 @@ impl NmxtCollector { }; if down_blame_ports.insert(port_num.to_string()) { let current = down_blame_to_state(raw); - let base_labels = self.build_labels(&switch_ip, &sample_labels); + let base_labels = self.build_labels(&sample_labels); for state in DOWN_BLAME_STATES { let mut labels = base_labels.clone(); labels.push((Cow::Borrowed("state"), (*state).to_string())); @@ -634,7 +624,7 @@ impl NmxtCollector { metric_key.push(':'); metric_key.push_str(port_num); - let labels = self.build_labels(&switch_ip, &sample_labels); + let labels = self.build_labels(&sample_labels); self.emit_event(CollectorEvent::Metric( MetricSample { @@ -975,7 +965,6 @@ Link_Down{Port_Number="1"} 5 }); let collector = NmxtCollector { endpoint: endpoint.clone(), - switch_id: "test-switch".to_string(), http_client: reqwest::Client::new(), event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), data_sink: Some(sink.clone()), @@ -986,7 +975,6 @@ Link_Down{Port_Number="1"} 5 r#"lid{Port_Number="11", down_blame="Remote_phy"} 3093"#, r#"Effective_BER{Port_Number="11", down_blame="Remote_phy"} 0"#, ]; - let switch_ip = endpoint.addr.ip.to_string(); let mut 
down_blame_ports: HashSet = HashSet::new(); for line in lines { let sample = parse_prometheus_line(line).expect("parse line"); @@ -997,7 +985,7 @@ Link_Down{Port_Number="1"} 5 if down_blame_ports.insert(port_num.to_string()) { let current = down_blame_to_state(raw); for state in DOWN_BLAME_STATES { - let mut labels = collector.build_labels(&switch_ip, &sample.labels); + let mut labels = collector.build_labels(&sample.labels); labels.push((Cow::Borrowed("state"), (*state).to_string())); collector.emit_event(CollectorEvent::Metric( MetricSample { @@ -1086,7 +1074,6 @@ Link_Down{Port_Number="1"} 5 }); let collector = NmxtCollector { endpoint: endpoint.clone(), - switch_id: "test-switch".to_string(), http_client: reqwest::Client::new(), event_context: EventContext::from_endpoint(endpoint.as_ref(), "nmxt"), data_sink: Some(sink.clone()), @@ -1097,7 +1084,6 @@ Link_Down{Port_Number="1"} 5 r#"lid{Port_Number="11", Module_Temperature="37.5C"} 3093"#, r#"Effective_BER{Port_Number="11", Module_Temperature="37.5C"} 0"#, ]; - let switch_ip = endpoint.addr.ip.to_string(); let mut cable_temp_ports: HashSet = HashSet::new(); for line in lines { let sample = parse_prometheus_line(line).expect("parse line"); @@ -1110,7 +1096,7 @@ Link_Down{Port_Number="1"} 5 continue; }; if cable_temp_ports.insert(port_num.to_string()) { - let labels = collector.build_labels(&switch_ip, &sample.labels); + let labels = collector.build_labels(&sample.labels); collector.emit_event(CollectorEvent::Metric( MetricSample { key: format!("cable_temperature_celsius:{}", port_num), diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index 1ce1ff9af7..93ab6bf951 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -247,28 +247,16 @@ pub fn build_metrics_export_request( let mut by_endpoint: HashMap, Vec)> = HashMap::new(); for (context, sample) in batch { - let mut attributes: Vec = sample + // Switch identity rides once on the resource 
attributes (switch.id, + // switch.serial_number, switch.ip). VictoriaMetrics flattens resource + // attributes onto every series, so promoting them onto the datapoint too + // only duplicates the same value under a second (underscore) label name. + let attributes: Vec = sample .labels .iter() .map(|(k, v)| kv(k, v.clone())) .collect(); - // Promote switch identity onto the datapoint so it is queryable as a - // per-series label. As an OTLP resource attribute alone it lands on - // target_info, not the series. These datapoint labels use the underscore - // form (Prometheus label names cannot contain dots); the dotted - // switch.serial_number / switch.id live on the resource attributes. - if !attributes.iter().any(|attr| attr.key == "switch_serial") - && let Some(serial) = context.switch_serial() - { - attributes.push(kv("switch_serial", serial.to_string())); - } - if !attributes.iter().any(|attr| attr.key == "switch_id") - && let Some(switch_id) = context.switch_id() - { - attributes.push(kv("switch_id", switch_id.to_string())); - } - let data_point = NumberDataPoint { attributes, time_unix_nano: observed_nanos, @@ -763,7 +751,7 @@ mod tests { } #[test] - fn switch_nmxt_metric_carries_full_name_and_switch_serial_label() { + fn switch_nmxt_identity_is_resource_only_not_on_datapoint() { let switch_id = test_switch_id("switch-nmxt"); let switch_id_attr = switch_id.to_string(); let context = EventContext { @@ -796,7 +784,8 @@ mod tests { }; let request = build_metrics_export_request(&[(context, sample)], "carbide_hardware_health"); - let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; + let resource_metrics = &request.resource_metrics[0]; + let metrics = &resource_metrics.scope_metrics[0].metrics; assert_eq!(metrics.len(), 1); assert_eq!( @@ -807,10 +796,23 @@ mod tests { let metric::Data::Gauge(gauge) = metrics[0].data.as_ref().expect("metric data") else { panic!("expected gauge data"); }; + // Identity must NOT be promoted onto the datapoint (VM 
duplicates it from the resource). let attrs = &gauge.data_points[0].attributes; - assert_eq!(attr_value(attrs, "switch_serial"), Some("SN-SWITCH-001")); + assert_eq!(attr_value(attrs, "switch_serial"), None); + assert_eq!(attr_value(attrs, "switch_id"), None); + + // It lives once, on the resource (dotted form). + let resource_attrs = &resource_metrics + .resource + .as_ref() + .expect("resource") + .attributes; + assert_eq!( + attr_value(resource_attrs, "switch.serial_number"), + Some("SN-SWITCH-001") + ); assert_eq!( - attr_value(attrs, "switch_id"), + attr_value(resource_attrs, "switch.id"), Some(switch_id_attr.as_str()) ); }