From 84576091f73c89ca031f6f9c05487337e73b356d Mon Sep 17 00:00:00 2001 From: Roman Tiulmankov Date: Sat, 27 Jun 2026 14:43:52 -0700 Subject: [PATCH] Drop non-finite (NaN/Inf) values before ODS write Summary: The `gcm slurm_monitor` ODS exporter has been failing to publish on all updated FAIR/shared clusters since the June 18 GCM rollout (S678528). Every collection cycle the write to https://graph.facebook.com/v21.0/ods_metrics is rejected with `400 (#100) param datapoints must be an array`, so no `fcm.*` metrics land in ODS. `fcm.total_gpus_up` and the rest of the cluster/GPU-availability metrics have been flat since 2026-06-22, breaking the v3_gpu_availability detector and the FAIR cluster-health pages. Root cause: a metric value computed by slurm_monitor can be non-finite (e.g. a mean/variance over zero samples => 0/0 = NaN). `get_payload` filtered values by `isinstance(value, (int, float))`, but NaN/Inf are floats and pass the check. `json.dumps` then serializes them as the bare tokens `NaN`/`Infinity`, which are invalid JSON, so the Graph API rejects the entire batch -- dropping every datapoint in the request, not just the offending one. That is why all `fcm.*` keys vanish at once. Reproduced live against the deployed token: a minimal datapoint and a 10k-datapoint (~1.3MB) batch both return 200, a `None` value returns 200, but a single NaN value returns exactly the observed `400 (#100) param datapoints must be an array`. Fix: in `get_payload`, skip non-finite values (logging how many were dropped) and serialize with `allow_nan=False` as a belt-and-suspenders so we can never emit invalid JSON again. Valid metrics now publish even when one metric goes NaN. This restores `fcm.*` ODS publishing. The separate `scribe_category argument is missing` assertion (the sdiag scribe write on clusters without `sdiag_scribe_category`) is tracked independently and does not affect ODS metrics. Reviewed By: xman1979 Differential Revision: D109949701 --- gcm/monitoring/meta_utils/ods.py | 20 +++++++- gcm/tests/test_ods.py | 82 +++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/gcm/monitoring/meta_utils/ods.py b/gcm/monitoring/meta_utils/ods.py index 203d1a5..162a407 100644 --- a/gcm/monitoring/meta_utils/ods.py +++ b/gcm/monitoring/meta_utils/ods.py @@ -7,6 +7,7 @@ import json import logging +import math import time from dataclasses import asdict, dataclass from typing import Callable, List, Mapping, Optional, TYPE_CHECKING, TypedDict, Union @@ -117,8 +118,18 @@ def write( def get_payload(config: ODSConfig, data: List[ODSData]) -> ODSPayload: datapoints = [] + dropped = 0 for ods_data in data: for key, value in ods_data.metrics.items(): + # Non-finite values (NaN/Inf) serialize to bare `NaN`/`Infinity` tokens, + # which are invalid JSON. The ODS Graph API endpoint rejects the entire + # request with `400 (#100) param datapoints must be an array`, dropping + # every datapoint in the batch (not just the offending one). Skip them so + # a single bad metric (e.g. a mean/variance computed over zero samples) + # can't take down all of the cluster's metrics. + if isinstance(value, float) and not math.isfinite(value): + dropped += 1 + continue datapoints.append( { "entity": ods_data.entity, @@ -128,12 +139,19 @@ def get_payload(config: ODSConfig, data: List[ODSData]) -> ODSPayload: } ) + if dropped: + logger.warning( + f"Dropped {dropped} non-finite (NaN/Inf) datapoint(s) before ODS write " + f"for category {config.category_id}." + ) logger.debug( f"Payload of {len(datapoints)} datapoints for ODS category {config.category_id}." ) return { "access_token": config.secret_key, - "datapoints": json.dumps(datapoints), + # allow_nan=False guards against any non-finite value slipping through above: + # it raises locally instead of emitting invalid JSON that the API would 400 on. + "datapoints": json.dumps(datapoints, allow_nan=False), "category_id": config.category_id, } diff --git a/gcm/tests/test_ods.py b/gcm/tests/test_ods.py index a7fa388..059d92b 100644 --- a/gcm/tests/test_ods.py +++ b/gcm/tests/test_ods.py @@ -7,8 +7,10 @@ import pytest import requests +import json + from gcm.monitoring.clock import ClockImpl -from gcm.monitoring.meta_utils.ods import ODSConfig, ODSData, write +from gcm.monitoring.meta_utils.ods import get_payload, ODSConfig, ODSData, write from pytest_mock import MockerFixture from requests_mock import Mocker as RequestsMocker @@ -110,6 +112,84 @@ def test_write_raises( will_retry.assert_has_calls(mocker.call(i) for i in range(1, num_retries + 1)) +class TestGetPayload: + @staticmethod + def test_includes_finite_values(ods_config: ODSConfig) -> None: + data = [ + ODSData( + entity="fake_entity", + time=TEST_TIME, + metrics={"bar": 42, "baz": 100.0}, + ) + ] + + payload = get_payload(ods_config, data) + datapoints = json.loads(payload["datapoints"]) + + assert len(datapoints) == 2 + assert {dp["key"] for dp in datapoints} == {"bar", "baz"} + + @staticmethod + def test_drops_non_finite_values(ods_config: ODSConfig) -> None: + data = [ + ODSData( + entity="fake_entity", + time=TEST_TIME, + metrics={ + "good": 1.0, + "nan": float("nan"), + "inf": float("inf"), + "neg_inf": float("-inf"), + "also_good": 7, + }, + ) + ] + + # json.loads rejects bare NaN/Infinity tokens, so a successful parse also + # proves the serialized payload is valid JSON (what the ODS API requires). + payload = get_payload(ods_config, data) + datapoints = json.loads(payload["datapoints"]) + + assert {dp["key"] for dp in datapoints} == {"good", "also_good"} + + @staticmethod + def test_non_finite_values_do_not_raise(ods_config: ODSConfig) -> None: + data = [ + ODSData( + entity="fake_entity", + time=TEST_TIME, + metrics={"nan": float("nan")}, + ) + ] + + payload = get_payload(ods_config, data) + + assert json.loads(payload["datapoints"]) == [] + + +def test_write_with_non_finite_values_sends_valid_json( + requests_mock: RequestsMocker, + mocker: MockerFixture, + ods_config: ODSConfig, +) -> None: + data = [ + ODSData( + entity="fake_entity", + time=TEST_TIME, + metrics={"good": 1.0, "nan": float("nan")}, + ) + ] + requests_mock.post(ods_config.endpoint) + will_retry = mocker.MagicMock() + + write(data, ods_config, will_retry=will_retry) + + assert requests_mock.call_count == 1 + assert not will_retry.called + sent = json.loads(requests_mock.last_request.json()["datapoints"]) + assert {dp["key"] for dp in sent} == {"good"} + + class TestODSConfig: @staticmethod def test_loads_value() -> None: