From 5527d40706b474c839cb9889a19b3f57d4d5535d Mon Sep 17 00:00:00 2001 From: Aditya Basu Date: Thu, 25 Jun 2026 19:04:42 -0700 Subject: [PATCH] Truncate oversized NODELIST/SCHEDNODES/EXC_NODES in squeue serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: ## Context/Motivation The GCM `gcm-slurm-job-monitor` on `fair-sc` is in a crash loop (~20x/hour for 24+ hours) because a single job's squeue record serializes to 1,246,781 bytes, exceeding the 1MB Scribe chunk_size limit. This causes `chunk_by_json_size()` to raise a `ValueError` which kills the entire collection cycle, resulting in ~51% data loss in `fair_job_data`. The root cause is that the NODELIST parser expands Slurm range notation into individual node names without any cap. For very large jobs, this expansion produces entries that exceed the Scribe message size limit when JSON-serialized. ## This diff - Adds a `_truncated_nodelist()` wrapper around the nodelist parser that caps the expanded list at 1000 entries (≈30KB, well within the 1MB limit) - Appends "..." as a sentinel marker when truncation occurs - Logs a warning with the original count and first/last node names - Applies the truncation to NODELIST, EXC_NODES, and SCHEDNODES fields in JobData Differential Revision: D109782894 --- gcm/schemas/slurm/squeue.py | 26 ++++++++++++++++++++++--- gcm/tests/test_slurm.py | 39 ++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/gcm/schemas/slurm/squeue.py b/gcm/schemas/slurm/squeue.py index de53473..e18b251 100644 --- a/gcm/schemas/slurm/squeue.py +++ b/gcm/schemas/slurm/squeue.py @@ -1,5 +1,7 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +import logging + from dataclasses import dataclass, field, fields from gcm.monitoring.clock import time_to_time_aware @@ -14,6 +16,24 @@ from gcm.schemas.dataclass import parsed_field from gcm.schemas.slurm.derived_cluster import DerivedCluster +logger = logging.getLogger(__name__) + +_MAX_NODELIST_ENTRIES = 1000 + + +def _truncated_nodelist(s: str) -> list[str] | None: + """Parse a Slurm nodelist, truncating to _MAX_NODELIST_ENTRIES with '...' marker.""" + parsed, _ = nodelist()(s) + if parsed is None: + return None + if len(parsed) > _MAX_NODELIST_ENTRIES: + logger.warning( + f"Truncating NODELIST from {len(parsed)} to {_MAX_NODELIST_ENTRIES} entries " + f"(first: {parsed[0]}, last: {parsed[-1]})" + ) + return parsed[:_MAX_NODELIST_ENTRIES] + ["..."] + return parsed + @dataclass(kw_only=True) class JobData(DerivedCluster): @@ -39,10 +59,10 @@ class JobData(DerivedCluster): NODES: int | None = parsed_field(parser=maybe_int, field_name="NUMNODES") TIME_LEFT: str = parsed_field(parser=str, field_name="TIMELEFT") TIME_USED: str = parsed_field(parser=str, field_name="TIMEUSED") - NODELIST: list[str] | None = parsed_field(parser=lambda s: nodelist()(s)[0]) + NODELIST: list[str] | None = parsed_field(parser=_truncated_nodelist) DEPENDENCY: str = parsed_field(parser=str) EXC_NODES: list[str] | None = parsed_field( - parser=lambda s: nodelist()(s)[0], field_name="EXCNODES" + parser=_truncated_nodelist, field_name="EXCNODES" ) START_TIME: str = parsed_field(parser=time_to_time_aware, field_name="STARTTIME") SUBMIT_TIME: str = parsed_field(parser=time_to_time_aware, field_name="SUBMITTIME") @@ -75,7 +95,7 @@ class JobData(DerivedCluster): REQUEUE: str = parsed_field(parser=str) FEATURE: str = parsed_field(parser=str) RESTARTCNT: int = parsed_field(parser=int) - SCHEDNODES: list[str] | None = parsed_field(parser=lambda s: nodelist()(s)[0]) + SCHEDNODES: list[str] | None = parsed_field(parser=_truncated_nodelist) LAST_SCHED_EVAL: str = parsed_field( parser=time_to_time_aware, field_name="LASTSCHEDEVAL" ) diff --git a/gcm/tests/test_slurm.py b/gcm/tests/test_slurm.py index baf1768..0abbfc5 100644 --- a/gcm/tests/test_slurm.py +++ b/gcm/tests/test_slurm.py @@ -15,7 +15,7 @@ from gcm.schemas.slurm.sinfo import Sinfo from gcm.schemas.slurm.sinfo_node import SinfoNode -from gcm.schemas.slurm.squeue import JobData +from gcm.schemas.slurm.squeue import _truncated_nodelist, JobData from gcm.tests import data TEST_CLUSTER = "test_cluster" @@ -653,3 +653,40 @@ def test_parse_sdiag_json_with_explicit_null_timestamp_objects( assert result.job_states_ts is None assert result.bf_when_last_cycle is None mock_reset.assert_called_once() + + +class TestTruncatedNodelist: + """Tests for the _truncated_nodelist helper that caps oversized nodelists.""" + + def test_normal_nodelist_unchanged(self) -> None: + """Nodelists with <= 1000 entries pass through unchanged.""" + # A small nodelist: "node[001-010]" expands to 10 entries + result = _truncated_nodelist("node[001-010]") + assert result is not None + assert len(result) == 10 + assert result[0] == "node001" + assert result[-1] == "node010" + # No "..." marker + assert "..." not in result + + def test_large_nodelist_truncated(self) -> None: + """Nodelists with > 1000 entries are truncated to 1000 + '...' marker.""" + # A large nodelist: "h200-[0001-2000]" expands to 2000 entries + result = _truncated_nodelist("h200-[0001-2000]") + assert result is not None + assert len(result) == 1001 # 1000 entries + "..." + assert result[0] == "h200-0001" + assert result[999] == "h200-1000" + assert result[-1] == "..." + + def test_exactly_1000_entries_unchanged(self) -> None: + """Nodelists with exactly 1000 entries are not truncated.""" + result = _truncated_nodelist("n[0001-1000]") + assert result is not None + assert len(result) == 1000 + assert "..." not in result + + def test_empty_nodelist_returns_none(self) -> None: + """Empty/unparseable nodelists return None.""" + result = _truncated_nodelist("") + assert result is None