From 5527d40706b474c839cb9889a19b3f57d4d5535d Mon Sep 17 00:00:00 2001
From: Aditya Basu <adityabasu@meta.com>
Date: Thu, 25 Jun 2026 19:04:42 -0700
Subject: [PATCH] Truncate oversized NODELIST/SCHEDNODES/EXC_NODES in squeue
 serialization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
## Context/Motivation
The GCM `gcm-slurm-job-monitor` on `fair-sc` is in a crash loop (~20x/hour for 24+ hours)
because a single job's squeue record serializes to 1,246,781 bytes, exceeding the 1MB
Scribe chunk_size limit. This causes `chunk_by_json_size()` to raise a `ValueError` which
kills the entire collection cycle, resulting in ~51% data loss in `fair_job_data`.

The root cause is that the NODELIST parser expands Slurm range notation into individual
node names without any cap. For very large jobs, the expanded list pushes the job's
JSON-serialized record past the Scribe message size limit.

## This diff
- Adds a `_truncated_nodelist()` wrapper around the nodelist parser that caps the expanded
  list at 1000 entries (≈30KB, well within the 1MB limit)
- Appends "..." as a sentinel marker when truncation occurs
- Logs a warning with the original count and first/last node names
- Applies the truncation to NODELIST, EXC_NODES, and SCHEDNODES fields in JobData

Differential Revision: D109782894
---
 gcm/schemas/slurm/squeue.py | 26 ++++++++++++++++++++++---
 gcm/tests/test_slurm.py     | 39 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/gcm/schemas/slurm/squeue.py b/gcm/schemas/slurm/squeue.py
index de53473..e18b251 100644
--- a/gcm/schemas/slurm/squeue.py
+++ b/gcm/schemas/slurm/squeue.py
@@ -1,5 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+import logging
+
 from dataclasses import dataclass, field, fields
 
 from gcm.monitoring.clock import time_to_time_aware
@@ -14,6 +16,24 @@
 from gcm.schemas.dataclass import parsed_field
 from gcm.schemas.slurm.derived_cluster import DerivedCluster
 
+logger = logging.getLogger(__name__)
+
+_MAX_NODELIST_ENTRIES = 1000  # most expanded node names kept per nodelist field
+
+
+def _truncated_nodelist(s: str) -> list[str] | None:
+    """Parse a Slurm nodelist, capping it at _MAX_NODELIST_ENTRIES names plus '...'."""
+    parsed, _ = nodelist()(s)
+    if parsed is None or len(parsed) <= _MAX_NODELIST_ENTRIES:
+        return parsed
+    # Shared by the NODELIST, EXC_NODES and SCHEDNODES parsers, so the warning must
+    # not name one specific field.
+    logger.warning(
+        f"Truncating nodelist from {len(parsed)} to {_MAX_NODELIST_ENTRIES} entries "
+        f"(first: {parsed[0]}, last: {parsed[-1]})"
+    )
+    return parsed[:_MAX_NODELIST_ENTRIES] + ["..."]
+
 
 @dataclass(kw_only=True)
 class JobData(DerivedCluster):
@@ -39,10 +59,10 @@ class JobData(DerivedCluster):
     NODES: int | None = parsed_field(parser=maybe_int, field_name="NUMNODES")
     TIME_LEFT: str = parsed_field(parser=str, field_name="TIMELEFT")
     TIME_USED: str = parsed_field(parser=str, field_name="TIMEUSED")
-    NODELIST: list[str] | None = parsed_field(parser=lambda s: nodelist()(s)[0])
+    NODELIST: list[str] | None = parsed_field(parser=_truncated_nodelist)
     DEPENDENCY: str = parsed_field(parser=str)
     EXC_NODES: list[str] | None = parsed_field(
-        parser=lambda s: nodelist()(s)[0], field_name="EXCNODES"
+        parser=_truncated_nodelist, field_name="EXCNODES"
     )
     START_TIME: str = parsed_field(parser=time_to_time_aware, field_name="STARTTIME")
     SUBMIT_TIME: str = parsed_field(parser=time_to_time_aware, field_name="SUBMITTIME")
@@ -75,7 +95,7 @@ class JobData(DerivedCluster):
     REQUEUE: str = parsed_field(parser=str)
     FEATURE: str = parsed_field(parser=str)
     RESTARTCNT: int = parsed_field(parser=int)
-    SCHEDNODES: list[str] | None = parsed_field(parser=lambda s: nodelist()(s)[0])
+    SCHEDNODES: list[str] | None = parsed_field(parser=_truncated_nodelist)
     LAST_SCHED_EVAL: str = parsed_field(
         parser=time_to_time_aware, field_name="LASTSCHEDEVAL"
     )
diff --git a/gcm/tests/test_slurm.py b/gcm/tests/test_slurm.py
index baf1768..0abbfc5 100644
--- a/gcm/tests/test_slurm.py
+++ b/gcm/tests/test_slurm.py
@@ -15,7 +15,7 @@
 
 from gcm.schemas.slurm.sinfo import Sinfo
 from gcm.schemas.slurm.sinfo_node import SinfoNode
-from gcm.schemas.slurm.squeue import JobData
+from gcm.schemas.slurm.squeue import _truncated_nodelist, JobData
 from gcm.tests import data
 
 TEST_CLUSTER = "test_cluster"
@@ -653,3 +653,40 @@ def test_parse_sdiag_json_with_explicit_null_timestamp_objects(
         assert result.job_states_ts is None
         assert result.bf_when_last_cycle is None
         mock_reset.assert_called_once()
+
+
+class TestTruncatedNodelist:
+    """Tests for the _truncated_nodelist helper that caps oversized nodelists."""
+
+    def test_normal_nodelist_unchanged(self) -> None:
+        """A nodelist well under the 1000-entry cap passes through unchanged."""
+        # "node[001-010]" expands to 10 entries; zero padding is kept in the names
+        result = _truncated_nodelist("node[001-010]")
+        assert result is not None
+        assert len(result) == 10
+        assert result[0] == "node001"
+        assert result[-1] == "node010"
+        # No "..." truncation marker is appended
+        assert "..." not in result
+
+    def test_large_nodelist_truncated(self) -> None:
+        """Nodelists with > 1000 entries keep the first 1000 plus a '...' marker."""
+        # "h200-[0001-2000]" expands to 2000 entries, twice the cap
+        result = _truncated_nodelist("h200-[0001-2000]")
+        assert result is not None
+        assert len(result) == 1001  # 1000 entries + "..."
+        assert result[0] == "h200-0001"
+        assert result[999] == "h200-1000"  # last real node name that is kept
+        assert result[-1] == "..."
+
+    def test_exactly_1000_entries_unchanged(self) -> None:
+        """Exactly 1000 entries is at the cap, not over it, so nothing is cut."""
+        result = _truncated_nodelist("n[0001-1000]")
+        assert result is not None
+        assert len(result) == 1000
+        assert "..." not in result
+
+    def test_empty_nodelist_returns_none(self) -> None:
+        """An empty nodelist string yields None rather than an empty list."""
+        result = _truncated_nodelist("")
+        assert result is None