Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/health-probe-core.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,19 @@ export function rollupSubnetStatus({
return "failed";
}

// Increment the matching bucket of a `{ ok, degraded, failed, unknown }` tally,
// folding any status outside those four into `unknown`. rollupSubnetStatus only
// reads the four known buckets, so an unrecognized status that landed in a stray
// `counts[status]` key would be invisible to it — and a subnet whose surfaces are
// ALL unrecognized would have failed===0 && degraded===0 and wrongly roll up to
// "ok" instead of "unknown". Every status tally funnels through here so that the
// fold can never drift between the prober and the live serve overlay. (#1739)
export function tallyStatus(counts, status) {
const bucket = Object.hasOwn(counts, status) ? status : "unknown";
counts[bucket] += 1;
return counts;
}

// Latency is a success-only signal: keep a probe's latency only when it resolved
// `ok`. Every failure (timeout, 5xx, unsafe, thrown) collapses to null, so it
// counts toward uptime but never toward the latency mean or percentiles.
Expand Down
7 changes: 4 additions & 3 deletions src/health-prober.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
okLatencyMs,
probeSurface as coreProbeSurface,
rollupSubnetStatus,
tallyStatus,
} from "./health-probe-core.mjs";
import { latencyStatColumns, rankedChecksCte } from "./health-sql.mjs";
import { ipv6EmbeddedIpv4 } from "./ip-safety.mjs";
Expand Down Expand Up @@ -349,7 +350,7 @@ function summarizeGroup(rows) {
let lastOk = 0;
const latencies = [];
for (const row of rows) {
counts[row.status] = (counts[row.status] || 0) + 1;
tallyStatus(counts, row.status);
if (row.checked_at_ms > lastChecked) lastChecked = row.checked_at_ms;
if (row.last_ok_ms && row.last_ok_ms > lastOk) lastOk = row.last_ok_ms;
if (Number.isFinite(row.latency_ms)) latencies.push(row.latency_ms);
Expand Down Expand Up @@ -493,7 +494,7 @@ export async function runHealthProber(env, ctx, overrides = {}) {
await persistToKv(kv, probed, runAt);

const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
for (const row of probed) counts[row.status] = (counts[row.status] || 0) + 1;
for (const row of probed) tallyStatus(counts, row.status);
const durationMs = now() - runAt;
// Wall-time guard: the prober runs on a 15-minute Cron Trigger, a hard CF
// ceiling. As the autonomous flywheel grows surfaces, a sweep that creeps past
Expand Down Expand Up @@ -591,7 +592,7 @@ async function persistToD1(db, probed, runAt) {
async function persistToKv(kv, probed, runAt) {
if (!kv?.put) return;
const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
for (const row of probed) counts[row.status] = (counts[row.status] || 0) + 1;
for (const row of probed) tallyStatus(counts, row.status);

const surfaceRows = probed.map((row) => ({
surface_id: row.surface_id,
Expand Down
4 changes: 2 additions & 2 deletions src/health-serving.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// objects + D1 rows in.

import { computeReliability, scoreFromStats } from "./reliability.mjs";
import { rollupSubnetStatus } from "./health-probe-core.mjs";
import { rollupSubnetStatus, tallyStatus } from "./health-probe-core.mjs";
import { dailyLatencyColumns } from "./health-sql.mjs";
import { KV_ECONOMICS_CURRENT, KV_HEALTH_CURRENT } from "./kv-keys.mjs";

Expand Down Expand Up @@ -107,7 +107,7 @@ export function summarizeRows(rows) {
const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
const latencies = [];
for (const row of rows) {
counts[row.status] = (counts[row.status] || 0) + 1;
tallyStatus(counts, row.status);
if (Number.isFinite(row.latency_ms)) latencies.push(row.latency_ms);
}
return {
Expand Down
23 changes: 23 additions & 0 deletions tests/health-probe-core.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
rollupSubnetStatus,
statusForClassification,
summarizeRpcProbe,
tallyStatus,
} from "../src/health-probe-core.mjs";

describe("rollupSubnetStatus (shared subnet-status precedence)", () => {
Expand Down Expand Up @@ -48,6 +49,28 @@ describe("rollupSubnetStatus (shared subnet-status precedence)", () => {
});
});

describe("tallyStatus (fold unrecognized status into unknown, #1739)", () => {
test("a known status increments its own bucket", () => {
const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
tallyStatus(counts, "ok");
tallyStatus(counts, "degraded");
assert.deepEqual(counts, { ok: 1, degraded: 1, failed: 0, unknown: 0 });
});
test("an unrecognized status folds into unknown, never a stray key", () => {
const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
tallyStatus(counts, "weird");
tallyStatus(counts, undefined);
assert.deepEqual(counts, { ok: 0, degraded: 0, failed: 0, unknown: 2 });
});
test("an inherited Object.prototype key is treated as unrecognized", () => {
// `"constructor" in counts` is true via the prototype chain; Object.hasOwn
// keeps the guard to the four real buckets so it still folds to unknown.
const counts = { ok: 0, degraded: 0, failed: 0, unknown: 0 };
tallyStatus(counts, "constructor");
assert.deepEqual(counts, { ok: 0, degraded: 0, failed: 0, unknown: 1 });
});
});

// Minimal Response-like stub for an injected fetch.
function fakeResponse({
status = 200,
Expand Down
34 changes: 34 additions & 0 deletions tests/health-prober.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -1429,6 +1429,40 @@ describe("summarizeGroup / rollupStatus via per-subnet rollup", () => {
assert.equal(subnet.last_checked, new Date(5000).toISOString());
});

test("all-unrecognized subnet folds into unknown, not a false ok (#1739)", async () => {
const kv = makeKv();
await runHealthProber(
{},
{},
{
now: () => 5000,
db: makeDb(),
kv,
loadSurfaces: async () => [
buildSurface("w1", 17),
buildSurface("w2", 17),
],
// A status outside ok/degraded/failed/unknown. Without tallyStatus
// folding it into `unknown`, the stray count was invisible to
// rollupSubnetStatus (failed===0 && degraded===0) and the subnet wrongly
// rolled up to "ok" in the cron `health:current` snapshot.
probeSurface: async () => ({
status: "weird",
classification: null,
latency_ms: null,
}),
probeOptions: {},
},
);
const current = kv.json(KV_HEALTH_CURRENT);
const subnet = current.subnets.find((s) => s.netuid === 17);
assert.equal(subnet.status, "unknown");
assert.equal(subnet.unknown_count, 2);
// The run-level diagnostic tally folds the same way — no stray bucket leaks.
assert.equal(current.summary.status_counts.unknown, 2);
assert.equal(current.summary.status_counts.weird, undefined);
});

test("a zero checked-at timestamp reports last_checked null, not the epoch", async () => {
// Same 0-sentinel guard as last_ok, on the last_checked field: with the
// clock at the Unix epoch every row's checked_at_ms is 0, so lastChecked
Expand Down
20 changes: 15 additions & 5 deletions tests/health-serving.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -390,15 +390,25 @@ describe("summarizeRows / rollupStatus", () => {
assert.equal(out.status, "failed");
assert.equal(out.failed_count, 2);
});
test("unrecognized status key initializes its own count (|| 0 branch)", () => {
// A status outside the known keys exercises the `counts[row.status] || 0`
// default-init branch in summarizeRows. With no failed/degraded counts,
// rollupStatus reports "ok".
test("all-unrecognized status folds into unknown → unknown (#1739)", () => {
// A status outside the four known buckets is folded into `unknown` by
// tallyStatus. Without that fold it landed in a stray `counts.weird` key that
// rollupStatus ignored, leaving failed===0 && degraded===0 → a false "ok"
// for a subnet whose surfaces are entirely unrecognized.
const out = summarizeRows([row("weird"), row("weird")]);
assert.equal(out.status, "ok");
assert.equal(out.status, "unknown");
assert.equal(out.unknown_count, 2);
assert.equal(out.failed_count, 0);
assert.equal(out.ok_count, 0);
});
test("unrecognized status mixed with failed still surfaces failed (#1739)", () => {
// The fold must not mask real failures: a "weird" + "failed" subnet is not
// all-unknown, so it rolls up to failed rather than being diluted to unknown.
const out = summarizeRows([row("weird"), row("failed")]);
assert.equal(out.status, "failed");
assert.equal(out.unknown_count, 1);
assert.equal(out.failed_count, 1);
});
test("aggregates latency (rounded), latest last_checked/last_ok", () => {
const out = summarizeRows([
row("ok", {
Expand Down