From a3520fe63ce601ffd50059d68c6890cda59f8cdc Mon Sep 17 00:00:00 2001 From: Ivan Liang Date: Sun, 7 Jun 2026 18:30:04 -0400 Subject: [PATCH] fix: correct query of ondemand challenges to prune --- cmgr/api.go | 43 ++++++++++++--- cmgr/prune_test.go | 131 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 144 insertions(+), 30 deletions(-) diff --git a/cmgr/api.go b/cmgr/api.go index 95d972f..d40139d 100644 --- a/cmgr/api.go +++ b/cmgr/api.go @@ -640,22 +640,53 @@ func (m *Manager) checkPrune() { } func (m *Manager) Prune() error { + // Guard the exported method directly: pruneAge <= 0 means pruning is + // disabled. Without this, a direct call would compute datetime('now', '-0 + // seconds') == now and delete essentially every on-demand instance. checkPrune + // also short-circuits, but Prune is public and may be called on its own. + if m.pruneAge <= 0 { + return nil + } + m.log.debugf("pruning on-demand instances older than %s", m.pruneAge) - // Manual builds start with the manual schema prefix. We want to find - // instances associated with these builds that are older than the prune age. - // We also prune those with NULL created_at (legacy entries). + // On-demand instances belong to builds with instancecount == DYNAMIC_INSTANCES + // (-1). Every other instancecount is excluded: fixed-pool builds (a positive + // count, maintained by the schema converge loop) and LOCKED builds (-2, a + // removed/locked schema) must never be pruned here. Note this selects on + // instancecount, not the schema name: schema-defined dynamic challenges keep + // their real schema (e.g. "picoctf-2024"), so a name-based filter would miss + // them entirely. + // + // This only clears the database rows (and, via ON DELETE CASCADE, their port + // and container assignments). It does not stop containers: by the time this + // fires the external cleanup (API/Celery) and docker_reaper have normally + // already removed them, so the goal here is to reclaim DB rows and assignable + // ports rather than to tear down live Docker resources. We also prune rows with + // NULL created_at (legacy entries that predate the column). + // + // The age check is restricted to finalized instances. Unfinalized rows are + // in-progress (or crashed) launches and are owned exclusively by the 5-minute + // crash-GC below; without this guard a small pruneAge could delete a launch + // that is still mid-flight. Legacy NULL-created_at rows are always finalized, + // so pruning them unconditionally is safe. query := ` DELETE FROM instances WHERE id IN ( SELECT i.id FROM instances AS i JOIN builds AS b ON i.build = b.id - WHERE b.schema LIKE ? - AND (i.created_at IS NULL OR i.created_at < datetime('now', ?)) + WHERE b.instancecount = ? + AND (i.created_at IS NULL OR (i.is_finalized = 1 AND i.created_at < datetime('now', ?))) );` - res, err := m.db.Exec(query, manualSchemaPrefix+"%", fmt.Sprintf("-%d seconds", int(m.pruneAge.Seconds()))) + // Render pruneAge to whole seconds for SQLite's datetime() window, rounding + // up: truncating down would prune slightly more aggressively than configured, + // and a sub-second pruneAge would truncate to a "-0 seconds" window matching + // everything. Ceiling keeps the effective age >= configured and always >= 1s. + ageSeconds := int((m.pruneAge + time.Second - time.Nanosecond) / time.Second) + + res, err := m.db.Exec(query, DYNAMIC_INSTANCES, fmt.Sprintf("-%d seconds", ageSeconds)) if err != nil { return err } diff --git a/cmgr/prune_test.go b/cmgr/prune_test.go index b52656e..332b2cb 100644 --- a/cmgr/prune_test.go +++ b/cmgr/prune_test.go @@ -6,9 +6,9 @@ import ( ) // setupPruneTestFixture creates a Manager, a challenge, a build with the -// given schema, and one instance. It returns the Manager, BuildId, and -// InstanceId so callers can manipulate them directly. -func setupPruneTestFixture(t *testing.T, schema string) (*Manager, BuildId, InstanceId) { +// given schema and instance count, and one instance. It returns the Manager, +// BuildId, and InstanceId so callers can manipulate them directly. +func setupPruneTestFixture(t *testing.T, schema string, instanceCount int) (*Manager, BuildId, InstanceId) { t.Helper() mgr := setupTestManager(t) @@ -39,7 +39,7 @@ func setupPruneTestFixture(t *testing.T, schema string) (*Manager, BuildId, Inst Format: "flag{%s}", Challenge: "test/prune-test", Schema: schema, - InstanceCount: DYNAMIC_INSTANCES, + InstanceCount: instanceCount, } if err := mgr.openBuild(build); err != nil { t.Fatalf("openBuild failed: %s", err) @@ -77,10 +77,12 @@ func backdateInstance(t *testing.T, mgr *Manager, id InstanceId, age time.Durati } } -// TestPruneRemovesOldManualInstances verifies that instances belonging to -// manual-schema builds whose created_at is older than pruneAge are deleted. -func TestPruneRemovesOldManualInstances(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test") +// TestPruneRemovesOldDynamicInstances verifies that instances belonging to +// on-demand (instancecount == DYNAMIC_INSTANCES) builds whose created_at is +// older than pruneAge are deleted. The schema name is a normal schema-defined +// name (not "manual-") to prove selection is by instance count, not name. +func TestPruneRemovesOldDynamicInstances(t *testing.T) { + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour // Make the instance appear 2 hours old. @@ -96,10 +98,10 @@ func TestPruneRemovesOldManualInstances(t *testing.T) { } } -// TestPruneRetainsRecentManualInstances verifies that instances younger than -// pruneAge are left untouched. -func TestPruneRetainsRecentManualInstances(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test") +// TestPruneRetainsRecentDynamicInstances verifies that on-demand instances +// younger than pruneAge are left untouched. +func TestPruneRetainsRecentDynamicInstances(t *testing.T) { + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour // The instance was just created (seconds ago) — well within the 1 h window. @@ -112,11 +114,11 @@ func TestPruneRetainsRecentManualInstances(t *testing.T) { } } -// TestPruneDoesNotAffectNonManualInstances verifies that instances belonging -// to builds whose schema does NOT start with "manual-" are never pruned, even -// when they are older than pruneAge. -func TestPruneDoesNotAffectNonManualInstances(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "schema-abc123") +// TestPruneDoesNotAffectFixedInstances verifies that instances belonging to +// fixed-pool builds (instancecount > 0, managed by the schema converge loop) +// are never pruned, even when they are older than pruneAge. +func TestPruneDoesNotAffectFixedInstances(t *testing.T) { + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", 1) mgr.pruneAge = 1 * time.Hour // Backdate the instance to be older than pruneAge. @@ -127,14 +129,14 @@ func TestPruneDoesNotAffectNonManualInstances(t *testing.T) { } if _, err := mgr.lookupInstanceMetadata(iid); err != nil { - t.Errorf("expected non-manual instance to be retained, got error: %s", err) + t.Errorf("expected fixed-pool instance to be retained, got error: %s", err) } } // TestPruneNullCreatedAt verifies that instances with a NULL created_at field // (legacy rows) are treated as expired and pruned. func TestPruneNullCreatedAt(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test") + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour // Simulate a legacy row that has no created_at value. @@ -152,10 +154,56 @@ func TestPruneNullCreatedAt(t *testing.T) { } } +// TestPruneSkipsWhenDisabled verifies that calling Prune() directly with +// pruneAge <= 0 (pruning disabled) is a no-op rather than deleting every +// on-demand instance via a zero-second age window. +func TestPruneSkipsWhenDisabled(t *testing.T) { + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) + mgr.pruneAge = 0 // disabled + + // Backdate well past any plausible age so it would be pruned if enabled. + backdateInstance(t, mgr, iid, 24*time.Hour) + + if err := mgr.Prune(); err != nil { + t.Fatalf("Prune() returned error: %s", err) + } + + if _, err := mgr.lookupInstanceMetadata(iid); err != nil { + t.Errorf("pruning disabled (pruneAge=0), but instance was removed: %s", err) + } +} + +// TestPruneIgnoresUnfinalizedWithinGCWindow verifies that the age-based prune +// does not delete an unfinalized (in-progress) instance that is older than +// pruneAge but younger than the 5-minute crash-GC window. Unfinalized rows are +// owned exclusively by the crash-GC, so a small pruneAge must not race with an +// in-flight launch. +func TestPruneIgnoresUnfinalizedWithinGCWindow(t *testing.T) { + mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) + mgr.pruneAge = 1 * time.Second + + instance := &InstanceMetadata{Build: bid} + if err := mgr.openInstance(instance); err != nil { + t.Fatalf("openInstance failed: %s", err) + } + // intentionally not finalized — simulates a launch still in progress + + // Older than pruneAge (1s) but well within the 5-minute crash-GC window. + backdateInstance(t, mgr, instance.Id, 30*time.Second) + + if err := mgr.Prune(); err != nil { + t.Fatalf("Prune() returned error: %s", err) + } + + if _, err := mgr.lookupInstanceMetadata(instance.Id); err != nil { + t.Errorf("expected in-progress unfinalized instance to be retained, got error: %s", err) + } +} + // TestCheckPruneSkipsWhenDisabled verifies that checkPrune is a no-op when // pruneAge is zero (i.e., pruning is disabled). func TestCheckPruneSkipsWhenDisabled(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test") + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 0 // disabled mgr.pruneInterval = 0 @@ -175,7 +223,7 @@ func TestCheckPruneSkipsWhenDisabled(t *testing.T) { // TestCheckPruneSkipsWithinInterval verifies that checkPrune does not trigger // a prune when the prune interval has not yet elapsed. func TestCheckPruneSkipsWithinInterval(t *testing.T) { - mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test") + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour mgr.pruneInterval = 10 * time.Minute @@ -198,7 +246,7 @@ func TestCheckPruneSkipsWithinInterval(t *testing.T) { // TestPruneGarbageCollectsStaleUnfinalizedInstances verifies that unfinalized // instances (simulating a crashed launch) older than 5 minutes are deleted. func TestPruneGarbageCollectsStaleUnfinalizedInstances(t *testing.T) { - mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-test") + mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour instance := &InstanceMetadata{Build: bid} @@ -221,7 +269,7 @@ func TestPruneGarbageCollectsStaleUnfinalizedInstances(t *testing.T) { // TestPruneRetainsRecentUnfinalizedInstances verifies that an unfinalized // instance newer than 5 minutes is left alone (launch may still be in progress). func TestPruneRetainsRecentUnfinalizedInstances(t *testing.T) { - mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-test") + mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour instance := &InstanceMetadata{Build: bid} @@ -242,7 +290,7 @@ func TestPruneRetainsRecentUnfinalizedInstances(t *testing.T) { // TestPruneGCReleasesReservedPorts verifies that ports reserved before a // crashed launch are freed when the unfinalized instance is GC'd. func TestPruneGCReleasesReservedPorts(t *testing.T) { - mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-ports-test") + mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) mgr.pruneAge = 1 * time.Hour mgr.portLow = 10000 mgr.portHigh = 20000 @@ -271,3 +319,38 @@ func TestPruneGCReleasesReservedPorts(t *testing.T) { t.Errorf("expected port %d to be released after GC, but it still exists in portAssignments", port) } } + +// TestPruneAgeReleasesReservedPorts verifies that a finalized on-demand instance +// pruned for age (not the crash GC) frees its reserved port via the ON DELETE +// CASCADE on portAssignments. Reclaiming these port reservations is the primary +// reason for pruning in deployments that run with explicit port ranges. +func TestPruneAgeReleasesReservedPorts(t *testing.T) { + mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES) + mgr.pruneAge = 1 * time.Hour + mgr.portLow = 10000 + mgr.portHigh = 20000 + + port, err := mgr.reservePort(iid, "http") + if err != nil { + t.Fatalf("reservePort failed: %s", err) + } + + // Age the (already finalized) instance past pruneAge. + backdateInstance(t, mgr, iid, 2*time.Hour) + + if err := mgr.Prune(); err != nil { + t.Fatalf("Prune() returned error: %s", err) + } + + if _, err := mgr.lookupInstanceMetadata(iid); err == nil { + t.Error("expected aged on-demand instance to be pruned, but it still exists") + } + + var count int + if err := mgr.db.QueryRow("SELECT COUNT(*) FROM portAssignments WHERE port = ?", port).Scan(&count); err != nil { + t.Fatalf("portAssignments query failed: %s", err) + } + if count != 0 { + t.Errorf("expected port %d to be released after age prune, but it still exists in portAssignments", port) + } +}