Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions cmgr/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,22 +640,53 @@ func (m *Manager) checkPrune() {
}

func (m *Manager) Prune() error {
// Guard the exported method directly: pruneAge <= 0 means pruning is
// disabled. Without this, a direct call would compute datetime('now', '-0
// seconds') == now and delete essentially every on-demand instance. checkPrune
// also short-circuits, but Prune is public and may be called on its own.
if m.pruneAge <= 0 {
return nil
}

m.log.debugf("pruning on-demand instances older than %s", m.pruneAge)

Comment thread
hi-liang marked this conversation as resolved.
// Manual builds start with the manual schema prefix. We want to find
// instances associated with these builds that are older than the prune age.
// We also prune those with NULL created_at (legacy entries).
// On-demand instances belong to builds with instancecount == DYNAMIC_INSTANCES
// (-1). Every other instancecount is excluded: fixed-pool builds (a positive
// count, maintained by the schema converge loop) and LOCKED builds (-2, a
// removed/locked schema) must never be pruned here. Note this selects on
// instancecount, not the schema name: schema-defined dynamic challenges keep
// their real schema (e.g. "picoctf-2024"), so a name-based filter would miss
// them entirely.
//
// This only clears the database rows (and, via ON DELETE CASCADE, their port
// and container assignments). It does not stop containers: by the time this
// fires the external cleanup (API/Celery) and docker_reaper have normally
// already removed them, so the goal here is to reclaim DB rows and assignable
// ports rather than to tear down live Docker resources. We also prune rows with
// NULL created_at (legacy entries that predate the column).
//
// The age check is restricted to finalized instances. Unfinalized rows are
// in-progress (or crashed) launches and are owned exclusively by the 5-minute
// crash-GC below; without this guard a small pruneAge could delete a launch
// that is still mid-flight. Legacy NULL-created_at rows are always finalized,
// so pruning them unconditionally is safe.
query := `
DELETE FROM instances
WHERE id IN (
SELECT i.id
FROM instances AS i
Comment thread
hi-liang marked this conversation as resolved.
JOIN builds AS b ON i.build = b.id
WHERE b.schema LIKE ?
AND (i.created_at IS NULL OR i.created_at < datetime('now', ?))
WHERE b.instancecount = ?
AND (i.created_at IS NULL OR (i.is_finalized = 1 AND i.created_at < datetime('now', ?)))
);`

res, err := m.db.Exec(query, manualSchemaPrefix+"%", fmt.Sprintf("-%d seconds", int(m.pruneAge.Seconds())))
// Render pruneAge to whole seconds for SQLite's datetime() window, rounding
// up: truncating down would prune slightly more aggressively than configured,
// and a sub-second pruneAge would truncate to a "-0 seconds" window matching
// everything. Ceiling keeps the effective age >= configured and always >= 1s.
ageSeconds := int((m.pruneAge + time.Second - time.Nanosecond) / time.Second)

res, err := m.db.Exec(query, DYNAMIC_INSTANCES, fmt.Sprintf("-%d seconds", ageSeconds))
if err != nil {
return err
}
Expand Down
131 changes: 107 additions & 24 deletions cmgr/prune_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ import (
)

// setupPruneTestFixture creates a Manager, a challenge, a build with the
// given schema, and one instance. It returns the Manager, BuildId, and
// InstanceId so callers can manipulate them directly.
func setupPruneTestFixture(t *testing.T, schema string) (*Manager, BuildId, InstanceId) {
// given schema and instance count, and one instance. It returns the Manager,
// BuildId, and InstanceId so callers can manipulate them directly.
func setupPruneTestFixture(t *testing.T, schema string, instanceCount int) (*Manager, BuildId, InstanceId) {
t.Helper()

mgr := setupTestManager(t)
Expand Down Expand Up @@ -39,7 +39,7 @@ func setupPruneTestFixture(t *testing.T, schema string) (*Manager, BuildId, Inst
Format: "flag{%s}",
Challenge: "test/prune-test",
Schema: schema,
InstanceCount: DYNAMIC_INSTANCES,
InstanceCount: instanceCount,
}
if err := mgr.openBuild(build); err != nil {
t.Fatalf("openBuild failed: %s", err)
Expand Down Expand Up @@ -77,10 +77,12 @@ func backdateInstance(t *testing.T, mgr *Manager, id InstanceId, age time.Durati
}
}

// TestPruneRemovesOldManualInstances verifies that instances belonging to
// manual-schema builds whose created_at is older than pruneAge are deleted.
func TestPruneRemovesOldManualInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test")
// TestPruneRemovesOldDynamicInstances verifies that instances belonging to
// on-demand (instancecount == DYNAMIC_INSTANCES) builds whose created_at is
// older than pruneAge are deleted. The schema name is a normal schema-defined
// name (not "manual-") to prove selection is by instance count, not name.
func TestPruneRemovesOldDynamicInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour

// Make the instance appear 2 hours old.
Expand All @@ -96,10 +98,10 @@ func TestPruneRemovesOldManualInstances(t *testing.T) {
}
}

// TestPruneRetainsRecentManualInstances verifies that instances younger than
// pruneAge are left untouched.
func TestPruneRetainsRecentManualInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test")
// TestPruneRetainsRecentDynamicInstances verifies that on-demand instances
// younger than pruneAge are left untouched.
func TestPruneRetainsRecentDynamicInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour

// The instance was just created (seconds ago) — well within the 1 h window.
Expand All @@ -112,11 +114,11 @@ func TestPruneRetainsRecentManualInstances(t *testing.T) {
}
}

// TestPruneDoesNotAffectNonManualInstances verifies that instances belonging
// to builds whose schema does NOT start with "manual-" are never pruned, even
// when they are older than pruneAge.
func TestPruneDoesNotAffectNonManualInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "schema-abc123")
// TestPruneDoesNotAffectFixedInstances verifies that instances belonging to
// fixed-pool builds (instancecount > 0, managed by the schema converge loop)
// are never pruned, even when they are older than pruneAge.
func TestPruneDoesNotAffectFixedInstances(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", 1)
mgr.pruneAge = 1 * time.Hour

// Backdate the instance to be older than pruneAge.
Expand All @@ -127,14 +129,14 @@ func TestPruneDoesNotAffectNonManualInstances(t *testing.T) {
}

if _, err := mgr.lookupInstanceMetadata(iid); err != nil {
t.Errorf("expected non-manual instance to be retained, got error: %s", err)
t.Errorf("expected fixed-pool instance to be retained, got error: %s", err)
}
}

// TestPruneNullCreatedAt verifies that instances with a NULL created_at field
// (legacy rows) are treated as expired and pruned.
func TestPruneNullCreatedAt(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test")
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour

// Simulate a legacy row that has no created_at value.
Expand All @@ -152,10 +154,56 @@ func TestPruneNullCreatedAt(t *testing.T) {
}
}

// TestPruneSkipsWhenDisabled verifies that calling Prune() directly with
// pruneAge <= 0 (pruning disabled) is a no-op rather than deleting every
// on-demand instance via a zero-second age window.
func TestPruneSkipsWhenDisabled(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 0 // disabled

// Backdate well past any plausible age so it would be pruned if enabled.
backdateInstance(t, mgr, iid, 24*time.Hour)

if err := mgr.Prune(); err != nil {
t.Fatalf("Prune() returned error: %s", err)
}

if _, err := mgr.lookupInstanceMetadata(iid); err != nil {
t.Errorf("pruning disabled (pruneAge=0), but instance was removed: %s", err)
}
}

// TestPruneIgnoresUnfinalizedWithinGCWindow verifies that the age-based prune
// does not delete an unfinalized (in-progress) instance that is older than
// pruneAge but younger than the 5-minute crash-GC window. Unfinalized rows are
// owned exclusively by the crash-GC, so a small pruneAge must not race with an
// in-flight launch.
func TestPruneIgnoresUnfinalizedWithinGCWindow(t *testing.T) {
mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Second

instance := &InstanceMetadata{Build: bid}
if err := mgr.openInstance(instance); err != nil {
t.Fatalf("openInstance failed: %s", err)
}
// intentionally not finalized — simulates a launch still in progress

// Older than pruneAge (1s) but well within the 5-minute crash-GC window.
backdateInstance(t, mgr, instance.Id, 30*time.Second)

if err := mgr.Prune(); err != nil {
t.Fatalf("Prune() returned error: %s", err)
}

if _, err := mgr.lookupInstanceMetadata(instance.Id); err != nil {
t.Errorf("expected in-progress unfinalized instance to be retained, got error: %s", err)
}
}

// TestCheckPruneSkipsWhenDisabled verifies that checkPrune is a no-op when
// pruneAge is zero (i.e., pruning is disabled).
func TestCheckPruneSkipsWhenDisabled(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test")
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 0 // disabled
mgr.pruneInterval = 0

Expand All @@ -175,7 +223,7 @@ func TestCheckPruneSkipsWhenDisabled(t *testing.T) {
// TestCheckPruneSkipsWithinInterval verifies that checkPrune does not trigger
// a prune when the prune interval has not yet elapsed.
func TestCheckPruneSkipsWithinInterval(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "manual-prune-test")
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour
mgr.pruneInterval = 10 * time.Minute

Expand All @@ -198,7 +246,7 @@ func TestCheckPruneSkipsWithinInterval(t *testing.T) {
// TestPruneGarbageCollectsStaleUnfinalizedInstances verifies that unfinalized
// instances (simulating a crashed launch) older than 5 minutes are deleted.
func TestPruneGarbageCollectsStaleUnfinalizedInstances(t *testing.T) {
mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-test")
mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour

instance := &InstanceMetadata{Build: bid}
Expand All @@ -221,7 +269,7 @@ func TestPruneGarbageCollectsStaleUnfinalizedInstances(t *testing.T) {
// TestPruneRetainsRecentUnfinalizedInstances verifies that an unfinalized
// instance newer than 5 minutes is left alone (launch may still be in progress).
func TestPruneRetainsRecentUnfinalizedInstances(t *testing.T) {
mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-test")
mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour

instance := &InstanceMetadata{Build: bid}
Expand All @@ -242,7 +290,7 @@ func TestPruneRetainsRecentUnfinalizedInstances(t *testing.T) {
// TestPruneGCReleasesReservedPorts verifies that ports reserved before a
// crashed launch are freed when the unfinalized instance is GC'd.
func TestPruneGCReleasesReservedPorts(t *testing.T) {
mgr, bid, _ := setupPruneTestFixture(t, "schema-gc-ports-test")
mgr, bid, _ := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour
mgr.portLow = 10000
mgr.portHigh = 20000
Expand Down Expand Up @@ -271,3 +319,38 @@ func TestPruneGCReleasesReservedPorts(t *testing.T) {
t.Errorf("expected port %d to be released after GC, but it still exists in portAssignments", port)
}
}

// TestPruneAgeReleasesReservedPorts verifies that a finalized on-demand instance
// pruned for age (not the crash GC) frees its reserved port via the ON DELETE
// CASCADE on portAssignments. Reclaiming these port reservations is the primary
// reason for pruning in deployments that run with explicit port ranges.
func TestPruneAgeReleasesReservedPorts(t *testing.T) {
mgr, _, iid := setupPruneTestFixture(t, "picoctf-2021", DYNAMIC_INSTANCES)
mgr.pruneAge = 1 * time.Hour
mgr.portLow = 10000
mgr.portHigh = 20000

port, err := mgr.reservePort(iid, "http")
if err != nil {
t.Fatalf("reservePort failed: %s", err)
}

// Age the (already finalized) instance past pruneAge.
backdateInstance(t, mgr, iid, 2*time.Hour)

if err := mgr.Prune(); err != nil {
t.Fatalf("Prune() returned error: %s", err)
}

if _, err := mgr.lookupInstanceMetadata(iid); err == nil {
t.Error("expected aged on-demand instance to be pruned, but it still exists")
}

var count int
if err := mgr.db.QueryRow("SELECT COUNT(*) FROM portAssignments WHERE port = ?", port).Scan(&count); err != nil {
t.Fatalf("portAssignments query failed: %s", err)
}
if count != 0 {
t.Errorf("expected port %d to be released after age prune, but it still exists in portAssignments", port)
}
}
Loading