statbus/cloud.sh at master · statisticsnorway/statbus · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
#!/bin/bash
#
# Cloud fleet management for StatBus on niue.statbus.org
#
# This is an OPERATOR tool, not a product feature.
# ./sb manages a single installation. This script manages the fleet.
#
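# Usage:
#   ./cloud.sh status              Show version on all servers
#   ./cloud.sh health [server|all] Upgrade health: service state, last activity, upgrade status
#   ./cloud.sh notify              Tell servers to check for updates (non-disruptive)
#   ./cloud.sh upgrade             Force all servers to apply latest now (via upgrade service)
#   ./cloud.sh install <server>    Smart install: tries upgrade service first; full bootstrap if unreachable
#   ./cloud.sh install <server> <version>  Pin to specific version — always full bootstrap
#   ./cloud.sh install all [version]  Install ALL servers (smart, in sequence)
#   ./cloud.sh tail <server|all>   Follow upgrade log; auto-disconnects on completion
#   ./cloud.sh rescue <server>     Alias for install (backwards compat)
#   ./cloud.sh create <code> <name>  Create new cloud installation
#   ./cloud.sh inspect             Show credentials for all installations
#   ./cloud.sh wipe <server>       DESTRUCTIVE: delete DB and recreate from scratch
#   ./cloud.sh migrate-down <server> <migration>  Roll back to before this migration (edge only)
#   ./cloud.sh migrate-up <server>  Apply pending migrations (edge only)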
#
# Escalation levels:
#   notify   — gentle. Servers discover new version. Admin chooses when to upgrade.
#   upgrade  — firm. All servers apply latest NOW via upgrade service (non-disruptive binary).
#   install  — smart. Tries upgrade service first (fast path); falls back to full bootstrap
#              (stop service, replace binary, re-run install) only if service is unreachable.
#              Pinning a version always takes the full bootstrap path.
#   tail     — observe. Streams upgrade service journal; exits automatically on completion.
#   create   — provision. Creates new deployment slot (DNS, user, workflows, etc.)
#   inspect  — read-only. Shows credentials/URLs for all deployment slots.
#   wipe     — destructive. Deletes database and recreates. Data is lost.
#
# Strict mode: abort on unhandled errors, unset variables and failed
# pipeline stages.
set -o errexit -o nounset -o pipefail

# DEBUG=1 (or DEBUG=true) ./cloud.sh <command> traces every command to
# stderr via `set -x`. Matches the convention in dev.sh.
case "${DEBUG:-}" in
    true|1) set -x ;;
esac

# Absolute directory containing this script; used to locate ./sb and ops/.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Multi-tenant cloud slots on niue. `statbus_no` was removed on 2026-04-21
# when Norway migrated to the dedicated standalone box rune.statbus.org;
# standalone hosts are NOT managed by cloud.sh (they use the per-host
# ./sb and the standalone deploy workflows, see doc/CLOUD.md §Standalone).
SERVERS="statbus_dev statbus_demo statbus_et statbus_jo statbus_ma statbus_tcc statbus_ug"
HOST="niue.statbus.org"
INSTALL_URL="https://statbus.org/install.sh"

# GitHub username whose signing key should be trusted on each server.
# Passed as --trust-github-user to ./sb install so the installer handles
# key validation, removal of invalid keys, and re-fetching in one pass.
# Deliberately has no default value: install must fail if the wrong key
# is configured, forcing the operator to explicitly provide the fix:
#   CLOUD_TRUST_KEY_USER=jhf ./cloud.sh install all
: "${CLOUD_TRUST_KEY_USER:=}"

usage() {
    # Print the command summary (including the known server list) to stdout
    # and terminate the script with status 1.
    cat <<EOF
Usage: $0 <command> [args]

Commands:
  status                     Show binary version on all servers
  health [server|all]        Upgrade health: service state, last activity, upgrade status
  notify                     Tell servers to check for updates (non-disruptive)
  upgrade                    Force all servers to apply latest via upgrade service
  install <server>           Smart install: upgrade service first, full bootstrap fallback
  install <server> <version> Pin to version — always full bootstrap, no fast-path
  install all [version]      Install ALL servers in sequence
  tail <server|all>          Follow upgrade log; auto-disconnects on completion
  rescue <server>            Alias for install
  create <code> <name>       Create new cloud installation
  inspect                    Show credentials for all installations
  wipe <server>              DESTRUCTIVE: delete DB and recreate

  migrate-down <server> <migration>  Roll back to before this migration (edge only)
  migrate-up <server>               Apply pending migrations (edge only)

Servers: $SERVERS
EOF
    exit 1
}

ssh_server() {
    # Run a command over SSH as the slot's own user on $HOST.
    #   $1   - slot/server name, used as the SSH login name
    #   $2.. - passed through to ssh unchanged (remote command)
    # Returns whatever ssh returns.
    local login="${1}@${HOST}"
    shift
    # Fail fast on unreachable hosts and detect dead connections.
    local -a opts=(
        -o ConnectTimeout=10
        -o ServerAliveInterval=10
        -o ServerAliveCountMax=3
    )
    ssh "${opts[@]}" "$login" "$@"
}

# Stop the slot's user-level upgrade service so that the `./sb` binary can
# be swapped without hitting "text file busy". The remote command swallows
# systemctl's own failure (`|| true`), so stopping an already-stopped unit
# is harmless. A stale upgrade-in-progress flag left on disk is reconciled
# by `./sb install` itself (StateCrashedUpgrade dispatch), so nothing is
# cleaned up here.
#   $1 - slot/server name
stop_upgrade_service() {
    local slot="$1"
    local unit="statbus-upgrade@${slot}.service"
    ssh_server "$slot" "systemctl --user stop ${unit} 2>/dev/null || true" 2>&1
}

# Best-effort start of the slot's user-level upgrade service. Used by
# `cmd_install_one` so that a cloud.sh exit leaves the server in the normal
# "service running" state rather than "stopped pending operator
# intervention". Failures (SSH or systemctl) are swallowed: this function
# always returns 0.
#   $1 - slot/server name
ensure_service_started() {
    local slot="$1"
    local unit="statbus-upgrade@${slot}.service"
    if ! ssh_server "$slot" "systemctl --user start ${unit}" 2>&1; then
        # Deliberately ignored — callers must not abort because of this.
        return 0
    fi
}

validate_server() {
    # Abort the script (exit 1) unless $1 is the literal "all" or exactly
    # one of the names in $SERVERS.
    #
    # The comparison is an exact string match. The name is later used as an
    # SSH login, so it must never be interpreted as a pattern: a regex-based
    # check would let inputs such as "statbus.dev" or ".*" through.
    local target="$1"
    local known

    if [ "$target" = "all" ]; then
        return 0
    fi
    for known in $SERVERS; do
        if [ "$target" = "$known" ]; then
            return 0
        fi
    done

    echo "Error: unknown server '$target'"
    echo "Valid servers: $SERVERS"
    exit 1
}

cmd_status() {
    # Print one line per server in $SERVERS with the output of the remote
    # `./sb --version`. The remote side prints UNKNOWN when the binary (or
    # the statbus directory) is unusable; "SSH FAILED" is printed locally
    # when the SSH call itself returns non-zero.
    local version_cmd="cd statbus && ./sb --version 2>/dev/null || echo 'UNKNOWN'"

    printf '%s\n' "StatBus Cloud Status" "===================="
    for server in $SERVERS; do
        printf "  %-16s " "$server:"
        if ! ssh_server "$server" "$version_cmd" 2>/dev/null; then
            echo "SSH FAILED"
        fi
    done
}

# cmd_health_one gathers upgrade-subsystem health for one server in a single
# SSH call and prints one formatted line. Designed to run in parallel.
#   $1 - slot/server name
#
# The remote script emits a single record `ver|svc|progress|state`:
#   ver      - first line of `./sb --version`
#   svc      - `systemctl --user is-active` state of the upgrade unit
#   progress - age of tmp/upgrade-heartbeat (epoch seconds inside the file)
#              or, when that file is absent, of the unit's newest journal
#              entry; "no data" when neither yields a positive number
#   state    - first status keyword found in `./sb upgrade list`
# Always returns 0; an unreachable server is reported as "SSH FAILED".
cmd_health_one() {
    local server="$1"
    local unit="statbus-upgrade@${server}.service"
    local result
    # NOTE: `systemctl is-active` prints the state AND exits non-zero for
    # anything other than "active", so a `|| echo unknown` fallback would
    # append a second line and truncate the record for exactly the servers
    # that need attention. Fall back only when nothing was printed.
    result=$(ssh_server "$server" "
        cd statbus 2>/dev/null || { printf 'NO_DIR|||'; exit; }
        ver=\$(./sb --version 2>/dev/null | head -1)
        svc=\$(systemctl --user is-active '${unit}' 2>/dev/null | head -1)
        [ -n \"\$svc\" ] || svc='unknown'
        if [ -f tmp/upgrade-heartbeat ]; then
            ts=\$(tr -d '[:space:]' < tmp/upgrade-heartbeat)
        else
            ts=\$(journalctl --user -u '${unit}' -n 1 -o short-unix --no-pager 2>/dev/null \
                | awk '{print int(\$1)}')
        fi
        case \"\$ts\" in
            ''|*[!0-9]*) ts=0 ;;
        esac
        if [ \"\$ts\" -gt 0 ]; then
            age=\$(( \$(date +%s) - ts ))
            if   [ \"\$age\" -lt 60 ];   then progress=\"\${age}s ago\"
            elif [ \"\$age\" -lt 3600 ]; then progress=\"\$((age/60))m ago\"
            else progress=\"stale \$((age/3600))h\"; fi
        else
            progress='no data'
        fi
        state=\$(./sb upgrade list 2>/dev/null \
            | grep -oE 'completed|failed|in progress|in_progress|rolled_back|pending' | head -1)
        [ -n \"\$state\" ] || state='none'
        printf '%s|%s|%s|%s' \"\$ver\" \"\$svc\" \"\$progress\" \"\$state\"
    " 2>/dev/null) || result="SSH FAILED|||"

    if [ -z "$result" ] || [ "$result" = "SSH FAILED|||" ]; then
        printf "  %-22s SSH FAILED\n" "$server"
        return
    fi

    local ver svc progress state flag
    IFS='|' read -r ver svc progress state <<< "$result"

    # Attention markers appended to the line.
    flag=""
    if [ "${svc:-unknown}" != "active" ]; then
        flag=" ← service ${svc:-unknown}"
    fi
    # An upgrade still marked in progress is flagged as possibly wedged.
    if grep -qE 'in[_ ]progress' <<< "${state:-}"; then
        flag="${flag} ← WEDGED?"
    fi
    printf "  %-22s %-12s service=%-10s last=%-18s upgrade=%s%s\n" \
        "$server" "${ver:-UNKNOWN}" "${svc:-unknown}" "${progress:-?}" "${state:-none}" "$flag"
}

# cmd_health prints the upgrade health report for one server, or for every
# server in $SERVERS when the target is "all" (the default).
#   $1 - server name or "all" (optional)
# For "all", the per-server checks run as background jobs writing into a
# scratch directory; results are printed in $SERVERS order once every job
# has finished.
cmd_health() {
    local target="${1:-all}"
    validate_server "$target"
    printf '%s\n' "StatBus Cloud Health" "===================="

    if [ "$target" != "all" ]; then
        cmd_health_one "$target"
        return
    fi

    local scratch jobs=()
    scratch=$(mktemp -d)
    for server in $SERVERS; do
        cmd_health_one "$server" > "$scratch/$server" &
        jobs+=("$!")
    done
    wait "${jobs[@]}"
    for server in $SERVERS; do
        cat "$scratch/$server"
    done
    rm -rf "$scratch"
}

cmd_notify() {
    # Ask every server in $SERVERS to look for updates by running
    # `./sb upgrade discover` remotely. Prints "notified" or "FAILED" per
    # server; a failing server does not stop the loop and the function
    # itself returns 0.
    local server
    echo "Notifying all servers to check for updates..."
    for server in $SERVERS; do
        printf "  %-16s " "$server:"
        # Explicit if/else: with `cmd && ok || fail`, the failure branch
        # would also run if printing the success message failed.
        if ssh_server "$server" "cd statbus && ./sb upgrade discover" 2>/dev/null; then
            echo "notified"
        else
            echo "FAILED"
        fi
    done
}

cmd_upgrade() {
    # Tell every server in $SERVERS to apply the latest version now by
    # running `./sb upgrade apply-latest` remotely. Prints "scheduled" or
    # "FAILED" per server; a failing server does not stop the loop and the
    # function itself returns 0.
    local server
    echo "Forcing all servers to apply latest..."
    for server in $SERVERS; do
        printf "  %-16s " "$server:"
        # Explicit if/else: with `cmd && ok || fail`, the failure branch
        # would also run if printing the success message failed.
        if ssh_server "$server" "cd statbus && ./sb upgrade apply-latest" 2>/dev/null; then
            echo "scheduled"
        else
            echo "FAILED"
        fi
    done
}

cmd_install() {
    # Install one server, or every server in $SERVERS in sequence when the
    # target is "all".
    #   $1 - server name or "all"
    #   $2 - optional version to pin; passed through to cmd_install_one
    local target="$1"
    local version="${2:-}"
    validate_server "$target"

    # Single server: delegate directly.
    if [ "$target" != "all" ]; then
        cmd_install_one "$target" "$version"
        return
    fi

    echo "Installing ALL servers${version:+ (pinned to $version)}"
    echo "======================"
    for server in $SERVERS; do
        echo ""
        echo "--- $server ---"
        cmd_install_one "$server" "$version"
    done
}

# trust_flag prints the `--trust-github-user <user>` option for ./sb install
# (or install.sh) when a user is given, and prints nothing otherwise.
#   $1 - GitHub username (optional; empty means "no flag")
trust_flag() {
    local user="${1:-}"
    if [ -z "$user" ]; then
        return 0
    fi
    printf '%s\n' "--trust-github-user $user"
}

# Migration immutability is a RELEASE-CUT concern, enforced by `./sb release
# prerelease` / `release stable` preflight (checkMigrationImmutability in
# cli/cmd/release.go). The install-time check that used to live here was
# wrong-layer: it diffed git history between two HEADs on the slot, which
# bears no relationship to what's recorded in db.migration. A migrate-down
# (which clears the db.migration row) couldn't satisfy a git-history-based
# check, producing an infinite loop on operators recovering from a known-
# corrected migration. Removed 2026-05-22 after the loop bit a dev recovery.
# Release-cut gate remains authoritative; install just applies forward.

# cmd_migrate_down rolls back a specific migration on an edge server.
# Takes a numeric migration version and runs `./sb migrate down` repeatedly
# until the highest applied version is below it.
# This is a manual, explicit, operator-invoked command — the upgrade
# service NEVER runs down migrations.
#   $1 - server name (must be a known slot on the edge channel)
#   $2 - migration version, digits only (e.g. 20260417130648)
# Exits 1 on bad arguments, a non-edge server, an unreadable or non-numeric
# current version, a failed rollback step, or a rollback step that does not
# change the current version.
cmd_migrate_down() {
    local server="${1:-}"
    local migration="${2:-}"

    if [ -z "$server" ] || [ -z "$migration" ]; then
        echo "Usage: ./cloud.sh migrate-down <server> <migration>"
        echo "Example: ./cloud.sh migrate-down statbus_dev 20260417130648"
        exit 1
    fi

    # The version is compared numerically below. A non-numeric value would
    # make `[ ... -lt ... ]` error out (treated as "false"), and the loop
    # would then roll back EVERY migration. Reject it before touching the
    # server.
    case "$migration" in
        *[!0-9]*)
            echo "Error: migration must be a numeric version, got '$migration'"
            echo "Example: ./cloud.sh migrate-down statbus_dev 20260417130648"
            exit 1
            ;;
    esac

    validate_server "$server"

    local channel
    channel=$(ssh_server "$server" "cd statbus && ./sb dotenv -f .env.config get UPGRADE_CHANNEL 2>/dev/null" 2>/dev/null || echo "prerelease")
    if [ "$channel" != "edge" ]; then
        echo "Error: migrate-down is only supported on edge channel servers."
        echo "  $server is on channel: $channel"
        echo "  Release/prerelease servers use immutable migrations enforced by RC preflight."
        exit 1
    fi

    echo "Rolling back migration $migration on $server..."
    local current previous=""
    local version_query="cd statbus && echo 'SELECT MAX(version) FROM public.schema_migrations;' | ./sb psql -t -A"
    while true; do
        # A failed query must not be mistaken for "nothing applied".
        if ! current=$(ssh_server "$server" "$version_query" 2>/dev/null); then
            echo "Error: could not read the current migration version from $server; aborting."
            exit 1
        fi
        current="${current//[[:space:]]/}"

        if [ -z "$current" ]; then
            echo "Migration $migration is no longer applied."
            break
        fi
        case "$current" in
            *[!0-9]*)
                echo "Error: unexpected migration version '$current' reported by $server; aborting."
                exit 1
                ;;
        esac
        if [ "$current" -lt "$migration" ]; then
            echo "Migration $migration is no longer applied."
            break
        fi
        # Guard against looping forever when a rollback step is a no-op.
        if [ "$current" = "$previous" ]; then
            echo "Error: 'migrate down' did not roll back migration $current on $server; aborting."
            exit 1
        fi

        echo "  Rolling back migration $current..."
        if ! ssh_server "$server" "cd statbus && ./sb migrate down" 2>&1; then
            echo "Error: rollback of migration $current failed on $server."
            exit 1
        fi
        previous="$current"
    done
    echo "Done. Re-run: ./cloud.sh install $server"
}

# cmd_migrate_up runs `./sb migrate up` on one server to apply pending
# migrations. Counterpart to migrate-down and, like it, refused unless the
# server's UPGRADE_CHANNEL is "edge" (a failed channel lookup is treated as
# "prerelease" and therefore refused).
#   $1 - server name
cmd_migrate_up() {
    local server="$1"
    validate_server "$server"

    local channel_lookup="cd statbus && ./sb dotenv -f .env.config get UPGRADE_CHANNEL 2>/dev/null"
    local channel
    channel=$(ssh_server "$server" "$channel_lookup" 2>/dev/null || echo "prerelease")

    case "$channel" in
        edge)
            ;;
        *)
            echo "Error: migrate-up is only supported on edge channel servers."
            echo "  $server is on channel: $channel"
            exit 1
            ;;
    esac

    echo "Applying pending migrations on $server..."
    ssh_server "$server" "cd statbus && ./sb migrate up" 2>&1
    echo "Done."
}

# cmd_tail_one follows the upgrade service journal for one server and
# disconnects automatically once a terminal line is seen, then prints the
# server's upgrade status.
#   $1 - server name
#   $2 - optional target_version: narrows the exit pattern to that specific
#        upgrade, so that lines about earlier upgrades (the tail replays the
#        last 50 journal lines) don't cause a premature exit.
# Both SSH calls are followed by `|| true`, so this function returns 0 even
# when the connection drops or the remote commands fail.
cmd_tail_one() {
    local server="$1"
    local target_version="${2:-}"
    # Build the awk exit pattern locally before SSH so we avoid nested-quote
    # hell. The pattern is embedded in the remote awk /.../ regex via double-
    # quote expansion of the outer SSH string, so the version text is used as
    # regex source (a "." in it matches any character).
    # NOTE(review): this assumes version strings (sha-*, v*.*) never contain
    # single quotes, "/" or shell metacharacters — confirm against callers.
    local awk_pattern
    if [ -n "$target_version" ]; then
        awk_pattern="Upgrade to ${target_version} .*(completed|failed)|FAILED:"
    else
        awk_pattern="Upgrade to .*(completed|failed)|FAILED:"
    fi
    echo "--- Tailing upgrade log for $server (auto-disconnect on completion) ---"
    # awk echoes every journal line and exits on the first line matching the
    # pattern; fflush() keeps the stream unbuffered so lines appear promptly.
    ssh_server "$server" \
        "journalctl --user -u 'statbus-upgrade@${server}.service' -o cat -f -n 50 2>&1 | \
         awk '/${awk_pattern}/{print; fflush(); exit} {print; fflush()}'" \
        || true
    echo "--- Log tail disconnected for $server ---"
    echo "Final upgrade status on $server:"
    # Poll `./sb upgrade list` until its first 5 lines no longer mention an
    # in-progress upgrade, then print that listing and stop. Bounded at 8
    # checks with a 2 s pause between them; if the state never clears, the
    # listing is printed one final time after the loop.
    # Presumably the service records the terminal state shortly after logging
    # completion, which is why a short poll is enough — verify if this ever
    # reports in_progress.
    ssh_server "$server" \
        'cd statbus && i=0; while [ $i -lt 8 ]; do
             out=$(./sb upgrade list 2>&1)
             echo "$out" | head -5 | grep -qE "in[_ ]progress" || { echo "$out"; exit 0; }
             i=$((i+1)); [ $i -lt 8 ] && sleep 2
         done; ./sb upgrade list' 2>&1 || true
}

# cmd_tail follows the upgrade log of one server, or of every server in
# $SERVERS concurrently when the target is "all" (their output is
# interleaved; the function returns once every tail has ended).
#   $1 - server name or "all"
cmd_tail() {
    local target="$1"
    validate_server "$target"

    if [ "$target" != "all" ]; then
        cmd_tail_one "$target"
        return
    fi

    local jobs=()
    for server in $SERVERS; do
        cmd_tail_one "$server" &
        jobs+=("$!")
    done
    wait "${jobs[@]}"
}

cmd_install_one() {
    # Install or upgrade a single server.
    #   $1 - server name
    #   $2 - optional version to pin (always takes the full bootstrap path)
    # Returns 0 on success, 1 on failure.
    #
    # Idempotent install flow:
    #   stop_upgrade_service → install → ensure_service_started
    #
    # Re-running `./cloud.sh install <server>` after any partial failure
    # (SSH drop, Ctrl-C, transient error) is safe — every step is rerun-safe.
    # The `./sb install` dispatcher handles stale upgrade flags itself
    # (StateCrashedUpgrade reconciles and re-dispatches), so cloud.sh only
    # needs to stop the service so the binary can be replaced.
    #
    # ensure_service_started runs at the end of a successful install and on
    # the edge-channel failure paths that happen before `./sb install` runs
    # (fetch/checkout, download, build, replace binary). It is deliberately
    # NOT called when the install step itself fails — see the failure block
    # near the end of this function.
    local server="$1"
    local version="${2:-}"
    local exit_code=0

    # Resolve trust key user: explicit env var first, then remote .env.config
    # written by a prior successful run — operator sets it once, remembered forever.
    local resolved_trust_user="$CLOUD_TRUST_KEY_USER"
    if [ -z "$resolved_trust_user" ]; then
        resolved_trust_user=$(ssh_server "$server" \
            "cd statbus && ./sb dotenv -f .env.config get TRUST_GITHUB_USER 2>/dev/null" \
            2>/dev/null || true)
    fi

    # Fast path: if no version is pinned, try the upgrade service first.
    # If it accepts the request (exit 0), tail the journal and return.
    # If it fails (service not running, DB down, etc.), fall through to the
    # full bootstrap install below.
    # Version-skew guard: if remote binary != local binary, skip fast-path
    # and always bootstrap (item #2 rc.64 fix — dev's looping service returned
    # 0 on NOTIFY but never completed, blocking indefinitely).
    if [ -z "$version" ]; then
        local remote_commit local_commit
        remote_commit=$(ssh_server "$server" "cd statbus && ./sb --version 2>/dev/null" \
            | grep -oE 'commit [a-f0-9]+' | awk '{print $2}') || remote_commit=""
        # Use the sb binary next to this script (as the release checks below
        # do) rather than whatever ./sb resolves to in the caller's cwd. The
        # `|| local_commit=""` matters under `set -euo pipefail`: a missing
        # binary or a version string without "commit <hex>" would otherwise
        # make this assignment fail and kill cloud.sh without any message.
        # An empty local commit simply counts as version skew below.
        local_commit=$("$SCRIPT_DIR/sb" --version 2>/dev/null \
            | grep -oE 'commit [a-f0-9]+' | awk '{print $2}') || local_commit=""

        if [ -z "$remote_commit" ] || [ "$remote_commit" != "$local_commit" ]; then
            echo "Version skew detected: remote=$remote_commit local=$local_commit — skipping fast-path, using bootstrap"
            # Fall through to bootstrap block (do NOT enter the upgrade-service fast-path)
        else
            echo "Trying upgrade service on $server..."
            # Capture exit code WITHOUT triggering set -e. Pre-fix: a plain
            # `apply_out=$(ssh ...)` assignment is part of the surrounding
            # `set -euo pipefail` scope, so a non-zero SSH exit kills cloud.sh
            # before line `apply_rc=$?` runs — operator sees only this echo
            # and a bare prompt (anti-fail-fast). The `|| apply_rc=$?` form
            # captures the failure code AND short-circuits set -e so the
            # fall-through to the bootstrap install fires.
            local apply_out apply_rc=0
            apply_out=$(ssh_server "$server" "cd statbus && ./sb upgrade apply-latest" 2>&1) || apply_rc=$?
            echo "$apply_out"
            # Skip-current short-circuit: when apply-latest detects the slot
            # is already at the latest, it prints "Already at <ver> ..." and
            # exits 0 without scheduling — no NOTIFY upgrade_apply, so the
            # daemon doesn't run a pipeline. cmd_tail_one would tail forever
            # waiting for a completion line that won't come. Detect the
            # marker and return cleanly.
            if [ "$apply_rc" -eq 0 ] && grep -q "^Already at " <<< "$apply_out"; then
                return 0
            fi
            if [ "$apply_rc" -eq 0 ]; then
                # Extract target version from apply-latest output, e.g.:
                #   Sent: NOTIFY upgrade_apply, '9bf48bb8'             # commit_short
                #   Sent: NOTIFY upgrade_apply, 'v2026.04.0-rc.55'     # release tag
                # Passed to cmd_tail_one so the awk exit pattern is version-specific,
                # preventing stale recovery lines for previous upgrades from terminating early.
                # If the marker is absent the greps fail; fall back to an
                # empty version (generic exit pattern) instead of letting
                # pipefail + set -e abort the script before tailing.
                local target_version
                target_version=$(echo "$apply_out" | grep "upgrade_apply" | grep -oE "'[^']+'" | tr -d "'" | head -1) \
                    || target_version=""
                cmd_tail_one "$server" "$target_version"
                return $?
            fi
            echo "Upgrade service not responsive — falling back to full bootstrap install..."
        fi
    fi

    # Check the server's upgrade channel to decide install strategy.
    # Edge channel tracks master (build from source). Others use tagged releases.
    # A failed lookup is treated as "prerelease".
    local channel
    channel=$(ssh_server "$server" "cd statbus && ./sb dotenv -f .env.config get UPGRADE_CHANNEL 2>/dev/null" 2>/dev/null || echo "prerelease")

    if [ "$channel" = "edge" ]; then
        if [ -n "$version" ]; then
            echo "Installing $server (edge — pinned to $version)..."
            # Pinned edge: checkout the specified tag and download its release binary.
            # No --force, no --quiet: install-verified moving tag was deleted
            # in rc.62, so there's nothing to force past. Silent failures
            # previously hid rune's rc.59/rc.60 root causes — let git print.
            ssh_server "$server" "cd statbus && git fetch origin --tags && git checkout $version" 2>&1 \
                || { echo "--- $server FAILED: git fetch/checkout $version (exit $?) ---"; \
                     ensure_service_started "$server"; return 1; }
            echo "Downloading release binary for $version..."
            ssh_server "$server" \
                "cd statbus && curl -fsSL https://github.com/statisticsnorway/statbus/releases/download/${version}/sb-linux-amd64 -o sb-linux-amd64 && chmod +x sb-linux-amd64" 2>&1 \
                || { echo "--- $server FAILED: download binary for $version (exit $?) ---"; \
                     ensure_service_started "$server"; return 1; }
        else
            echo "Installing $server (edge channel — building from master)..."
            # Edge: pull latest master. If HEAD is a tagged release with a
            # published binary, download it (faster, no Go toolchain needed).
            # Otherwise fall back to building from source.
            # No --force, no --quiet (see rc.62 rationale above).
            ssh_server "$server" "cd statbus && git fetch origin master --tags && git checkout origin/master" 2>&1 \
                || { echo "--- $server FAILED: git fetch/checkout master (exit $?) ---"; \
                     ensure_service_started "$server"; return 1; }
            # Check if HEAD is a tagged release with a downloadable binary.
            local head_tag
            head_tag=$(ssh_server "$server" "cd statbus && git describe --exact-match HEAD 2>/dev/null" 2>/dev/null || true)
            if [ -n "$head_tag" ]; then
                echo "HEAD is tagged ($head_tag) — checking for release binary..."
                if "$SCRIPT_DIR/sb" release check --tag "$head_tag" 2>/dev/null; then
                    echo "Release binary available — downloading instead of building."
                    ssh_server "$server" \
                        "cd statbus && curl -fsSL https://github.com/statisticsnorway/statbus/releases/download/${head_tag}/sb-linux-amd64 -o sb-linux-amd64 && chmod +x sb-linux-amd64" 2>&1 \
                        || { echo "--- $server FAILED: download binary for $head_tag (exit $?) ---"; \
                             ensure_service_started "$server"; return 1; }
                else
                    echo "Release binary not ready — building from source..."
                    ssh_server "$server" "cd statbus && export PATH=/home/linuxbrew/.linuxbrew/bin:\$PATH && ./dev.sh build-sb" 2>&1 \
                        || { echo "--- $server FAILED: build from source (exit $?) ---"; \
                             ensure_service_started "$server"; return 1; }
                fi
            else
                echo "HEAD is untagged — building from source..."
                ssh_server "$server" "cd statbus && export PATH=/home/linuxbrew/.linuxbrew/bin:\$PATH && ./dev.sh build-sb" 2>&1 \
                    || { echo "--- $server FAILED: build from source (exit $?) ---"; \
                         ensure_service_started "$server"; return 1; }
            fi
        fi
        # Stop user-level service before replacing binary — systemd --user
        # restarts it on exit (Restart=always), and the running process holds
        # the binary open → "text file busy" on mv.
        stop_upgrade_service "$server"
        ssh_server "$server" "cd statbus && mv sb-linux-amd64 sb" 2>&1 \
            || { echo "--- $server FAILED: replace binary (exit $?) ---"; \
                 ensure_service_started "$server"; return 1; }
        # ./sb install detects the service is stopped and restarts it (user-level, no root needed).
        # --trust-github-user validates/repairs the signing key in one pass.
        ssh_server "$server" "cd statbus && ./sb install $(trust_flag "$resolved_trust_user")" 2>&1 \
            || exit_code=$?
    else
        if [ -n "$version" ]; then
            # Pinned: verify artifacts for the specific version before stopping.
            echo "Checking release artifacts for $version are ready..."
            if ! "$SCRIPT_DIR/sb" release check --tag "$version"; then
                echo "--- Release artifacts for $version not ready. Retry later. ---"
                return 1
            fi
            echo "Installing $server at $version via $INSTALL_URL ..."
            stop_upgrade_service "$server"
            ssh_server "$server" \
                "curl -fsSL ${INSTALL_URL} | bash -s -- --version $version $(trust_flag "$resolved_trust_user")" 2>&1 \
                || exit_code=$?
        else
            # Gate: verify release artifacts are fully published before stopping
            # the running service. If CI is still uploading assets or pushing
            # images, abort early — the server stays up and the operator retries.
            # Rc.63: use --channel so the check resolves to the
            # current latest RC instead of treating "prerelease" as a
            # literal tag.
            echo "Checking release artifacts for channel prerelease are ready..."
            if ! "$SCRIPT_DIR/sb" release check --channel prerelease; then
                echo "--- Release artifacts not ready. Retry in ~5 minutes. ---"
                return 1
            fi
            echo "Installing $server via $INSTALL_URL ..."
            # Stop the user-level upgrade service so install.sh can replace the
            # `./sb` binary without hitting "text file busy". install.sh's
            # install step re-enables and starts the service on completion.
            stop_upgrade_service "$server"
            # Step 1: Run install.sh as the app user.
            # Exit code 42 = service needs root (not a failure).
            # NOTE(review): nothing below special-cases 42 — any non-zero
            # exit code, including 42, is reported as a failed install.
            # Confirm which of the two is intended.
            ssh_server "$server" \
                "curl -fsSL ${INSTALL_URL} | bash -s -- --channel prerelease $(trust_flag "$resolved_trust_user")" 2>&1 \
                || exit_code=$?
        fi
    fi

    if [ "$exit_code" -ne 0 ]; then
        echo "--- $server install FAILED (exit code $exit_code) ---"
        if [ -z "$resolved_trust_user" ]; then
            echo ""
            echo "If this failed because of an invalid signing key, re-run with:"
            echo "  CLOUD_TRUST_KEY_USER=jhf ./cloud.sh install $server"
            echo ""
        fi
        # Do NOT call ensure_service_started on failure — starting the upgrade
        # service after a broken install can hang (systemctl waiting on a broken
        # binary/DB). The operator re-runs ./cloud.sh install which calls it on success.
        return 1
    fi

    # Persist trust user for future runs so CLOUD_TRUST_KEY_USER need not
    # be set again. Idempotent — safe to re-write the same value.
    if [ -n "$resolved_trust_user" ]; then
        ssh_server "$server" \
            "cd statbus && ./sb dotenv -f .env.config set TRUST_GITHUB_USER '$resolved_trust_user'" \
            2>/dev/null || true
    fi

    # Regenerate config so VERSION in .env matches the checked-out code.
    # Must use 'up -d' not 'restart' — restart doesn't re-read .env.
    echo "Regenerating config and restarting app..."
    ssh_server "$server" "cd statbus && ./sb config generate && docker compose up -d app" 2>&1

    # Always leave the upgrade service running on success, regardless of
    # whether install's own service-install step fired (e.g., when running
    # without root and the user-level path was used).
    ensure_service_started "$server"

    echo "--- $server install complete ---"
}

cmd_wipe() {
    # DESTRUCTIVE: recreate the database on one server from scratch and
    # start all services again.
    #   $1 - server name ("all" is refused)
    # The operator must confirm by typing the server name on stdin;
    # anything else, including end-of-input, aborts with exit status 1.
    local target="$1"
    local confirm=""
    validate_server "$target"

    if [ "$target" = "all" ]; then
        echo "ERROR: wipe all is not supported. Wipe servers one at a time."
        exit 1
    fi

    echo "WARNING: This will DELETE the database on $target and recreate from scratch."
    echo "ALL DATA WILL BE LOST."
    # -r: take the typed text literally (no backslash processing). A failed
    # read (EOF / closed stdin) is treated as a refusal, not a silent
    # set -e death.
    if ! read -r -p "Type the server name to confirm: " confirm || [ "$confirm" != "$target" ]; then
        echo "Aborted."
        exit 1
    fi

    echo "Wiping $target..."
    ssh_server "$target" "cd statbus && ./dev.sh recreate-database && ./sb start all" 2>&1
    echo "--- $target wipe complete ---"
}

cmd_create() {
    # Hand over to the provisioning script under ops/, replacing this
    # process (exec), so nothing after this call runs.
    #   $1 - installation code
    #   $2 - installation name
    local provisioner="$SCRIPT_DIR/ops/create-new-statbus-installation.sh"
    exec "$provisioner" "$1" "$2"
}

cmd_inspect() {
    # Hand over to the inspection script under ops/, replacing this process
    # (exec). No arguments are forwarded.
    local inspector="$SCRIPT_DIR/ops/inspect-cloud-installations.sh"
    exec "$inspector"
}

# Main: dispatch on the first argument. Each arm checks that the arguments
# it needs are present before calling the matching cmd_* function; an
# unknown or missing command prints the usage text (which exits 1).
if [ $# -eq 0 ]; then
    usage
fi

case "$1" in
    status)
        cmd_status
        ;;
    health)
        cmd_health "${2:-all}"
        ;;
    notify)
        cmd_notify
        ;;
    upgrade)
        cmd_upgrade
        ;;
    install|rescue)
        if [ $# -lt 2 ]; then
            echo "Error: $1 requires a server name or 'all'"
            usage
        fi
        cmd_install "$2" "${3:-}"
        ;;
    create)
        if [ $# -lt 3 ]; then
            echo "Error: create requires <code> and <name>"
            echo "Example: $0 create pk \"Pakistan StatBus\""
            exit 1
        fi
        cmd_create "$2" "$3"
        ;;
    inspect)
        cmd_inspect
        ;;
    wipe)
        if [ $# -lt 2 ]; then
            echo "Error: wipe requires a server name"
            usage
        fi
        cmd_wipe "$2"
        ;;
    migrate-down)
        if [ $# -lt 3 ]; then
            echo "Error: migrate-down requires <server> and <migration>"
            echo "Example: $0 migrate-down statbus_dev 20260417130648"
            exit 1
        fi
        cmd_migrate_down "$2" "$3"
        ;;
    migrate-up)
        if [ $# -lt 2 ]; then
            echo "Error: migrate-up requires a server name"
            exit 1
        fi
        cmd_migrate_up "$2"
        ;;
    tail)
        if [ $# -lt 2 ]; then
            echo "Error: tail requires a server name or 'all'"
            usage
        fi
        cmd_tail "$2"
        ;;
    *)
        echo "Unknown command: $1"
        usage
        ;;
esac