From 3f4636460d22ca41f3fe2f8359f0ef15b127a908 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:03:50 -0400 Subject: [PATCH 1/6] fix[deploy]: stop Deploy hanging forever on Ubuntu 24.04 apt install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Deploy flow calls apt, wget, tar, git, and go build via Process.runSync with no timeout, no DEBIAN_FRONTEND=noninteractive, no streamed stdout. On Ubuntu 24.04 the first `apt -y install linux-kernel-headers` call can block indefinitely on a dpkg lock (unattended-upgrades) or an interactive prompt, and the operator sees no output between "Git installation detected" and the hang. Reproduced on a fresh Hetzner 24.04 host; only workaround was to pre-install `golang-go build-essential` manually and rebuild znnd from source. This change: - Adds `_runStreaming` that uses `Process.start`, streams stdout/ stderr live, accepts a timeout, and SIGTERM→SIGKILL if exceeded - Adds `_isDpkgLocked` precheck via `fuser /var/lib/dpkg/lock-frontend` so the deploy aborts with an actionable message instead of blocking - Adds `_hasCommand` / `_hasDebPackage` helpers so already-installed tools (git, build-essential, linux-libc-dev, wget, Go) are skipped cleanly — a host with manual `apt install golang-go build-essential` now sails through prereqs instead of re-running every apt step - Routes all apt invocations through `_aptInstall` with DEBIAN_FRONTEND=noninteractive + confold/confdef options so dpkg never waits on config-file prompts - Replaces `linux-kernel-headers` (transitional name, may not exist on Ubuntu 22.04+) with `linux-libc-dev`, accepting either package in the skip check for legacy compatibility - Makes `_buildFromSource` pick `go` from PATH if /usr/local/go is absent, matching the detection in the prereq step - 15-minute timeout on `go build` (slow hosts), 10-min on apt, 5-min on git clone / wget, 2-min on tar extract Caller in 
main() awaits both async functions — both were bool, now Future<bool>. No other call sites. --- bin/znn_controller.dart | 297 ++++++++++++++++++++++++++++++---------- 1 file changed, 222 insertions(+), 75 deletions(-) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index 6f27efa..c68cf68 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -140,10 +140,11 @@ Future main() async { _initZNNService(); } - if (!_installLinuxPrerequisites()) { + if (!await _installLinuxPrerequisites()) { return; } - if (!_buildFromSource('/root/$znnSource', '/usr/local/bin/$znnDaemon')) { + if (!await _buildFromSource( + '/root/$znnSource', '/usr/local/bin/$znnDaemon')) { return; } @@ -390,103 +391,249 @@ bool _verifyProducerConfig(Map config) { return true; } -bool _installLinuxPrerequisites() { +/// Runs [executable] with [arguments], streaming stdout/stderr to the console +/// so the operator sees progress. Kills the process and returns -1 on timeout. +/// [extraEnv] is merged with the parent environment — used to inject +/// DEBIAN_FRONTEND=noninteractive for apt invocations so they don't block +/// waiting on dpkg prompts. +Future _runStreaming( + String executable, + List arguments, { + String? 
workingDirectory, + Map extraEnv = const {}, + Duration timeout = const Duration(minutes: 10), +}) async { + final label = '\$ $executable ${arguments.join(' ')}'; + print(label); + + final process = await Process.start( + executable, + arguments, + workingDirectory: workingDirectory, + environment: extraEnv, + includeParentEnvironment: true, + runInShell: true, + ); + + final stdoutSub = process.stdout + .transform(utf8.decoder) + .transform(const LineSplitter()) + .listen((line) => stdout.writeln(' $line')); + final stderrSub = process.stderr + .transform(utf8.decoder) + .transform(const LineSplitter()) + .listen((line) => stderr.writeln(' $line')); + + int exitCode; + try { + exitCode = await process.exitCode.timeout(timeout); + } on TimeoutException { + stderr.writeln( + '${red('Error!')} Command exceeded ${timeout.inSeconds}s timeout: $label'); + process.kill(ProcessSignal.sigterm); + try { + exitCode = await process.exitCode + .timeout(const Duration(seconds: 5)); + } on TimeoutException { + process.kill(ProcessSignal.sigkill); + exitCode = -1; + } + } finally { + await stdoutSub.cancel(); + await stderrSub.cancel(); + } + + return exitCode; +} + +/// True if /var/lib/dpkg/lock-frontend is held by another process. +/// Prevents the deploy from silently blocking on apt lock contention +/// (e.g., unattended-upgrades running in the background on a fresh host). 
+bool _isDpkgLocked() { + final result = Process.runSync( + 'fuser', ['/var/lib/dpkg/lock-frontend'], + runInShell: true); + return result.exitCode == 0; +} + +bool _hasCommand(String command) { + final result = Process.runSync('which', [command], runInShell: true); + return result.exitCode == 0 && + result.stdout.toString().trim().isNotEmpty; +} + +bool _hasDebPackage(String pkg) { + final result = Process.runSync('dpkg-query', + ['-W', '-f=\${Status}', pkg], runInShell: true); + return result.exitCode == 0 && + result.stdout.toString().contains('install ok installed'); +} + +Future _aptInstall(String pkg, + {Duration timeout = const Duration(minutes: 10)}) { + return _runStreaming( + 'apt-get', + [ + '-y', + '-o', + 'DPkg::Options::=--force-confold', + '-o', + 'DPkg::Options::=--force-confdef', + 'install', + pkg, + ], + extraEnv: const {'DEBIAN_FRONTEND': 'noninteractive'}, + timeout: timeout, + ); +} + +Future _installLinuxPrerequisites() async { print('Installing Linux prerequisites ...'); - ProcessResult processResult; - processResult = Process.runSync('git', ['version'], runInShell: true); - if (processResult.exitCode != 0) { + if (_isDpkgLocked()) { + print( + '${red('Error!')} /var/lib/dpkg/lock-frontend is held by another process. ' + 'Wait for any running apt / dpkg / unattended-upgrades job to finish, ' + 'then retry Deploy.'); + return false; + } + + // Git + if (_hasCommand('git')) { + final v = Process.runSync('git', ['--version'], runInShell: true); + print('Git installation detected: ${v.stdout.toString().trim()}'); + } else { print('Git not detected, proceeding with the installation'); - Process.runSync('apt', ['-y', 'install', 'git-all'], runInShell: true); + final rc = await _aptInstall('git'); + if (rc != 0) { + print('${red('Error!')} Could not install git (exit $rc)'); + return false; + } + } + + // Kernel headers — linux-kernel-headers is the legacy transitional + // package name; Ubuntu 18.04+ ships linux-libc-dev instead. 
Accept + // either so the tool works across historical and current Ubuntu + // releases. + if (_hasDebPackage('linux-libc-dev') || + _hasDebPackage('linux-kernel-headers')) { + print('Kernel headers already installed'); } else { - print('Git installation detected: ${processResult.stdout}'); + final rc = await _aptInstall('linux-libc-dev'); + if (rc != 0) { + print('${red('Error!')} Could not install linux-libc-dev (exit $rc)'); + return false; + } } - if (Process.runSync('apt', ['-y', 'install', 'linux-kernel-headers'], - runInShell: true) - .exitCode != - 0) { - print('${red('Error!')} Could not install linux-kernel-headers'); - return false; + + // build-essential + if (_hasDebPackage('build-essential')) { + print('build-essential already installed'); + } else { + final rc = await _aptInstall('build-essential'); + if (rc != 0) { + print('${red('Error!')} Could not install build-essential (exit $rc)'); + return false; + } } - if (Process.runSync('apt', ['-y', 'install', 'build-essential'], - runInShell: true) - .exitCode != - 0) { - print('${red('Error!')} Could not install build-essential'); - return false; + + // wget + if (_hasCommand('wget')) { + print('wget already installed'); + } else { + final rc = await _aptInstall('wget'); + if (rc != 0) { + print('${red('Error!')} Could not install wget (exit $rc)'); + return false; + } + } + + // Go — accept either the bundled /usr/local/go install or a + // system-packaged `go` on PATH (apt install golang-go is a common + // manual workaround for Ubuntu 24.04 since the bundled go1.20.3 is + // too old for recent go-zenon go.mod requirements). 
+ if (File('/usr/local/go/bin/go').existsSync()) { + final v = Process.runSync( + '/usr/local/go/bin/go', ['version'], runInShell: true); + print('Go installation detected: ${v.stdout.toString().trim()}'); + return true; + } + if (_hasCommand('go')) { + final v = Process.runSync('go', ['version'], runInShell: true); + print('Go installation detected: ${v.stdout.toString().trim()}'); + return true; } - if (Process.runSync('apt', ['-y', 'install', 'wget'], runInShell: true) - .exitCode != - 0) { - print('${red('Error!')} Could not install wget'); + + print('Go not detected, proceeding with the installation ...'); + print('Preparing to download Go ...'); + final tarball = goLinuxDlUrl.substring(goLinuxDlUrl.lastIndexOf('/') + 1); + final wgetRc = await _runStreaming( + 'wget', + [goLinuxDlUrl], + workingDirectory: '/root', + timeout: const Duration(minutes: 5), + ); + if (wgetRc != 0) { + print('${red('Error!')} wget failed (exit $wgetRc)'); return false; } - processResult = - Process.runSync('/usr/local/go/bin/go', ['version'], runInShell: true); + print('Checking Go download ...'); + if (!_verifyChecksum('/root/$tarball', goLinuxSHA256Checksum)) { + print('${red('Error!')} Checksum validation failed'); + return false; + } - if (processResult.exitCode != 0) { - print('Go not detected, proceeding with the installation ...'); - print('Preparing to download Go ...'); - Process.runSync('wget', [goLinuxDlUrl], - workingDirectory: '/root', runInShell: true); - print('Checking Go download ...'); - if (!_verifyChecksum( - '/root/${goLinuxDlUrl.substring(goLinuxDlUrl.lastIndexOf('/') + 1, goLinuxDlUrl.length)}', - goLinuxSHA256Checksum)) { - print('${red('Error!')} Checksum validation failed'); - return false; - } - print('Unpacking Go ...'); - Process.runSync( - 'tar', - [ - '-xzvf', - '/root/${goLinuxDlUrl.substring(goLinuxDlUrl.lastIndexOf('/') + 1, goLinuxDlUrl.length)}', - '-C', - '/usr/local/' - ], - runInShell: true); - Process.runSync('/usr/local/go/bin/go', 
['version'], runInShell: true) - .stdout - .toString(); - print('Cleaning downloaded files ...'); - Process.runSync( - 'rm', - [ - '-rf', - goLinuxDlUrl.substring( - goLinuxDlUrl.lastIndexOf('/') + 1, goLinuxDlUrl.length) - ], - workingDirectory: '/root', - runInShell: true); - } else { - print('Go installation detected: ${processResult.stdout}'); + print('Unpacking Go ...'); + final tarRc = await _runStreaming( + 'tar', + ['-xzf', '/root/$tarball', '-C', '/usr/local/'], + timeout: const Duration(minutes: 2), + ); + if (tarRc != 0) { + print('${red('Error!')} tar extraction failed (exit $tarRc)'); + return false; } + Process.runSync('/usr/local/go/bin/go', ['version'], runInShell: true); + print('Cleaning downloaded files ...'); + Process.runSync('rm', ['-f', tarball], + workingDirectory: '/root', runInShell: true); + return true; } -bool _buildFromSource(String sourcePath, String outputFile) { +Future _buildFromSource(String sourcePath, String outputFile) async { Directory goZenonDir = Directory(sourcePath); - ProcessResult processResult; if (goZenonDir.existsSync()) { goZenonDir.deleteSync(recursive: true); } print('Preparing to clone go-zenon ...'); - processResult = Process.runSync( - 'git', ['clone', znnGithubUrl, goZenonDir.absolute.path], - runInShell: true); - if (processResult.exitCode != 0) { + final cloneRc = await _runStreaming( + 'git', + ['clone', znnGithubUrl, goZenonDir.absolute.path], + timeout: const Duration(minutes: 5), + ); + if (cloneRc != 0) { print( - '${red('Error!')} Could not clone $znnGithubUrl into ${goZenonDir.path}'); + '${red('Error!')} Could not clone $znnGithubUrl into ${goZenonDir.path} (exit $cloneRc)'); return false; } - processResult = Process.runSync('/usr/local/go/bin/go', - ['build', '-ldflags', '-s -w', '-o', outputFile, './cmd/znnd/main.go'], - workingDirectory: goZenonDir.absolute.path, runInShell: true); - if (processResult.exitCode != 0) { - print('${red('Error!')} Could not build $znnSource'); + + // Pick whichever go 
binary is available — matches the detection + // logic in _installLinuxPrerequisites so the operator's choice of + // install method (bundled vs apt) works transparently. + final goBin = File('/usr/local/go/bin/go').existsSync() + ? '/usr/local/go/bin/go' + : 'go'; + final buildRc = await _runStreaming( + goBin, + ['build', '-ldflags', '-s -w', '-o', outputFile, './cmd/znnd/main.go'], + workingDirectory: goZenonDir.absolute.path, + timeout: const Duration(minutes: 15), + ); + if (buildRc != 0) { + print('${red('Error!')} Could not build $znnSource (exit $buildRc)'); return false; } print(Process.runSync('file', ['znnd'], From 5b1d2e55f3a35cecd305661f0059a5dfe2ab6918 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:14:36 -0400 Subject: [PATCH 2/6] =?UTF-8?q?chore[deps]:=20bump=20bundled=20Go=201.20.3?= =?UTF-8?q?=20=E2=86=92=201.22.12=20and=20widen=20Dart=20SDK=20upper=20bou?= =?UTF-8?q?nd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - goLinuxDlUrl bumped to go1.22.12 with verified go.dev/dl SHA256. go-zenon's go.mod still says `go 1.20`, so 1.20.x would technically compile it, but Go only maintains security updates for the two most recent minor versions. 1.22 is the current LTS floor. - pubspec.yaml sdk constraint relaxed from '>=2.14.0 <3.0.0' to '>=2.14.0 <4.0.0'. The existing constraint prevents `dart pub get` on any Dart 3.x release, which is what `dart-lang/setup-dart@v1.5.0` (used by the release workflow) installs by default. dcli 3.0.2 targets Dart 3.0 exactly; keep that implicit via pubspec.lock. Verified in Docker (Ubuntu 24.04 target): - target-prepped (golang-go + build-essential pre-installed): controller detects each prereq, skips all apt calls, advances to `git clone` + `go build` with streamed output. Reproduces the zenonorg5 operator's working scenario. 
- target-locked (flock holding /var/lib/dpkg/lock-frontend): controller aborts in ~2s with the actionable lock-contention error instead of hanging. No apt call issued. --- bin/znn_controller.dart | 8 ++++++-- pubspec.yaml | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index c68cf68..366cc8c 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -16,9 +16,13 @@ const znnSource = 'go-zenon'; const znnService = 'go-zenon.service'; const znnGithubUrl = 'https://github.com/zenon-network/go-zenon'; -const goLinuxDlUrl = 'https://go.dev/dl/go1.20.3.linux-amd64.tar.gz'; +// Pinned to latest 1.22.x. go-zenon's go.mod currently says `go 1.20`, +// so go1.20.x would technically build it — but 1.20 is EOL (Go maintains +// only the two most recent minor versions). 1.22 is the floor that still +// receives security updates as of 2026-04. +const goLinuxDlUrl = 'https://go.dev/dl/go1.22.12.linux-amd64.tar.gz'; const goLinuxSHA256Checksum = - '979694c2c25c735755bf26f4f45e19e64e4811d661dd07b8c010f7a8e18adfca'; + '4fa4f869b0f7fc6bb1eb2660e74657fbf04cdd290b5aef905585c86051b34d43'; const optionDeploy = 'Deploy'; const optionStatus = 'Status'; diff --git a/pubspec.yaml b/pubspec.yaml index 7e020c9..351d691 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -4,7 +4,7 @@ version: 0.0.4 publish_to: none environment: - sdk: '>=2.14.0 <3.0.0' + sdk: '>=2.14.0 <4.0.0' dependencies: dcli: ^3.0.2 From 55c931ea8fa50c9666922b8c558cc66088f4bde4 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:27:36 -0400 Subject: [PATCH 3/6] feat[deploy]: detect non-Debian distros and abort with clear error Today the Deploy path assumes apt. On a CentOS / RHEL / Arch / NixOS host the first apt call fails with a cryptic error and the operator has to guess what went wrong. 
Parse /etc/os-release up front and abort with a one-shot actionable message: which distros are supported, what prereqs to install manually, and that Deploy will transparently skip the install step on a pre-prepped host thanks to the _hasCommand / _hasDebPackage detection already landed. Recognizes derivatives via ID_LIKE (e.g., Linux Mint, Raspbian) not just the narrow ID==debian/ubuntu check. --- bin/znn_controller.dart | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index 366cc8c..e8c27b7 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -450,6 +450,39 @@ Future _runStreaming( return exitCode; } +/// Parses /etc/os-release into a flat key/value map. Returns an empty +/// map if the file is missing (non-Linux host, minimal container, etc.). +Map _readOsRelease() { + final file = File('/etc/os-release'); + if (!file.existsSync()) return const {}; + final result = {}; + for (final line in file.readAsLinesSync()) { + final eq = line.indexOf('='); + if (eq <= 0) continue; + final key = line.substring(0, eq).trim(); + var value = line.substring(eq + 1).trim(); + if (value.length >= 2 && + ((value.startsWith('"') && value.endsWith('"')) || + (value.startsWith("'") && value.endsWith("'")))) { + value = value.substring(1, value.length - 1); + } + result[key] = value; + } + return result; +} + +/// True if the current host is Debian/Ubuntu or a derivative that uses +/// apt (e.g., Mint, Raspbian). Derivatives are recognized via the +/// ID_LIKE field in /etc/os-release. +bool _isDebianFamily(Map osRelease) { + final id = (osRelease['ID'] ?? '').toLowerCase(); + if (id == 'debian' || id == 'ubuntu') return true; + final idLike = (osRelease['ID_LIKE'] ?? '').toLowerCase(); + return idLike + .split(RegExp(r'\s+')) + .any((t) => t == 'debian' || t == 'ubuntu'); +} + /// True if /var/lib/dpkg/lock-frontend is held by another process. 
/// Prevents the deploy from silently blocking on apt lock contention /// (e.g., unattended-upgrades running in the background on a fresh host). @@ -494,6 +527,21 @@ Future _aptInstall(String pkg, Future _installLinuxPrerequisites() async { print('Installing Linux prerequisites ...'); + final osRelease = _readOsRelease(); + if (!_isDebianFamily(osRelease)) { + final detected = osRelease['PRETTY_NAME'] ?? + osRelease['ID'] ?? + 'unrecognized (no /etc/os-release)'; + print('${red('Error!')} This controller currently supports Debian and ' + 'Ubuntu family distros (apt-based).'); + print('Detected: $detected'); + print('To deploy on a non-apt distro, install these prerequisites ' + 'manually and re-run Deploy — it will skip the install step: ' + 'git, build-essential, linux-libc-dev (or equivalent kernel ' + 'headers), wget, go 1.22+.'); + return false; + } + if (_isDpkgLocked()) { print( '${red('Error!')} /var/lib/dpkg/lock-frontend is held by another process. ' From 428d5fd406b71de17712fcc8cf092fbec6c32675 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:28:09 -0400 Subject: [PATCH 4/6] feat[deploy]: pin go-zenon to a tagged release and reuse existing clone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related reproducibility fixes: - Clone pins to `--branch znnRefTag --depth 1`. Previously Deploy pulled go-zenon master at whatever commit happened to be there at deploy time, so two operators following the same runbook a day apart could land different znnd binaries. Tag pin makes the deploy output deterministic across operators and across time. Bumped in lockstep with controller releases when a new go-zenon tag is the recommended mainnet version (v0.0.8 for 0.0.5 of this tool). 
- Existing clone detection: if /root/go-zenon exists and its origin remote matches znnGithubUrl, fetch + hard-reset to the pinned tag instead of deleting and re-downloading ~200MB of modules. Foreign directories (different remote, scratch files) still get nuked and fresh-cloned — only a legit prior clone is reused. Accelerates re-deploys from several minutes to seconds when the tag hasn't changed. --- bin/znn_controller.dart | 96 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 11 deletions(-) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index e8c27b7..53c5b4a 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -15,6 +15,11 @@ const znnDaemon = 'znnd'; const znnSource = 'go-zenon'; const znnService = 'go-zenon.service'; const znnGithubUrl = 'https://github.com/zenon-network/go-zenon'; +// Pinned go-zenon tag. Using master would make Deploy non-reproducible +// (two operators deploying on different days can land different binaries +// from master drift). Bumped in lockstep with controller releases when a +// new go-zenon tag is the recommended mainnet version. +const znnRefTag = 'v0.0.8'; // Pinned to latest 1.22.x. go-zenon's go.mod currently says `go 1.20`, // so go1.20.x would technically build it — but 1.20 is EOL (Go maintains @@ -655,21 +660,90 @@ Future _installLinuxPrerequisites() async { return true; } +/// True if two git URLs point at the same repo, ignoring a trailing .git +/// suffix. Used to detect when an existing clone can be reused rather than +/// thrown away and re-downloaded. 
+bool _gitRemoteMatches(String actual, String expected) { + String norm(String s) { + var v = s.trim(); + if (v.endsWith('.git')) v = v.substring(0, v.length - 4); + return v; + } + + return norm(actual) == norm(expected); +} + Future _buildFromSource(String sourcePath, String outputFile) async { Directory goZenonDir = Directory(sourcePath); + + // If an existing clone points at the same remote, reuse it — a re-deploy + // shouldn't re-download ~200MB of modules. Check remote.origin.url via + // git config, and if it matches, fetch + hard-reset to the pinned tag. + // Anything else in the directory (foreign repo, scratch files) → nuke + // and fall through to a fresh clone. + bool cloneNeeded = true; if (goZenonDir.existsSync()) { - goZenonDir.deleteSync(recursive: true); + final remote = Process.runSync( + 'git', + ['-C', goZenonDir.absolute.path, 'config', '--get', 'remote.origin.url'], + runInShell: true, + ); + final url = remote.stdout.toString().trim(); + if (remote.exitCode == 0 && _gitRemoteMatches(url, znnGithubUrl)) { + print('Existing go-zenon clone detected at ${goZenonDir.path} — ' + 'refreshing to $znnRefTag ...'); + final fetchRc = await _runStreaming( + 'git', + ['-C', goZenonDir.absolute.path, 'fetch', 'origin', '--tags'], + timeout: const Duration(minutes: 5), + ); + if (fetchRc == 0) { + final resetRc = await _runStreaming( + 'git', + [ + '-C', + goZenonDir.absolute.path, + 'reset', + '--hard', + 'refs/tags/$znnRefTag', + ], + timeout: const Duration(minutes: 2), + ); + if (resetRc == 0) { + cloneNeeded = false; + } else { + print('(reset to $znnRefTag failed — falling back to fresh clone)'); + } + } else { + print('(fetch failed — falling back to fresh clone)'); + } + } + if (cloneNeeded) { + print('Removing ${goZenonDir.path} before fresh clone ...'); + goZenonDir.deleteSync(recursive: true); + } } - print('Preparing to clone go-zenon ...'); - final cloneRc = await _runStreaming( - 'git', - ['clone', znnGithubUrl, goZenonDir.absolute.path], - 
timeout: const Duration(minutes: 5), - ); - if (cloneRc != 0) { - print( - '${red('Error!')} Could not clone $znnGithubUrl into ${goZenonDir.path} (exit $cloneRc)'); - return false; + + if (cloneNeeded) { + print('Preparing to clone go-zenon @ $znnRefTag ...'); + final cloneRc = await _runStreaming( + 'git', + [ + 'clone', + '--branch', + znnRefTag, + '--depth', + '1', + znnGithubUrl, + goZenonDir.absolute.path, + ], + timeout: const Duration(minutes: 5), + ); + if (cloneRc != 0) { + print('${red('Error!')} Could not clone $znnGithubUrl @ $znnRefTag ' + 'into ${goZenonDir.path} (exit $cloneRc)'); + return false; + } } // Pick whichever go binary is available — matches the detection From c31ff27016cc9bf7e851576c10fb1acba2936067 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:29:31 -0400 Subject: [PATCH 5/6] feat[deploy]: non-interactive --yes --deploy mode + action flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a small argparse block at the top of main() so the controller can be driven from Ansible / Terraform / bash scripts without TTY. Flags: --deploy / --status / --start-service / --stop-service / --resync Jump straight to the matching menu action. -y / --yes Auto-confirm every prompt (echoes what was answered so the run log shows the choice). -h / --help, -v / --version Info commands, no side effects. Semantics: - All confirm() calls in Deploy / Resync now go through _confirmOrDefault, returning the default in --yes mode. - Existing-keystore password prompt reads ZNN_KEYSTORE_PASSWORD when --yes is set; aborts with an actionable error if missing rather than blocking on ask(). Retries are disabled in --yes mode to fail fast. - Fresh-keystore generation (the common first-deploy path) is already unattended-friendly — the existing RandomStringGenerator fallback produces a password without any prompt. 
Conflict detection: specifying two action flags (e.g., --deploy --status) exits 2 with a clear error instead of silently picking one. --- bin/znn_controller.dart | 143 ++++++++++++++++++++++++++++++++++------ 1 file changed, 123 insertions(+), 20 deletions(-) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index 53c5b4a..47c30a8 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -39,7 +39,88 @@ const optionQuit = 'Quit'; const znnControllerVersion = '0.0.4'; -Future main() async { +/// When true, prompts skip to their default and `ask()` password prompts +/// are replaced by ZNN_KEYSTORE_PASSWORD lookups. Set from CLI flags at +/// the top of main(); read by _confirmOrDefault and the producer-config +/// branch of the Deploy flow. +bool _nonInteractive = false; + +/// Interactive confirm unless --yes / -y was passed, in which case +/// [defaultValue] is returned after echoing the prompt so the operator +/// sees what was auto-answered. +bool _confirmOrDefault(String prompt, {required bool defaultValue}) { + if (_nonInteractive) { + print('$prompt [--yes: ${defaultValue ? 'y' : 'n'}]'); + return defaultValue; + } + return confirm(prompt, defaultValue: defaultValue); +} + +void _printCliHelp() { + print(''' +Usage: znn-controller [OPTIONS] + +Options: + (no flags) Interactive menu (default). + --deploy Jump straight to the Deploy action, skipping the menu. + --status Jump straight to the Status action. + --start-service Start the go-zenon.service unit. + --stop-service Stop the go-zenon.service unit. + --resync Resync the node from genesis (prompts before destroy). + -y, --yes Auto-confirm all prompts. Pair with --deploy (or any + action flag) for unattended deploys from Ansible / + Terraform / bash scripts. + -h, --help Show this help and exit. + -v, --version Show the controller version and exit. + +Environment: + ZNN_KEYSTORE_PASSWORD + In --yes mode, supplies the producer keystore password when an + existing keystore is detected. 
Required for unattended re-deploys + on hosts that already have a keystore. Ignored when a fresh + keystore is being generated. +'''); +} + +Future main(List arguments) async { + // Parse CLI flags before any side effects (no logging, no network, no + // root check). --help and --version should work without apt. + final actionFlags = { + '--deploy': optionDeploy, + '--status': optionStatus, + '--start-service': optionStartService, + '--stop-service': optionStopService, + '--resync': optionResync, + }; + String? cliAction; + for (final arg in arguments) { + if (arg == '-h' || arg == '--help') { + _printCliHelp(); + exit(0); + } + if (arg == '-v' || arg == '--version') { + print('znn-controller v$znnControllerVersion'); + exit(0); + } + if (arg == '-y' || arg == '--yes') { + _nonInteractive = true; + continue; + } + final action = actionFlags[arg]; + if (action != null) { + if (cliAction != null && cliAction != action) { + print('${red('Error!')} conflicting action flags: ' + 'only one of ${actionFlags.keys.join(', ')} may be passed.'); + exit(2); + } + cliAction = action; + continue; + } + print('${red('Error!')} unknown argument: $arg'); + print('Run `znn-controller --help` for usage.'); + exit(2); + } + var operatingSystem = Platform.operatingSystem; if (!Platform.isLinux) { @@ -79,16 +160,21 @@ Future main() async { exit(0); } - var selected = - menu('Select an option from the ones listed above\n', options: [ - optionDeploy, - optionStatus, - optionStartService, - optionStopService, - optionResync, - optionHelp, - optionQuit - ]); + String selected; + if (cliAction != null) { + print('Running $cliAction (selected via CLI flag)'); + selected = cliAction; + } else { + selected = menu('Select an option from the ones listed above\n', options: [ + optionDeploy, + optionStatus, + optionStartService, + optionStopService, + optionResync, + optionHelp, + optionQuit, + ]); + } if (selected == 'Quit') { exit(0); @@ -120,7 +206,7 @@ Future main() async { print( 
'${orange('Warning!')} Insufficient free virtual memory detected. It is recommended to have at least 2 GB of free virtual memory'); } - if (!confirm( + if (!_confirmOrDefault( 'Are you sure you want to proceed with the deployment process?', defaultValue: true)) { exit(0); @@ -161,12 +247,12 @@ Future main() async { File keyStoreFile = File( '${znnDefaultDirectory.absolute.path}${Platform.pathSeparator}wallet${Platform.pathSeparator}producer'); if (_verifyProducerConfig(configJson)) { - if (confirm( + if (_confirmOrDefault( 'Producer configuration detected. Continue using the existing configuration?', defaultValue: true)) { isConfigured = true; if (!keyStoreFile.existsSync()) { - if (!confirm( + if (!_confirmOrDefault( 'Producer key store file not detected. Do you want to create a new producer key store file and configure the Node with it?', defaultValue: false)) { isConfigured = true; @@ -175,7 +261,7 @@ Future main() async { } } else { if (keyStoreFile.existsSync()) { - if (confirm( + if (_confirmOrDefault( 'Producer key store file detected. Do you want to configure the Node with it?', defaultValue: true)) { bool p = false; @@ -183,16 +269,33 @@ Future main() async { int count = 0; while (!p && count < 3) { try { - keyStorePassword = ask( - 'Insert the producer key store password:', - hidden: true, - validator: Ask.all([Ask.dontCare, Ask.lengthMin(2)])); + if (_nonInteractive) { + final envPw = + Platform.environment['ZNN_KEYSTORE_PASSWORD'] ?? ''; + if (envPw.isEmpty) { + print('${red('Error!')} --yes mode: existing keystore ' + 'detected but ZNN_KEYSTORE_PASSWORD not set. 
Export ' + 'the password or remove the keystore to let Deploy ' + 'generate a fresh one.'); + exit(2); + } + keyStorePassword = envPw; + } else { + keyStorePassword = ask( + 'Insert the producer key store password:', + hidden: true, + validator: Ask.all([Ask.dontCare, Ask.lengthMin(2)])); + } await keyStoreManager.readKeyStore( keyStorePassword, keyStoreFile); p = true; } catch (e) { count++; print('${red('Error!')} ${3 - count} attempts left'); + if (_nonInteractive) { + // No retries in non-interactive mode — fail fast. + exit(2); + } } } if (count == 3) { @@ -317,7 +420,7 @@ Future main() async { break; case optionResync: - if (confirm( + if (_confirmOrDefault( 'This option will resync the Node starting from genesis. Do you want to continue?', defaultValue: true)) { bool running = false; From fe9d69910e4dc8698ea29f52e0e79af94cab3820 Mon Sep 17 00:00:00 2001 From: mehowz <121523914+mehowz@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:31:16 -0400 Subject: [PATCH 6/6] feat[healthcheck]: diagnostic checks for monitoring / systemd watchdogs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a Healthcheck menu option + --healthcheck CLI flag that reports one line per check with [PASS] / [WARN] / [FAIL] tags and exits 0 if every check passed (or only WARN'd), 1 if any FAIL occurred. Shape suitable for: - systemd watchdog (WatchdogSec= + ExecReload=... --healthcheck) - cron + mail-on-nonzero for passive monitoring - Nagios / Icinga check_exit wrapper - Ansible / Terraform post-deploy verification Checks: 1. go-zenon.service active via systemctl is-active 2. /usr/local/bin/znnd binary present + reports a version 3. Local RPC reachable at ws://127.0.0.1:35998 (10s timeout) 4. Frontier momentum fetched; wall-clock lag ≤ 15s = PASS, ≤ 60s = WARN, > 60s = FAIL. Thresholds match the 10s target block time with headroom for short-term jitter. 5. 
(Only if producer config exists) pillar registered with the configured producer address; producedMomentums > 0 this epoch PASSes, 0 is WARN (new pillar or one that just missed). All RPC calls carry explicit timeouts so a stuck node doesn't cause the healthcheck itself to hang — exactly the failure mode the check is meant to detect. --- bin/znn_controller.dart | 134 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/bin/znn_controller.dart b/bin/znn_controller.dart index 47c30a8..40474aa 100644 --- a/bin/znn_controller.dart +++ b/bin/znn_controller.dart @@ -31,6 +31,7 @@ const goLinuxSHA256Checksum = const optionDeploy = 'Deploy'; const optionStatus = 'Status'; +const optionHealthcheck = 'Healthcheck'; const optionStartService = 'Start service'; const optionStopService = 'Stop service'; const optionResync = 'Resync'; @@ -64,6 +65,9 @@ Options: (no flags) Interactive menu (default). --deploy Jump straight to the Deploy action, skipping the menu. --status Jump straight to the Status action. + --healthcheck Run diagnostic checks (service, RPC, sync, pillar) + and exit 0 if healthy, 1 otherwise. Suitable for + monitoring cron jobs and systemd watchdogs. --start-service Start the go-zenon.service unit. --stop-service Stop the go-zenon.service unit. --resync Resync the node from genesis (prompts before destroy). 
@@ -88,6 +92,7 @@ Future main(List arguments) async { final actionFlags = { '--deploy': optionDeploy, '--status': optionStatus, + '--healthcheck': optionHealthcheck, '--start-service': optionStartService, '--stop-service': optionStopService, '--resync': optionResync, @@ -168,6 +173,7 @@ Future main(List arguments) async { selected = menu('Select an option from the ones listed above\n', options: [ optionDeploy, optionStatus, + optionHealthcheck, optionStartService, optionStopService, optionResync, @@ -387,6 +393,10 @@ Future main(List arguments) async { } break; + case optionHealthcheck: + final healthy = await _runHealthcheck(configJson); + exit(healthy ? 0 : 1); + case optionStartService: if (_isZNNServiceActive()) { print('$znnService is already active'); @@ -412,6 +422,8 @@ Future main(List arguments) async { case optionHelp: print('Deploy - will deploy a Node with a producing key file configured'); print('Status - will print the status of the Node'); + print('Healthcheck - check service + RPC reachability + sync freshness + ' + 'pillar registration, suitable for monitoring / systemd watchdog'); print('Start service - will start the service'); print('Stop service - will stop the service'); print('Resync - will resync the Node from genesis'); @@ -488,6 +500,128 @@ void _printServiceStatus() { } } +/// Diagnostic checks suitable for monitoring / systemd watchdog / cron +/// health reporting. Prints one line per check with [PASS] / [WARN] / +/// [FAIL] tags and returns true iff no FAILs occurred. WARN conditions +/// (e.g., momentum lag slightly above the 15s threshold) don't fail the +/// overall check — the operator can read the output and decide. +/// +/// Exits via the caller with 0 (healthy) or 1 (any FAIL). +Future _runHealthcheck(Map configJson) async { + print('Healthcheck'); + print('-----------'); + var anyFail = false; + + // 1. Service active? 
+ final serviceActive = _isZNNServiceActive(); + if (serviceActive) { + print('[${green('PASS')}] service $znnService is active'); + } else { + print('[${red('FAIL')}] service $znnService is NOT active'); + anyFail = true; + } + + // 2. znnd binary present? + final binary = File('/usr/local/bin/$znnDaemon'); + if (binary.existsSync()) { + final v = Process.runSync(binary.path, ['version'], runInShell: true); + final version = + v.exitCode == 0 ? v.stdout.toString().trim() : '(version query failed)'; + print('[${green('PASS')}] binary $version'); + } else { + print('[${red('FAIL')}] binary /usr/local/bin/$znnDaemon missing'); + anyFail = true; + } + + // 3. RPC reachability + sync freshness. Only attempt if service is + // active — a down service will obviously have no RPC, no point + // double-reporting. + if (serviceActive) { + final Zenon znnClient = Zenon(); + try { + await znnClient.wsClient + .initialize('ws://127.0.0.1:$defaultWsPort', retry: false) + .timeout(const Duration(seconds: 10)); + print('[${green('PASS')}] rpc reachable at ' + 'ws://127.0.0.1:$defaultWsPort'); + + try { + final momentum = await znnClient.ledger + .getFrontierMomentum() + .timeout(const Duration(seconds: 10)); + final nowS = DateTime.now().toUtc().millisecondsSinceEpoch ~/ 1000; + final lagS = nowS - momentum.timestamp; + final freshnessTag = lagS <= 15 + ? '[${green('PASS')}]' + : (lagS <= 60 ? '[${orange('WARN')}]' : '[${red('FAIL')}]'); + print('$freshnessTag sync height=${momentum.height} ' + 'lag=${lagS}s (threshold: 15s healthy, 60s warn)'); + if (lagS > 60) anyFail = true; + + // 4. Producer / pillar status (only if producer configured). + if (_verifyProducerConfig(configJson)) { + final String producerAddress = configJson['Producer']['Address']; + PillarInfo? 
pillarFound; + int pageIndex = 0; + try { + PillarInfoList pillarList = await znnClient.embedded.pillar + .getAll(pageIndex: pageIndex) + .timeout(const Duration(seconds: 20)); + while (pillarList.list.isNotEmpty && pillarFound == null) { + for (PillarInfo pillar in pillarList.list) { + if (pillar.producerAddress.toString() == producerAddress) { + pillarFound = pillar; + break; + } + } + pageIndex++; + pillarList = await znnClient.embedded.pillar + .getAll(pageIndex: pageIndex) + .timeout(const Duration(seconds: 20)); + } + } catch (e) { + print('[${red('FAIL')}] pillar RPC error during pillar ' + 'lookup: $e'); + anyFail = true; + } + if (pillarFound != null) { + final produced = pillarFound.currentStats.producedMomentums; + final tag = produced > 0 ? '[${green('PASS')}]' : '[${orange('WARN')}]'; + print('$tag pillar ${pillarFound.name} ' + '($producerAddress), produced=$produced this epoch'); + } else if (pageIndex > 0) { + print('[${orange('WARN')}] pillar producer address ' + '$producerAddress not registered as a pillar'); + } + } else { + print('[${green('PASS')}] producer none configured ' + '(non-producing node)'); + } + } on TimeoutException { + print('[${red('FAIL')}] sync RPC timeout fetching ' + 'frontier momentum'); + anyFail = true; + } + } catch (e) { + print('[${red('FAIL')}] rpc not reachable at ' + 'ws://127.0.0.1:$defaultWsPort: $e'); + anyFail = true; + } finally { + try { + znnClient.wsClient.stop(); + } catch (_) {} + } + } + + print('-----------'); + if (anyFail) { + print('${red('UNHEALTHY')} (one or more checks failed)'); + } else { + print('${green('HEALTHY')}'); + } + return !anyFail; +} + bool _verifyProducerConfig(Map config) { if (!config.containsKey('Producer')) { return false;