From c98f42c2c34ced55c32e1aa90cf43254b98b6dd2 Mon Sep 17 00:00:00 2001 From: sveitser Date: Mon, 15 Jun 2026 19:06:01 +0000 Subject: [PATCH 1/4] chore(node): use snmalloc global allocator - Set snmalloc-rs as the global allocator in espresso-node lib - Applies to both espresso-node and espresso-node-sqlite binaries - For soak-test memory comparison against the system allocator --- Cargo.lock | 21 ++++++++++++++++++++- crates/espresso/node/Cargo.toml | 1 + crates/espresso/node/src/lib.rs | 3 +++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index b31b0ee7d1d..8dbc6986808 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4056,7 +4056,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc2776f0c61eca1ca32528f85548abd1a4be8fb53d1b21c013e4f18da1e7090" dependencies = [ "data-encoding", - "syn 1.0.109", + "syn 2.0.117", ] [[package]] @@ -4892,6 +4892,7 @@ dependencies = [ "serialization-api", "sha2 0.10.9", "snafu 0.8.9", + "snmalloc-rs", "sqlx", "staking-cli", "static_assertions", @@ -12398,6 +12399,24 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "snmalloc-rs" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "530a04ae687609072d0edd38866406fbbcd23d2f716791437e312ec4d64a355a" +dependencies = [ + "snmalloc-sys", +] + +[[package]] +name = "snmalloc-sys" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cbeb16d6bcc5979f80ec907582a886b7fb3b9a707678b63dd93a10d8ee858" +dependencies = [ + "cmake", +] + [[package]] name = "snow" version = "0.10.0" diff --git a/crates/espresso/node/Cargo.toml b/crates/espresso/node/Cargo.toml index f253885809e..261016c704f 100644 --- a/crates/espresso/node/Cargo.toml +++ b/crates/espresso/node/Cargo.toml @@ -101,6 +101,7 @@ serde_json = { workspace = true } serialization-api = { path = "../../serialization/api" } sha2 = { workspace = true } snafu = { workspace = true } +snmalloc-rs = "0.7.4" sqlx = { workspace = true } staking-cli = { workspace = true } static_assertions = { workspace = true } diff --git a/crates/espresso/node/src/lib.rs b/crates/espresso/node/src/lib.rs index 68bfadb20f8..c405f4b91d6 100644 --- a/crates/espresso/node/src/lib.rs +++ b/crates/espresso/node/src/lib.rs @@ -1,3 +1,6 @@ +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + mod external_event_handler; mod message_compat_tests; mod proposal_fetcher; From ee3d91176eebba9ac425d8d156b65d027398ac7f Mon Sep 17 00:00:00 2001 From: sveitser Date: Mon, 15 Jun 2026 20:48:43 +0000 Subject: [PATCH 2/4] chore(ci): run memory soak for 4 hours - Bump DURATION_SECONDS 3600 -> 14400 in both soak jobs - Bump soak job timeout-minutes 90 -> 300 to cover the longer sample --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b4bbf61f342..c47bf6f3019 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -712,7 +712,7 @@ jobs: memory-soak-pr: if: github.event_name == 'pull_request' runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 300 needs: [build-dockers-amd] strategy: fail-fast: false @@ -721,7 +721,7 @@ jobs: env: DOCKER_TAG: pr-${{ github.event.pull_request.number }} ESPRESSO_NODE_GENESIS_FILE: genesis/${{ matrix.genesis }}.toml - DURATION_SECONDS: 3600 + DURATION_SECONDS: 14400 steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v8.1.0 @@ -772,7 +772,7 @@ jobs: memory-soak-non-pr: if: github.event_name != 'pull_request' runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 300 needs: [build-dockers-amd, create-multiplatform-docker-image] strategy: fail-fast: false @@ -780,7 +780,7 @@ jobs: genesis: [demo-drb-header, demo-epoch-reward] env: ESPRESSO_NODE_GENESIS_FILE: genesis/${{ matrix.genesis }}.toml - DURATION_SECONDS: 3600 + DURATION_SECONDS: 14400 steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v8.1.0 From f80e7b4ef2d44b6f91f5fe8dc2366241854df834 Mon Sep 17 00:00:00 2001 From: sveitser Date: Tue, 16 Jun 2026 07:23:00 +0000 Subject: [PATCH 3/4] chore(ci): lower soak RUST_LOG to error - Cut per-view container log volume that fills the runner disk and kills the host ~3.5h into the 4h soak - Set RUST_LOG=error in both memory-soak job env blocks (cherry picked from commit b64e5713a5b27075ee8aa445715f77dde06750ce) --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c47bf6f3019..6579f33f2a7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -722,6 +722,7 @@ jobs: DOCKER_TAG: pr-${{ github.event.pull_request.number }} ESPRESSO_NODE_GENESIS_FILE: genesis/${{ matrix.genesis }}.toml DURATION_SECONDS: 14400 + RUST_LOG: error steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v8.1.0 @@ -781,6 +782,7 @@ jobs: env: ESPRESSO_NODE_GENESIS_FILE: genesis/${{ matrix.genesis }}.toml DURATION_SECONDS: 14400 + RUST_LOG: error steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v8.1.0 From 4302e4f163bede9c7f7afae8345f1fde46748b4b Mon Sep 17 00:00:00 2001 From: sveitser Date: Tue, 16 Jun 2026 10:00:51 +0000 Subject: [PATCH 4/4] chore(ci): make soak resilient to host kill, quiet telemetry logs - Sample the soak in 1h chunks and re-upload (overwrite) after each so partial data survives a host-level OOM/kill, where if:always() steps never run - soak.py: add --append (SOAK_APPEND) to accumulate chunks into one JSONL set instead of truncating - Set ESPRESSO_NODE_TELEMETRY_LOG=error in the soak job; the OTLP log layer has its own filter (default warn) that RUST_LOG does not reach, and node-0 pushes those records to an unreachable endpoint - docker-compose: make ESPRESSO_NODE_TELEMETRY_LOG overridable, default unchanged (info) (cherry picked from commit f2b93e9c8fd11c26f31b014fb893cae077158ab4) --- .github/workflows/build.yml | 42 ++++++++++++++++++++++++-- crates/process-metrics/scripts/soak.py | 18 ++++++++--- docker-compose.yaml | 4 +-- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6579f33f2a7..d9a946f6e6b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -723,6 +723,7 @@ jobs: ESPRESSO_NODE_GENESIS_FILE: genesis/${{ matrix.genesis }}.toml DURATION_SECONDS: 14400 RUST_LOG: error + ESPRESSO_NODE_TELEMETRY_LOG: error steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v8.1.0 @@ -743,8 +744,44 @@ jobs: - name: Start demo + smoke test run: just soak::up - - name: Sample - run: just soak::sample + # Sample in 1h chunks, re-uploading (overwrite) after each so partial data + # survives a host-level OOM/kill: when the runner host dies, `if: always()` + # steps do not run, so only artifacts already uploaded persist. + - name: Sample chunk 1/4 (1h) + run: DURATION_SECONDS=3600 just soak::sample + - name: Upload partial soak samples (1h) + if: always() + uses: actions/upload-artifact@v7 + with: + name: memory-soak-${{ matrix.genesis }} + path: ./soak-samples + retention-days: 90 + overwrite: true + + - name: Sample chunk 2/4 (1h) + run: DURATION_SECONDS=3600 just soak::sample --append + - name: Upload partial soak samples (2h) + if: always() + uses: actions/upload-artifact@v7 + with: + name: memory-soak-${{ matrix.genesis }} + path: ./soak-samples + retention-days: 90 + overwrite: true + + - name: Sample chunk 3/4 (1h) + run: DURATION_SECONDS=3600 just soak::sample --append + - name: Upload partial soak samples (3h) + if: always() + uses: actions/upload-artifact@v7 + with: + name: memory-soak-${{ matrix.genesis }} + path: ./soak-samples + retention-days: 90 + overwrite: true + + - name: Sample chunk 4/4 (1h) + run: DURATION_SECONDS=3600 just soak::sample --append - name: Render summary + chart if: always() @@ -761,6 +798,7 @@ jobs: name: memory-soak-${{ matrix.genesis }} path: ./soak-samples retention-days: 90 + overwrite: true - name: Upload soak logs if: always() diff --git a/crates/process-metrics/scripts/soak.py b/crates/process-metrics/scripts/soak.py index 66f7839f678..6e0c46080c8 100755 --- a/crates/process-metrics/scripts/soak.py +++ b/crates/process-metrics/scripts/soak.py @@ -114,11 +114,12 @@ def _append_jsonl(f, ts: int, rows: list[dict]) -> None: f.write(json.dumps(row) + "\n") -def run_sampling(output_dir: Path, duration_seconds: int) -> None: +def run_sampling(output_dir: Path, duration_seconds: int, append: bool = False) -> None: docker_path = output_dir / "docker-stats.jsonl" metrics_path = output_dir / "node-metrics.jsonl" - docker_path.write_text("") - metrics_path.write_text("") + if not append: + docker_path.write_text("") + metrics_path.write_text("") if duration_seconds <= 0: log.warning(f"duration_seconds={duration_seconds}; skipping sampling") return @@ -360,7 +361,14 @@ def cli(log_level: str) -> None: default=Path("./soak-samples"), type=PathOpt, ) -def sample(duration_seconds: int, output_dir: Path) -> None: +@opt( + "--append", + is_flag=True, + envvar="SOAK_APPEND", + default=False, + help="append to existing JSONL instead of truncating (for chunked sampling)", +) +def sample(duration_seconds: int, output_dir: Path, append: bool) -> None: """Scrape docker stats + each node's /v0/status/metrics into JSONL.""" output_dir.mkdir(parents=True, exist_ok=True) env_path = REPO_ROOT / ".env" @@ -368,7 +376,7 @@ def sample(duration_seconds: int, output_dir: Path) -> None: log.error(".env not found. Copy .env.docker.example to .env first.") sys.exit(1) load_dotenv(env_path, override=False) - run_sampling(output_dir, duration_seconds) + run_sampling(output_dir, duration_seconds, append=append) @cli.command() diff --git a/docker-compose.yaml b/docker-compose.yaml index 3137b847489..ff5c2182fe3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -395,7 +395,7 @@ services: - ESPRESSO_NODE_TELEMETRY_LOGS_ENABLE=true - ESPRESSO_NODE_TELEMETRY_METRICS_ENABLE=true - ESPRESSO_NODE_TELEMETRY_ENDPOINT=http://intentionally-unreachable-telemetry-endpoint - - ESPRESSO_NODE_TELEMETRY_LOG=info + - ESPRESSO_NODE_TELEMETRY_LOG=${ESPRESSO_NODE_TELEMETRY_LOG:-info} - ESPRESSO_NODE_EMBEDDED_DB=false - ESPRESSO_NODE_GENESIS_FILE=${ESPRESSO_NODE_GENESIS_FILE:-genesis/demo.toml} - ESPRESSO_NODE_ORCHESTRATOR_URL=http://orchestrator:${ESPRESSO_ORCHESTRATOR_PORT} @@ -579,7 +579,7 @@ services: environment: - ESPRESSO_NODE_TELEMETRY_LOGS_ENABLE=true - ESPRESSO_NODE_TELEMETRY_ENDPOINT=http://vector:${ESPRESSO_DEMO_TELEMETRY_OTLP_PORT} - - ESPRESSO_NODE_TELEMETRY_LOG=info + - ESPRESSO_NODE_TELEMETRY_LOG=${ESPRESSO_NODE_TELEMETRY_LOG:-info} - ESPRESSO_NODE_EMBEDDED_DB=false - ESPRESSO_NODE_GENESIS_FILE=${ESPRESSO_NODE_GENESIS_FILE:-genesis/demo.toml} - ESPRESSO_NODE_ORCHESTRATOR_URL=http://orchestrator:${ESPRESSO_ORCHESTRATOR_PORT}