From 22c41641c35db73e85772a4163a37fde2a33895a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 09:17:17 +0000 Subject: [PATCH 1/7] Initial plan From e7bc3fe06c9327d4c51091563a2c329f2929ecfc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 09:27:21 +0000 Subject: [PATCH 2/7] Add test_add_node_with_corrupted_ledger to reproduce issue #6612 Co-authored-by: achamayou <4016369+achamayou@users.noreply.github.com> --- tests/reconfiguration.py | 75 +++++++++++++++++++++++++++++++++++++++ tests/suite/test_suite.py | 1 + 2 files changed, 76 insertions(+) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 0768241008bf..18455d3beb26 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -148,6 +148,81 @@ def test_add_node(network, args, from_snapshot=True): return network +@reqs.description("Adding a node with corrupted ledger file") +def test_add_node_with_corrupted_ledger(network, args): + # Reproduce issue #6612: a node joining with a corrupted (truncated) ledger + # file should fail to start rather than crash unexpectedly. + new_node = network.create_node() + + # Set up the join node (copies ledger, snapshots, etc.) but do not start it yet + network.setup_join_node( + new_node, + args.package, + args, + from_snapshot=True, + fetch_recent_snapshot=True, + ) + + # Find the latest uncommitted ledger file in the node's working directory + ledger_dir = new_node.remote.get_main_ledger_dir() + ledger_files = sorted( + [ + f + for f in os.listdir(ledger_dir) + if f.startswith("ledger_") and not f.endswith(".committed") + ] + ) + + if not ledger_files: + LOG.warning("No uncommitted ledger files found, skipping corruption test") + new_node.stop() + network.nodes.remove(new_node) + return network + + # Corrupt the latest uncommitted ledger file by truncating it in the middle + # of a transaction, so the transaction size does not match the number of + # bytes available left to read in the file (as described in issue #6612) + ledger = ccf.ledger.Ledger([ledger_dir], committed_only=False) + chunk_filename = None + truncate_offset = None + for chunk in ledger: + for tx in chunk: + offset, next_offset = tx.get_offsets() + chunk_filename = chunk.filename() + truncate_offset = offset + (next_offset - offset) // 2 + + if truncate_offset is None: + LOG.warning("Could not find a transaction to corrupt, skipping") + new_node.stop() + network.nodes.remove(new_node) + return network + + LOG.info( + f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}" + ) + with open(chunk_filename, "r+", encoding="utf-8") as f: + f.truncate(truncate_offset) + + # Attempt to start the node - it should fail due to the corrupted ledger + try: + network.run_join_node(new_node, timeout=3) + except (RuntimeError, TimeoutError) as e: + LOG.info( + f"Node {new_node.local_node_id} with corrupted ledger failed to start, as expected: {e}" + ) + # Cleanup: run_join_node may have already stopped and removed the node + # on TimeoutError, but not on RuntimeError + new_node.stop() + if new_node in network.nodes: + network.nodes.remove(new_node) + else: + assert ( + False + ), f"Node {new_node.local_node_id} with corrupted ledger unexpectedly started" + + return network + + @reqs.description("Test ignore_first_sigterm") def test_ignore_first_sigterm(network, args): # Note: host is supplied explicitly to avoid having differently diff --git a/tests/suite/test_suite.py b/tests/suite/test_suite.py index a2fddd4839eb..e540e6830ca9 100644 --- a/tests/suite/test_suite.py +++ b/tests/suite/test_suite.py @@ -61,6 +61,7 @@ reconfiguration.test_retire_primary, e2e_logging.test_rekey, reconfiguration.test_add_node, + reconfiguration.test_add_node_with_corrupted_ledger, nodes.test_kill_primary, nodes.test_commit_view_history, reconfiguration.test_add_node, From 960f87f15af7df452e1a01b776eb130c34eb07cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 09:35:40 +0000 Subject: [PATCH 3/7] Replace skip-on-missing with assertions for ledger corruption preconditions Co-authored-by: achamayou <4016369+achamayou@users.noreply.github.com> --- tests/reconfiguration.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 18455d3beb26..2e91fad2e41e 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -173,11 +173,7 @@ def test_add_node_with_corrupted_ledger(network, args): ] ) - if not ledger_files: - LOG.warning("No uncommitted ledger files found, skipping corruption test") - new_node.stop() - network.nodes.remove(new_node) - return network + assert ledger_files, "Expected uncommitted ledger files to corrupt" # Corrupt the latest uncommitted ledger file by truncating it in the middle # of a transaction, so the transaction size does not match the number of @@ -191,11 +187,7 @@ def test_add_node_with_corrupted_ledger(network, args): chunk_filename = chunk.filename() truncate_offset = offset + (next_offset - offset) // 2 - if truncate_offset is None: - LOG.warning("Could not find a transaction to corrupt, skipping") - new_node.stop() - network.nodes.remove(new_node) - return network + assert truncate_offset is not None, "Should always find a transaction to corrupt" LOG.info( f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}" From 9b5c9806c544b686dada38de6a42573e74704ec1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:56:26 +0000 Subject: [PATCH 4/7] Add test_add_node_with_corrupted_ledger to run_all in reconfiguration.py Co-authored-by: eddyashton <6000239+eddyashton@users.noreply.github.com> --- tests/reconfiguration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 2e91fad2e41e..880460dc69cd 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -856,6 +856,7 @@ def run_all(args): test_add_node_invalid_service_cert(network, args) test_add_node(network, args, from_snapshot=False) test_add_node_with_read_only_ledger(network, args) + test_add_node_with_corrupted_ledger(network, args) test_join_straddling_primary_replacement(network, args) test_node_replacement(network, args) test_add_node_from_backup(network, args) From 6b2822392cd065e459870ca2fdc933d4ca954d68 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Wed, 3 Jun 2026 09:29:40 +0100 Subject: [PATCH 5/7] Apply suggestion from @achamayou --- tests/reconfiguration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 70b5567c2608..c38e84df9855 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -166,7 +166,7 @@ def test_add_node_with_corrupted_ledger(network, args): new_node, args.package, args, - from_snapshot=True, + from_snapshot=False, fetch_recent_snapshot=True, ) From 07848859daa8d322682d66d1b6fb37c2b6abd8b8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:45:23 +0000 Subject: [PATCH 6/7] Fix corrupted ledger reconfiguration test --- tests/reconfiguration.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 141b778e32f4..cfb3584a5e1e 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -168,7 +168,9 @@ def test_add_node_with_corrupted_ledger(network, args): args.package, args, from_snapshot=False, - fetch_recent_snapshot=True, + # This regression test must replay the copied ledger, rather than + # fetching a newer snapshot which could bypass the corrupted chunk. + fetch_recent_snapshot=False, ) # Find the latest uncommitted ledger file in the node's working directory @@ -200,7 +202,7 @@ def test_add_node_with_corrupted_ledger(network, args): LOG.info( f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}" ) - with open(chunk_filename, "r+", encoding="utf-8") as f: + with open(chunk_filename, "rb+") as f: f.truncate(truncate_offset) # Attempt to start the node - it should fail due to the corrupted ledger From dc0dbb63172e567d07fa6d4c7b527372be64caa8 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Fri, 5 Jun 2026 14:17:48 +0100 Subject: [PATCH 7/7] Align corrupted-ledger join regression with snapshot-start behavior (#7925) Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> --- tests/reconfiguration.py | 99 +++++++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index cfb3584a5e1e..c16f6107de87 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -158,44 +158,78 @@ def test_add_node(network, args, copy_snapshot=False, fetch_recent_snapshot=True @reqs.description("Adding a node with corrupted ledger file") def test_add_node_with_corrupted_ledger(network, args): - # Reproduce issue #6612: a node joining with a corrupted (truncated) ledger - # file should fail to start rather than crash unexpectedly. + # Reproduce issue #6612: when joining from a recent snapshot, an older + # corrupted/truncated uncommitted ledger file should not prevent startup. + primary, _ = network.find_primary() + snapshot_trigger_txid = primary.trigger_snapshot() + snapshots_dir = network.get_committed_snapshots( + primary, + target_seqno=snapshot_trigger_txid.seqno, + wait_for_target_seqno=True, + ) + new_node = network.create_node() - # Set up the join node (copies ledger, snapshots, etc.) but do not start it yet + # Set up the join node (copies ledger and snapshots) but do not start it yet network.setup_join_node( new_node, args.package, args, - from_snapshot=False, - # This regression test must replay the copied ledger, rather than - # fetching a newer snapshot which could bypass the corrupted chunk. + from_snapshot=True, + snapshots_dir=snapshots_dir, fetch_recent_snapshot=False, ) - # Find the latest uncommitted ledger file in the node's working directory + # Find an uncommitted ledger file in the node's main ledger directory ledger_dir = new_node.remote.get_main_ledger_dir() ledger_files = sorted( [ f for f in os.listdir(ledger_dir) if f.startswith("ledger_") and not f.endswith(".committed") - ] + ], + key=lambda f: ccf.ledger.range_from_filename(f)[0], ) + assert ( + ledger_files + ), "Expected to find uncommitted ledger files for corruption test" + ledger_ranges = { + ledger_file: ccf.ledger.range_from_filename(ledger_file) + for ledger_file in ledger_files + } - assert ledger_files, "Expected uncommitted ledger files to corrupt" + # Prefer a chunk whose range is older than the snapshot we are joining from. + corrupted_ledger_file = next( + ( + f + for f in ledger_files + if ( + ledger_ranges[f][1] is not None + and ledger_ranges[f][1] < snapshot_trigger_txid.seqno + ) + ), + ledger_files[-1], + ) - # Corrupt the latest uncommitted ledger file by truncating it in the middle - # of a transaction, so the transaction size does not match the number of - # bytes available left to read in the file (as described in issue #6612) + # Corrupt the chosen uncommitted ledger file by truncating it in the middle + # of a transaction. ledger = ccf.ledger.Ledger([ledger_dir], committed_only=False) chunk_filename = None truncate_offset = None for chunk in ledger: + if os.path.basename(chunk.filename()) != corrupted_ledger_file: + continue + for tx in chunk: offset, next_offset = tx.get_offsets() chunk_filename = chunk.filename() truncate_offset = offset + (next_offset - offset) // 2 + # Corrupting a single transaction in the selected chunk is + # sufficient to make the file malformed. + break + + if truncate_offset is not None: + break assert truncate_offset is not None, "Should always find a transaction to corrupt" @@ -205,23 +239,34 @@ def test_add_node_with_corrupted_ledger(network, args): with open(chunk_filename, "rb+") as f: f.truncate(truncate_offset) - # Attempt to start the node - it should fail due to the corrupted ledger - try: - network.run_join_node(new_node, timeout=3) - except (RuntimeError, TimeoutError) as e: - LOG.info( - f"Node {new_node.local_node_id} with corrupted ledger failed to start, as expected: {e}" + network.run_join_node(new_node) + network.trust_node(new_node, args) + + with new_node.client() as c: + r = c.get("/node/state") + assert r.body.json()["startup_seqno"] != 0, ( + f"Node {new_node.local_node_id} should have started from snapshot" ) - # Cleanup: run_join_node may have already stopped and removed the node - # on TimeoutError, but not on RuntimeError - new_node.stop() - if new_node in network.nodes: - network.nodes.remove(new_node) - else: - assert ( - False - ), f"Node {new_node.local_node_id} with corrupted ledger unexpectedly started" + out_path, err_path = new_node.get_logs() + if out_path is not None and err_path is not None: + with open(out_path, encoding="utf-8") as out: + out_logs = out.read() + with open(err_path, encoding="utf-8") as err: + err_logs = err.read() + # Depending on where recovery skips/truncates the stale chunk, this + # malformed-ledger line may or may not be emitted. + combined_logs = (out_logs + err_logs).lower() + if "malformed" in combined_logs and "ledger file" in combined_logs: + LOG.info("Observed malformed ledger handling while joining from snapshot") + else: + LOG.info( + "Did not observe malformed ledger log line; join success remains the test invariant" + ) + + primary, _ = network.find_primary() + network.retire_node(primary, new_node) + new_node.stop() return network