Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions tests/reconfiguration.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,120 @@ def test_add_node(network, args, copy_snapshot=False, fetch_recent_snapshot=True
return network


@reqs.description("Adding a node with corrupted ledger file")
def test_add_node_with_corrupted_ledger(network, args):
    """Join a node from a snapshot while one of its ledger files is truncated.

    Reproduces issue #6612: when joining from a recent snapshot, an older
    corrupted/truncated uncommitted ledger file should not prevent startup.

    The test triggers a snapshot on the primary, prepares (but does not start)
    a new node from that snapshot, truncates one uncommitted ledger file in the
    new node's main ledger directory part-way through its first transaction,
    then starts the node and checks that it joined from the snapshot.

    :param network: Network under test; used to create, join, trust and retire
        the new node.
    :param args: Test arguments; ``args.package`` is forwarded to
        ``setup_join_node``.
    :return: The same ``network`` object, so tests can be chained.
    """
    primary, _ = network.find_primary()
    snapshot_trigger_txid = primary.trigger_snapshot()
    snapshots_dir = network.get_committed_snapshots(
        primary,
        target_seqno=snapshot_trigger_txid.seqno,
        wait_for_target_seqno=True,
    )

    new_node = network.create_node()

    # Set up the join node (copies ledger and snapshots) but do not start it yet
    network.setup_join_node(
        new_node,
        args.package,
        args,
        from_snapshot=True,
        snapshots_dir=snapshots_dir,
        fetch_recent_snapshot=False,
    )

    # Find the uncommitted ledger files in the node's main ledger directory,
    # ordered by the first seqno encoded in their file name.
    ledger_dir = new_node.remote.get_main_ledger_dir()
    ledger_files = sorted(
        (
            f
            for f in os.listdir(ledger_dir)
            if f.startswith("ledger_") and not f.endswith(".committed")
        ),
        key=lambda f: ccf.ledger.range_from_filename(f)[0],
    )
    assert (
        ledger_files
    ), "Expected to find uncommitted ledger files for corruption test"
    ledger_ranges = {
        ledger_file: ccf.ledger.range_from_filename(ledger_file)
        for ledger_file in ledger_files
    }

    # Prefer a chunk whose range is older than the snapshot we are joining from.
    # If no chunk has a known end seqno below the snapshot seqno, fall back to
    # the last (highest start seqno) uncommitted chunk.
    corrupted_ledger_file = next(
        (
            f
            for f in ledger_files
            if (
                ledger_ranges[f][1] is not None
                and ledger_ranges[f][1] < snapshot_trigger_txid.seqno
            )
        ),
        ledger_files[-1],
    )

    # Corrupt the chosen uncommitted ledger file by truncating it in the middle
    # of a transaction.
    ledger = ccf.ledger.Ledger([ledger_dir], committed_only=False)
    chunk_filename = None
    truncate_offset = None
    for chunk in ledger:
        if os.path.basename(chunk.filename()) != corrupted_ledger_file:
            continue

        for tx in chunk:
            offset, next_offset = tx.get_offsets()
            chunk_filename = chunk.filename()
            # Midpoint between this transaction's start and the next one's.
            truncate_offset = offset + (next_offset - offset) // 2
            # Corrupting a single transaction in the selected chunk is
            # sufficient to make the file malformed.
            break

        if truncate_offset is not None:
            break

    assert truncate_offset is not None, "Should always find a transaction to corrupt"

    LOG.info(
        f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}"
    )
    with open(chunk_filename, "rb+") as f:
        f.truncate(truncate_offset)

    network.run_join_node(new_node)
    network.trust_node(new_node, args)

    with new_node.client() as c:
        r = c.get("/node/state")
        assert (
            r.body.json()["startup_seqno"] != 0
        ), f"Node {new_node.local_node_id} should have started from snapshot"

    def _mentions_malformed_ledger(log_path):
        # Best-effort scan: undecodable bytes are replaced rather than raised,
        # because this check is informational and must not fail the test.
        with open(log_path, encoding="utf-8", errors="replace") as log_file:
            for line in log_file:
                lowered = line.lower()
                # Both markers must be on the same line, so that unrelated
                # log lines cannot combine into a false positive.
                if "malformed" in lowered and "ledger file" in lowered:
                    return True
        return False

    out_path, err_path = new_node.get_logs()
    if out_path is not None and err_path is not None:
        # Depending on where recovery skips/truncates the stale chunk, this
        # malformed-ledger line may or may not be emitted.
        if any(_mentions_malformed_ledger(p) for p in (out_path, err_path)):
            LOG.info("Observed malformed ledger handling while joining from snapshot")
        else:
            LOG.info(
                "Did not observe malformed ledger log line; join success remains the test invariant"
            )

    # Clean up: retire and stop the extra node so later tests see the
    # original set of nodes.
    primary, _ = network.find_primary()
    network.retire_node(primary, new_node)
    new_node.stop()
    return network


@reqs.description("Test ignore_first_sigterm")
def test_ignore_first_sigterm(network, args):
# Note: host is supplied explicitly to avoid having differently
Expand Down Expand Up @@ -1008,6 +1122,7 @@ def run_all(args):
test_add_node(network, args, copy_snapshot=True, fetch_recent_snapshot=False)
test_add_node(network, args, copy_snapshot=True, fetch_recent_snapshot=True)
test_add_node_with_read_only_ledger(network, args)
test_add_node_with_corrupted_ledger(network, args)
test_join_straddling_primary_replacement(network, args)
test_node_replacement(network, args)
test_add_node_from_backup(network, args)
Expand Down
1 change: 1 addition & 0 deletions tests/suite/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
reconfiguration.test_retire_primary,
e2e_logging.test_rekey,
reconfiguration.test_add_node,
reconfiguration.test_add_node_with_corrupted_ledger,
nodes.test_kill_primary,
nodes.test_commit_view_history,
reconfiguration.test_add_node,
Expand Down
Loading