[release/5.x] Cherry pick: Ensure that end-of-private-recovery service open transaction is written at-most-once (#6926) (#6950)

maxtropets · eddyashton · web-flow · commit 9d4377d8290c · 2025-04-01T14:36:17.000+01:00
Co-authored-by: Eddy Ashton <edashton@microsoft.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -45,7 +45,7 @@ jobs:
           - name: sgx
             image: sgx
             nodes: [self-hosted, 1ES.Pool=gha-sgx-ccf-sub]
-            options: --user root --publish-all --cap-add NET_ADMIN --cap-add NET_RAW --device /dev/sgx_enclave:/dev/sgx_enclave --device /dev/sgx_provision:/dev/sgx_provision -v /dev/sgx:/dev/sgx -v /lib/modules:/lib/modules:ro
+            options: --user root --publish-all --cap-add NET_ADMIN --cap-add NET_RAW --cap-add SYS_PTRACE --device /dev/sgx_enclave:/dev/sgx_enclave --device /dev/sgx_provision:/dev/sgx_provision -v /dev/sgx:/dev/sgx -v /lib/modules:/lib/modules:ro
     runs-on: ${{ matrix.platform.nodes }}
     container:
       image: ghcr.io/microsoft/ccf/ci/${{ matrix.platform.image }}:build-05-12-2024
@@ -80,6 +80,12 @@ jobs:
         shell: bash
         if: "${{ matrix.platform.name == 'snp' }}"
 
+      - name: "Install extra dependencies"
+        run: |
+          # For backported #6926.
+          sudo apt install strace -y
+        shell: bash
+
       - name: "Test ${{ matrix.platform.name }}"
         run: |
           set -ex
@@ -92,7 +98,7 @@ jobs:
           # All other acceptably fast tests, mostly end-to-end
           ./tests.sh --timeout 360 --output-on-failure -LE "benchmark|perf|protocolstest|suite|unit"
           # Partitions tests
-          ./tests.sh --timeout 240 --output-on-failure -L partitions -C partitions
+          ./tests.sh --timeout 360 --output-on-failure -L partitions -C partitions
         shell: bash
         if: "${{ matrix.platform.name != 'snp' }}" # Needs 1ES Pool support
 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -114,6 +114,12 @@ jobs:
         shell: bash
         if: ${{ matrix.platform.name != 'snp' }}
 
+      - name: "Install extra dependencies"
+        run: |
+          # For backported #6926.
+          sudo apt install strace -y
+        shell: bash
+
       - name: "Test ${{ matrix.platform.name }}"
         run: |
           set -ex
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 
 - Improved locking in indexing system, to remove race conditions which were possible when accessing historical state (#6886).
+- Fixed a bug which could produce an invalid secret chain (containing duplicate ledger secret sealing entries) in the ledger if an election occurred during private recovery (#6926). Comes with #6912 as a prerequisite.
 
 ## [5.0.14]
 
diff --git a/src/node/node_state.h b/src/node/node_state.h
@@ -1064,7 +1064,10 @@ namespace ccf
       {
         auto entry = ::consensus::LedgerEnclave::get_entry(data, size);
 
-        LOG_INFO_FMT("Deserialising private ledger entry [{}]", entry.size());
+        LOG_INFO_FMT(
+          "Deserialising private ledger entry {} [{}]",
+          last_recovered_idx + 1,
+          entry.size());
 
         // When reading the private ledger, deserialise in the recovery store
         ccf::kv::ApplyResult result = ccf::kv::ApplyResult::FAIL;
@@ -1148,6 +1151,29 @@ namespace ccf
       {
         auto tx = network.tables->create_tx();
 
+        {
+          // Ensure this transition happens at-most-once, by checking that no
+          // other node has already advanced the state
+          auto service = tx.ro<ccf::Service>(Tables::SERVICE);
+          auto active_service = service->get();
+
+          if (!active_service.has_value())
+          {
+            throw std::logic_error(fmt::format(
+              "Error in {}: no value in {}", __func__, Tables::SERVICE));
+          }
+
+          if (
+            active_service->status !=
+            ServiceStatus::WAITING_FOR_RECOVERY_SHARES)
+          {
+            throw std::logic_error(fmt::format(
+              "Error in {}: current service status is {}",
+              __func__,
+              active_service->status));
+          }
+        }
+
         // Clear recovery shares that were submitted to initiate the recovery
         // procedure
         ShareManager::clear_submitted_recovery_shares(tx);
diff --git a/tests/infra/network.py b/tests/infra/network.py
@@ -1662,7 +1662,7 @@ def save_service_identity(self, args):
         with open(previous_identity, "w", encoding="utf-8") as f:
             f.write(current_ident)
         args.previous_service_identity_file = previous_identity
-        return args
+        return current_ident
 
     def identity(self, name=None):
         if name is not None:
diff --git a/tests/partitions_test.py b/tests/partitions_test.py
@@ -17,6 +17,7 @@
 import contextlib
 import ccf.ledger
 from reconfiguration import test_ledger_invariants
+import subprocess
 
 from loguru import logger as LOG
 
@@ -711,6 +712,150 @@ def wait_for_new_view(node, original_view, timeout_multiplier):
     return network
 
 
+@reqs.supports_methods("/app/log/public")
+def test_recovery_elections(orig_network, args):
+    # Ensure we have 3 nodes
+    original_size = orig_network.resize(3, args)
+
+    old_primary, _ = orig_network.find_nodes()
+    with old_primary.client("user0") as c:
+        LOG.warning("Writing some initial state")
+        for _ in range(300):
+            r = c.post(
+                "/app/log/public",
+                {
+                    "id": 42,
+                    "msg": "Uninteresting recoverable transactions",
+                },
+            )
+            assert r.status_code == 200, r
+
+        r = c.get("/node/network")
+        assert r.status_code == 200, r
+        previous_identity = orig_network.save_service_identity(args)
+        c.wait_for_commit(
+            orig_network.consortium.set_recovery_threshold(old_primary, 1)
+        )
+    orig_network.stop_all_nodes(skip_verification=True)
+    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()
+
+    # Create a recovery network, where we will manually take the recovery steps (transition to open and submit share)
+    network = infra.network.Network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        args.perf_nodes,
+        existing_network=orig_network,
+    )
+    network.start_in_recovery(
+        args,
+        ledger_dir=current_ledger_dir,
+        committed_ledger_dirs=committed_ledger_dirs,
+    )
+    new_primary, new_backups = network.find_nodes()
+    network.consortium.transition_service_to_open(
+        new_primary, previous_service_identity=previous_identity
+    )
+
+    with new_primary.client("user0") as c:
+        previous_identity = network.save_service_identity(args)
+
+    member = network.consortium.get_active_recovery_members()[0]
+
+    # We need to delay a backup's private recovery process until:
+    # - The primary has completed its private recovery, and fully opened the network
+    # - The backup has called and won an election
+    # So that the backup node _is primary_ at the point it completes private recovery.
+    # We force the delay by injecting a delay into the file operations of the backup,
+    # and force an election (after the primary has completed its recovery) by killing
+    # the original primary node.
+    backup = new_backups[0]
+    LOG.info(f"Using strace to inject delays in file IO of {backup}")
+    assert not backup.remote.check_done()
+
+    strace_command = [
+        "strace",
+        f"--attach={backup.remote.remote.proc.pid}",
+        "--inject=lseek:delay_exit=10s",
+        "-tt",
+        "--trace=lseek,read,open,openat",
+        "--output=strace_output.txt",
+    ]
+    LOG.warning(f"About to run strace: {strace_command}")
+    strace_process = subprocess.Popen(
+        strace_command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    member.get_and_submit_recovery_share(new_primary)
+    network.recovery_count += 1
+
+    LOG.info("Confirming that primary completes private recovery")
+    network.wait_for_state(
+        new_primary,
+        infra.node.State.PART_OF_NETWORK.value,
+        timeout=30,
+    )
+
+    election_s = args.election_timeout_ms / 1000
+    LOG.info(
+        f"Holding backup stalled via strace for {election_s}, to trigger an election"
+    )
+    time.sleep(election_s)
+
+    # If strace failed to stall the node, the rest of the test is meaningless.
+    try:
+        strace_process.communicate(timeout=1)
+    except subprocess.TimeoutExpired:
+        assert strace_process.returncode is None, strace_process.returncode
+    else:
+        assert (
+            False
+        ), f"strace must not have been completed yet (retcode: {strace_process.returncode})"
+
+    LOG.info("Ending strace, and terminating primary node")
+    strace_process.terminate()
+    strace_process.communicate()
+
+    new_primary.stop()
+
+    LOG.info(
+        f"Give {backup} time to finish its recovery (including becoming primary), and confirm that it dies in the process"
+    )
+    time.sleep(election_s)
+    # The result of all of that is that this node, which had become primary while it
+    # completed its private recovery, crashed at the end of recovery (rather than
+    # producing an invalid ledger)
+    assert backup.remote.check_done()
+
+    network.ignore_errors_on_shutdown()
+    network.stop_all_nodes(skip_verification=True)
+    current_ledger_dir, committed_ledger_dirs = backup.get_ledger()
+
+    LOG.info(
+        "Trying a further recovery, to confirm that the ledger is in a recoverable state"
+    )
+    recovery_network = infra.network.Network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        args.perf_nodes,
+        existing_network=network,
+    )
+    recovery_network.start_in_recovery(
+        args,
+        ledger_dir=current_ledger_dir,
+        committed_ledger_dirs=committed_ledger_dirs,
+    )
+    recovery_network.recover(args)
+
+    # Restore original network size
+    recovery_network.resize(original_size, args)
+
+    return recovery_network
+
+
 def run(args):
     txs = app.LoggingTxs("user0")
 
@@ -737,6 +882,7 @@ def run(args):
         # HTTP2 doesn't support forwarding
         if not args.http2:
             test_session_consistency(network, args)
+        network = test_recovery_elections(network, args)
         test_ledger_invariants(network, args)