Skip to content

Commit b79409e

Browse files
CopilotachamayouCopilot
authored
Add periodic snapshot file cleanup (#7747)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: achamayou <4016369+achamayou@users.noreply.github.com> Co-authored-by: Amaury Chamayou <amchamay@microsoft.com> Co-authored-by: Amaury Chamayou <amaury@xargs.fr> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 87a8a1e commit b79409e

12 files changed

Lines changed: 426 additions & 37 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1111

1212
### Added
1313

14+
- Added `files_cleanup.max_snapshots` configuration option to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. The value must be at least 1 if set.
15+
- Added `files_cleanup.interval` configuration option (default `"30s"`) to periodically scan the snapshot directory and delete old committed snapshots exceeding `max_snapshots`. This ensures backup nodes (which receive snapshots via `backup_fetch`) also prune old snapshots. Only effective when `max_snapshots` is set.
1416
- Added `POST /node/snapshot:create`, gated by the `SnapshotCreate` RPC interface operator feature, to create a snapshot via an operator endpoint rather than a governance action.
1517
- Added `make_cose_verifier_from_pem_cert()` and `make_cose_verifier_from_der_cert()` that accept certificates in a known format. The existing `make_cose_verifier_cert()` is renamed to `make_cose_verifier_any_cert()` (#7768).
1618

doc/host_config_schema/cchost_config.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,24 @@
546546
"description": "This section includes configuration for the snapshot directories and files",
547547
"additionalProperties": false
548548
},
549+
"files_cleanup": {
550+
"type": "object",
551+
"properties": {
552+
"max_snapshots": {
553+
"type": ["integer", "null"],
554+
"default": null,
555+
"description": "Maximum number of committed snapshot files to retain. When the number of committed snapshots exceeds this value, the oldest snapshots are deleted. Must be at least 1 if set. If null or unset, no automated snapshot garbage collection is performed.",
556+
"minimum": 1
557+
},
558+
"interval": {
559+
"type": "string",
560+
"default": "30s",
561+
"description": "Time interval at which to scan the snapshot directory and delete old committed snapshots in excess of max_snapshots. This periodic cleanup executes regardless of the node's status (primary or backup)."
562+
}
563+
},
564+
"description": "This section includes configuration for periodic cleanup of old files (snapshots, ledger chunks)",
565+
"additionalProperties": false
566+
},
549567
"logging": {
550568
"type": "object",
551569
"properties": {

doc/operations/ledger_snapshot.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ Committed snapshot files are named ``snapshot_<seqno>_<evidence_seqno>.committed
178178

179179
Uncommitted snapshot files, i.e. those whose evidence has not yet been committed, are named ``snapshot_<seqno>_<evidence_seqno>``. These files will be ignored by CCF when joining or recovering a service, as no evidence can attest to their validity.
180180

181+
.. note:: The ``files_cleanup.max_snapshots`` configuration entry can be used to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. This is useful to control the local persistent storage footprint of a node. The value must be at least 1 if set. The snapshot directory is scanned periodically, at the interval given by ``files_cleanup.interval`` (default ``30s``).
182+
181183
Join or Recover From Snapshot
182184
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183185

include/ccf/node/startup_config.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,15 @@ namespace ccf
116116
bool operator==(const Snapshots&) const = default;
117117
};
118118
Snapshots snapshots = {};
119+
120+
// Settings for the periodic cleanup of old files (JSON section
// "files_cleanup").
struct FilesCleanup
121+
{
122+
// Maximum number of committed snapshot files to retain. When unset (the
// default), no automated snapshot cleanup is performed. Must be at least 1
// if set.
std::optional<size_t> max_snapshots = std::nullopt;
123+
// Time between two scans of the snapshot directory. Only relevant when
// max_snapshots is set.
ccf::ds::TimeString interval = {"30s"};
124+
125+
bool operator==(const FilesCleanup&) const = default;
126+
};
127+
FilesCleanup files_cleanup = {};
119128
};
120129

121130
struct RecoveryDecisionProtocolConfig

src/common/configuration.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,11 @@ namespace ccf
114114
read_only_directory,
115115
backup_fetch);
116116

117+
// JSON field declarations for CCFConfig::FilesCleanup: no field is listed as
// required; max_snapshots and interval are both listed as optional.
DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCFConfig::FilesCleanup);
118+
DECLARE_JSON_REQUIRED_FIELDS(CCFConfig::FilesCleanup);
119+
DECLARE_JSON_OPTIONAL_FIELDS(
120+
CCFConfig::FilesCleanup, max_snapshots, interval);
121+
117122
DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCFConfig);
118123
DECLARE_JSON_REQUIRED_FIELDS(CCFConfig, network);
119124
DECLARE_JSON_OPTIONAL_FIELDS(
@@ -126,6 +131,7 @@ namespace ccf
126131
jwt,
127132
attestation,
128133
snapshots,
134+
files_cleanup,
129135
node_to_node_message_limit,
130136
historical_cache_soft_limit);
131137

src/host/run.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "enclave/entry_points.h"
3131
#include "handle_ring_buffer.h"
3232
#include "host/env.h"
33+
#include "host/snapshot_cleanup_timer.h"
3334
#include "http/curl.h"
3435
#include "json_schema.h"
3536
#include "lfs_file_handler.h"
@@ -620,6 +621,18 @@ namespace ccf
620621
config.snapshots.read_only_directory);
621622
snapshots.register_message_handlers(buffer_processor.get_dispatcher());
622623

624+
std::optional<asynchost::SnapshotCleanupTimer> snapshot_cleanup;
625+
if (
626+
config.files_cleanup.max_snapshots.has_value() &&
627+
std::chrono::milliseconds(config.files_cleanup.interval) >
628+
std::chrono::milliseconds::zero())
629+
{
630+
snapshot_cleanup.emplace(
631+
std::chrono::milliseconds(config.files_cleanup.interval),
632+
config.snapshots.directory,
633+
config.files_cleanup.max_snapshots.value());
634+
}
635+
623636
// handle LFS-related messages from the enclave
624637
asynchost::LFSFileHandler lfs_file_handler(
625638
writer_factory.create_writer_to_inside());

src/host/snapshot_cleanup_timer.h

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the Apache 2.0 License.
3+
#pragma once
4+
5+
#include "snapshots/filenames.h"
6+
#include "timer.h"
7+
8+
#include <filesystem>
9+
#include <string>
10+
11+
namespace asynchost
12+
{
13+
class SnapshotCleanupImpl
14+
{
15+
private:
16+
std::filesystem::path dir;
17+
size_t max_retained;
18+
19+
struct CleanupWork
20+
{
21+
std::filesystem::path dir;
22+
size_t max_retained;
23+
};
24+
25+
static void cleanup_old_snapshots(
26+
const std::filesystem::path& dir, size_t max_retained)
27+
{
28+
std::vector<std::filesystem::path> directories{dir};
29+
decltype(snapshots::find_committed_snapshots_in_directories(
30+
directories)) committed;
31+
try
32+
{
33+
committed =
34+
snapshots::find_committed_snapshots_in_directories(directories);
35+
}
36+
catch (const std::filesystem::filesystem_error& e)
37+
{
38+
LOG_FAIL_FMT(
39+
"Failed to list committed snapshots in {}: {}", dir, e.what());
40+
return;
41+
}
42+
catch (const std::exception& e)
43+
{
44+
LOG_FAIL_FMT(
45+
"Unexpected error while listing committed snapshots in {}: {}",
46+
dir,
47+
e.what());
48+
return;
49+
}
50+
51+
if (committed.size() > max_retained)
52+
{
53+
// committed is sorted descending by snapshot index, so the
54+
// oldest are at the end
55+
for (auto it = committed.rbegin();
56+
it != committed.rend() - max_retained;
57+
++it)
58+
{
59+
const auto& path = it->second;
60+
LOG_INFO_FMT(
61+
"Deleting old snapshot {} (retaining {})",
62+
path.filename(),
63+
max_retained);
64+
std::error_code ec;
65+
std::filesystem::remove(path, ec);
66+
if (ec)
67+
{
68+
LOG_FAIL_FMT(
69+
"Failed to delete old snapshot {}: {}",
70+
path.filename(),
71+
ec.message());
72+
}
73+
}
74+
}
75+
}
76+
77+
static void on_cleanup_work(uv_work_t* req)
78+
{
79+
auto* work = static_cast<CleanupWork*>(req->data);
80+
cleanup_old_snapshots(work->dir, work->max_retained);
81+
}
82+
83+
static void on_cleanup_work_done(uv_work_t* req, int /*status*/)
84+
{
85+
auto* work = static_cast<CleanupWork*>(req->data);
86+
LOG_DEBUG_FMT("Snapshot cleanup completed");
87+
delete work; // NOLINT(cppcoreguidelines-owning-memory)
88+
delete req; // NOLINT(cppcoreguidelines-owning-memory)
89+
}
90+
91+
public:
92+
SnapshotCleanupImpl(const std::string& dir_, size_t max_retained_) :
93+
dir(dir_),
94+
max_retained(max_retained_)
95+
{
96+
if (max_retained < 1)
97+
{
98+
throw std::logic_error(fmt::format(
99+
"files_cleanup.max_snapshots must be at least 1, got {}",
100+
max_retained));
101+
}
102+
}
103+
104+
void on_timer()
105+
{
106+
// NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
107+
auto* work = new CleanupWork{.dir = dir, .max_retained = max_retained};
108+
// NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
109+
auto* req = new uv_work_t;
110+
req->data = work;
111+
int rc = uv_queue_work(
112+
uv_default_loop(), req, &on_cleanup_work, &on_cleanup_work_done);
113+
if (rc < 0)
114+
{
115+
LOG_FAIL_FMT(
116+
"Failed to queue snapshot cleanup work: {}", uv_strerror(rc));
117+
delete work; // NOLINT(cppcoreguidelines-owning-memory)
118+
delete req; // NOLINT(cppcoreguidelines-owning-memory)
119+
}
120+
}
121+
};
122+
123+
using SnapshotCleanupTimer = proxy_ptr<Timer<SnapshotCleanupImpl>>;
124+
}

src/snapshots/snapshot_manager.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ namespace snapshots
2424

2525
const fs::path snapshot_dir;
2626
const std::optional<fs::path> read_snapshot_dir = std::nullopt;
27-
2827
struct PendingSnapshot
2928
{
3029
::consensus::Index evidence_idx;
@@ -62,6 +61,9 @@ namespace snapshots
6261
}
6362
}
6463

64+
SnapshotManager(const SnapshotManager&) = delete;
65+
SnapshotManager& operator=(const SnapshotManager&) = delete;
66+
6567
[[nodiscard]] fs::path get_main_directory() const
6668
{
6769
return snapshot_dir;

tests/config.jinja

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,12 @@
7373
"target_rpc_interface": "{{ backup_snapshot_fetch_target_rpc_interface }}"{% endif %}{% if backup_snapshot_fetch_max_size %},
7474
"max_size": "{{ backup_snapshot_fetch_max_size }}"{% endif %}
7575
}{% endif %}
76-
},
76+
},{% if files_cleanup_max_snapshots or files_cleanup_interval %}
77+
"files_cleanup":
78+
{
79+
{% if files_cleanup_max_snapshots %}"max_snapshots": {{ files_cleanup_max_snapshots }}{% endif %}{% if files_cleanup_max_snapshots and files_cleanup_interval %},
80+
{% endif %}{% if files_cleanup_interval %}"interval": "{{ files_cleanup_interval }}"{% endif %}
81+
},{% endif %}
7782
"logging":
7883
{
7984
"host_level": "{{ host_log_level }}",

0 commit comments

Comments
 (0)