From 7f3f7793fea50e4a87a93be5fae3275f5313dfa9 Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Wed, 27 May 2026 21:18:15 -0400 Subject: [PATCH 01/10] Plumb crash report lifecycle configuration Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../debug/crashreport/inproccrashreporter.h | 4 ++ src/coreclr/inc/clrconfigvalues.h | 2 + src/coreclr/vm/crashreportstackwalker.cpp | 57 ++++++++++++++++++- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.h b/src/coreclr/debug/crashreport/inproccrashreporter.h index 71e9148bec8a4d..54a66907ee31b2 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.h +++ b/src/coreclr/debug/crashreport/inproccrashreporter.h @@ -21,6 +21,8 @@ // - 256 for identifiers (process name, type/class/exception names). static constexpr size_t CRASHREPORT_PATH_BUFFER_SIZE = 1024; static constexpr size_t CRASHREPORT_STRING_BUFFER_SIZE = 256; +static constexpr int32_t CRASHREPORT_DEFAULT_MAX_FILE_COUNT = 32; +static constexpr int32_t CRASHREPORT_UNLIMITED_FILE_COUNT = -1; #if defined(__ANDROID__) static const char CRASHREPORT_LOG_TAG[] = "DOTNET_CRASH"; @@ -74,11 +76,13 @@ using InProcCrashReportModuleInfoCallback = bool (*)( struct InProcCrashReporterSettings { const char* reportPath; + const char* lifecycleRootPath; InProcCrashReportIsManagedThreadCallback isManagedThreadCallback; InProcCrashReportWalkStackCallback walkStackCallback; InProcCrashReportEnumerateThreadsCallback enumerateThreadsCallback; InProcCrashReportModuleInfoCallback moduleInfoCallback; uint32_t frameLimitPerThread; + int32_t maxFileCount; }; // Free-function entry point used by the runtime to wire the in-proc crash diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 324e2b7241bed5..9a26dc367ec17e 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -578,6 +578,8 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_DbgMiniDumpType, W("DbgMiniDumpType"), 0, "Cra RETAIL_CONFIG_DWORD_INFO(INTERNAL_CreateDumpDiagnostics, W("CreateDumpDiagnostics"), 0, "Enable crash dump generation diagnostic logging") RETAIL_CONFIG_DWORD_INFO(INTERNAL_CrashReportBeforeSignalChaining, W("CrashReportBeforeSignalChaining"), 0, "Enable crash report generation before chaining to previous signal handler") RETAIL_CONFIG_DWORD_INFO_EX(INTERNAL_CrashReportFrameLimitPerThread, W("CrashReportFrameLimitPerThread"), 32, "Maximum number of managed stack frames per thread to emit in the in-proc crash report's compact log; 0 disables the limit; remaining frames are summarized as '... +N more frames'", CLRConfig::LookupOptions::ParseIntegerAsBase10) +RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportRootPath, W("CrashReportRootPath"), "Root path for lifecycle-managed in-proc crash report JSON files") +RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportMaxFileCount, W("CrashReportMaxFileCount"), "Maximum number of lifecycle-managed in-proc crash report JSON files to retain; default 32, -1 unlimited, 0 cleanup-only") /// /// R2R diff --git a/src/coreclr/vm/crashreportstackwalker.cpp b/src/coreclr/vm/crashreportstackwalker.cpp index 1a259cb2a23cec..28e21804287f84 100644 --- a/src/coreclr/vm/crashreportstackwalker.cpp +++ b/src/coreclr/vm/crashreportstackwalker.cpp @@ -10,7 +10,10 @@ #include "peassembly.h" #include #include +#include +#include #include +#include #ifdef FEATURE_INPROC_CRASHREPORT @@ -587,6 +590,43 @@ CrashReportEnumerateThreads( } } +// This runs during configuration, not from the crash signal path. It uses +// strtol because TryAsInteger parses into DWORD via strtoul and cannot +// represent the -1 unlimited-retention sentinel for this lifecycle setting. +static +int32_t +GetCrashReportMaxFileCount() +{ + int32_t maxFileCount = CRASHREPORT_DEFAULT_MAX_FILE_COUNT; + + CLRConfigNoCache maxFileCountCfg = CLRConfigNoCache::Get("CrashReportMaxFileCount", /*noprefix*/ false, &getenv); + if (!maxFileCountCfg.IsSet()) + { + return maxFileCount; + } + + const char* maxFileCountString = maxFileCountCfg.AsString(); + if (maxFileCountString == nullptr) + { + return maxFileCount; + } + + errno = 0; + char* end = nullptr; + long configuredMaxFileCount = strtol(maxFileCountString, &end, 10); + if (end != maxFileCountString && *end == '\0' && errno != ERANGE && + configuredMaxFileCount >= CRASHREPORT_UNLIMITED_FILE_COUNT && configuredMaxFileCount <= INT32_MAX) + { + maxFileCount = static_cast(configuredMaxFileCount); + } + else + { + InProcCrashReportLogInitializationFailure(".NET crash report using default CrashReportMaxFileCount: invalid configured value"); + } + + return maxFileCount; +} + void CrashReportConfigure() { @@ -612,11 +652,22 @@ CrashReportConfigure() return; } - CLRConfigNoCache dmpNameCfg = CLRConfigNoCache::Get("DbgMiniDumpName", /*noprefix*/ false, &getenv); - const char* dumpName = dmpNameCfg.IsSet() ? dmpNameCfg.AsString() : nullptr; + CLRConfigNoCache crashReportRootPathCfg = CLRConfigNoCache::Get("CrashReportRootPath", /*noprefix*/ false, &getenv); + const char* crashReportRootPath = crashReportRootPathCfg.IsSet() ? crashReportRootPathCfg.AsString() : nullptr; + bool lifecycleRootConfigured = crashReportRootPath != nullptr && crashReportRootPath[0] != '\0'; InProcCrashReporterSettings settings = {}; - settings.reportPath = dumpName; + if (lifecycleRootConfigured) + { + settings.lifecycleRootPath = crashReportRootPath; + settings.maxFileCount = GetCrashReportMaxFileCount(); + } + else + { + CLRConfigNoCache dmpNameCfg = CLRConfigNoCache::Get("DbgMiniDumpName", /*noprefix*/ false, &getenv); + settings.reportPath = dmpNameCfg.IsSet() ? dmpNameCfg.AsString() : nullptr; + } + settings.isManagedThreadCallback = CrashReportIsCurrentThreadManaged; settings.walkStackCallback = CrashReportWalkStack; settings.enumerateThreadsCallback = CrashReportEnumerateThreads; From 7d803e85d44b29b54d867e6630fa547bdbe9aadc Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Wed, 27 May 2026 21:18:15 -0400 Subject: [PATCH 02/10] Add lifecycle-managed crash report storage Introduce InProcCrashReportLifecycle to own the on-disk lifecycle of in-proc crash reports: choosing the output directory, validating its writability, generating report names, and enforcing the retention cap so the crash path only has to emit a report into a prepared location. The crash-time entry points (preparing the report file, building paths) remain allocation-free and signal-safe; the initialization-time work (directory probing, pruning) runs once during startup before the crash handler is armed and may use the heap and non-async-signal-safe calls. Shared string copy/append helpers used by both the reporter and the lifecycle are factored into CrashReportStringUtils (crashreportstringutils.{h,cpp}) so the two callers no longer carry duplicate implementations. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/debug/crashreport/CMakeLists.txt | 2 + .../crashreport/crashreportstringutils.cpp | 45 + .../crashreport/crashreportstringutils.h | 32 + .../debug/crashreport/inproccrashreporter.cpp | 117 +-- .../debug/crashreport/inproccrashreporter.h | 1 + .../inproccrashreportlifecycle.cpp | 767 ++++++++++++++++++ .../crashreport/inproccrashreportlifecycle.h | 201 +++++ 7 files changed, 1113 insertions(+), 52 deletions(-) create mode 100644 src/coreclr/debug/crashreport/crashreportstringutils.cpp create mode 100644 src/coreclr/debug/crashreport/crashreportstringutils.h create mode 100644 src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp create mode 100644 src/coreclr/debug/crashreport/inproccrashreportlifecycle.h diff --git a/src/coreclr/debug/crashreport/CMakeLists.txt b/src/coreclr/debug/crashreport/CMakeLists.txt index 22c66d611422ba..08e99596114a85 100644 --- a/src/coreclr/debug/crashreport/CMakeLists.txt +++ b/src/coreclr/debug/crashreport/CMakeLists.txt @@ -1,9 +1,11 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CRASHREPORT_SOURCES + crashreportstringutils.cpp signalsafeformatter.cpp signalsafejsonwriter.cpp signalsafeconsolewriter.cpp + inproccrashreportlifecycle.cpp inproccrashreporter.cpp ) diff --git a/src/coreclr/debug/crashreport/crashreportstringutils.cpp b/src/coreclr/debug/crashreport/crashreportstringutils.cpp new file mode 100644 index 00000000000000..ee8872ac1a72ee --- /dev/null +++ b/src/coreclr/debug/crashreport/crashreportstringutils.cpp @@ -0,0 +1,45 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include "crashreportstringutils.h" + +#include + +void CrashReportStringUtils::CopyString(char* buffer, size_t bufferSize, const char* value) +{ + if (buffer == nullptr || bufferSize == 0) + { + return; + } + + if (value == nullptr) + { + buffer[0] = '\0'; + return; + } + + size_t toCopy = strnlen(value, bufferSize - 1); + if (toCopy != 0) + { + memcpy(buffer, value, toCopy); + } + + buffer[toCopy] = '\0'; +} + +bool CrashReportStringUtils::AppendString(char* buffer, size_t bufferSize, size_t* pos, const char* value) +{ + if (buffer == nullptr || pos == nullptr || value == nullptr || bufferSize == 0) + { + return false; + } + + size_t p = *pos; + while (*value != '\0' && p + 1 < bufferSize) + { + buffer[p++] = *value++; + } + buffer[p] = '\0'; + *pos = p; + return *value == '\0'; +} diff --git a/src/coreclr/debug/crashreport/crashreportstringutils.h b/src/coreclr/debug/crashreport/crashreportstringutils.h new file mode 100644 index 00000000000000..79c916610da48d --- /dev/null +++ b/src/coreclr/debug/crashreport/crashreportstringutils.h @@ -0,0 +1,32 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// Shared, allocation-free, bounds-safe string helpers used by both the in-proc +// crash reporter and the crash report lifecycle. These are safe to call from the +// signal/crash path: they perform no heap allocation and never call into the +// runtime. + +#pragma once + +#include + +namespace CrashReportStringUtils +{ + // Copies value into buffer, truncating if necessary, and always + // null-terminates. A null value yields an empty string. No-op if buffer is + // null or bufferSize is 0. + void CopyString( + char* buffer, + size_t bufferSize, + const char* value); + + // Appends value to buffer starting at *pos, advancing *pos past the appended + // characters, and always null-terminates. Returns false (without advancing + // past the buffer) if the value did not fit, the arguments are invalid, or + // bufferSize is 0. + bool AppendString( + char* buffer, + size_t bufferSize, + size_t* pos, + const char* value); +} diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.cpp b/src/coreclr/debug/crashreport/inproccrashreporter.cpp index 95d62e95a38651..73722573b42cd4 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreporter.cpp @@ -6,6 +6,8 @@ // Streams a createdump-shaped JSON skeleton to a crashreport.json file. #include "inproccrashreporter.h" +#include "inproccrashreportlifecycle.h" +#include "crashreportstringutils.h" #include "signalsafeconsolewriter.h" #include "signalsafejsonwriter.h" #include "signalsafeformatter.h" @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -96,28 +99,6 @@ struct StackOverflowTraceSnapshot static char sccsid[] = "@(#)Version N/A"; #endif -static void CopyStringToBuffer(char* buffer, size_t bufferSize, const char* value) -{ - if (buffer == nullptr || bufferSize == 0) - { - return; - } - - if (value == nullptr) - { - buffer[0] = '\0'; - return; - } - - size_t toCopy = strnlen(value, bufferSize - 1); - if (toCopy != 0) - { - memcpy(buffer, value, toCopy); - } - - buffer[toCopy] = '\0'; -} - #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) // Query a sysctl by name into a caller-supplied buffer. Called from Initialize, NOT from the // signal handler -- sysctl/sysctlbyname is not on POSIX's async-signal-safe list, so the @@ -428,7 +409,9 @@ class InProcCrashReporter InProcCrashReportModuleInfoCallback m_moduleInfoCallback = nullptr; volatile LONG m_crashKind = static_cast(InProcCrashReportCrashKind::Unknown); uint32_t m_frameLimitPerThread = 0; - char m_reportPath[CRASHREPORT_PATH_BUFFER_SIZE]; + InProcCrashReportLifecycle m_lifecycle; + bool m_manageReportLifecycle = false; + char m_configuredReportPath[CRASHREPORT_PATH_BUFFER_SIZE]; char m_reportFilePath[CRASHREPORT_PATH_BUFFER_SIZE]; char m_processName[CRASHREPORT_STRING_BUFFER_SIZE]; char m_hostName[CRASHREPORT_STRING_BUFFER_SIZE]; @@ -608,25 +591,34 @@ InProcCrashReporter::CreateReport( { return; } - m_reportFilePath[0] = '\0'; - // The JSON file sink is only enabled when DbgMiniDumpName supplied a - // template AND the template expanded to a valid path. Otherwise the + // The JSON file sink is enabled by lifecycle-managed output or by a + // DbgMiniDumpName template that expands to a valid path. Otherwise the // crash report runs in compact-log-only mode: the JSON emitter still - // executes (so it can keep its bookkeeping consistent) but writes go - // to a no-op DiscardOutputCallback instead of an open fd. - bool jsonEnabled = m_reportPath[0] != '\0' && BuildReportPath(); + // executes (so it can keep its bookkeeping consistent) but writes go to a + // no-op DiscardOutputCallback instead of an open fd. + bool jsonEnabled = false; int fd = -1; - if (jsonEnabled) + if (m_manageReportLifecycle) + { + jsonEnabled = m_lifecycle.IsReportFileOutputEnabled() && + m_lifecycle.PrepareReportFile(&m_formatter, m_reportFilePath, sizeof(m_reportFilePath), &fd); + } + else { - fd = open(m_reportFilePath, O_WRONLY | O_CREAT | O_TRUNC, 0600); - if (fd == -1) + jsonEnabled = m_configuredReportPath[0] != '\0' && BuildReportPath(); + if (jsonEnabled) { - jsonEnabled = false; + fd = open(m_reportFilePath, O_WRONLY | O_CREAT | O_TRUNC, 0600); } } + if (jsonEnabled && fd == -1) + { + jsonEnabled = false; + } + InProcCrashReportCrashKind crashKind = static_cast( InterlockedExchange(&m_crashKind, static_cast(InProcCrashReportCrashKind::Unknown))); @@ -744,7 +736,9 @@ InProcCrashReporter::Initialize( m_frameLimitPerThread = settings.frameLimitPerThread; m_crashKind = static_cast(InProcCrashReportCrashKind::Unknown); m_stackOverflowTrace.available = 0; - CrashReportHelpers::CopyString(m_reportPath, sizeof(m_reportPath), settings.reportPath); + m_manageReportLifecycle = false; + m_configuredReportPath[0] = '\0'; + m_reportFilePath[0] = '\0'; m_processName[0] = '\0'; #if defined(__ANDROID__) @@ -786,6 +780,28 @@ InProcCrashReporter::Initialize( m_hostName[0] = '\0'; } + if (settings.lifecycleRootPath != nullptr && settings.lifecycleRootPath[0] != '\0') + { + m_manageReportLifecycle = true; + size_t pos = 0; + if (CrashReportHelpers::AppendString( + m_configuredReportPath, + sizeof(m_configuredReportPath), + &pos, + settings.lifecycleRootPath)) + { + m_lifecycle.Initialize(m_configuredReportPath, m_processName, settings.maxFileCount); + } + else + { + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: invalid CrashReportRootPath"); + } + } + else + { + CrashReportHelpers::CopyString(m_configuredReportPath, sizeof(m_configuredReportPath), settings.reportPath); + } + #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) // Cache sysctl values at Initialize because sysctl/sysctlbyname is not on POSIX's // async-signal-safe list; CreateReport reads these from the signal-handler path. @@ -827,7 +843,7 @@ InProcCrashReporter::AddStackOverflowTraceFrame( } StackOverflowTraceFrame& frame = trace.frames[trace.frameCount++]; - CopyStringToBuffer(frame.methodName, sizeof(frame.methodName), methodName); + CrashReportStringUtils::CopyString(frame.methodName, sizeof(frame.methodName), methodName); frame.repeatCount = repeatCount; frame.repeatSequenceLength = repeatSequenceLength; } @@ -849,7 +865,10 @@ InProcCrashReportSignalDispatcher(int signal, void* siginfo, void* context) return; } + // Preserve the interrupted context's errno before the crash reporter uses syscalls. + int savedErrno = errno; reporter->CreateReport(signal, context); + errno = savedErrno; } void @@ -1114,7 +1133,7 @@ InProcCrashReporter::ExpandDumpTemplate( bool InProcCrashReporter::BuildReportPath() { - if (m_reportPath[0] == '\0') + if (m_configuredReportPath[0] == '\0') { return false; } @@ -1122,7 +1141,7 @@ InProcCrashReporter::BuildReportPath() size_t pos = ExpandDumpTemplate( m_reportFilePath, sizeof(m_reportFilePath), - m_reportPath); + m_configuredReportPath); if (pos == 0) { return false; @@ -1182,19 +1201,7 @@ CrashReportHelpers::AppendString( size_t* pos, const char* value) { - if (buffer == nullptr || pos == nullptr || value == nullptr || bufferSize == 0) - { - return false; - } - - size_t p = *pos; - while (*value != '\0' && p + 1 < bufferSize) - { - buffer[p++] = *value++; - } - buffer[p] = '\0'; - *pos = p; - return *value == '\0'; + return CrashReportStringUtils::AppendString(buffer, bufferSize, pos, value); } void @@ -1398,7 +1405,7 @@ CrashReportHelpers::CopyString( size_t bufferSize, const char* value) { - CopyStringToBuffer(buffer, bufferSize, value); + CrashReportStringUtils::CopyString(buffer, bufferSize, value); } void @@ -2307,7 +2314,13 @@ InProcCrashReporter::EndJsonReport( writeFailed = true; } - if (close(fd) != 0 || !finishSucceeded || writeFailed) + bool closeSucceeded = close(fd) == 0; + bool reportSucceeded = finishSucceeded && !writeFailed && closeSucceeded; + if (m_manageReportLifecycle) + { + m_lifecycle.FinishReportFile(reportSucceeded, m_reportFilePath); + } + else if (!reportSucceeded) { unlink(m_reportFilePath); } diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.h b/src/coreclr/debug/crashreport/inproccrashreporter.h index 54a66907ee31b2..b72077e1a75c0a 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.h +++ b/src/coreclr/debug/crashreport/inproccrashreporter.h @@ -23,6 +23,7 @@ static constexpr size_t CRASHREPORT_PATH_BUFFER_SIZE = 1024; static constexpr size_t CRASHREPORT_STRING_BUFFER_SIZE = 256; static constexpr int32_t CRASHREPORT_DEFAULT_MAX_FILE_COUNT = 32; static constexpr int32_t CRASHREPORT_UNLIMITED_FILE_COUNT = -1; +static constexpr int32_t CRASHREPORT_CLEANUP_ONLY_FILE_COUNT = 0; #if defined(__ANDROID__) static const char CRASHREPORT_LOG_TAG[] = "DOTNET_CRASH"; diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp new file mode 100644 index 00000000000000..b2d6ef7c9f20b5 --- /dev/null +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -0,0 +1,767 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include "inproccrashreportlifecycle.h" + +#include "crashreportstringutils.h" +#include "pal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char CrashReportManagedRootDirectory[] = ".dotnet"; +static const char CrashReportManagedReportDirectory[] = "crash-reports"; +static const char CrashReportFilePrefix[] = "report-"; +static const char CrashReportFileExtension[] = ".crashreport.json"; +static const char CrashReportTempExtension[] = ".tmp"; +// Bounded retry avoids an unbounded crash-path loop if a process repeatedly +// crashes in the same second with the same pid and stale files are present. +static constexpr uint32_t CrashReportMaxSuffixRetry = 32; +// Directory scans run at initialization, but misconfigured roots should not let +// a diagnostic feature stall application startup indefinitely. +static constexpr size_t CrashReportMaxDirectoryEntriesToScan = 4096; + +InProcCrashReportLifecycle::~InProcCrashReportLifecycle() +{ + delete[] m_deleteCandidates; +} + +bool +InProcCrashReportLifecycle::Initialize( + const char* rootPath, + const char* processName, + int32_t maxFileCount) +{ + m_reportFileOutputEnabled = false; + m_reportDirectory[0] = '\0'; + m_tempReportFilePath[0] = '\0'; + + if (!EstablishReportDirectory(rootPath, processName)) + { + return false; + } + + RetentionMode mode = GetRetentionMode(maxFileCount); + + FileInfo* reports = nullptr; + size_t reportCount = 0; + if (CollectExistingReports(mode, &reports, &reportCount) != CollectResult::Ready) + { + // Both a hard failure and the cleanup-only path leave output disabled; the + // former already logged, the latter performed its deletions during the scan. + return false; + } + + if (mode == RetentionMode::Bounded && !SelectDeleteCandidates(reports, reportCount, maxFileCount)) + { + free(reports); + return false; + } + + free(reports); + m_reportFileOutputEnabled = true; + return true; +} + +InProcCrashReportLifecycle::RetentionMode +InProcCrashReportLifecycle::GetRetentionMode(int32_t maxFileCount) +{ + if (maxFileCount == CRASHREPORT_CLEANUP_ONLY_FILE_COUNT) + { + return RetentionMode::CleanupOnly; + } + + if (maxFileCount == CRASHREPORT_UNLIMITED_FILE_COUNT) + { + return RetentionMode::Unlimited; + } + + // The configuration layer constrains maxFileCount to the unlimited (-1), + // cleanup-only (0), or positive-bound domain, so anything else is a bound. + return RetentionMode::Bounded; +} + +bool +InProcCrashReportLifecycle::EstablishReportDirectory( + const char* rootPath, + const char* processName) +{ + if (rootPath == nullptr || rootPath[0] == '\0') + { + return false; + } + + char root[CRASHREPORT_PATH_BUFFER_SIZE]; + if (!ResolveRootPath(root, sizeof(root), rootPath)) + { + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: invalid CrashReportRootPath"); + return false; + } + + struct stat rootStat; + if (stat(root, &rootStat) != 0 || !S_ISDIR(rootStat.st_mode)) + { + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: CrashReportRootPath is not an existing directory"); + return false; + } + + char appName[CRASHREPORT_STRING_BUFFER_SIZE]; + SanitizePathComponent(appName, sizeof(appName), processName); + + size_t pos = 0; + if (!CrashReportStringUtils::AppendString(m_reportDirectory, sizeof(m_reportDirectory), &pos, root) || + !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, CrashReportManagedRootDirectory) || + !EnsureDirectory(m_reportDirectory) || + !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, CrashReportManagedReportDirectory) || + !EnsureDirectory(m_reportDirectory) || + !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, appName) || + !EnsureDirectory(m_reportDirectory) || + !ProbeDirectoryWritable(m_reportDirectory)) + { + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to initialize crash report directory"); + m_reportDirectory[0] = '\0'; + return false; + } + + return true; +} + +InProcCrashReportLifecycle::CollectResult +InProcCrashReportLifecycle::CollectExistingReports( + RetentionMode mode, + FileInfo** reports, + size_t* reportCount) +{ + *reports = nullptr; + *reportCount = 0; + + DIR* dir = opendir(m_reportDirectory); + if (dir == nullptr) + { + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to scan crash report directory"); + return CollectResult::Failed; + } + + FileInfo* collected = nullptr; + size_t collectedCount = 0; + size_t collectedCapacity = 0; + size_t scannedEntries = 0; + bool scanExceeded = false; + + while (dirent* entry = readdir(dir)) + { + if (++scannedEntries > CrashReportMaxDirectoryEntriesToScan) + { + scanExceeded = true; + break; + } + + FileInfo info = {}; + bool hasTempExtension = false; + bool parsedOwnedName = TryParseReportName(entry->d_name, &info, &hasTempExtension); + bool isTemp = parsedOwnedName && hasTempExtension; + bool isCompleted = parsedOwnedName && !hasTempExtension; + + char fullPath[CRASHREPORT_PATH_BUFFER_SIZE]; + fullPath[0] = '\0'; + size_t fullPathPos = 0; + if (!CrashReportStringUtils::AppendString(fullPath, sizeof(fullPath), &fullPathPos, m_reportDirectory) || + !AppendPathComponent(fullPath, sizeof(fullPath), &fullPathPos, entry->d_name)) + { + continue; + } + + if (isTemp) + { + if (!IsProcessAlive(info.pid)) + { + unlink(fullPath); + } + continue; + } + + if (!isCompleted) + { + continue; + } + + if (mode == RetentionMode::CleanupOnly) + { + unlink(fullPath); + continue; + } + + if (mode == RetentionMode::Unlimited) + { + continue; + } + + if (collectedCount == collectedCapacity) + { + size_t newCapacity = collectedCapacity == 0 ? 16 : collectedCapacity * 2; + FileInfo* grown = reinterpret_cast(realloc(collected, newCapacity * sizeof(FileInfo))); + if (grown == nullptr) + { + free(collected); + closedir(dir); + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to allocate retention scan storage"); + return CollectResult::Failed; + } + + collected = grown; + collectedCapacity = newCapacity; + } + + CrashReportStringUtils::CopyString(info.path.value, sizeof(info.path.value), fullPath); + collected[collectedCount++] = info; + } + + closedir(dir); + + if (scanExceeded) + { + free(collected); + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: too many files in crash report directory"); + return CollectResult::Failed; + } + + if (mode == RetentionMode::CleanupOnly) + { + free(collected); + return CollectResult::CleanupOnlyComplete; + } + + *reports = collected; + *reportCount = collectedCount; + return CollectResult::Ready; +} + +bool +InProcCrashReportLifecycle::SelectDeleteCandidates( + FileInfo* reports, + size_t reportCount, + int32_t maxFileCount) +{ + qsort(reports, reportCount, sizeof(FileInfo), &CompareFileInfo); + + size_t keepBeforeCrash = static_cast(maxFileCount - 1); + if (reportCount <= keepBeforeCrash) + { + return true; + } + + m_deleteCandidateCount = reportCount - keepBeforeCrash; + m_deleteCandidates = new (std::nothrow) ReportPath[m_deleteCandidateCount]; + if (m_deleteCandidates == nullptr) + { + m_deleteCandidateCount = 0; + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to allocate retention candidate storage"); + return false; + } + + for (size_t i = 0; i < m_deleteCandidateCount; i++) + { + CrashReportStringUtils::CopyString(m_deleteCandidates[i].value, sizeof(m_deleteCandidates[i].value), reports[i].path.value); + } + + return true; +} + +bool +InProcCrashReportLifecycle::PrepareReportFile( + SignalSafeFormatter* formatter, + char* reportFilePath, + size_t reportFilePathSize, + int* fd) +{ + if (formatter == nullptr || reportFilePath == nullptr || reportFilePathSize == 0 || + fd == nullptr || m_reportDirectory[0] == '\0') + { + return false; + } + + reportFilePath[0] = '\0'; + *fd = -1; + uint64_t timestamp = static_cast(time(nullptr)); + uint32_t pid = static_cast(GetCurrentProcessId()); + + // Retention candidates are deleted before opening the temp file so the + // completed-report set never exceeds the configured bound and old reports + // can free space for the new crash report. If the new write later fails, + // the deleted candidates are intentionally not restored. + DeleteCandidates(); + + for (uint32_t suffix = 0; suffix <= CrashReportMaxSuffixRetry; suffix++) + { + if (!BuildReportPaths(formatter, reportFilePath, reportFilePathSize, m_tempReportFilePath, sizeof(m_tempReportFilePath), timestamp, pid, suffix)) + { + return false; + } + + if (access(reportFilePath, F_OK) == 0) + { + continue; + } + + int tempFd = open(m_tempReportFilePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); + if (tempFd == -1) + { + if (errno == EEXIST) + { + continue; + } + + return false; + } + + *fd = tempFd; + return true; + } + + reportFilePath[0] = '\0'; + m_tempReportFilePath[0] = '\0'; + return false; +} + +void +InProcCrashReportLifecycle::FinishReportFile( + bool succeeded, + const char* reportFilePath) +{ + if (m_tempReportFilePath[0] == '\0') + { + return; + } + + if (succeeded && reportFilePath != nullptr && reportFilePath[0] != '\0') + { + (void)link(m_tempReportFilePath, reportFilePath); + } + + unlink(m_tempReportFilePath); + m_tempReportFilePath[0] = '\0'; +} + +bool +InProcCrashReportLifecycle::BuildReportPaths( + SignalSafeFormatter* formatter, + char* reportFilePath, + size_t reportFilePathSize, + char* tempReportFilePath, + size_t tempReportFilePathSize, + uint64_t timestamp, + uint32_t pid, + uint32_t suffix) +{ + reportFilePath[0] = '\0'; + tempReportFilePath[0] = '\0'; + + size_t pos = 0; + if (!CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, m_reportDirectory) || + !AppendPathComponent(reportFilePath, reportFilePathSize, &pos, CrashReportFilePrefix) || + !CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, formatter->FormatUnsignedDecimal(timestamp)) || + !CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, "-") || + !CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, formatter->FormatUnsignedDecimal(pid))) + { + return false; + } + + if (suffix != 0) + { + if (!CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, "-") || + !CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, formatter->FormatUnsignedDecimal(suffix))) + { + return false; + } + } + + if (!CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, CrashReportFileExtension)) + { + return false; + } + + size_t tempPos = 0; + return CrashReportStringUtils::AppendString(tempReportFilePath, tempReportFilePathSize, &tempPos, reportFilePath) && + CrashReportStringUtils::AppendString(tempReportFilePath, tempReportFilePathSize, &tempPos, CrashReportTempExtension); +} + +void +InProcCrashReportLifecycle::DeleteCandidates() +{ + for (size_t i = 0; i < m_deleteCandidateCount; i++) + { + if (m_deleteCandidates[i].value[0] != '\0') + { + unlink(m_deleteCandidates[i].value); + } + } +} + +bool +InProcCrashReportLifecycle::AppendPathComponent( + char* buffer, + size_t bufferSize, + size_t* pos, + const char* component) +{ + if (buffer == nullptr || pos == nullptr || component == nullptr || component[0] == '\0') + { + return false; + } + + if (*pos != 0 && buffer[*pos - 1] != '/') + { + if (!CrashReportStringUtils::AppendString(buffer, bufferSize, pos, "/")) + { + return false; + } + } + + while (*component == '/') + { + component++; + } + + return component[0] != '\0' && CrashReportStringUtils::AppendString(buffer, bufferSize, pos, component); +} + +bool +InProcCrashReportLifecycle::IsAbsolutePath(const char* path) +{ + return path != nullptr && path[0] == '/'; +} + +bool +InProcCrashReportLifecycle::ResolveRootPath( + char* buffer, + size_t bufferSize, + const char* rootPath) +{ + if (buffer == nullptr || bufferSize == 0 || rootPath == nullptr || rootPath[0] == '\0') + { + return false; + } + + buffer[0] = '\0'; + size_t pos = 0; + + if (rootPath[0] == '~' && (rootPath[1] == '\0' || rootPath[1] == '/')) + { + const char* home = getenv("HOME"); + if (home == nullptr || home[0] == '\0') + { + return false; + } + + if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, home)) + { + return false; + } + + rootPath++; + if (*rootPath == '/') + { + rootPath++; + } + if (*rootPath != '\0' && !AppendPathComponent(buffer, bufferSize, &pos, rootPath)) + { + return false; + } + } + else if (strncmp(rootPath, "$HOME", 5) == 0 && (rootPath[5] == '\0' || rootPath[5] == '/')) + { + const char* home = getenv("HOME"); + if (home == nullptr || home[0] == '\0') + { + return false; + } + + if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, home)) + { + return false; + } + + rootPath += 5; + if (*rootPath == '/') + { + rootPath++; + } + if (*rootPath != '\0' && !AppendPathComponent(buffer, bufferSize, &pos, rootPath)) + { + return false; + } + } + else if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, rootPath)) + { + return false; + } + + return IsAbsolutePath(buffer); +} + +bool +InProcCrashReportLifecycle::EnsureDirectory(const char* path) +{ + if (path == nullptr || path[0] == '\0') + { + return false; + } + + struct stat st; + if (stat(path, &st) == 0) + { + return S_ISDIR(st.st_mode); + } + + if (errno != ENOENT) + { + return false; + } + + if (mkdir(path, 0700) != 0) + { + return errno == EEXIST && stat(path, &st) == 0 && S_ISDIR(st.st_mode); + } + + return true; +} + +bool +InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) +{ + // Borrow the temp-report path buffer as scratch: it is unused until the + // crash path calls PrepareReportFile, so reusing it here avoids a second + // large path buffer while keeping the probe allocation-free. + char* probePath = m_tempReportFilePath; + probePath[0] = '\0'; + size_t pos = 0; + + // Use a hidden throwaway file to verify the directory allows create/delete. + SignalSafeFormatter formatter; + bool built = + CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, path) && + CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, "/.probe-") && + CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, formatter.FormatUnsignedDecimal(static_cast(GetCurrentProcessId()))); + + bool writable = false; + if (built) + { + int fd = open(probePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); + if (fd != -1) + { + bool closeSucceeded = close(fd) == 0; + bool unlinkSucceeded = unlink(probePath) == 0; + writable = closeSucceeded && unlinkSucceeded; + } + } + + // Leave the borrowed buffer empty so the crash path's "no temp file yet" + // invariant continues to hold after initialization. + m_tempReportFilePath[0] = '\0'; + return writable; +} + +static bool TryParseUnsigned( + const char** value, + uint64_t* result) +{ + const char* current = *value; + if (current == nullptr || *current < '0' || *current > '9') + { + return false; + } + + uint64_t parsed = 0; + do + { + uint64_t digit = static_cast(*current - '0'); + if (parsed > (UINT64_MAX - digit) / 10) + { + return false; + } + + parsed = parsed * 10 + digit; + current++; + } while (*current >= '0' && *current <= '9'); + + *value = current; + *result = parsed; + return true; +} + +bool +InProcCrashReportLifecycle::TryParseReportName( + const char* name, + FileInfo* info, + bool* isTempExtension) +{ + if (name == nullptr || info == nullptr || isTempExtension == nullptr) + { + return false; + } + + *isTempExtension = false; + + size_t prefixLength = sizeof(CrashReportFilePrefix) - 1; + size_t extensionLength = sizeof(CrashReportFileExtension) - 1; + + // The shortest name this function can accept is the prefix, a single + // timestamp digit, the '-' separator, a single pid digit, and the extension. + // "0-0" encodes that minimal timestamp-separator-pid core. Reject anything + // shorter up front so we never walk the per-part parse for a name that cannot + // possibly match. + size_t minimumLength = prefixLength + (sizeof("0-0") - 1) + extensionLength; + if (strlen(name) < minimumLength) + { + return false; + } + + if (strncmp(name, CrashReportFilePrefix, prefixLength) != 0) + { + return false; + } + + const char* current = name + prefixLength; + uint64_t timestamp = 0; + uint64_t pid = 0; + uint64_t suffix = 0; + + if (!TryParseUnsigned(¤t, ×tamp) || *current != '-') + { + return false; + } + current++; + + if (!TryParseUnsigned(¤t, &pid)) + { + return false; + } + + if (*current == '-') + { + current++; + if (!TryParseUnsigned(¤t, &suffix)) + { + return false; + } + } + + if (strncmp(current, CrashReportFileExtension, extensionLength) != 0) + { + return false; + } + current += extensionLength; + + if (*current != '\0') + { + size_t tempExtensionLength = sizeof(CrashReportTempExtension) - 1; + if (strncmp(current, CrashReportTempExtension, tempExtensionLength) != 0 || + current[tempExtensionLength] != '\0') + { + return false; + } + + *isTempExtension = true; + } + + info->timestamp = timestamp; + info->pid = pid; + info->suffix = suffix; + return true; +} + +bool +InProcCrashReportLifecycle::IsProcessAlive(uint64_t pid) +{ + if (pid == 0 || pid > static_cast(INT_MAX)) + { + return false; + } + + // Signal 0 probes for process existence without delivering a signal. + if (kill(static_cast(pid), 0) == 0) + { + return true; + } + + return errno != ESRCH; +} + +int +InProcCrashReportLifecycle::CompareFileInfo( + const void* left, + const void* right) +{ + const FileInfo* leftInfo = reinterpret_cast(left); + const FileInfo* rightInfo = reinterpret_cast(right); + + if (leftInfo->timestamp < rightInfo->timestamp) + { + return -1; + } + if (leftInfo->timestamp > rightInfo->timestamp) + { + return 1; + } + if (leftInfo->suffix < rightInfo->suffix) + { + return -1; + } + if (leftInfo->suffix > rightInfo->suffix) + { + return 1; + } + return strcmp(leftInfo->path.value, rightInfo->path.value); +} + +bool +InProcCrashReportLifecycle::IsSafePathCharacter(char c) +{ + return (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '.' || + c == '_' || + c == '-'; +} + +void +InProcCrashReportLifecycle::SanitizePathComponent( + char* buffer, + size_t bufferSize, + const char* value) +{ + if (buffer == nullptr || bufferSize == 0) + { + return; + } + + const char* source = (value != nullptr && value[0] != '\0') ? value : "unknown"; + size_t pos = 0; + while (*source != '\0' && pos + 1 < bufferSize) + { + buffer[pos++] = IsSafePathCharacter(*source) ? *source : '_'; + source++; + } + + if (pos == 0) + { + const char fallback[] = "unknown"; + for (size_t i = 0; fallback[i] != '\0' && pos + 1 < bufferSize; i++) + { + buffer[pos++] = fallback[i]; + } + } + + buffer[pos] = '\0'; +} diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h new file mode 100644 index 00000000000000..38a9e9e82713a0 --- /dev/null +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h @@ -0,0 +1,201 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma once + +#include "inproccrashreporter.h" +#include "signalsafeformatter.h" + +#include +#include + +// Manages the on-disk lifecycle of in-proc crash reports: at startup it +// establishes the managed report directory, prunes stale and over-retention +// reports, and selects deletion candidates; on a crash it hands out a uniquely +// named temp report file and finalizes it. Composed alongside (not derived from) +// the signal-safe writer family that emits the report contents. +// +// Members run in one of two execution contexts: +// * Initialization path -- runs once at process startup. May allocate and call +// libc/filesystem APIs; NOT async-signal-safe. This is the default contract +// for members of this class. +// * Crash/signal path -- invoked from the crash signal handler. Must be +// async-signal-safe and allocation-free. Only these members run there: +// IsReportFileOutputEnabled, PrepareReportFile, FinishReportFile, +// BuildReportPaths, DeleteCandidates, and the shared AppendPathComponent. +// Each is marked accordingly below. +class InProcCrashReportLifecycle +{ +public: + InProcCrashReportLifecycle() = default; + ~InProcCrashReportLifecycle(); + InProcCrashReportLifecycle(const InProcCrashReportLifecycle&) = delete; + InProcCrashReportLifecycle& operator=(const InProcCrashReportLifecycle&) = delete; + + // Prepares lifecycle-managed storage for crash reports under rootPath for the + // given processName, keeping at most maxFileCount completed reports + // (CRASHREPORT_UNLIMITED_FILE_COUNT for no limit, + // CRASHREPORT_CLEANUP_ONLY_FILE_COUNT for cleanup-only). Runs at startup (not + // the crash path) and may allocate. Returns false if storage could not be + // initialized or output is disabled. + bool Initialize( + const char* rootPath, + const char* processName, + int32_t maxFileCount); + + // Returns whether lifecycle-managed report files should be written. False when + // initialization failed or when output is intentionally disabled (for example + // the cleanup-only mode, maxFileCount == CRASHREPORT_CLEANUP_ONLY_FILE_COUNT, + // which still initializes successfully). Crash/signal-path safe: reads one bool. + bool IsReportFileOutputEnabled() const { return m_reportFileOutputEnabled; } + + // Opens a uniquely named temporary report file under the managed directory, + // returning its path and an open fd. Deletes any over-retention candidates + // first. Runs on the crash/signal path: allocation-free and signal-safe. + // Returns false if no file could be opened. + bool PrepareReportFile( + SignalSafeFormatter* formatter, + char* reportFilePath, + size_t reportFilePathSize, + int* fd); + + // Finalizes the report opened by PrepareReportFile: on success links the temp + // file to its final reportFilePath, then removes the temp file in all cases. + // Runs on the crash/signal path: allocation-free and signal-safe. + void FinishReportFile( + bool succeeded, + const char* reportFilePath); + +private: + // How existing and future reports are retained, derived once from the + // configured maxFileCount so the scan does not re-test sentinels per entry. + enum class RetentionMode + { + CleanupOnly, // delete every completed report and leave output disabled + Unlimited, // keep every completed report; never prune + Bounded, // keep at most maxFileCount completed reports + }; + + // Outcome of CollectExistingReports, distinguishing a hard failure from the + // intentional cleanup-only path (both leave report output disabled). + enum class CollectResult + { + Failed, + CleanupOnlyComplete, + Ready, + }; + + struct ReportPath + { + char value[CRASHREPORT_PATH_BUFFER_SIZE]; + }; + + struct FileInfo + { + uint64_t timestamp; + uint64_t pid; + uint64_t suffix; + ReportPath path; + }; + + // Maps the configured maxFileCount onto the retention mode that governs how + // the existing-report scan and pruning behave. + static RetentionMode GetRetentionMode(int32_t maxFileCount); + + // Resolves rootPath/processName into m_reportDirectory, creating the managed + // directory tree and verifying it is writable. Initialization path; logs and + // returns false on failure. + bool EstablishReportDirectory( + const char* rootPath, + const char* processName); + + // Scans m_reportDirectory: removes stale temp files and, in CleanupOnly mode, + // every completed report; otherwise collects completed reports into a newly + // allocated array transferred to the caller on CollectResult::Ready. + // Initialization path; may allocate. + CollectResult CollectExistingReports( + RetentionMode mode, + FileInfo** reports, + size_t* reportCount); + + // Sorts the collected reports oldest-first and records the over-retention + // entries (beyond maxFileCount - 1, reserving one slot for the imminent + // report) into m_deleteCandidates. Initialization path; logs and returns + // false if the candidate storage allocation fails. + bool SelectDeleteCandidates( + FileInfo* reports, + size_t reportCount, + int32_t maxFileCount); + + // Builds the final and temporary report paths from the timestamp/pid/suffix + // into the caller-provided buffers. Crash-path, allocation-free. + bool BuildReportPaths( + SignalSafeFormatter* formatter, + char* reportFilePath, + size_t reportFilePathSize, + char* tempReportFilePath, + size_t tempReportFilePathSize, + uint64_t timestamp, + uint32_t pid, + uint32_t suffix); + + // Unlinks the over-retention reports selected during Initialize. Crash-path. + void DeleteCandidates(); + + // Appends component as a new path segment (inserting a single '/' separator as + // needed). Allocation-free; used by both the init and crash paths. + static bool AppendPathComponent( + char* buffer, + size_t bufferSize, + size_t* pos, + const char* component); + + // Returns whether path is absolute (begins with '/'). Initialization path. + static bool IsAbsolutePath(const char* path); + + // Resolves rootPath (expanding a leading '~' or '$HOME') into an absolute path. + static bool ResolveRootPath( + char* buffer, + size_t bufferSize, + const char* rootPath); + + // Creates the directory if missing; succeeds if it already exists as a directory. + static bool EnsureDirectory(const char* path); + + // Verifies the directory permits create/delete by creating and removing a + // hidden probe file. Borrows the (init-time idle) m_tempReportFilePath buffer + // rather than placing a second large path buffer on the stack. + bool ProbeDirectoryWritable(const char* path); + + // Parses a managed report file name (report--[-]), + // accepting either a completed report or the in-progress .tmp form, into info. + // Reports which form matched through isTempExtension. + static bool TryParseReportName( + const char* name, + FileInfo* info, + bool* isTempExtension); + + // Returns whether a process with the given pid currently exists. + static bool IsProcessAlive(uint64_t pid); + + // qsort comparator ordering reports oldest-first (timestamp, then suffix, then path). + static int CompareFileInfo( + const void* left, + const void* right); + + // Returns whether c is allowed unescaped in a path component. + static bool IsSafePathCharacter(char c); + + // Copies value into buffer, replacing unsafe characters with '_' and falling + // back to "unknown" when the result would be empty. + static void SanitizePathComponent( + char* buffer, + size_t bufferSize, + const char* value); + + char m_reportDirectory[CRASHREPORT_PATH_BUFFER_SIZE] = {}; + char m_tempReportFilePath[CRASHREPORT_PATH_BUFFER_SIZE] = {}; + ReportPath* m_deleteCandidates = nullptr; + size_t m_deleteCandidateCount = 0; + bool m_reportFileOutputEnabled = false; +}; From 01938d8731d860974f99198bad72669824e23b70 Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Thu, 28 May 2026 20:45:39 -0400 Subject: [PATCH 03/10] Document crash report lifecycle design Add design notes co-located with the in-proc crash report lifecycle source, capturing the Android/iOS crash-artifact investigation, the configuration and on-disk layout, the init- vs crash/signal-path split, the retention model, and the key design decisions and review findings that shaped the implementation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inproccrashreportlifecycle.design.md | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md new file mode 100644 index 00000000000000..bc4135ec6fb263 --- /dev/null +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md @@ -0,0 +1,166 @@ +# In-proc crash report lifecycle — design notes + +Design rationale for `InProcCrashReportLifecycle` (`inproccrashreportlifecycle.h` / +`inproccrashreportlifecycle.cpp`). It records *why* the lifecycle behaves the way +it does, derived from an investigation of how Android and iOS manage their own +crash artifacts and a brainstorming/grill-me design session. It is documentation, +not a spec — the code is the source of truth. + +## Problem + +The in-proc crash reporter (`InProcCrashReporter`) emits a createdump-shaped +`*.crashreport.json` payload from the crash signal handler. On mobile, disk space +is constrained, so crash reports that pile up without bound can bloat the device — +yet there was no durable, app-local place to put them and no mechanism to cap how +many are kept. + +The reporter could write a file only when `DOTNET_DbgMiniDumpName` was set (writing +`.crashreport.json`), emitting compact logs otherwise. +That path is relatively new and carries no notion of a managed location or of +retaining/pruning past reports; it stays supported here, but it is recent enough +that we could drop it if we chose to. + +## Background: how the platforms manage crash artifacts + +**Android (tombstones).** The OS keeps a *bounded rotating set* of tombstone +files. `tombstoned` writes completed dumps to `/data/tombstones/tombstone_00`, +`tombstone_01`, … The default maximum is **32** +(`tombstoned.max_tombstone_count`). It picks a missing slot first, otherwise the +oldest slot, then advances modulo the max. Dumps are first written to anonymous +temp files and only **linked** into the final `tombstone_NN` path once the dumper +reports completion, so a failed/timed-out dump never leaves a named partial +tombstone. Because filenames are fixed slots, age is tracked by `mtime`. The +breadcrumb tying a crash to its file is the logcat line +`Tombstone written to: /data/tombstones/tombstone_06`. (See AOSP `debuggerd`/ +`tombstoned` and the Android debugging docs for the authoritative mechanics.) + +**iOS / Apple.** Crash reports are OS-managed diagnostic logs (`.ips`/`.crash`), +not a public circular tombstone directory. The system generates a report after +the app is allowed to finish crashing (an attached Xcode debugger intercepts the +crash until you detach). Reports live in the device's Analytics Data and are +surfaced/transferred via Xcode/Console, TestFlight, and the App Store. Retention +and pruning are Apple/OS-managed; there is **no** app-controlled, Android-style +fixed-slot rotation contract. (See Apple's developer documentation on acquiring +crash reports and diagnostic logs.) + +**Takeaways that shaped this design.** Borrow Android's robust mechanics — +default of 32, temp-file-then-link commit, bounded retention — but improve on the +fixed-slot model by encoding the crash time in the filename (so ordering does not +depend on fragile `mtime`), and accept that on iOS the runtime, not the OS, owns +this app-local directory. + +## Goals and non-goals + +- **Goal:** a durable, bounded, app-local directory of completed crash reports, + written safely from the crash signal handler. +- **Goal:** opt-in. No file output unless explicitly configured, so we never + silently start persisting files to a device. +- **Goal:** fail loud and disabled on misconfiguration rather than guessing. +- **Non-goal:** cross-process coordination/locking. Retention is best-effort when + multiple processes share a root (see *Cross-process*). +- **Non-goal:** replacing the OS crash facilities; this is the runtime's own JSON + report alongside them. + +## Configuration + +| Variable | Meaning | +| --- | --- | +| `DOTNET_CrashReportRootPath` | Activates file output. Root under which the managed directory is created. Supports a leading `~` or `$HOME` shorthand expanded to `getenv("HOME")`. Must resolve to an existing, absolute directory or output is disabled. | +| `DOTNET_CrashReportMaxFileCount` | Retention bound. Default **32**; **-1** unlimited; **0** cleanup-only. Parsed as a *signed* value (the usual DWORD config helper is unsigned); out-of-range values fall back to the default. | + +When `CrashReportRootPath` is unset, the legacy `DbgMiniDumpName + +.crashreport.json` behavior is unchanged — the root path supersedes the legacy +path only when set. + +## On-disk layout and naming + +``` +/.dotnet/crash-reports//report--[-].crashreport.json +``` + +- `` is the sanitized process name (unsafe path characters replaced with + `_`, falling back to `unknown`) — isolates retention per app under a shared root. +- `` (epoch seconds) and `` order reports by intended crash time; + the optional `-` disambiguates same-second/pid collisions. +- The `.crashreport.json` extension is the canonical one already emitted by + `createdump` and the existing reporter; external tooling recognizes it. The + `report-` prefix is this feature's marker/parse anchor. +- In-progress reports use a `…/report-….crashreport.json.tmp` temp file. + +On mobile, `HOME` is app-scoped (`context.getFilesDir()` on Android CoreCLR app +hosting; the app sandbox home on iOS), so `$HOME/.dotnet/crash-reports` is +per-app, not shared across apps. + +## Two execution contexts + +The class deliberately spans two contexts; every member documents which it obeys. + +**Initialization path (`Initialize` and helpers).** Runs once at startup. May +allocate and call libc/filesystem APIs. Responsibilities: + +1. Resolve/validate the root, create `.dotnet/crash-reports/` (each + `mkdir` tolerates `EEXIST`; the final node is verified to be a directory). +2. Verify writability with a create+close+unlink probe file — so the crash path + never discovers an unwritable directory. +3. Scan the directory (capped at a bounded entry count to avoid startup hangs): + - delete stale `.tmp` files whose owning pid is no longer alive + (`kill(pid,0)`), so we never unlink another live process's active temp; + - in cleanup-only mode, delete every completed report and leave output + disabled; + - otherwise collect completed reports, sort oldest-first, and **precompute** + the over-retention deletion candidates into fixed buffers. + +**Crash/signal path (`PrepareReportFile`, `FinishReportFile`, and helpers).** +Invoked from the signal handler; must be async-signal-safe and allocation-free: + +- build the temp + final paths into caller-provided fixed buffers (signal-safe + integer formatting, no `snprintf`/`std::string`); +- `open` the temp with `O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC` (exclusive create + so a collision never truncates an existing report; `O_CLOEXEC` avoids fd leaks); +- on success `link` the temp to the final name then `unlink` the temp; the old + precomputed candidates are unlinked separately. Deletion is *intentional only*: + completed reports are removed by retention, never by the write path overwriting. + +Because the final report is a *new* file (not a replacement), there can briefly be +`N+1` completed reports between link and candidate deletion. + +## Retention model + +`GetRetentionMode` maps `maxFileCount` to one mode once, so the scan does not +re-test sentinels per entry: + +- **Unlimited (`-1`):** keep every completed report; still clean stale temps. +- **CleanupOnly (`0`):** delete all completed reports + stale temps, then disable + new writes for the process lifetime. +- **Bounded (`>0`):** keep at most `maxFileCount`; precompute deletions beyond + `maxFileCount - 1`, reserving one slot for the imminent report. + +Candidates are precomputed at init (not enumerated in the signal handler) because +directory enumeration and sorting are not signal-safe. + +## Key decisions and rationale + +- **Dedicated app-local directory instead of inferring from `DbgMiniDumpName`.** + A managed directory gives a clear ownership boundary for retention. +- **Opt-in via explicit root.** A default directory + automatic file creation + would implicitly turn on persistent files; we avoid that. +- **Timestamp filenames over Android-style fixed slots + `mtime`.** Robust to + copy/touch/restore that would scramble `mtime`; ordering comes from the name. +- **Exclusive create (no `O_TRUNC`).** A PID/timestamp collision, stale file, or + test mistake must never silently erase a completed report. +- **Init-time candidate caching in fixed buffers.** Keeps the crash path + signal-safe and free of malloc/heap state that may be corrupted at crash time. +- **`link` + `unlink` commit (not `rename`).** Both are POSIX async-signal-safe; + `rename` is not formally on the AS-safe list across bionic/Darwin. +- **pid in temp name + liveness-gated stale cleanup.** Lets multiple processes + share a root without deleting each other's in-flight temp files. + +## Cross-process considerations + +Retention is best-effort when several processes share one root: each caches its +own delete candidates at init, so the directory may transiently hold more than +`maxFileCount` reports. The app-name subdirectory and pid-tagged temps prevent the +dangerous cases (deleting a live writer's temp, or one app pruning another's +reports). Strict global retention would require cross-process locking, which is a +non-goal here. + From a2f3879ce4f13b527fdbe053ca178a56292e280b Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Thu, 28 May 2026 22:59:05 -0400 Subject: [PATCH 04/10] Publish crash reports with rename instead of link FinishReportFile published the completed report via link() followed by an unconditional unlink() of the temp file, discarding the link() return value. Android and Apple mobile app-private storage reject hard links with EPERM, so every crash report was silently lost on the exact platforms this in-proc feature targets. Replace the link+unlink commit with a same-directory rename, which is POSIX async-signal-safe and permitted in those sandboxes. A best-effort access(final, F_OK) check before the rename preserves the exclusive-create intent that link's EEXIST previously provided; the final name is collision-resistant by construction, so the residual TOCTOU window is benign. On any failure the temp file is still removed. Update the header contract and design doc to match. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inproccrashreportlifecycle.cpp | 20 +++++++++++++--- .../inproccrashreportlifecycle.design.md | 23 +++++++++++++------ .../crashreport/inproccrashreportlifecycle.h | 6 ++--- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index b2d6ef7c9f20b5..b81e1af35793ba 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -343,9 +344,22 @@ InProcCrashReportLifecycle::FinishReportFile( return; } - if (succeeded && reportFilePath != nullptr && reportFilePath[0] != '\0') - { - (void)link(m_tempReportFilePath, reportFilePath); + // Publish the completed report by renaming the temp file to its final name. + // rename is POSIX async-signal-safe and, unlike link, is permitted in the + // app-private storage sandboxes on Android and Apple mobile platforms, where + // hard links are rejected with EPERM. Temp and final share m_reportDirectory, + // so this is an atomic same-directory metadata operation. The access check + // best-effort preserves the "never overwrite a completed report" invariant + // that link's EEXIST gave us; the final name is collision-resistant by + // construction (PrepareReportFile probes and retries suffixes), so the + // residual TOCTOU window is benign. On any failure the temp is removed so no + // partial report is left behind. + if (succeeded && reportFilePath != nullptr && reportFilePath[0] != '\0' && + access(reportFilePath, F_OK) != 0 && + rename(m_tempReportFilePath, reportFilePath) == 0) + { + m_tempReportFilePath[0] = '\0'; + return; } unlink(m_tempReportFilePath); diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md index bc4135ec6fb263..3df66fd2d8963b 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md @@ -44,7 +44,7 @@ fixed-slot rotation contract. (See Apple's developer documentation on acquiring crash reports and diagnostic logs.) **Takeaways that shaped this design.** Borrow Android's robust mechanics — -default of 32, temp-file-then-link commit, bounded retention — but improve on the +default of 32, temp-file-then-commit, bounded retention — but improve on the fixed-slot model by encoding the crash time in the filename (so ordering does not depend on fragile `mtime`), and accept that on iOS the runtime, not the OS, owns this app-local directory. @@ -117,12 +117,16 @@ Invoked from the signal handler; must be async-signal-safe and allocation-free: integer formatting, no `snprintf`/`std::string`); - `open` the temp with `O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC` (exclusive create so a collision never truncates an existing report; `O_CLOEXEC` avoids fd leaks); -- on success `link` the temp to the final name then `unlink` the temp; the old - precomputed candidates are unlinked separately. Deletion is *intentional only*: - completed reports are removed by retention, never by the write path overwriting. +- on success `rename` the temp to the final name; the old precomputed candidates + are unlinked separately. `rename` is a same-directory atomic commit and, unlike + `link`, is permitted in the Android/iOS app-private storage sandboxes (hard + links there are rejected with `EPERM`, which would silently lose every report). + A best-effort `access(final, F_OK)` check before the rename preserves the + exclusive-create intent. Deletion is *intentional only*: completed reports are + removed by retention, never by the write path overwriting. Because the final report is a *new* file (not a replacement), there can briefly be -`N+1` completed reports between link and candidate deletion. +`N+1` completed reports between commit and candidate deletion. ## Retention model @@ -150,8 +154,13 @@ directory enumeration and sorting are not signal-safe. test mistake must never silently erase a completed report. - **Init-time candidate caching in fixed buffers.** Keeps the crash path signal-safe and free of malloc/heap state that may be corrupted at crash time. -- **`link` + `unlink` commit (not `rename`).** Both are POSIX async-signal-safe; - `rename` is not formally on the AS-safe list across bionic/Darwin. +- **Same-directory `rename` commit (not `link` + `unlink`).** `rename` is POSIX + async-signal-safe (per `signal-safety(7)`) and is the portable atomic-publish + primitive. Hard links via `link` are rejected with `EPERM` in the Android and + Apple mobile app-private storage sandboxes, which would silently lose every + report — the precise failure this feature exists to prevent. A best-effort + `access(final, F_OK)` guard keeps the exclusive-create intent; the final name is + collision-resistant by construction, so the residual TOCTOU window is benign. - **pid in temp name + liveness-gated stale cleanup.** Lets multiple processes share a root without deleting each other's in-flight temp files. diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h index 38a9e9e82713a0..7d85b177b9ee94 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h @@ -59,9 +59,9 @@ class InProcCrashReportLifecycle size_t reportFilePathSize, int* fd); - // Finalizes the report opened by PrepareReportFile: on success links the temp - // file to its final reportFilePath, then removes the temp file in all cases. - // Runs on the crash/signal path: allocation-free and signal-safe. + // Finalizes the report opened by PrepareReportFile: on success renames the temp + // file to its final reportFilePath, otherwise removes the temp file. Runs on the + // crash/signal path: allocation-free and signal-safe. void FinishReportFile( bool succeeded, const char* reportFilePath); From 8109a8c6e85dc9dac2d4f9d7d20bafd48659b2d0 Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Thu, 28 May 2026 23:23:36 -0400 Subject: [PATCH 05/10] Probe rename capability at init to disable file output up front ProbeDirectoryWritable now also exercises a same-directory rename (the primitive FinishReportFile uses to publish a completed report), not just create/delete. If a sandbox permits create/delete but rejects rename, file output is disabled at initialization with a clear diagnostic instead of silently losing every report on the signal path at crash time. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inproccrashreportlifecycle.cpp | 42 ++++++++++++++++--- .../crashreport/inproccrashreportlifecycle.h | 7 ++-- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index b81e1af35793ba..d5db71013c00b2 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -553,14 +553,15 @@ InProcCrashReportLifecycle::EnsureDirectory(const char* path) bool InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) { - // Borrow the temp-report path buffer as scratch: it is unused until the - // crash path calls PrepareReportFile, so reusing it here avoids a second - // large path buffer while keeping the probe allocation-free. + // Borrow the temp-report path buffer as scratch for the probe path: it is + // unused until the crash path calls PrepareReportFile, so reusing it here + // keeps the probe off the heap and out of the signal path. char* probePath = m_tempReportFilePath; probePath[0] = '\0'; size_t pos = 0; - // Use a hidden throwaway file to verify the directory allows create/delete. + // Use a hidden throwaway file to verify the directory allows create, rename, + // and delete (rename is the operation FinishReportFile uses to publish). SignalSafeFormatter formatter; bool built = CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, path) && @@ -574,8 +575,37 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) if (fd != -1) { bool closeSucceeded = close(fd) == 0; - bool unlinkSucceeded = unlink(probePath) == 0; - writable = closeSucceeded && unlinkSucceeded; + + // Also verify the publish primitive the crash path depends on: a + // same-directory rename. Some sandboxes (notably Android and Apple + // app-private storage) permit create/delete yet reject other + // link/rename operations. Probing it here disables file output up + // front with a clear diagnostic instead of silently losing every + // report when FinishReportFile cannot publish on the signal path. + char committedPath[CRASHREPORT_PATH_BUFFER_SIZE]; + size_t committedPos = 0; + bool committedBuilt = + CrashReportStringUtils::AppendString(committedPath, sizeof(committedPath), &committedPos, probePath) && + CrashReportStringUtils::AppendString(committedPath, sizeof(committedPath), &committedPos, ".committed"); + + if (committedBuilt) + { + // Clear any stale probe artifact left by a prior aborted run so the + // rename targets a fresh name rather than silently replacing it. + unlink(committedPath); + } + + if (committedBuilt && rename(probePath, committedPath) == 0) + { + bool unlinkSucceeded = unlink(committedPath) == 0; + writable = closeSucceeded && unlinkSucceeded; + } + else + { + // The rename probe failed (or the target path did not fit); remove + // the original probe file so no stray artifact is left behind. + unlink(probePath); + } } } diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h index 7d85b177b9ee94..a515be170cc086 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h @@ -162,9 +162,10 @@ class InProcCrashReportLifecycle // Creates the directory if missing; succeeds if it already exists as a directory. static bool EnsureDirectory(const char* path); - // Verifies the directory permits create/delete by creating and removing a - // hidden probe file. Borrows the (init-time idle) m_tempReportFilePath buffer - // rather than placing a second large path buffer on the stack. + // Verifies the directory permits create, rename, and delete by exercising a + // hidden probe file (rename is the primitive FinishReportFile uses to publish + // a completed report). Borrows the (init-time idle) m_tempReportFilePath buffer + // for the probe path and uses a transient stack buffer for the rename target. bool ProbeDirectoryWritable(const char* path); // Parses a managed report file name (report--[-]), From 50eb6a8e7e0cc4de083b37c2ca525442fce7aafd Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Tue, 2 Jun 2026 13:59:44 -0400 Subject: [PATCH 06/10] Refine the in-proc crash report lifecycle per review feedback Consolidates the follow-up changes addressing maintainer review of the in-proc crash report lifecycle into a single net change over the prior pushed state, so there are no intermediate commits that add behavior a later commit removes. Net behavior of the lifecycle and its reporter integration: - Publish completed reports with a same-directory rename guarded by an ENOENT existence check, instead of link(); rename is async-signal-safe and is permitted in the mobile app-private storage sandboxes where hard links are rejected with EPERM. - Probe the report directory for create/rename/delete at init using local stack buffers, disabling file output up front on an unusable directory. - Reuse CrashReportStringUtils::CopyString rather than a second local string-copy helper. - Require an absolute CrashReportRootPath; the runtime no longer expands a leading '~' or $HOME. The integration layer supplies a concrete path. - Drop the per-app process-name path segment: on mobile the root is already app-private, so reports live directly under /.dotnet/crash-reports/. - Name reports with a nanosecond CLOCK_REALTIME timestamp and drop the suffix-retry loop; nanosecond resolution makes collisions vanishingly unlikely, so no disambiguating suffix is needed. - Replace the retention modes with a single bounded model: prune to maxFileCount at init (caching the oldest report only when at the bound) and unlink that cached oldest on the crash path before publishing a new report, so the bound holds whether or not a crash follows. - Remove the process-liveness probe for stale temp cleanup; a leftover .tmp can only come from a defunct run of the same app, so it is removed unconditionally. - Make the lifecycle-managed root the only JSON sink: drop DbgMiniDumpName (and its template expansion and cached hostname) from the in-proc reporter. With no root configured the reporter runs compact-log-only. - Clamp configured maxFileCount to [1, INT32_MAX], falling back to the default for out-of-range or unparseable values. - Update the design notes to match the implemented model. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../debug/crashreport/inproccrashreporter.cpp | 233 +-------- .../debug/crashreport/inproccrashreporter.h | 7 +- .../inproccrashreportlifecycle.cpp | 473 +++++------------- .../inproccrashreportlifecycle.design.md | 160 +++--- .../crashreport/inproccrashreportlifecycle.h | 125 ++--- src/coreclr/vm/crashreportstackwalker.cpp | 13 +- 6 files changed, 276 insertions(+), 735 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.cpp b/src/coreclr/debug/crashreport/inproccrashreporter.cpp index 73722573b42cd4..bf379ecf803572 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreporter.cpp @@ -102,8 +102,7 @@ static char sccsid[] = "@(#)Version N/A"; #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) // Query a sysctl by name into a caller-supplied buffer. Called from Initialize, NOT from the // signal handler -- sysctl/sysctlbyname is not on POSIX's async-signal-safe list, so the -// queried values are cached for use during crash reporting (mirrors the hostName / -// gethostname pattern). +// queried values are cached for use during crash reporting. static void CacheSysctlString(const char* sysctlName, char* buffer, size_t bufferSize) { buffer[0] = '\0'; @@ -388,12 +387,6 @@ class InProcCrashReporter bool jsonEnabled, int fd); - bool BuildReportPath(); - size_t ExpandDumpTemplate( - char* buffer, - size_t bufferSize, - const char* pattern); - static const char* GetSignalNameAscii(int signal); SignalSafeJsonWriter m_jsonWriter; @@ -410,11 +403,8 @@ class InProcCrashReporter volatile LONG m_crashKind = static_cast(InProcCrashReportCrashKind::Unknown); uint32_t m_frameLimitPerThread = 0; InProcCrashReportLifecycle m_lifecycle; - bool m_manageReportLifecycle = false; - char m_configuredReportPath[CRASHREPORT_PATH_BUFFER_SIZE]; char m_reportFilePath[CRASHREPORT_PATH_BUFFER_SIZE]; char m_processName[CRASHREPORT_STRING_BUFFER_SIZE]; - char m_hostName[CRASHREPORT_STRING_BUFFER_SIZE]; char m_stringScratch[CRASHREPORT_STRING_BUFFER_SIZE]; #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) char m_osVersion[CRASHREPORT_STRING_BUFFER_SIZE]; @@ -463,11 +453,6 @@ class CrashReportHelpers static const char* GetFilename( const char* path); - static void CopyString( - char* buffer, - size_t bufferSize, - const char* value); - static void WriteFrameToJson( SignalSafeJsonWriter* writer, SignalSafeFormatter* formatter, @@ -574,9 +559,9 @@ class CrashReportHelpers size_t len); // SignalSafeJsonWriter callback that drops everything: used when the - // crash report is running in compact-log-only mode (no DbgMiniDumpName) - // so the JSON formatter still keeps its bookkeeping consistent without - // emitting bytes anywhere. + // crash report is running in compact-log-only mode (no managed report + // directory configured) so the JSON formatter still keeps its bookkeeping + // consistent without emitting bytes anywhere. static bool DiscardOutputCallback(const char* buffer, size_t len, void* ctx); }; @@ -592,27 +577,13 @@ InProcCrashReporter::CreateReport( return; } m_reportFilePath[0] = '\0'; - // The JSON file sink is enabled by lifecycle-managed output or by a - // DbgMiniDumpName template that expands to a valid path. Otherwise the - // crash report runs in compact-log-only mode: the JSON emitter still + // The JSON file sink is enabled only by lifecycle-managed output. Otherwise + // the crash report runs in compact-log-only mode: the JSON emitter still // executes (so it can keep its bookkeeping consistent) but writes go to a // no-op DiscardOutputCallback instead of an open fd. - bool jsonEnabled = false; - int fd = -1; - if (m_manageReportLifecycle) - { - jsonEnabled = m_lifecycle.IsReportFileOutputEnabled() && - m_lifecycle.PrepareReportFile(&m_formatter, m_reportFilePath, sizeof(m_reportFilePath), &fd); - } - else - { - jsonEnabled = m_configuredReportPath[0] != '\0' && BuildReportPath(); - if (jsonEnabled) - { - fd = open(m_reportFilePath, O_WRONLY | O_CREAT | O_TRUNC, 0600); - } - } + bool jsonEnabled = m_lifecycle.IsReportFileOutputEnabled() && + m_lifecycle.PrepareReportFile(&m_formatter, m_reportFilePath, sizeof(m_reportFilePath), &fd); if (jsonEnabled && fd == -1) { @@ -736,8 +707,6 @@ InProcCrashReporter::Initialize( m_frameLimitPerThread = settings.frameLimitPerThread; m_crashKind = static_cast(InProcCrashReportCrashKind::Unknown); m_stackOverflowTrace.available = 0; - m_manageReportLifecycle = false; - m_configuredReportPath[0] = '\0'; m_reportFilePath[0] = '\0'; m_processName[0] = '\0'; @@ -754,7 +723,7 @@ InProcCrashReporter::Initialize( if (n > 0) { m_stringScratch[n] = '\0'; - CrashReportHelpers::CopyString(m_processName, sizeof(m_processName), CrashReportHelpers::GetFilename(m_stringScratch)); + CrashReportStringUtils::CopyString(m_processName, sizeof(m_processName), CrashReportHelpers::GetFilename(m_stringScratch)); } } #endif @@ -762,44 +731,17 @@ InProcCrashReporter::Initialize( { if (char* exePath = minipal_getexepath()) { - CrashReportHelpers::CopyString(m_processName, sizeof(m_processName), CrashReportHelpers::GetFilename(exePath)); + CrashReportStringUtils::CopyString(m_processName, sizeof(m_processName), CrashReportHelpers::GetFilename(exePath)); free(exePath); } } - // Cache hostname here because gethostname is not on the POSIX - // async-signal-safe list; the dump-template expander needs it for %h - // expansion at crash time. - m_hostName[0] = '\0'; - if (gethostname(m_hostName, sizeof(m_hostName) - 1) == 0) - { - m_hostName[sizeof(m_hostName) - 1] = '\0'; - } - else - { - m_hostName[0] = '\0'; - } - + // File output is produced only through the lifecycle-managed report + // directory. When no root is configured the reporter still runs, emitting + // compact console logs without writing a JSON report file. if (settings.lifecycleRootPath != nullptr && settings.lifecycleRootPath[0] != '\0') { - m_manageReportLifecycle = true; - size_t pos = 0; - if (CrashReportHelpers::AppendString( - m_configuredReportPath, - sizeof(m_configuredReportPath), - &pos, - settings.lifecycleRootPath)) - { - m_lifecycle.Initialize(m_configuredReportPath, m_processName, settings.maxFileCount); - } - else - { - InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: invalid CrashReportRootPath"); - } - } - else - { - CrashReportHelpers::CopyString(m_configuredReportPath, sizeof(m_configuredReportPath), settings.reportPath); + m_lifecycle.Initialize(settings.lifecycleRootPath, settings.maxFileCount); } #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) @@ -1029,131 +971,6 @@ CrashReportOutputContext::ChunkCallback( return outputContext->HandleChunk(buffer, len); } -// Expand the coredump template patterns supported by createdump's -// FormatDumpName for DOTNET_DbgMiniDumpName: %% %p %d (PID), %e (process -// name, cached at Initialize), %h (hostname, cached at Initialize), and %t -// (current epoch seconds via time(2), POSIX async-signal-safe). Unknown -// specifiers are rejected (return 0) to match createdump and to avoid -// silently producing diverging file names from the same template. -size_t -InProcCrashReporter::ExpandDumpTemplate( - char* buffer, - size_t bufferSize, - const char* pattern) -{ - if (buffer == nullptr || bufferSize == 0 || - pattern == nullptr) - { - return 0; - } - - size_t pos = 0; - unsigned pid = static_cast(GetCurrentProcessId()); - - while (*pattern != '\0' && pos + 1 < bufferSize) - { - if (*pattern != '%') - { - buffer[pos++] = *pattern++; - continue; - } - - pattern++; - char specifier = *pattern; - - const char* substitution = nullptr; - - switch (specifier) - { - case '%': - if (pos + 1 < bufferSize) - { - buffer[pos++] = '%'; - } - pattern++; - continue; - - case 'p': - case 'd': - substitution = m_formatter.FormatUnsignedDecimal(pid); - break; - - case 'e': - substitution = (m_processName[0] != '\0') ? m_processName : nullptr; - break; - - case 'h': - substitution = (m_hostName[0] != '\0') ? m_hostName : nullptr; - break; - - case 't': - substitution = m_formatter.FormatUnsignedDecimal(static_cast(time(nullptr))); - break; - - default: - // Unknown / unsupported specifier; fail rather than emit a - // path with a literal '%X' that would diverge from the file - // name createdump would produce for the same template. - return 0; - } - - if (substitution == nullptr) - { - // Required substitution unavailable (e.g. hostname capture failed - // at Initialize). Fail rather than emit a path missing this - // component, which could collide with the dump file on disk. - return 0; - } - - size_t subLen = strlen(substitution); - if (pos + subLen >= bufferSize) - { - return 0; - } - memcpy(buffer + pos, substitution, subLen); - pos += subLen; - - if (*pattern != '\0') - { - pattern++; - } - } - - buffer[pos] = '\0'; - if (*pattern != '\0') - { - // The output buffer filled before the full template was consumed. - // Fail rather than returning a truncated path that could collide or - // unexpectedly change the report location. - return 0; - } - return pos; -} - -bool -InProcCrashReporter::BuildReportPath() -{ - if (m_configuredReportPath[0] == '\0') - { - return false; - } - - size_t pos = ExpandDumpTemplate( - m_reportFilePath, - sizeof(m_reportFilePath), - m_configuredReportPath); - if (pos == 0) - { - return false; - } - - if (!CrashReportHelpers::AppendString(m_reportFilePath, sizeof(m_reportFilePath), &pos, ".crashreport.json")) - { - return false; - } - return true; -} - void CrashReportHelpers::GetVersionString( char* buffer, @@ -1338,11 +1155,11 @@ CrashReportHelpers::BuildMethodName( } else if (className != nullptr) { - CopyString(buffer, bufferSize, className); + CrashReportStringUtils::CopyString(buffer, bufferSize, className); } else if (methodName != nullptr) { - CopyString(buffer, bufferSize, methodName); + CrashReportStringUtils::CopyString(buffer, bufferSize, methodName); } else { @@ -1399,15 +1216,6 @@ HasManagedIdentity( (token != 0 && HasModuleName(moduleName)); } -void -CrashReportHelpers::CopyString( - char* buffer, - size_t bufferSize, - const char* value) -{ - CrashReportStringUtils::CopyString(buffer, bufferSize, value); -} - void CrashReportHelpers::WriteFrameToJson( SignalSafeJsonWriter* writer, @@ -2316,14 +2124,7 @@ InProcCrashReporter::EndJsonReport( bool closeSucceeded = close(fd) == 0; bool reportSucceeded = finishSucceeded && !writeFailed && closeSucceeded; - if (m_manageReportLifecycle) - { - m_lifecycle.FinishReportFile(reportSucceeded, m_reportFilePath); - } - else if (!reportSucceeded) - { - unlink(m_reportFilePath); - } + m_lifecycle.FinishReportFile(reportSucceeded, m_reportFilePath); } else { diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.h b/src/coreclr/debug/crashreport/inproccrashreporter.h index b72077e1a75c0a..05f154ef53b9ea 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.h +++ b/src/coreclr/debug/crashreport/inproccrashreporter.h @@ -15,15 +15,11 @@ #include // Scratch-buffer sizes used throughout the in-proc crash reporter: -// - 1024 (matching createdump's MAX_LONGPATH) for paths (report paths and -// expanded dump templates), so DOTNET_DbgMiniDumpName values that work -// with createdump also work here. +// - 1024 (matching createdump's MAX_LONGPATH) for report paths. // - 256 for identifiers (process name, type/class/exception names). static constexpr size_t CRASHREPORT_PATH_BUFFER_SIZE = 1024; static constexpr size_t CRASHREPORT_STRING_BUFFER_SIZE = 256; static constexpr int32_t CRASHREPORT_DEFAULT_MAX_FILE_COUNT = 32; -static constexpr int32_t CRASHREPORT_UNLIMITED_FILE_COUNT = -1; -static constexpr int32_t CRASHREPORT_CLEANUP_ONLY_FILE_COUNT = 0; #if defined(__ANDROID__) static const char CRASHREPORT_LOG_TAG[] = "DOTNET_CRASH"; @@ -76,7 +72,6 @@ using InProcCrashReportModuleInfoCallback = bool (*)( struct InProcCrashReporterSettings { - const char* reportPath; const char* lifecycleRootPath; InProcCrashReportIsManagedThreadCallback isManagedThreadCallback; InProcCrashReportWalkStackCallback walkStackCallback; diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index d5db71013c00b2..141019c876794b 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -9,9 +9,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -25,77 +23,36 @@ static const char CrashReportManagedReportDirectory[] = "crash-reports"; static const char CrashReportFilePrefix[] = "report-"; static const char CrashReportFileExtension[] = ".crashreport.json"; static const char CrashReportTempExtension[] = ".tmp"; -// Bounded retry avoids an unbounded crash-path loop if a process repeatedly -// crashes in the same second with the same pid and stale files are present. -static constexpr uint32_t CrashReportMaxSuffixRetry = 32; -// Directory scans run at initialization, but misconfigured roots should not let -// a diagnostic feature stall application startup indefinitely. -static constexpr size_t CrashReportMaxDirectoryEntriesToScan = 4096; - -InProcCrashReportLifecycle::~InProcCrashReportLifecycle() -{ - delete[] m_deleteCandidates; -} + +static const uint64_t NanosecondsPerSecond = 1000000000ull; bool InProcCrashReportLifecycle::Initialize( const char* rootPath, - const char* processName, int32_t maxFileCount) { m_reportFileOutputEnabled = false; m_reportDirectory[0] = '\0'; m_tempReportFilePath[0] = '\0'; + m_cachedOldestReport.value[0] = '\0'; - if (!EstablishReportDirectory(rootPath, processName)) - { - return false; - } - - RetentionMode mode = GetRetentionMode(maxFileCount); - - FileInfo* reports = nullptr; - size_t reportCount = 0; - if (CollectExistingReports(mode, &reports, &reportCount) != CollectResult::Ready) + if (!EstablishReportDirectory(rootPath)) { - // Both a hard failure and the cleanup-only path leave output disabled; the - // former already logged, the latter performed its deletions during the scan. return false; } - if (mode == RetentionMode::Bounded && !SelectDeleteCandidates(reports, reportCount, maxFileCount)) + if (!PruneExistingReports(maxFileCount)) { - free(reports); return false; } - free(reports); m_reportFileOutputEnabled = true; return true; } -InProcCrashReportLifecycle::RetentionMode -InProcCrashReportLifecycle::GetRetentionMode(int32_t maxFileCount) -{ - if (maxFileCount == CRASHREPORT_CLEANUP_ONLY_FILE_COUNT) - { - return RetentionMode::CleanupOnly; - } - - if (maxFileCount == CRASHREPORT_UNLIMITED_FILE_COUNT) - { - return RetentionMode::Unlimited; - } - - // The configuration layer constrains maxFileCount to the unlimited (-1), - // cleanup-only (0), or positive-bound domain, so anything else is a bound. - return RetentionMode::Bounded; -} - bool InProcCrashReportLifecycle::EstablishReportDirectory( - const char* rootPath, - const char* processName) + const char* rootPath) { if (rootPath == nullptr || rootPath[0] == '\0') { @@ -116,17 +73,12 @@ InProcCrashReportLifecycle::EstablishReportDirectory( return false; } - char appName[CRASHREPORT_STRING_BUFFER_SIZE]; - SanitizePathComponent(appName, sizeof(appName), processName); - size_t pos = 0; if (!CrashReportStringUtils::AppendString(m_reportDirectory, sizeof(m_reportDirectory), &pos, root) || !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, CrashReportManagedRootDirectory) || !EnsureDirectory(m_reportDirectory) || !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, CrashReportManagedReportDirectory) || !EnsureDirectory(m_reportDirectory) || - !AppendPathComponent(m_reportDirectory, sizeof(m_reportDirectory), &pos, appName) || - !EnsureDirectory(m_reportDirectory) || !ProbeDirectoryWritable(m_reportDirectory)) { InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to initialize crash report directory"); @@ -137,42 +89,61 @@ InProcCrashReportLifecycle::EstablishReportDirectory( return true; } -InProcCrashReportLifecycle::CollectResult -InProcCrashReportLifecycle::CollectExistingReports( - RetentionMode mode, - FileInfo** reports, - size_t* reportCount) +size_t +InProcCrashReportLifecycle::FindOldestReportIndex( + const FileInfo* reports, + size_t reportCount) { - *reports = nullptr; - *reportCount = 0; + size_t oldest = 0; + for (size_t i = 1; i < reportCount; i++) + { + if (CompareFileInfo(&reports[i], &reports[oldest]) < 0) + { + oldest = i; + } + } + + return oldest; +} +bool +InProcCrashReportLifecycle::PruneExistingReports(int32_t maxFileCount) +{ DIR* dir = opendir(m_reportDirectory); if (dir == nullptr) { InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to scan crash report directory"); - return CollectResult::Failed; + return false; } - FileInfo* collected = nullptr; - size_t collectedCount = 0; - size_t collectedCapacity = 0; - size_t scannedEntries = 0; - bool scanExceeded = false; + // Retain at most the newest maxFileCount completed reports. The kept set is + // held in a fixed array sized to the bound, so a directory with an + // unexpectedly large number of reports cannot drive an unbounded allocation; + // overflow reports are unlinked inline as they are encountered. maxFileCount + // is guaranteed positive by the configuration layer. + size_t capacity = static_cast(maxFileCount); + FileInfo* kept = new (std::nothrow) FileInfo[capacity]; + if (kept == nullptr) + { + closedir(dir); + InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to allocate retention scan storage"); + return false; + } + size_t keptCount = 0; while (dirent* entry = readdir(dir)) { - if (++scannedEntries > CrashReportMaxDirectoryEntriesToScan) - { - scanExceeded = true; - break; - } - FileInfo info = {}; bool hasTempExtension = false; bool parsedOwnedName = TryParseReportName(entry->d_name, &info, &hasTempExtension); bool isTemp = parsedOwnedName && hasTempExtension; bool isCompleted = parsedOwnedName && !hasTempExtension; + if (!isTemp && !isCompleted) + { + continue; + } + char fullPath[CRASHREPORT_PATH_BUFFER_SIZE]; fullPath[0] = '\0'; size_t fullPathPos = 0; @@ -184,97 +155,49 @@ InProcCrashReportLifecycle::CollectExistingReports( if (isTemp) { - if (!IsProcessAlive(info.pid)) - { - unlink(fullPath); - } + // Any leftover temp file is from a previous, now-defunct run of this + // app (each app has its own report directory under its private + // storage, and the writer renames its temp to the final name before + // returning), so it can be removed unconditionally. + unlink(fullPath); continue; } - if (!isCompleted) - { - continue; - } + CrashReportStringUtils::CopyString(info.path.value, sizeof(info.path.value), fullPath); - if (mode == RetentionMode::CleanupOnly) + if (keptCount < capacity) { - unlink(fullPath); + kept[keptCount++] = info; continue; } - if (mode == RetentionMode::Unlimited) + // The kept set is full, so this entry competes with the current oldest + // kept report: unlink the older of the two and keep the newer. + size_t oldestIndex = FindOldestReportIndex(kept, keptCount); + if (CompareFileInfo(&kept[oldestIndex], &info) < 0) { - continue; + unlink(kept[oldestIndex].path.value); + kept[oldestIndex] = info; } - - if (collectedCount == collectedCapacity) + else { - size_t newCapacity = collectedCapacity == 0 ? 16 : collectedCapacity * 2; - FileInfo* grown = reinterpret_cast(realloc(collected, newCapacity * sizeof(FileInfo))); - if (grown == nullptr) - { - free(collected); - closedir(dir); - InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to allocate retention scan storage"); - return CollectResult::Failed; - } - - collected = grown; - collectedCapacity = newCapacity; + unlink(info.path.value); } - - CrashReportStringUtils::CopyString(info.path.value, sizeof(info.path.value), fullPath); - collected[collectedCount++] = info; } closedir(dir); - if (scanExceeded) - { - free(collected); - InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: too many files in crash report directory"); - return CollectResult::Failed; - } - - if (mode == RetentionMode::CleanupOnly) - { - free(collected); - return CollectResult::CleanupOnlyComplete; - } - - *reports = collected; - *reportCount = collectedCount; - return CollectResult::Ready; -} - -bool -InProcCrashReportLifecycle::SelectDeleteCandidates( - FileInfo* reports, - size_t reportCount, - int32_t maxFileCount) -{ - qsort(reports, reportCount, sizeof(FileInfo), &CompareFileInfo); - - size_t keepBeforeCrash = static_cast(maxFileCount - 1); - if (reportCount <= keepBeforeCrash) - { - return true; - } - - m_deleteCandidateCount = reportCount - keepBeforeCrash; - m_deleteCandidates = new (std::nothrow) ReportPath[m_deleteCandidateCount]; - if (m_deleteCandidates == nullptr) + // A full kept set means the directory already holds maxFileCount reports, so + // the next crash report would exceed the bound: cache the oldest, and the + // crash path unlinks it before publishing the new report. Below the bound + // nothing is cached and the crash path deletes nothing. + if (keptCount == capacity) { - m_deleteCandidateCount = 0; - InProcCrashReportLogInitializationFailure(".NET crash report file output disabled: failed to allocate retention candidate storage"); - return false; - } - - for (size_t i = 0; i < m_deleteCandidateCount; i++) - { - CrashReportStringUtils::CopyString(m_deleteCandidates[i].value, sizeof(m_deleteCandidates[i].value), reports[i].path.value); + size_t oldestIndex = FindOldestReportIndex(kept, keptCount); + CrashReportStringUtils::CopyString(m_cachedOldestReport.value, sizeof(m_cachedOldestReport.value), kept[oldestIndex].path.value); } + delete[] kept; return true; } @@ -293,45 +216,38 @@ InProcCrashReportLifecycle::PrepareReportFile( reportFilePath[0] = '\0'; *fd = -1; - uint64_t timestamp = static_cast(time(nullptr)); + + // Nanosecond-resolution timestamp keeps report names unique without a retry + // loop: even back-to-back crashes in the same process get distinct names. + // clock_gettime(CLOCK_REALTIME) is POSIX async-signal-safe, so it is valid + // on the crash path. A failed read degrades to a zero timestamp rather than + // aborting the report write. + struct timespec now = {}; + clock_gettime(CLOCK_REALTIME, &now); + uint64_t timestampNs = static_cast(now.tv_sec) * NanosecondsPerSecond + + static_cast(now.tv_nsec); uint32_t pid = static_cast(GetCurrentProcessId()); - // Retention candidates are deleted before opening the temp file so the - // completed-report set never exceeds the configured bound and old reports - // can free space for the new crash report. If the new write later fails, - // the deleted candidates are intentionally not restored. - DeleteCandidates(); + // Delete the cached over-retention report (if any) before opening the temp + // file, freeing a slot so the completed set stays at the bound. A later write + // failure intentionally does not restore it. + DeleteCachedReport(); - for (uint32_t suffix = 0; suffix <= CrashReportMaxSuffixRetry; suffix++) + if (!BuildReportPaths(formatter, reportFilePath, reportFilePathSize, m_tempReportFilePath, sizeof(m_tempReportFilePath), timestampNs, pid)) { - if (!BuildReportPaths(formatter, reportFilePath, reportFilePathSize, m_tempReportFilePath, sizeof(m_tempReportFilePath), timestamp, pid, suffix)) - { - return false; - } - - if (access(reportFilePath, F_OK) == 0) - { - continue; - } - - int tempFd = open(m_tempReportFilePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); - if (tempFd == -1) - { - if (errno == EEXIST) - { - continue; - } - - return false; - } + return false; + } - *fd = tempFd; - return true; + int tempFd = open(m_tempReportFilePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); + if (tempFd == -1) + { + reportFilePath[0] = '\0'; + m_tempReportFilePath[0] = '\0'; + return false; } - reportFilePath[0] = '\0'; - m_tempReportFilePath[0] = '\0'; - return false; + *fd = tempFd; + return true; } void @@ -345,17 +261,16 @@ InProcCrashReportLifecycle::FinishReportFile( } // Publish the completed report by renaming the temp file to its final name. - // rename is POSIX async-signal-safe and, unlike link, is permitted in the - // app-private storage sandboxes on Android and Apple mobile platforms, where - // hard links are rejected with EPERM. Temp and final share m_reportDirectory, - // so this is an atomic same-directory metadata operation. The access check - // best-effort preserves the "never overwrite a completed report" invariant - // that link's EEXIST gave us; the final name is collision-resistant by - // construction (PrepareReportFile probes and retries suffixes), so the - // residual TOCTOU window is benign. On any failure the temp is removed so no - // partial report is left behind. + // rename is async-signal-safe and, unlike link, is permitted in the Android + // and Apple app-private storage sandboxes (where link fails with EPERM); temp + // and final share m_reportDirectory, so this is an atomic same-directory op. + // Only publish when the destination is absent (access fails with ENOENT) to + // preserve the "never overwrite a completed report" invariant; any other + // errno leaves the destination state unknown, so decline rather than risk a + // replace. The collision-resistant final name (nanosecond timestamp plus pid) + // keeps the residual TOCTOU window benign. On any failure the temp is removed. if (succeeded && reportFilePath != nullptr && reportFilePath[0] != '\0' && - access(reportFilePath, F_OK) != 0 && + access(reportFilePath, F_OK) != 0 && errno == ENOENT && rename(m_tempReportFilePath, reportFilePath) == 0) { m_tempReportFilePath[0] = '\0'; @@ -374,8 +289,7 @@ InProcCrashReportLifecycle::BuildReportPaths( char* tempReportFilePath, size_t tempReportFilePathSize, uint64_t timestamp, - uint32_t pid, - uint32_t suffix) + uint32_t pid) { reportFilePath[0] = '\0'; tempReportFilePath[0] = '\0'; @@ -390,15 +304,6 @@ InProcCrashReportLifecycle::BuildReportPaths( return false; } - if (suffix != 0) - { - if (!CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, "-") || - !CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, formatter->FormatUnsignedDecimal(suffix))) - { - return false; - } - } - if (!CrashReportStringUtils::AppendString(reportFilePath, reportFilePathSize, &pos, CrashReportFileExtension)) { return false; @@ -410,14 +315,11 @@ InProcCrashReportLifecycle::BuildReportPaths( } void -InProcCrashReportLifecycle::DeleteCandidates() +InProcCrashReportLifecycle::DeleteCachedReport() { - for (size_t i = 0; i < m_deleteCandidateCount; i++) + if (m_cachedOldestReport.value[0] != '\0') { - if (m_deleteCandidates[i].value[0] != '\0') - { - unlink(m_deleteCandidates[i].value); - } + unlink(m_cachedOldestReport.value); } } @@ -466,61 +368,17 @@ InProcCrashReportLifecycle::ResolveRootPath( return false; } - buffer[0] = '\0'; - size_t pos = 0; - - if (rootPath[0] == '~' && (rootPath[1] == '\0' || rootPath[1] == '/')) - { - const char* home = getenv("HOME"); - if (home == nullptr || home[0] == '\0') - { - return false; - } - - if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, home)) - { - return false; - } - - rootPath++; - if (*rootPath == '/') - { - rootPath++; - } - if (*rootPath != '\0' && !AppendPathComponent(buffer, bufferSize, &pos, rootPath)) - { - return false; - } - } - else if (strncmp(rootPath, "$HOME", 5) == 0 && (rootPath[5] == '\0' || rootPath[5] == '/')) - { - const char* home = getenv("HOME"); - if (home == nullptr || home[0] == '\0') - { - return false; - } - - if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, home)) - { - return false; - } - - rootPath += 5; - if (*rootPath == '/') - { - rootPath++; - } - if (*rootPath != '\0' && !AppendPathComponent(buffer, bufferSize, &pos, rootPath)) - { - return false; - } - } - else if (!CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, rootPath)) + // The configuring host is responsible for supplying a fully-resolved + // absolute path; the runtime does not expand a leading '~' or environment + // variables and rejects anything that is not already absolute. + if (!IsAbsolutePath(rootPath)) { return false; } - return IsAbsolutePath(buffer); + buffer[0] = '\0'; + size_t pos = 0; + return CrashReportStringUtils::AppendString(buffer, bufferSize, &pos, rootPath); } bool @@ -553,10 +411,10 @@ InProcCrashReportLifecycle::EnsureDirectory(const char* path) bool InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) { - // Borrow the temp-report path buffer as scratch for the probe path: it is - // unused until the crash path calls PrepareReportFile, so reusing it here - // keeps the probe off the heap and out of the signal path. - char* probePath = m_tempReportFilePath; + // This runs only on the initialization path, so the probe paths are kept in + // local stack buffers rather than borrowing a member buffer; that keeps the + // probe self-contained and off both the heap and the signal path. + char probePath[CRASHREPORT_PATH_BUFFER_SIZE]; probePath[0] = '\0'; size_t pos = 0; @@ -564,13 +422,16 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) // and delete (rename is the operation FinishReportFile uses to publish). SignalSafeFormatter formatter; bool built = - CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, path) && - CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, "/.probe-") && - CrashReportStringUtils::AppendString(probePath, sizeof(m_tempReportFilePath), &pos, formatter.FormatUnsignedDecimal(static_cast(GetCurrentProcessId()))); + CrashReportStringUtils::AppendString(probePath, sizeof(probePath), &pos, path) && + CrashReportStringUtils::AppendString(probePath, sizeof(probePath), &pos, "/.probe-") && + CrashReportStringUtils::AppendString(probePath, sizeof(probePath), &pos, formatter.FormatUnsignedDecimal(static_cast(GetCurrentProcessId()))); bool writable = false; if (built) { + // Clear any stale probe file so the O_EXCL create below doesn't wrongly report the directory as unwritable. + unlink(probePath); + int fd = open(probePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); if (fd != -1) { @@ -590,8 +451,7 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) if (committedBuilt) { - // Clear any stale probe artifact left by a prior aborted run so the - // rename targets a fresh name rather than silently replacing it. + // Clear any stale committed artifact so the rename targets a fresh name instead of replacing it. unlink(committedPath); } @@ -602,16 +462,12 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) } else { - // The rename probe failed (or the target path did not fit); remove - // the original probe file so no stray artifact is left behind. + // The rename probe failed (or the target path did not fit); remove the probe file so no stray artifact remains. unlink(probePath); } } } - // Leave the borrowed buffer empty so the crash path's "no temp file yet" - // invariant continues to hold after initialization. - m_tempReportFilePath[0] = '\0'; return writable; } @@ -678,7 +534,6 @@ InProcCrashReportLifecycle::TryParseReportName( const char* current = name + prefixLength; uint64_t timestamp = 0; uint64_t pid = 0; - uint64_t suffix = 0; if (!TryParseUnsigned(¤t, ×tamp) || *current != '-') { @@ -691,15 +546,6 @@ InProcCrashReportLifecycle::TryParseReportName( return false; } - if (*current == '-') - { - current++; - if (!TryParseUnsigned(¤t, &suffix)) - { - return false; - } - } - if (strncmp(current, CrashReportFileExtension, extensionLength) != 0) { return false; @@ -720,27 +566,10 @@ InProcCrashReportLifecycle::TryParseReportName( info->timestamp = timestamp; info->pid = pid; - info->suffix = suffix; return true; } -bool -InProcCrashReportLifecycle::IsProcessAlive(uint64_t pid) -{ - if (pid == 0 || pid > static_cast(INT_MAX)) - { - return false; - } - - // Signal 0 probes for process existence without delivering a signal. - if (kill(static_cast(pid), 0) == 0) - { - return true; - } - - return errno != ESRCH; -} - +// Comparator ordering reports oldest-first (timestamp, then path). int InProcCrashReportLifecycle::CompareFileInfo( const void* left, @@ -757,55 +586,5 @@ InProcCrashReportLifecycle::CompareFileInfo( { return 1; } - if (leftInfo->suffix < rightInfo->suffix) - { - return -1; - } - if (leftInfo->suffix > rightInfo->suffix) - { - return 1; - } return strcmp(leftInfo->path.value, rightInfo->path.value); } - -bool -InProcCrashReportLifecycle::IsSafePathCharacter(char c) -{ - return (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - c == '.' || - c == '_' || - c == '-'; -} - -void -InProcCrashReportLifecycle::SanitizePathComponent( - char* buffer, - size_t bufferSize, - const char* value) -{ - if (buffer == nullptr || bufferSize == 0) - { - return; - } - - const char* source = (value != nullptr && value[0] != '\0') ? value : "unknown"; - size_t pos = 0; - while (*source != '\0' && pos + 1 < bufferSize) - { - buffer[pos++] = IsSafePathCharacter(*source) ? *source : '_'; - source++; - } - - if (pos == 0) - { - const char fallback[] = "unknown"; - for (size_t i = 0; fallback[i] != '\0' && pos + 1 < bufferSize; i++) - { - buffer[pos++] = fallback[i]; - } - } - - buffer[pos] = '\0'; -} diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md index 3df66fd2d8963b..957bd8d97af7a3 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md @@ -14,11 +14,11 @@ is constrained, so crash reports that pile up without bound can bloat the device yet there was no durable, app-local place to put them and no mechanism to cap how many are kept. -The reporter could write a file only when `DOTNET_DbgMiniDumpName` was set (writing -`.crashreport.json`), emitting compact logs otherwise. -That path is relatively new and carries no notion of a managed location or of -retaining/pruning past reports; it stays supported here, but it is recent enough -that we could drop it if we chose to. +The lifecycle-managed root (`DOTNET_CrashReportRootPath`) is the **only** way to +enable JSON report files in the in-proc reporter. Without it the reporter still +runs, emitting compact console logs but writing no report file. The feature is +still in development, so it does not carry the out-of-proc `createdump` +`DbgMiniDumpName` template path; that subsystem is separate and untouched. ## Background: how the platforms manage crash artifacts @@ -65,32 +65,32 @@ this app-local directory. | Variable | Meaning | | --- | --- | -| `DOTNET_CrashReportRootPath` | Activates file output. Root under which the managed directory is created. Supports a leading `~` or `$HOME` shorthand expanded to `getenv("HOME")`. Must resolve to an existing, absolute directory or output is disabled. | -| `DOTNET_CrashReportMaxFileCount` | Retention bound. Default **32**; **-1** unlimited; **0** cleanup-only. Parsed as a *signed* value (the usual DWORD config helper is unsigned); out-of-range values fall back to the default. | +| `DOTNET_CrashReportRootPath` | Activates file output. Absolute path to an existing directory under which the managed report directory is created. Must already be absolute — the runtime does **not** expand a leading `~` or environment variables. If it cannot be resolved or made writable, output is disabled and the failure is logged. | +| `DOTNET_CrashReportMaxFileCount` | Retention bound: the maximum number of completed reports to keep. Default **32**. Parsed with `strtol`; values that are non-numeric, have trailing junk, overflow, or fall outside `[1, INT32_MAX]` are rejected and fall back to the default (logged). | -When `CrashReportRootPath` is unset, the legacy `DbgMiniDumpName + -.crashreport.json` behavior is unchanged — the root path supersedes the legacy -path only when set. +On mobile the configured root is expected to live under the app's private +storage (for example `context.getFilesDir()` on Android CoreCLR app hosting, or +the app sandbox home on iOS), which makes the report directory per-app without the +lifecycle needing to encode the app identity in the path. ## On-disk layout and naming ``` -/.dotnet/crash-reports//report--[-].crashreport.json +/.dotnet/crash-reports/report--.crashreport.json ``` -- `` is the sanitized process name (unsafe path characters replaced with - `_`, falling back to `unknown`) — isolates retention per app under a shared root. -- `` (epoch seconds) and `` order reports by intended crash time; - the optional `-` disambiguates same-second/pid collisions. +- Reports are written directly under `/.dotnet/crash-reports/`. There is no + per-app subdirectory: on mobile the root is already app-private, so adding an + app-name segment (and sanitizing it) bought nothing. +- `` (nanoseconds from `clock_gettime(CLOCK_REALTIME)`) and `` + order reports by intended crash time. Nanosecond resolution makes names unique + without a suffix-retry loop — even back-to-back crashes in the same process get + distinct names — so there is no `-` component. - The `.crashreport.json` extension is the canonical one already emitted by `createdump` and the existing reporter; external tooling recognizes it. The `report-` prefix is this feature's marker/parse anchor. - In-progress reports use a `…/report-….crashreport.json.tmp` temp file. -On mobile, `HOME` is app-scoped (`context.getFilesDir()` on Android CoreCLR app -hosting; the app sandbox home on iOS), so `$HOME/.dotnet/crash-reports` is -per-app, not shared across apps. - ## Two execution contexts The class deliberately spans two contexts; every member documents which it obeys. @@ -98,78 +98,98 @@ The class deliberately spans two contexts; every member documents which it obeys **Initialization path (`Initialize` and helpers).** Runs once at startup. May allocate and call libc/filesystem APIs. Responsibilities: -1. Resolve/validate the root, create `.dotnet/crash-reports/` (each - `mkdir` tolerates `EEXIST`; the final node is verified to be a directory). -2. Verify writability with a create+close+unlink probe file — so the crash path - never discovers an unwritable directory. -3. Scan the directory (capped at a bounded entry count to avoid startup hangs): - - delete stale `.tmp` files whose owning pid is no longer alive - (`kill(pid,0)`), so we never unlink another live process's active temp; - - in cleanup-only mode, delete every completed report and leave output - disabled; - - otherwise collect completed reports, sort oldest-first, and **precompute** - the over-retention deletion candidates into fixed buffers. +1. Resolve/validate the root (require an absolute path), create + `.dotnet/crash-reports` (each `mkdir` tolerates `EEXIST`; the final node is + verified to be a directory). +2. Verify the directory permits create/rename/delete with a hidden probe file + (`ProbeDirectoryWritable`), so the crash path never discovers an unusable + directory. The probe paths live in local stack buffers because this runs only + at init. +3. Scan the directory once (`PruneExistingReports`): + - delete any leftover `.tmp` files unconditionally — a stray temp is from a + previous, now-defunct run of this app (the directory is app-private and the + writer renames its temp to the final name before returning), so no + process-liveness probe is needed; + - retain only the newest `maxFileCount` completed reports, unlinking older ones + inline as it scans (a fixed `FileInfo[maxFileCount]` buffer allocated with + `new(std::nothrow)`; OOM disables output gracefully); + - if the directory is already at the bound, cache the single oldest retained + report into `m_cachedOldestReport` so the crash path can unlink it before + publishing a new one. **Crash/signal path (`PrepareReportFile`, `FinishReportFile`, and helpers).** Invoked from the signal handler; must be async-signal-safe and allocation-free: -- build the temp + final paths into caller-provided fixed buffers (signal-safe - integer formatting, no `snprintf`/`std::string`); +- read a nanosecond timestamp via `clock_gettime(CLOCK_REALTIME)` (POSIX + async-signal-safe; a failed read degrades to a zero timestamp rather than + aborting the write) and build the temp + final paths into caller-provided fixed + buffers (signal-safe integer formatting, no `snprintf`/`std::string`); +- unlink the cached over-retention report, if one was selected at init + (`DeleteCachedReport`), so retention stays at the bound; - `open` the temp with `O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC` (exclusive create so a collision never truncates an existing report; `O_CLOEXEC` avoids fd leaks); -- on success `rename` the temp to the final name; the old precomputed candidates - are unlinked separately. `rename` is a same-directory atomic commit and, unlike - `link`, is permitted in the Android/iOS app-private storage sandboxes (hard - links there are rejected with `EPERM`, which would silently lose every report). - A best-effort `access(final, F_OK)` check before the rename preserves the - exclusive-create intent. Deletion is *intentional only*: completed reports are - removed by retention, never by the write path overwriting. - -Because the final report is a *new* file (not a replacement), there can briefly be -`N+1` completed reports between commit and candidate deletion. +- on success `rename` the temp to the final name. `rename` is a same-directory + atomic commit and, unlike `link`, is permitted in the Android/iOS app-private + storage sandboxes (hard links there are rejected with `EPERM`, which would + silently lose every report). Deletion is *intentional only*: completed reports + are removed by retention, never by the write path overwriting. ## Retention model -`GetRetentionMode` maps `maxFileCount` to one mode once, so the scan does not -re-test sentinels per entry: +There is a single bounded retention mode: keep at most `maxFileCount` completed +reports. The configuration layer guarantees `maxFileCount >= 1`, so the scan does +not test sentinels per entry. -- **Unlimited (`-1`):** keep every completed report; still clean stale temps. -- **CleanupOnly (`0`):** delete all completed reports + stale temps, then disable - new writes for the process lifetime. -- **Bounded (`>0`):** keep at most `maxFileCount`; precompute deletions beyond - `maxFileCount - 1`, reserving one slot for the imminent report. +- At init, `PruneExistingReports` keeps the newest `min(total, maxFileCount)` + completed reports and unlinks the rest. If the kept set is full + (`keptCount == maxFileCount`), it caches the single oldest kept report. +- On a crash, the write path unlinks that cached oldest report (if any) before + publishing the new one, so the directory stays at the bound. When the directory + was below the bound, nothing is cached and the crash path performs no deletion — + the new report simply fills an open slot. -Candidates are precomputed at init (not enumerated in the signal handler) because -directory enumeration and sorting are not signal-safe. +This means the configured bound is enforced at startup regardless of whether a +crash follows: lowering `maxFileCount` between runs prunes the excess on the next +init, not only on the next crash. Directory enumeration and sorting happen at init +(not in the signal handler) because they are not signal-safe; only the single +precomputed unlink runs on the crash path. ## Key decisions and rationale -- **Dedicated app-local directory instead of inferring from `DbgMiniDumpName`.** - A managed directory gives a clear ownership boundary for retention. +- **Lifecycle-managed root is the only file sink.** A managed directory gives a + clear ownership boundary for retention; dropping the `DbgMiniDumpName` template + path (which the in-proc reporter never relied on) removed template-expansion and + hostname-caching logic the lifecycle does not need. - **Opt-in via explicit root.** A default directory + automatic file creation would implicitly turn on persistent files; we avoid that. -- **Timestamp filenames over Android-style fixed slots + `mtime`.** Robust to - copy/touch/restore that would scramble `mtime`; ordering comes from the name. -- **Exclusive create (no `O_TRUNC`).** A PID/timestamp collision, stale file, or +- **Absolute root only, no `~`/`$HOME` expansion.** On mobile the integration + layer supplies a concrete app-private path; doing shell-style expansion inside + the runtime would add ambiguity for no real caller benefit. +- **Flattened layout (no per-app subdirectory).** The root is already app-private + on mobile, so an app-name segment — and the path-sanitization it required — + added complexity without isolating anything. +- **Nanosecond timestamp filenames over fixed slots + `mtime`, and no suffix + retry.** Robust to copy/touch/restore that would scramble `mtime`; ordering + comes from the name, and nanosecond resolution makes collisions vanishingly + unlikely, so no disambiguating suffix or open-retry loop is needed. +- **Exclusive create (no `O_TRUNC`).** A timestamp/pid collision, stale file, or test mistake must never silently erase a completed report. -- **Init-time candidate caching in fixed buffers.** Keeps the crash path - signal-safe and free of malloc/heap state that may be corrupted at crash time. +- **Init-time pruning + single cached oldest in fixed buffers.** Keeps the crash + path signal-safe and free of malloc/heap state that may be corrupted at crash + time, while still applying the bound eagerly at startup. - **Same-directory `rename` commit (not `link` + `unlink`).** `rename` is POSIX async-signal-safe (per `signal-safety(7)`) and is the portable atomic-publish primitive. Hard links via `link` are rejected with `EPERM` in the Android and Apple mobile app-private storage sandboxes, which would silently lose every - report — the precise failure this feature exists to prevent. A best-effort - `access(final, F_OK)` guard keeps the exclusive-create intent; the final name is - collision-resistant by construction, so the residual TOCTOU window is benign. -- **pid in temp name + liveness-gated stale cleanup.** Lets multiple processes - share a root without deleting each other's in-flight temp files. + report — the precise failure this feature exists to prevent. +- **Unconditional stale-temp cleanup.** Because each app owns its report + directory under private storage, a leftover `.tmp` can only come from a defunct + run of the same app, so it is removed without a `kill(pid, 0)` liveness probe. ## Cross-process considerations -Retention is best-effort when several processes share one root: each caches its -own delete candidates at init, so the directory may transiently hold more than -`maxFileCount` reports. The app-name subdirectory and pid-tagged temps prevent the -dangerous cases (deleting a live writer's temp, or one app pruning another's -reports). Strict global retention would require cross-process locking, which is a -non-goal here. - +Retention is best-effort if several processes were ever to share one root: each +caches its own oldest-report deletion candidate at init, so the directory may +transiently hold more than `maxFileCount` reports. Strict global retention would +require cross-process locking, which is a non-goal here. In the intended mobile +deployment the root is app-private, so this is not a concern in practice. diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h index a515be170cc086..749641793de3ce 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h @@ -10,10 +10,10 @@ #include // Manages the on-disk lifecycle of in-proc crash reports: at startup it -// establishes the managed report directory, prunes stale and over-retention -// reports, and selects deletion candidates; on a crash it hands out a uniquely -// named temp report file and finalizes it. Composed alongside (not derived from) -// the signal-safe writer family that emits the report contents. +// establishes the managed report directory and prunes stale and over-retention +// reports; on a crash it hands out a uniquely named temp report file and +// finalizes it. Composed alongside (not derived from) the signal-safe writer +// family that emits the report contents. // // Members run in one of two execution contexts: // * Initialization path -- runs once at process startup. May allocate and call @@ -22,35 +22,29 @@ // * Crash/signal path -- invoked from the crash signal handler. Must be // async-signal-safe and allocation-free. Only these members run there: // IsReportFileOutputEnabled, PrepareReportFile, FinishReportFile, -// BuildReportPaths, DeleteCandidates, and the shared AppendPathComponent. +// BuildReportPaths, DeleteCachedReport, and the shared AppendPathComponent. // Each is marked accordingly below. class InProcCrashReportLifecycle { public: InProcCrashReportLifecycle() = default; - ~InProcCrashReportLifecycle(); InProcCrashReportLifecycle(const InProcCrashReportLifecycle&) = delete; InProcCrashReportLifecycle& operator=(const InProcCrashReportLifecycle&) = delete; - // Prepares lifecycle-managed storage for crash reports under rootPath for the - // given processName, keeping at most maxFileCount completed reports - // (CRASHREPORT_UNLIMITED_FILE_COUNT for no limit, - // CRASHREPORT_CLEANUP_ONLY_FILE_COUNT for cleanup-only). Runs at startup (not - // the crash path) and may allocate. Returns false if storage could not be - // initialized or output is disabled. + // Prepares lifecycle-managed storage for crash reports under rootPath, + // keeping at most maxFileCount completed reports. Runs at startup (not the + // crash path) and may allocate. Returns false if storage could not be + // initialized. bool Initialize( const char* rootPath, - const char* processName, int32_t maxFileCount); // Returns whether lifecycle-managed report files should be written. False when - // initialization failed or when output is intentionally disabled (for example - // the cleanup-only mode, maxFileCount == CRASHREPORT_CLEANUP_ONLY_FILE_COUNT, - // which still initializes successfully). Crash/signal-path safe: reads one bool. + // initialization failed. Crash/signal-path safe: reads one bool. bool IsReportFileOutputEnabled() const { return m_reportFileOutputEnabled; } // Opens a uniquely named temporary report file under the managed directory, - // returning its path and an open fd. Deletes any over-retention candidates + // returning its path and an open fd. Deletes the cached over-retention report // first. Runs on the crash/signal path: allocation-free and signal-safe. // Returns false if no file could be opened. bool PrepareReportFile( @@ -67,24 +61,6 @@ class InProcCrashReportLifecycle const char* reportFilePath); private: - // How existing and future reports are retained, derived once from the - // configured maxFileCount so the scan does not re-test sentinels per entry. - enum class RetentionMode - { - CleanupOnly, // delete every completed report and leave output disabled - Unlimited, // keep every completed report; never prune - Bounded, // keep at most maxFileCount completed reports - }; - - // Outcome of CollectExistingReports, distinguishing a hard failure from the - // intentional cleanup-only path (both leave report output disabled). - enum class CollectResult - { - Failed, - CleanupOnlyComplete, - Ready, - }; - struct ReportPath { char value[CRASHREPORT_PATH_BUFFER_SIZE]; @@ -94,40 +70,28 @@ class InProcCrashReportLifecycle { uint64_t timestamp; uint64_t pid; - uint64_t suffix; ReportPath path; }; - // Maps the configured maxFileCount onto the retention mode that governs how - // the existing-report scan and pruning behave. - static RetentionMode GetRetentionMode(int32_t maxFileCount); - - // Resolves rootPath/processName into m_reportDirectory, creating the managed - // directory tree and verifying it is writable. Initialization path; logs and + // Resolves rootPath into m_reportDirectory, creating the managed directory + // tree and verifying it is writable. Initialization path; logs and // returns false on failure. bool EstablishReportDirectory( - const char* rootPath, - const char* processName); - - // Scans m_reportDirectory: removes stale temp files and, in CleanupOnly mode, - // every completed report; otherwise collects completed reports into a newly - // allocated array transferred to the caller on CollectResult::Ready. - // Initialization path; may allocate. - CollectResult CollectExistingReports( - RetentionMode mode, - FileInfo** reports, - size_t* reportCount); - - // Sorts the collected reports oldest-first and records the over-retention - // entries (beyond maxFileCount - 1, reserving one slot for the imminent - // report) into m_deleteCandidates. Initialization path; logs and returns - // false if the candidate storage allocation fails. - bool SelectDeleteCandidates( - FileInfo* reports, - size_t reportCount, - int32_t maxFileCount); + const char* rootPath); + + // Scans m_reportDirectory, removing stale temp files and retaining only the + // newest maxFileCount completed reports (unlinking older ones inline). When + // the directory is already at the bound, caches the oldest retained report so + // the crash path can unlink it before publishing a new one. Initialization + // path; may allocate. Logs and returns false on failure. + bool PruneExistingReports(int32_t maxFileCount); - // Builds the final and temporary report paths from the timestamp/pid/suffix + // Returns the index of the oldest report in reports per CompareFileInfo. + static size_t FindOldestReportIndex( + const FileInfo* reports, + size_t reportCount); + + // Builds the final and temporary report paths from the timestamp/pid // into the caller-provided buffers. Crash-path, allocation-free. bool BuildReportPaths( SignalSafeFormatter* formatter, @@ -136,11 +100,11 @@ class InProcCrashReportLifecycle char* tempReportFilePath, size_t tempReportFilePathSize, uint64_t timestamp, - uint32_t pid, - uint32_t suffix); + uint32_t pid); - // Unlinks the over-retention reports selected during Initialize. Crash-path. - void DeleteCandidates(); + // Unlinks the cached over-retention report selected during Initialize, if any. + // Crash-path. + void DeleteCachedReport(); // Appends component as a new path segment (inserting a single '/' separator as // needed). Allocation-free; used by both the init and crash paths. @@ -153,7 +117,8 @@ class InProcCrashReportLifecycle // Returns whether path is absolute (begins with '/'). Initialization path. static bool IsAbsolutePath(const char* path); - // Resolves rootPath (expanding a leading '~' or '$HOME') into an absolute path. + // Copies rootPath into buffer, requiring it to already be an absolute path; + // the runtime does not expand a leading '~' or environment variables. static bool ResolveRootPath( char* buffer, size_t bufferSize, @@ -164,11 +129,11 @@ class InProcCrashReportLifecycle // Verifies the directory permits create, rename, and delete by exercising a // hidden probe file (rename is the primitive FinishReportFile uses to publish - // a completed report). Borrows the (init-time idle) m_tempReportFilePath buffer - // for the probe path and uses a transient stack buffer for the rename target. + // a completed report). Runs only at initialization, so the probe paths live in + // local stack buffers. bool ProbeDirectoryWritable(const char* path); - // Parses a managed report file name (report--[-]), + // Parses a managed report file name (report--), // accepting either a completed report or the in-progress .tmp form, into info. // Reports which form matched through isTempExtension. static bool TryParseReportName( @@ -176,27 +141,13 @@ class InProcCrashReportLifecycle FileInfo* info, bool* isTempExtension); - // Returns whether a process with the given pid currently exists. - static bool IsProcessAlive(uint64_t pid); - - // qsort comparator ordering reports oldest-first (timestamp, then suffix, then path). + // Comparator ordering reports oldest-first (timestamp, then path). static int CompareFileInfo( const void* left, const void* right); - // Returns whether c is allowed unescaped in a path component. - static bool IsSafePathCharacter(char c); - - // Copies value into buffer, replacing unsafe characters with '_' and falling - // back to "unknown" when the result would be empty. - static void SanitizePathComponent( - char* buffer, - size_t bufferSize, - const char* value); - char m_reportDirectory[CRASHREPORT_PATH_BUFFER_SIZE] = {}; char m_tempReportFilePath[CRASHREPORT_PATH_BUFFER_SIZE] = {}; - ReportPath* m_deleteCandidates = nullptr; - size_t m_deleteCandidateCount = 0; + ReportPath m_cachedOldestReport = {}; bool m_reportFileOutputEnabled = false; }; diff --git a/src/coreclr/vm/crashreportstackwalker.cpp b/src/coreclr/vm/crashreportstackwalker.cpp index 28e21804287f84..a071c3cab327b8 100644 --- a/src/coreclr/vm/crashreportstackwalker.cpp +++ b/src/coreclr/vm/crashreportstackwalker.cpp @@ -590,9 +590,9 @@ CrashReportEnumerateThreads( } } -// This runs during configuration, not from the crash signal path. It uses -// strtol because TryAsInteger parses into DWORD via strtoul and cannot -// represent the -1 unlimited-retention sentinel for this lifecycle setting. +// This runs during configuration, not from the crash signal path. maxFileCount +// is a positive retention bound; values outside [1, INT32_MAX] (including the +// non-positive and overflowing ones) fall back to the default. static int32_t GetCrashReportMaxFileCount() @@ -615,7 +615,7 @@ GetCrashReportMaxFileCount() char* end = nullptr; long configuredMaxFileCount = strtol(maxFileCountString, &end, 10); if (end != maxFileCountString && *end == '\0' && errno != ERANGE && - configuredMaxFileCount >= CRASHREPORT_UNLIMITED_FILE_COUNT && configuredMaxFileCount <= INT32_MAX) + configuredMaxFileCount >= 1 && configuredMaxFileCount <= INT32_MAX) { maxFileCount = static_cast(configuredMaxFileCount); } @@ -662,11 +662,6 @@ CrashReportConfigure() settings.lifecycleRootPath = crashReportRootPath; settings.maxFileCount = GetCrashReportMaxFileCount(); } - else - { - CLRConfigNoCache dmpNameCfg = CLRConfigNoCache::Get("DbgMiniDumpName", /*noprefix*/ false, &getenv); - settings.reportPath = dmpNameCfg.IsSet() ? dmpNameCfg.AsString() : nullptr; - } settings.isManagedThreadCallback = CrashReportIsCurrentThreadManaged; settings.walkStackCallback = CrashReportWalkStack; From 20590ec2eb752bd4d5be8bdbf3fc5fe7736e5c0f Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Tue, 2 Jun 2026 17:07:47 -0400 Subject: [PATCH 07/10] Polish in-proc crash report lifecycle per review feedback - Rename settings field lifecycleRootPath -> reportRootPath for clarity and consistency with the CrashReportRootPath configuration knob. - Inline the single-use DeleteCachedReport helper into the crash path. - Document why retention pruning uses a linear oldest-scan rather than a min-heap. - Document why TryParseUnsigned is used instead of strtoul/strtoull. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../debug/crashreport/inproccrashreporter.cpp | 4 +- .../debug/crashreport/inproccrashreporter.h | 2 +- .../inproccrashreportlifecycle.cpp | 47 +++++++++---------- .../crashreport/inproccrashreportlifecycle.h | 6 +-- src/coreclr/vm/crashreportstackwalker.cpp | 6 +-- 5 files changed, 30 insertions(+), 35 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.cpp b/src/coreclr/debug/crashreport/inproccrashreporter.cpp index 6c23c114dd9955..c3765b98e43b11 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreporter.cpp @@ -745,9 +745,9 @@ InProcCrashReporter::Initialize( // File output is produced only through the lifecycle-managed report // directory. When no root is configured the reporter still runs, emitting // compact console logs without writing a JSON report file. - if (settings.lifecycleRootPath != nullptr && settings.lifecycleRootPath[0] != '\0') + if (settings.reportRootPath != nullptr && settings.reportRootPath[0] != '\0') { - m_lifecycle.Initialize(settings.lifecycleRootPath, settings.maxFileCount); + m_lifecycle.Initialize(settings.reportRootPath, settings.maxFileCount); } #if defined(TARGET_IOS) || defined(TARGET_TVOS) || defined(TARGET_MACCATALYST) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.h b/src/coreclr/debug/crashreport/inproccrashreporter.h index f267f6a9ab4ff4..90f54a1e4ad367 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.h +++ b/src/coreclr/debug/crashreport/inproccrashreporter.h @@ -72,7 +72,7 @@ using InProcCrashReportModuleInfoCallback = bool (*)( struct InProcCrashReporterSettings { - const char* lifecycleRootPath; + const char* reportRootPath; int timeoutSeconds; InProcCrashReportIsManagedThreadCallback isManagedThreadCallback; InProcCrashReportWalkStackCallback walkStackCallback; diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index 141019c876794b..bc138d7dac0f8a 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -172,7 +172,13 @@ InProcCrashReportLifecycle::PruneExistingReports(int32_t maxFileCount) } // The kept set is full, so this entry competes with the current oldest - // kept report: unlink the older of the two and keep the newer. + // kept report: unlink the older of the two and keep the newer. A linear + // FindOldestReportIndex scan is used rather than a timestamp-ordered heap: + // the array holds at most maxFileCount entries (a small bound), this runs + // once at init, and the directory is pruned to the bound on every init so + // the scanned count stays near the bound in steady state. A min-heap would + // perform better but adds an dependency and heap bookkeeping + // for no measurable gain here. size_t oldestIndex = FindOldestReportIndex(kept, keptCount); if (CompareFileInfo(&kept[oldestIndex], &info) < 0) { @@ -231,7 +237,10 @@ InProcCrashReportLifecycle::PrepareReportFile( // Delete the cached over-retention report (if any) before opening the temp // file, freeing a slot so the completed set stays at the bound. A later write // failure intentionally does not restore it. - DeleteCachedReport(); + if (m_cachedOldestReport.value[0] != '\0') + { + unlink(m_cachedOldestReport.value); + } if (!BuildReportPaths(formatter, reportFilePath, reportFilePathSize, m_tempReportFilePath, sizeof(m_tempReportFilePath), timestampNs, pid)) { @@ -314,15 +323,6 @@ InProcCrashReportLifecycle::BuildReportPaths( CrashReportStringUtils::AppendString(tempReportFilePath, tempReportFilePathSize, &tempPos, CrashReportTempExtension); } -void -InProcCrashReportLifecycle::DeleteCachedReport() -{ - if (m_cachedOldestReport.value[0] != '\0') - { - unlink(m_cachedOldestReport.value); - } -} - bool InProcCrashReportLifecycle::AppendPathComponent( char* buffer, @@ -471,6 +471,10 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) return writable; } +// Parses a single unsigned filename component, advancing *value past the digits +// consumed. strtoull skips leading whitespace and accepts a leading '+'/'-' sign, +// so a leading-digit guard is kept up front to stop malformed names from matching +// the report-- pattern. static bool TryParseUnsigned( const char** value, uint64_t* result) @@ -481,21 +485,16 @@ static bool TryParseUnsigned( return false; } - uint64_t parsed = 0; - do + errno = 0; + char* end = nullptr; + unsigned long long parsed = strtoull(current, &end, 10); + if (errno == ERANGE) { - uint64_t digit = static_cast(*current - '0'); - if (parsed > (UINT64_MAX - digit) / 10) - { - return false; - } - - parsed = parsed * 10 + digit; - current++; - } while (*current >= '0' && *current <= '9'); + return false; + } - *value = current; - *result = parsed; + *value = end; + *result = static_cast(parsed); return true; } diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h index 749641793de3ce..84d92c8b5dba54 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.h @@ -22,7 +22,7 @@ // * Crash/signal path -- invoked from the crash signal handler. Must be // async-signal-safe and allocation-free. Only these members run there: // IsReportFileOutputEnabled, PrepareReportFile, FinishReportFile, -// BuildReportPaths, DeleteCachedReport, and the shared AppendPathComponent. +// BuildReportPaths, and the shared AppendPathComponent. // Each is marked accordingly below. class InProcCrashReportLifecycle { @@ -102,10 +102,6 @@ class InProcCrashReportLifecycle uint64_t timestamp, uint32_t pid); - // Unlinks the cached over-retention report selected during Initialize, if any. - // Crash-path. - void DeleteCachedReport(); - // Appends component as a new path segment (inserting a single '/' separator as // needed). Allocation-free; used by both the init and crash paths. static bool AppendPathComponent( diff --git a/src/coreclr/vm/crashreportstackwalker.cpp b/src/coreclr/vm/crashreportstackwalker.cpp index e5bd1ce67ffac1..1f7780ddf066c9 100644 --- a/src/coreclr/vm/crashreportstackwalker.cpp +++ b/src/coreclr/vm/crashreportstackwalker.cpp @@ -711,12 +711,12 @@ CrashReportConfigure() CLRConfigNoCache crashReportRootPathCfg = CLRConfigNoCache::Get("CrashReportRootPath", /*noprefix*/ false, &getenv); const char* crashReportRootPath = crashReportRootPathCfg.IsSet() ? crashReportRootPathCfg.AsString() : nullptr; - bool lifecycleRootConfigured = crashReportRootPath != nullptr && crashReportRootPath[0] != '\0'; + bool rootConfigured = crashReportRootPath != nullptr && crashReportRootPath[0] != '\0'; InProcCrashReporterSettings settings = {}; - if (lifecycleRootConfigured) + if (rootConfigured) { - settings.lifecycleRootPath = crashReportRootPath; + settings.reportRootPath = crashReportRootPath; settings.maxFileCount = GetCrashReportMaxFileCount(); } From f921471e26c4bd59800586dd71bb9a464b704512 Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Tue, 2 Jun 2026 18:07:21 -0400 Subject: [PATCH 08/10] Address Copilot review feedback on in-proc crash report lifecycle - Remove duplicate (crashreportstackwalker.cpp) and (inproccrashreporter.cpp) includes. - Correct the CrashReportMaxFileCount config description to a positive integer bound; the -1/0 modes were dropped during the earlier simplification. - Inline strtoull/strtoul in TryParseReportName and remove the TryParseUnsigned helper. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../debug/crashreport/inproccrashreporter.cpp | 1 - .../inproccrashreportlifecycle.cpp | 42 +++++-------------- src/coreclr/inc/clrconfigvalues.h | 2 +- src/coreclr/vm/crashreportstackwalker.cpp | 1 - 4 files changed, 11 insertions(+), 35 deletions(-) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.cpp b/src/coreclr/debug/crashreport/inproccrashreporter.cpp index c3765b98e43b11..09ec9ddfc12607 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreporter.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index bc138d7dac0f8a..f72f3301161396 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -471,33 +471,6 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) return writable; } -// Parses a single unsigned filename component, advancing *value past the digits -// consumed. strtoull skips leading whitespace and accepts a leading '+'/'-' sign, -// so a leading-digit guard is kept up front to stop malformed names from matching -// the report-- pattern. -static bool TryParseUnsigned( - const char** value, - uint64_t* result) -{ - const char* current = *value; - if (current == nullptr || *current < '0' || *current > '9') - { - return false; - } - - errno = 0; - char* end = nullptr; - unsigned long long parsed = strtoull(current, &end, 10); - if (errno == ERANGE) - { - return false; - } - - *value = end; - *result = static_cast(parsed); - return true; -} - bool InProcCrashReportLifecycle::TryParseReportName( const char* name, @@ -531,19 +504,24 @@ InProcCrashReportLifecycle::TryParseReportName( } const char* current = name + prefixLength; - uint64_t timestamp = 0; - uint64_t pid = 0; - if (!TryParseUnsigned(¤t, ×tamp) || *current != '-') + // The timestamp and pid are written by this process as plain decimal digits, + // so parse them directly with strtoull/strtoul. end == current means no digits + // were consumed, which rejects an empty timestamp or pid component. + char* end = nullptr; + uint64_t timestamp = strtoull(current, &end, 10); + if (end == current || *end != '-') { return false; } - current++; + current = end + 1; - if (!TryParseUnsigned(¤t, &pid)) + uint64_t pid = strtoul(current, &end, 10); + if (end == current) { return false; } + current = end; if (strncmp(current, CrashReportFileExtension, extensionLength) != 0) { diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 9a26dc367ec17e..16ceb4aa173842 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -579,7 +579,7 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_CreateDumpDiagnostics, W("CreateDumpDiagnostic RETAIL_CONFIG_DWORD_INFO(INTERNAL_CrashReportBeforeSignalChaining, W("CrashReportBeforeSignalChaining"), 0, "Enable crash report generation before chaining to previous signal handler") RETAIL_CONFIG_DWORD_INFO_EX(INTERNAL_CrashReportFrameLimitPerThread, W("CrashReportFrameLimitPerThread"), 32, "Maximum number of managed stack frames per thread to emit in the in-proc crash report's compact log; 0 disables the limit; remaining frames are summarized as '... +N more frames'", CLRConfig::LookupOptions::ParseIntegerAsBase10) RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportRootPath, W("CrashReportRootPath"), "Root path for lifecycle-managed in-proc crash report JSON files") -RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportMaxFileCount, W("CrashReportMaxFileCount"), "Maximum number of lifecycle-managed in-proc crash report JSON files to retain; default 32, -1 unlimited, 0 cleanup-only") +RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportMaxFileCount, W("CrashReportMaxFileCount"), "Maximum number of lifecycle-managed in-proc crash report JSON files to retain (positive integer); default 32") /// /// R2R diff --git a/src/coreclr/vm/crashreportstackwalker.cpp b/src/coreclr/vm/crashreportstackwalker.cpp index 1f7780ddf066c9..48aa551f572a10 100644 --- a/src/coreclr/vm/crashreportstackwalker.cpp +++ b/src/coreclr/vm/crashreportstackwalker.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include From e8f163c35e3116a2c9fb7bd38967df1a1eefa77b Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Wed, 3 Jun 2026 13:08:55 -0400 Subject: [PATCH 09/10] Reduce copies and relax file-create flags in crash report pruning Address reviewer feedback on the in-proc crash report lifecycle init path: - Value-initialize the kept-report array so any never-populated slots stay well-defined (hygiene; the slots were never read uninitialized). - Populate each kept slot directly from the local full path instead of copying into a temporary FileInfo and then assigning the whole nested struct, removing a redundant ~1KB path copy per retained report. The full-set overflow comparison is inlined (timestamp, then path) to avoid materializing a candidate FileInfo. - Use O_TRUNC instead of O_CREAT|O_EXCL for the throwaway temp report file and the directory-writability probe. Neither needs the never-overwrite guarantee, so O_TRUNC tolerates a stale leftover while still failing on a genuinely unwritable directory. This also lets the probe drop its preceding unlink of a stale probe file. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../inproccrashreportlifecycle.cpp | 26 +-- .../inproccrashreportlifecycle.design.md | 195 ------------------ 2 files changed, 14 insertions(+), 207 deletions(-) delete mode 100644 src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp index f72f3301161396..31159e05f66062 100644 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.cpp @@ -122,7 +122,7 @@ InProcCrashReportLifecycle::PruneExistingReports(int32_t maxFileCount) // overflow reports are unlinked inline as they are encountered. maxFileCount // is guaranteed positive by the configuration layer. size_t capacity = static_cast(maxFileCount); - FileInfo* kept = new (std::nothrow) FileInfo[capacity]; + FileInfo* kept = new (std::nothrow) FileInfo[capacity](); if (kept == nullptr) { closedir(dir); @@ -163,11 +163,12 @@ InProcCrashReportLifecycle::PruneExistingReports(int32_t maxFileCount) continue; } - CrashReportStringUtils::CopyString(info.path.value, sizeof(info.path.value), fullPath); - if (keptCount < capacity) { - kept[keptCount++] = info; + kept[keptCount].timestamp = info.timestamp; + kept[keptCount].pid = info.pid; + CrashReportStringUtils::CopyString(kept[keptCount].path.value, sizeof(kept[keptCount].path.value), fullPath); + keptCount++; continue; } @@ -180,14 +181,18 @@ InProcCrashReportLifecycle::PruneExistingReports(int32_t maxFileCount) // perform better but adds an dependency and heap bookkeeping // for no measurable gain here. size_t oldestIndex = FindOldestReportIndex(kept, keptCount); - if (CompareFileInfo(&kept[oldestIndex], &info) < 0) + + if (kept[oldestIndex].timestamp < info.timestamp || + (kept[oldestIndex].timestamp == info.timestamp && strcmp(kept[oldestIndex].path.value, fullPath) < 0)) { unlink(kept[oldestIndex].path.value); - kept[oldestIndex] = info; + kept[oldestIndex].timestamp = info.timestamp; + kept[oldestIndex].pid = info.pid; + CrashReportStringUtils::CopyString(kept[oldestIndex].path.value, sizeof(kept[oldestIndex].path.value), fullPath); } else { - unlink(info.path.value); + unlink(fullPath); } } @@ -247,7 +252,7 @@ InProcCrashReportLifecycle::PrepareReportFile( return false; } - int tempFd = open(m_tempReportFilePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); + int tempFd = open(m_tempReportFilePath, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600); if (tempFd == -1) { reportFilePath[0] = '\0'; @@ -429,10 +434,7 @@ InProcCrashReportLifecycle::ProbeDirectoryWritable(const char* path) bool writable = false; if (built) { - // Clear any stale probe file so the O_EXCL create below doesn't wrongly report the directory as unwritable. - unlink(probePath); - - int fd = open(probePath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0600); + int fd = open(probePath, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600); if (fd != -1) { bool closeSucceeded = close(fd) == 0; diff --git a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md b/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md deleted file mode 100644 index 957bd8d97af7a3..00000000000000 --- a/src/coreclr/debug/crashreport/inproccrashreportlifecycle.design.md +++ /dev/null @@ -1,195 +0,0 @@ -# In-proc crash report lifecycle — design notes - -Design rationale for `InProcCrashReportLifecycle` (`inproccrashreportlifecycle.h` / -`inproccrashreportlifecycle.cpp`). It records *why* the lifecycle behaves the way -it does, derived from an investigation of how Android and iOS manage their own -crash artifacts and a brainstorming/grill-me design session. It is documentation, -not a spec — the code is the source of truth. - -## Problem - -The in-proc crash reporter (`InProcCrashReporter`) emits a createdump-shaped -`*.crashreport.json` payload from the crash signal handler. On mobile, disk space -is constrained, so crash reports that pile up without bound can bloat the device — -yet there was no durable, app-local place to put them and no mechanism to cap how -many are kept. - -The lifecycle-managed root (`DOTNET_CrashReportRootPath`) is the **only** way to -enable JSON report files in the in-proc reporter. Without it the reporter still -runs, emitting compact console logs but writing no report file. The feature is -still in development, so it does not carry the out-of-proc `createdump` -`DbgMiniDumpName` template path; that subsystem is separate and untouched. - -## Background: how the platforms manage crash artifacts - -**Android (tombstones).** The OS keeps a *bounded rotating set* of tombstone -files. `tombstoned` writes completed dumps to `/data/tombstones/tombstone_00`, -`tombstone_01`, … The default maximum is **32** -(`tombstoned.max_tombstone_count`). It picks a missing slot first, otherwise the -oldest slot, then advances modulo the max. Dumps are first written to anonymous -temp files and only **linked** into the final `tombstone_NN` path once the dumper -reports completion, so a failed/timed-out dump never leaves a named partial -tombstone. Because filenames are fixed slots, age is tracked by `mtime`. The -breadcrumb tying a crash to its file is the logcat line -`Tombstone written to: /data/tombstones/tombstone_06`. (See AOSP `debuggerd`/ -`tombstoned` and the Android debugging docs for the authoritative mechanics.) - -**iOS / Apple.** Crash reports are OS-managed diagnostic logs (`.ips`/`.crash`), -not a public circular tombstone directory. The system generates a report after -the app is allowed to finish crashing (an attached Xcode debugger intercepts the -crash until you detach). Reports live in the device's Analytics Data and are -surfaced/transferred via Xcode/Console, TestFlight, and the App Store. Retention -and pruning are Apple/OS-managed; there is **no** app-controlled, Android-style -fixed-slot rotation contract. (See Apple's developer documentation on acquiring -crash reports and diagnostic logs.) - -**Takeaways that shaped this design.** Borrow Android's robust mechanics — -default of 32, temp-file-then-commit, bounded retention — but improve on the -fixed-slot model by encoding the crash time in the filename (so ordering does not -depend on fragile `mtime`), and accept that on iOS the runtime, not the OS, owns -this app-local directory. - -## Goals and non-goals - -- **Goal:** a durable, bounded, app-local directory of completed crash reports, - written safely from the crash signal handler. -- **Goal:** opt-in. No file output unless explicitly configured, so we never - silently start persisting files to a device. -- **Goal:** fail loud and disabled on misconfiguration rather than guessing. -- **Non-goal:** cross-process coordination/locking. Retention is best-effort when - multiple processes share a root (see *Cross-process*). -- **Non-goal:** replacing the OS crash facilities; this is the runtime's own JSON - report alongside them. - -## Configuration - -| Variable | Meaning | -| --- | --- | -| `DOTNET_CrashReportRootPath` | Activates file output. Absolute path to an existing directory under which the managed report directory is created. Must already be absolute — the runtime does **not** expand a leading `~` or environment variables. If it cannot be resolved or made writable, output is disabled and the failure is logged. | -| `DOTNET_CrashReportMaxFileCount` | Retention bound: the maximum number of completed reports to keep. Default **32**. Parsed with `strtol`; values that are non-numeric, have trailing junk, overflow, or fall outside `[1, INT32_MAX]` are rejected and fall back to the default (logged). | - -On mobile the configured root is expected to live under the app's private -storage (for example `context.getFilesDir()` on Android CoreCLR app hosting, or -the app sandbox home on iOS), which makes the report directory per-app without the -lifecycle needing to encode the app identity in the path. - -## On-disk layout and naming - -``` -/.dotnet/crash-reports/report--.crashreport.json -``` - -- Reports are written directly under `/.dotnet/crash-reports/`. There is no - per-app subdirectory: on mobile the root is already app-private, so adding an - app-name segment (and sanitizing it) bought nothing. -- `` (nanoseconds from `clock_gettime(CLOCK_REALTIME)`) and `` - order reports by intended crash time. Nanosecond resolution makes names unique - without a suffix-retry loop — even back-to-back crashes in the same process get - distinct names — so there is no `-` component. -- The `.crashreport.json` extension is the canonical one already emitted by - `createdump` and the existing reporter; external tooling recognizes it. The - `report-` prefix is this feature's marker/parse anchor. -- In-progress reports use a `…/report-….crashreport.json.tmp` temp file. - -## Two execution contexts - -The class deliberately spans two contexts; every member documents which it obeys. - -**Initialization path (`Initialize` and helpers).** Runs once at startup. May -allocate and call libc/filesystem APIs. Responsibilities: - -1. Resolve/validate the root (require an absolute path), create - `.dotnet/crash-reports` (each `mkdir` tolerates `EEXIST`; the final node is - verified to be a directory). -2. Verify the directory permits create/rename/delete with a hidden probe file - (`ProbeDirectoryWritable`), so the crash path never discovers an unusable - directory. The probe paths live in local stack buffers because this runs only - at init. -3. Scan the directory once (`PruneExistingReports`): - - delete any leftover `.tmp` files unconditionally — a stray temp is from a - previous, now-defunct run of this app (the directory is app-private and the - writer renames its temp to the final name before returning), so no - process-liveness probe is needed; - - retain only the newest `maxFileCount` completed reports, unlinking older ones - inline as it scans (a fixed `FileInfo[maxFileCount]` buffer allocated with - `new(std::nothrow)`; OOM disables output gracefully); - - if the directory is already at the bound, cache the single oldest retained - report into `m_cachedOldestReport` so the crash path can unlink it before - publishing a new one. - -**Crash/signal path (`PrepareReportFile`, `FinishReportFile`, and helpers).** -Invoked from the signal handler; must be async-signal-safe and allocation-free: - -- read a nanosecond timestamp via `clock_gettime(CLOCK_REALTIME)` (POSIX - async-signal-safe; a failed read degrades to a zero timestamp rather than - aborting the write) and build the temp + final paths into caller-provided fixed - buffers (signal-safe integer formatting, no `snprintf`/`std::string`); -- unlink the cached over-retention report, if one was selected at init - (`DeleteCachedReport`), so retention stays at the bound; -- `open` the temp with `O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC` (exclusive create - so a collision never truncates an existing report; `O_CLOEXEC` avoids fd leaks); -- on success `rename` the temp to the final name. `rename` is a same-directory - atomic commit and, unlike `link`, is permitted in the Android/iOS app-private - storage sandboxes (hard links there are rejected with `EPERM`, which would - silently lose every report). Deletion is *intentional only*: completed reports - are removed by retention, never by the write path overwriting. - -## Retention model - -There is a single bounded retention mode: keep at most `maxFileCount` completed -reports. The configuration layer guarantees `maxFileCount >= 1`, so the scan does -not test sentinels per entry. - -- At init, `PruneExistingReports` keeps the newest `min(total, maxFileCount)` - completed reports and unlinks the rest. If the kept set is full - (`keptCount == maxFileCount`), it caches the single oldest kept report. -- On a crash, the write path unlinks that cached oldest report (if any) before - publishing the new one, so the directory stays at the bound. When the directory - was below the bound, nothing is cached and the crash path performs no deletion — - the new report simply fills an open slot. - -This means the configured bound is enforced at startup regardless of whether a -crash follows: lowering `maxFileCount` between runs prunes the excess on the next -init, not only on the next crash. Directory enumeration and sorting happen at init -(not in the signal handler) because they are not signal-safe; only the single -precomputed unlink runs on the crash path. - -## Key decisions and rationale - -- **Lifecycle-managed root is the only file sink.** A managed directory gives a - clear ownership boundary for retention; dropping the `DbgMiniDumpName` template - path (which the in-proc reporter never relied on) removed template-expansion and - hostname-caching logic the lifecycle does not need. -- **Opt-in via explicit root.** A default directory + automatic file creation - would implicitly turn on persistent files; we avoid that. -- **Absolute root only, no `~`/`$HOME` expansion.** On mobile the integration - layer supplies a concrete app-private path; doing shell-style expansion inside - the runtime would add ambiguity for no real caller benefit. -- **Flattened layout (no per-app subdirectory).** The root is already app-private - on mobile, so an app-name segment — and the path-sanitization it required — - added complexity without isolating anything. -- **Nanosecond timestamp filenames over fixed slots + `mtime`, and no suffix - retry.** Robust to copy/touch/restore that would scramble `mtime`; ordering - comes from the name, and nanosecond resolution makes collisions vanishingly - unlikely, so no disambiguating suffix or open-retry loop is needed. -- **Exclusive create (no `O_TRUNC`).** A timestamp/pid collision, stale file, or - test mistake must never silently erase a completed report. -- **Init-time pruning + single cached oldest in fixed buffers.** Keeps the crash - path signal-safe and free of malloc/heap state that may be corrupted at crash - time, while still applying the bound eagerly at startup. -- **Same-directory `rename` commit (not `link` + `unlink`).** `rename` is POSIX - async-signal-safe (per `signal-safety(7)`) and is the portable atomic-publish - primitive. Hard links via `link` are rejected with `EPERM` in the Android and - Apple mobile app-private storage sandboxes, which would silently lose every - report — the precise failure this feature exists to prevent. -- **Unconditional stale-temp cleanup.** Because each app owns its report - directory under private storage, a leftover `.tmp` can only come from a defunct - run of the same app, so it is removed without a `kill(pid, 0)` liveness probe. - -## Cross-process considerations - -Retention is best-effort if several processes were ever to share one root: each -caches its own oldest-report deletion candidate at init, so the directory may -transiently hold more than `maxFileCount` reports. Strict global retention would -require cross-process locking, which is a non-goal here. In the intended mobile -deployment the root is app-private, so this is not a concern in practice. From 2685f79bdc994707a79bd82b9f653861b7ba3596 Mon Sep 17 00:00:00 2001 From: Mitchell Hwang Date: Wed, 3 Jun 2026 14:16:04 -0400 Subject: [PATCH 10/10] Address feedback --- src/coreclr/debug/crashreport/crashreportstringutils.h | 8 ++++---- src/coreclr/inc/clrconfigvalues.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/debug/crashreport/crashreportstringutils.h b/src/coreclr/debug/crashreport/crashreportstringutils.h index 79c916610da48d..a3499908dd6de7 100644 --- a/src/coreclr/debug/crashreport/crashreportstringutils.h +++ b/src/coreclr/debug/crashreport/crashreportstringutils.h @@ -20,10 +20,10 @@ namespace CrashReportStringUtils size_t bufferSize, const char* value); - // Appends value to buffer starting at *pos, advancing *pos past the appended - // characters, and always null-terminates. Returns false (without advancing - // past the buffer) if the value did not fit, the arguments are invalid, or - // bufferSize is 0. + // Appends value to buffer starting at *pos, copying as much as fits, advancing + // *pos past the characters actually written, and always null-terminating. + // Returns true if the entire value fit; returns false if the value was + // truncated, the arguments are invalid, or bufferSize is 0. bool AppendString( char* buffer, size_t bufferSize, diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 16ceb4aa173842..ccc0b586789e68 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -579,7 +579,7 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_CreateDumpDiagnostics, W("CreateDumpDiagnostic RETAIL_CONFIG_DWORD_INFO(INTERNAL_CrashReportBeforeSignalChaining, W("CrashReportBeforeSignalChaining"), 0, "Enable crash report generation before chaining to previous signal handler") RETAIL_CONFIG_DWORD_INFO_EX(INTERNAL_CrashReportFrameLimitPerThread, W("CrashReportFrameLimitPerThread"), 32, "Maximum number of managed stack frames per thread to emit in the in-proc crash report's compact log; 0 disables the limit; remaining frames are summarized as '... +N more frames'", CLRConfig::LookupOptions::ParseIntegerAsBase10) RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportRootPath, W("CrashReportRootPath"), "Root path for lifecycle-managed in-proc crash report JSON files") -RETAIL_CONFIG_STRING_INFO(INTERNAL_CrashReportMaxFileCount, W("CrashReportMaxFileCount"), "Maximum number of lifecycle-managed in-proc crash report JSON files to retain (positive integer); default 32") +RETAIL_CONFIG_DWORD_INFO_EX(INTERNAL_CrashReportMaxFileCount, W("CrashReportMaxFileCount"), 32, "Maximum number of lifecycle-managed in-proc crash report JSON files to retain (positive integer); default 32", CLRConfig::LookupOptions::ParseIntegerAsBase10) /// /// R2R