Skip to content

Commit c74e704

Browse files
Aayush MainiCopilot
andcommitted
fix: harden Docker scan timeout and cancellation in LinuxContainerDetector
- Fix OperationCanceledException swallowing in 5 DockerService methods (CanPingDockerAsync, CanRunLinuxContainersAsync, ImageExistsLocallyAsync, TryPullImageAsync, InspectImageAsync) by adding cancellationToken.ThrowIfCancellationRequested() in catch blocks - Move CanRunLinuxContainersAsync call inside try/catch in LinuxContainerDetector to prevent unhandled OCE crash - Add belt-and-suspenders Task.WhenAny with 30s race timer for RemoveContainerAsync in finally block - Make stream.Dispose() fire-and-forget via Task.Run to prevent blocking if underlying socket close() hangs - Add TelemetryRelay.FlushCurrentTelemetry() before ReadContainerOutputAsync to ensure telemetry is durable before risky long-running operation - Add 60s heartbeat timer in LinuxContainerDetector for process liveness monitoring - Add ILogger breadcrumbs for container start, removal, and cleanup Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 7ce401b commit c74e704

3 files changed

Lines changed: 100 additions & 16 deletions

File tree

src/Microsoft.ComponentDetection.Common/DockerService.cs

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ namespace Microsoft.ComponentDetection.Common;
1010
using System.Threading.Tasks;
1111
using Docker.DotNet;
1212
using Docker.DotNet.Models;
13+
using Microsoft.ComponentDetection.Common.Telemetry;
1314
using Microsoft.ComponentDetection.Common.Telemetry.Records;
1415
using Microsoft.ComponentDetection.Contracts;
1516
using Microsoft.ComponentDetection.Contracts.BcdeModels;
@@ -37,6 +38,7 @@ public async Task<bool> CanPingDockerAsync(CancellationToken cancellationToken =
3738
}
3839
catch (Exception e)
3940
{
41+
cancellationToken.ThrowIfCancellationRequested();
4042
this.logger.LogError(e, "Failed to ping docker");
4143
return false;
4244
}
@@ -58,6 +60,7 @@ public async Task<bool> CanRunLinuxContainersAsync(CancellationToken cancellatio
5860
}
5961
catch (Exception e)
6062
{
63+
cancellationToken.ThrowIfCancellationRequested();
6164
record.ExceptionMessage = e.Message;
6265
}
6366

@@ -78,6 +81,7 @@ public async Task<bool> ImageExistsLocallyAsync(string image, CancellationToken
7881
}
7982
catch (Exception e)
8083
{
84+
cancellationToken.ThrowIfCancellationRequested();
8185
record.ExceptionMessage = e.Message;
8286
return false;
8387
}
@@ -113,6 +117,7 @@ public async Task<bool> TryPullImageAsync(string image, CancellationToken cancel
113117
}
114118
catch (Exception e)
115119
{
120+
cancellationToken.ThrowIfCancellationRequested();
116121
record.ExceptionMessage = e.Message;
117122
return false;
118123
}
@@ -177,6 +182,7 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
177182
}
178183
catch (Exception e)
179184
{
185+
cancellationToken.ThrowIfCancellationRequested();
180186
record.ExceptionMessage = e.Message;
181187
return null;
182188
}
@@ -207,6 +213,12 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
207213
var stream = await AttachContainerAsync(container.ID, cancellationToken);
208214
await StartContainerAsync(container.ID, cancellationToken);
209215

216+
this.logger.LogInformation("Container {ContainerId} started for image {Image}, reading output...", container.ID, image);
217+
218+
// Flush telemetry before the long-running ReadOutput so we get mid-scan
219+
// data in App Insights even if the process hangs during the read.
220+
TelemetryRelay.Instance.FlushCurrentTelemetry();
221+
210222
var (stdout, stderr) = await ReadContainerOutputAsync(stream, container.ID, image, cancellationToken);
211223

212224
record.Stdout = stdout;
@@ -217,13 +229,33 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
217229
finally
218230
{
219231
// Best-effort container cleanup with a bounded timeout.
220-
// RemoveContainerAsync already handles not-found, but we must guard against
221-
// the Docker daemon hanging on container removal (e.g. when the container
222-
// process is stuck), which would block the detector indefinitely.
232+
// Use Task.WhenAny as belt-and-suspenders: even if Docker.DotNet's HTTP
233+
// pipeline doesn't honor the CTS (e.g. kernel-level socket blocking),
234+
// we abandon the removal rather than hanging indefinitely.
235+
this.logger.LogInformation("Removing container {ContainerId}...", container.ID);
223236
using var removeCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
224237
try
225238
{
226-
await RemoveContainerAsync(container.ID, removeCts.Token);
239+
var removeTask = RemoveContainerAsync(container.ID, removeCts.Token);
240+
var removeTimeout = Task.Delay(TimeSpan.FromSeconds(30), CancellationToken.None);
241+
242+
if (await Task.WhenAny(removeTask, removeTimeout) == removeTimeout)
243+
{
244+
this.logger.LogWarning(
245+
"RemoveContainerAsync timed out for container {ContainerId}; abandoning cleanup",
246+
container.ID);
247+
248+
// Observe the abandoned task to prevent unobserved task exceptions
249+
_ = removeTask.ContinueWith(
250+
static _ => { },
251+
CancellationToken.None,
252+
TaskContinuationOptions.OnlyOnFaulted,
253+
TaskScheduler.Default);
254+
}
255+
else
256+
{
257+
await removeTask; // Observe any exception from completed task
258+
}
227259
}
228260
catch (Exception ex)
229261
{
@@ -232,6 +264,8 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
232264
"Failed to remove container {ContainerId}; abandoning cleanup",
233265
container.ID);
234266
}
267+
268+
this.logger.LogInformation("Container {ContainerId} cleanup complete", container.ID);
235269
}
236270
}
237271

@@ -264,8 +298,22 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
264298
{
265299
record.WasCancelled = true;
266300

267-
// Dispose the stream to unblock any pending read operation
268-
stream.Dispose();
301+
// Dispose the stream to unblock any pending read operation.
302+
// Run in fire-and-forget: if the underlying socket close() blocks
303+
// (e.g. Docker daemon in kernel D-state), we don't want to hang here.
304+
_ = Task.Run(
305+
() =>
306+
{
307+
try
308+
{
309+
stream.Dispose();
310+
}
311+
catch
312+
{
313+
// best effort
314+
}
315+
},
316+
CancellationToken.None);
269317

270318
// Observe the readTask to prevent unobserved task exceptions.
271319
// Running any continuation automatically marks the exception as observed.

src/Microsoft.ComponentDetection.Common/Telemetry/TelemetryRelay.cs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,25 @@ public void PostTelemetryRecord(IDetectionTelemetryRecord record)
4848
}
4949
}
5050

51+
/// <summary>
52+
/// Flushes all buffered telemetry records to their services without shutting down.
53+
/// Use this at critical checkpoints to ensure telemetry is delivered even if the process later hangs.
54+
/// </summary>
55+
public void FlushCurrentTelemetry()
56+
{
57+
foreach (var service in this.telemetryServices)
58+
{
59+
try
60+
{
61+
service.Flush();
62+
}
63+
catch
64+
{
65+
// Telemetry should never crash the application
66+
}
67+
}
68+
}
69+
5170
/// <summary>
5271
/// Disables the sending of telemetry and flushes any messages out of the queue for each service.
5372
/// </summary>

src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,19 +99,32 @@ public async Task<IndividualDetectorScanResult> ExecuteDetectorAsync(
9999
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
100100
timeoutCts.CancelAfter(GetTimeout(request.DetectorArgs));
101101

102-
if (!await this.dockerService.CanRunLinuxContainersAsync(timeoutCts.Token))
103-
{
104-
using var record = new LinuxContainerDetectorUnsupportedOs
105-
{
106-
Os = RuntimeInformation.OSDescription,
107-
};
108-
this.logger.LogInformation("Linux containers are not available on this host.");
109-
return EmptySuccessfulScan();
110-
}
111-
112102
var results = Enumerable.Empty<ImageScanningResult>();
103+
104+
// Heartbeat timer: logs every 60s while the detector is running.
105+
// If the process dies silently (OOM, SIGKILL), the heartbeat stops
106+
// and we know approximately when it happened from the last log entry.
107+
var scanStart = DateTime.UtcNow;
108+
using var heartbeat = new Timer(
109+
_ => this.logger.LogInformation(
110+
"LinuxContainerDetector heartbeat — still scanning ({ElapsedSeconds}s elapsed)",
111+
(DateTime.UtcNow - scanStart).TotalSeconds),
112+
state: null,
113+
dueTime: TimeSpan.FromSeconds(60),
114+
period: TimeSpan.FromSeconds(60));
115+
113116
try
114117
{
118+
if (!await this.dockerService.CanRunLinuxContainersAsync(timeoutCts.Token))
119+
{
120+
using var record = new LinuxContainerDetectorUnsupportedOs
121+
{
122+
Os = RuntimeInformation.OSDescription,
123+
};
124+
this.logger.LogInformation("Linux containers are not available on this host.");
125+
return EmptySuccessfulScan();
126+
}
127+
115128
results = await this.ProcessImagesAsync(
116129
allImages,
117130
request.ComponentRecorder,
@@ -132,6 +145,10 @@ public async Task<IndividualDetectorScanResult> ExecuteDetectorAsync(
132145
this.logger.LogError(e, "Unexpected error during Linux container image scanning");
133146
}
134147

148+
this.logger.LogInformation(
149+
"LinuxContainerDetector completed after {ElapsedSeconds}s",
150+
(DateTime.UtcNow - scanStart).TotalSeconds);
151+
135152
return new IndividualDetectorScanResult
136153
{
137154
ContainerDetails = results

0 commit comments

Comments
 (0)