Skip to content

Commit d8663f2

Browse files
AMaini503Aayush MainiCopilot
authored
Fix Docker scan hang: bound container removal, surface timeout telemetry (#1778)
* Fix Docker scan hang: bound container removal, surface timeout telemetry Fix 1: Add 30s timeout to RemoveContainerAsync in the finally block of CreateAndRunContainerAsync. Previously used CancellationToken.None which could hang indefinitely if the Docker daemon is slow removing a stuck container, blocking the entire detector and pipeline. Fix 2: Re-throw OperationCanceledException (when the token is cancelled) before the generic catch(Exception) in ResolveImageAsync, ScanDockerImageAsync, and ScanLocalImageAsync. This lets the OCE propagate to ExecuteDetectorAsync where LinuxContainerDetectorTimeout telemetry is emitted, enabling root-cause analysis of stuck scans. Fix 3: Replace JsonSerializer.Serialize(e) with e.ToString() in LinuxScanner.RunSyftAsync. Serializing exceptions with System.Text.Json throws NotSupportedException on MethodBase properties. Fix 4: Add safety-net catch(Exception) in ExecuteDetectorAsync after the OCE catch. Ensures unexpected errors never escape the detector and crash the harness or lose other detectors' results. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Address review: fix test token cancellation and assertions - Use pre-cancelled CancellationToken in OCE tests so the 'when (cancellationToken.IsCancellationRequested)' guard is exercised - Add Verify/assertion in multi-image test to confirm first image was scanned - Use CancelAsync() instead of Cancel() per analyzer rules Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Include exception in container cleanup timeout log Pass the caught exception to LogWarning so Docker daemon cleanup hangs are diagnosable from logs and telemetry. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Catch all exceptions in container cleanup finally block A Docker API or transport error from RemoveContainerAsync would escape the finally and mask the original scan result. Catch all exceptions so cleanup failures never override the primary outcome. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Aayush Maini <aamaini@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent e0ec95f commit d8663f2

4 files changed

Lines changed: 238 additions & 3 deletions

File tree

src/Microsoft.ComponentDetection.Common/DockerService.cs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,22 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
216216
}
217217
finally
218218
{
219-
// Best-effort container cleanup; RemoveContainerAsync already handles not-found.
220-
await RemoveContainerAsync(container.ID, CancellationToken.None);
219+
// Best-effort container cleanup with a bounded timeout.
220+
// RemoveContainerAsync already handles not-found, but we must guard against
221+
// the Docker daemon hanging on container removal (e.g. when the container
222+
// process is stuck), which would block the detector indefinitely.
223+
using var removeCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
224+
try
225+
{
226+
await RemoveContainerAsync(container.ID, removeCts.Token);
227+
}
228+
catch (Exception ex)
229+
{
230+
this.logger.LogWarning(
231+
ex,
232+
"Failed to remove container {ContainerId}; abandoning cleanup",
233+
container.ID);
234+
}
221235
}
222236
}
223237

src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ public async Task<IndividualDetectorScanResult> ExecuteDetectorAsync(
127127
GetTimeout(request.DetectorArgs)
128128
);
129129
}
130+
catch (Exception e)
131+
{
132+
this.logger.LogError(e, "Unexpected error during Linux container image scanning");
133+
}
130134

131135
return new IndividualDetectorScanResult
132136
{
@@ -285,6 +289,10 @@ private async Task ResolveImageAsync(
285289
);
286290
}
287291
}
292+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
293+
{
294+
throw;
295+
}
288296
catch (Exception e)
289297
{
290298
this.logger.LogWarning(e, "Processing of image {ContainerImage} (kind {ImageType}) failed", imageRef.OriginalInput, imageRef.Kind);
@@ -388,6 +396,10 @@ private async Task<ImageScanningResult> ScanDockerImageAsync(
388396

389397
return this.RecordComponents(containerDetails, layers, componentRecorder);
390398
}
399+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
400+
{
401+
throw;
402+
}
391403
catch (Exception e)
392404
{
393405
this.logger.LogWarning(e, "Scanning of image {ImageId} failed", containerDetails.ImageId);
@@ -525,6 +537,10 @@ private async Task<ImageScanningResult> ScanLocalImageAsync(
525537

526538
return this.RecordComponents(containerDetails, layers, componentRecorder);
527539
}
540+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
541+
{
542+
throw;
543+
}
528544
catch (Exception e)
529545
{
530546
this.logger.LogWarning(

src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ private async Task<string> RunSyftAsync(
296296
}
297297
catch (Exception e)
298298
{
299-
syftTelemetryRecord.Exception = JsonSerializer.Serialize(e);
299+
syftTelemetryRecord.Exception = e.ToString();
300300
this.logger.LogError(e, "Failed to run syft");
301301
throw;
302302
}

test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,4 +1290,209 @@ public async Task TestLinuxContainerDetector_ImageParseFailure_ContinuesScanning
12901290
Times.Once
12911291
);
12921292
}
1293+
1294+
[TestMethod]
1295+
public async Task ExecuteDetectorAsync_ScannerThrowsOce_ReturnsSuccessWithEmptyResults()
1296+
{
1297+
// Verifies Fix 2 + OCE catch: when the scanning timeout fires,
1298+
// the OCE propagates out of the scan methods and is caught by
1299+
// ExecuteDetectorAsync, which returns Success with no components.
1300+
// We pass a pre-cancelled token so the linked timeoutCts is also
1301+
// cancelled, making the `when (cancellationToken.IsCancellationRequested)`
1302+
// guard match and re-throw the OCE to ExecuteDetectorAsync.
1303+
var componentRecorder = new ComponentRecorder();
1304+
var detectorArgs = new Dictionary<string, string> { { "Linux.ScanningTimeoutSec", "600" } };
1305+
1306+
var scanRequest = new ScanRequest(
1307+
new DirectoryInfo(Path.GetTempPath()),
1308+
(_, __) => false,
1309+
this.mockLogger.Object,
1310+
detectorArgs,
1311+
[NodeLatestImage],
1312+
componentRecorder
1313+
);
1314+
1315+
using var cts = new CancellationTokenSource();
1316+
await cts.CancelAsync();
1317+
1318+
this.mockSyftLinuxScanner.Setup(scanner =>
1319+
scanner.ScanLinuxAsync(
1320+
It.IsAny<string>(),
1321+
It.IsAny<IEnumerable<DockerLayer>>(),
1322+
It.IsAny<int>(),
1323+
It.IsAny<ISet<ComponentType>>(),
1324+
It.IsAny<LinuxScannerScope>(),
1325+
It.IsAny<CancellationToken>()
1326+
)
1327+
)
1328+
.ThrowsAsync(new OperationCanceledException());
1329+
1330+
var detector = new LinuxContainerDetector(
1331+
this.mockSyftLinuxScanner.Object,
1332+
this.mockDockerService.Object,
1333+
this.mockLinuxContainerDetectorLogger.Object
1334+
);
1335+
1336+
var result = await detector.ExecuteDetectorAsync(scanRequest, cts.Token);
1337+
1338+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1339+
result.ContainerDetails.Should().BeEmpty();
1340+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1341+
}
1342+
1343+
[TestMethod]
1344+
public async Task ExecuteDetectorAsync_ScannerThrowsUnexpectedException_ReturnsSuccessDoesNotCrash()
1345+
{
1346+
// Verifies Fix 4 safety net: an unexpected exception from the
1347+
// scanner must never escape the detector. It should be caught,
1348+
// and the detector should return Success with empty results.
1349+
var componentRecorder = new ComponentRecorder();
1350+
1351+
var scanRequest = new ScanRequest(
1352+
new DirectoryInfo(Path.GetTempPath()),
1353+
(_, __) => false,
1354+
this.mockLogger.Object,
1355+
null,
1356+
[NodeLatestImage],
1357+
componentRecorder
1358+
);
1359+
1360+
this.mockSyftLinuxScanner.Setup(scanner =>
1361+
scanner.ScanLinuxAsync(
1362+
It.IsAny<string>(),
1363+
It.IsAny<IEnumerable<DockerLayer>>(),
1364+
It.IsAny<int>(),
1365+
It.IsAny<ISet<ComponentType>>(),
1366+
It.IsAny<LinuxScannerScope>(),
1367+
It.IsAny<CancellationToken>()
1368+
)
1369+
)
1370+
.ThrowsAsync(new InvalidOperationException("Unexpected Docker failure"));
1371+
1372+
var detector = new LinuxContainerDetector(
1373+
this.mockSyftLinuxScanner.Object,
1374+
this.mockDockerService.Object,
1375+
this.mockLinuxContainerDetectorLogger.Object
1376+
);
1377+
1378+
// The critical assertion: the detector must NOT throw.
1379+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1380+
1381+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1382+
result.ContainerDetails.Should().BeEmpty();
1383+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1384+
}
1385+
1386+
[TestMethod]
1387+
public async Task ExecuteDetectorAsync_ImageResolveThrowsOce_ReturnsSuccessWithEmptyResults()
1388+
{
1389+
// Verifies Fix 2 in the resolve phase: if the timeout fires
1390+
// during image pull/inspect, the OCE is re-thrown from
1391+
// ResolveImageAsync and caught by ExecuteDetectorAsync.
1392+
// We pass a pre-cancelled token so the linked timeoutCts is also
1393+
// cancelled, making the `when` guard match.
1394+
var componentRecorder = new ComponentRecorder();
1395+
var detectorArgs = new Dictionary<string, string> { { "Linux.ScanningTimeoutSec", "600" } };
1396+
1397+
var scanRequest = new ScanRequest(
1398+
new DirectoryInfo(Path.GetTempPath()),
1399+
(_, __) => false,
1400+
this.mockLogger.Object,
1401+
detectorArgs,
1402+
[NodeLatestImage],
1403+
componentRecorder
1404+
);
1405+
1406+
using var cts = new CancellationTokenSource();
1407+
await cts.CancelAsync();
1408+
1409+
this.mockDockerService.Setup(service =>
1410+
service.ImageExistsLocallyAsync(It.IsAny<string>(), It.IsAny<CancellationToken>())
1411+
)
1412+
.ThrowsAsync(new OperationCanceledException());
1413+
1414+
var detector = new LinuxContainerDetector(
1415+
this.mockSyftLinuxScanner.Object,
1416+
this.mockDockerService.Object,
1417+
this.mockLinuxContainerDetectorLogger.Object
1418+
);
1419+
1420+
var result = await detector.ExecuteDetectorAsync(scanRequest, cts.Token);
1421+
1422+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1423+
result.ContainerDetails.Should().BeEmpty();
1424+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1425+
}
1426+
1427+
[TestMethod]
1428+
public async Task ExecuteDetectorAsync_ScanThrowsOce_OtherImageStillScanned()
1429+
{
1430+
// When scanning multiple images and one fails with an uncancelled OCE,
1431+
// the generic catch swallows it, so the other image still scans.
1432+
// Token is NOT cancelled here — this tests multi-image resilience
1433+
// via the generic catch path, not the timeout re-throw path.
1434+
var componentRecorder = new ComponentRecorder();
1435+
const string secondImage = "alpine:latest";
1436+
const string secondImageId = "alpine123";
1437+
1438+
this.mockDockerService.Setup(service =>
1439+
service.InspectImageAsync(secondImage, It.IsAny<CancellationToken>())
1440+
)
1441+
.ReturnsAsync(
1442+
new ContainerDetails
1443+
{
1444+
Id = 2,
1445+
ImageId = secondImageId,
1446+
Layers = [],
1447+
}
1448+
);
1449+
1450+
// First image (NodeLatestImage) scans fine via default mock.
1451+
// Second image throws OCE (not tied to cancellation, so generic catch handles it).
1452+
this.mockSyftLinuxScanner.Setup(scanner =>
1453+
scanner.ScanLinuxAsync(
1454+
secondImageId,
1455+
It.IsAny<IEnumerable<DockerLayer>>(),
1456+
It.IsAny<int>(),
1457+
It.IsAny<ISet<ComponentType>>(),
1458+
It.IsAny<LinuxScannerScope>(),
1459+
It.IsAny<CancellationToken>()
1460+
)
1461+
)
1462+
.ThrowsAsync(new OperationCanceledException());
1463+
1464+
var scanRequest = new ScanRequest(
1465+
new DirectoryInfo(Path.GetTempPath()),
1466+
(_, __) => false,
1467+
this.mockLogger.Object,
1468+
null,
1469+
[NodeLatestImage, secondImage],
1470+
componentRecorder
1471+
);
1472+
1473+
var detector = new LinuxContainerDetector(
1474+
this.mockSyftLinuxScanner.Object,
1475+
this.mockDockerService.Object,
1476+
this.mockLinuxContainerDetectorLogger.Object
1477+
);
1478+
1479+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1480+
1481+
// Must not crash, must return Success
1482+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1483+
1484+
// The first image should have been scanned successfully
1485+
this.mockSyftLinuxScanner.Verify(
1486+
scanner => scanner.ScanLinuxAsync(
1487+
NodeLatestDigest,
1488+
It.IsAny<IEnumerable<DockerLayer>>(),
1489+
It.IsAny<int>(),
1490+
It.IsAny<ISet<ComponentType>>(),
1491+
It.IsAny<LinuxScannerScope>(),
1492+
It.IsAny<CancellationToken>()),
1493+
Times.Once);
1494+
1495+
// The first image should have produced components
1496+
componentRecorder.GetDetectedComponents().Should().NotBeEmpty();
1497+
}
12931498
}

0 commit comments

Comments
 (0)