Skip to content

Commit ea5ce24

Browse files
Aayush MainiCopilot
andcommitted
Fix Docker scan hang: bound container removal, surface timeout telemetry
Fix 1: Add 30s timeout to RemoveContainerAsync in the finally block of CreateAndRunContainerAsync. Previously used CancellationToken.None which could hang indefinitely if the Docker daemon is slow removing a stuck container, blocking the entire detector and pipeline. Fix 2: Re-throw OperationCanceledException (when the token is cancelled) before the generic catch(Exception) in ResolveImageAsync, ScanDockerImageAsync, and ScanLocalImageAsync. This lets the OCE propagate to ExecuteDetectorAsync where LinuxContainerDetectorTimeout telemetry is emitted, enabling root-cause analysis of stuck scans. Fix 3: Replace JsonSerializer.Serialize(e) with e.ToString() in LinuxScanner.RunSyftAsync. Serializing exceptions with System.Text.Json throws NotSupportedException on MethodBase properties. Fix 4: Add safety-net catch(Exception) in ExecuteDetectorAsync after the OCE catch. Ensures unexpected errors never escape the detector and crash the harness or lose other detectors' results. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent c3ef82c commit ea5ce24

4 files changed

Lines changed: 209 additions & 3 deletions

File tree

src/Microsoft.ComponentDetection.Common/DockerService.cs

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,21 @@ public async Task<ContainerDetails> InspectImageAsync(string image, Cancellation
216216
}
217217
finally
218218
{
219-
// Best-effort container cleanup; RemoveContainerAsync already handles not-found.
220-
await RemoveContainerAsync(container.ID, CancellationToken.None);
219+
// Best-effort container cleanup with a bounded timeout.
220+
// RemoveContainerAsync already handles not-found, but we must guard against
221+
// the Docker daemon hanging on container removal (e.g. when the container
222+
// process is stuck), which would block the detector indefinitely.
223+
using var removeCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
224+
try
225+
{
226+
await RemoveContainerAsync(container.ID, removeCts.Token);
227+
}
228+
catch (Exception ex) when (ex is OperationCanceledException or TimeoutException)
229+
{
230+
this.logger.LogWarning(
231+
"Timed out removing container {ContainerId} after 30s; abandoning cleanup",
232+
container.ID);
233+
}
221234
}
222235
}
223236

src/Microsoft.ComponentDetection.Detectors/linux/LinuxContainerDetector.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ public async Task<IndividualDetectorScanResult> ExecuteDetectorAsync(
127127
GetTimeout(request.DetectorArgs)
128128
);
129129
}
130+
catch (Exception e)
131+
{
132+
this.logger.LogError(e, "Unexpected error during Linux container image scanning");
133+
}
130134

131135
return new IndividualDetectorScanResult
132136
{
@@ -285,6 +289,10 @@ private async Task ResolveImageAsync(
285289
);
286290
}
287291
}
292+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
293+
{
294+
throw;
295+
}
288296
catch (Exception e)
289297
{
290298
this.logger.LogWarning(e, "Processing of image {ContainerImage} (kind {ImageType}) failed", imageRef.OriginalInput, imageRef.Kind);
@@ -388,6 +396,10 @@ private async Task<ImageScanningResult> ScanDockerImageAsync(
388396

389397
return this.RecordComponents(containerDetails, layers, componentRecorder);
390398
}
399+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
400+
{
401+
throw;
402+
}
391403
catch (Exception e)
392404
{
393405
this.logger.LogWarning(e, "Scanning of image {ImageId} failed", containerDetails.ImageId);
@@ -525,6 +537,10 @@ private async Task<ImageScanningResult> ScanLocalImageAsync(
525537

526538
return this.RecordComponents(containerDetails, layers, componentRecorder);
527539
}
540+
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
541+
{
542+
throw;
543+
}
528544
catch (Exception e)
529545
{
530546
this.logger.LogWarning(

src/Microsoft.ComponentDetection.Detectors/linux/LinuxScanner.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ private async Task<string> RunSyftAsync(
296296
}
297297
catch (Exception e)
298298
{
299-
syftTelemetryRecord.Exception = JsonSerializer.Serialize(e);
299+
syftTelemetryRecord.Exception = e.ToString();
300300
this.logger.LogError(e, "Failed to run syft");
301301
throw;
302302
}

test/Microsoft.ComponentDetection.Detectors.Tests/LinuxContainerDetectorTests.cs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,4 +1290,181 @@ public async Task TestLinuxContainerDetector_ImageParseFailure_ContinuesScanning
12901290
Times.Once
12911291
);
12921292
}
1293+
1294+
[TestMethod]
1295+
public async Task ExecuteDetectorAsync_ScannerThrowsOce_ReturnsSuccessWithEmptyResults()
1296+
{
1297+
// Verifies Fix 2 + OCE catch: when the scanning timeout fires,
1298+
// the OCE propagates out of the scan methods and is caught by
1299+
// ExecuteDetectorAsync, which returns Success with no components.
1300+
var componentRecorder = new ComponentRecorder();
1301+
var detectorArgs = new Dictionary<string, string> { { "Linux.ScanningTimeoutSec", "600" } };
1302+
1303+
var scanRequest = new ScanRequest(
1304+
new DirectoryInfo(Path.GetTempPath()),
1305+
(_, __) => false,
1306+
this.mockLogger.Object,
1307+
detectorArgs,
1308+
[NodeLatestImage],
1309+
componentRecorder
1310+
);
1311+
1312+
this.mockSyftLinuxScanner.Setup(scanner =>
1313+
scanner.ScanLinuxAsync(
1314+
It.IsAny<string>(),
1315+
It.IsAny<IEnumerable<DockerLayer>>(),
1316+
It.IsAny<int>(),
1317+
It.IsAny<ISet<ComponentType>>(),
1318+
It.IsAny<LinuxScannerScope>(),
1319+
It.IsAny<CancellationToken>()
1320+
)
1321+
)
1322+
.ThrowsAsync(new OperationCanceledException());
1323+
1324+
var detector = new LinuxContainerDetector(
1325+
this.mockSyftLinuxScanner.Object,
1326+
this.mockDockerService.Object,
1327+
this.mockLinuxContainerDetectorLogger.Object
1328+
);
1329+
1330+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1331+
1332+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1333+
result.ContainerDetails.Should().BeEmpty();
1334+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1335+
}
1336+
1337+
[TestMethod]
1338+
public async Task ExecuteDetectorAsync_ScannerThrowsUnexpectedException_ReturnsSuccessDoesNotCrash()
1339+
{
1340+
// Verifies Fix 4 safety net: an unexpected exception from the
1341+
// scanner must never escape the detector. It should be caught,
1342+
// and the detector should return Success with empty results.
1343+
var componentRecorder = new ComponentRecorder();
1344+
1345+
var scanRequest = new ScanRequest(
1346+
new DirectoryInfo(Path.GetTempPath()),
1347+
(_, __) => false,
1348+
this.mockLogger.Object,
1349+
null,
1350+
[NodeLatestImage],
1351+
componentRecorder
1352+
);
1353+
1354+
this.mockSyftLinuxScanner.Setup(scanner =>
1355+
scanner.ScanLinuxAsync(
1356+
It.IsAny<string>(),
1357+
It.IsAny<IEnumerable<DockerLayer>>(),
1358+
It.IsAny<int>(),
1359+
It.IsAny<ISet<ComponentType>>(),
1360+
It.IsAny<LinuxScannerScope>(),
1361+
It.IsAny<CancellationToken>()
1362+
)
1363+
)
1364+
.ThrowsAsync(new InvalidOperationException("Unexpected Docker failure"));
1365+
1366+
var detector = new LinuxContainerDetector(
1367+
this.mockSyftLinuxScanner.Object,
1368+
this.mockDockerService.Object,
1369+
this.mockLinuxContainerDetectorLogger.Object
1370+
);
1371+
1372+
// The critical assertion: the detector must NOT throw.
1373+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1374+
1375+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1376+
result.ContainerDetails.Should().BeEmpty();
1377+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1378+
}
1379+
1380+
[TestMethod]
1381+
public async Task ExecuteDetectorAsync_ImageResolveThrowsOce_ReturnsSuccessWithEmptyResults()
1382+
{
1383+
// Verifies Fix 2 in the resolve phase: if the timeout fires
1384+
// during image pull/inspect, the OCE is re-thrown from
1385+
// ResolveImageAsync and caught by ExecuteDetectorAsync.
1386+
var componentRecorder = new ComponentRecorder();
1387+
var detectorArgs = new Dictionary<string, string> { { "Linux.ScanningTimeoutSec", "600" } };
1388+
1389+
var scanRequest = new ScanRequest(
1390+
new DirectoryInfo(Path.GetTempPath()),
1391+
(_, __) => false,
1392+
this.mockLogger.Object,
1393+
detectorArgs,
1394+
[NodeLatestImage],
1395+
componentRecorder
1396+
);
1397+
1398+
this.mockDockerService.Setup(service =>
1399+
service.ImageExistsLocallyAsync(It.IsAny<string>(), It.IsAny<CancellationToken>())
1400+
)
1401+
.ThrowsAsync(new OperationCanceledException());
1402+
1403+
var detector = new LinuxContainerDetector(
1404+
this.mockSyftLinuxScanner.Object,
1405+
this.mockDockerService.Object,
1406+
this.mockLinuxContainerDetectorLogger.Object
1407+
);
1408+
1409+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1410+
1411+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1412+
result.ContainerDetails.Should().BeEmpty();
1413+
componentRecorder.GetDetectedComponents().Should().BeEmpty();
1414+
}
1415+
1416+
[TestMethod]
1417+
public async Task ExecuteDetectorAsync_ScanThrowsOce_OtherImageStillScanned()
1418+
{
1419+
// When scanning multiple images and one times out, the entire
1420+
// detector should still return Success (not crash).
1421+
var componentRecorder = new ComponentRecorder();
1422+
const string secondImage = "alpine:latest";
1423+
const string secondImageId = "alpine123";
1424+
1425+
this.mockDockerService.Setup(service =>
1426+
service.InspectImageAsync(secondImage, It.IsAny<CancellationToken>())
1427+
)
1428+
.ReturnsAsync(
1429+
new ContainerDetails
1430+
{
1431+
Id = 2,
1432+
ImageId = secondImageId,
1433+
Layers = [],
1434+
}
1435+
);
1436+
1437+
// First image scans fine, second throws OCE
1438+
this.mockSyftLinuxScanner.Setup(scanner =>
1439+
scanner.ScanLinuxAsync(
1440+
secondImageId,
1441+
It.IsAny<IEnumerable<DockerLayer>>(),
1442+
It.IsAny<int>(),
1443+
It.IsAny<ISet<ComponentType>>(),
1444+
It.IsAny<LinuxScannerScope>(),
1445+
It.IsAny<CancellationToken>()
1446+
)
1447+
)
1448+
.ThrowsAsync(new OperationCanceledException());
1449+
1450+
var scanRequest = new ScanRequest(
1451+
new DirectoryInfo(Path.GetTempPath()),
1452+
(_, __) => false,
1453+
this.mockLogger.Object,
1454+
null,
1455+
[NodeLatestImage, secondImage],
1456+
componentRecorder
1457+
);
1458+
1459+
var detector = new LinuxContainerDetector(
1460+
this.mockSyftLinuxScanner.Object,
1461+
this.mockDockerService.Object,
1462+
this.mockLinuxContainerDetectorLogger.Object
1463+
);
1464+
1465+
var result = await detector.ExecuteDetectorAsync(scanRequest);
1466+
1467+
// Must not crash, must return Success
1468+
result.ResultCode.Should().Be(ProcessingResultCode.Success);
1469+
}
12931470
}

0 commit comments

Comments
 (0)