From 89ea7c6f2ed6e22dd3f2b22c0bef6c7bec27c9c1 Mon Sep 17 00:00:00 2001 From: Julian Pinzer Date: Thu, 4 Jun 2026 16:17:50 -0400 Subject: [PATCH 1/2] Enhance Docker Compose and Helm detectors for improved parallel processing and file size handling --- .../DockerComposeComponentDetector.cs | 24 +++++++--- .../helm/HelmComponentDetector.cs | 46 ++++++++++++++++--- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.ComponentDetection.Detectors/dockercompose/DockerComposeComponentDetector.cs b/src/Microsoft.ComponentDetection.Detectors/dockercompose/DockerComposeComponentDetector.cs index 8f2c857d1..ce2669795 100644 --- a/src/Microsoft.ComponentDetection.Detectors/dockercompose/DockerComposeComponentDetector.cs +++ b/src/Microsoft.ComponentDetection.Detectors/dockercompose/DockerComposeComponentDetector.cs @@ -40,7 +40,15 @@ public DockerComposeComponentDetector( public override IEnumerable Categories => [nameof(DetectorClass.DockerCompose)]; - protected override async Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary detectorArgs, CancellationToken cancellationToken = default) + /// + /// Gets or sets a value indicating whether compose files are processed concurrently. + /// Each file is parsed independently into its own + /// and is stateless, so parsing is thread-safe and + /// scales across cores for repositories containing many compose files. + /// + protected override bool EnableParallelism { get; set; } = true; + + protected override Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary detectorArgs, CancellationToken cancellationToken = default) { var singleFileComponentRecorder = processRequest.SingleFileComponentRecorder; var file = processRequest.ComponentStream; @@ -49,18 +57,18 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID { this.Logger.LogInformation("Discovered Docker Compose file: {Location}", file.Location); - string contents; + // Parse directly from the stream; the content is already buffered in memory by + // LazyComponentStream, so reading it into an intermediate string only adds an + // extra full-file allocation and GC pressure under parallel processing. + var yaml = new YamlStream(); using (var reader = new StreamReader(file.Stream)) { - contents = await reader.ReadToEndAsync(cancellationToken); + yaml.Load(reader); } - var yaml = new YamlStream(); - yaml.Load(new StringReader(contents)); - if (yaml.Documents.Count == 0) { - return; + return Task.CompletedTask; } foreach (var document in yaml.Documents) @@ -75,6 +83,8 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID { this.Logger.LogError(e, "Failed to parse Docker Compose file: {Location}", file.Location); } + + return Task.CompletedTask; } private static YamlMappingNode? GetMappingChild(YamlMappingNode parent, string key) diff --git a/src/Microsoft.ComponentDetection.Detectors/helm/HelmComponentDetector.cs b/src/Microsoft.ComponentDetection.Detectors/helm/HelmComponentDetector.cs index 9da078303..030f30069 100644 --- a/src/Microsoft.ComponentDetection.Detectors/helm/HelmComponentDetector.cs +++ b/src/Microsoft.ComponentDetection.Detectors/helm/HelmComponentDetector.cs @@ -17,6 +17,14 @@ namespace Microsoft.ComponentDetection.Detectors.Helm; public class HelmComponentDetector : FileComponentDetector, IExperimentalDetector { + /// + /// Maximum size (in bytes) of a values file the detector will parse. The "*values*" globs + /// can match large, non-Helm YAML files whose full-DOM parse dominates worst-case runtime; + /// files above this limit are skipped so a single pathological file cannot exhaust the + /// detector's time budget. + /// + private const long MaxValuesFileSizeBytes = 20 * 1024 * 1024; // 20 MB + public HelmComponentDetector( IComponentStreamEnumerableFactory componentStreamEnumerableFactory, IObservableDirectoryWalkerFactory walkerFactory, @@ -41,6 +49,14 @@ public HelmComponentDetector( public override IEnumerable Categories => [nameof(DetectorClass.Helm)]; + /// + /// Gets or sets a value indicating whether values files are processed concurrently. + /// Each file is parsed independently into its own + /// and is stateless, so parsing is thread-safe and + /// scales across cores for repositories containing many charts. + /// + protected override bool EnableParallelism { get; set; } = true; + /// /// Pre-filters scan work to only values files co-located with a Chart.yaml/Chart.yml. /// Materializes all matched files, identifies Helm chart directories, then filters. @@ -65,7 +81,7 @@ protected override async Task> OnPrepareDetectionAsy .ToObservable(); } - protected override async Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary detectorArgs, CancellationToken cancellationToken = default) + protected override Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary detectorArgs, CancellationToken cancellationToken = default) { var file = processRequest.ComponentStream; @@ -74,20 +90,34 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID // filename/directory checks are needed. try { + // Check the size before touching ComponentStream so an oversized file is never + // buffered into memory. The "*values*" globs can match large, non-Helm YAML files + // whose full-DOM parse is the main driver of worst-case (timeout) runtime. + var fileInfo = new FileInfo(file.Location); + if (fileInfo.Exists && fileInfo.Length > MaxValuesFileSizeBytes) + { + this.Logger.LogWarning( + "Skipping Helm values file exceeding size limit ({Length} bytes > {Limit} bytes): {Location}", + fileInfo.Length, + MaxValuesFileSizeBytes, + file.Location); + return Task.CompletedTask; + } + this.Logger.LogInformation("Discovered Helm values file: {Location}", file.Location); - string contents; + // Parse directly from the stream; the content is already buffered in memory by + // LazyComponentStream, so reading it into an intermediate string only adds an + // extra full-file allocation and GC pressure under parallel processing. + var yaml = new YamlStream(); using (var reader = new StreamReader(file.Stream)) { - contents = await reader.ReadToEndAsync(cancellationToken); + yaml.Load(reader); } - var yaml = new YamlStream(); - yaml.Load(new StringReader(contents)); - if (yaml.Documents.Count == 0) { - return; + return Task.CompletedTask; } this.ExtractImageReferencesFromValues(yaml, processRequest.SingleFileComponentRecorder); @@ -96,6 +126,8 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID { this.Logger.LogError(e, "Failed to parse Helm file: {Location}", file.Location); } + + return Task.CompletedTask; } /// From e82564ae36fd7f26222f6de400630f2ad03a88ef Mon Sep 17 00:00:00 2001 From: Julian Pinzer Date: Thu, 4 Jun 2026 16:35:48 -0400 Subject: [PATCH 2/2] Enhance HasUnresolvedVariables method to support double underscore and hash-delimited tokens in image references --- .../DockerReference/DockerReferenceUtility.cs | 22 +++++++-- .../DockerReferenceUtilityTests.cs | 48 +++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ComponentDetection.Common/DockerReference/DockerReferenceUtility.cs b/src/Microsoft.ComponentDetection.Common/DockerReference/DockerReferenceUtility.cs index fe3122647..83eba3413 100644 --- a/src/Microsoft.ComponentDetection.Common/DockerReference/DockerReferenceUtility.cs +++ b/src/Microsoft.ComponentDetection.Common/DockerReference/DockerReferenceUtility.cs @@ -28,6 +28,7 @@ namespace Microsoft.ComponentDetection.Common; using System; using System.Diagnostics.CodeAnalysis; +using System.Text.RegularExpressions; using Microsoft.ComponentDetection.Contracts; using Microsoft.Extensions.Logging; @@ -39,14 +40,29 @@ public static class DockerReferenceUtility private const string LEGACYDEFAULTDOMAIN = "index.docker.io"; private const string OFFICIALREPOSITORYNAME = "library"; + // Characters that only appear in an image reference as part of an unresolved templating + // token. '$', '{' and '}' cover shell / Helm / Go-template placeholders (e.g. ${VAR}, + // {{ .Values.tag }}); '#' covers Azure DevOps and other token-replacement placeholders + // (e.g. #imageTag#) and is never valid in a resolved docker reference. + private static readonly char[] TemplateDelimiters = ['$', '{', '}', '#']; + + // Matches token-replacement placeholders that wrap an identifier in double underscores, + // e.g. __IMAGE_TAG__ or __MCR_ENDPOINT__. Without this they parse as an uppercase repository + // name and surface as a noisy parse failure instead of being skipped as a templated value. + private static readonly Regex DoubleUnderscoreTokenRegex = new(@"__\w+__"); + /// - /// Returns true if the reference contains unresolved variable placeholders (e.g., ${VAR}, {{ .Values.tag }}). - /// Such references should be skipped before calling or . + /// Returns true if the reference contains unresolved variable or templating placeholders, + /// e.g. ${VAR}, {{ .Values.tag }}, #imageTag#, or __IMAGE_TAG__. + /// Such references are not real, resolvable images, so they should be skipped before calling + /// or and treated as + /// unresolved values rather than reported as parse failures. /// /// The image reference string to check. /// true if the reference contains variable placeholder characters; otherwise false. public static bool HasUnresolvedVariables(string reference) => - reference.IndexOfAny(['$', '{', '}']) >= 0; + reference.IndexOfAny(TemplateDelimiters) >= 0 || + DoubleUnderscoreTokenRegex.IsMatch(reference); /// /// Attempts to parse an image reference string into a . diff --git a/test/Microsoft.ComponentDetection.Common.Tests/DockerReferenceUtilityTests.cs b/test/Microsoft.ComponentDetection.Common.Tests/DockerReferenceUtilityTests.cs index d50c65be0..902e80b9d 100644 --- a/test/Microsoft.ComponentDetection.Common.Tests/DockerReferenceUtilityTests.cs +++ b/test/Microsoft.ComponentDetection.Common.Tests/DockerReferenceUtilityTests.cs @@ -284,18 +284,66 @@ public void HasUnresolvedVariables_ReturnsTrueForBraces() DockerReferenceUtility.HasUnresolvedVariables("{{ .Values.image }}").Should().BeTrue(); } + [TestMethod] + public void HasUnresolvedVariables_ReturnsTrueForDoubleUnderscoreTokens() + { + DockerReferenceUtility.HasUnresolvedVariables("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__").Should().BeTrue(); + } + + [TestMethod] + public void HasUnresolvedVariables_ReturnsTrueForHashDelimitedTokens() + { + DockerReferenceUtility.HasUnresolvedVariables("#cs_containerRegistryLoginServerUrl#/coreservicesaksservice_#cs_aks_workloadName#_#cs_aks_serviceTrackIdentifier#/#serviceName#:#imageTag#").Should().BeTrue(); + } + [TestMethod] public void HasUnresolvedVariables_ReturnsFalseForPlainReference() { DockerReferenceUtility.HasUnresolvedVariables("docker.io/library/nginx:latest").Should().BeFalse(); } + [TestMethod] + public void HasUnresolvedVariables_ReturnsFalseForReferenceWithUnderscores() + { + DockerReferenceUtility.HasUnresolvedVariables("mcr.microsoft.com/some_repo/my_image:1.0").Should().BeFalse(); + } + [TestMethod] public void TryParseImageReference_ReturnsNullForUnresolvedVariables() { DockerReferenceUtility.TryParseImageReference("${IMAGE}:latest").Should().BeNull(); } + [TestMethod] + public void TryParseImageReference_ReturnsNullForDoubleUnderscoreTokens() + { + DockerReferenceUtility.TryParseImageReference("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__").Should().BeNull(); + } + + [TestMethod] + public void TryParseImageReference_ReturnsNullForHashDelimitedTokens() + { + DockerReferenceUtility.TryParseImageReference("#cs_containerRegistryLoginServerUrl#/svc/#serviceName#:#imageTag#").Should().BeNull(); + } + + [TestMethod] + public void TryParseImageReference_DoesNotLogWarningForTemplatedReference() + { + var logger = new Mock(); + + var result = DockerReferenceUtility.TryParseImageReference("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__", logger.Object); + + result.Should().BeNull(); + logger.Verify( + l => l.Log( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny>()), + Times.Never); + } + [TestMethod] public void TryParseImageReference_ReturnsNullForInvalidReference() {