Skip to content

Commit f8c809e

Browse files
authored
Enhance Docker Compose and Helm detectors for parallel processing (#1819)
* Enhance Docker Compose and Helm detectors for improved parallel processing and file size handling * Enhance HasUnresolvedVariables method to support double underscore and hash-delimited tokens in image references
1 parent 248e744 commit f8c809e

4 files changed

Lines changed: 123 additions & 17 deletions

File tree

src/Microsoft.ComponentDetection.Common/DockerReference/DockerReferenceUtility.cs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ namespace Microsoft.ComponentDetection.Common;
2828

2929
using System;
3030
using System.Diagnostics.CodeAnalysis;
31+
using System.Text.RegularExpressions;
3132
using Microsoft.ComponentDetection.Contracts;
3233
using Microsoft.Extensions.Logging;
3334

@@ -39,14 +40,29 @@ public static class DockerReferenceUtility
3940
private const string LEGACYDEFAULTDOMAIN = "index.docker.io";
4041
private const string OFFICIALREPOSITORYNAME = "library";
4142

43+
// Characters that only appear in an image reference as part of an unresolved templating
44+
// token. '$', '{' and '}' cover shell / Helm / Go-template placeholders (e.g. ${VAR},
45+
// {{ .Values.tag }}); '#' covers Azure DevOps and other token-replacement placeholders
46+
// (e.g. #imageTag#) and is never valid in a resolved docker reference.
47+
private static readonly char[] TemplateDelimiters = ['$', '{', '}', '#'];
48+
49+
// Matches token-replacement placeholders that wrap an identifier in double underscores,
50+
// e.g. __IMAGE_TAG__ or __MCR_ENDPOINT__. Without this they parse as an uppercase repository
51+
// name and surface as a noisy parse failure instead of being skipped as a templated value.
52+
private static readonly Regex DoubleUnderscoreTokenRegex = new(@"__\w+__");
53+
4254
/// <summary>
43-
/// Returns true if the reference contains unresolved variable placeholders (e.g., ${VAR}, {{ .Values.tag }}).
44-
/// Such references should be skipped before calling <see cref="ParseFamiliarName"/> or <see cref="ParseQualifiedName"/>.
55+
/// Returns true if the reference contains unresolved variable or templating placeholders,
56+
/// e.g. <c>${VAR}</c>, <c>{{ .Values.tag }}</c>, <c>#imageTag#</c>, or <c>__IMAGE_TAG__</c>.
57+
/// Such references are not real, resolvable images, so they should be skipped before calling
58+
/// <see cref="ParseFamiliarName"/> or <see cref="ParseQualifiedName"/> and treated as
59+
/// unresolved values rather than reported as parse failures.
4560
/// </summary>
4661
/// <param name="reference">The image reference string to check.</param>
4762
/// <returns><c>true</c> if the reference contains variable placeholder characters; otherwise <c>false</c>.</returns>
4863
public static bool HasUnresolvedVariables(string reference) =>
49-
reference.IndexOfAny(['$', '{', '}']) >= 0;
64+
reference.IndexOfAny(TemplateDelimiters) >= 0 ||
65+
DoubleUnderscoreTokenRegex.IsMatch(reference);
5066

5167
/// <summary>
5268
/// Attempts to parse an image reference string into a <see cref="DockerReference"/>.

src/Microsoft.ComponentDetection.Detectors/dockercompose/DockerComposeComponentDetector.cs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,15 @@ public DockerComposeComponentDetector(
4040

4141
public override IEnumerable<string> Categories => [nameof(DetectorClass.DockerCompose)];
4242

43-
protected override async Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary<string, string> detectorArgs, CancellationToken cancellationToken = default)
43+
/// <summary>
44+
/// Gets or sets a value indicating whether compose files are processed concurrently.
45+
/// Each file is parsed independently into its own <see cref="ISingleFileComponentRecorder"/>
46+
/// and <see cref="DockerReferenceUtility"/> is stateless, so parsing is thread-safe and
47+
/// scales across cores for repositories containing many compose files.
48+
/// </summary>
49+
protected override bool EnableParallelism { get; set; } = true;
50+
51+
protected override Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary<string, string> detectorArgs, CancellationToken cancellationToken = default)
4452
{
4553
var singleFileComponentRecorder = processRequest.SingleFileComponentRecorder;
4654
var file = processRequest.ComponentStream;
@@ -49,18 +57,18 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
4957
{
5058
this.Logger.LogInformation("Discovered Docker Compose file: {Location}", file.Location);
5159

52-
string contents;
60+
// Parse directly from the stream; the content is already buffered in memory by
61+
// LazyComponentStream, so reading it into an intermediate string only adds an
62+
// extra full-file allocation and GC pressure under parallel processing.
63+
var yaml = new YamlStream();
5364
using (var reader = new StreamReader(file.Stream))
5465
{
55-
contents = await reader.ReadToEndAsync(cancellationToken);
66+
yaml.Load(reader);
5667
}
5768

58-
var yaml = new YamlStream();
59-
yaml.Load(new StringReader(contents));
60-
6169
if (yaml.Documents.Count == 0)
6270
{
63-
return;
71+
return Task.CompletedTask;
6472
}
6573

6674
foreach (var document in yaml.Documents)
@@ -75,6 +83,8 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
7583
{
7684
this.Logger.LogError(e, "Failed to parse Docker Compose file: {Location}", file.Location);
7785
}
86+
87+
return Task.CompletedTask;
7888
}
7989

8090
private static YamlMappingNode? GetMappingChild(YamlMappingNode parent, string key)

src/Microsoft.ComponentDetection.Detectors/helm/HelmComponentDetector.cs

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ namespace Microsoft.ComponentDetection.Detectors.Helm;
1717

1818
public class HelmComponentDetector : FileComponentDetector, IExperimentalDetector
1919
{
20+
/// <summary>
21+
/// Maximum size (in bytes) of a values file the detector will parse. The "*values*" globs
22+
/// can match large, non-Helm YAML files whose full-DOM parse dominates worst-case runtime;
23+
/// files above this limit are skipped so a single pathological file cannot exhaust the
24+
/// detector's time budget.
25+
/// </summary>
26+
private const long MaxValuesFileSizeBytes = 20 * 1024 * 1024; // 20 MB
27+
2028
public HelmComponentDetector(
2129
IComponentStreamEnumerableFactory componentStreamEnumerableFactory,
2230
IObservableDirectoryWalkerFactory walkerFactory,
@@ -41,6 +49,14 @@ public HelmComponentDetector(
4149

4250
public override IEnumerable<string> Categories => [nameof(DetectorClass.Helm)];
4351

52+
/// <summary>
53+
/// Gets or sets a value indicating whether values files are processed concurrently.
54+
/// Each file is parsed independently into its own <see cref="ISingleFileComponentRecorder"/>
55+
/// and <see cref="DockerReferenceUtility"/> is stateless, so parsing is thread-safe and
56+
/// scales across cores for repositories containing many charts.
57+
/// </summary>
58+
protected override bool EnableParallelism { get; set; } = true;
59+
4460
/// <summary>
4561
/// Pre-filters scan work to only values files co-located with a Chart.yaml/Chart.yml.
4662
/// Materializes all matched files, identifies Helm chart directories, then filters.
@@ -65,7 +81,7 @@ protected override async Task<IObservable<ProcessRequest>> OnPrepareDetectionAsy
6581
.ToObservable();
6682
}
6783

68-
protected override async Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary<string, string> detectorArgs, CancellationToken cancellationToken = default)
84+
protected override Task OnFileFoundAsync(ProcessRequest processRequest, IDictionary<string, string> detectorArgs, CancellationToken cancellationToken = default)
6985
{
7086
var file = processRequest.ComponentStream;
7187

@@ -74,20 +90,34 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
7490
// filename/directory checks are needed.
7591
try
7692
{
93+
// Check the size before touching ComponentStream so an oversized file is never
94+
// buffered into memory. The "*values*" globs can match large, non-Helm YAML files
95+
// whose full-DOM parse is the main driver of worst-case (timeout) runtime.
96+
var fileInfo = new FileInfo(file.Location);
97+
if (fileInfo.Exists && fileInfo.Length > MaxValuesFileSizeBytes)
98+
{
99+
this.Logger.LogWarning(
100+
"Skipping Helm values file exceeding size limit ({Length} bytes > {Limit} bytes): {Location}",
101+
fileInfo.Length,
102+
MaxValuesFileSizeBytes,
103+
file.Location);
104+
return Task.CompletedTask;
105+
}
106+
77107
this.Logger.LogInformation("Discovered Helm values file: {Location}", file.Location);
78108

79-
string contents;
109+
// Parse directly from the stream; the content is already buffered in memory by
110+
// LazyComponentStream, so reading it into an intermediate string only adds an
111+
// extra full-file allocation and GC pressure under parallel processing.
112+
var yaml = new YamlStream();
80113
using (var reader = new StreamReader(file.Stream))
81114
{
82-
contents = await reader.ReadToEndAsync(cancellationToken);
115+
yaml.Load(reader);
83116
}
84117

85-
var yaml = new YamlStream();
86-
yaml.Load(new StringReader(contents));
87-
88118
if (yaml.Documents.Count == 0)
89119
{
90-
return;
120+
return Task.CompletedTask;
91121
}
92122

93123
this.ExtractImageReferencesFromValues(yaml, processRequest.SingleFileComponentRecorder);
@@ -96,6 +126,8 @@ protected override async Task OnFileFoundAsync(ProcessRequest processRequest, ID
96126
{
97127
this.Logger.LogError(e, "Failed to parse Helm file: {Location}", file.Location);
98128
}
129+
130+
return Task.CompletedTask;
99131
}
100132

101133
/// <summary>

test/Microsoft.ComponentDetection.Common.Tests/DockerReferenceUtilityTests.cs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,18 +284,66 @@ public void HasUnresolvedVariables_ReturnsTrueForBraces()
284284
DockerReferenceUtility.HasUnresolvedVariables("{{ .Values.image }}").Should().BeTrue();
285285
}
286286

287+
[TestMethod]
288+
public void HasUnresolvedVariables_ReturnsTrueForDoubleUnderscoreTokens()
289+
{
290+
DockerReferenceUtility.HasUnresolvedVariables("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__").Should().BeTrue();
291+
}
292+
293+
[TestMethod]
294+
public void HasUnresolvedVariables_ReturnsTrueForHashDelimitedTokens()
295+
{
296+
DockerReferenceUtility.HasUnresolvedVariables("#cs_containerRegistryLoginServerUrl#/coreservicesaksservice_#cs_aks_workloadName#_#cs_aks_serviceTrackIdentifier#/#serviceName#:#imageTag#").Should().BeTrue();
297+
}
298+
287299
[TestMethod]
288300
public void HasUnresolvedVariables_ReturnsFalseForPlainReference()
289301
{
290302
DockerReferenceUtility.HasUnresolvedVariables("docker.io/library/nginx:latest").Should().BeFalse();
291303
}
292304

305+
[TestMethod]
306+
public void HasUnresolvedVariables_ReturnsFalseForReferenceWithUnderscores()
307+
{
308+
DockerReferenceUtility.HasUnresolvedVariables("mcr.microsoft.com/some_repo/my_image:1.0").Should().BeFalse();
309+
}
310+
293311
[TestMethod]
294312
public void TryParseImageReference_ReturnsNullForUnresolvedVariables()
295313
{
296314
DockerReferenceUtility.TryParseImageReference("${IMAGE}:latest").Should().BeNull();
297315
}
298316

317+
[TestMethod]
318+
public void TryParseImageReference_ReturnsNullForDoubleUnderscoreTokens()
319+
{
320+
DockerReferenceUtility.TryParseImageReference("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__").Should().BeNull();
321+
}
322+
323+
[TestMethod]
324+
public void TryParseImageReference_ReturnsNullForHashDelimitedTokens()
325+
{
326+
DockerReferenceUtility.TryParseImageReference("#cs_containerRegistryLoginServerUrl#/svc/#serviceName#:#imageTag#").Should().BeNull();
327+
}
328+
329+
[TestMethod]
330+
public void TryParseImageReference_DoesNotLogWarningForTemplatedReference()
331+
{
332+
var logger = new Mock<ILogger>();
333+
334+
var result = DockerReferenceUtility.TryParseImageReference("__MCR_ENDPOINT__/aks/devinfra/helm3sample:__IMAGE_TAG__", logger.Object);
335+
336+
result.Should().BeNull();
337+
logger.Verify(
338+
l => l.Log(
339+
It.IsAny<LogLevel>(),
340+
It.IsAny<EventId>(),
341+
It.IsAny<It.IsAnyType>(),
342+
It.IsAny<Exception>(),
343+
It.IsAny<Func<It.IsAnyType, Exception, string>>()),
344+
Times.Never);
345+
}
346+
299347
[TestMethod]
300348
public void TryParseImageReference_ReturnsNullForInvalidReference()
301349
{

0 commit comments

Comments
 (0)