Skip to content

Commit 5477018

Browse files
committed
feat: defensive identity overrides for MavenComponent and GitComponent
MavenComponent
- Override GetExtendedIdProperties() to suppress DownloadUrl/SourceUrl from Id. GAV (groupId/artifactId/version) is the canonical Maven identity; the Maven Central download URL is deterministic from GAV and SourceUrl is surfaced server-side from the POM - neither should affect identity.

GitComponent
- Add PackageUrl override returning pkg:github/{owner}/{repo}@{commit} for repositories hosted on github.com (case-insensitive host match, .git suffix stripped, trailing slash normalised, path must resolve cleanly to owner/repo). Returns null for any other host (gitlab, bitbucket, ADO, GitHub Enterprise) and for malformed paths; consumers should fall back to RepositoryUrl in that case.
- Override GetExtendedIdProperties() defensively for the same reason as MavenComponent.

Tests
- PurlGenerationTests: explicit Maven coverage plus 7 GitHub PURL cases (canonical, .git suffix, trailing slash, case-insensitive host, non-github hosts, malformed paths, missing commit hash).
- TypedComponentSerializationTests: Id-stability tests proving both overrides exclude DownloadUrl/SourceUrl from Id even when set.

No production behaviour change today (no detector currently sets DownloadUrl/SourceUrl on MavenComponent/GitComponent), but this locks identity stability ahead of upcoming SBOM enrichment work that will populate DownloadUrl in the CD-Internal converter.
1 parent 4c08ecd commit 5477018

4 files changed

Lines changed: 217 additions & 0 deletions

File tree

src/Microsoft.ComponentDetection.Contracts/TypedComponent/GitComponent.cs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22
namespace Microsoft.ComponentDetection.Contracts.TypedComponent;
33

44
using System;
5+
using System.Collections.Generic;
56
using System.Text.Json.Serialization;
7+
using PackageUrl;
68

79
public class GitComponent : TypedComponent
810
{
11+
private const string GithubHost = "github.com";
12+
private const string DotGitSuffix = ".git";
13+
914
public GitComponent(Uri repositoryUrl, string commitHash)
1015
{
1116
this.RepositoryUrl = this.ValidateRequiredInput(repositoryUrl, nameof(this.RepositoryUrl), nameof(ComponentType.Git));
@@ -32,5 +37,79 @@ public GitComponent()
3237
[JsonIgnore]
3338
public override ComponentType Type => ComponentType.Git;
3439

40+
/// <summary>
/// Gets the package URL for this component in the form
/// <c>pkg:github/{owner}/{repo}@{commit}</c>.
/// </summary>
/// <remarks>
/// A value is produced only when <see cref="CommitHash"/> is non-empty and
/// <see cref="RepositoryUrl"/> is an absolute github.com URL whose path is exactly
/// <c>owner/repo</c>. In every other case (other hosts such as gitlab, bitbucket, ADO or
/// GitHub Enterprise, or a malformed path) the getter yields null and consumers should
/// fall back to <see cref="RepositoryUrl"/>.
/// </remarks>
[JsonPropertyName("packageUrl")]
public override PackageURL PackageUrl
{
    get
    {
        // Without a commit there is no version to pin the PURL to.
        var commit = this.CommitHash;
        if (string.IsNullOrEmpty(commit))
        {
            return null;
        }

        return TryGetGithubOwnerAndRepo(this.RepositoryUrl, out var purlNamespace, out var purlName)
            ? new PackageURL("github", purlNamespace, purlName, commit, null, null)
            : null;
    }
}
60+
3561
/// <summary>
/// Builds the base identifier from the repository URL, the commit hash and the component type.
/// </summary>
/// <returns>A string of the form <c>{RepositoryUrl} : {CommitHash} - {Type}</c>.</returns>
protected override string ComputeBaseId()
{
    return $"{this.RepositoryUrl} : {this.CommitHash} - {this.Type}";
}
62+
63+
/// <summary>
/// Contributes no extended identity properties, so <see cref="TypedComponent.Id"/> remains
/// unchanged even when <see cref="TypedComponent.DownloadUrl"/> or
/// <see cref="TypedComponent.SourceUrl"/> is populated by a detector.
/// RepositoryUrl and CommitHash are already part of BaseId; the GitHub archive download URL is
/// deterministic and a source URL would only repeat RepositoryUrl.
/// </summary>
/// <returns>An empty sequence.</returns>
protected override IEnumerable<KeyValuePair<string, string>> GetExtendedIdProperties()
{
    // Nothing beyond the base id participates in a Git component's identity.
    return Array.Empty<KeyValuePair<string, string>>();
}
74+
75+
/// <summary>
/// Extracts the owner and repository name from a github.com repository URL.
/// </summary>
/// <param name="repositoryUrl">The repository URL to inspect; may be null.</param>
/// <param name="owner">Receives the first path segment on success; null otherwise.</param>
/// <param name="repo">
/// Receives the second path segment with any trailing <c>.git</c> removed on success; null otherwise.
/// </param>
/// <returns>
/// True only when the URL is absolute, its host equals github.com (ignoring case) and its path,
/// after trimming leading and trailing slashes, consists of exactly two non-empty segments.
/// </returns>
private static bool TryGetGithubOwnerAndRepo(Uri repositoryUrl, out string owner, out string repo)
{
    owner = null;
    repo = null;

    // Relative URIs have no host to compare, so they are rejected before Host is read.
    if (repositoryUrl == null || !repositoryUrl.IsAbsoluteUri)
    {
        return false;
    }

    if (!string.Equals(GithubHost, repositoryUrl.Host, StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    // Trimming slashes normalises "/owner/repo/" to "owner/repo". An empty path splits into a
    // single empty segment and is rejected by the length check below.
    var pathParts = (repositoryUrl.AbsolutePath ?? string.Empty).Trim('/').Split('/');
    if (pathParts.Length != 2)
    {
        return false;
    }

    var candidateOwner = pathParts[0];
    var candidateRepo = pathParts[1];

    // Clone URLs commonly end in ".git"; that suffix is not part of the repository name.
    if (candidateRepo.EndsWith(DotGitSuffix, StringComparison.OrdinalIgnoreCase))
    {
        candidateRepo = candidateRepo.Substring(0, candidateRepo.Length - DotGitSuffix.Length);
    }

    // A repo segment that was only ".git" is now empty and must not produce a result.
    if (string.IsNullOrEmpty(candidateOwner) || string.IsNullOrEmpty(candidateRepo))
    {
        return false;
    }

    owner = candidateOwner;
    repo = candidateRepo;
    return true;
}
36115
}

src/Microsoft.ComponentDetection.Contracts/TypedComponent/MavenComponent.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#nullable disable
22
namespace Microsoft.ComponentDetection.Contracts.TypedComponent;
33

4+
using System.Collections.Generic;
45
using System.Text.Json.Serialization;
56
using PackageUrl;
67

@@ -34,4 +35,16 @@ public MavenComponent()
3435
public override PackageURL PackageUrl
{
    get
    {
        // PURL type "maven": GroupId is the namespace, ArtifactId the name, Version the version.
        return new PackageURL("maven", this.GroupId, this.ArtifactId, this.Version, null, null);
    }
}
3536

3637
/// <summary>
/// Builds the base identifier from the Maven coordinates and the component type.
/// </summary>
/// <returns>A string of the form <c>{GroupId} {ArtifactId} {Version} - {Type}</c>.</returns>
protected override string ComputeBaseId()
{
    return $"{this.GroupId} {this.ArtifactId} {this.Version} - {this.Type}";
}
38+
39+
/// <summary>
/// Contributes no extended identity properties, so <see cref="TypedComponent.Id"/> remains
/// unchanged even when <see cref="TypedComponent.DownloadUrl"/> or
/// <see cref="TypedComponent.SourceUrl"/> is populated by a detector.
/// GroupId/ArtifactId/Version are already part of BaseId; the Maven Central download URL is
/// deterministic and source/repo URLs are surfaced server-side from the POM.
/// </summary>
/// <returns>An empty sequence.</returns>
protected override IEnumerable<KeyValuePair<string, string>> GetExtendedIdProperties()
{
    // Nothing beyond the base id participates in a Maven component's identity.
    return System.Array.Empty<KeyValuePair<string, string>>();
}
3750
}

test/Microsoft.ComponentDetection.Contracts.Tests/PurlGenerationTests.cs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#nullable disable
22
namespace Microsoft.ComponentDetection.Contracts.Tests;
33

4+
using System;
45
using AwesomeAssertions;
56
using Microsoft.ComponentDetection.Contracts.TypedComponent;
67
using Microsoft.VisualStudio.TestTools.UnitTesting;
@@ -110,4 +111,96 @@ public void CocoaPodNameShouldPurlWithCustomQualifier()
110111

111112
packageOne.PackageUrl.ToString().Should().Be("pkg:cocoapods/AFNetworking@4.0.1?repository_url=https:%2F%2Fcustom_repo.example.com%2Fpath%2Fto%2Frepo%2Fspecs.git");
112113
}
114+
115+
[TestMethod]
public void MavenComponentShouldGenerateMavenPurl()
{
    // Spec: https://github.com/package-url/purl-spec/blob/b8ddd39a6d533b8895f3b741f2e62e2695d82aa4/PURL-TYPES.rst#maven
    var purl = new MavenComponent("com.google.guava", "guava", "33.0-jre").PackageUrl;

    // Each PURL part is checked individually before the rendered form.
    purl.Type.Should().Be("maven");
    purl.Namespace.Should().Be("com.google.guava");
    purl.Name.Should().Be("guava");
    purl.Version.Should().Be("33.0-jre");
    purl.ToString().Should().Be("pkg:maven/com.google.guava/guava@33.0-jre");
}
127+
128+
[TestMethod]
public void GitComponentGithubRepositoryShouldGenerateGithubPurl()
{
    // Spec: https://github.com/package-url/purl-spec/blob/b8ddd39a6d533b8895f3b741f2e62e2695d82aa4/PURL-TYPES.rst#github
    var repositoryUrl = new Uri("https://github.com/google/guava");
    var purl = new GitComponent(repositoryUrl, "abcdef1234567890").PackageUrl;

    // Owner maps to the namespace, repo to the name and the commit hash to the version.
    purl.Type.Should().Be("github");
    purl.Namespace.Should().Be("google");
    purl.Name.Should().Be("guava");
    purl.Version.Should().Be("abcdef1234567890");
    purl.ToString().Should().Be("pkg:github/google/guava@abcdef1234567890");
}
140+
141+
[TestMethod]
public void GitComponentGithubRepositoryWithDotGitSuffixShouldStripIt()
{
    // Clone-style URL: the ".git" suffix must not appear in the PURL name.
    var cloneUrl = new Uri("https://github.com/google/guava.git");
    var purl = new GitComponent(cloneUrl, "abcdef1234567890").PackageUrl;

    purl.Name.Should().Be("guava", "the .git suffix is not part of the canonical repo name");
    purl.ToString().Should().Be("pkg:github/google/guava@abcdef1234567890");
}
149+
150+
[TestMethod]
public void GitComponentGithubRepositoryWithTrailingSlashShouldBeNormalized()
{
    // A trailing slash must not change the owner/repo that ends up in the PURL.
    var urlWithTrailingSlash = new Uri("https://github.com/google/guava/");

    var rendered = new GitComponent(urlWithTrailingSlash, "abcdef1234567890").PackageUrl.ToString();

    rendered.Should().Be("pkg:github/google/guava@abcdef1234567890");
}
157+
158+
[TestMethod]
public void GitComponentGithubHostMatchIsCaseInsensitive()
{
    // Mixed-case host spelling must still be recognised as github.com.
    var mixedCaseHostUrl = new Uri("https://GitHub.com/google/guava");

    var rendered = new GitComponent(mixedCaseHostUrl, "abcdef1234567890").PackageUrl.ToString();

    rendered.Should().Be("pkg:github/google/guava@abcdef1234567890");
}
165+
166+
[TestMethod]
public void GitComponentNonGithubRepositoryShouldHaveNoPackageUrl()
{
    // GitLab / Bitbucket / Azure DevOps / GitHub Enterprise have no canonical PURL representation today.
    // Consumers should fall back to RepositoryUrl in this case.
    static GitComponent ComponentAt(string url) => new GitComponent(new Uri(url), "abcdef1234567890");

    ComponentAt("https://gitlab.com/foo/bar").PackageUrl.Should().BeNull();
    ComponentAt("https://bitbucket.org/foo/bar").PackageUrl.Should().BeNull();
    ComponentAt("https://dev.azure.com/org/proj/_git/repo").PackageUrl.Should().BeNull();
    ComponentAt("https://github.contoso.com/foo/bar").PackageUrl.Should().BeNull();
}
181+
182+
[TestMethod]
public void GitComponentMalformedGithubUrlShouldHaveNoPackageUrl()
{
    // Owner only, root only, or paths deeper than owner/repo (e.g. browse URLs) are not
    // canonical repository URLs and must not yield a PURL.
    static GitComponent ComponentAt(string url) => new GitComponent(new Uri(url), "abcdef1234567890");

    ComponentAt("https://github.com/google").PackageUrl.Should().BeNull();
    ComponentAt("https://github.com/google/guava/tree/main").PackageUrl.Should().BeNull();
    ComponentAt("https://github.com/").PackageUrl.Should().BeNull();
}
194+
195+
[TestMethod]
public void GitComponentMissingCommitHashShouldHaveNoPackageUrl()
{
    // The public ctor requires a commit hash; the parameterless (deserialization) ctor leaves it
    // unset, which is the state exercised here.
    var repositoryUrl = new Uri("https://github.com/google/guava");
    var withoutCommit = new GitComponent { RepositoryUrl = repositoryUrl };

    withoutCommit.PackageUrl.Should().BeNull();
}
113206
}

test/Microsoft.ComponentDetection.Contracts.Tests/TypedComponentSerializationTests.cs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,4 +504,36 @@ public void TypedComponent_Id_IncludesBothUrls_WhenPresent()
504504
tc.BaseId.Should().Be("TestPackage 1.0.0 - NuGet");
505505
tc.Id.Should().Be("TestPackage 1.0.0 - NuGet [DownloadUrl:https://example.com/package/1.0.0 SourceUrl:https://github.com/test-org/TestPackage]");
506506
}
507+
508+
[TestMethod]
public void MavenComponent_Id_ExcludesDownloadUrlAndSourceUrl_WhenSet()
{
    // MavenComponent overrides GetExtendedIdProperties so that Id is the same whether or not a
    // detector populates DownloadUrl / SourceUrl (e.g. CDS surfaces SourceUrl from the POM, and
    // DownloadUrl is deterministic from GAV — neither should affect identity).
    var downloadUrl = new Uri("https://repo1.maven.org/maven2/com/google/guava/guava/33.0-jre/guava-33.0-jre.jar");
    var sourceUrl = new Uri("https://github.com/google/guava");
    var tc = new MavenComponent("com.google.guava", "guava", "33.0-jre")
    {
        DownloadUrl = downloadUrl,
        SourceUrl = sourceUrl,
    };

    var baseId = tc.BaseId;

    baseId.Should().Be("com.google.guava guava 33.0-jre - Maven");
    tc.Id.Should().Be(baseId, "DownloadUrl and SourceUrl must not affect MavenComponent identity");
}
523+
524+
[TestMethod]
public void GitComponent_Id_ExcludesDownloadUrlAndSourceUrl_WhenSet()
{
    // GitComponent overrides GetExtendedIdProperties for the same reason as MavenComponent:
    // SourceUrl would duplicate RepositoryUrl, and DownloadUrl (the github archive URL) is deterministic.
    var repositoryUrl = new Uri("https://github.com/google/guava");
    var archiveUrl = new Uri("https://github.com/google/guava/archive/abcdef1234567890.zip");
    var tc = new GitComponent(repositoryUrl, "abcdef1234567890")
    {
        DownloadUrl = archiveUrl,
        SourceUrl = repositoryUrl,
    };

    var baseId = tc.BaseId;

    baseId.Should().Be("https://github.com/google/guava : abcdef1234567890 - Git");
    tc.Id.Should().Be(baseId, "DownloadUrl and SourceUrl must not affect GitComponent identity");
}
507539
}

0 commit comments

Comments
 (0)