From f70d7be62ce0b910fb3ce99f6ebe9a537815d181 Mon Sep 17 00:00:00 2001 From: Jan Krivanek Date: Thu, 12 Mar 2026 21:22:57 +0100 Subject: [PATCH 1/3] Add plugin-aware skill validation with activation detection --- eng/dashboard/dashboard.js | 323 +++++++++++++++--- eng/dashboard/generate-benchmark-data.ps1 | 102 +++++- .../src/Commands/ValidateCommand.cs | 283 ++++++++++----- eng/skill-validator/src/Models/Models.cs | 20 +- .../src/Services/AgentRunner.cs | 160 +++++++-- .../src/Services/Comparator.cs | 20 +- .../src/Services/MetricsCollector.cs | 18 +- .../src/Services/PairwiseJudge.cs | 2 +- eng/skill-validator/src/Services/Reporter.cs | 315 +++++++++++++---- .../src/Services/SkillDiscovery.cs | 61 ++++ eng/skill-validator/tests/ComparatorTests.cs | 31 ++ eng/skill-validator/tests/DiscoveryTests.cs | 93 +++++ eng/skill-validator/tests/MetricsTests.cs | 137 ++++++++ .../tests/OverfittingJudgeTests.cs | 22 +- eng/skill-validator/tests/RunnerTests.cs | 94 +++-- 15 files changed, 1388 insertions(+), 293 deletions(-) diff --git a/eng/dashboard/dashboard.js b/eng/dashboard/dashboard.js index b8f5e58265..b0e731d302 100644 --- a/eng/dashboard/dashboard.js +++ b/eng/dashboard/dashboard.js @@ -160,14 +160,16 @@ if (qualityEntries.length > 0) { // Use only the most recent entries for summary cards const recentEntries = qualityEntries.slice(-SUMMARY_WINDOW); - let skilledTotal = 0, skilledCount = 0, vanillaTotal = 0, vanillaCount = 0; + let skilledTotal = 0, skilledCount = 0, pluginTotal = 0, pluginCount = 0, vanillaTotal = 0, vanillaCount = 0; recentEntries.forEach(entry => { entry.benches.forEach(b => { if (b.name.endsWith('- Skilled Quality')) { skilledTotal += b.value; skilledCount++; } + if (b.name.endsWith('- Plugin Quality')) { pluginTotal += b.value; pluginCount++; } if (b.name.endsWith('- Vanilla Quality')) { vanillaTotal += b.value; vanillaCount++; } }); }); const skilledAvg = skilledCount > 0 ? skilledTotal / skilledCount : null; + const pluginAvg = pluginCount > 0 ? pluginTotal / pluginCount : null; const vanillaAvg = vanillaCount > 0 ? vanillaTotal / vanillaCount : null; const latestModel = qualityEntries[qualityEntries.length - 1].model; const windowLabel = qualityEntries.length > SUMMARY_WINDOW @@ -177,22 +179,43 @@ const delta = (skilledAvg - vanillaAvg).toFixed(2); const deltaClass = delta > 0 ? 'positive' : delta < 0 ? 'negative' : 'neutral'; const deltaSign = delta > 0 ? '+' : ''; - summaryDiv.innerHTML = ` + let cardsHtml = `
-
Skilled Avg
+
Skilled (Isolated) Avg
${skilledAvg.toFixed(2)}
${windowLabel}
-
+ `; + if (pluginAvg !== null) { + cardsHtml += ` +
+
Skilled (Plugin) Avg
+
${pluginAvg.toFixed(2)}
+
${windowLabel}
+
`; + } + cardsHtml += `
Vanilla Avg
${vanillaAvg.toFixed(2)}
${windowLabel}
-
Delta
+
Delta (Isolated)
${deltaSign}${delta}
${delta > 0 ? 'Skills improve quality' : delta < 0 ? 'Skills degrade quality' : 'No difference'}
-
+ `; + if (pluginAvg !== null && vanillaAvg !== null) { + const pluginDelta = (pluginAvg - vanillaAvg).toFixed(2); + const pluginDeltaClass = pluginDelta > 0 ? 'positive' : pluginDelta < 0 ? 'negative' : 'neutral'; + const pluginDeltaSign = pluginDelta > 0 ? '+' : ''; + cardsHtml += ` +
+
Delta (Plugin)
+
${pluginDeltaSign}${pluginDelta}
+
${pluginDelta > 0 ? 'Plugin improves quality' : pluginDelta < 0 ? 'Plugin degrades quality' : 'No difference'}
+
`; + } + cardsHtml += `
Data Points
${qualityEntries.length}
@@ -204,6 +227,7 @@
latest run
`; + summaryDiv.innerHTML = cardsHtml; } // Count not-activated entries @@ -267,19 +291,32 @@ if (qualityEntries.length > 0) { // Discover tests from all entries (not just latest, which may have partial data) const tests = new Set(); + let hasAnyPlugin = false; qualityEntries.forEach(entry => { entry.benches.forEach(b => { - const match = b.name.match(/^(.+) - (Skilled|Vanilla) Quality$/); - if (match) tests.add(match[1]); + const match = b.name.match(/^(.+) - (Skilled|Plugin|Vanilla) Quality$/); + if (match) { + tests.add(match[1]); + if (match[2] === 'Plugin') hasAnyPlugin = true; + } }); }); tests.forEach(test => { - createPairedChart( - qualityChartsDiv, test, qualityEntries, - `${test} - Skilled Quality`, `${test} - Vanilla Quality`, - 'Skilled', 'Vanilla', '#58a6ff', '#8b949e' - ); + if (hasAnyPlugin) { + createTripleChart( + qualityChartsDiv, test, qualityEntries, + `${test} - Skilled Quality`, `${test} - Plugin Quality`, `${test} - Vanilla Quality`, + 'Isolated', 'Plugin', 'Vanilla', + '#58a6ff', '#3fb950', '#8b949e' + ); + } else { + createPairedChart( + qualityChartsDiv, test, qualityEntries, + `${test} - Skilled Quality`, `${test} - Vanilla Quality`, + 'Skilled', 'Vanilla', '#58a6ff', '#8b949e' + ); + } }); } @@ -288,10 +325,13 @@ if (efficiencyEntries.length > 0) { // Discover tests from all entries (not just latest, which may have partial data) const effTests = new Set(); + let hasAnyPluginEff = false; efficiencyEntries.forEach(entry => { entry.benches.forEach(b => { - const match = b.name.match(/^(.+) - Skilled Time$/); - if (match) effTests.add(match[1]); + const matchSkilled = b.name.match(/^(.+) - Skilled Time$/); + if (matchSkilled) effTests.add(matchSkilled[1]); + const matchPlugin = b.name.match(/^(.+) - Plugin Time$/); + if (matchPlugin) { effTests.add(matchPlugin[1]); hasAnyPluginEff = true; } }); }); @@ -310,15 +350,20 @@ // Precompute per-entry data in a single pass over e.benches const timeName = `${test} - Skilled Time`; const tokenName = `${test} - Skilled Tokens In`; + const plugTimeName = `${test} - Plugin Time`; + const plugTokenName = `${test} - Plugin Tokens In`; const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false }; const perEntryData = efficiencyEntries.map(e => { let timeBench = undefined; let tokenBench = undefined; + let plugTimeBench = undefined; + let plugTokenBench = undefined; for (const b of e.benches) { if (!timeBench && b.name === timeName) timeBench = b; else if (!tokenBench && b.name === tokenName) tokenBench = b; - if (timeBench && tokenBench) break; + else if (!plugTimeBench && b.name === plugTimeName) plugTimeBench = b; + else if (!plugTokenBench && b.name === plugTokenName) plugTokenBench = b; } const timeNA = !!(timeBench && timeBench.notActivated); const tokenNA = !!(tokenBench && tokenBench.notActivated); @@ -344,11 +389,15 @@ tokenNotActivated: tokenNA, tokenTimedOut: tokenTO, tokenOverfitting: tokenOF, + plugTimeValue: plugTimeBench ? plugTimeBench.value : null, + plugTokenValue: plugTokenBench ? plugTokenBench.value / 1000 : null, }; }); const timeData = perEntryData.map(d => d.timeValue); const tokenData = perEntryData.map(d => d.tokenValue); + const plugTimeData = perEntryData.map(d => d.plugTimeValue); + const plugTokenData = perEntryData.map(d => d.plugTokenValue); // Per-point styling using shared helper const timeAp = perEntryData.map(d => getPointAppearance({ timedOut: d.timeTimedOut, notActivated: d.timeNotActivated, overfitting: d.timeOverfitting }, '#f0883e')); @@ -362,41 +411,70 @@ const tokenPointRadius = tokenAp.map(a => a.radius); const tokenPointBorderWidth = tokenAp.map(a => a.borderWidth); + const datasets = [ + { + label: 'Isolated Time (s)', + data: timeData, + borderColor: '#f0883e', + borderWidth: 2, + pointBackgroundColor: timePointBg, + pointBorderColor: timePointBg, + pointRadius: timePointRadius, + pointBorderWidth: timePointBorderWidth, + pointStyle: timePointStyle, + tension: 0.3, + fill: false, + yAxisID: 'y' + }, + { + label: 'Isolated Tokens (k)', + data: tokenData, + borderColor: '#a371f7', + borderWidth: 2, + pointBackgroundColor: tokenPointBg, + pointBorderColor: tokenPointBg, + pointRadius: tokenPointRadius, + pointBorderWidth: tokenPointBorderWidth, + pointStyle: tokenPointStyle, + tension: 0.3, + borderDash: [5, 5], + fill: false, + yAxisID: 'y1' + } + ]; + + // Add plugin efficiency datasets if any plugin data exists + if (hasAnyPluginEff) { + datasets.push({ + label: 'Plugin Time (s)', + data: plugTimeData, + borderColor: '#3fb950', + borderWidth: 2, + pointRadius: 4, + pointHoverRadius: 6, + tension: 0.3, + fill: false, + yAxisID: 'y' + }); + datasets.push({ + label: 'Plugin Tokens (k)', + data: plugTokenData, + borderColor: '#56d364', + borderWidth: 2, + pointRadius: 4, + pointHoverRadius: 6, + tension: 0.3, + borderDash: [5, 5], + fill: false, + yAxisID: 'y1' + }); + } + new Chart(canvas, { type: 'line', data: { labels, - datasets: [ - { - label: 'Time (s)', - data: timeData, - borderColor: '#f0883e', - borderWidth: 2, - pointBackgroundColor: timePointBg, - pointBorderColor: timePointBg, - pointRadius: timePointRadius, - pointBorderWidth: timePointBorderWidth, - pointStyle: timePointStyle, - tension: 0.3, - fill: false, - yAxisID: 'y' - }, - { - label: 'Tokens In (k)', - data: tokenData, - borderColor: '#a371f7', - borderWidth: 2, - pointBackgroundColor: tokenPointBg, - pointBorderColor: tokenPointBg, - pointRadius: tokenPointRadius, - pointBorderWidth: tokenPointBorderWidth, - pointStyle: tokenPointStyle, - tension: 0.3, - borderDash: [5, 5], - fill: false, - yAxisID: 'y1' - } - ] + datasets }, options: { responsive: true, @@ -445,6 +523,157 @@ } } + // Helper: create a triple line chart (baseline, isolated, plugin) + function createTripleChart(container, title, entries, nameA, nameB, nameC, labelA, labelB, labelC, colorA, colorB, colorC) { + const div = document.createElement('div'); + div.className = 'chart-container'; + div.innerHTML = `

${title}

`; + container.appendChild(div); + const canvas = div.querySelector('canvas'); + + const labels = entries.map(e => { + const d = new Date(e.date); + return d.toLocaleDateString('en-US', { month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit' }); + }); + + // Precompute per-entry data in a single pass + const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false }; + const perEntryData = entries.map(e => { + let benchA = undefined, benchB = undefined, benchC = undefined; + for (const b of e.benches) { + if (!benchA && b.name === nameA) benchA = b; + else if (!benchB && b.name === nameB) benchB = b; + else if (!benchC && b.name === nameC) benchC = b; + if (benchA && benchB && benchC) break; + } + const aNotActivated = !!(benchA && benchA.notActivated); + const aTimedOut = !!(benchA && benchA.timedOut); + const aOverfitting = benchA && benchA.overfitting ? benchA.overfitting : null; + const bNotActivated = !!(benchB && benchB.notActivated); + const bTimedOut = !!(benchB && benchB.timedOut); + const bOverfitting = benchB && benchB.overfitting ? benchB.overfitting : null; + if (aNotActivated || bNotActivated) legendFlags.notActivated = true; + if (aTimedOut || bTimedOut) legendFlags.timedOut = true; + if (aOverfitting || bOverfitting) { + if (aOverfitting === 'high' || bOverfitting === 'high') legendFlags.overfittingHigh = true; + else legendFlags.overfittingModerate = true; + } + const aIssues = (aNotActivated ? 1 : 0) + (aTimedOut ? 1 : 0) + (aOverfitting ? 1 : 0); + const bIssues = (bNotActivated ? 1 : 0) + (bTimedOut ? 1 : 0) + (bOverfitting ? 1 : 0); + if (aIssues > 1 || bIssues > 1) legendFlags.multiIssue = true; + return { + valueA: benchA ? benchA.value : null, + valueB: benchB ? benchB.value : null, + valueC: benchC ? benchC.value : null, + aNotActivated, aTimedOut, aOverfitting, + bNotActivated, bTimedOut, bOverfitting, + }; + }); + + const dataA = perEntryData.map(d => d.valueA); + const dataB = perEntryData.map(d => d.valueB); + const dataC = perEntryData.map(d => d.valueC); + + // Per-point styling for dataset A (Isolated) and B (Plugin) + const pointApA = perEntryData.map(d => getPointAppearance({ timedOut: d.aTimedOut, notActivated: d.aNotActivated, overfitting: d.aOverfitting }, colorA)); + const pointBgA = pointApA.map(a => a.color); + const pointBorderA = pointApA.map(a => a.color); + const pointRadiusA = pointApA.map(a => a.radius); + const pointStyleA = pointApA.map(a => a.style); + const pointBorderWidthA = pointApA.map(a => a.borderWidth); + const pointApB = perEntryData.map(d => getPointAppearance({ timedOut: d.bTimedOut, notActivated: d.bNotActivated, overfitting: d.bOverfitting }, colorB)); + const pointBgB = pointApB.map(a => a.color); + const pointBorderB = pointApB.map(a => a.color); + const pointRadiusB = pointApB.map(a => a.radius); + const pointStyleB = pointApB.map(a => a.style); + const pointBorderWidthB = pointApB.map(a => a.borderWidth); + + new Chart(canvas, { + type: 'line', + data: { + labels, + datasets: [ + { + label: labelA, + data: dataA, + borderColor: colorA, + backgroundColor: colorA + '20', + borderWidth: 2, + pointBackgroundColor: pointBgA, + pointBorderColor: pointBorderA, + pointRadius: pointRadiusA, + pointBorderWidth: pointBorderWidthA, + pointStyle: pointStyleA, + pointHoverRadius: 8, + tension: 0.3, + fill: false + }, + { + label: labelB, + data: dataB, + borderColor: colorB, + backgroundColor: colorB + '20', + borderWidth: 2, + pointBackgroundColor: pointBgB, + pointBorderColor: pointBorderB, + pointRadius: pointRadiusB, + pointBorderWidth: pointBorderWidthB, + pointStyle: pointStyleB, + pointHoverRadius: 8, + tension: 0.3, + fill: false + }, + { + label: labelC, + data: dataC, + borderColor: colorC, + backgroundColor: colorC + '20', + borderWidth: 2, + pointRadius: 4, + pointHoverRadius: 6, + tension: 0.3, + borderDash: [5, 5], + fill: false + } + ] + }, + options: { + responsive: true, + interaction: { mode: 'index', intersect: false }, + plugins: { + legend: { labels: { color: '#8b949e', font: { size: 11 }, usePointStyle: true } }, + tooltip: { + callbacks: { + afterTitle: (items) => { + const idx = items[0].dataIndex; + const entry = entries[idx]; + const parts = []; + if (entry && entry.model) parts.push(`Model: ${entry.model}`); + if (entry && entry.commit) { + const msg = entry.commit.message.split('\n')[0]; + parts.push(msg.length > 60 ? msg.substring(0, 60) + '...' : msg); + } + parts.push(...buildIssueTooltipLines(entry, b => b.name === nameA || b.name === nameB || b.name === nameC)); + return parts.join('\n'); + } + } + } + }, + scales: { + x: { ticks: { color: '#8b949e' }, grid: { color: '#30363d' } }, + y: { + ticks: { color: '#8b949e' }, + grid: { color: '#30363d' }, + suggestedMin: 0, + suggestedMax: 10 + } + } + } + }); + + appendLegendNotes(div, legendFlags); + } + // Helper: create a paired line chart function createPairedChart(container, title, entries, nameA, nameB, labelA, labelB, colorA, colorB) { const div = document.createElement('div'); diff --git a/eng/dashboard/generate-benchmark-data.ps1 b/eng/dashboard/generate-benchmark-data.ps1 index c227467e8f..16e9d564e5 100644 --- a/eng/dashboard/generate-benchmark-data.ps1 +++ b/eng/dashboard/generate-benchmark-data.ps1 @@ -128,15 +128,15 @@ foreach ($verdict in $results.verdicts) { # roll-up across all scenarios and must NOT be used here — each datapoint # should reflect only its own scenario's activation result). $notActivated = $false - if ($scenario.skillActivation -and -not $scenario.skillActivation.activated) { - # Only flag as not-activated if activation was expected (expect_activation defaults to true) - $expectActivation = $true - if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) { - $expectActivation = $false - } - if ($expectActivation) { - $notActivated = $true - } + # Determine whether activation is expected (defaults to true) + $expectActivation = $true + if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) { + $expectActivation = $false + } + # Support both old (skillActivation) and new (skillActivationIsolated) JSON schemas + $sa = if ($scenario.PSObject.Properties['skillActivationIsolated']) { $scenario.skillActivationIsolated } else { $scenario.skillActivation } + if ($sa -and -not $sa.activated -and $expectActivation) { + $notActivated = $true } # Check per-scenario timeout state @@ -184,12 +184,18 @@ foreach ($verdict in $results.verdicts) { } } + # Support both old (withSkill) and new (skilledIsolated) JSON schemas + $skilled = if ($scenario.PSObject.Properties['skilledIsolated']) { $scenario.skilledIsolated } else { $scenario.withSkill } + + # Plugin run (may not exist for older results or utility methods) + $plugin = if ($scenario.PSObject.Properties['skilledPlugin']) { $scenario.skilledPlugin } else { $null } + # Quality scores (from judge results, scale 0-5 mapped to 0-10 for dashboard) - if ($null -ne $scenario.withSkill.judgeResult.overallScore) { + if ($null -ne $skilled.judgeResult.overallScore) { $benchEntry = @{ name = "$testName - Skilled Quality" unit = "Score (0-10)" - value = [float]$scenario.withSkill.judgeResult.overallScore * 2 + value = [float]$skilled.judgeResult.overallScore * 2 } if ($notActivated) { $benchEntry.notActivated = $true @@ -203,6 +209,26 @@ foreach ($verdict in $results.verdicts) { } $qualityBenches.Add($benchEntry) } + if ($null -ne $plugin -and $null -ne $plugin.judgeResult.overallScore) { + $pluginBenchEntry = @{ + name = "$testName - Plugin Quality" + unit = "Score (0-10)" + value = [float]$plugin.judgeResult.overallScore * 2 + } + # Plugin activation check + $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null } + if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) { + $pluginBenchEntry.notActivated = $true + } + if ($scenarioTimedOut) { + $pluginBenchEntry.timedOut = $true + } + if ($overfittingSeverity) { + $pluginBenchEntry.overfitting = $overfittingSeverity + $pluginBenchEntry.overfittingScore = $overfittingScore + } + $qualityBenches.Add($pluginBenchEntry) + } if ($null -ne $scenario.baseline.judgeResult.overallScore) { $qualityBenches.Add(@{ name = "$testName - Vanilla Quality" @@ -211,12 +237,12 @@ foreach ($verdict in $results.verdicts) { }) } - # Efficiency metrics (from with-skill run) - if ($null -ne $scenario.withSkill.metrics.wallTimeMs) { + # Efficiency metrics (from with-skill isolated run) + if ($null -ne $skilled.metrics.wallTimeMs) { $effBenchEntry = @{ name = "$testName - Skilled Time" unit = "seconds" - value = [math]::Round([float]$scenario.withSkill.metrics.wallTimeMs / 1000, 1) + value = [math]::Round([float]$skilled.metrics.wallTimeMs / 1000, 1) } if ($notActivated) { $effBenchEntry.notActivated = $true @@ -230,11 +256,11 @@ foreach ($verdict in $results.verdicts) { } $efficiencyBenches.Add($effBenchEntry) } - if ($null -ne $scenario.withSkill.metrics.tokenEstimate) { + if ($null -ne $skilled.metrics.tokenEstimate) { $tokenBenchEntry = @{ name = "$testName - Skilled Tokens In" unit = "tokens" - value = [float]$scenario.withSkill.metrics.tokenEstimate + value = [float]$skilled.metrics.tokenEstimate } if ($notActivated) { $tokenBenchEntry.notActivated = $true @@ -248,6 +274,50 @@ foreach ($verdict in $results.verdicts) { } $efficiencyBenches.Add($tokenBenchEntry) } + + # Efficiency metrics (from plugin run, if exists) + # Compute plugin-specific notActivated signal for efficiency benches + $pluginNotActivated = $false + $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null } + if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) { + $pluginNotActivated = $true + } + if ($null -ne $plugin -and $null -ne $plugin.metrics.wallTimeMs) { + $pluginTimeBench = @{ + name = "$testName - Plugin Time" + unit = "seconds" + value = [math]::Round([float]$plugin.metrics.wallTimeMs / 1000, 1) + } + if ($pluginNotActivated) { + $pluginTimeBench.notActivated = $true + } + if ($scenarioTimedOut) { + $pluginTimeBench.timedOut = $true + } + if ($overfittingSeverity) { + $pluginTimeBench.overfitting = $overfittingSeverity + $pluginTimeBench.overfittingScore = $overfittingScore + } + $efficiencyBenches.Add($pluginTimeBench) + } + if ($null -ne $plugin -and $null -ne $plugin.metrics.tokenEstimate) { + $pluginTokenBench = @{ + name = "$testName - Plugin Tokens In" + unit = "tokens" + value = [float]$plugin.metrics.tokenEstimate + } + if ($pluginNotActivated) { + $pluginTokenBench.notActivated = $true + } + if ($scenarioTimedOut) { + $pluginTokenBench.timedOut = $true + } + if ($overfittingSeverity) { + $pluginTokenBench.overfitting = $overfittingSeverity + $pluginTokenBench.overfittingScore = $overfittingScore + } + $efficiencyBenches.Add($pluginTokenBench) + } } } diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 7220b3805d..2e18b23a7b 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -188,6 +188,22 @@ public static async Task Run(ValidatorConfig config) Console.WriteLine($"Noise test enabled: discovered {noiseSkills.Count} noise skill(s) from {config.NoiseSkillsDir}"); } + // Group skills by their plugin — standalone skills are errors + var (_, pluginErrors) = SkillDiscovery.GroupSkillsByPlugin(allSkills); + foreach (var error in pluginErrors) + Console.Error.WriteLine($"\x1b[31m❌ {error}\x1b[0m"); + if (pluginErrors.Count > 0) + { + if (allSkills.Count == pluginErrors.Count) + { + Console.Error.WriteLine("\x1b[31mAll skills are standalone (no valid plugin.json found) — nothing to evaluate.\x1b[0m"); + return 1; + } + // Filter out standalone skills — they would silently run without a + // real plugin context, producing misleading "plugin" results. + allSkills = allSkills.Where(s => SkillDiscovery.FindPluginRoot(s.Path) is not null).ToList(); + } + // Check per-plugin aggregate description size var aggregateFailures = CheckAggregateDescriptionLimits(allSkills); if (aggregateFailures.Count > 0) @@ -310,7 +326,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, Console.Error.WriteLine(); } - await AgentRunner.StopSharedClient(); + await AgentRunner.StopAllClients(); await AgentRunner.CleanupWorkDirs(); // Always fail on execution errors, even in --verdict-warn-only mode @@ -489,11 +505,10 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList c.SkillActivation is { Activated: false }).ToList(); - // Separate unexpected non-activations (expect_activation defaulting to true) - // from expected ones (negative tests with expect_activation: false). - var unexpectedNotActivated = notActivated.Where(c => c.ExpectActivation).ToList(); - var expectedNotActivated = notActivated.Where(c => !c.ExpectActivation).ToList(); + var notActivatedIsolated = comparisons.Where(c => c.SkillActivationIsolated is { Activated: false } && c.ExpectActivation).ToList(); + var notActivatedPlugin = comparisons.Where(c => c.SkillActivationPlugin is { Activated: false } && c.ExpectActivation).ToList(); + var expectedNotActivated = comparisons.Where(c => + (c.SkillActivationIsolated is { Activated: false } || c.SkillActivationPlugin is { Activated: false }) && !c.ExpectActivation).ToList(); if (expectedNotActivated.Count > 0) { @@ -501,14 +516,23 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList 0) + if (notActivatedIsolated.Count > 0) { - var names = string.Join(", ", unexpectedNotActivated.Select(c => c.ScenarioName)); - log($"\x1b[33m\u26a0\ufe0f Skill was NOT activated in scenario(s): {names}\x1b[0m"); + var names = string.Join(", ", notActivatedIsolated.Select(c => c.ScenarioName)); + log($"\x1b[33m⚠️ Skill NOT activated (isolated) in: {names}\x1b[0m"); verdict.SkillNotActivated = true; verdict.Passed = false; verdict.FailureKind = "skill_not_activated"; - verdict.Reason += $" [SKILL NOT ACTIVATED in {unexpectedNotActivated.Count} scenario(s): {names}]"; + verdict.Reason += $" [NOT ACTIVATED (isolated) in {notActivatedIsolated.Count} scenario(s)]"; + } + if (notActivatedPlugin.Count > 0) + { + var names = string.Join(", ", notActivatedPlugin.Select(c => c.ScenarioName)); + log($"\x1b[33m⚠️ Skill NOT activated (plugin) in: {names}\x1b[0m"); + verdict.SkillNotActivated = true; + verdict.Passed = false; + verdict.FailureKind = "skill_not_activated"; + verdict.Reason += $" [NOT ACTIVATED (plugin) in {notActivatedPlugin.Count} scenario(s)]"; } var timedOutScenarios = comparisons.Where(c => c.TimedOut).ToList(); @@ -544,34 +568,97 @@ private static async Task ExecuteScenario( scenarioLog($"✓ All {config.Runs} run(s) complete"); var baselineRuns = runResults.Select(r => r.Baseline).ToList(); - var withSkillRuns = runResults.Select(r => r.WithSkill).ToList(); + var isolatedRuns = runResults.Select(r => r.SkilledIsolated).ToList(); + var pluginRuns = runResults.Select(r => r.SkilledPlugin).ToList(); var perRunPairwise = runResults.Select(r => r.Pairwise).ToList(); - var perRunScores = new List(); + // Per-run improvement scores — effective score is min(isolated, plugin) + // Pairwise result is generated against the worse-scoring run (isolated + // or plugin). Apply it only to that matching comparison so it does not + // skew the other one. + var perRunIsolatedScores = new List(); + var perRunPluginScores = new List(); for (int i = 0; i < baselineRuns.Count; i++) { - var runComparison = Comparator.CompareScenario(scenario.Name, baselineRuns[i], withSkillRuns[i], perRunPairwise[i]); - perRunScores.Add(runComparison.ImprovementScore); + var pw = perRunPairwise[i]; + bool pairwiseFromPlugin = runResults[i].PairwiseFromPlugin; + var isoComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], isolatedRuns[i], + pairwiseFromPlugin ? null : pw); + var plgComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], pluginRuns[i], + pairwiseFromPlugin ? pw : null); + perRunIsolatedScores.Add(isoComp.ImprovementScore); + perRunPluginScores.Add(plgComp.ImprovementScore); } - var avgBaseline = AverageResults(baselineRuns); - var avgWithSkill = AverageResults(withSkillRuns); - var bestPairwise = perRunPairwise.FirstOrDefault(pw => pw?.PositionSwapConsistent == true) - ?? perRunPairwise.FirstOrDefault(); + var perRunScores = perRunIsolatedScores + .Zip(perRunPluginScores, (iso, plg) => Math.Min(iso, plg)) + .ToList(); - var comparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgWithSkill, bestPairwise); + var avgBaseline = AverageResults(baselineRuns); + var avgIsolated = AverageResults(isolatedRuns); + var avgPlugin = AverageResults(pluginRuns); + // Select the best pairwise result and track which run it came from + int bestPairwiseIdx = -1; + for (int i = 0; i < perRunPairwise.Count; i++) + { + if (perRunPairwise[i]?.PositionSwapConsistent == true) { bestPairwiseIdx = i; break; } + } + if (bestPairwiseIdx < 0) + { + for (int i = 0; i < perRunPairwise.Count; i++) + { + if (perRunPairwise[i] is not null) { bestPairwiseIdx = i; break; } + } + } + var bestPairwise = bestPairwiseIdx >= 0 ? perRunPairwise[bestPairwiseIdx] : null; + + // Two comparisons — apply pairwise only to the matching one, + // using the source run's flag (not any-run) to avoid misattribution. + bool aggPairwiseFromPlugin = bestPairwiseIdx >= 0 && runResults[bestPairwiseIdx].PairwiseFromPlugin; + var isoComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgIsolated, + aggPairwiseFromPlugin ? null : bestPairwise); + var plgComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgPlugin, + aggPairwiseFromPlugin ? bestPairwise : null); + + // Build the combined ScenarioComparison + var comparison = new ScenarioComparison + { + ScenarioName = scenario.Name, + Baseline = avgBaseline, + SkilledIsolated = avgIsolated, + SkilledPlugin = avgPlugin, + ImprovementScore = Math.Min(isoComparison.ImprovementScore, plgComparison.ImprovementScore), + IsolatedImprovementScore = isoComparison.ImprovementScore, + PluginImprovementScore = plgComparison.ImprovementScore, + Breakdown = isoComparison.ImprovementScore <= plgComparison.ImprovementScore + ? isoComparison.Breakdown : plgComparison.Breakdown, + IsolatedBreakdown = isoComparison.Breakdown, + PluginBreakdown = plgComparison.Breakdown, + PairwiseResult = bestPairwise, + }; comparison.PerRunScores = perRunScores; - // Aggregate skill activation info - var allActivations = runResults.Select(r => r.SkillActivation).ToList(); - comparison.SkillActivation = new SkillActivationInfo( - Activated: allActivations.Any(a => a.Activated), - DetectedSkills: allActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(), - ExtraTools: allActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(), - SkillEventCount: allActivations.Sum(a => a.SkillEventCount)); + // Aggregate skill activation — BOTH skilled runs independently + var allIsoActivations = runResults.Select(r => r.SkillActivationIsolated).ToList(); + var allPlgActivations = runResults.Select(r => r.SkillActivationPlugin).ToList(); + + comparison.SkillActivationIsolated = new SkillActivationInfo( + Activated: allIsoActivations.Any(a => a.Activated), + DetectedSkills: allIsoActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(), + ExtraTools: allIsoActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(), + SkillEventCount: allIsoActivations.Sum(a => a.SkillEventCount)); + + comparison.SkillActivationPlugin = new SkillActivationInfo( + Activated: allPlgActivations.Any(a => a.Activated), + DetectedSkills: allPlgActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(), + ExtraTools: allPlgActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(), + SkillEventCount: allPlgActivations.Sum(a => a.SkillEventCount)); // Propagate timeout info from any run - comparison.TimedOut = runResults.Any(r => r.WithSkill.Metrics.TimedOut || r.Baseline.Metrics.TimedOut); + comparison.TimedOut = runResults.Any(r => + r.Baseline.Metrics.TimedOut || + r.SkilledIsolated.Metrics.TimedOut || + r.SkilledPlugin.Metrics.TimedOut); // Propagate expect_activation from scenario config comparison.ExpectActivation = scenario.ExpectActivation; @@ -581,9 +668,12 @@ private static async Task ExecuteScenario( private sealed record RunExecutionResult( RunResult Baseline, - RunResult WithSkill, + RunResult SkilledIsolated, + RunResult SkilledPlugin, PairwiseJudgeResult? Pairwise, - SkillActivationInfo SkillActivation); + bool PairwiseFromPlugin, + SkillActivationInfo SkillActivationIsolated, + SkillActivationInfo SkillActivationPlugin); private static async Task ExecuteRun( int runIndex, @@ -602,81 +692,83 @@ private static async Task ExecuteRun( if (config.Verbose) runLog("running agents..."); + var pluginRoot = SkillDiscovery.FindPluginRoot(skill.Path); + var agentTasks = await Task.WhenAll( - AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)), - AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog))); + // 1. Baseline: no plugin, no skills — vanilla agent + AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog)), + // 2. Skilled-isolated: single skill only (current behavior) + AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog)), + // 3. Skilled-plugin: entire plugin loaded via --plugin-dir + AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, + PluginRoot: pluginRoot, Log: runLog))); var baselineMetrics = agentTasks[0]; - var withSkillMetrics = agentTasks[1]; + var isolatedMetrics = agentTasks[1]; + var pluginMetrics = agentTasks[2]; - // Evaluate assertions + // Evaluate assertions on all three runs if (scenario.Assertions is { Count: > 0 }) { baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir); - withSkillMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, withSkillMetrics.AgentOutput, withSkillMetrics.WorkDir); + isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir); + pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir); } - // Evaluate constraints + // Evaluate constraints on all three runs var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics); - var withSkillConstraints = AssertionEvaluator.EvaluateConstraints(scenario, withSkillMetrics); + var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics); + var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics); baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints]; - withSkillMetrics.AssertionResults = [..withSkillMetrics.AssertionResults, ..withSkillConstraints]; + isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints]; + pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints]; - // Task completion + // Task completion for all three if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0) { baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed); - withSkillMetrics.TaskCompleted = withSkillMetrics.AssertionResults.All(a => a.Passed); + isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed); + pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed); } else { baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0; - withSkillMetrics.TaskCompleted = withSkillMetrics.ErrorCount == 0; + isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0; + pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0; } - // Judge — failures are non-fatal so a single timeout doesn't kill the whole evaluation. - // Await each judge independently so a failure in one doesn't discard the other's result. + // Judge all three runs independently (failures are non-fatal) var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path); var baselineJudgeTask = Services.Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog); - var withSkillJudgeTask = Services.Judge.JudgeRun( - scenario, withSkillMetrics, judgeOpts with { WorkDir = withSkillMetrics.WorkDir }, runLog); - - JudgeResult baselineJudge; - try - { - baselineJudge = await baselineJudgeTask; - } - catch (Exception error) - { - var shortMsg = SanitizeErrorMessage(error.Message); - runLog($"\x1b[33m⚠️ Judge (baseline) failed, using fallback scores: {shortMsg}\x1b[0m"); - baselineJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}"); - } + var isolatedJudgeTask = Services.Judge.JudgeRun( + scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog); + var pluginJudgeTask = Services.Judge.JudgeRun( + scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog); - JudgeResult withSkillJudge; - try - { - withSkillJudge = await withSkillJudgeTask; - } - catch (Exception error) - { - var shortMsg = SanitizeErrorMessage(error.Message); - runLog($"\x1b[33m⚠️ Judge (with skill) failed, using fallback scores: {shortMsg}\x1b[0m"); - withSkillJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}"); - } + var baselineJudge = await SafeJudge(baselineJudgeTask, "baseline", runLog); + var isolatedJudge = await SafeJudge(isolatedJudgeTask, "isolated", runLog); + var pluginJudge = await SafeJudge(pluginJudgeTask, "plugin", runLog); - var baseline = new RunResult(baselineMetrics, baselineJudge); - var withSkillResult = new RunResult(withSkillMetrics, withSkillJudge); + var baselineResult = new RunResult(baselineMetrics, baselineJudge); + var isolatedResult = new RunResult(isolatedMetrics, isolatedJudge); + var pluginResult = new RunResult(pluginMetrics, pluginJudge); - // Pairwise judging + // Pairwise judging — compare baseline vs worse-scoring skilled run + // Track which run the pairwise result corresponds to. PairwiseJudgeResult? pairwise = null; + bool pairwiseFromPlugin = false; if (usePairwise) { + pairwiseFromPlugin = pluginJudge.OverallScore < isolatedJudge.OverallScore; + var worseSkilled = pairwiseFromPlugin + ? pluginMetrics : isolatedMetrics; try { pairwise = await Services.PairwiseJudge.Judge( - scenario, baselineMetrics, withSkillMetrics, - new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, withSkillMetrics.WorkDir), + scenario, baselineMetrics, worseSkilled, + new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir), runLog); } catch (Exception error) @@ -685,25 +777,40 @@ private static async Task ExecuteRun( } } - // Skill activation - var skillActivation = MetricsCollector.ExtractSkillActivation(withSkillMetrics.Events, baselineMetrics.ToolCallBreakdown); + // Skill activation — check both skilled runs independently + // Pass the target skill name so that in plugin runs only the skill under + // test counts towards activation (prevents sibling-skill false positives). + var isolatedActivation = MetricsCollector.ExtractSkillActivation( + isolatedMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name); + var pluginActivation = MetricsCollector.ExtractSkillActivation( + pluginMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name); + + runLog(isolatedActivation.Activated + ? $"🔌 Skill activated (isolated): skills={string.Join(", ", isolatedActivation.DetectedSkills)}" + : "⚠️ Skill NOT activated (isolated)"); + runLog(pluginActivation.Activated + ? $"🔌 Skill activated (plugin): skills={string.Join(", ", pluginActivation.DetectedSkills)}" + : "⚠️ Skill NOT activated (plugin)"); + + if (config.Verbose) + runLog("✓ complete"); + + return new RunExecutionResult(baselineResult, isolatedResult, pluginResult, pairwise, + pairwiseFromPlugin, isolatedActivation, pluginActivation); + } - if (skillActivation.Activated) + private static async Task SafeJudge(Task task, string label, Action runLog) + { + try { - var parts = new List(); - if (skillActivation.DetectedSkills.Count > 0) parts.Add($"skills: {string.Join(", ", skillActivation.DetectedSkills)}"); - if (skillActivation.ExtraTools.Count > 0) parts.Add($"extra tools: {string.Join(", ", skillActivation.ExtraTools)}"); - runLog($"🔌 Skill activated ({string.Join("; ", parts)})"); + return await task; } - else + catch (Exception error) { - runLog("\x1b[33m⚠️ Skill was NOT activated during this run\x1b[0m"); + var shortMsg = SanitizeErrorMessage(error.Message); + runLog($"\x1b[33m⚠️ Judge ({label}) failed, using fallback scores: {shortMsg}\x1b[0m"); + return new JudgeResult([], 3, $"Judge failed: {shortMsg}"); } - - if (config.Verbose) - runLog("✓ complete"); - - return new RunExecutionResult(baseline, withSkillResult, pairwise, skillActivation); } // --- Noise-only evaluation: skill-only vs all-skills (no pure-agent baseline) --- @@ -807,11 +914,11 @@ private static async Task ExecuteNoiseTest( { // Run with target skill only var skillOnlyMetrics = await AgentRunner.RunAgent(new RunOptions( - scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog)); + scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog)); // Run with all skills loaded var allSkillsMetrics = await AgentRunner.RunAgent(new RunOptions( - scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog, AdditionalSkills: otherSkills)); + scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, AdditionalSkills: otherSkills)); // Evaluate assertions on both if (scenario.Assertions is { Count: > 0 }) diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 2374b7f60d..83e0223793 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -230,15 +230,29 @@ public sealed class ScenarioComparison { public required string ScenarioName { get; init; } public required RunResult Baseline { get; init; } - public required RunResult WithSkill { get; init; } + public RunResult SkilledIsolated { get; init; } = null!; + public RunResult? SkilledPlugin { get; init; } public required double ImprovementScore { get; init; } + public double IsolatedImprovementScore { get; init; } + public double PluginImprovementScore { get; init; } public required MetricBreakdown Breakdown { get; init; } + public MetricBreakdown? IsolatedBreakdown { get; init; } + public MetricBreakdown? PluginBreakdown { get; init; } public PairwiseJudgeResult? PairwiseResult { get; init; } public IReadOnlyList? PerRunScores { get; set; } - public SkillActivationInfo? SkillActivation { get; set; } + public SkillActivationInfo? SkillActivationIsolated { get; set; } + public SkillActivationInfo? SkillActivationPlugin { get; set; } public bool TimedOut { get; set; } /// When false, non-activation is expected (negative test) and should not flag the verdict. public bool ExpectActivation { get; set; } = true; + + // Backward-compatible aliases for JSON deserialization of older results files. + // These must be settable (init) so System.Text.Json can populate them during + // deserialization of legacy JSON that uses the old property names. + [JsonPropertyName("withSkill")] + public RunResult WithSkill { get => SkilledIsolated; init => SkilledIsolated = value; } + [JsonPropertyName("skillActivation")] + public SkillActivationInfo? SkillActivation { get => SkillActivationIsolated; init => SkillActivationIsolated = value; } } // --- Verdict --- @@ -253,6 +267,8 @@ public sealed class SkillVerdict public double? NormalizedGain { get; init; } public ConfidenceInterval? ConfidenceInterval { get; init; } public bool? IsSignificant { get; init; } + public double? IsolatedScore { get; set; } + public double? PluginScore { get; set; } public required string Reason { get; set; } /// Categorizes why the verdict failed, if it did. public string? FailureKind { get; set; } diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index aef38084d3..058b17b717 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -14,46 +14,66 @@ public sealed record RunOptions( string? EvalPath, string Model, bool Verbose, + string? PluginRoot = null, Action? Log = null, IReadOnlyList? AdditionalSkills = null); public static class AgentRunner { - private static CopilotClient? _sharedClient; + private static readonly ConcurrentDictionary _pluginClients = new(StringComparer.OrdinalIgnoreCase); private static readonly SemaphoreSlim _clientLock = new(1, 1); private static readonly ConcurrentBag _workDirs = []; + private static string? _capturedGitHubToken; + private static bool _tokenCaptured; /// - /// Returns the shared , creating it on first call. - /// Must be called before executing any untrusted workloads (eval scenarios, - /// setup commands). + /// Capture GITHUB_TOKEN once at startup so multiple clients can share it + /// and the env var is cleared from child processes. /// - public static async Task GetSharedClient(bool verbose) + public static void CaptureGitHubToken() { - if (_sharedClient is not null) return _sharedClient; + if (_tokenCaptured) return; + _capturedGitHubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); + if (!string.IsNullOrEmpty(_capturedGitHubToken)) + Environment.SetEnvironmentVariable("GITHUB_TOKEN", null); + _tokenCaptured = true; + } + + /// + /// Returns a CopilotClient configured for the given plugin. + /// The client is created once per plugin root and reused. + /// Note: --plugin-dir is NOT honored by the SDK, so all clients share + /// the same configuration. Plugin skills are loaded manually via + /// SkillDirectories in BuildSessionConfig instead. + /// + public static async Task GetPluginClient( + string? pluginRoot, bool verbose) + { + var key = pluginRoot ?? ""; + + if (_pluginClients.TryGetValue(key, out var existing)) + return existing; await _clientLock.WaitAsync(); try { - if (_sharedClient is not null) return _sharedClient; + if (_pluginClients.TryGetValue(key, out existing)) + return existing; + + CaptureGitHubToken(); var options = new CopilotClientOptions { LogLevel = verbose ? "info" : "none", }; - var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); - if (!string.IsNullOrEmpty(githubToken)) - { - options.GitHubToken = githubToken; - // Clear the token from the environment so child processes - // (e.g. LLM-generated code, eval shell commands) cannot read it. - Environment.SetEnvironmentVariable("GITHUB_TOKEN", null); - } + if (!string.IsNullOrEmpty(_capturedGitHubToken)) + options.GitHubToken = _capturedGitHubToken; - _sharedClient = new CopilotClient(options); - await _sharedClient.StartAsync(); - return _sharedClient; + var client = new CopilotClient(options); + await client.StartAsync(); + _pluginClients[key] = client; + return client; } finally { @@ -61,15 +81,27 @@ public static async Task GetSharedClient(bool verbose) } } - public static async Task StopSharedClient() + /// + /// Backward-compatible alias — returns the no-plugin client. + /// Used by judge sessions that don't need plugin loading. + /// + public static Task GetSharedClient(bool verbose) + => GetPluginClient(null, verbose); + + /// Stop all plugin clients (including the no-plugin client). + public static async Task StopAllClients() { - if (_sharedClient is not null) + foreach (var (_, client) in _pluginClients) { - await _sharedClient.StopAsync(); - _sharedClient = null; + try { await client.StopAsync(); } + catch { /* best effort */ } } + _pluginClients.Clear(); } + /// Backward-compatible alias. + public static Task StopSharedClient() => StopAllClients(); + /// Remove all temporary working directories created during runs. public static Task CleanupWorkDirs() { @@ -82,7 +114,7 @@ public static Task CleanupWorkDirs() })); } - public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action? log, string? runLabel = null, IReadOnlyList? additionalAllowedDirs = null) + public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action? log, string? runLabel = null, string? pluginRoot = null, IReadOnlyList? additionalAllowedDirs = null) { string? reqPath = null; if (request.ExtensionData is { } data) @@ -125,6 +157,14 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st allowedDirs.Add(skillsPathAbsolute); } + if (pluginRoot is not null) + { + string pluginRootAbsolute = Path.GetFullPath(pluginRoot); + if (!Path.EndsInDirectorySeparator(pluginRootAbsolute)) + pluginRootAbsolute = pluginRootAbsolute + Path.DirectorySeparatorChar; + + allowedDirs.Add(pluginRootAbsolute); + } if (additionalAllowedDirs is not null) { @@ -161,6 +201,7 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st internal static SessionConfig BuildSessionConfig( SkillInfo? skill, + string? pluginRoot, string model, string workDir, IReadOnlyDictionary? mcpServers = null, @@ -179,12 +220,11 @@ internal static SessionConfig BuildSessionConfig( if (verbose) log?.Invoke($" 📂 Config dir: {configDir} ({(skill is not null ? "skilled" : "baseline")})"); - // Build skill directories list: primary skill + any additional skills. + // Build additional noise skill directories when noise testing is active. // For additional skills we stage a temp directory with copies of each // skill's SKILL.md so the SDK discovers exactly those skills — not // every sibling that happens to share the same parent directory. - var skillDirs = new List(); - if (skillPath is not null) skillDirs.Add(skillPath); + var noiseDirs = new List(); if (additionalSkills is { Count: > 0 }) { var stageDir = Path.Combine(Path.GetTempPath(), $"sv-noise-{Guid.NewGuid():N}"); @@ -202,10 +242,11 @@ internal static SessionConfig BuildSessionConfig( File.Copy(skillMdPath, Path.Combine(stagedSkillDir, "SKILL.md")); } - skillDirs.Add(stageDir); + noiseDirs.Add(stageDir); } // Convert MCPServerDef records to the SDK's Dictionary shape + // Security hardening: validate commands, sanitize args/env, drop custom cwd. Dictionary? sdkMcp = null; if (mcpServers is { Count: > 0 }) { @@ -247,19 +288,41 @@ internal static SessionConfig BuildSessionConfig( if (sdkMcp.Count == 0) sdkMcp = null; } + // Three run types: + // 1. Baseline (skill == null, pluginRoot == null): no skills, no MCP. + // 2. Skilled-isolated (skill != null, pluginRoot == null): single skill via SkillDirectories. + // 3. Skilled-plugin (skill != null, pluginRoot != null): entire plugin loaded manually + // via SkillDirectories (--plugin-dir is NOT honored by SDK). + // + // For skilled-plugin, we enumerate all skill directories from plugin.json + // so that all sibling skills are loaded, matching production behavior. + string[] skillDirs; + if (pluginRoot is not null) + { + skillDirs = ResolvePluginSkillDirectories(pluginRoot); + } + else if (skill is not null) + { + skillDirs = [skillPath!]; + } + else + { + skillDirs = []; + } + return new SessionConfig { Model = model, Streaming = true, WorkingDirectory = workDir, - SkillDirectories = skillDirs, + SkillDirectories = [..skillDirs, ..noiseDirs], ConfigDir = configDir, McpServers = sdkMcp, InfiniteSessions = new InfiniteSessionConfig { Enabled = false }, OnPermissionRequest = (request, _) => { var runLabel = skill is not null ? "skilled" : "baseline"; - var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel); + var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel, pluginRoot); return Task.FromResult(new PermissionRequestResult { Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules, @@ -268,11 +331,44 @@ internal static SessionConfig BuildSessionConfig( }; } + /// + /// Resolves the skill directories for a plugin by reading its plugin.json + /// and returning the resolved skills path. The SDK scans this directory + /// for subdirectories containing SKILL.md files. + /// + internal static string[] ResolvePluginSkillDirectories(string pluginRoot) + { + var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json"); + PluginInfo? pluginInfo; + try + { + pluginInfo = PluginValidator.ParsePluginJson(pluginJsonPath); + } + catch (JsonException) + { + // Malformed plugin.json — return empty so the session is created + // without extra skill directories; validation surfaces the real error. + return []; + } + if (pluginInfo?.SkillsPath is null) return []; + + if (!PluginValidator.TryGetSafeSubdirectory( + pluginRoot, pluginInfo.SkillsPath, out var skillsDir, out _)) + return []; + + if (!Directory.Exists(skillsDir!)) return []; + + return [skillsDir!]; + } + public static async Task RunAgent(RunOptions options) { + var runType = options.Skill is null ? "baseline" + : options.PluginRoot is not null ? "skilled-plugin" + : "skilled-isolated"; return await RetryHelper.ExecuteWithRetry( async ct => await RunAgentCore(options, ct), - label: $"RunAgent({options.Scenario.Name}, {(options.Skill is not null ? "skilled" : "baseline")})", + label: $"RunAgent({options.Scenario.Name}, {runType})", maxRetries: 2, baseDelayMs: 5_000, totalTimeoutMs: (options.Scenario.Timeout + 60) * 1000); @@ -294,10 +390,12 @@ private static async Task RunAgentCore(RunOptions options, Cancellat try { + // All runs use the shared client — plugin skills are loaded manually + // via SkillDirectories (--plugin-dir is not honored by SDK). var client = await GetSharedClient(options.Verbose); await using var session = await client.CreateSessionAsync( - BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose)); + BuildSessionConfig(options.Skill, options.PluginRoot, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose)); var done = new TaskCompletionSource(); var effectiveTimeout = options.Scenario.Timeout; diff --git a/eng/skill-validator/src/Services/Comparator.cs b/eng/skill-validator/src/Services/Comparator.cs index fa1aaff775..70f6cb8fd1 100644 --- a/eng/skill-validator/src/Services/Comparator.cs +++ b/eng/skill-validator/src/Services/Comparator.cs @@ -54,9 +54,12 @@ public static ScenarioComparison CompareScenario( { ScenarioName = scenarioName, Baseline = baseline, - WithSkill = withSkill, + SkilledIsolated = withSkill, + SkilledPlugin = null, ImprovementScore = improvementScore, + IsolatedImprovementScore = improvementScore, Breakdown = breakdown, + IsolatedBreakdown = breakdown, PairwiseResult = pairwiseResult, }; } @@ -95,7 +98,8 @@ public static SkillVerdict ComputeVerdict( if (requireCompletion) { bool regressed = comparisons.Any(c => - c.Baseline.Metrics.TaskCompleted && !c.WithSkill.Metrics.TaskCompleted); + c.Baseline.Metrics.TaskCompleted && + (!c.SkilledIsolated.Metrics.TaskCompleted || (c.SkilledPlugin is not null && !c.SkilledPlugin.Metrics.TaskCompleted))); if (regressed) { return new SkillVerdict @@ -133,6 +137,8 @@ public static SkillVerdict ComputeVerdict( NormalizedGain = normalizedGain, ConfidenceInterval = ci, IsSignificant = significant, + IsolatedScore = comparisons.Average(c => c.IsolatedImprovementScore), + PluginScore = comparisons.Average(c => c.PluginImprovementScore), Reason = reason, FailureKind = passed ? null : "threshold", }; @@ -170,7 +176,15 @@ private static double ComputeNormalizedGain(IReadOnlyList co foreach (var c in comparisons) { double pre = (c.Baseline.JudgeResult.OverallScore - 1) / 4.0; - double post = (c.WithSkill.JudgeResult.OverallScore - 1) / 4.0; + // Select the effective run based on the worse (lower) improvement score, + // then use that run's overall score so this aligns with the effective + // comparison used for pass/fail and reporting. + double effectiveScore; + if (c.SkilledPlugin is not null && c.PluginImprovementScore < c.IsolatedImprovementScore) + effectiveScore = c.SkilledPlugin.JudgeResult.OverallScore; + else + effectiveScore = c.SkilledIsolated.JudgeResult.OverallScore; + double post = (effectiveScore - 1) / 4.0; if (pre >= 1) totalGain += post >= pre ? 0 : post - pre; diff --git a/eng/skill-validator/src/Services/MetricsCollector.cs b/eng/skill-validator/src/Services/MetricsCollector.cs index 3ebc0eb0e8..b0c6927806 100644 --- a/eng/skill-validator/src/Services/MetricsCollector.cs +++ b/eng/skill-validator/src/Services/MetricsCollector.cs @@ -7,10 +7,14 @@ public static class MetricsCollector { /// /// Analyse events from a "with-skill" run to determine whether the skill was activated. + /// When is set, only counts as "Activated" if the + /// target skill was specifically detected — prevents false positives in plugin runs + /// where sibling skills may fire instead of the skill under test. /// public static SkillActivationInfo ExtractSkillActivation( IReadOnlyList skilledEvents, - Dictionary baselineToolBreakdown) + Dictionary baselineToolBreakdown, + string? targetSkillName = null) { var detectedSkills = new List(); int skillEventCount = 0; @@ -44,8 +48,18 @@ public static SkillActivationInfo ExtractSkillActivation( .Where(tool => !baselineToolBreakdown.ContainsKey(tool)) .ToList(); + // When targetSkillName is set, activation is determined solely by whether + // the target skill was specifically detected (case-insensitive). ExtraTools + // and sibling skill names are still populated for diagnostic purposes but + // do NOT contribute to the Activated flag — we control the SDK and it always + // emits SkillInvokedEvent when a skill is loaded. + bool activated = targetSkillName is null + ? skillEventCount > 0 || extraTools.Count > 0 + : detectedSkills.Any(s => + s.Equals(targetSkillName, StringComparison.OrdinalIgnoreCase)); + return new SkillActivationInfo( - Activated: skillEventCount > 0 || extraTools.Count > 0, + Activated: activated, DetectedSkills: detectedSkills, ExtraTools: extraTools, SkillEventCount: skillEventCount); diff --git a/eng/skill-validator/src/Services/PairwiseJudge.cs b/eng/skill-validator/src/Services/PairwiseJudge.cs index 246822032b..4913412702 100644 --- a/eng/skill-validator/src/Services/PairwiseJudge.cs +++ b/eng/skill-validator/src/Services/PairwiseJudge.cs @@ -87,7 +87,7 @@ private static async Task JudgeCall( OnPermissionRequest = (request, _) => { var extraDirs = options.SkilledWorkDir is not null ? new[] { options.SkilledWorkDir } : null; - var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", extraDirs); + var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", additionalAllowedDirs: extraDirs); return Task.FromResult(new PermissionRequestResult { Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules, diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs index 47e40a5119..f8ab8f95db 100644 --- a/eng/skill-validator/src/Services/Reporter.cs +++ b/eng/skill-validator/src/Services/Reporter.cs @@ -170,7 +170,7 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver Console.WriteLine($"{summaryColor}{summaryText}\x1b[0m"); bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s => - s.Baseline.Metrics.TimedOut || s.WithSkill.Metrics.TimedOut)); + s.Baseline.Metrics.TimedOut || s.SkilledIsolated.Metrics.TimedOut || (s.SkilledPlugin?.Metrics.TimedOut == true))); if (anyTimeout) { Console.WriteLine(); @@ -186,70 +186,105 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo Console.WriteLine($" {icon} {scenario.ScenarioName} {FormatScore(scenario.ImprovementScore)}"); var b = scenario.Baseline.Metrics; - var s = scenario.WithSkill.Metrics; - var bd = scenario.Breakdown; + var s = scenario.SkilledIsolated.Metrics; + var p = scenario.SkilledPlugin?.Metrics; double bRubric = AvgRubricScore(scenario.Baseline.JudgeResult.RubricScores); - double sRubric = AvgRubricScore(scenario.WithSkill.JudgeResult.RubricScores); + double sRubric = AvgRubricScore(scenario.SkilledIsolated.JudgeResult.RubricScores); + double? pRubric = scenario.SkilledPlugin is { } sp ? AvgRubricScore(sp.JudgeResult.RubricScores) : null; - var metrics = new (string Label, double Value, string Absolute, bool LowerIsBetter)[] + // Build 3-column metric rows: baseline: X isolated: Y (delta%) plugin: Z (delta%) + var metricRows = new (string Label, string Baseline, string Isolated, string? Plugin)[] { - ("Tokens", bd.TokenReduction, $"{b.TokenEstimate} → {s.TokenEstimate}", true), - ("Tool calls", bd.ToolCallReduction, $"{b.ToolCallCount} → {s.ToolCallCount}", true), - ("Task completion", bd.TaskCompletionImprovement, $"{FmtBool(b.TaskCompleted)} → {FmtBool(s.TaskCompleted)}", false), - ("Time", bd.TimeReduction, $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")} → {FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}", true), - ("Quality (rubric)", bd.QualityImprovement, $"{bRubric:F1}/5 → {sRubric:F1}/5", false), - ("Quality (overall)", bd.OverallJudgmentImprovement, $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5 → {scenario.WithSkill.JudgeResult.OverallScore:F1}/5", false), - ("Errors", bd.ErrorReduction, $"{b.ErrorCount} → {s.ErrorCount}", true), + ("Tokens", + $"{b.TokenEstimate}", + FormatMetricWithDelta(s.TokenEstimate, b.TokenEstimate, true), + p is not null ? FormatMetricWithDelta(p.TokenEstimate, b.TokenEstimate, true) : null), + ("Tool calls", + $"{b.ToolCallCount}", + FormatMetricWithDelta(s.ToolCallCount, b.ToolCallCount, true), + p is not null ? FormatMetricWithDelta(p.ToolCallCount, b.ToolCallCount, true) : null), + ("Task completion", + FmtBool(b.TaskCompleted), + FmtBool(s.TaskCompleted), + p is not null ? FmtBool(p.TaskCompleted) : null), + ("Time", + $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")}", + $"{FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}{FormatPctDelta(s.WallTimeMs, b.WallTimeMs, true)}", + p is not null ? $"{FmtMs(p.WallTimeMs)}{(p.TimedOut ? " ⏰" : "")}{FormatPctDelta(p.WallTimeMs, b.WallTimeMs, true)}" : null), + ("Quality (rubric)", + $"{bRubric:F1}/5", + $"{sRubric:F1}/5{FormatPctDelta(sRubric, bRubric, false)}", + pRubric is not null ? $"{pRubric:F1}/5{FormatPctDelta(pRubric.Value, bRubric, false)}" : null), + ("Quality (overall)", + $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5", + $"{scenario.SkilledIsolated.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledIsolated.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}", + scenario.SkilledPlugin is not null ? $"{scenario.SkilledPlugin.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledPlugin.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}" : null), + ("Errors", + $"{b.ErrorCount}", + FormatMetricWithDelta(s.ErrorCount, b.ErrorCount, true), + p is not null ? FormatMetricWithDelta(p.ErrorCount, b.ErrorCount, true) : null), }; + bool hasPlugin = scenario.SkilledPlugin is not null; + // Show timeout warnings prominently before the metrics table - if (b.TimedOut || s.TimedOut) + if (b.TimedOut || s.TimedOut || p?.TimedOut == true) { var parts = new List(); if (b.TimedOut) parts.Add("baseline"); - if (s.TimedOut) parts.Add("with-skill"); + if (s.TimedOut) parts.Add("isolated"); + if (p?.TimedOut == true) parts.Add("plugin"); Console.WriteLine($" \x1b[31;1m⏰ TIMEOUT\x1b[0m — {string.Join(" and ", parts)} run(s) hit the scenario timeout limit"); } - foreach (var (label, value, absolute, lowerIsBetter) in metrics) + foreach (var (label, baseline, isolated, plugin) in metricRows) { - var color = value > 0 ? "\x1b[32m" : value < 0 ? "\x1b[31m" : "\x1b[2m"; - double displayValue = lowerIsBetter ? -value : value; - Console.WriteLine($" \x1b[2m{label,-20}\x1b[0m {color}{FormatDelta(displayValue),-10}\x1b[0m \x1b[2m{absolute}\x1b[0m"); + var line = $" \x1b[2m{label,-20}\x1b[0m baseline: \x1b[2m{baseline,-12}\x1b[0m isolated: {isolated,-20}"; + if (hasPlugin) + line += $" plugin: {plugin ?? "—"}"; + Console.WriteLine(line); } - // Skill activation info - if (scenario.SkillActivation is { } sa) + // Effective score line (when plugin run exists, show min) + if (hasPlugin) + { + var isoScore = scenario.IsolatedImprovementScore; + var plugScore = scenario.PluginImprovementScore; + Console.WriteLine($" \x1b[1mEffective score:\x1b[0m min(isolated={FormatPct(isoScore)}, plugin={FormatPct(plugScore)}) = {FormatPct(scenario.ImprovementScore)}"); + } + + // Skill activation info — isolated + if (scenario.SkillActivationIsolated is { } saIso) { Console.WriteLine(); - if (sa.Activated) - { - var parts = new List(); - if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills)); - if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools)); - Console.WriteLine($" \x1b[2mSkill activated:\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m"); - } - else - { - if (!scenario.ExpectActivation) - Console.WriteLine(" \x1b[36mℹ️ Skill correctly NOT activated (negative test)\x1b[0m"); - else - Console.WriteLine(" \x1b[33m⚠️ Skill was NOT activated\x1b[0m"); - } + ReportActivation(saIso, "Isolated", scenario.ExpectActivation); + } + + // Skill activation info — plugin + if (scenario.SkillActivationPlugin is { } saPlug) + { + ReportActivation(saPlug, "Plugin", scenario.ExpectActivation); } Console.WriteLine(); var bj = scenario.Baseline.JudgeResult; - var sj = scenario.WithSkill.JudgeResult; - double scoreDelta = sj.OverallScore - bj.OverallScore; - var deltaStr = scoreDelta > 0 ? $"\x1b[32m+{scoreDelta:F1}\x1b[0m" : - scoreDelta < 0 ? $"\x1b[31m{scoreDelta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m"; + var sj = scenario.SkilledIsolated.JudgeResult; + var pj = scenario.SkilledPlugin?.JudgeResult; + double scoreDeltaIso = sj.OverallScore - bj.OverallScore; + var deltaStrIso = FormatColorDelta(scoreDeltaIso); var bTimeout = b.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : ""; var sTimeout = s.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : ""; - Console.WriteLine($" \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → {sj.OverallScore:F1}{sTimeout} ({deltaStr})"); + var overallLine = $" \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → isolated: {sj.OverallScore:F1}{sTimeout} ({deltaStrIso})"; + if (pj is not null) + { + double scoreDeltaPlug = pj.OverallScore - bj.OverallScore; + var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : ""; + overallLine += $" plugin: {pj.OverallScore:F1}{pTimeout} ({FormatColorDelta(scoreDeltaPlug)})"; + } + Console.WriteLine(overallLine); Console.WriteLine(); // Baseline judge @@ -269,8 +304,8 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo Console.WriteLine(); - // With-skill judge - Console.WriteLine($" \x1b[35m─── With-Skill Judge\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m"); + // With-skill judge (Isolated) + Console.WriteLine($" \x1b[35m─── With-Skill Judge (Isolated)\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m"); Console.WriteLine($" \x1b[2m{sj.OverallReasoning}\x1b[0m"); if (sj.RubricScores.Count > 0) { @@ -288,6 +323,29 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo } Console.WriteLine(); + // With-skill judge (Plugin) — only if plugin run exists + if (pj is not null) + { + var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : ""; + Console.WriteLine($" \x1b[32m─── With-Skill Judge (Plugin)\x1b[0m \x1b[32;1m{pj.OverallScore:F1}/5\x1b[0m{pTimeout} \x1b[32m───\x1b[0m"); + Console.WriteLine($" \x1b[2m{pj.OverallReasoning}\x1b[0m"); + if (pj.RubricScores.Count > 0) + { + Console.WriteLine(); + foreach (var rs in pj.RubricScores) + { + var scoreColor = rs.Score >= 4 ? "\x1b[32m" : rs.Score >= 3 ? "\x1b[33m" : "\x1b[31m"; + var baselineRs = bj.RubricScores.FirstOrDefault(b => + string.Equals(b.Criterion, rs.Criterion, StringComparison.OrdinalIgnoreCase)); + var comparison = baselineRs is not null ? $"\x1b[2m (was {baselineRs.Score}/5)\x1b[0m" : ""; + Console.WriteLine($" {scoreColor}\x1b[1m{rs.Score}/5\x1b[0m{comparison} \x1b[1m{rs.Criterion}\x1b[0m"); + if (!string.IsNullOrEmpty(rs.Reasoning)) + Console.WriteLine($" \x1b[2m{rs.Reasoning}\x1b[0m"); + } + } + Console.WriteLine(); + } + // Pairwise judge results if (scenario.PairwiseResult is { } pw) { @@ -317,11 +375,62 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo Console.WriteLine(); Console.WriteLine(" \x1b[2mBaseline output:\x1b[0m"); Console.WriteLine(IndentBlock(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)", 8)); - Console.WriteLine(" \x1b[2mWith-skill output:\x1b[0m"); - Console.WriteLine(IndentBlock(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)", 8)); + Console.WriteLine(" \x1b[2mWith-skill output (isolated):\x1b[0m"); + Console.WriteLine(IndentBlock(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)", 8)); + if (scenario.SkilledPlugin is { } pluginRun) + { + Console.WriteLine(" \x1b[2mWith-skill output (plugin):\x1b[0m"); + Console.WriteLine(IndentBlock(pluginRun.Metrics.AgentOutput.Length > 0 ? pluginRun.Metrics.AgentOutput : "(no output)", 8)); + } } } + private static void ReportActivation(SkillActivationInfo sa, string label, bool expectActivation) + { + if (sa.Activated) + { + var parts = new List(); + if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills)); + if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools)); + Console.WriteLine($" \x1b[2mSkill activated ({label}):\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m"); + } + else + { + if (!expectActivation) + Console.WriteLine($" \x1b[36mℹ️ Skill correctly NOT activated ({label}, negative test)\x1b[0m"); + else + Console.WriteLine($" \x1b[33m⚠️ Skill was NOT activated ({label})\x1b[0m"); + } + } + + /// Formats a metric value with a percentage delta from baseline, e.g. "800 (-33%)". + private static string FormatMetricWithDelta(double value, double baseline, bool lowerIsBetter) + { + if (baseline == 0) return $"{value}"; + double pctChange = (value - baseline) / baseline * 100; + var sign = pctChange > 0 ? "+" : ""; + bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0; + var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m"; + var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m"; + return $"{value} {color}({sign}{pctChange:F0}%){reset}"; + } + + /// Returns a parenthesized percentage delta string, e.g. " (-33%)". + private static string FormatPctDelta(double value, double baseline, bool lowerIsBetter) + { + if (baseline == 0) return ""; + double pctChange = (value - baseline) / baseline * 100; + var sign = pctChange > 0 ? "+" : ""; + bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0; + var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m"; + var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m"; + return $" {color}({sign}{pctChange:F0}%){reset}"; + } + + private static string FormatColorDelta(double delta) => + delta > 0 ? $"\x1b[32m+{delta:F1}\x1b[0m" : + delta < 0 ? $"\x1b[31m{delta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m"; + // --- Markdown reporter --- public static string GenerateMarkdownSummary( @@ -356,6 +465,7 @@ public static string GenerateMarkdownSummary( var footnotes = new List(); var tableRows = new List(); + bool anyPluginRun = verdicts.Any(v => v.Scenarios.Any(s => s.SkilledPlugin is not null)); foreach (var v in verdicts) { @@ -363,33 +473,42 @@ public static string GenerateMarkdownSummary( foreach (var s in v.Scenarios) { var baseScore = s.Baseline?.JudgeResult?.OverallScore; - var skillScore = s.WithSkill?.JudgeResult?.OverallScore; + var isoScore = s.SkilledIsolated?.JudgeResult?.OverallScore; + var plugScore = s.SkilledPlugin?.JudgeResult?.OverallScore; var bTimedOut = s.Baseline?.Metrics?.TimedOut == true; - var sTimedOut = s.WithSkill?.Metrics?.TimedOut == true; + var isoTimedOut = s.SkilledIsolated?.Metrics?.TimedOut == true; + var plugTimedOut = s.SkilledPlugin?.Metrics?.TimedOut == true; + + string isoQualityCol = FormatQualityCell(baseScore, isoScore, bTimedOut, isoTimedOut, out double? isoQualityDelta); + double? plugQualityDelta = null; + string plugQualityCol = ""; + if (anyPluginRun) + { + plugQualityCol = FormatQualityCell(baseScore, plugScore, bTimedOut, plugTimedOut, out plugQualityDelta); + } - string qualityCol = FormatQualityCell(baseScore, skillScore, bTimedOut, sTimedOut, out double? qualityDelta); + // Use the effective (worse) run's quality delta for footnotes + bool pluginIsEffective = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore; + double? qualityDelta = pluginIsEffective ? plugQualityDelta : isoQualityDelta; var icon = s.ImprovementScore > 0 ? "✅" : s.ImprovementScore < 0 ? "❌" : "🟡"; + // Skills loaded column — show both isolated and plugin activation string skillsCol = "—"; - if (s.SkillActivation is { } sa) + if (s.SkillActivationIsolated is { } saIso) { - if (sa.Activated) - { - var parts = new List(); - if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills); - if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools)); - skillsCol = parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅"; - } - else - { - skillsCol = s.ExpectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)"; - } + skillsCol = FormatActivationCell(saIso, s.ExpectActivation); } else if (skillNotActivated) { skillsCol = "⚠️ NOT ACTIVATED"; } + if (anyPluginRun && s.SkillActivationPlugin is { } saPlug) + { + string plugActivation = FormatActivationCell(saPlug, s.ExpectActivation); + skillsCol += $" / {plugActivation}"; + } + var footnote = BuildVerdictFootnote(s, qualityDelta); string verdictCol = icon; if (footnote is not null) @@ -399,14 +518,25 @@ public static string GenerateMarkdownSummary( verdictCol = $"{icon} [{n}]"; } - tableRows.Add($"| {v.SkillName} | {s.ScenarioName} | {qualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |"); + var row = anyPluginRun + ? $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {plugQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |" + : $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |"; + tableRows.Add(row); } } if (tableRows.Count > 0) { - sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |"); - sb.AppendLine("|-------|----------|---------|---------------|---------|---------|"); + if (anyPluginRun) + { + sb.AppendLine("| Skill | Scenario | Quality (Isolated) | Quality (Plugin) | Skills Loaded | Overfit | Verdict |"); + sb.AppendLine("|-------|----------|--------------------|------------------|---------------|---------|---------|"); + } + else + { + sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |"); + sb.AppendLine("|-------|----------|---------|---------------|---------|---------|"); + } foreach (var row in tableRows) sb.AppendLine(row); } @@ -419,7 +549,7 @@ public static string GenerateMarkdownSummary( } bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s => - (s.Baseline?.Metrics?.TimedOut == true) || (s.WithSkill?.Metrics?.TimedOut == true))); + (s.Baseline?.Metrics?.TimedOut == true) || (s.SkilledIsolated?.Metrics?.TimedOut == true) || (s.SkilledPlugin?.Metrics?.TimedOut == true))); if (anyTimeout) sb.AppendLine("\n> ⏰ **timeout** — run hit the scenario timeout limit; scoring may be impacted by aborting model execution before it could produce its full output"); @@ -493,23 +623,46 @@ private static async Task ReportMarkdown( foreach (var rs in scenario.Baseline.JudgeResult.RubricScores) judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}"); judgeReport.AppendLine(); - judgeReport.AppendLine("## With-Skill Judge"); - judgeReport.AppendLine($"Overall Score: {scenario.WithSkill.JudgeResult.OverallScore}/5"); - judgeReport.AppendLine($"Reasoning: {scenario.WithSkill.JudgeResult.OverallReasoning}"); + judgeReport.AppendLine("## With-Skill Judge (Isolated)"); + judgeReport.AppendLine($"Overall Score: {scenario.SkilledIsolated.JudgeResult.OverallScore}/5"); + judgeReport.AppendLine($"Reasoning: {scenario.SkilledIsolated.JudgeResult.OverallReasoning}"); judgeReport.AppendLine(); - foreach (var rs in scenario.WithSkill.JudgeResult.RubricScores) + foreach (var rs in scenario.SkilledIsolated.JudgeResult.RubricScores) judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}"); judgeReport.AppendLine(); + + // Plugin judge section (if plugin run exists) + if (scenario.SkilledPlugin is { } pluginRun) + { + judgeReport.AppendLine("## With-Skill Judge (Plugin)"); + judgeReport.AppendLine($"Overall Score: {pluginRun.JudgeResult.OverallScore}/5"); + judgeReport.AppendLine($"Reasoning: {pluginRun.JudgeResult.OverallReasoning}"); + judgeReport.AppendLine(); + foreach (var rs in pluginRun.JudgeResult.RubricScores) + judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}"); + judgeReport.AppendLine(); + } + judgeReport.AppendLine("## Baseline Agent Output"); judgeReport.AppendLine("```"); judgeReport.AppendLine(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)"); judgeReport.AppendLine("```"); judgeReport.AppendLine(); - judgeReport.AppendLine("## With-Skill Agent Output"); + judgeReport.AppendLine("## With-Skill Agent Output (Isolated)"); judgeReport.AppendLine("```"); - judgeReport.AppendLine(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)"); + judgeReport.AppendLine(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)"); judgeReport.AppendLine("```"); + // Plugin agent output (if plugin run exists) + if (scenario.SkilledPlugin is { } pluginRunOutput) + { + judgeReport.AppendLine(); + judgeReport.AppendLine("## With-Skill Agent Output (Plugin)"); + judgeReport.AppendLine("```"); + judgeReport.AppendLine(pluginRunOutput.Metrics.AgentOutput.Length > 0 ? pluginRunOutput.Metrics.AgentOutput : "(no output)"); + judgeReport.AppendLine("```"); + } + await File.WriteAllTextAsync(Path.Combine(skillDir, $"{scenarioSlug}.md"), judgeReport.ToString()); } } @@ -646,6 +799,19 @@ internal static string FormatQualityCell( return $"{baseFmt} \u2192 {skillFmt}"; } + /// Formats an activation info object into a markdown cell string. + internal static string FormatActivationCell(SkillActivationInfo sa, bool expectActivation) + { + if (sa.Activated) + { + var parts = new List(); + if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills); + if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools)); + return parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅"; + } + return expectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)"; + } + /// /// Returns a footnote string when the verdict (based on composite ImprovementScore) /// disagrees with the quality delta shown in the table, or null if no explanation is needed. @@ -693,11 +859,18 @@ internal static string FormatQualityCell( }) .ToList(); + // Determine which run is the effective (worse) one for raw metric values + bool pluginIsWorse = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore; + string runLabel = pluginIsWorse ? "Plugin" : "Isolated"; + // Format a contributor label with raw metric values when available string FormatContributor(string label) { var bm = s.Baseline?.Metrics; - var sm = s.WithSkill?.Metrics; + // Use the effective (worse) run's metrics for raw values + var sm = pluginIsWorse + ? s.SkilledPlugin?.Metrics + : s.SkilledIsolated?.Metrics; if (bm is null || sm is null) return label; string? raw = label switch @@ -727,7 +900,7 @@ string FormatContributor(string label) ? string.Join(", ", negatives) : "efficiency metrics"; string qualityDesc = qualityPositive ? "Quality improved" : "Quality unchanged"; - return $"{qualityDesc} but weighted score is {compositeStr} due to: {factors}"; + return $"({runLabel}) {qualityDesc} but weighted score is {compositeStr} due to: {factors}"; } if (verdictPositive && qualityNegative) @@ -741,7 +914,7 @@ string FormatContributor(string label) string factors = positives.Count > 0 ? string.Join(", ", positives) : "efficiency metrics"; - return $"Quality dropped but weighted score is {compositeStr} due to: {factors}"; + return $"({runLabel}) Quality dropped but weighted score is {compositeStr} due to: {factors}"; } return null; diff --git a/eng/skill-validator/src/Services/SkillDiscovery.cs b/eng/skill-validator/src/Services/SkillDiscovery.cs index 24dad5512c..8ef65bfd97 100644 --- a/eng/skill-validator/src/Services/SkillDiscovery.cs +++ b/eng/skill-validator/src/Services/SkillDiscovery.cs @@ -277,6 +277,67 @@ internal static (EvalSchema.RawAgentFrontmatter Metadata, string Body) ParseAgen // --- Plugin discovery --- + /// + /// For a given skill, find its plugin root directory (the directory containing plugin.json). + /// Returns the plugin root path and the parsed PluginInfo. + /// Returns null if no plugin.json is found or if it is malformed. + /// + public static (string PluginRoot, PluginInfo Plugin)? FindPluginContext(SkillInfo skill) + { + var pluginRoot = FindPluginRoot(skill.Path); + if (pluginRoot is null) return null; + + var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json"); + PluginInfo? plugin; + try + { + plugin = PluginValidator.ParsePluginJson(pluginJsonPath); + } + catch (JsonException) + { + // Malformed plugin.json — treated as "no plugin" here; + // the later DiscoverPlugins/ValidatePlugin path will surface + // the user-friendly error message. + return null; + } + if (plugin is null) return null; + + return (pluginRoot, plugin); + } + + /// + /// Groups discovered skills by their parent plugin root. + /// Returns a dictionary: pluginRoot -> (PluginInfo, skills[]) + /// Skills without a plugin are reported as errors and excluded. + /// + public static (Dictionary Skills)> Groups, List Errors) + GroupSkillsByPlugin(IReadOnlyList skills) + { + var groups = new Dictionary Skills)>( + StringComparer.OrdinalIgnoreCase); + var errors = new List(); + + foreach (var skill in skills) + { + var context = FindPluginContext(skill); + if (context is null) + { + errors.Add($"Skill '{skill.Name}' at '{skill.Path}' is not inside a plugin directory (no valid plugin.json found: missing or malformed). All skills must belong to a plugin."); + continue; + } + + var (root, plugin) = context.Value; + if (!groups.TryGetValue(root, out var group)) + { + group = (plugin, []); + groups[root] = group; + } + group.Skills.Add(skill); + } + + return (groups, errors); + } + /// /// Discover plugin.json files from plugin directories reachable from the given paths. /// diff --git a/eng/skill-validator/tests/ComparatorTests.cs b/eng/skill-validator/tests/ComparatorTests.cs index 8e999913b5..c11873c1f3 100644 --- a/eng/skill-validator/tests/ComparatorTests.cs +++ b/eng/skill-validator/tests/ComparatorTests.cs @@ -201,6 +201,37 @@ public void MarksAsNotSignificantWhenPerRunScoresSpanZero() Assert.False(verdict.IsSignificant!.Value); Assert.Contains("not statistically significant", verdict.Reason); } + [Fact] + public void FailsWhenPluginRunRegressesTaskCompletion() + { + var baseline = MakeRunResult(taskCompleted: true, overallScore: 3); + var withSkill = MakeRunResult(taskCompleted: true, tokenEstimate: 100, overallScore: 5); + var comparison = Comparator.CompareScenario("test", baseline, withSkill); + // Simulate plugin run that failed completion + comparison = new ScenarioComparison + { + ScenarioName = comparison.ScenarioName, + Baseline = comparison.Baseline, + SkilledIsolated = comparison.SkilledIsolated, + SkilledPlugin = MakeRunResult(taskCompleted: false, tokenEstimate: 100, overallScore: 5), + ImprovementScore = comparison.ImprovementScore, + Breakdown = comparison.Breakdown, + }; + + var verdict = Comparator.ComputeVerdict(MockSkill, [comparison], 0.0, true); + Assert.False(verdict.Passed); + Assert.Contains("regressed", verdict.Reason); + } + + [Fact] + public void CompareScenarioSetsPluginToNull() + { + var baseline = MakeRunResult(); + var withSkill = MakeRunResult(tokenEstimate: 500, overallScore: 5); + var comparison = Comparator.CompareScenario("test", baseline, withSkill); + // CompareScenario is a utility for single-run comparison; SkilledPlugin should be null + Assert.Null(comparison.SkilledPlugin); + } } public class CompareScenarioWithPairwiseTests diff --git a/eng/skill-validator/tests/DiscoveryTests.cs b/eng/skill-validator/tests/DiscoveryTests.cs index d9613b867e..e260744212 100644 --- a/eng/skill-validator/tests/DiscoveryTests.cs +++ b/eng/skill-validator/tests/DiscoveryTests.cs @@ -1,4 +1,5 @@ using SkillValidator.Services; +using SkillValidator.Models; namespace SkillValidator.Tests; @@ -223,3 +224,95 @@ public async Task ResolveEvalPathPrefersFlatLayout() } } } + +public class GroupSkillsByPluginTests +{ + [Fact] + public void GroupsSkillsUnderSamePlugin() + { + var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}"); + var pluginDir = Path.Combine(tmpDir, "my-plugin"); + var skillDir1 = Path.Combine(pluginDir, "skills", "skill-a"); + var skillDir2 = Path.Combine(pluginDir, "skills", "skill-b"); + Directory.CreateDirectory(skillDir1); + Directory.CreateDirectory(skillDir2); + try + { + File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }"""); + + var skills = new[] + { + new SkillInfo("skill-a", "A", skillDir1, Path.Combine(skillDir1, "SKILL.md"), "# A", null, null, null), + new SkillInfo("skill-b", "B", skillDir2, Path.Combine(skillDir2, "SKILL.md"), "# B", null, null, null), + }; + + var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin(skills); + Assert.Empty(errors); + Assert.Single(groups); + var (plugin, grouped) = groups.Values.First(); + Assert.Equal(2, grouped.Count); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void ReportsErrorForStandaloneSkill() + { + var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}"); + Directory.CreateDirectory(tmpDir); + try + { + // No plugin.json in parents + var skill = new SkillInfo("orphan", "O", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# O", null, null, null); + var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin([skill]); + Assert.Empty(groups); + Assert.Single(errors); + Assert.Contains("orphan", errors[0]); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void FindPluginContextReturnsNullWithoutPluginJson() + { + var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}"); + Directory.CreateDirectory(tmpDir); + try + { + var skill = new SkillInfo("test", "T", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# T", null, null, null); + var result = SkillDiscovery.FindPluginContext(skill); + Assert.Null(result); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void FindPluginContextReturnsPluginInfo() + { + var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}"); + var pluginDir = Path.Combine(tmpDir, "my-plugin"); + var skillDir = Path.Combine(pluginDir, "skills", "test-skill"); + Directory.CreateDirectory(skillDir); + try + { + File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }"""); + var skill = new SkillInfo("test-skill", "T", skillDir, Path.Combine(skillDir, "SKILL.md"), "# T", null, null, null); + var result = SkillDiscovery.FindPluginContext(skill); + Assert.NotNull(result); + Assert.Equal(pluginDir, result!.Value.PluginRoot); + } + finally + { + Directory.Delete(tmpDir, true); + } + } +} diff --git a/eng/skill-validator/tests/MetricsTests.cs b/eng/skill-validator/tests/MetricsTests.cs index b1abc059f9..736b90e73f 100644 --- a/eng/skill-validator/tests/MetricsTests.cs +++ b/eng/skill-validator/tests/MetricsTests.cs @@ -196,6 +196,143 @@ public void DetectsSkillFromSkillInvokedEvent() Assert.Equal(["binlog-failure-analysis"], result.DetectedSkills); Assert.Equal(1, result.SkillEventCount); } + + // --- Targeted skill activation (targetSkillName parameter) tests --- + + [Fact] + public void TargetSkillName_ActivatedWhenTargetSkillDetected() + { + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))), + MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf"); + + Assert.True(result.Activated); + Assert.Equal(["build-perf"], result.DetectedSkills); + } + + [Fact] + public void TargetSkillName_NotActivatedWhenSiblingSkillFires() + { + // In a plugin run, a sibling skill fires but not the target skill + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))), + MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf"); + + Assert.False(result.Activated); + Assert.Equal(["sibling-skill"], result.DetectedSkills); + Assert.Equal(1, result.SkillEventCount); + } + + [Fact] + public void TargetSkillName_CaseInsensitiveMatch() + { + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("Build-Perf")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary(), targetSkillName: "build-perf"); + + Assert.True(result.Activated); + } + + [Fact] + public void TargetSkillName_NotActivatedEvenWithExtraToolsWhenNoTargetDetected() + { + // Extra tools present but target skill not detected — NOT activated. + // We control the SDK; it always emits SkillInvokedEvent. Extra tools + // alone are not proof the target skill was loaded. + var events = new List + { + MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("msbuild_analyze")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary(), targetSkillName: "build-perf"); + + Assert.False(result.Activated); + Assert.Equal(["msbuild_analyze"], result.ExtraTools); + } + + [Fact] + public void TargetSkillName_ExtraToolsIgnoredWhenSiblingSkillEventsExist() + { + // Sibling skill fired (skill events exist) plus extra tools — NOT activated. + // The event system works, so extra tools likely came from the sibling, not + // the target skill. This is the false-positive scenario from plugin runs. + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))), + MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("view")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary(), targetSkillName: "nuget-trusted-publishing"); + + Assert.False(result.Activated); + Assert.Equal(["sibling-skill"], result.DetectedSkills); + Assert.Equal(["view"], result.ExtraTools); + } + + [Fact] + public void TargetSkillName_NullBehavesAsOriginal() + { + // When targetSkillName is null, any skill event counts as activation (original behavior) + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary(), targetSkillName: null); + + Assert.True(result.Activated); + Assert.Equal(["sibling-skill"], result.DetectedSkills); + } + + [Fact] + public void TargetSkillName_NotActivatedWhenNoEventsAndNoExtraTools() + { + var events = new List + { + MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf"); + + Assert.False(result.Activated); + } + + [Fact] + public void TargetSkillName_ActivatedWhenTargetAmongMultipleSkills() + { + // Multiple skills fire in a plugin run, including the target + var events = new List + { + MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))), + MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))), + MakeEvent("skill.invoked", D(("name", JsonValue.Create("another-skill")))), + }; + + var result = MetricsCollector.ExtractSkillActivation( + events, new Dictionary(), targetSkillName: "build-perf"); + + Assert.True(result.Activated); + Assert.Equal(3, result.SkillEventCount); + Assert.Contains("build-perf", result.DetectedSkills); + } } public class CollectMetricsTests diff --git a/eng/skill-validator/tests/OverfittingJudgeTests.cs b/eng/skill-validator/tests/OverfittingJudgeTests.cs index 29b6069a1b..10c4ab65f4 100644 --- a/eng/skill-validator/tests/OverfittingJudgeTests.cs +++ b/eng/skill-validator/tests/OverfittingJudgeTests.cs @@ -349,7 +349,7 @@ public void MarkdownTable_IncludesOverfitColumn() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline" }, new JudgeResult(new List(), 3.5, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled" }, new JudgeResult(new List(), 4.5, "Good")), ImprovementScore = 0.25, @@ -398,7 +398,7 @@ public void MarkdownTable_ShowsDashWhenNoOverfitting() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline" }, new JudgeResult(new List(), 3.5, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled" }, new JudgeResult(new List(), 4.5, "Good")), ImprovementScore = 0.25, @@ -440,7 +440,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5, WallTimeMs = 2100 }, new JudgeResult(new List(), 3.5, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled", TokenEstimate = 8000, ToolCallCount = 15, WallTimeMs = 8500 }, new JudgeResult(new List(), 4.5, "Good")), ImprovementScore = -0.18, @@ -462,7 +462,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality() var md = Reporter.GenerateMarkdownSummary(verdicts); Assert.Contains("[1]", md); - Assert.Contains("Quality improved but weighted score is", md); + Assert.Contains("(Isolated) Quality improved but weighted score is", md); Assert.Contains("due to:", md); // Raw metrics should appear in footnote Assert.Contains("tokens (1000", md); @@ -492,7 +492,7 @@ public void MarkdownTable_NoFootnoteWhenVerdictMatchesQuality() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline" }, new JudgeResult(new List(), 3.0, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled" }, new JudgeResult(new List(), 4.5, "Good")), ImprovementScore = 0.35, @@ -538,7 +538,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline", TokenEstimate = 5000, TaskCompleted = false }, new JudgeResult(new List(), 4.0, "Good")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled", TokenEstimate = 2000, TaskCompleted = true }, new JudgeResult(new List(), 3.0, "OK")), ImprovementScore = 0.14, @@ -560,7 +560,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive() var md = Reporter.GenerateMarkdownSummary(verdicts); Assert.Contains("[1]", md); - Assert.Contains("Quality dropped but weighted score is", md); + Assert.Contains("(Isolated) Quality dropped but weighted score is", md); Assert.Contains("due to:", md); // Raw metrics should appear in footnote Assert.Contains("completion", md); @@ -590,7 +590,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5 }, new JudgeResult(new List(), 3.5, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled", TokenEstimate = 3000, ToolCallCount = 5 }, new JudgeResult(new List(), 3.5, "OK")), ImprovementScore = -0.10, @@ -612,7 +612,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative() var md = Reporter.GenerateMarkdownSummary(verdicts); Assert.Contains("[1]", md); - Assert.Contains("Quality unchanged but weighted score is", md); + Assert.Contains("(Isolated) Quality unchanged but weighted score is", md); Assert.Contains("tokens (1000", md); } @@ -657,7 +657,7 @@ public void MarkdownSummary_OmitsErrorsSectionWhenAllVerdictsPassed() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline" }, new JudgeResult(new List(), 3.5, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled" }, new JudgeResult(new List(), 4.5, "Good")), ImprovementScore = 0.25, @@ -692,7 +692,7 @@ public void MarkdownSummary_OmitsErrorsSectionForFailuresWithScenarios() Baseline = new RunResult( new RunMetrics { AgentOutput = "baseline" }, new JudgeResult(new List(), 4.0, "OK")), - WithSkill = new RunResult( + SkilledIsolated = new RunResult( new RunMetrics { AgentOutput = "skilled" }, new JudgeResult(new List(), 3.0, "Worse")), ImprovementScore = -0.25, diff --git a/eng/skill-validator/tests/RunnerTests.cs b/eng/skill-validator/tests/RunnerTests.cs index 93544b247b..c4ca5b2f8a 100644 --- a/eng/skill-validator/tests/RunnerTests.cs +++ b/eng/skill-validator/tests/RunnerTests.cs @@ -21,7 +21,7 @@ public class BuildSessionConfigTests [Fact] public void SetsSkillDirectoriesToParentOfSkillPath() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.Single(config.SkillDirectories!); Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]); } @@ -50,7 +50,7 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs() new SkillInfo("no-skill", "None", noSkillDir, Path.Combine(noSkillDir, "SKILL.md"), "", null, null), }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", + var config = AgentRunner.BuildSessionConfig(MockSkill, pluginRoot: null, "gpt-4.1", "C:\\tmp\\work", additionalSkills: additionalSkills); // Primary skill parent + one staging directory for additional skills @@ -74,14 +74,14 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs() [Fact] public void SetsWorkingDirectoryToWorkDir() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.Equal("C:\\tmp\\work", config.WorkingDirectory); } [Fact] public void SetsConfigDirToUniqueTempDirForSkillIsolation() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.NotEqual("C:\\tmp\\work", config.ConfigDir); Assert.StartsWith(Path.GetTempPath(), config.ConfigDir); Assert.True(Directory.Exists(config.ConfigDir)); @@ -90,7 +90,7 @@ public void SetsConfigDirToUniqueTempDirForSkillIsolation() [Fact] public void SetsConfigDirToUniqueTempDirEvenWithoutSkill() { - var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work"); Assert.NotEqual("C:\\tmp\\work", config.ConfigDir); Assert.StartsWith(Path.GetTempPath(), config.ConfigDir); } @@ -98,36 +98,36 @@ public void SetsConfigDirToUniqueTempDirEvenWithoutSkill() [Fact] public void EachCallGetsUniqueConfigDir() { - var config1 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work"); - var config2 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work"); + var config1 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work"); + var config2 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work"); Assert.NotEqual(config1.ConfigDir, config2.ConfigDir); } [Fact] public void SetsEmptySkillDirectoriesWhenNoSkill() { - var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work"); Assert.Empty(config.SkillDirectories!); } [Fact] public void PassesModelThrough() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "claude-opus-4.6", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "claude-opus-4.6", "C:\\tmp\\work"); Assert.Equal("claude-opus-4.6", config.Model); } [Fact] public void DisablesInfiniteSessions() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.False(config.InfiniteSessions!.Enabled); } [Fact] public void UsesOnPermissionRequestNotPreToolUseHook() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.NotNull(config.OnPermissionRequest); Assert.Null(config.Hooks); } @@ -142,7 +142,7 @@ public void SetsMcpServersWhenProvided() Args: ["run", "--project", "server"], Tools: ["load_data", "get_results"]) }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.NotNull(config.McpServers); Assert.True(config.McpServers.ContainsKey("test-mcp")); } @@ -150,7 +150,7 @@ public void SetsMcpServersWhenProvided() [Fact] public void OmitsMcpServersWhenNull() { - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work"); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); Assert.Null(config.McpServers); } @@ -164,7 +164,7 @@ public void BlocksDisallowedMcpCommand() Args: ["-X", "POST", "https://evil.example.com"], Tools: ["exfil"]) }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.Null(config.McpServers); } @@ -178,8 +178,8 @@ public void RejectsMcpCommandWithFullPath() Args: ["run"], Tools: ["*"]) }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); - // Full paths are rejected — only bare command names allowed + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); + // Full paths are rejected - only bare command names allowed Assert.Null(config.McpServers); } @@ -199,7 +199,7 @@ public void StripsDangerousMcpEnvKeys() ["PATH"] = "/tmp/evil", }) }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.NotNull(config.McpServers); Assert.True(config.McpServers.ContainsKey("ok")); // Dangerous keys are stripped; safe keys remain @@ -221,7 +221,7 @@ public void DropsMcpCwd() Tools: ["*"], Cwd: "/tmp/evil") }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.NotNull(config.McpServers); var entry = (Dictionary)config.McpServers["ok"]; Assert.False(entry.ContainsKey("cwd")); @@ -235,7 +235,7 @@ public void FiltersOutDisallowedMcpServersButKeepsAllowed() ["good"] = new MCPServerDef(Command: "node", Args: ["server.js"], Tools: ["*"]), ["bad"] = new MCPServerDef(Command: "bash", Args: ["-c", "echo pwned"], Tools: ["*"]), }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.NotNull(config.McpServers); Assert.True(config.McpServers.ContainsKey("good")); Assert.False(config.McpServers.ContainsKey("bad")); @@ -248,7 +248,7 @@ public void RejectsMcpServerWithDangerousArgs() { ["evil"] = new MCPServerDef(Command: "node", Args: ["-e", "process.exit(1)"], Tools: ["*"]), }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.Null(config.McpServers); } @@ -259,10 +259,62 @@ public void AllowsMcpServerWithSafeArgs() { ["ok"] = new MCPServerDef(Command: "node", Args: ["dist/server.js", "--stdio"], Tools: ["*"]), }; - var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers); + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers); Assert.NotNull(config.McpServers); Assert.True(config.McpServers.ContainsKey("ok")); } + + [Fact] + public void PluginRootWithoutPluginJsonFallsBackToEmptySkillDirs() + { + var mcpServers = new Dictionary + { + ["test-mcp"] = new MCPServerDef( + Command: "dotnet", + Args: ["run"], + Tools: ["t1"]) + }; + var config = AgentRunner.BuildSessionConfig(MockSkill, "/plugins/dotnet", "gpt-4.1", "C:\\tmp\\work", mcpServers); + // When pluginRoot has no plugin.json, SkillDirectories falls back to empty + Assert.Empty(config.SkillDirectories!); + // MCP servers are always passed through (no longer suppressed for plugin runs) + Assert.NotNull(config.McpServers); + Assert.True(config.McpServers.ContainsKey("test-mcp")); + } + + [Fact] + public void PluginRootWithPluginJsonResolvesSkillDirectories() + { + // Create a temp plugin structure + var tempDir = Path.Combine(Path.GetTempPath(), $"sv-test-{Guid.NewGuid():N}"); + var skillsDir = Path.Combine(tempDir, "skills", "my-skill"); + Directory.CreateDirectory(skillsDir); + File.WriteAllText(Path.Combine(skillsDir, "SKILL.md"), "---\nname: my-skill\n---\n# Test"); + File.WriteAllText(Path.Combine(tempDir, "plugin.json"), + "{\"name\":\"test\",\"version\":\"1.0.0\",\"description\":\"Test plugin\",\"skills\":\"./skills/\"}"); + try + { + var config = AgentRunner.BuildSessionConfig(MockSkill, tempDir, "gpt-4.1", "C:\\tmp\\work"); + Assert.Single(config.SkillDirectories!); + // Normalize trailing separators for comparison + var expected = Path.GetFullPath(Path.Combine(tempDir, "skills")); + var actual = config.SkillDirectories![0].TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar); + Assert.Equal(expected, actual); + } + finally + { + Directory.Delete(tempDir, true); + } + } + + [Fact] + public void PluginRootNullPreservesSkillDirectories() + { + var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work"); + // Without pluginRoot, SkillDirectories should contain the skill's parent + Assert.Single(config.SkillDirectories!); + Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]); + } } public class IsAllowedMcpCommandTests From ffd84d35c6a634ce4184162304bd85aeb26b9859 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 12 Mar 2026 20:51:36 +0000 Subject: [PATCH 2/3] Initial plan From 7f0a250995426353efddef8c618cd6e421b1aa9d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 12 Mar 2026 20:56:34 +0000 Subject: [PATCH 3/3] Add logging to StopAllClients catch block in AgentRunner.cs Co-authored-by: JanKrivanek <3809076+JanKrivanek@users.noreply.github.com> --- eng/skill-validator/src/Services/AgentRunner.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 058b17b717..17c8d10f2c 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -94,7 +94,7 @@ public static async Task StopAllClients() foreach (var (_, client) in _pluginClients) { try { await client.StopAsync(); } - catch { /* best effort */ } + catch (Exception ex) { Console.Error.WriteLine($"Warning: failed to stop Copilot client: {ex.Message}"); } } _pluginClients.Clear(); }