diff --git a/.agents/skills/create-skill-evaluation/SKILL.md b/.agents/skills/create-skill-evaluation/SKILL.md new file mode 100644 index 0000000000..0a6d0a14fd --- /dev/null +++ b/.agents/skills/create-skill-evaluation/SKILL.md @@ -0,0 +1,242 @@ +--- +name: create-skill-evaluation +description: > + Write eval.yaml files that measure whether a SKILL.md improves agent output. + USE when creating or improving tests/<plugin>/<skill>/eval.yaml. + DO NOT USE for writing SKILL.md content itself or for running the skill-validator CLI. +--- + +# Create Skill Evaluation + +Produce an `eval.yaml` that the skill-validator can run to prove a skill adds measurable value. +The eval file lives at `tests/<plugin>/<skill>/eval.yaml`. + +## Inputs + +Before writing an eval, you need: + +1. The **SKILL.md** being evaluated — read it to catalog every teaching +2. The **plugin domain** — determines setup approach (project scaffolding, fixtures, inline files) +3. The **CONTRIBUTING.md** and **eng/skill-validator/README.md** — reference for assertion types, setup options, and CLI flags + +## Eval design principles + +### Scenarios describe goals, not techniques + +The prompt must read like a real user request: "I need a component that does X." +It must never mention concepts the skill teaches. If the skill teaches retry policies +with Polly, the prompt says "make the HTTP calls resilient" — not "add a Polly retry policy." +When the prompt contains the skill's teachings, the baseline gets the same guidance +and the eval cannot measure the skill's contribution. + +### Scenario domains must not overlap with SKILL.md examples + +If the skill teaches caching with a "product catalog" example, do not create an +eval scenario about product catalogs. The agent may memorize and replay the example +rather than generalize. Choose unrelated domains — weather data, appointment scheduling, +audit logging — so the eval measures whether the agent internalized the pattern. 
+ +### 5 scenarios is the sweet spot + +Enough diversity to avoid overfitting to one problem shape, few enough to finish +in reasonable time. Each scenario should exercise a different subset of the skill's +teachings so no single pattern carries the entire score. + +### Baselines should be achievable but imperfect + +If the baseline already scores 5/5 on a scenario, the skill cannot show improvement. +Pick problems where an unskilled agent produces working but non-idiomatic code — the +kind of gap a skill is designed to close. + +## eval.yaml structure + +### Setup + +Choose the setup approach based on what the scenario needs: + +| Approach | When to use | Example | +|----------|-------------|---------| +| `commands:` | Agent needs a scaffolded project | `dotnet new webapi --no-https` | +| `copy_test_files: true` | Fixtures live alongside eval.yaml | Legacy migration scenarios | +| `files:` (inline) | Specific file contents required | Broken code that needs fixing | +| No setup | Agent creates everything from scratch | Pure authoring tasks | + +Use YAML anchors when all scenarios share the same setup: + +```yaml +scenarios: + - name: "First scenario" + setup: &api_project + commands: + - "dotnet new webapi --no-https" + # ... + + - name: "Second scenario" + setup: *api_project + # ... +``` + +### Prompts + +Write prompts as a user would — describe the goal with bulleted requirements: + +```yaml +prompt: | + I need a `RetryHandler` for my ASP.NET Core app. + + Requirements: + - Wraps HttpClient calls with automatic retry on transient failures + - Configurable max retry count and base delay via DI options + - Uses exponential backoff with jitter + - Logs each retry attempt with the attempt number and delay + - After exhausting retries, throws the original exception + - Registered as a DelegatingHandler in the DI container +``` + +Do not include architecture decisions, pattern names, or implementation hints +that come from the skill. 
The prompt describes *what*, the skill provides *how*. + +### Assertions + +Use 3–4 assertions per scenario as structural gates. Assertions verify necessary +conditions — the rubric handles quality. + +```yaml +assertions: + - type: "output_contains" + value: "DelegatingHandler" + - type: "output_contains" + value: "ILogger" + - type: "output_matches" + pattern: "(AddHttpMessageHandler|AddTransientHttpErrorPolicy)" +``` + +Available assertion types: + +| Type | Purpose | +|------|---------| +| `output_contains` | Key type/method name appears in agent output | +| `output_not_contains` | Banned pattern is absent (e.g., hardcoded secrets) | +| `output_matches` | Regex for alternative valid patterns | +| `file_exists` | Expected file was created | +| `file_contains` | File has specific content | +| `exit_success` | Agent produced non-empty output | + +### Rubric items + +Write 10–13 items per scenario. Each item is one observable behavior scored 1–5 +by the LLM judge. + +**Format**: "Does [correct thing] — not [common mistake]" + +```yaml +rubric: + - "Implements retry as a DelegatingHandler — not as inline try/catch in every call site" + - "Uses exponential backoff with jitter — not fixed-delay retry" + - "Registers via AddHttpMessageHandler in DI — not manually wrapping HttpClient" + - "Logs each retry with attempt number and computed delay — not silent retries" + - "Configures retry options through IOptions pattern — not hardcoded constants" + - "Only retries on transient HTTP errors (408, 429, 5xx, network failures) — not on 4xx client errors" + - "Throws the original exception after exhausting retries — not a wrapped AggregateException" + - "Disposes no unmanaged resources and does not implement IDisposable unnecessarily" +``` + +Rules: +- One behavior per item — the judge must be able to score it independently +- Name specific APIs and methods — "uses `AddHttpMessageHandler`" not "registers correctly" +- Include anti-pattern phrasing — tells the judge what 
penalties to apply +- Don't write rubric items for things the baseline already does perfectly — they waste discriminative power +- Add items for patterns the skill explicitly warns against — these are frequently missed + +### Timeout + +| Scenario type | Timeout | +|---------------|---------| +| Analysis only (no code generation) | 120s | +| Code generation, simple scaffolding | 180–300s | +| Code generation with project setup + multiple files | 300–600s | + +If a scenario has more than 10 rubric items, use `--judge-timeout 600` when running +the validator. + +## Iteration workflow + +### 1. Fast feedback (1-run) + +```bash +dotnet run --no-launch-profile --project eng/skill-validator/src/SkillValidator.csproj \ + -- --tests-dir tests/ --runs 1 --verbose \ + --reporter json --reporter markdown --no-overfitting-check \ + --keep-work-dirs --work-dir artifacts/work-dirs +``` + +### 2. Check baseline scores + +For each rubric item, look at the baseline score. If the baseline scores 5/5 on an +item, that item adds no signal — remove or replace it with something the baseline +misses. + +### 3. Check for equal failures + +If both baseline and skilled variants fail a rubric item equally, either: +- The skill doesn't teach that pattern (skill gap — fix the SKILL.md) +- The rubric is unfair (asks for something unreasonable — rewrite the item) + +### 4. Compare structural diversity across variants + +After a 1-run eval, read the baseline and skilled output for each scenario. +If the generated code has the same file structure, class hierarchy, and API +surface across both variants, the scenario is too constrained or the skill +is not changing how the agent approaches the problem. Good scenarios show +structural differences: the skilled variant creates a wrapper class the baseline +does not, uses a different disposal pattern, or organizes files differently. +If 3+ scenarios produce near-identical baseline vs skilled code, diversify the +scenario domains or loosen the prompts. + +### 5. 
Read judge reasoning + +The results JSON contains per-rubric `reasoning` from the judge. Read it to +understand why specific items scored low — the judge often reveals whether the +agent missed a pattern or the rubric was ambiguous. + +### 6. Final validation (5-run) + +```bash +dotnet run --no-launch-profile --project eng/skill-validator/src/SkillValidator.csproj \ + -- --tests-dir tests/ --runs 5 --verbose \ + --reporter json --reporter markdown --no-overfitting-check \ + --judge-timeout 600 +``` + +The eval passes when the overall score improvement is ≥ +10% and every scenario shows improvement. + +## Common mistakes + +| Mistake | Why it hurts | +|---------|-------------| +| Prompt leaks skill teachings | Mentioning "use the Options pattern" or "create a handler class" in the prompt gives the baseline the same guidance the skill provides — eval cannot measure improvement | +| All scenarios exercise the same 2–3 teachings | The eval appears to pass but only validates a fraction of the skill | +| Scenario domains overlap with SKILL.md examples | Agent replays memorized examples instead of generalizing | +| Baseline and skilled outputs are structurally identical | Scenarios are too narrow or prompts too prescriptive — the skill has no room to change the agent's approach | +| Rubric tests vocabulary instead of outcomes | "Uses the word 'resilience'" vs "Retries on transient failures" — the first is overfitting, the second measures behavior | +| Compound rubric items | "Does A and B and C" — the judge cannot score partially; split into separate items | +| Assertions check things unrelated to the skill | A `file_exists` check for `README.md` when the skill teaches error handling — noise | +| Timeout too short | Agent gets killed mid-generation; scored as failure when it was simply slow | +| Missing anti-pattern rubrics | The skill warns "don't do X" but the rubric never penalizes X — the eval misses a key signal | + +## Checklist + +Before submitting the eval for review: + +- [ ] 5 
scenarios with diverse problem domains +- [ ] No domain overlap between eval scenarios and SKILL.md examples +- [ ] Prompts describe goals — no skill concepts or pattern names mentioned +- [ ] 3–4 assertions per scenario (structural gates) +- [ ] 10–13 rubric items per scenario (quality bar) +- [ ] Each rubric item is a single observable behavior +- [ ] Anti-pattern phrasing on items that catch common mistakes +- [ ] Timeout appropriate for setup complexity +- [ ] YAML anchors for shared setup +- [ ] 1-run eval: baseline vs skilled outputs are structurally different +- [ ] 1-run eval: no rubric item scores 5/5 on baseline +- [ ] 5-run eval passes with ≥ +10% overall diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 5f9969a287..24d0543c88 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -34,6 +34,11 @@ public static RootCommand Create() var noiseSkillsDirOpt = new Option("--noise-skills-dir") { Description = "Directory containing skills to load as noise. Enables the noise test: re-runs scenarios with all noise skills loaded and measures degradation." 
}; var noiseMaxDegradationOpt = new Option("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 }; var noiseMaxScenarioDegradationOpt = new Option("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 }; + var keepWorkDirsOpt = new Option("--keep-work-dirs") { Description = "Preserve temporary working directories after the run (paths printed when --verbose is set)" }; + var workDirBaseOpt = new Option("--work-dir") { Description = "Base directory for temporary working directories (defaults to system temp)" }; + var readableWorkDirsOpt = new Option("--readable-work-dirs") { Description = "Use human-readable directory names (//) instead of GUIDs" }; + var scenarioOpt = new Option("--scenario") { Description = "Run only scenarios whose name contains this substring (case-insensitive). Can be repeated.", AllowMultipleArgumentsPerToken = true }; + var environmentOpt = new Option("--environment") { Description = "Run only scenarios matching this environment (e.g. 'ci', 'local'). Scenarios without an environment always run." 
}; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -60,6 +65,11 @@ public static RootCommand Create() noiseSkillsDirOpt, noiseMaxDegradationOpt, noiseMaxScenarioDegradationOpt, + keepWorkDirsOpt, + workDirBaseOpt, + readableWorkDirsOpt, + scenarioOpt, + environmentOpt, }; command.SetAction(async (parseResult, _) => @@ -108,6 +118,11 @@ public static RootCommand Create() NoiseSkillsDir = parseResult.GetValue(noiseSkillsDirOpt), NoiseDegradationLimit = parseResult.GetValue(noiseMaxDegradationOpt), NoiseMaxScenarioDegradation = parseResult.GetValue(noiseMaxScenarioDegradationOpt), + KeepWorkDirs = parseResult.GetValue(keepWorkDirsOpt), + WorkDirBase = parseResult.GetValue(workDirBaseOpt), + ReadableWorkDirs = parseResult.GetValue(readableWorkDirsOpt), + ScenarioFilters = parseResult.GetValue(scenarioOpt) ?? [], + Environment = parseResult.GetValue(environmentOpt), }; return await Run(config); @@ -127,6 +142,9 @@ public static RootCommand Create() public static async Task Run(ValidatorConfig config) { + AgentRunner.SetWorkDirBase(config.WorkDirBase); + AgentRunner.SetReadableWorkDirs(config.ReadableWorkDirs); + // Validate model early try { @@ -327,7 +345,10 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, } await AgentRunner.StopAllClients(); - await AgentRunner.CleanupWorkDirs(); + if (config.KeepWorkDirs) + Console.WriteLine($"\x1b[33m📂 Keeping {AgentRunner.WorkDirCount} work dir(s). Use --verbose to see paths during the run.\x1b[0m"); + else + await AgentRunner.CleanupWorkDirs(); // Always fail on execution errors, even in --verdict-warn-only mode if (rejectionMessages.Count > 0) return 1; @@ -469,7 +490,37 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList + var scenarios = (IReadOnlyList)skill.EvalConfig.Scenarios; + + // Filter by --environment: scenarios with no environment always run; + // scenarios with an environment run only when it matches. 
+ if (config.Environment is not null) + { + scenarios = scenarios.Where(s => + s.Environment is null || + s.Environment.Equals(config.Environment, StringComparison.OrdinalIgnoreCase)).ToList(); + } + + // Filter by --scenario name substring + if (config.ScenarioFilters.Count > 0) + { + scenarios = scenarios.Where(s => + config.ScenarioFilters.Any(f => s.Name.Contains(f, StringComparison.OrdinalIgnoreCase))).ToList(); + } + + if (scenarios.Count == 0) + { + var filters = new List(); + if (config.Environment is not null) filters.Add($"--environment {config.Environment}"); + if (config.ScenarioFilters.Count > 0) filters.Add($"--scenario {string.Join(", ", config.ScenarioFilters)}"); + log($"⏭ No scenarios match filter(s): {string.Join("; ", filters)}"); + return null; + } + + if (scenarios.Count < skill.EvalConfig.Scenarios.Count) + log($"🔍 Running {scenarios.Count}/{skill.EvalConfig.Scenarios.Count} scenario(s) matching filter"); + + var scenarioTasks = scenarios.Select(scenario => scenarioLimit.RunAsync(() => ExecuteScenario(scenario, skill, config, usePairwise, singleScenario, spinner))); var comparisons = (await Task.WhenAll(scenarioTasks)).ToList(); @@ -697,13 +748,13 @@ private static async Task ExecuteRun( var agentTasks = await Task.WhenAll( // 1. Baseline: no plugin, no skills — vanilla agent AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog)), + PluginRoot: null, Log: runLog, RunIndex: runIndex)), // 2. Skilled-isolated: single skill only (current behavior) AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog)), + PluginRoot: null, Log: runLog, RunIndex: runIndex)), // 3. 
Skilled-plugin: load entire plugin from plugin root directory AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, - PluginRoot: pluginRoot, Log: runLog))); + PluginRoot: pluginRoot, Log: runLog, RunIndex: runIndex))); var baselineMetrics = agentTasks[0]; var isolatedMetrics = agentTasks[1]; var pluginMetrics = agentTasks[2]; @@ -914,11 +965,11 @@ private static async Task ExecuteNoiseTest( { // Run with target skill only var skillOnlyMetrics = await AgentRunner.RunAgent(new RunOptions( - scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog)); + scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, RunIndex: runIndex)); // Run with all skills loaded var allSkillsMetrics = await AgentRunner.RunAgent(new RunOptions( - scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, AdditionalSkills: otherSkills)); + scenario, targetSkill, targetSkill.EvalPath, config.Model, config.Verbose, Log: scenarioLog, AdditionalSkills: otherSkills, RunIndex: runIndex)); // Evaluate assertions on both if (scenario.Assertions is { Count: > 0 }) diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 83e0223793..93f78aec34 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -68,7 +68,8 @@ public sealed record EvalScenario( IReadOnlyList? RejectTools = null, int? MaxTurns = null, int? MaxTokens = null, - bool ExpectActivation = true); + bool ExpectActivation = true, + string? Environment = null); public sealed record EvalConfig(IReadOnlyList Scenarios); @@ -378,6 +379,11 @@ public sealed record ValidatorConfig public string? NoiseSkillsDir { get; init; } public double NoiseDegradationLimit { get; init; } = 0.2; public double NoiseMaxScenarioDegradation { get; init; } = 0.4; + public bool KeepWorkDirs { get; init; } + public string? 
WorkDirBase { get; init; } + public bool ReadableWorkDirs { get; init; } + public IReadOnlyList ScenarioFilters { get; init; } = []; + public string? Environment { get; init; } } public static class DefaultWeights diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 05686f7f25..a7c0873819 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -2,6 +2,7 @@ using System.Diagnostics; using System.Text.Json; using System.Text.Json.Nodes; +using System.Text.RegularExpressions; using SkillValidator.Models; using SkillValidator.Utilities; using GitHub.Copilot.SDK; @@ -16,13 +17,17 @@ public sealed record RunOptions( bool Verbose, string? PluginRoot = null, Action? Log = null, - IReadOnlyList? AdditionalSkills = null); + IReadOnlyList? AdditionalSkills = null, + int RunIndex = 0); public static class AgentRunner { private static readonly ConcurrentDictionary _pluginClients = new(StringComparer.OrdinalIgnoreCase); private static readonly SemaphoreSlim _clientLock = new(1, 1); private static readonly ConcurrentBag _workDirs = []; + private static string _workDirBase = Path.GetTempPath(); + private static bool _readableWorkDirs; + private static string? _readableTimestamp; private static string? _capturedGitHubToken; private static bool _tokenCaptured; @@ -101,6 +106,36 @@ public static async Task StopAllClients() /// Backward-compatible alias. public static Task StopSharedClient() => StopAllClients(); + /// Number of tracked temporary working directories. + public static int WorkDirCount => _workDirs.Count; + + /// Set the base directory for all temporary working directories. + public static void SetWorkDirBase(string? path) + { + if (!string.IsNullOrWhiteSpace(path)) + { + _workDirBase = Path.GetFullPath(path); + Directory.CreateDirectory(_workDirBase); + } + } + + /// Enable readable, human-friendly work directory names instead of GUIDs. 
+ public static void SetReadableWorkDirs(bool enabled) + { + _readableWorkDirs = enabled; + if (enabled) + _readableTimestamp = DateTime.Now.ToString("yyyy-MM-dd_HH-mm-ss"); + } + + /// Slugify a scenario name for use as a directory name. + internal static string Slugify(string name) + { + var slug = name.ToLowerInvariant().Replace(' ', '-'); + slug = Regex.Replace(slug, @"[^a-z0-9\-]", "-"); + slug = Regex.Replace(slug, @"-{2,}", "-"); + return slug.Trim('-'); + } + /// Remove all temporary working directories created during runs. public static Task CleanupWorkDirs() { @@ -213,7 +248,7 @@ internal static SessionConfig BuildSessionConfig( var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null; // Create a unique temporary config directory for this session to not share any data - var configDir = Path.Combine(Path.GetTempPath(), $"sv-cfg-{Guid.NewGuid():N}"); + var configDir = Path.Combine(_workDirBase, $"sv-cfg-{Guid.NewGuid():N}"); Directory.CreateDirectory(configDir); _workDirs.Add(configDir); if (verbose) @@ -226,7 +261,7 @@ internal static SessionConfig BuildSessionConfig( var noiseDirs = new List(); if (additionalSkills is { Count: > 0 }) { - var stageDir = Path.Combine(Path.GetTempPath(), $"sv-noise-{Guid.NewGuid():N}"); + var stageDir = Path.Combine(_workDirBase, $"sv-noise-{Guid.NewGuid():N}"); Directory.CreateDirectory(stageDir); _workDirs.Add(stageDir); @@ -306,7 +341,7 @@ internal static SessionConfig BuildSessionConfig( { // Stage the single skill into a temp directory so the SDK discovers // only this skill — not every sibling that shares the same parent. 
- var isoStageDir = Path.Combine(Path.GetTempPath(), $"sv-iso-{Guid.NewGuid():N}"); + var isoStageDir = Path.Combine(_workDirBase, $"sv-iso-{Guid.NewGuid():N}"); Directory.CreateDirectory(isoStageDir); _workDirs.Add(isoStageDir); @@ -387,7 +422,11 @@ public static async Task RunAgent(RunOptions options) private static async Task RunAgentCore(RunOptions options, CancellationToken cancellationToken) { - var workDir = await SetupWorkDir(options.Scenario, options.Skill?.Path, options.EvalPath); + var variant = options.Skill is null ? "baseline" + : options.AdditionalSkills is { Count: > 0 } ? "noise-all" + : options.PluginRoot is not null ? "plugin" + : "isolated"; + var workDir = await SetupWorkDir(options.Scenario, options.Skill?.Path, options.EvalPath, options.RunIndex, variant); if (options.Verbose) { var write = options.Log ?? (msg => Console.Error.WriteLine(msg)); @@ -534,9 +573,18 @@ private static async Task RunAgentCore(RunOptions options, Cancellat return metrics; } - private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath) + private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath, int runIndex = 0, string? 
variant = null) { - var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); + string workDir; + if (_readableWorkDirs && variant is not null && _readableTimestamp is not null) + { + var slug = Slugify(scenario.Name); + workDir = Path.Combine(_workDirBase, _readableTimestamp, slug, $"run-{runIndex + 1}", variant); + } + else + { + workDir = Path.Combine(_workDirBase, $"sv-{Guid.NewGuid():N}"); + } Directory.CreateDirectory(workDir); _workDirs.Add(workDir); diff --git a/eng/skill-validator/src/Services/EvalSchema.cs b/eng/skill-validator/src/Services/EvalSchema.cs index 30bc9816ef..7cbf098781 100644 --- a/eng/skill-validator/src/Services/EvalSchema.cs +++ b/eng/skill-validator/src/Services/EvalSchema.cs @@ -65,7 +65,8 @@ private static EvalScenario ParseScenario(RawScenario raw) RejectTools: raw.RejectTools, MaxTurns: raw.MaxTurns, MaxTokens: raw.MaxTokens, - ExpectActivation: raw.ExpectActivation ?? true); + ExpectActivation: raw.ExpectActivation ?? true, + Environment: raw.Environment); } private static Assertion ParseAssertion(RawAssertion raw) @@ -143,6 +144,7 @@ internal sealed class RawScenario public int? MaxTurns { get; set; } public int? MaxTokens { get; set; } public bool? ExpectActivation { get; set; } + public string? Environment { get; set; } } internal sealed class RawSetup