diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 3bae77a..320a371 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -18,3 +18,14 @@ paths: ignore: - 'shellcheck reported issue in this script: SC2015' - 'shellcheck reported issue in this script: SC2016' + # The waza eval workflows emit markdown PR comments via `printf` with + # backticks inside single-quoted strings (literal markdown code spans like + # `prompt`, `continue_session: true`). SC2016 ("Expressions don't expand in + # single quotes") is exactly the intent — single quotes prevent the shell + # from interpolating. Silence for these two files only. + ".github/workflows/waza-evals.yml": + ignore: + - 'shellcheck reported issue in this script: SC2016' + ".github/workflows/waza-agent-evals.yml": + ignore: + - 'shellcheck reported issue in this script: SC2016' diff --git a/.github/evals/README.md b/.github/evals/README.md new file mode 100644 index 0000000..ab62fd5 --- /dev/null +++ b/.github/evals/README.md @@ -0,0 +1,56 @@ +# Git-Ape eval harness + +Behavioral evals for the skills under `.github/skills/` and the agents +under `.github/agents/`. Investigated as part of [#61][issue-61]. + +## Decision: waza + +We evaluated three options before landing the harness: + +| Option | Verdict | Why | +|---|---|---| +| [`openai/evals`][openai-evals] | Rejected | Python-only ecosystem, Completion-Function-Protocol coupling to OpenAI models, and a registry shape that doesn't match how this repo loads skills/agents (filesystem-discovered Markdown with YAML frontmatter). | +| Custom Node harness (per [PR #40][pr-40] spike) | Rejected | Would have to reinvent grader composition, multi-model fan-out, CI fixture management, and PR-comment rendering. Net new surface area to maintain. 
| +| **[`waza`][waza]** | **Selected** | Already speaks the "skill / agent / task" vocabulary this repo uses, ships native cross-model `waza compare`, has a token/quality auditor, and integrates with both VS Code Copilot and GitHub Actions. Matches the maintainer workflow we want (`/skill-onboard` → `/skill-bench` → `/skill-improve` → `/skill-promote`). | + +## Layout + +``` +.github/evals/ +├── manifest.yaml # Skill tier configuration (skills only) +├── <skill-name>/ +│ ├── eval.yaml # Skill eval definition +│ └── tasks/*.yaml # Per-task graders +└── agents/<agent-name>/ + ├── eval.yaml # Agent eval definition + ├── <agent-name>.agent.md # Mirror of the canonical .agent.md + └── tasks/*.yaml # Per-task graders +``` + +Skills are discovered via [`manifest.yaml`](./manifest.yaml). Agents are +auto-discovered from the filesystem (no manifest entry needed). + +## How to add a new eval suite + +Run one of the slash commands from VS Code (Copilot Chat). They scaffold +the directory, patch it to repo conventions, and run a smoke trial: + +- **Skills** — `/skill-onboard skillName=<skill-name>` +- **Agents** — `/agent-onboard agentName=<agent-name>` + +Full lifecycle (onboard → bench → improve → promote) is documented in +the [authoring docs][authoring-evals]. + +## CI wiring + +- Skills — [`.github/workflows/waza-evals.yml`](../workflows/waza-evals.yml) +- Agents — [`.github/workflows/waza-agent-evals.yml`](../workflows/waza-agent-evals.yml) + +Both run on PRs touching the relevant artifacts, post results as a PR +comment, and are currently **non-blocking**. 
+ +[issue-61]: https://github.com/Azure/git-ape/issues/61 +[pr-40]: https://github.com/Azure/git-ape/pull/40 +[openai-evals]: https://github.com/openai/evals +[waza]: https://github.com/microsoft/waza +[authoring-evals]: https://azure.github.io/git-ape/docs/authoring/evals diff --git a/.github/evals/manifest.yaml b/.github/evals/manifest.yaml new file mode 100644 index 0000000..f71f4b3 --- /dev/null +++ b/.github/evals/manifest.yaml @@ -0,0 +1,47 @@ +# Single source of truth for the waza-evals workflow matrix. +# +# Consumed by `.github/workflows/waza-evals.yml` (prepare job) to: +# - decide which skills are configured for evaluation +# - generate the matrix.include payload (skill × model fan-out per tier) +# - drive the per-skill ordering of the PR comment +# +# Everything else (skill markdown, eval.yaml, tasks, fixtures) is +# auto-discovered from the filesystem by waza itself. This file only +# exists because waza has no native "tier" concept. +# +# Maintenance: +# - Add a skill: append a `{ name, tier }` entry to `skills:`. Make sure +# `.github/skills/<name>/SKILL.md` and `.github/evals/<name>/eval.yaml` +# exist. +# - Promote a skill (expanded → pilot): change its `tier:`. +# - Add/remove a model on a tier: edit `tiers.<tier>.models:`. +# - Editing this file triggers the FULL matrix on PR (config-wide change). +# +# Bootstrap state (PR 1 of the eval harness port): +# Only `prereq-check` is enabled at landing time — it doubles as the +# harness smoke test. Each remaining skill suite ships in its own PR +# tracked under https://github.com/Azure/git-ape/issues/93. + +# Ordered list of evaluable skills. Order controls the PR-comment ordering. +skills: + # Pilot tier: full multi-model fan-out (most-trusted skills). + - name: prereq-check + tier: pilot + +# Per-tier model fan-out. The matrix runs each selected skill against every +# model in its tier. To compare additional models, add them here. 
+# +# Models with `baseline: true` run with `waza run --baseline` (A/B mode) to +# cap quota cost. The PR comment labels them clearly. +tiers: + pilot: + models: + - name: claude-sonnet-4.6 + - name: gpt-5.4 + baseline: true + - name: gpt-5-codex + - name: claude-opus-4.6 + expanded: + models: + - name: claude-sonnet-4.6 + - name: gpt-5-codex diff --git a/.github/evals/prereq-check/eval.yaml b/.github/evals/prereq-check/eval.yaml new file mode 100644 index 0000000..5b7510d --- /dev/null +++ b/.github/evals/prereq-check/eval.yaml @@ -0,0 +1,42 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json + +# Pilot evaluation suite for the prereq-check skill. +# Validates trigger precision via the heuristic `trigger` grader. +# +# Run: waza run .github/evals/prereq-check/eval.yaml + +name: prereq-check-eval +description: Trigger precision pilot for prereq-check. +skill: prereq-check +version: "0.2" + +config: + # Pilot tier: 3 trials per task for flake detection (per skill-promote contract). + # Single-trial runs hide model nondeterminism on borderline triggers. + trials_per_task: 3 + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + +metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: Skill should activate on tooling/install prompts and stay quiet otherwise. + +graders: + # Budget grader: prereq-check is a lightweight diagnostic; flag anything + # that explodes in tool calls or takes longer than expected. + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + + # answer_quality (LLM-as-judge) is scoped per-task on positive tasks only. + # Keeps judge-model errors from zeroing out the negative-task trigger check + # in the same leg. 
+ +tasks: + - "tasks/*.yaml" diff --git a/.github/evals/prereq-check/tasks/negative-template-edit.yaml b/.github/evals/prereq-check/tasks/negative-template-edit.yaml new file mode 100644 index 0000000..0a7e424 --- /dev/null +++ b/.github/evals/prereq-check/tasks/negative-template-edit.yaml @@ -0,0 +1,16 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-trigger-template-edit +name: Negative — Editing an ARM template +description: Editing template JSON should NOT trigger prereq-check. +# See positive-command-not-found.yaml for `mutable-by-*` tag semantics. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Add a tag block to the storageAccount resource in this ARM template." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/prereq-check/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/prereq-check/tasks/negative-trigger-conceptual-azure.yaml b/.github/evals/prereq-check/tasks/negative-trigger-conceptual-azure.yaml new file mode 100644 index 0000000..d1dd860 --- /dev/null +++ b/.github/evals/prereq-check/tasks/negative-trigger-conceptual-azure.yaml @@ -0,0 +1,16 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-trigger-conceptual-azure +name: Negative — Azure service concept question +description: A pure conceptual question about an Azure service's behavior has nothing to do with local CLI prerequisites, versions, or auth sessions and must not trigger prereq-check. +# See positive-command-not-found.yaml for `mutable-by-*` tag semantics. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Can you explain how Azure Container Apps revisions work? 
I want to understand the difference between single-revision and multiple-revision mode, how traffic splitting between revisions behaves, and what triggers a new revision to be created." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/prereq-check/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/prereq-check/tasks/positive-command-not-found.yaml b/.github/evals/prereq-check/tasks/positive-command-not-found.yaml new file mode 100644 index 0000000..3a9e8df --- /dev/null +++ b/.github/evals/prereq-check/tasks/positive-command-not-found.yaml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-command-not-found +name: Positive — "command not found" failure +description: Skill should be invoked when CLI tools are missing. +# `mutable-by-*` tag declares which artifact must change for this task's +# score to move. Values: +# mutable-by-skill — score reflects SKILL.md (trigger graders) +# mutable-by-agent — score reflects .agent.md (persona, workflow, identity) +# mutable-by-eval-grader — score is locked by grader/task design; only this YAML can fix it +# Currently consumed by `/agent-improve` (actionability surface + Locked? +# column + verdict gate). `/skill-improve` integration is tracked as a +# follow-up; until it lands, the tag is informational on skill tasks. +tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "az: command not found — what tools should be installed for Git-Ape skills?" +graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/prereq-check/SKILL.md + mode: positive + threshold: 0.5 + + # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky + # judge call only zeroes out this task, not the whole leg. See eval.yaml. 
+ # + # IMPORTANT: waza prompt graders are binary (set_waza_grade_pass = 1.0, + # set_waza_grade_fail = 0.0). They are NOT 1–5 rubrics. The judge has NO + # access to the agent's response unless continue_session: true is set — it + # resumes the agent's own session so it can read the response. + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user reported `az: command not found` and asked what tools should + be installed for Git-Ape skills. + + PASS criteria — the response must contain ALL of: + 1. Names ALL of the core required tools: `az` (Azure CLI), `gh` + (GitHub CLI), `jq`, and `git`. + 2. Provides a concrete install command for the missing `az` for at + least one platform (macOS via `brew install azure-cli`, Linux + via apt/yum/curl, or Windows via winget/MSI). + 3. Recommends verifying versions after install (e.g., `az --version`, + `gh --version`). + 4. Reaches a verdict / next step — either a readiness summary + (✅ READY / ⚠️ Missing) OR explicit "after install, run X to + verify". + + If ALL four PASS criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. diff --git a/.github/evals/prereq-check/tasks/positive-what-do-i-need.yaml b/.github/evals/prereq-check/tasks/positive-what-do-i-need.yaml new file mode 100644 index 0000000..bbe397d --- /dev/null +++ b/.github/evals/prereq-check/tasks/positive-what-do-i-need.yaml @@ -0,0 +1,46 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-what-do-i-need +name: Positive — "What do I need to install?" +description: Direct phrasing from the SKILL.md "When to use" section. +# See positive-command-not-found.yaml for `mutable-by-*` tag semantics. 
+tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "What do I need to install before running git-ape onboarding?" +graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/prereq-check/SKILL.md + mode: positive + threshold: 0.5 + + # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky + # judge call only zeroes out this task, not the whole leg. See eval.yaml. + # + # IMPORTANT: waza prompt graders are binary (set_waza_grade_pass = 1.0, + # set_waza_grade_fail = 0.0). They are NOT 1–5 rubrics. The judge has NO + # access to the agent's response unless continue_session: true is set — it + # resumes the agent's own session so it can read the response. + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user asked what they need to install before running Git-Ape + onboarding. + + PASS criteria — the response must contain ALL of: + 1. Lists `az` (Azure CLI), `gh` (GitHub CLI), `jq`, and `git` as + required tools. + 2. Notes authentication requirements (at minimum `az login`; ideally + also `gh auth login`). + 3. Mentions either minimum versions OR "use latest stable" / a + version check command. + 4. Provides install commands or points the user to a verification + script (e.g., a prereq-check skill invocation that runs the + checks for them). + + If ALL four PASS criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. 
diff --git a/.github/prompts/agent-bench.prompt.md b/.github/prompts/agent-bench.prompt.md new file mode 100644 index 0000000..0166152 --- /dev/null +++ b/.github/prompts/agent-bench.prompt.md @@ -0,0 +1,179 @@ +--- +agent: 'agent' +description: 'Cross-model benchmark for a single custom agent: runs waza eval once per model, captures results, compares with waza compare, and prints a one-line winner summary' +argument-hint: '[agentName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]' +--- + +# Agent Bench + +Run a cross-model benchmark against a single custom agent's eval suite. +Executes `waza run` once per model, captures per-model result JSON to +`/tmp/waza-runs/`, then compares all runs with `waza compare` and prints +a one-line summary identifying the best model and the gap to the next. + +This is the agent-side counterpart of `/skill-bench`. It targets +`.github/evals/agents/<agentName>/eval.yaml` and uses the waza ≥ 0.31 +custom-agent eval feature ([PR #226](https://github.com/microsoft/waza/pull/226)). + +> **Cost notice:** This prompt consumes **one premium Copilot request per +> (model × task × trial)** combination. With 4 models and a suite of 3 tasks, +> that is ≥ 12 premium requests per invocation. Set `models` to a subset if +> quota is limited. + +This is **non-interactive** — it runs to completion and reports results. + +## Inputs + +* `${input:agentName}`: (Required) Bare agent name (e.g. + `azure-policy-advisor`), matching the basename of + `.github/agents/<agentName>.agent.md`. If omitted, ask once then proceed. +* `${input:models:claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6}`: + (Optional) Comma-separated list of waza model IDs to benchmark. + Defaults to all four pilot-tier models. Run `waza models` to see the + currently-supported IDs. + +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for every +shell command. 
Use `set -uo pipefail` (not `-e`) so a non-zero `waza run` +exit (eval below threshold) does not abort the benchmark. + +### Step 1 — Resolve and verify + +1. Set `agent="${input:agentName}"`. +2. Verify `.github/agents/${agent}.agent.md` exists. If not, stop and + report the missing path. +3. Verify `.github/evals/agents/${agent}/eval.yaml` exists. If not, stop + and report the missing path. Benchmarking requires an eval suite — + point the user at `.github/evals/agents/azure-policy-advisor/` as a + reference layout. +4. Parse `${input:models}` by splitting on commas, trimming whitespace. + Store as an array `models`. If empty or not provided, use the default + list: `claude-sonnet-4.6`, `gpt-5.4`, `gpt-5-codex`, `claude-opus-4.6`. +5. Print a one-line preamble: + `Benchmarking <agent> across models: <model1>, <model2>, ...` + +### Step 2 — Resync eval-dir copy + +The eval directory holds a **copy** of the production agent file (not a +symlink). Refresh it once before any runs so every model sees the same +on-disk bytes: + +```bash +cp ".github/agents/${agent}.agent.md" \ + ".github/evals/agents/${agent}/${agent}.agent.md" +``` + +This is a one-shot copy, not a per-model sync — the production file +does not change during the benchmark, so re-copying between runs would +be redundant. + +### Step 3 — Run evals (one per model) + +```bash +mkdir -p /tmp/waza-runs + +for model in "${models[@]}"; do + echo "▶ Running: ${model}" + waza run ".github/evals/agents/${agent}/eval.yaml" \ + --model "${model}" \ + --no-cache \ + --output "/tmp/waza-runs/${agent}-${model}-bench.json" \ + 2>&1 | tail -5 + echo " → saved /tmp/waza-runs/${agent}-${model}-bench.json" +done +``` + +**Rules:** +- Pass `--no-cache` on every run. Without it, a cached result from a + previous run makes the comparison meaningless. +- Do not pass `--format` here; the default output is what we want for the + JSON capture. The `waza compare` step formats the results. 
+- If a model ID is unsupported, `waza run` will exit non-zero; log the + failure and continue to the next model (do not abort the whole bench). +- Do not parallelize the runs (no background `&`). Running serially bounds + memory and makes quota consumption predictable. + +### Step 4 — Compare results + +```bash +# Collect all result files produced in Step 3 +result_files=(/tmp/waza-runs/${agent}-*-bench.json) +if [ ${#result_files[@]} -lt 2 ]; then + echo "⚠ Only ${#result_files[@]} result file(s) found — skipping compare." +else + waza compare "${result_files[@]}" --format table +fi +``` + +If `waza compare` exits non-zero, print the error and continue to Step 5. + +### Step 5 — One-line summary + +Parse the `waza compare` table output (or the per-run score lines from +Step 3 if compare failed). Then print: + +``` +Best model: <model> overall score <score> +Second best: <model> overall score <score> gap: <+/-Δ> +``` + +If only one model produced a valid result, print: +``` +Only one valid result: <model> overall score <score> — no comparison possible. +``` + +Then close with a "Next steps" line: +- If best and second-best are close (gap < 0.05): + `"Gap is narrow — consider running with trials_per_task=3 on the best model to confirm."` +- If gap ≥ 0.05: + `"Clear winner: use <model> for this agent in the matrix."` + +Also include a one-line **infra-failure check**: scan each result JSON +for `tasks[].runs[].error_msg` containing `"Session not found"` or +`"failed to run grader"`. If any are present, surface the model + count +in a `⚠ Infra-failed legs:` line so the human knows the comparison is +contaminated and rerun is warranted. + +## Rules and Constraints + +* **Always pass `--no-cache`.** Results cached from a prior run make the + delta meaningless. +* **Never parallelize `waza run` calls.** Serial execution keeps quota + consumption predictable and avoids hitting rate limits. +* **Respect unsupported model IDs.** If a model fails with an "unsupported" + error, log it and move on — do not abort the entire bench. 
+* **Stay scoped to eval runs.** Do not edit `.agent.md`, `eval.yaml`, + fixtures, or task files as part of this prompt. Eval changes belong in + a separate review. +* **Sync rule.** Step 2 refreshes the eval-dir copy from the production + file once at the top. Never hand-edit the eval-dir copy directly — it + is a derived artifact. +* **`skill_directories` is required.** The auto-injected `tool_constraint` + grader only fires when the eval's `config:` block includes + `skill_directories: ["."]`. If `agent_tools_implicit` is missing from + every per-model run, flag the setup bug in the summary and recommend + fixing the eval before trusting bench results. +* **`executor: copilot-sdk` everywhere.** This repo standardizes on the + real Copilot SDK executor for both agent and skill evals. +* **Cost transparency.** At the start (Step 1) always remind the user of + the estimated premium request count: `models × tasks × trials_per_task`. + +## Why each step + +* **`--no-cache` on every run (Step 3)** — a cached result makes the + comparison delta meaningless; the bench only has value if each model is + exercised fresh. +* **Serial runs (Step 3)** — parallel `waza run` calls multiply quota + consumption and can hit rate limits; serial is slightly slower but + predictable and cost-safe. +* **Single resync at Step 2** — the production agent file is the source + of truth; the eval-dir copy must reflect it before benchmarking. Doing + it once (not per-model) prevents accidental mid-bench drift. +* **`waza compare` (Step 4)** — produces a structured table normalised + across runs; parsing raw score lines from stdout is fragile. +* **One-line summary (Step 5)** — answers the only question that matters: + which model to pin for this agent, and how confident we should be. +* **Infra-failure scan (Step 5)** — `Session not found` and grader-infra + errors silently flatten scores. Surfacing them prevents reading + contaminated bench data as a quality signal. 
diff --git a/.github/prompts/agent-improve.prompt.md b/.github/prompts/agent-improve.prompt.md new file mode 100644 index 0000000..64c11bd --- /dev/null +++ b/.github/prompts/agent-improve.prompt.md @@ -0,0 +1,481 @@ +--- +agent: 'agent' +description: 'Local feedback loop for a single custom agent: baseline → audit → propose edits → apply (with approval) → re-rank via waza compare. Optionally loops up to 3 rounds for deeper refinement.' +argument-hint: '[agentName=...] [iterations={1|2|3}] [rescoreQuality={true|false}]' +--- + +# Agent Improve + +Run a local improvement loop against a single custom agent +(`.github/agents/<agentName>.agent.md`) in this repository. Captures a +baseline eval score, audits the agent file with `waza tokens suggest` + +`waza quality` (via the SKILL.md staging trick), proposes concrete +edits, applies the ones the user approves, re-runs the eval, and shows +the delta. + +This is the agent-side counterpart of `/skill-improve`. It targets the +`.agent.md` file and uses the waza ≥ 0.31 custom-agent eval feature +([PR #226](https://github.com/microsoft/waza/pull/226)) — the +auto-injected `tool_constraint` grader is what gives the loop its +teeth. + +By default this is a **single pass** (one round of audit → propose → +apply → verify). Pass `iterations=2` or `iterations=3` for a deeper +refinement loop with a fresh approval gate per round — useful when +driving an `.agent.md` toward a specific behavior shape methodically. +Values above `3` are clamped to bound premium-request cost. + +This prompt is **interactive**. The protocol always pauses before +editing. + + +## Inputs + +* `${input:agentName}`: (Required) Bare agent name (e.g. + `azure-policy-advisor`), matching the basename of + `.github/agents/<agentName>.agent.md`. If omitted, infer from the user's + message; otherwise ask once. +* `${input:iterations:1}`: (Optional, defaults to `1`) Number of + audit-propose-apply rounds to run inside this invocation. Hard-capped + at `3`. 
The audit signals are reused across rounds (re-running them + costs premium requests); only the LLM's edit proposal refreshes per + iteration as it re-reads the now-edited agent file. +* `${input:rescoreQuality:false}`: (Optional, defaults to `false`) + When `true`, re-run `waza quality` after edits (or after the final + iteration when `iterations > 1`) and include the per-dimension + before/after delta in the summary. Costs one extra premium Copilot + request. Off by default to keep the loop cheap; turn on when + trigger-precision or another quality dimension is the lever you're + trying to move. +## Required Protocol + +Execute the steps below in order. Do not parallelize across steps. + +Use the workspace root as cwd for every shell command. Use `set -uo +pipefail` (not `-e`) so a non-zero `waza run` exit (eval below +threshold) does not abort the loop. + +### Step 1 — Resolve and verify + +1. Set `agent="${input:agentName}"`. +2. Set `maxIter = min(3, ${input:iterations:1})`. +3. Verify `.github/agents/${agent}.agent.md` exists. If not, stop and + report the missing path. +4. Check whether `.github/evals/agents/${agent}/eval.yaml` exists. + * **If yes**, proceed to step 2. + * **If no**, this is an audit-only loop: warn the user, skip steps + 2, 3, 5d, 6, and 7, and jump from step 5 to step 8. Also offer (in the + same turn, as a one-line aside) to scaffold an eval dir on the + next invocation using `.github/evals/agents/azure-policy-advisor/` + as the template. +5. Print a one-line preamble: + `Improving <agent> (eval: present|absent, iterations: <maxIter>)`. + +### Step 2 — Sync agent file into eval dir (skip in audit-only mode) + +The eval dir holds a **copy** of the production agent file so waza's +discovery picks it up via `config.skill_directories: ["."]`. 
Refresh +the copy before every run so the eval reflects on-disk truth: + +```bash +cp ".github/agents/${agent}.agent.md" \ + ".github/evals/agents/${agent}/${agent}.agent.md" +``` + +If `${agent}.agent.md` already exists in the eval dir and differs +from the production file, the `cp` overwrites it. That is the +intended behavior — the production file is the single source of +truth. + +### Step 3 — Baseline eval (skip in audit-only mode) + +Run **once**, with cache disabled so a later re-run produces a real +delta even when nothing else changes: + +```bash +mkdir -p /tmp/waza-runs +waza run ".github/evals/agents/${agent}/eval.yaml" \ + --no-cache \ + --output "/tmp/waza-runs/${agent}-baseline.json" +``` + +Capture the printed score line for the summary. Also note which +graders fired — in particular whether `agent_tools_implicit` appears +(auto-injection working) or is missing (`config.skill_directories` +not set, or `tool_constraint` already declared in eval.yaml). + +### Step 4 — Audits (run in parallel, once) + +Run both audits to seed iteration 1. `waza quality` is normally +SKILL.md-specific, but works on `.agent.md` after staging the file as +`waza-agent-stage/<agent>/SKILL.md` (a NON-DOT path — dotted paths +are silently skipped by waza's workspace walker). The stage dir is +ephemeral and cleaned up after the audit. + +`waza dev` is excluded — it expects skill-spec frontmatter +(`name`/`description`/`when-to-use`) that `.agent.md` does not have, +so its recommendations would be nonsense. 
+ +```bash +mkdir -p /tmp/waza-runs "waza-agent-stage/${agent}" +cp ".github/agents/${agent}.agent.md" \ + "waza-agent-stage/${agent}/SKILL.md" + +waza tokens suggest ".github/agents/${agent}.agent.md" --format text \ + > "/tmp/waza-runs/${agent}-tokens.txt" 2>&1 & + +waza quality "waza-agent-stage/${agent}/SKILL.md" \ + --model claude-sonnet-4.6 --format table \ + | sed "s|waza-agent-stage/${agent}/SKILL.md|${agent}.agent.md|g" \ + > "/tmp/waza-runs/${agent}-quality.txt" 2>&1 & + +wait +rm -rf "waza-agent-stage/${agent}" +``` + +If either exits non-zero, keep going but flag the failure in the +summary. They are advisory inputs, not gates. + +### Step 5 — Iteration loop + +Repeat the read → propose → approve → apply cycle up to `maxIter` +times. Track iteration index `i` starting at `1`. Maintain a running +log `applied[i] = [list of indices]` for the final summary. + +The audit signals from step 4 are reused across all iterations. Each +round, re-read the (now-edited) agent file so the LLM proposes fresh +edits against current state. 
+ +#### Step 5a — Read iteration context + +Read into context (do not echo back to the user verbatim — they will +see the synthesis in step 5b): + +* `.github/agents/${agent}.agent.md` (current on-disk version) +* `.github/evals/agents/${agent}/eval.yaml` (if present) +* All task files matching + `.github/evals/agents/${agent}/tasks/*.yaml` (if present) +* `/tmp/waza-runs/${agent}-baseline.json` (if present) +* `/tmp/waza-runs/${agent}-tokens.txt` +* `/tmp/waza-runs/${agent}-quality.txt` + +#### Step 5b — Propose edits + +First, print an **actionability surface** preamble in one block, +derived from the `mutable-by-*` tag on each task in +`.github/evals/agents/${agent}/tasks/*.yaml`: + +```text +Actionability surface (from task `mutable-by-*` tags): + mutable-by-agent : <count> ← edits to .agent.md can move these + mutable-by-skill : <count> ← only SKILL.md edits move these (out of scope here) + mutable-by-eval-grader : <count> ← locked; only task YAML edits move these + (no mutable-by-* tag) : <count> ← unknown; treat as mutable-by-agent until tagged +``` + +The ceiling for this loop is `(mutable-by-agent + unknown) / total`. +If that ceiling is <50%, warn the user explicitly: "Most tasks are +locked or skill-driven; expect a small delta." + +Then produce a numbered list of **3 to 7** concrete, actionable edits to +`.github/agents/${agent}.agent.md`. Each item has: + +* **Index** — sequential within this iteration, starts at 1. 
+* **Lever** — one of: + * `clarity` — the role, workflow, or output format is ambiguous + * `trigger-precision` — the description/name fires on the wrong + prompts, or misses prompts it should fire on + * `scope-boundary` — the agent overreaches or underreaches its + stated scope + * `tool-list` — the `tools:` frontmatter is missing a tool the + workflow needs, or lists a tool the workflow never uses (this + feeds straight into the auto-injected `tool_constraint` grader) + * `token-saving` — the agent file has redundant or verbose blocks + * `anti-pattern` — disclaimers, prohibitions, or fluff that the + model ignores +* **Rationale** — one or two sentences citing the input that + surfaced the suggestion (e.g. "tokens suggest flagged a 220-token + example block", "quality scored trigger-precision 2/5", "baseline + `agent_tools_implicit` reported `read` not used across both tasks", + "negative task `off-topic` matched positive-task regex"). +* **Proposed change** — the exact text to add, modify, or delete. + +Do not invent edits beyond what the audits + your reading of the +agent file, eval.yaml, and tasks justify. Prefer fewer high-confidence +edits over a long list of speculative ones. + +After the list, ask the user a single question. The options vary by +iteration: + +* **If `maxIter == 1` or this is the final iteration:** + > Reply with the indices to apply (e.g. `1, 3, 5`), `all`, `skip`, + > or `add: <freeform edit description>`. +* **If more iterations remain (`i < maxIter`):** + > Reply with the indices to apply (e.g. `1, 3, 5`), `all`, `skip`, + > `add: <freeform edit description>`, or `stop` to end the + > loop now. + +`add:` is a first-class option — use it when you've spotted a needed +edit the auditor missed (e.g. a production-UI regression, a phrase the +grader requires that's about to be removed). The freeform text is +treated as a new proposed edit; it goes through the same apply step. +You may combine `add:` with indices, e.g. +`1, 3, add: restore one-at-a-time vscode_askQuestions in First-turn`. 
+#### Step 5c — Apply approved edits + +On user response: + +* `stop` — record `applied[${i}] = []` with reason `user stopped`, + exit the loop, jump to step 6. +* `skip` — record `applied[${i}] = []`, continue to iteration `i + 1` + (or exit if `i == maxIter`). +* `all`, a list of indices, and/or `add: <freeform edit description>` — apply the + corresponding edits to `.github/agents/${agent}.agent.md` exclusively + (never the eval-dir copy, eval.yaml, tasks, or fixtures). For each + `add:` entry, treat the freeform text as an additional edit + description and apply it the same way. Use `edit` tool calls; never + shell `sed`/`awk`. Record applied indices plus any `add:` entries + (label them `add-<n>` in the iteration log). + +After applying, run **Step 5d** (grader-literal lint) before +incrementing the iteration counter. + +#### Step 5d — Grader-literal lint (post-edit, pre-rerun) + +This catches the most common regression: an edit removes a literal +string that a per-task `answer_quality` or `clean_refusal` grader +requires the agent's response to contain. Without this check, the +regression is only visible after a full eval re-run. + +1. For each task in `.github/evals/agents/${agent}/tasks/*.yaml`, + read the `graders[].config.prompt` field of every `type: prompt` + grader. +2. Extract the **literal strings** the grader requires the agent to + contain. Heuristic: + * Strings in the grader prompt that appear inside double quotes, + single quotes, or backticks AND are referenced by PASS criteria + using verbs like "names", "mentions", "identifies", "includes", + "contains", or "says". + * Filter out judge tooling literals: `set_waza_grade_pass`, + `set_waza_grade_fail`, `continue_session`, and the literal + strings `PASS`/`FAIL` themselves. +3. For each extracted literal, grep the **post-edit** + `.github/agents/${agent}.agent.md` (case-insensitive, fixed-string). +4. 
Compare against the **pre-edit** version (use `git show + HEAD:.github/agents/${agent}.agent.md` if uncommitted, otherwise + the prior iteration's snapshot). +5. Print one of these outcomes: + * ✅ `lint clean` — no required literals removed. + * ⚠️ `lint warning` — list each removed literal with the task that + requires it and the criterion number. Then ask: + > These edits removed strings the grader requires. Re-add them + > to the agent file, revert the offending edit, or proceed + > anyway? (`re-add` / `revert` / `proceed`). + * `re-add` — insert each missing literal back into the closest + semantically appropriate section (use `edit` calls) and re-run + the lint. + * `revert` — undo the offending edit(s) only, keep other applied + edits, re-run the lint. + * `proceed` — continue with the regression. Record in the + summary as `⚠️ known regression: removed`. + +This step uses no premium requests. It runs in pure local context. + +After the lint, increment `i`. If `i > maxIter`, exit the loop with +stop reason `max iterations`. + +### Step 6 — Re-sync and verify (skip in audit-only mode, skip if no edits applied) + +Refresh the eval-dir copy, then re-run: + +```bash +cp ".github/agents/${agent}.agent.md" \ + ".github/evals/agents/${agent}/${agent}.agent.md" + +waza run ".github/evals/agents/${agent}/eval.yaml" \ + --no-cache \ + --output "/tmp/waza-runs/${agent}-after.json" +``` + +### Step 7 — Compare (skip in audit-only mode, skip if no edits applied) + +```bash +waza compare \ + "/tmp/waza-runs/${agent}-baseline.json" \ + "/tmp/waza-runs/${agent}-after.json" \ + --format table +``` + +### Step 7b — Re-score quality (only when `rescoreQuality=true`) + +Run a fresh quality judge against the edited `.agent.md` and capture +the table for the summary. 
Stage again with the same trick: + +```bash +mkdir -p "waza-agent-stage/${agent}" +cp ".github/agents/${agent}.agent.md" \ + "waza-agent-stage/${agent}/SKILL.md" + +waza quality "waza-agent-stage/${agent}/SKILL.md" \ + --model claude-sonnet-4.6 --format table \ + | sed "s|waza-agent-stage/${agent}/SKILL.md|${agent}.agent.md|g" \ + > "/tmp/waza-runs/${agent}-quality-after.txt" 2>&1 + +rm -rf "waza-agent-stage/${agent}" +``` + +Parse both `/tmp/waza-runs/${agent}-quality.txt` (baseline, captured +in step 4) and `/tmp/waza-runs/${agent}-quality-after.txt`. Skip this +step silently when `rescoreQuality` is `false`. + +### Step 8 — Summary + +Print a Markdown summary table with: + +| Metric | Before | After | Δ | Locked? | +|---|---|---|---|---| +| Overall score | … | … | … | — | +| Per-task: | … | … | … | yes/no | +| `agent_tools_implicit` fired | yes/no | yes/no | — | — | +| Agent file tokens | … | … | … | — | +| Quality (clarity / completeness / trigger-precision / scope / anti-patterns) | … | … | — | — | + +The **Locked?** column populates from the task's `mutable-by-*` tag: +`yes` when the tag is `mutable-by-skill` or `mutable-by-eval-grader` +(the agent file cannot move this score); `no` when the tag is +`mutable-by-agent` or absent. The Overall, tool-fired, tokens, and +Quality rows show `—` (not applicable). + +If the lint step (5d) recorded any `known regression`, append a +⚠️ line below the table summarizing them (one bullet per literal, +with the task name). + +The Quality row populates with real before/after numbers **only when +`rescoreQuality=true`**. Otherwise show the baseline column from step +4 and write `not re-scored (pass rescoreQuality=true to enable)` in +the After column. + +If `maxIter > 1`, also print a per-iteration breakdown: + +* **Iterations run** — `` of ``. +* **Stop reason** — one of: `max iterations` | `user stopped` | + `eval missing — only ran audits` | `no edits applied`. 
+* **Per-iteration applied items** — bullet list grouped by iteration + index, citing each applied edit's lever. + +Then a "Verdict" paragraph using one of these labels: + +* `IMPROVED` — overall score increased AND no negative-task score + increased AND no positive-task score decreased AND no locked-task + (`mutable-by-skill` / `mutable-by-eval-grader`) score moved AND the + lint step (5d) recorded no `known regression`. + A locked-task score moving on agent-file edits alone is suspicious + — either the tag is wrong or the test is noisy. Demote to `MIXED` + and call it out. +* `MIXED` — overall score increased BUT at least one of: negative-task + score went up, positive-task score went down, a locked-task score + moved, or the lint step (5d) recorded a `known regression`. +* `REGRESSED` — overall score decreased. +* `NO CHANGE` — overall delta is zero. +* `AUDIT ONLY` — no eval present; only token + quality numbers shown. + +End with a one-line "Next" suggestion: + +* If `IMPROVED`: "Commit `.github/agents/${agent}.agent.md` and the + refreshed eval-dir copy, then open a PR." +* If `MIXED` or `REGRESSED`: "Re-run `/agent-improve ${agent}` after + reverting or refining the offending edits." +* If `NO CHANGE`: "Audits had no actionable findings. Expand the + task suite under `.github/evals/agents/${agent}/tasks/` to surface + weaker behaviors." +* If `AUDIT ONLY`: "Add `.github/evals/agents/${agent}/eval.yaml` + with `config.skill_directories: ['.']` to enable score-based + feedback. See `.github/evals/agents/azure-policy-advisor/` for a + reference layout." + +## Rules and Constraints + +* **Always** pass `--no-cache` to `waza run` for both baseline and + verify. Without it, unchanged eval inputs return cached results + and the delta is meaningless. +* **Never** auto-apply edits. The approval gate in step 5b is + mandatory — every iteration of the loop pauses for explicit user + input. +* **`iterations` is hard-capped at 3.** Higher values are silently + clamped down.
This bounds premium-request cost (each round adds an + LLM proposal turn; with `rescoreQuality=true` it also adds one + premium request for the final quality re-score). +* **Refuse to label `IMPROVED`** if a negative-task score increased + OR a locked-task score (`mutable-by-skill` / + `mutable-by-eval-grader`) moved. Broadening the agent's description + to win positives at the cost of negatives is overfitting; an + agent-file edit moving a skill-graded score means the tag is wrong + or the run is noisy. Surface both instead of hiding them. +* **Stay scoped to `.github/agents/${agent}.agent.md`.** Do not edit + `eval.yaml`, fixtures, tasks, or `.github/agents/` siblings. Eval + changes belong in a separate manual review. +* **Sync rule.** The eval-dir `.agent.md` is a derived copy; always + refresh it from the production file before running the eval. Never + hand-edit the eval-dir copy. +* **`skill_directories` is required.** The auto-injected + `tool_constraint` grader only fires when the eval's `config:` block + includes `skill_directories: ["."]`. If `agent_tools_implicit` is + missing from the baseline run, flag this as a setup bug and stop + before proposing edits. +* **`executor: copilot-sdk` everywhere.** This repo standardizes on + the real Copilot SDK executor for both agent and skill evals. Each + run consumes a few premium requests and ~100k model tokens — + budget accordingly. +* **Never rewrite an agent's `tools:` field to satisfy the eval.** + The `tools:` frontmatter is the **production-surface contract** + — it lists VS Code Chat tool IDs (e.g. `execute`, `read`, `search`, + `vscode`, `todo`) and MCP namespaces (e.g. `azure-mcp/cloudarchitect`, + `microsoftdocs/mcp/*`) that the agent uses when run inside VS Code. + Waza's `executor: copilot-sdk` emits a **different taxonomy** at + runtime: SDK CLI short names like `bash`, `view`, `edit`, `create`, + `sql`, `task`. Auto-injection naively zips the two together and + always fails. 
The fix lives **on the eval**, not the agent: declare + an explicit `tool_constraint` grader in the agent eval that lists + the SDK CLI names. This opts out of auto-injection ([waza#226](https://github.com/microsoft/waza/pull/226)) + and gives the grader something it can match. If the implicit + grader is producing nonsense, that's a grader-config bug, not an + agent bug. + +## Why every step + +* **Step 2 sync** — eval-dir holds a copy (not a symlink) so the + eval is reproducible across platforms and git tracks the exact bytes + evaluated. The sync step keeps the copy honest. +* **Step 3/6 with `--no-cache`** — the verify step needs a real + execution; without `--no-cache` an unchanged spec returns the + cached baseline and the delta is always 0. +* **Step 4 staging trick** — `waza quality` requires a SKILL.md + filename, and waza's workspace walker silently skips dotted paths + (the same `.NET FileAttributes.Hidden` quirk that bites the MSDO + template-analyzer). Staging `.agent.md` as + `waza-agent-stage//SKILL.md` (NON-DOT path) is the smallest + workaround that lets us run the 5-dim LLM judge on an agent file. + `sed` strips the prefix from the output so the user sees the real + agent path. `waza dev` is excluded because the recommendation + engine assumes skill-spec frontmatter (`name`, `description`, + `when-to-use`) that `.agent.md` does not have. +* **Step 5 iteration loop** — audits in step 4 cost premium requests, + so we run them once and reuse the signal. The LLM re-reads the + edited agent file each round and proposes fresh edits against + current state, which is the cheap part. The per-iteration approval + gate keeps a human in the loop between rounds. +* **Step 5b numbered approval** — partial-acceptance control without + re-prompting per item. The `add:` option in step 5b is the + user-supplied edit channel — production-UI fixes, grader-required + phrases the auditor missed, or any edit the LLM didn't propose. 
+ Without it, the only way to inject such edits is to abort the loop + and re-invoke, which replays baseline and audits at premium cost. +* **Step 5d static lint** — catches the most common regression class + (an edit removes a literal string the grader requires) BEFORE the + ~90s/premium-request eval re-run. The lint uses no premium + requests; the eval re-run does. Mutability tags (`mutable-by-*` on + task `tags:`) drive the actionability preamble in step 5b and the + Locked column in step 8 — they prevent the loop from chasing scores + that agent-file edits cannot move. +* **Step 8 verdict labels** — distinguishes the most common failure + mode (overfitting positives at the cost of negatives) from a real + improvement. diff --git a/.github/prompts/agent-onboard.prompt.md b/.github/prompts/agent-onboard.prompt.md new file mode 100644 index 0000000..63c2922 --- /dev/null +++ b/.github/prompts/agent-onboard.prompt.md @@ -0,0 +1,597 @@ +--- +agent: 'agent' +description: 'Onboard a new custom agent into the waza eval harness: scaffold eval.yaml + tasks under .github/evals/agents//, mirror the .agent.md, patch to repo conventions (SDK CLI tool taxonomy, hybrid graders, off-topic persona-lock test), then run a smoke trial. No SKILL.md or manifest edits.' +argument-hint: '[agentName=...] [positiveTasks={2|3|4}] [negativeTasks={1|2}] [smokeModel=claude-sonnet-4.6]' +--- + +# Agent Onboard + +Bootstrap a brand-new eval suite for a custom agent +(`.github/agents/.agent.md`) that currently has no evaluation. +This is **stage 0** of the agent eval lifecycle — it precedes +[agent-bench](agent-bench.prompt.md), [agent-improve](agent-improve.prompt.md), +and [agent-promote](agent-promote.prompt.md). 
+ +What it produces: + +* `.github/evals/agents//eval.yaml` +* `.github/evals/agents//.agent.md` — mirror copy of the + canonical `.github/agents/.agent.md` (NOT a symlink — waza walks + the dir under `skill_directories: ["."]`) +* `.github/evals/agents//tasks/positive-*.yaml` (default 2) +* `.github/evals/agents//tasks/negative-*.yaml` (default 2) +* `.github/evals/agents//tasks/negative-off-topic.yaml` — one + off-topic task with a `clean_refusal` grader that asserts the agent + identifies itself and redirects to its specialty + +What it does **not** do: + +* Touch `.github/agents/.agent.md`. Content-level concerns + (clarity, token trims, trigger precision, tool list, anti-patterns) + belong in [agent-improve](agent-improve.prompt.md). +* Touch `.github/evals/manifest.yaml`. Agents are NOT in the skills + manifest — they are auto-discovered by + [`.github/workflows/waza-agent-evals.yml`](../workflows/waza-agent-evals.yml) + from any directory under `.github/evals/agents//` that contains + an `eval.yaml`. Adding the eval directory is what registers the agent. +* Run a "production readiness" check. That gate is + [agent-promote](agent-promote.prompt.md), and only after the agent has + matured through one or more [agent-improve](agent-improve.prompt.md) + cycles. + +This is **interactive**. The protocol pauses for approval before +writing the eval directory and before running the smoke trial. + +> **Cost notice:** Step 6 (smoke trial) consumes +> `trials_per_task × len(tasks)` premium requests (default `2 × 5 = 10`) +> plus per-task judge calls — one `answer_quality` per positive trial, +> one `clean_refusal` per off-topic trial. Total budget ≈ **14–18 +> premium requests** per invocation. Step 3's scaffold authoring is +> local (no `waza suggest` — see Step 3 for why). + +## Inputs + +* `${input:agentName}`: (Required) Agent basename under `.github/agents/`. + Pass the bare name (e.g. `azure-policy-advisor`), not a path or the + `.agent.md` suffix. 
If omitted, infer from the user's message; + otherwise ask once. +* `${input:positiveTasks:2}`: (Optional, defaults to `2`) How many + positive in-scope tasks to scaffold. Hard-capped at `4` to bound cost. + Each positive task gets a hybrid grader pair: `trigger` (heuristic) + + `answer_quality` (LLM judge, `continue_session: true`). +* `${input:negativeTasks:2}`: (Optional, defaults to `2`) How many + adjacent-domain negative tasks to scaffold. Hard-capped at `2`. + Defaulting to the cap ensures every new agent gets two in-domain + refusal cases on top of the dedicated off-topic task (Step 4) — + single-negative scaffolds tend to under-cover the `## Non-goals` + boundary. The off-topic task is authored separately (Step 4) and + does not count against this budget. Negative tasks carry the + `trigger` grader only. +* `${input:smokeModel:claude-sonnet-4.6}`: (Optional) Model to use for + the final smoke trial. Default is the cheapest stable pilot-tier + model; override only if the agent is model-sensitive. + +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for +every shell command. Use `set -uo pipefail` (not `-e`) so a non-zero +`waza run` exit (eval below threshold on first run) does not abort the +onboarding. + +### Step 1 — Resolve and verify + +1. Set `agent="${input:agentName}"`. +2. Set `nPos = min(4, ${input:positiveTasks:2})`, + `nNeg = min(2, ${input:negativeTasks:2})`. +3. Verify `.github/agents/${agent}.agent.md` exists. If not, stop and + report the missing path. (Common slip: passing + `azure-policy-advisor.agent.md` instead of `azure-policy-advisor` — + strip the suffix and retry.) +4. **Refuse to overwrite.** If `.github/evals/agents/${agent}/` already + exists, stop and tell the user to use + [agent-improve](agent-improve.prompt.md) or delete the directory + manually first. Onboarding is a greenfield operation. +5. 
Print a one-line preamble: + `Onboarding with positive + negative + 1 off-topic task; smoke model: `. + +### Step 2 — Profile the agent + +Read `.github/agents/${agent}.agent.md` and extract: + +1. The `description:` frontmatter line. +2. The `tools:` frontmatter list — used by Step 4 as a sanity check + against the eval's tool-constraint expectations (see Step 4 patches). + **Do not rewrite the agent's `tools:` field from this prompt** — the + production list is a VS Code Chat tool ID contract; the eval bridges + the gap with an SDK CLI taxonomy on the grader side. +3. `## Mission` — used as the one-line description in `eval.yaml`. +4. `## Skills I own` — the listed skill slugs hint at the agent's + primary domain (e.g. `azure-naming-research`, `azure-cost-estimator` + → "Azure deployments"). +5. `## Workflow` — pick `nPos` distinct phase entry-points that a real + user might phrase as a first-turn request. These become the + **positive** task prompts. If `## Workflow` is too abstract, fall + back to the agent's `description:` framing. +6. `## Non-goals` — the explicit refusal/redirect entries become the + **negative** task prompts (adjacent domains the agent should refuse + in favor of another agent or skill). +7. The agent's primary domain (1–3 words; e.g. "Azure deployments", + "GitHub PR review", "Bicep authoring"). This drives the off-topic + negative task — pick a topic clearly outside the domain AND outside + anything Git-Ape touches (e.g. "Linux kernel scheduling internals" + for an Azure agent) so the `clean_refusal` grader has a clean signal. + +Echo the extracted profile so the human can sanity-check before any +files are written. + +### Step 3 — Author the scaffold to staging + +`waza suggest` is **not used** in this prompt. The skill version of +this prompt uses it; the agent version does not, because: + +* `waza suggest` expects skill-spec frontmatter (`name`, `description`, + `when-to-use`) per the agentskills.io spec [\[1\]](#refs). 
`.agent.md` + frontmatter (`tools`, `argumentHint`, `model`, `agents`, + `user-invocable`) is rejected as malformed — same root cause as why + `waza check` is excluded from [agent-promote](agent-promote.prompt.md) + criteria. +* The agent eval scaffold has agent-specific requirements + (`skill_directories: ["."]`, mirrored `.agent.md`, explicit + `tool_constraint` grader using SDK CLI taxonomy, off-topic + `clean_refusal` task) that the LLM-driven suggester does not know to + produce. + +Author the staged files directly using the templates in Step 4. Stage +to `/tmp/waza-onboard/agents/${agent}/` so the patching is atomic. + +```bash +mkdir -p "/tmp/waza-onboard/agents/${agent}/tasks" +``` + +Then proceed to Step 4 — every file produced by this prompt is hand- +authored from the templates below. + +### Step 4 — Author eval and tasks (repo conventions) + +The agent eval is patterned after the canonical +[.github/evals/prereq-check/eval.yaml](../evals/prereq-check/eval.yaml) +skill reference suite, with three agent-specific extensions: +`skill_directories: ["."]`, an explicit `tool_constraint` grader, and +an off-topic task that grades persona-lock. + +**Write `/tmp/waza-onboard/agents/${agent}/eval.yaml`:** + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json + +# Pilot evaluation suite for the custom agent. +# Validates trigger precision (in-scope vs. out-of-scope), answer quality +# on positives, and persona-lock on off-topic prompts. +# +# Run: waza run .github/evals/agents//eval.yaml + +name: -agent-eval +description: +version: "0.1" + +config: + # 2 trials catches the obvious LLM nondeterminism flakes (single trial + # = no flake signal). Pilot tier bumps to 3 via /agent-promote. + trials_per_task: 2 + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + # MANDATORY for agents — waza walks this directory for SKILL.md or + # *.agent.md mirrors. 
Without it the agent file is not discovered, the + # auto-injected tool_constraint never fires, and /agent-promote's + # criterion 5 reports a setup bug instead of model quality. + skill_directories: ["."] + +metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: Agent should activate on in-scope prompts and stay quiet otherwise. + +graders: + # Bound runaway tool use. Agent workflows are heavier than skill + # workflows (multi-step orchestration), but the budget starts at + # the skill default — tune up only if a specific agent legitimately + # needs more headroom. + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + + # Explicit tool_constraint grader. This declaration BOTH suppresses + # waza's broken auto-injection (which naively zips VS Code Chat tool + # IDs against the SDK CLI runtime taxonomy and always fails — see + # waza#226) AND gives the grader something it can actually match. + # + # We name it `agent_tools_implicit` to preserve compatibility with + # /agent-promote criterion 5, which keys off this exact grader name. + # + # The regex matches the copilot-sdk executor's CLI taxonomy ONLY + # (bash, view, edit, create, sql, task). NEVER edit the agent's + # production `tools:` field to satisfy this grader — those are VS Code + # Chat tool IDs (execute, read, search, vscode, todo, MCP namespaces) + # and live on a separate production surface. + - type: tool_constraint + name: agent_tools_implicit + config: + expect_tools: + - tool: "^(bash|view|edit|create|sql|task)$" + + # answer_quality (LLM judge) and clean_refusal (persona-lock judge) + # are scoped per-task — keeps a flaky judge call from zeroing the + # entire leg. + +tasks: + - "tasks/*.yaml" +``` + +**Per-task patches (apply to every task file):** + +Two defects MUST be avoided up front: + +1. **Prepend the task-schema header** as line 1 of every task file.
+ Without it, VS Code applies the eval-schema to task files and + surfaces ~6 false-positive lint errors per file. + + ```yaml + # yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + ``` + +2. **Do NOT emit an `expected:` field** on any task. The waza task JSON + schema accepts the shape `expected:\n should_trigger: true`, but the + Go runtime parser rejects it + (`cannot unmarshal !!seq into models.TaskExpectation`) and the suite + fails to load. Graders alone drive pass/fail. + +**Write `/tmp/waza-onboard/agents/${agent}/tasks/positive-.yaml` +(once per `nPos`):** + +`positive-` is derived from a `## Workflow` phase entry-point +extracted in Step 2. `inputs.prompt:` MUST be a concrete, answerable +first-turn user message — not a generic placeholder. Reference shape: +prereq-check's `positive-command-not-found` task uses `"az: command not +found — what tools should be installed for Git-Ape skills?"`. Aim for +that level of specificity. + +Each positive task carries a hybrid grader pair. The `prompt` grader +needs `continue_session: true` — without it the judge has zero access +to the agent's response and scores oscillate. + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json +id: positive- +name: Positive — +description: should activate on this in-scope prompt. +# The `mutable-by-*` tag declares which artifact must change for this +# task's score to move. /agent-improve uses it to print an actionability +# surface and to refuse IMPROVED verdicts that increase scores on locked +# tasks. Pick exactly one: +# mutable-by-agent — fixable by editing .github/agents/.agent.md +# mutable-by-skill — fixable by editing .github/skills//SKILL.md +# (trigger graders point at SKILL.md, not the agent) +# mutable-by-eval-grader — locked by grader/task design; only this YAML can +# change the score (e.g. 
clean_refusal on a task with +# no tool calls scoring tool_constraint = 0) +# Positives that exercise the agent's workflow should be `mutable-by-agent`. +tags: [trigger, positive, mutable-by-agent] +inputs: + prompt: "" +graders: + - name: trigger_relevance_positive + type: trigger + config: + # Point at the MIRROR copy inside the eval dir — waza loaded it + # via skill_directories: ["."]. + skill_path: .agent.md + mode: positive + threshold: 0.5 + + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + + + PASS criteria — the response must contain ALL of: + 1. + 2. + 3. + 4. + + If ALL criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. +``` + +**Write `/tmp/waza-onboard/agents/${agent}/tasks/negative-.yaml` +(once per `nNeg`):** + +`negative-` is derived from an entry in the agent's `## Non-goals` +section — an adjacent domain the agent should refuse in favor of +another agent or skill. The trigger grader alone is the correct signal; +adding `answer_quality` to a refusal grades "how good is the answer" +when the right answer is "I will not answer". + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json +id: negative- +name: Negative — +description: should refuse and redirect on this out-of-scope prompt. +# `trigger_relevance_negative` reads SKILL.md, not the .agent.md — so this +# task's score moves with skill content, not the agent file. See the +# positive- template above for the full `mutable-by-*` table. 
+tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "" +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .agent.md + mode: negative + threshold: 0.5 +``` + +**Write `/tmp/waza-onboard/agents/${agent}/tasks/negative-off-topic.yaml`:** + +This is the agent-specific divergence from +[skill-onboard](skill-onboard.prompt.md). Skills are passive guidance — +they don't enforce identity contracts and a `clean_refusal` grader +produces noise. Agents carry a `## Identity (non-negotiable)` section +that IS the contract; off-topic prompts are exactly where it should +fire. We grade both signals: + +1. `trigger` (mode: negative) — the agent's workflow should not activate. +2. `clean_refusal` — the response must name the agent and redirect to + its specialty (per the persona-lock pattern in + [.github/templates/AGENT.template.md](../templates/AGENT.template.md)). + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json +id: negative-off-topic +name: Negative — Off-topic prompt (persona-lock test) +description: Fully off-topic prompt should not trigger ; agent must identify itself and redirect. +# Two graders, two surfaces: `trigger_relevance_negative` reads SKILL.md, +# `clean_refusal` reads the agent's response. The agent file is the +# dominant lever for the persona-lock criterion, so tag as `mutable-by-agent`. +# See the positive- template above for the full `mutable-by-*` table. +tags: [trigger, negative, persona-lock, mutable-by-agent] +inputs: + prompt: "" +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .agent.md + mode: negative + threshold: 0.5 + + - type: prompt + name: clean_refusal + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user asked a fully off-topic question (). 
The assistant runs under a custom-agent persona named + "" whose Identity (non-negotiable) section + requires it to name itself on every response and redirect off-topic + prompts to its specialty (). + + PASS criteria — the response must satisfy ALL of: + 1. Names itself as "" (or the canonical + persona string from the agent file's `## Identity` section). + 2. Refuses or declines to perform the off-topic task — either + explicitly ("I can't help with that", "outside my scope") or + by clearly redirecting before any off-topic answer is given. + 3. Names its actual specialty (the in-scope domain) as the + redirect target. + 4. Does NOT identify as "software engineering assistant", + "GitHub Copilot CLI", or any generic persona. + + If ALL four criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria failed. +``` + +> **Realism note.** Persona-lock is best-effort: the model's built-in +> CLI refusal can leak through ahead of the agent rewrite, especially +> on the cheapest models. A `🟡 ONBOARDED with weak signal` outcome on +> the off-topic task is common at this stage — it is feedback for +> [agent-improve](agent-improve.prompt.md), not a reason to soften the +> grader. + +### Step 5 — Approval gate + mirror copy + move into place + +Show the human a tree listing of the staged files plus a diff-style +summary of what will be written. Then pause and ask: + +> Apply this agent eval suite to `.github/evals/agents/${agent}/` and +> mirror the canonical `.agent.md` into it? (yes / no) + +If `yes`: + +```bash +mkdir -p ".github/evals/agents/${agent}" +cp -R "/tmp/waza-onboard/agents/${agent}/." \ + ".github/evals/agents/${agent}/" + +# Mirror copy — NOT a symlink (waza-agent-evals.yml re-syncs per-run). +cp ".github/agents/${agent}.agent.md" \ + ".github/evals/agents/${agent}/${agent}.agent.md" +``` + +No manifest edit is required. 
The +[`.github/workflows/waza-agent-evals.yml`](../workflows/waza-agent-evals.yml) +workflow auto-discovers any directory under `.github/evals/agents/` +that contains an `eval.yaml`. Creating the directory IS the +registration. + +If `no`, stop without writing anything. Leave the staged files in +`/tmp/waza-onboard/agents/${agent}/` for the user to inspect manually. + +### Step 6 — Smoke trial (single model, single trial) + +Confirm the suite executes end-to-end. This is a cheap signal that the +graders wire up correctly — not a quality assessment. +[agent-bench](agent-bench.prompt.md) does the cross-model bench. + +```bash +mkdir -p /tmp/waza-runs +waza run ".github/evals/agents/${agent}/eval.yaml" \ + --model "${input:smokeModel:claude-sonnet-4.6}" \ + --judge-model "claude-sonnet-4.6" \ + --no-cache \ + --output "/tmp/waza-runs/${agent}-onboard-smoke.json" \ + 2>&1 | tail -10 +``` + +Parse the result JSON and report: + +* `summary.aggregate_score` +* Per-task `tasks[].stats.avg_score` +* Whether `agent_tools_implicit` appears in `tasks[].runs[].validations` + for every task (proves `skill_directories: ["."]` is wired and the + explicit `tool_constraint` grader is matchable). If it is missing, + the eval has a setup bug — flag it in the summary, do not silently + ignore. +* Any tasks with `runs[].status == "error"` and their `error_msg` — + three classes of grader-infra failure must be surfaced before the + suite is trusted (the same classes the agent-evals workflow retries + on): + * `Session not found` (JSON-RPC -32603) — Copilot SDK dropped the + session before `continue_session: true` could resume it. + * `failed to run grader` — judge LLM backend crashed mid-grader. + * `Failed to list models: 429` — Copilot models API rate-limit. + + These are infra noise, not agent quality signal. See the retry + pattern in + [.github/workflows/waza-agent-evals.yml](../workflows/waza-agent-evals.yml). 
+ +### Step 7 — Render next-step + +Print a single decision line, then a one-line next-step: + +* **All tasks scored, no errors, `agent_tools_implicit` fired everywhere:** + `🟢 ONBOARDED: eval suite created. Smoke trial scored .` + Next: `Run /agent-bench ${agent} to benchmark across pilot-tier models (4 models, ~40 premium requests at trials=2).` + +* **Some tasks scored 0.0 cleanly (model failed criteria, persona-lock leak common on off-topic):** + `🟡 ONBOARDED with weak signal: eval suite created but smoke scored . Inspect failing tasks.` + Next: `Run /agent-improve ${agent} to tighten the .agent.md (persona-lock section is the usual lever), OR adjust grader criteria.` + +* **`agent_tools_implicit` missing on any task:** + `🟠 ONBOARDED but tool-constraint grader did not fire on N task(s).` + Next: `Verify .github/evals/agents/${agent}/eval.yaml has skill_directories: ["."] in config and the explicit tool_constraint grader at eval root. See /agent-promote criterion 5.` + +* **Any task had `runs[].status == "error"`:** + `🔴 ONBOARDED but smoke FAILED: task(s) errored — see /tmp/waza-runs/${agent}-onboard-smoke.json.` + Next: `Investigate grader-infra errors (session-not-found, judge unreachable, models 429) before relying on this eval.` + +## Rules and Constraints + +* **Greenfield only (eval dir).** This prompt refuses to overwrite an + existing `.github/evals/agents/${agent}/` directory. Iterating on + the agent file belongs to [agent-improve](agent-improve.prompt.md); + changes to an existing eval suite go through a separate manual + review, not here. +* **No manifest edit.** Unlike skill onboarding, agent registration is + filesystem-driven — the workflow auto-discovers any `eval.yaml` under + `.github/evals/agents//`. Do not touch `manifest.yaml`. +* **`skill_directories: ["."]` is non-negotiable.** Without it, the + mirrored `.agent.md` is not discovered, the `tool_constraint` grader + cannot fire, and the smoke trial reports a setup bug instead of + agent quality. Step 4's eval template includes it — do not omit.
+* **Mirror, do NOT symlink.** Step 5 uses `cp`, not `ln -s`. The eval-dir + copy is a tracked artifact that + [.github/workflows/waza-agent-evals.yml](../workflows/waza-agent-evals.yml) + re-syncs on every run. Symlinks behave inconsistently across + platforms and CI runners. +* **Two tool taxonomies.** The agent's production `tools:` field lists + VS Code Chat IDs (`execute`, `read`, `search`, `vscode`, `todo`, MCP + namespaces). The eval's `tool_constraint` grader matches SDK CLI + short names (`bash`, `view`, `edit`, `create`, `sql`, `task`). Both + are correct on their own surface. NEVER rewrite the agent's `tools:` + field to satisfy the grader — bridge the gap on the eval side. +* **`continue_session: true` on every prompt grader.** Without it the + judge cannot see the agent's response and scores oscillate. + Encoded in Step 4's templates — do not omit. +* **`clean_refusal` belongs on agents, not skills.** This is the + intentional divergence from [skill-onboard](skill-onboard.prompt.md): + agents enforce identity contracts via `## Identity (non-negotiable)`, + so off-topic tasks SHOULD grade refusal language. The grader is + best-effort (persona-lock can still leak through on weaker models) + but the presence of the section is a structural minimum. +* **Strip scaffold cruft before patching.** Every task file is authored + with the `yaml-language-server` schema header and WITHOUT an + `expected:` field. The Go runtime parser rejects the `expected:` + shape and the suite fails to load — do not let it slip back in. +* **Approval gate before file writes.** Step 5 pauses for human review. + Writing into `.github/evals/agents/` registers the agent for the + agent-evals workflow on next PR; the gate is not optional. +* **No skill files.** This prompt onboards **agents only**. If the user + passes a skill name (e.g. one of the skills under `.github/skills/`), + stop and point them to [skill-onboard](skill-onboard.prompt.md). 
+* **`executor: copilot-sdk` everywhere.** This repo standardizes on + the real Copilot SDK executor for both agent and skill evals. + +## Why each step + +* **Refuse-to-overwrite (Step 1)** — atomic onboarding semantics. A + half-written second-pass is harder to recover from than a clean + refusal. Iteration belongs in [agent-improve](agent-improve.prompt.md). +* **Profile from `## Workflow` + `## Non-goals` (Step 2)** — those + sections are the canonical scope contract described in the + [authoring framework spec](https://azure.github.io/git-ape/docs/authoring/framework). + Tasks anchored to them + measure real adherence, not the authoring LLM's interpretation of + the description line. If either section is missing or thin, the + generated tasks will be weak — fix the agent file in + [agent-improve](agent-improve.prompt.md) and re-run onboarding. +* **Hand-authored scaffold, no `waza suggest` (Step 3)** — the + agent-spec frontmatter (`tools`, `argumentHint`, `model`, `agents`, + `user-invocable`) is not in the agentskills.io spec the suggester + validates against, so it would error out or produce malformed + output. Same root cause as why + [agent-promote](agent-promote.prompt.md) excludes `waza check` from + the readiness criteria. +* **`skill_directories: ["."]` in the eval config (Step 4)** — + required for waza to discover the mirrored `.agent.md` and for the + `tool_constraint` grader to fire. The /agent-bench, /agent-improve, + and /agent-promote prompts all rely on this being set; the workflow + setup-bug detection in + [.github/workflows/waza-agent-evals.yml](../workflows/waza-agent-evals.yml) + treats its absence as a hard failure. 
+* **Explicit `tool_constraint` named `agent_tools_implicit` (Step 4)** — + suppresses waza's broken auto-injection (taxonomy mismatch between + VS Code Chat tool IDs and SDK CLI runtime), gives the grader a + matchable expectation, and preserves the grader name that + [agent-promote](agent-promote.prompt.md) criterion 5 keys off. +* **Mirror copy, not symlink (Step 5)** — matches the per-run sync + step in + [.github/workflows/waza-agent-evals.yml](../workflows/waza-agent-evals.yml). + CI runners may not preserve symlink semantics across `actions/checkout`, + and the mirrored bytes are what `git diff` lets reviewers inspect. +* **No manifest edit (Step 5)** — agents are auto-discovered by the + agent-evals workflow from filesystem layout. Adding a manifest entry + would require parallel workflow code changes; the current design + deliberately avoids that coupling. +* **`clean_refusal` on the off-topic task (Step 4)** — agents carry the + identity contract that skills lack. Grading persona-lock here is the + agent-side equivalent of grading `USE FOR:` adherence on a skill — + it measures the contract the agent actually promises. +* **Smoke trial on one model (Step 6)** — establishes baseline + executable status. Cross-model benchmarking is + [agent-bench](agent-bench.prompt.md)'s job; a production-readiness + gate is [agent-promote](agent-promote.prompt.md)'s. +* **Four-way next-step decision (Step 7)** — distinguishes "eval is + healthy, agent needs work" (🟡) from "eval setup bug" (🟠) from + "eval infra failure" (🔴). Each needs a different follow-on action; + collapsing them into a single failure state hides the lever the + user needs to pull next. 
diff --git a/.github/prompts/agent-promote.prompt.md b/.github/prompts/agent-promote.prompt.md new file mode 100644 index 0000000..500c08e --- /dev/null +++ b/.github/prompts/agent-promote.prompt.md @@ -0,0 +1,255 @@ +--- +agent: 'agent' +description: 'Assess whether a custom agent is ready for production: runs the eval suite across pilot-tier models, checks against numeric readiness criteria, and prints a graduation report.' +argument-hint: '[agentName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]' +--- + +# Agent Promote + +Assess whether a custom agent (`.github/agents/.agent.md`) has +earned a "ready for production" stamp. Unlike skills, agents in this +repo do not have a tier system (expanded → pilot); they run in a single +CI matrix today. This prompt therefore functions as a **production- +readiness gate**, not a tier graduation. + +This is the agent-side counterpart of `/skill-promote`. It runs the +full agent eval suite across the pilot-tier model fan-out, applies the +SKILL.md staging trick to score quality, scans the agent file for +persona-lock and structural hygiene, and prints a graduation report. + +This is **advisory**. The prompt produces a readiness report; the +actual "ship it" decision is a human review of the report (and, if +gates fail, a follow-up `/agent-improve` cycle). + +> **Cost notice:** This prompt runs the full agent eval suite four +> times (one per pilot-tier model) plus a `waza quality` audit. Set +> `models` to a subset if quota is limited. Total premium request +> count is roughly `models × tasks × trials_per_task + 1`. + +This is **non-interactive** — it runs to completion and prints a report. + +## Inputs + +* `${input:agentName}`: (Required) Bare agent name (e.g. + `azure-policy-advisor`), matching the basename of + `.github/agents/.agent.md`. If omitted, ask once. +* `${input:models:claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6}`: + (Optional) Comma-separated list of waza model IDs to assess. 
+ Defaults to the full pilot-tier matrix. + +## Readiness Criteria + +An agent is **ready for production** when **all** of the following hold: + +| # | Criterion | Threshold | +|---|-----------|-----------| +| 1 | Positive-task pass rate across all (model × task × trial) | ≥ 90% | +| 2 | Negative-task pass rate (off-topic refusals fire correctly) | ≥ 90% | +| 3 | Token count of the `.agent.md` file | ≤ `tokens.warningThreshold` from `.waza.yaml` | +| 4 | `waza quality` per-dimension score (clarity / completeness / trigger-precision / scope / anti-patterns) | each ≥ 3.5 / 5.0 | +| 5 | `agent_tools_implicit` grader fires on **every** per-model run (proves `skill_directories: ["."]` is wired and tool-list is matchable) | fires on 100% of legs | +| 6 | Persona-lock present in `.agent.md` (grep for `Identity (non-negotiable)` or `Always identify yourself`) | present | +| 7 | No infra-failed legs (`Session not found` or `failed to run grader` in any per-model run's `error_msg`) | 0 | + +If any criterion fails, the agent stays in its current state; the +report explains which gates blocked promotion. + +`waza check` is **excluded** from the criteria list because waza's +compliance checker validates SKILL.md frontmatter (`agentskills.io` +spec) and rejects `.agent.md` frontmatter as malformed. That's not a +real failure — it's a spec mismatch. + +The **`tools:` taxonomy** is **not** automatically validated. The +production `tools:` field lists VS Code Chat tool IDs (`execute`, +`read`, `search`, `vscode`, `todo`, MCP namespaces); the eval runtime +emits SDK CLI short names (`bash`, `view`, `edit`, `create`, `sql`, +`task`). The criterion-5 check (`agent_tools_implicit` fires) is the +closest mechanical proxy for "tool-list is wired correctly". A +deeper taxonomy audit is a manual review item, flagged in the report +when criterion 5 fails. + +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for +every shell command. 
Use `set -uo pipefail` (not `-e`) so a non-zero +`waza run` exit does not abort the assessment. + +### Step 1 — Resolve and verify + +1. Set `agent="${input:agentName}"`. +2. Verify `.github/agents/${agent}.agent.md` exists. If not, stop and + report the missing path. +3. Verify `.github/evals/agents/${agent}/eval.yaml` exists. If not, + stop and report the missing path. A readiness assessment requires + an eval suite. Point the user at + `.github/evals/agents/azure-policy-advisor/` as a reference. +4. Parse `${input:models}` by splitting on commas, trimming whitespace. + Store the result as an array `models` (Step 3 iterates over it). +5. Print a one-line preamble: + `Readiness assessment: <agent> against production criteria`. + +### Step 2 — Resync eval-dir copy + +```bash +cp ".github/agents/${agent}.agent.md" \ + ".github/evals/agents/${agent}/${agent}.agent.md" +``` + +One-shot sync — the production file does not change during the +assessment. The eval-dir copy is the bytes graded by Step 3. + +### Step 3 — Run the eval suite per model + +```bash +mkdir -p /tmp/waza-promote +for model in ${models[@]}; do + echo "▶ Eval: ${model}" + waza run ".github/evals/agents/${agent}/eval.yaml" \ + --model "${model}" \ + --judge-model "claude-sonnet-4.6" \ + --no-cache \ + --output "/tmp/waza-promote/${agent}-${model}.json" \ + 2>&1 | tail -3 +done +``` + +Use `--judge-model claude-sonnet-4.6` for stable cross-model quality +scoring. Run sequentially — quota consumption stays predictable. 
+ +### Step 4 — Token budget check + +```bash +waza tokens count ".github/agents/${agent}.agent.md" --format json \ + > "/tmp/waza-promote/${agent}-tokens.json" +warning_threshold="$(yq -r '.tokens.warningThreshold' .waza.yaml)" +agent_tokens="$(jq -r '.[0].tokens' "/tmp/waza-promote/${agent}-tokens.json")" +echo "Agent tokens: ${agent_tokens} / ${warning_threshold} (warningThreshold)" +``` + +### Step 5 — Quality audit (via SKILL.md staging trick) + +`waza quality` requires a `SKILL.md` filename; staging the agent file +into a NON-DOT path (`waza-agent-stage//SKILL.md`) is the +smallest workaround. `sed` strips the prefix from the output so the +human sees the real agent path. + +```bash +mkdir -p "waza-agent-stage/${agent}" +cp ".github/agents/${agent}.agent.md" \ + "waza-agent-stage/${agent}/SKILL.md" + +waza quality "waza-agent-stage/${agent}/SKILL.md" \ + --model claude-sonnet-4.6 --format json \ + | sed "s|waza-agent-stage/${agent}/SKILL.md|${agent}.agent.md|g" \ + > "/tmp/waza-promote/${agent}-quality.json" 2>&1 + +rm -rf "waza-agent-stage/${agent}" +``` + +Parse per-dimension scores (clarity, completeness, trigger-precision, +scope, anti-patterns) from the JSON for criterion 4. + +### Step 6 — Persona-lock check + +```bash +if grep -qE "Identity \(non-negotiable\)|Always identify yourself" \ + ".github/agents/${agent}.agent.md"; then + persona_lock="present" +else + persona_lock="missing" +fi +echo "Persona-lock: ${persona_lock}" +``` + +A persona-lock block is best-effort (the model's built-in CLI +refusal can still leak through on fully off-topic prompts), but its +presence is a structural minimum for production readiness. + +### Step 7 — Infra-failure scan + +For each `/tmp/waza-promote/${agent}-.json`, count entries in +`tasks[].runs[].error_msg` containing `Session not found` or +`failed to run grader`. Both indicate the eval result was corrupted +by infrastructure noise, not model quality. 
+ +```bash +infra_errors=0 +for f in /tmp/waza-promote/${agent}-*.json; do + count=$(jq '[.tasks[]?.runs[]? | (.error_msg // "") + | select(contains("Session not found") + or contains("failed to run grader"))] + | length' "$f") + infra_errors=$((infra_errors + count)) +done +echo "Infra errors: ${infra_errors}" +``` + +### Step 8 — Render the readiness report + +Aggregate Step 3 per-model JSONs, Step 4 token count, Step 5 quality +scores, Step 6 persona-lock state, and Step 7 infra-error count into +a single markdown block. + +For each criterion, print one of: + +* `✅ PASS` — criterion met (value is on the passing side of its threshold; + note criteria 3 and 7 are upper bounds, criterion 6 is a presence check) +* `❌ FAIL` — criterion not met (and what the value was) + +Then a per-model breakdown table: + +| Model | Positive pass% | Negative pass% | `agent_tools_implicit` fired | Infra errors | +|---|---|---|---|---| +| … | … | … | yes/no | … | + +End with a single decision line: + +* `🟢 RECOMMEND PROMOTE: meets all 7 criteria. Safe for production matrix.` +* `🔴 HOLD: failed N criteria — run /agent-improve iterations=3 to address, then re-assess.` + +When criterion 5 (`agent_tools_implicit` fired) is the only failure, +include a tail line: `Tool-list mismatch likely. Verify the eval declares an explicit tool_constraint grader (per waza#226) using SDK CLI taxonomy, not VS Code Chat tool IDs.` + +Do not modify any workflow file, `.agent.md`, or `eval.yaml` from +this prompt — promotion is a deliberate human-reviewed change. + +## Rules and Constraints + +* **Always pass `--no-cache`.** Cached results from a prior run make + the assessment meaningless. +* **Never parallelize `waza run` calls.** Serial execution keeps + quota consumption predictable and avoids hitting rate limits. +* **Stay scoped to assessment.** Do not edit `.agent.md`, `eval.yaml`, + fixtures, task files, or workflow files. Promotion belongs in a + separate human-reviewed PR. +* **Sync rule.** Step 2 refreshes the eval-dir copy once. Never + hand-edit the eval-dir copy directly. 
+* **Staging cleanup.** Always `rm -rf "waza-agent-stage/${agent}"` + after Step 5, even on failure. The stage dir is workspace-local + scratch space, not a tracked artifact. +* **`executor: copilot-sdk` everywhere.** This repo standardizes on + the real Copilot SDK executor for both agent and skill evals. +* **Cost transparency.** At the start (Step 1) always remind the + user of the estimated premium request count. + +## Why each step + +* **`--no-cache` (Step 3)** — promotion requires a fresh execution + per model; cached results would let stale numbers ride into a + ship/no-ship decision. +* **Single resync at Step 2** — the production agent file is the + source of truth; the eval-dir copy must reflect it before scoring. +* **Staging trick at Step 5** — `waza quality` requires a SKILL.md + filename and silently skips dotted paths (the same .NET + `FileAttributes.Hidden` quirk that bites MSDO template-analyzer). + Staging to `waza-agent-stage//SKILL.md` is the smallest + workaround that lets us run the 5-dim LLM judge on an agent file. +* **Persona-lock grep (Step 6)** — a structural check, not a behavior + test. The presence of the block is necessary but not sufficient; + the production matrix still catches behavior regressions. +* **Infra-failure scan (Step 7)** — `Session not found` and + grader-infra errors silently flatten task scores. Without this + scan, a "passing" assessment could be reading contaminated 0.0s + as legitimate model output. Promotion based on infra-corrupted + data is worse than no promotion at all. +* **No `waza check` criterion** — waza's compliance checker + validates SKILL.md frontmatter spec, not `.agent.md` spec. + Including it would produce a deterministic FAIL with zero signal. 
diff --git a/.github/prompts/skill-bench.prompt.md b/.github/prompts/skill-bench.prompt.md new file mode 100644 index 0000000..6750eea --- /dev/null +++ b/.github/prompts/skill-bench.prompt.md @@ -0,0 +1,136 @@ +--- +agent: 'agent' +description: 'Cross-model benchmark for a single skill: runs waza eval once per model, captures results, compares with waza compare, and prints a one-line winner summary' +argument-hint: '[skillName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]' +--- + +# Skill Bench + +Run a cross-model benchmark against a single skill's eval suite. Executes +`waza run` once per model, captures per-model result JSON to `/tmp/waza-runs/`, +then compares all runs with `waza compare` and prints a one-line summary +identifying the best model and the gap to the next. + +> **Cost notice:** This prompt consumes **one premium Copilot request per +> (model × task × trial)** combination. With 4 models and a suite of 3 tasks, +> that is ≥ 12 premium requests per invocation. Set `models` to a subset if +> quota is limited. + +This is **non-interactive** — it runs to completion and reports results. + +## Inputs + +* `${input:skillName}`: (Required) Skill directory name under + `.github/skills/`. Pass the bare name (e.g. `azure-cost-estimator`), + not a path. If omitted, ask once then proceed. +* `${input:models:claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6}`: + (Optional) Comma-separated list of waza model IDs to benchmark. + Defaults to all four matrix models. Run `waza models` to see the + currently-supported IDs. + +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for every +shell command. Use `set -uo pipefail` (not `-e`) so a non-zero `waza run` +exit (eval below threshold) does not abort the benchmark. + +### Step 1 — Resolve and verify + +1. Set `skill="${input:skillName}"`. +2. Verify `.github/evals/${skill}/eval.yaml` exists. If not, stop and + report the missing path. 
Benchmarking requires an eval suite. +3. Parse `${input:models}` by splitting on commas, trimming whitespace. + Store as an array `models`. If empty or not provided, use the default + list: `claude-sonnet-4.6`, `gpt-5.4`, `gpt-5-codex`, `claude-opus-4.6`. +4. Print a one-line preamble: + `Benchmarking across models: , , ...` + +### Step 2 — Run evals (one per model) + +```bash +mkdir -p /tmp/waza-runs + +for model in ${models[@]}; do + echo "▶ Running: ${model}" + waza run ".github/evals/${skill}/eval.yaml" \ + --model "${model}" \ + --no-cache \ + --output "/tmp/waza-runs/${skill}-${model}-bench.json" \ + 2>&1 | tail -5 + echo " → saved /tmp/waza-runs/${skill}-${model}-bench.json" +done +``` + +**Rules:** +- Pass `--no-cache` on every run. Without it, a cached result from a + previous run makes the comparison meaningless. +- Do not pass `--format` here; the default output is what we want for the + JSON capture. The `waza compare` step formats the results. +- If a model ID is unsupported, `waza run` will exit non-zero; log the + failure and continue to the next model (do not abort the whole bench). +- Do not parallelise the runs (no background `&`). Running serially bounds + memory and makes quota consumption predictable. + +### Step 3 — Compare results + +```bash +# Collect all result files produced in Step 2 +result_files=(/tmp/waza-runs/${skill}-*-bench.json) +if [ ${#result_files[@]} -lt 2 ]; then + echo "⚠ Only ${#result_files[@]} result file(s) found — skipping compare." +else + waza compare "${result_files[@]}" --format table +fi +``` + +If `waza compare` exits non-zero, print the error and continue to Step 4. + +### Step 4 — One-line summary + +Parse the `waza compare` table output (or the per-run score lines from +Step 2 if compare failed). Then print: + +``` +Best model: overall score +Second best: overall score gap: <+/-Δ> +``` + +If only one model produced a valid result, print: +``` +Only one valid result: overall score — no comparison possible. 
+``` + +Then close with a "Next steps" line: +- If best and second-best are close (gap < 0.05): + `"Gap is narrow — consider running with trials_per_task=3 on the best model to confirm."` +- If gap ≥ 0.05: + `"Clear winner: use for this skill in the matrix."` + +## Rules and Constraints + +* **Always pass `--no-cache`.** Results cached from a prior run make the + delta meaningless. +* **Never parallelize `waza run` calls.** Serial execution keeps quota + consumption predictable and avoids hitting rate limits. +* **Respect unsupported model IDs.** If a model fails with an "unsupported" + error, log it and move on — do not abort the entire bench. +* **Stay scoped to eval runs.** Do not edit `SKILL.md`, `eval.yaml`, + fixtures, or task files as part of this prompt. Eval changes belong in + a separate review. +* **No agent files.** waza only evaluates skills. If the user passes an + agent name, stop and point them to `docs/WAZA.md` → "Agent evals". +* **Cost transparency.** At the start (Step 1) always remind the user of + the estimated premium request count: `models × tasks × trials_per_task`. + +## Why each step + +* **`--no-cache` on every run (Step 2)** — a cached result makes the + comparison delta meaningless; the bench only has value if each model is + exercised fresh. +* **Serial runs (Step 2)** — parallel `waza run` calls multiply quota + consumption and can hit rate limits; serial is slightly slower but + predictable and cost-safe. +* **`waza compare` (Step 3)** — produces a structured table normalised + across runs; parsing raw score lines from stdout is fragile. +* **One-line summary (Step 4)** — answers the only question that matters: + which model to use in the matrix, and how confident we should be. 
diff --git a/.github/prompts/skill-improve.prompt.md b/.github/prompts/skill-improve.prompt.md new file mode 100644 index 0000000..bfebb40 --- /dev/null +++ b/.github/prompts/skill-improve.prompt.md @@ -0,0 +1,298 @@ +--- +agent: 'agent' +description: 'Local feedback loop for a single skill: baseline → audit → propose edits → apply (with approval) → re-rank via waza compare. Optionally loops up to 3 rounds for deeper refinement.' +argument-hint: '[skillName=...] [iterations={1|2|3}] [rescoreQuality={true|false}]' +--- + +# Skill Improve + +Run a local improvement loop against a single skill in this repository. +Captures a baseline eval score, gathers token + quality + frontmatter +audits, proposes concrete edits to `SKILL.md`, applies the ones the user +approves, re-runs the eval, and shows the delta. + +By default this is a **single pass** (one round of audit → propose → apply +→ verify). Pass `iterations=2` or `iterations=3` for a deeper refinement +loop that re-runs `waza dev` between rounds with a fresh approval gate per +iteration — useful when driving a `SKILL.md` toward a specific adherence +level methodically. Values above `3` are clamped to bound premium-request +cost. + +This is **interactive**. The protocol always pauses before editing. + +## Inputs + +* `${input:skillName}`: (Required) Skill directory name under + `.github/skills/`. Pass the bare name (e.g. `prereq-check`), not a + path. If omitted, infer from the user's message; otherwise ask once. +* `${input:iterations:1}`: (Optional, defaults to `1`) Number of + audit-propose-apply rounds to run inside this invocation. Hard-capped + at `3` to bound cost. When `iterations > 1`, only `waza dev` re-runs + between rounds (tokens + quality audits stay fixed at their baseline + values from round 1). +* `${input:rescoreQuality:false}`: (Optional, defaults to `false`) + When `true`, re-run `waza quality` after edits and include the + per-dimension before/after delta in the summary. 
Costs one extra + premium Copilot request (the LLM-as-judge call). Off by default to + keep the loop cheap; turn on when trigger-precision or another + quality dimension is the lever you're trying to move. + +## Required Protocol + +Execute the steps below in order. Do not parallelize across steps. +Within step 3, the three audit commands MAY run in parallel. + +Use the workspace root as cwd for every shell command. Use `set -uo +pipefail` (not `-e`) so a non-zero `waza run` exit (eval below +threshold) does not abort the loop. + +### Step 1 — Resolve and verify + +1. Set `skill="${input:skillName}"`. +2. Set `maxIter = min(3, ${input:iterations:1})`. +3. Verify `.github/skills/${skill}/SKILL.md` exists. If not, stop and + report the missing path. +4. Note whether `.github/evals/${skill}/eval.yaml` exists. If not, the + loop runs in **audit-only** mode: skip steps 2, 5, and 6 (baseline, + verify, compare) and warn the + user that score deltas cannot be measured. +5. Print a one-line preamble: + `Improving <skill> (eval: present|absent, iterations: <maxIter>)`. + +### Step 2 — Baseline eval (skip in audit-only mode) + +Run **once**, with cache disabled so a later re-run produces a real +delta even when `eval.yaml` is unchanged: + +```bash +mkdir -p /tmp/waza-runs +waza run ".github/evals/${skill}/eval.yaml" \ + --no-cache \ + --output "/tmp/waza-runs/${skill}-baseline.json" +``` + +Capture the printed score line (e.g. `Score: 0.62`) for the summary. +(`--format` only accepts `default` or `github-comment`; the default is +what we want here.) + +### Step 3 — Initial audits (run in parallel, once) + +Run all three audits to seed iteration 1. Each writes to a temp file so +later steps can read them. 
+ +```bash +waza tokens suggest ".github/skills/${skill}/SKILL.md" --format text \ + > "/tmp/waza-runs/${skill}-tokens.txt" 2>&1 & + +waza quality ".github/skills/${skill}/SKILL.md" \ + --model claude-sonnet-4.6 --format table \ + > "/tmp/waza-runs/${skill}-quality.txt" 2>&1 & + +waza dev "${skill}" --copilot --model claude-sonnet-4.6 \ + > "/tmp/waza-runs/${skill}-dev-iter1.md" 2>&1 & + +wait +``` + +If any of the three exits non-zero, keep going but flag the failure in +the summary. They are advisory inputs, not gates. + +### Step 4 — Iteration loop + +Repeat the audit → propose → approve → apply cycle up to `maxIter` +times. Track iteration index `i` starting at `1`. Maintain a running +log `applied[i] = [list of indices]` for the final summary. + +**For iteration `i = 1`**: skip the `waza dev` re-run below and go +straight to step 4a (the iteration-1 report is already in +`/tmp/waza-runs/${skill}-dev-iter1.md` from step 3). + +**For iteration `i >= 2`**: run a fresh `waza dev` only — tokens and +quality are not re-run between rounds (they're advisory inputs that +rarely shift on small edits, and re-running quality costs a premium +request each time): + +```bash +waza dev "${skill}" --copilot --model claude-sonnet-4.6 \ + > "/tmp/waza-runs/${skill}-dev-iter${i}.md" 2>&1 +``` + +If the report says "target adherence reached" or contains zero +recommendations, **stop the loop** with stop reason `target reached` +or `no recommendations` and jump to step 5. 
+ +#### Step 4a — Read iteration context + +Read into context (do not echo back to the user verbatim — they will +see the synthesis in step 4b): + +* `.github/skills/${skill}/SKILL.md` (current on-disk version) +* `.github/evals/${skill}/eval.yaml` (if present) +* `/tmp/waza-runs/${skill}-tokens.txt` (iter 1 only — reused after) +* `/tmp/waza-runs/${skill}-quality.txt` (iter 1 only — reused after) +* `/tmp/waza-runs/${skill}-dev-iter${i}.md` (fresh each iteration) + +#### Step 4b — Propose edits + +Produce a numbered list of **3 to 7** concrete, actionable edits to +`SKILL.md`. Each item has: + +* **Index** — sequential within this iteration, starts at 1. +* **Lever** — one of: `clarity` | `trigger-precision` | `scope-boundary` + | `token-saving` | `anti-pattern`. +* **Rationale** — one or two sentences citing the audit input that + surfaced the suggestion (e.g. "tokens suggest flagged 280-token + example block" or "quality scored trigger-precision 2/5" or + "`waza dev` iter 2 recommendation: tighten `when-to-use` against + scope overlap with `azure-cost`"). +* **Proposed change** — the exact text to add, modify, or delete. + +Do not invent edits beyond what the audits + your reading of +`SKILL.md` and `eval.yaml` justify. Prefer fewer high-confidence edits +to a long list of speculative ones. + +After the list, ask the user a single question. The options vary by +iteration: + +* **If `maxIter == 1` or this is the final iteration:** + > Reply with the indices to apply (e.g. `1, 3, 5`), `all`, or `skip`. +* **If more iterations remain (`i < maxIter`):** + > Reply with the indices to apply (e.g. `1, 3, 5`), `all`, `skip`, + > or `stop` to end the loop now. + +#### Step 4c — Apply approved edits + +On user response: + +* `stop` — record `applied[${i}] = []` with reason `user stopped`, + exit the loop, jump to step 5. +* `skip` — record `applied[${i}] = []`, continue to iteration `i + 1` + (or exit if `i == maxIter`). 
+* `all` or a list of indices — apply the corresponding edits using + `edit` tool calls (never shell `sed`/`awk`). Record applied indices. + +After applying, increment `i`. If `i > maxIter`, exit the loop with +stop reason `max iterations`. + +### Step 5 — Verify (skip in audit-only mode, skip if no edits applied) + +Re-run with cache disabled: + +```bash +waza run ".github/evals/${skill}/eval.yaml" \ + --no-cache \ + --output "/tmp/waza-runs/${skill}-after.json" +``` + +### Step 6 — Compare (skip in audit-only mode, skip if no edits applied) + +```bash +waza compare \ + "/tmp/waza-runs/${skill}-baseline.json" \ + "/tmp/waza-runs/${skill}-after.json" \ + --format table +``` + +### Step 6b — Re-score quality (only when `rescoreQuality=true`) + +Run a fresh quality judge against the edited `SKILL.md` and capture +the table for the summary. + +```bash +waza quality ".github/skills/${skill}/SKILL.md" \ + --model claude-sonnet-4.6 --format table \ + > "/tmp/waza-runs/${skill}-quality-after.txt" 2>&1 +``` + +Parse both `/tmp/waza-runs/${skill}-quality.txt` (baseline, captured +in step 3) and `/tmp/waza-runs/${skill}-quality-after.txt`. Skip this +step silently when `rescoreQuality` is `false`. + +### Step 7 — Summary + +Print a Markdown summary table with: + +| Metric | Before | After | Δ | +|---|---|---|---| +| Overall score | … | … | … | +| Per-task: | … | … | … | +| SKILL.md tokens | … | … | … | +| Quality (clarity / completeness / trigger-precision / scope / anti-patterns) | … | … | … | + +The Quality row populates with real before/after numbers **only when +`rescoreQuality=true`**. Otherwise show the baseline column from step +3 and write `not re-scored (pass rescoreQuality=true to enable)` in +the After column. + +If `maxIter > 1`, also print a per-iteration breakdown: + +* **Iterations run** — `` of ``. 
+* **Stop reason** — one of: `target reached` | `max iterations` | + `user stopped` | `no recommendations` | `eval missing — only ran + audits` | `no edits applied`. +* **Per-iteration applied items** — bullet list grouped by iteration + index, citing each applied edit's lever. + +Then a "Verdict" paragraph using one of these labels: + +* `IMPROVED` — overall score increased AND no negative-task score + increased AND no positive-task score decreased. +* `MIXED` — overall score increased BUT at least one of: negative-task + score went up, positive-task score went down. +* `REGRESSED` — overall score decreased. +* `NO CHANGE` — overall delta is zero. +* `AUDIT ONLY` — no eval present; only token + quality numbers shown. + +End with a one-line "Next" suggestion: + +* If `IMPROVED`: "Commit and open a PR to verify in CI." +* If `MIXED` or `REGRESSED`: "Re-run `/skill-improve ${skill}` after + reverting or refining the offending edits." +* If `NO CHANGE`: "Audits had no actionable findings. Consider + expanding the eval suite." +* If `AUDIT ONLY`: "Add `.github/evals/${skill}/eval.yaml` to enable + score-based feedback." + +## Rules and Constraints + +* **Always** pass `--no-cache` to `waza run` for both baseline and + verify. Without it, an unchanged `eval.yaml` returns cached results + and the delta is meaningless. +* **Never** auto-apply edits. The approval gate in step 4b is + mandatory on every iteration. +* **Hard cap of 3 iterations.** `${input:iterations}` values above 3 + are clamped to 3. Each extra `waza dev --copilot` round consumes + premium requests; an unbounded loop is a cost trap. +* **One baseline per invocation.** The baseline captured in step 2 is + reused for the final compare in step 6. Do not re-baseline inside + the iteration loop — that would mask cumulative regressions. 
+* **`waza dev` is the only audit re-run between iterations.** Tokens + and quality are advisory inputs that rarely shift on small edits; + re-running quality would also burn a premium request per round. +* **Refuse to label `IMPROVED`** if a negative-task score increased. + Broadening a description to win positives at the cost of negatives + is overfitting; surface it instead of hiding it. +* **Stay scoped to `SKILL.md`.** Do not edit `eval.yaml`, fixtures, + tasks, or unrelated files. `waza dev` only mutates frontmatter; if + a recommendation requires touching `eval.yaml`, fixtures, or tasks, + surface it but do not apply it — those changes belong in a separate + manual review. +* **No agent files.** waza only evaluates skills. If the user passes + an `.agent.md` name, stop and point them at `/agent-improve`. + +## Why every step + +* **Step 2/5 with `--no-cache`** — the verify step needs a real + execution; without `--no-cache` an unchanged spec returns the cached + baseline and delta is always 0. +* **Step 3 in parallel** — the three audits are independent; + `waza dev --copilot` can take 30–60 s while `waza tokens suggest` and + `waza quality` finish faster, so backgrounding them costs nothing and + gives the synthesis step a complete picture in one shot. +* **Step 4 loop with fresh `waza dev`** — `waza dev`'s recommendations + shift between rounds: issues hidden behind round-1 problems only + surface after the round-1 fix lands. Re-running it is the unique + value of `iterations > 1`. +* **Step 4b numbered approval** — gives the user partial-acceptance + control without re-prompting per item. +* **Step 7 verdict labels** — distinguishes the most common failure + mode (overfitting positives at the cost of negatives) from a real + improvement. 
diff --git a/.github/prompts/skill-onboard.prompt.md b/.github/prompts/skill-onboard.prompt.md new file mode 100644 index 0000000..488f5cb --- /dev/null +++ b/.github/prompts/skill-onboard.prompt.md @@ -0,0 +1,449 @@ +--- +agent: 'agent' +description: 'Onboard a new skill into the waza eval harness: scaffold eval.yaml + tasks, patch to repo conventions (hybrid graders, concrete prompts, schema headers), register at the expanded tier in manifest.yaml, then run a smoke trial.' +argument-hint: '[skillName=...] [positiveTasks={2|3|4}] [negativeTasks={1|2}] [smokeModel=claude-sonnet-4.6]' +--- + +# Skill Onboard + +Bootstrap a brand-new eval suite for a skill that currently has no +evaluation. This is **stage 0** of the eval lifecycle — it precedes +[skill-bench](skill-bench.prompt.md), [skill-improve](skill-improve.prompt.md), +and [skill-promote](skill-promote.prompt.md). + +What it produces: + +* `.github/evals/<skill>/eval.yaml` +* `.github/evals/<skill>/tasks/positive-*.yaml` (default 2) +* `.github/evals/<skill>/tasks/negative-*.yaml` (default 2) +* `.github/evals/<skill>/tasks/negative-off-topic.yaml` (one off-topic negative, trigger-grader only) +* A new `{ name: <skill>, tier: expanded }` entry appended to + `.github/evals/manifest.yaml` + +What it does **not** do: + +* Edit `SKILL.md`. Onboarding consumes the skill as-is. Use + [skill-improve](skill-improve.prompt.md) for SKILL.md edits. +* Promote the skill to the pilot tier. That gate is + [skill-promote](skill-promote.prompt.md), and only after the skill has + matured in the expanded tier. + +This is **interactive**. The protocol pauses for approval before writing +the manifest entry and before running the smoke trial. + +> **Cost notice:** Step 3 (`waza suggest --apply`) consumes ~1 premium +> request to author the initial scaffold. Step 7 (smoke trial) consumes +> `trials_per_task × len(tasks)` premium requests (default `2 × 5 = 10`) +> plus per-task judge calls (one `answer_quality` judge per positive +> trial). 
Total budget ≈ **12–16 premium requests** per invocation — +> larger than a single-trial scaffold but enough to surface obvious +> flakes during onboarding. + +## Inputs + +* `${input:skillName}`: (Required) Skill directory name under + `.github/skills/`. Pass the bare name (e.g. `azure-naming-research`), + not a path. If omitted, infer from the user's message; otherwise ask + once. +* `${input:positiveTasks:2}`: (Optional, defaults to `2`) How many + positive trigger tasks to scaffold. Hard-capped at `4` to bound cost. + Each positive task gets a hybrid grader pair: `trigger` (heuristic) + + `answer_quality` (LLM judge, `continue_session: true`). +* `${input:negativeTasks:2}`: (Optional, defaults to `2`) How many + negative trigger tasks to scaffold. Hard-capped at `2`. Defaulting to + the cap gives every new skill at least two in-domain refusal cases on + top of the dedicated off-topic task — single-negative scaffolds tend + to under-cover the `DO NOT USE FOR:` boundary. Negative tasks carry + the `trigger` grader only — refusals shouldn't call tools or be + graded on answer quality. +* `${input:smokeModel:claude-sonnet-4.6}`: (Optional) Model to use for + the final smoke trial. Default is the cheapest stable model in the + matrix; override only if the skill is model-sensitive. + +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for +every shell command. Use `set -uo pipefail` (not `-e`) so a non-zero +`waza run` exit (eval below threshold on first run) does not abort the +onboarding. + +### Step 1 — Resolve and verify + +1. Set `skill="${input:skillName}"`. +2. Set `nPos = min(4, ${input:positiveTasks:2})`, + `nNeg = min(2, ${input:negativeTasks:2})`. +3. Verify `.github/skills/${skill}/SKILL.md` exists. If not, stop and + report the missing path. +4. 
**Refuse to overwrite.** If `.github/evals/${skill}/` already exists, + stop and tell the user to use [skill-improve](skill-improve.prompt.md) + or delete the directory manually first. Onboarding is a greenfield + operation. +5. Check the manifest. If `.github/evals/manifest.yaml` already lists + `name: ${skill}`, stop and report — the skill is already registered. +6. Print a one-line preamble: + `Onboarding with positive + negative + 1 off-topic task; smoke model: `. + +### Step 2 — Profile the skill + +Read `.github/skills/${skill}/SKILL.md` and extract: + +1. The `description:` frontmatter line. +2. The `USE FOR:` list — these become **positive** task prompts. Pick + the `nPos` most distinct phrasings. If `USE FOR:` is missing, scan + the body for "When to use" / "Triggers on" / "Triggers:" and use + those phrasings. +3. The `DO NOT USE FOR:` list — these become **negative** task prompts. + Pick the `nNeg` most distinct phrasings. +4. The skill's primary domain (1–3 words; e.g. "Azure naming", + "Bicep templates", "prereq tooling"). This drives the off-topic + negative task — pick a topic clearly outside the domain (e.g. "Linux + kernel scheduling internals" for an Azure skill) so the trigger + grader can confirm the skill stays inactive. + +Echo the extracted profile so the human can sanity-check before any +files are written. + +### Step 3 — Scaffold with `waza suggest` + +Generate the initial eval scaffold using waza's LLM-driven suggester. +Stage to a temp directory first so we can patch and move atomically. + +```bash +mkdir -p /tmp/waza-onboard/${skill} +waza suggest ".github/skills/${skill}" \ + --apply \ + --output-dir "/tmp/waza-onboard/${skill}" \ + --model "claude-sonnet-4.6" \ + 2>&1 | tail -10 +``` + +If `waza suggest` fails (e.g. 
LLM unavailable), **fall back to the +deterministic scaffold**: + +```bash +waza new eval "${skill}" \ + --output "/tmp/waza-onboard/${skill}/eval.yaml" \ + 2>&1 | tail -5 +``` + +Either way, verify the staged tree contains `eval.yaml` and at least +one task file. If not, stop and report. + +### Step 4 — Patch to repo conventions + +The waza-generated scaffold is generic. Patch it to match the +prereq-check reference suite ([.github/evals/prereq-check/eval.yaml](.github/evals/prereq-check/eval.yaml)). +Apply ALL of the following edits to the staged files before they leave +`/tmp/waza-onboard/${skill}/`. + +**`eval.yaml` patches:** + +1. Top-of-file schema comment: prepend + `# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json`. +2. `config:` block must include: + ```yaml + config: + # 2 trials catches the obvious LLM nondeterminism flakes (single trial + # = no flake signal). Pilot tier bumps to 3 via /skill-promote. + trials_per_task: 2 + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + ``` +3. `metrics:` block must contain exactly one entry: + ```yaml + metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: + ``` +4. `graders:` block at the eval level must contain ONLY the `budget` + behavior grader. **Do NOT add `skill_invocation` with `required_skills:` + here** — eval-level prompt graders fire on every task including + negatives, drag every leg by ~25%, and produce zero model signal + (the same score across all models proves it's noise, not value). + ```yaml + graders: + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + ``` +5. `tasks:` block: `["tasks/*.yaml"]`. +6. Remove any `tool_constraint` grader waza auto-injected at the eval + root. 
To suppress the auto-injection, add the no-op pattern from the + eval-harness convention: + ```yaml + _suppress_auto_inject: + type: tool_constraint + reject_tools: [{tool: "^___never_matches___$"}] + ``` + +**Per-task patches (all task files — apply first):** + +`waza new eval` (and `waza suggest`) emit each task file with two +defects that MUST be cleaned up before patching graders: + +1. **Prepend the task-schema header** as line 1 of every task file: + ```yaml + # yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + ``` + Without this, VS Code applies the eval-schema to task files and + surfaces ~6 false-positive lint errors per file ("Property graders + is not allowed", etc.). +2. **Remove any `expected:` field** the scaffold emitted. The waza + task JSON schema accepts the shape `expected:\n should_trigger: true`, + but the **Go runtime parser rejects it** + (`cannot unmarshal !!seq into models.TaskExpectation`) and the suite + fails to load. Graders alone drive pass/fail — reference + [.github/evals/prereq-check/tasks/](.github/evals/prereq-check/tasks/) + which has no `expected:` field on any task. + +**Per-task patches (`tasks/positive-*.yaml`):** + +Before patching graders, **rewrite `inputs.prompt:` to a concrete, +answerable scenario** derived from the `USE FOR:` phrasings extracted +in Step 2. The scaffold emits generic placeholders like `"Use +to help me complete this task"` which cause the agent to ask for +clarification → `answer_quality` fails deterministically. Reference +shape: prereq-check's `positive-command-not-found` task uses +`"az: command not found — what tools should be installed for Git-Ape +skills?"` — concrete, scenario-grounded, immediately answerable. + +Each positive task MUST have a hybrid grader pair. The `prompt` grader +needs `continue_session: true` — without it the judge has zero access +to the agent's response and scores fail/pass at random. 
+ +```yaml +graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills//SKILL.md + mode: positive + threshold: 0.5 + + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + + + PASS criteria — the response must contain ALL of: + 1. + 2. + 3. + 4. + + If ALL criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. +``` + +**Per-task patches (`tasks/negative-*.yaml`):** + +Negative tasks carry the `trigger` grader only. No `answer_quality` +grader — a refusal that's syntactically correct shouldn't be graded on +"how good is the answer". + +```yaml +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills//SKILL.md + mode: negative + threshold: 0.5 +``` + +**New file — `tasks/negative-off-topic.yaml`:** + +Author from scratch as a trigger-only negative task targeting the +off-topic domain identified in Step 2. **Do NOT add a `clean_refusal` +prompt grader.** Skills are passive guidance — persona-lock and identity +contracts belong to `.agent.md` mirrors, not SKILL.md. Grading a skill +on refusal language confuses two surfaces and produces deterministic +0.0 noise when the model simply answers the off-topic question (which +it will, absent agent-level identity enforcement). The trigger grader +alone is the right signal: it confirms the skill stays inactive on +out-of-scope prompts. + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json +id: negative-off-topic +name: Negative — Off-topic prompt +description: Off-topic prompt should not trigger this skill. 
+tags: [trigger, negative] +inputs: + prompt: "<off-topic prompt from Step 2>" +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/<skill>/SKILL.md + mode: negative + threshold: 0.5 +``` + +### Step 5 — Approval gate + move into place + +Show the human a tree listing of the staged files plus a diff-style +summary of what will be written. Then pause and ask: + +> Apply this eval suite to `.github/evals/${skill}/` and register +> `${skill}` at the **expanded** tier in `manifest.yaml`? (yes / no) + +If `yes`: + +```bash +mkdir -p ".github/evals/${skill}" +cp -R "/tmp/waza-onboard/${skill}/." ".github/evals/${skill}/" +``` + +Then append to `.github/evals/manifest.yaml`. The expanded tier is the +correct landing zone — pilot is reserved for skills with proven cross- +model stability (gated by [skill-promote](skill-promote.prompt.md)). + +```yaml +# Append under skills:, after existing entries + - name: <skill> + tier: expanded +``` + +Use `yq` to apply the manifest edit in place. The `+=` append is not +idempotent on its own — the Step 1 manifest check is what prevents a +duplicate entry: + +```bash +yq -i ".skills += [{\"name\": \"${skill}\", \"tier\": \"expanded\"}]" \ + .github/evals/manifest.yaml +``` + +If `no`, stop without writing anything. Leave the staged files in +`/tmp/waza-onboard/${skill}/` for the user to inspect manually. + +### Step 6 — Validate + +`waza check` is a **skill-side** validator (compliance scoring, token +budget, frontmatter). There is currently no first-class linter for +`.github/evals/<skill>/eval.yaml` — Step 7's smoke trial is the runtime +validator. Run `waza check` against the **skill** directory, not the +eval directory (the latter returns `no SKILL.md found`): + +```bash +waza check ".github/skills/${skill}" 2>&1 | tail -30 +``` + +Triage the output: + +* **Blocking failures** — stop and surface; do NOT proceed to the smoke + trial. These include: compliance score `Low`, broken or missing + frontmatter, missing `name:` or `description:`, invalid YAML. +* **Advisory findings** — surface but do NOT block. 
Common advisories + that should NOT halt onboarding: token-budget overage, hardcoded + URLs, missing examples section, complexity score, body-structure + recommendations on an otherwise-passing skill. + +### Step 7 — Smoke trial (single model) + +Confirm the suite executes end-to-end. This is a cheap signal that the +graders wire up correctly — not a quality assessment. + +```bash +mkdir -p /tmp/waza-runs +waza run ".github/evals/${skill}/eval.yaml" \ + --model "${input:smokeModel:claude-sonnet-4.6}" \ + --judge-model "claude-sonnet-4.6" \ + --no-cache \ + --output "/tmp/waza-runs/${skill}-smoke.json" \ + 2>&1 | tail -10 +``` + +Parse the result JSON and report: + +* `summary.aggregate_score` +* Per-task `tasks[].stats.avg_score` +* Any tasks with `runs[].status == "error"` and their `error_msg` — + these indicate grader-infra failures and must be investigated before + the suite is trusted (see workflow-level retry pattern in + [.github/workflows/waza-evals.yml](.github/workflows/waza-evals.yml)). + +### Step 8 — Render next-step + +Print a single decision line, then a one-line next-step: + +* **All tasks scored, no errors:** + `🟢 ONBOARDED: <skill> registered at expanded tier. Smoke trial scored <score>.` + Next: `Run /skill-bench ${skill} to benchmark across the expanded tier (2 models, ~20 premium requests at trials=2).` + +* **Some tasks scored 0.0 cleanly (model failed criteria):** + `🟡 ONBOARDED with weak signal: <skill> registered at expanded tier but smoke scored <score>. Inspect failing tasks.` + Next: `Run /skill-improve ${skill} to tighten SKILL.md, OR edit the eval grader criteria.` + +* **Any task had `runs[].status == "error"`:** + `🔴 ONBOARDED but smoke FAILED: <N> task(s) errored — see /tmp/waza-runs/${skill}-smoke.json.` + Next: `Investigate grader-infra errors (session-not-found, judge unreachable) before relying on this eval.` + +## Rules and Constraints + +* **Greenfield only.** This prompt refuses to overwrite an existing + `.github/evals/${skill}/` directory. 
Iteration belongs to + [skill-improve](skill-improve.prompt.md), not here. +* **Expanded tier is the only landing zone.** Never write `tier: pilot` + from this prompt — promotion is gated separately and requires + cross-model evidence. +* **Approval gate before file writes.** Step 5 pauses for human review. + Do not bypass. +* **No SKILL.md edits.** Onboarding is non-destructive to the skill + itself. If SKILL.md is malformed (no `USE FOR:` / `DO NOT USE FOR:`), + Step 2 stops and redirects to [skill-improve](skill-improve.prompt.md). +* **`continue_session: true` on every prompt grader.** Without it the + judge cannot see the agent's response and scores oscillate. This is + encoded in Step 4's templates — do not omit. +* **No `skill_invocation` grader with `required_skills:` at eval level.** + Eval-level prompt graders fire on EVERY task (including negatives) and + produce deterministic 0.0 noise. Removed in commit `2f699c79` from + git-ape-onboarding for this reason. +* **No `clean_refusal` grader on negative tasks.** Skills don't enforce + identity contracts — persona-lock belongs to `.agent.md` mirrors. Use + the trigger grader alone on negative and off-topic tasks; let the + smoke trial confirm the skill stays inactive on out-of-scope prompts. +* **Strip scaffold cruft before patching.** `waza new eval` emits each + task file without the `yaml-language-server` schema header and with + an `expected:` field the runtime parser rejects. Step 4's first + per-task patch removes both — do not skip it. +* **No agent files.** This prompt onboards **skills only**. If the user + passes an agent name (e.g. one of the agents under `.github/agents/`), + stop and point them to [agent-onboard](agent-onboard.prompt.md) (or to + the `.agent.md` mirror convention if onboarding a hybrid). + +## Why each step + +* **Refuse-to-overwrite (Step 1)** — atomic onboarding semantics. A + half-written second-pass is harder to recover from than a clean refusal. 
+* **Profile from `USE FOR:` / `DO NOT USE FOR:` (Step 2)** — those + sections are the canonical scope contract per the agent-customization + conventions. Tasks anchored to them measure real adherence, not the + authoring LLM's interpretation. +* **`waza suggest --apply` to staging (Step 3)** — the LLM-driven scaffold + is faster than hand-authoring, but its defaults need patching. Staging + to `/tmp/` makes the patching atomic. +* **Patch to prereq-check conventions (Step 4)** — hybrid graders, + `continue_session: true`, no eval-level `skill_invocation`. Every one + of these is a hard-won lesson encoded in the harness; the scaffold + alone doesn't produce them. +* **Approval gate (Step 5)** — manifest edits and `.github/evals/` + writes touch CI matrix dispatch. Human review is non-negotiable. +* **`waza check` before smoke (Step 6)** — fail fast on SKILL.md + compliance failures (broken frontmatter, low compliance score) before + spending premium requests. Advisory findings (token budget, missing + examples) surface but do not block, since `waza check` covers the + skill side only — Step 7's smoke trial is the eval-side runtime + validator. +* **Smoke trial on one model (Step 7)** — establishes baseline executable + status. Cross-model benchmarking is `/skill-bench`'s job. +* **Three-way next-step decision (Step 8)** — distinguishes "eval is + healthy, skill needs work" from "eval is broken". The two need different + follow-on prompts. diff --git a/.github/prompts/skill-promote.prompt.md b/.github/prompts/skill-promote.prompt.md new file mode 100644 index 0000000..786e585 --- /dev/null +++ b/.github/prompts/skill-promote.prompt.md @@ -0,0 +1,116 @@ +--- +agent: 'agent' +description: 'Assess whether a skill in the expanded eval tier is ready to graduate to the pilot tier (full 4-model fan-out). Runs the eval suite, checks against numeric promotion criteria, and prints a graduation report.' 
+argument-hint: '[skillName=...]' +--- + +# Skill Promote + +Assess whether a skill that lives in the **expanded eval tier** (2 models in +CI: `claude-sonnet-4.6` + `gpt-5-codex`) has earned promotion to the **pilot +tier** (full 4-model fan-out + `trials_per_task: 3` flake detection). + +This is **advisory**. The prompt produces a graduation report; an actual +promotion is a separate code change to remove the skill's `exclude:` entries +from `.github/workflows/waza-evals.yml` and `waza-trends.yml` (and to bump +`config.trials_per_task` in the skill's `eval.yaml`). + +> **Cost notice:** This prompt runs the full skill eval suite four times +> (one per pilot-tier model) plus a tokens profile. Set `models` to a subset +> if quota is limited. + +This is **non-interactive** — it runs to completion and prints a report. + +## Inputs + +* `${input:skillName}`: (Required) Skill directory name under + `.github/skills/` (bare name, not a path). If omitted, ask once. +* `${input:models:claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6}`: + (Optional) Comma-separated list of waza model IDs to benchmark for + promotion. Defaults to the full pilot-tier matrix. + +## Promotion Criteria + +A skill is **ready to promote** when **all** of the following hold: + +| # | Criterion | Threshold | +|---|-----------|-----------| +| 1 | Trigger precision — every model in the bench scores at or above the eval's `metrics.trigger_precision.threshold` | ≥ each model's threshold (default 0.6) | +| 2 | Behavior grader pass rate across all (model × task × trial) | ≥ 90% | +| 3 | Prompt grader (`answer_quality`) median score across all legs | ≥ 4.0 / 5.0 | +| 4 | Token count of the skill's `SKILL.md` | ≤ `tokens.warningThreshold` from `.waza.yaml` | +| 5 | No `waza check` errors (compliance) | 0 errors | +| 6 | `eval.yaml` already validates against upstream schema | OK | + +If any criterion fails, the skill stays in the expanded tier; the report +explains which gates blocked promotion. 
+ +## Required Protocol + +Execute the steps below in order. Use the workspace root as cwd for every +shell command. Use `set -uo pipefail` (not `-e`) so a non-zero `waza run` +exit does not abort the assessment. + +### Step 1 — Resolve and verify + +1. Set `skill="${input:skillName}"`. +2. Verify `.github/evals/${skill}/eval.yaml` exists. If not, stop and + report the missing path. +3. Verify `.github/skills/${skill}/SKILL.md` exists. If not, stop. +4. Parse `${input:models}` by splitting on commas, trimming whitespace. +5. Print a one-line preamble: + `Promotion assessment: <skill> against pilot-tier criteria`. + +### Step 2 — Run the eval suite per model + +```bash +mkdir -p /tmp/waza-promote +for model in ${models[@]}; do + echo "▶ Eval: ${model}" + waza run ".github/evals/${skill}/eval.yaml" \ + --model "${model}" \ + --judge-model "claude-sonnet-4.6" \ + --no-cache \ + --output "/tmp/waza-promote/${skill}-${model}.json" \ + 2>&1 | tail -3 +done +``` + +Use `--judge-model claude-sonnet-4.6` for stable cross-model quality +scoring. Run sequentially — quota consumption stays predictable. + +### Step 3 — Token budget check + +```bash +waza tokens count ".github/skills/${skill}/SKILL.md" --format json \ + > "/tmp/waza-promote/${skill}-tokens.json" +warning_threshold="$(yq -r '.tokens.warningThreshold' .waza.yaml)" +skill_tokens="$(jq -r '.[0].tokens' "/tmp/waza-promote/${skill}-tokens.json")" +echo "Skill tokens: ${skill_tokens} / ${warning_threshold} (warningThreshold)" +``` + +### Step 4 — Compliance check + +```bash +waza check ".github/skills/${skill}" \ + > "/tmp/waza-promote/${skill}-check.txt" 2>&1 || true +errors="$(grep -ciE '^(error|✗|fail)' "/tmp/waza-promote/${skill}-check.txt" || true)" +echo "Compliance errors: ${errors}" +``` + +### Step 5 — Render the graduation report + +Aggregate the per-model JSONs from Step 2, the token count from Step 3, +and the compliance result from Step 4 into a single markdown block. 
+ +For each criterion, print one of: +- `✅ PASS` — criterion met +- `❌ FAIL` — criterion not met (and what the value was) + +End with a single decision line: + +- `🟢 RECOMMEND PROMOTE: <skill> meets all 6 criteria. Open a PR to remove its exclude entries from waza-evals.yml + waza-trends.yml and bump trials_per_task to 3.` +- `🔴 HOLD IN EXPANDED TIER: <skill> failed N criteria — fix and re-assess.` + +Do not modify any workflow file or `eval.yaml` from this prompt — promotion +is a deliberate human-reviewed change. diff --git a/.github/references/README.md b/.github/references/README.md new file mode 100644 index 0000000..092ea6f --- /dev/null +++ b/.github/references/README.md @@ -0,0 +1,85 @@ +--- +title: "Shared references corpus" +description: "L2 grounding corpus shared across skills. Conventions, snapshot format, and refresh procedure." +--- + +## What this directory is + +The **L2 grounding layer** of the [authoring framework](https://azure.github.io/git-ape/docs/authoring/framework#the-grounding-contract). + +Each subdirectory under `.github/references/` is a curated, dated snapshot of an authoritative external source — Microsoft Learn pages, OpenAPI specs, regex tables, JSON schemas, vendor docs. + +Skills consume this corpus by relative path. The corpus is human-readable markdown by design; this is not an embedding index, not a vector store, and not RAG infrastructure. + +## When a reference belongs here vs. inside a skill + +| Situation | Location | +|---|---| +| Used by one skill, unlikely to be shared | `.github/skills/<skill>/references/` (skill-local) | +| Used by two or more skills, or likely to be | `.github/references/<topic>/` (shared, here) | +| Skill-internal regex tables, prompt fragments, output schemas | `.github/skills/<skill>/references/` (skill-local) | +| Canonical vendor doc snapshot (CAF abbreviations, RBAC roles, API versions, etc.) | `.github/references/<topic>/` (shared, here) | + +When in doubt, start skill-local. 
Hoist to shared the first time a second skill needs the same canon — do not copy-paste. + +## Snapshot file format + +Every reference file MUST start with this frontmatter: + +```yaml +--- +source: +snapshot: YYYY-MM-DD +refresh_command: +--- +``` + +* **`source`** — the canonical upstream URL. Used by graders to verify citations. +* **`snapshot`** — the date the content was last verified against upstream. Used by the staleness check. +* **`refresh_command`** — exact command to re-fetch and regenerate. Should be idempotent; should fail loudly if the upstream shape changed. + +After the frontmatter, the body is normal markdown. Prefer tables, lists, and structured sections over prose — skills consume this content programmatically. + +## Directory layout + +```text +.github/references/ +├── README.md ← this file +└── / + ├── .md ← snapshot file (with source/snapshot/refresh_command frontmatter) + └── .md +``` + +A `` directory is a coherent corpus. Examples (illustrative; create them on demand, do not pre-stub): + +* `azure-caf/` — Cloud Adoption Framework abbreviations and naming rules +* `azure-rbac/` — built-in role definitions snapshot +* `openssf-scorecard/` — Scorecard check definitions + +## Refresh procedure + +1. Pick a reference file you suspect is stale (or that the staleness check flagged). +2. Run the file's `refresh_command`. +3. Inspect the diff. If the upstream shape changed (new columns, removed entries, schema break), the refresh command should fail and surface the diff — do not silently overwrite. +4. Update the `snapshot:` date. +5. Run the skill's eval suite to catch any regression caused by content drift. +6. Commit the snapshot update separately from any skill changes that depend on it. + +## Staleness thresholds + +Per-canon thresholds live next to the topic's files (in a `_meta.yaml` or similar) once the corpus has more than one snapshot. Default if unset: **90 days**. The quality gate (`waza check` or equivalent) flags older snapshots. 
+ +## What does NOT belong here + +* **Generated artifacts.** If a file can be regenerated deterministically from another file, regenerate at runtime; do not snapshot the derivative. +* **Skill-internal scaffolding.** Output schemas, prompt fragments, and grader rubrics are skill-local. +* **Secrets, tokens, or credentials.** Snapshots are public source content only. +* **Large binary data.** Tables, lists, structured markdown only. If a corpus needs binary data, that is a sign it should be fetched live (L3), not snapshotted (L2). + +## Adding a new topic + +1. Create `.github/references//`. +2. Add the first snapshot file with the required frontmatter. +3. Update the skill that consumes it to point at the shared path instead of any skill-local copy. +4. Delete the skill-local copy if one existed. +5. Add a one-line entry to the directory layout above so future authors can find it. diff --git a/.github/skills/prereq-check/SKILL.md b/.github/skills/prereq-check/SKILL.md index 91c2519..962e4c9 100644 --- a/.github/skills/prereq-check/SKILL.md +++ b/.github/skills/prereq-check/SKILL.md @@ -1,177 +1,147 @@ --- name: prereq-check -description: "Check that all required CLI tools are installed, meet minimum versions, and have active auth sessions. Shows platform-specific install commands for anything missing." +description: "Validate Git-Ape CLI tool installation (az, gh, jq, git), versions, and auth sessions. Shows platform-specific install commands for anything missing. USE FOR: check Git-Ape prerequisites, what do I need to install for Git-Ape, verify Git-Ape CLI tools, az: command not found, gh: command not found, jq: command not found, git: command not found, az missing, gh missing, jq missing, git missing, fresh machine setup for Git-Ape, dev container setup for Git-Ape, before running git-ape-onboarding, az login required, gh auth login, auth expired, not logged in, outdated az version, minimum az version, upgrade az. DO NOT USE FOR: Anything else. 
This skill is narrowly scoped to prerequisites checks for Git-Ape's CLI tools and auth sessions. Do not use it for any other purpose." argument-hint: "Run without arguments to check all prerequisites" user-invocable: true +license: MIT +metadata: + author: Git-Ape + version: "0.1.0" --- # Prerequisites Check -Validates the local environment has the CLI tools and auth sessions needed to run Git-Ape skills. +Validate that the local environment has the CLI tools and auth sessions needed to run Git-Ape skills. Print platform-specific install commands and PATH-repair guidance for anything missing or version-stale. + +## Quick Reference + +| Property | Value | +|----------|-------| +| Best for | First-time setup, `command not found` triage, dev container validation | +| Required binaries | `az` ≥ 2.50, `gh` ≥ 2.0, `jq` ≥ 1.6, `git` (any) | +| Required auth | `az login`, `gh auth login` | +| Shell | bash on macOS/Linux, PowerShell 7+ on Windows | +| MCP tools | None — runs locally via shell | +| Related skills | `git-ape-onboarding` (next step), `azure-validate` (deployment-time checks) | +| Side effects | Read-only — never installs or modifies anything | ## When to Use - Before first-time onboarding (`/git-ape-onboarding`) -- When any Git-Ape skill fails with a "command not found" error -- When switching machines or dev containers -- When a user asks "what do I need to install?" +- When any Git-Ape skill fails with `command not found` +- When the user reports a missing binary in their prompt (e.g., `az: command not found`) +- After switching machines, shells, or dev containers +- When the user asks "what do I need to install?" 
-## Required Tools +## Rules -| Tool | Binary | Minimum Version | Purpose | -|------|--------|-----------------|---------| -| Azure CLI | `az` | 2.50 | Azure resource management, RBAC, deployments | -| GitHub CLI | `gh` | 2.0 | Repo secrets, environments, PR operations | -| jq | `jq` | 1.6 | JSON parsing in scripts and workflows | -| git | `git` | any | Version control (usually pre-installed) | +1. **Run read-only** — never `brew install`, `apt-get install`, or any state-changing command. Print the commands; the user runs them. +2. **Trust user reports** — if the user reports a tool missing, treat it as ⚠️ even when this terminal can find it (different shell, PATH, container, or machine). +3. **Stop at first blocking failure** — do not continue to auth checks while any tool is ❌. +4. **Do not chain into other skills** — never auto-invoke `git-ape-onboarding`; tell the user to run it after `READY`. -## Execution Playbook +## Steps -Run the steps below in order. Present results as a table. Stop at the first blocking failure. +| # | Action | Reference | +|---|--------|-----------| +| 1 | **Detect Platform** — `uname -s` / `uname -m` on bash, `$PSVersionTable.OS` on PowerShell → macOS / Linux (apt vs dnf) / Windows (PowerShell 7+) | inline | +| 2 | **Scan Prompt for Reported Missing Tools** — match `<tool>: command not found`, `command not found: <tool>`, `<tool> is not installed` | inline | +| 3 | **Run Tool Check** — macOS/Linux: `bash scripts/check-tools.sh` · Windows: `pwsh -File scripts/check-tools.ps1` | [scripts/check-tools.sh](scripts/check-tools.sh), [scripts/check-tools.ps1](scripts/check-tools.ps1) | +| 4 | **Present Status Table** — pass/fail with found vs. 
minimum version | See [Status Table](#status-table) | +| 5 | **Show Install / PATH Repair** — only for ❌ and ⚠️ entries, scoped to platform | [references/install-commands.md](references/install-commands.md) | +| 6 | **Check Auth Sessions** — only if Step 4 reports all tools ✅ | See [Auth Checks](#auth-checks) | +| 7 | **Emit Verdict** — exactly one of READY / TOOLS MISSING / REPORTED MISSING / AUTH MISSING | See [Outputs](#outputs) | -### Step 1: Detect Platform +### Status Table -```bash -OS="$(uname -s)" -ARCH="$(uname -m)" -echo "Platform: $OS / $ARCH" -``` +`scripts/check-tools.sh` prints a `Platform: …` line followed by one tab-separated row per tool with the columns `tool`, `status`, `found`, `minimum`. Render the rows as: -Map the result for install instructions: -- `Darwin` → macOS -- `Linux` → Linux (check for `apt-get` vs `yum`/`dnf` to narrow distro) -- `MINGW*` / `MSYS*` → Windows (git-bash) +| Tool | Status | Found | Required | +|------|--------|-------|----------| +| az | ✅ / ⚠️ / ❌ | x.y.z | 2.50 | +| gh | ✅ / ⚠️ / ❌ | x.y.z | 2.0 | +| jq | ✅ / ⚠️ / ❌ | x.y | 1.6 | +| git | ✅ / ❌ | x.y.z | any | -### Step 2: Check Each Tool +Status mapping: -```bash -# --- az (Azure CLI) — required, minimum 2.50 --- -if command -v az &>/dev/null; then - AZ_VER=$(az version --query '"azure-cli"' -o tsv 2>/dev/null) - echo "az: $AZ_VER" -else - echo "az: NOT FOUND" -fi - -# --- gh (GitHub CLI) — required, minimum 2.0 --- -if command -v gh &>/dev/null; then - GH_VER=$(gh --version 2>/dev/null | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') - echo "gh: $GH_VER" -else - echo "gh: NOT FOUND" -fi - -# --- jq — required, minimum 1.6 --- -if command -v jq &>/dev/null; then - JQ_VER=$(jq --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+[a-z]*') - echo "jq: $JQ_VER" -else - echo "jq: NOT FOUND" -fi - -# --- git — required (usually pre-installed) --- -if command -v git &>/dev/null; then - GIT_VER=$(git --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+') - echo "git: $GIT_VER" -else - echo "git: NOT FOUND" -fi -``` +- `OK` → ✅ +- `OUTDATED` or `MISSING` → ❌ 
+- Reported missing in Step 2 but `OK` in this terminal → ⚠️ with note `reported missing by user` -### Step 3: Present Results +### Auth Checks -Show a table with pass/fail status: +macOS / Linux (bash): -| Tool | Status | Found Version | Minimum Required | -|------|--------|---------------|------------------| -| az | ✅ / ❌ | x.y.z | 2.50 | -| gh | ✅ / ❌ | x.y.z | 2.0 | -| jq | ✅ / ❌ | x.y | 1.6 | -| git | ✅ / ❌ | x.y.z | any | +```bash +az account show --query "{name:name,id:id,tenantId:tenantId}" -o table 2>/dev/null \ + || echo "❌ Not logged in to Azure. Run: az login" -Mark a tool ❌ if it is missing OR below the minimum version. +gh auth status 2>/dev/null \ + || echo "❌ Not logged in to GitHub. Run: gh auth login" +``` -### Step 4: Show Install Commands (only if something is missing) +Windows (PowerShell 7+): -Show install commands only for missing or outdated tools, matching the detected platform. +```powershell +az account show --query "{name:name,id:id,tenantId:tenantId}" -o table 2>$null +if (-not $?) { Write-Output "❌ Not logged in to Azure. Run: az login" } -**macOS (Homebrew):** -```bash -brew install azure-cli # az -brew install gh # GitHub CLI -brew install jq # jq -brew install git # git (if missing) +gh auth status 2>$null +if (-not $?) { Write-Output "❌ Not logged in to GitHub. 
Run: gh auth login" } ``` -**Ubuntu / Debian:** -```bash -# az — Microsoft repository -curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -# gh — GitHub repository -(type -p wget >/dev/null || sudo apt-get install wget -y) \ - && sudo mkdir -p -m 755 /etc/apt/keyrings \ - && out=$(mktemp) && wget -nv -O"$out" https://cli.github.com/packages/githubcli-archive-keyring.gpg \ - && cat "$out" | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ - && sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ - && sudo apt-get update && sudo apt-get install gh -y - -# jq -sudo apt-get install -y jq -``` +## Outputs -**RHEL / Fedora:** -```bash -# az -sudo rpm --import https://packages.microsoft.com/keys/microsoft.asc -sudo dnf install -y azure-cli +A single chat message containing: -# gh -sudo dnf install -y gh +1. **Status table** from Step 4. +2. **Install / PATH repair commands** for ❌ and ⚠️ entries — pulled from [references/install-commands.md](references/install-commands.md), scoped to the detected platform. +3. **Auth status** (Azure subscription + GitHub user) from Step 6, only when all tools ✅. +4. **Final verdict** — exactly one of: + - `✅ READY` — all tools installed, versions OK, auth sessions active. Render the handoff chip from `## Next` so the user can click into onboarding. + - `⚠️ TOOLS MISSING` — list what to install. Do not continue. + - `⚠️ REPORTED MISSING` — this terminal finds the tool but the user reported it missing. Print install / PATH repair + verification block. + - `⚠️ AUTH MISSING` — tools OK but `az login` and/or `gh auth login` required. 
-# jq -sudo dnf install -y jq -``` +## Error Handling -**Windows (PowerShell with winget):** -```powershell -winget install Microsoft.AzureCLI -winget install GitHub.cli -winget install jqlang.jq -``` +| Error | Cause | Fix | +|-------|-------|-----| +| `az --version` hangs | Stale telemetry / extension cache | `az config set core.collect_telemetry=false`; reinstall if persistent | +| `gh auth status` says "not logged into any hosts" | No GitHub session | `gh auth login --web` | +| `az account show` returns `Please run 'az login'` | Expired or missing session | `az login` (use `--use-device-code` in headless shells) | +| User reports missing tool but this terminal finds it | Different shell / PATH / container / machine | Treat as ⚠️ REPORTED MISSING — print install + PATH repair, do not contradict | +| `jq --version` starts with `1.5` | Below minimum (1.6) | Upgrade via platform package manager | +| `check-tools.sh: Permission denied` | Script not executable | `chmod +x .github/skills/prereq-check/scripts/check-tools.sh` | +| `check-tools.ps1 cannot be loaded because running scripts is disabled` | PowerShell execution policy | Run via `pwsh -File scripts/check-tools.ps1` (bypasses script-block policy), or `Set-ExecutionPolicy -Scope Process RemoteSigned` | +| `pwsh: command not found` on Windows | PowerShell 7+ not installed | `winget install Microsoft.PowerShell` — Windows PowerShell 5.1 also works but ship `pwsh` for parity | -> **Windows note:** Git-Ape skills require a BASH shell. Install [Git for Windows](https://gitforwindows.org/) and use git-bash. +## Constraints -### Step 5: Check Auth Sessions +**Always:** -Only run this step if all tools passed Step 3. 
+- Print install commands; let the user run them +- Detect platform before printing recipes +- Honor user-reported missing tools even when this terminal finds them +- Stop at the first blocking failure +- Verify with `command -v <tool>` + `<tool> --version` after suggested fixes -```bash -# Azure CLI session -az account show --query "{name:name,id:id,tenantId:tenantId}" -o table 2>/dev/null -if [[ $? -ne 0 ]]; then - echo "❌ Not logged in to Azure. Run: az login" -fi - -# GitHub CLI session -gh auth status 2>/dev/null -if [[ $? -ne 0 ]]; then - echo "❌ Not logged in to GitHub. Run: gh auth login" -fi -``` +**Never:** + +- Run `brew install`, `apt-get install`, `winget install`, or any state-changing command +- Require git-bash on Windows — use the PowerShell script (`scripts/check-tools.ps1`) instead +- Auto-invoke `git-ape-onboarding` after a `READY` verdict +- Silently drop a reported-missing tool because this terminal finds it +- Continue to auth checks while any tool is ❌ +- Recommend `sudo` on macOS (Homebrew handles non-root install) -### Step 6: Summary +## Next -Present a final verdict: +After a `✅ READY` verdict, render this line verbatim so the chat surface turns it into a clickable handoff: -- **✅ READY** — All tools installed, versions OK, auth sessions active. Proceed with any Git-Ape skill. -- **⚠️ TOOLS MISSING** — List what to install. Do not proceed until resolved. -- **⚠️ AUTH MISSING** — Tools OK but user needs to run `az login` and/or `gh auth login`. +> Next: **@Git-Ape Onboarding** — or run `/git-ape-onboarding` to start setup. -## Agent Behavior +VS Code Copilot Chat renders `@AgentName` mentions and `/skill-name` slash commands as clickable chips — the user clicks once to dispatch. Do not auto-invoke (Rule 4). -1. Run Steps 1–5 by executing the commands in the terminal. -2. Present the results table and install commands (if needed). -3. Do NOT install anything automatically — show the commands and let the user run them. -4. 
If everything passes, tell the user they're ready and suggest next steps (e.g., `/git-ape-onboarding`). +For deployment-time validation of an Azure project, use `azure-validate` instead. diff --git a/.github/skills/prereq-check/references/install-commands.md b/.github/skills/prereq-check/references/install-commands.md new file mode 100644 index 0000000..497d811 --- /dev/null +++ b/.github/skills/prereq-check/references/install-commands.md @@ -0,0 +1,89 @@ +# Install & PATH-Repair Commands + +Platform-specific install recipes for the tools `prereq-check` validates: `az` (≥ 2.50), `gh` (≥ 2.0), `jq` (≥ 1.6), `git`. + +Show only the commands matching the platform detected by Step 1 of `prereq-check`. For tools the user reported missing but this terminal can find, frame these as **reinstall / PATH repair** rather than contradicting the user. + +## macOS (Homebrew) + +```bash +brew install azure-cli gh jq git +``` + +## Ubuntu / Debian + +```bash +# az +curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# gh (full keyring setup) +(type -p wget >/dev/null || sudo apt-get install wget -y) \ + && sudo mkdir -p -m 755 /etc/apt/keyrings \ + && out=$(mktemp) && wget -nv -O"$out" https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + && cat "$out" | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ + && sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && sudo apt-get update && sudo apt-get install gh -y + +# jq +sudo apt-get install -y jq +``` + +## RHEL / Fedora + +```bash +sudo rpm --import https://packages.microsoft.com/keys/microsoft.asc +sudo dnf install -y azure-cli gh jq +``` + +## Windows (PowerShell with winget) + +```powershell +winget install Microsoft.AzureCLI +winget install GitHub.cli 
+winget install jqlang.jq +winget install Git.Git +winget install Microsoft.PowerShell # PowerShell 7+ (pwsh) — required by check-tools.ps1 +``` + +> Run `prereq-check` on Windows with `pwsh -File scripts/check-tools.ps1`. Git-Ape no longer requires git-bash; PowerShell 7+ is the supported Windows shell. + +## Verification (macOS / Linux — bash) + +```bash +command -v az && az --version +command -v gh && gh --version +command -v jq && jq --version +command -v git && git --version +``` + +## Verification (Windows — PowerShell) + +```powershell +foreach ($t in 'az','gh','jq','git') { + $c = Get-Command $t -ErrorAction SilentlyContinue + if ($c) { Write-Output "$t -> $($c.Source)"; & $t --version } else { Write-Output "$t MISSING" } +} +``` + +## PATH repair + +If a binary is installed but still not found in the user's shell: + +1. Close and reopen the terminal. +2. Reload the shell profile: `source ~/.bashrc` or `source ~/.zshrc` (or shell equivalent). +3. Re-run the verification block above. +4. If still missing, check that the install location is on `$PATH`: + + ```bash + echo "$PATH" | tr ':' '\n' + which -a az gh jq git 2>/dev/null + ``` + + Common install paths to add if missing: + - macOS Homebrew (Apple Silicon): `/opt/homebrew/bin` + - macOS Homebrew (Intel): `/usr/local/bin` + - Linux user install: `~/.local/bin` + - Windows winget shims: `%LOCALAPPDATA%\Microsoft\WinGet\Packages\` and `%ProgramFiles%\` (e.g., `C:\Program Files\Microsoft SDKs\Azure\CLI2\wbin`) + + On Windows, refresh `$env:Path` in the current session after install: `$env:Path = [System.Environment]::GetEnvironmentVariable('Path','Machine') + ';' + [System.Environment]::GetEnvironmentVariable('Path','User')`. 
diff --git a/.github/skills/prereq-check/scripts/check-tools.ps1 b/.github/skills/prereq-check/scripts/check-tools.ps1 new file mode 100644 index 0000000..4afc3a2 --- /dev/null +++ b/.github/skills/prereq-check/scripts/check-tools.ps1 @@ -0,0 +1,73 @@ +#!/usr/bin/env pwsh +# prereq-check: detect installed CLI tool versions for Git-Ape skills. +# Read-only. Emits one TSV row per tool: \t\t\t +# where is one of: OK | OUTDATED | MISSING. +# Mirrors scripts/check-tools.sh — keep the TSV contract in sync. + +$ErrorActionPreference = 'Continue' + +$min = @{ + az = '2.50' + gh = '2.0' + jq = '1.6' + git = '0' +} + +function Test-VersionAtLeast { + param([string]$Found, [string]$Minimum) + if ($Minimum -eq '0') { return $true } + try { + $f = ($Found -replace '[^0-9.].*$', '') + $m = ($Minimum -replace '[^0-9.].*$', '') + if ($f -notmatch '\.') { $f = "$f.0" } + if ($m -notmatch '\.') { $m = "$m.0" } + return [version]$f -ge [version]$m + } catch { + return $false + } +} + +function Emit-Row { + param([string]$Tool, [string]$Status, [string]$Found) + "{0}`t{1}`t{2}`t{3}" -f $Tool, $Status, $Found, $min[$Tool] +} + +function Check-Tool { + param([string]$Tool, [scriptblock]$Extract) + if (-not (Get-Command $Tool -ErrorAction SilentlyContinue)) { + Emit-Row $Tool 'MISSING' '-' + return + } + $found = '' + try { $found = (& $Extract) } catch { $found = '' } + if (-not $found) { $found = 'unknown' } + if ($min[$Tool] -ne '0' -and -not (Test-VersionAtLeast $found $min[$Tool])) { + Emit-Row $Tool 'OUTDATED' $found + } else { + Emit-Row $Tool 'OK' $found + } +} + +$os = if ($IsWindows) { 'Windows' } elseif ($IsMacOS) { 'Darwin' } elseif ($IsLinux) { 'Linux' } else { 'Windows' } +$arch = if ($env:PROCESSOR_ARCHITECTURE) { + $env:PROCESSOR_ARCHITECTURE +} else { + try { (uname -m 2>$null) } catch { '?' 
} +} +"Platform: $os / $arch" + +Check-Tool 'az' { + az version --query '"azure-cli"' -o tsv +} +Check-Tool 'gh' { + $line = (gh --version 2>$null | Select-Object -First 1) + if ($line -match '\d+\.\d+\.\d+') { $matches[0] } else { '' } +} +Check-Tool 'jq' { + $v = (jq --version 2>$null) + if ($v -match '\d+\.\d+[a-z]*') { $matches[0] } else { '' } +} +Check-Tool 'git' { + $v = (git --version 2>$null) + if ($v -match '\d+\.\d+\.\d+') { $matches[0] } else { '' } +} diff --git a/.github/skills/prereq-check/scripts/check-tools.sh b/.github/skills/prereq-check/scripts/check-tools.sh new file mode 100755 index 0000000..8fbb4c2 --- /dev/null +++ b/.github/skills/prereq-check/scripts/check-tools.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# prereq-check: detect installed CLI tool versions for Git-Ape skills. +# Read-only. Emits one TSV row per tool with columns: tool, status, found, minimum +# where status is one of: OK | OUTDATED | MISSING. + +set -u + +# Minimum versions to compare against ("0" means any version is accepted). +# A case lookup instead of `declare -A` keeps the script working on the +# bash 3.2 that macOS ships, which has no associative arrays. +min_for() { + case "$1" in + az) echo "2.50" ;; + gh) echo "2.0" ;; + jq) echo "1.6" ;; + git) echo "0" ;; + *) echo "0" ;; + esac +} + +vercmp() { + # Returns 0 (true) if $1 >= $2 using version sort. + [[ "$(printf '%s\n%s\n' "$2" "$1" | sort -V | head -1)" == "$2" ]] +} + +emit() { + printf '%s\t%s\t%s\t%s\n' "$1" "$2" "$3" "$(min_for "$1")" +} + +check() { + local tool="$1" + local extract="$2" + local min + min="$(min_for "$tool")" + if ! command -v "$tool" &>/dev/null; then + emit "$tool" MISSING "-" + return + fi + local found + found="$(eval "$extract" 2>/dev/null)" + found="${found:-unknown}" + if [[ "$min" != "0" ]] && ! 
vercmp "$found" "$min"; then + emit "$tool" OUTDATED "$found" + else + emit "$tool" OK "$found" + fi +} + +echo "Platform: $(uname -s) / $(uname -m)" +check az "az version --query '\"azure-cli\"' -o tsv" +check gh "gh --version | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+'" +check jq "jq --version | grep -oE '[0-9]+\.[0-9]+[a-z]*'" +check git "git --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+'" diff --git a/.github/templates/AGENT.template.md b/.github/templates/AGENT.template.md new file mode 100644 index 0000000..40f1639 --- /dev/null +++ b/.github/templates/AGENT.template.md @@ -0,0 +1,97 @@ +--- +title: "Agent scaffold template" +description: "Copy this file to .github/agents/.agent.md and replace every marker. Read the authoring framework spec at https://azure.github.io/git-ape/docs/authoring/framework first." +--- + + marker. + 4. Agents are THIN. Domain knowledge belongs in skills. If you find yourself + writing how-to detail, stop and extract it into a skill instead. + 5. Add an eval at `.github/evals//` with at least one persona-lock + task (off-topic prompt → agent must refuse and identify as itself). + 6. Run `/agent-onboard ` to smoke-test. + + Required frontmatter for the real .agent.md: + + --- + name: + description: "One sentence. What does this agent do? Used for routing." + argumentHint: "What argument the user supplies when invoking the agent" + tools: ["execute", "read", "search", ""] + --- +--> + +# + +## Identity (non-negotiable) + +You are ****. + +You MUST begin every response with a sentence that names you as ****. If the request is off-topic, your refusal MUST still open with your own name and redirect to your specialty (). + +Never describe yourself as a "software engineering assistant", "GitHub Copilot CLI", "general-purpose assistant", or any other persona. This agent has a single, narrow purpose and your identity is part of its contract. + +## Mission + + + +## Skills I own + + + +1. `` — +2. 
`` — + +## Workflow + + + +### Phase 1 — + +Calls: ``. + + + +### Phase 2 — + + + +### Phase 3 — + + + +## State management + + + +* **State location**: +* **Recovery**: + +## Interaction contract + + + +* **Question cadence**: +* **Headless mode hook**: +* **Confirmation gates**: + +## Non-goals + + + +If the request is one of the following, refuse and redirect: + +* +* + +## Hand-off contracts + + + +| To | Trigger | Input | Output | +|---|---|---|---| +| | | | | diff --git a/.github/templates/SKILL.template.md b/.github/templates/SKILL.template.md new file mode 100644 index 0000000..53cd7c3 --- /dev/null +++ b/.github/templates/SKILL.template.md @@ -0,0 +1,111 @@ +--- +title: "Skill scaffold template" +description: "Copy this file to .github/skills//SKILL.md and replace every marker. Read the authoring framework spec at https://azure.github.io/git-ape/docs/authoring/framework first." +--- + + marker. Delete sections you genuinely do not need + and document the omission under `## Stop conditions`. + 4. Add an eval at `.github/evals//` and register it in `manifest.yaml`. + 5. Run `/skill-onboard ` to smoke-test. + + Required frontmatter for the real SKILL.md (replace the title/description block above): + + --- + name: + description: "One-paragraph routing summary. End with USE FOR: . DO NOT USE FOR: . INVOKES: ." + --- +--> + +# + +## Purpose + + + +## When to use + + + +* +* + +## When NOT to use + + + +* + +## Procedure + + + +### 1. + + + +### 2. + + + +### 3. 
+ + + +## Authoritative sources + + + +| Source | URL | Snapshot date | +|---|---|---| +| | | | + +## Inline canonical data + + + + + +## References + + + +* [references/.md](references/.md) — + +## Tool mandates + + + +| Tool | When to call | How to cite the result | +|---|---|---| +| | | | + +## Output schema + + + +```json +{ + "": "" +} +``` + +## Anti-patterns + + + +* + +## Stop conditions + + + +* diff --git a/.github/workflows/waza-agent-evals.yml b/.github/workflows/waza-agent-evals.yml new file mode 100644 index 0000000..c42156b --- /dev/null +++ b/.github/workflows/waza-agent-evals.yml @@ -0,0 +1,1099 @@ +name: Waza agent evals + +# Advisory-mode evaluation of custom Git-Ape agents. +# Runs on PRs that touch a `.agent.md` or its eval directory. Posts a comment +# with results. Always non-blocking — eval failures never gate merges. +# +# Why a parallel workflow (vs. extending waza-evals.yml): +# - Different cost profile: agent evals are compound (agent + auto-loaded +# skills via plugin.json) and cost ~5 premium reqs each. No tier-based +# multi-model fan-out — single model (claude-sonnet-4.6) to cap quota. +# - Different artifacts: agents share `waza tokens profile` and `waza +# quality` parity with the skills workflow (each agent's `.agent.md` +# is staged as a temporary `SKILL.md` to satisfy waza's skill-walker); +# `waza check` is skipped because the agentskills.io spec it enforces +# rejects agent-specific frontmatter fields ('agents', 'argument-hint', +# 'model', 'tools', 'user-invocable') as invalid. +# - Different layout: agent evals live at `.github/evals/agents//`, +# not `.github/evals//`. The eval consumes a mirrored +# `.agent.md` next to `eval.yaml` via `skill_directories: ["."]`, +# which this workflow re-syncs from the canonical `.github/agents/` copy +# before running. +# +# Per-PR scoping: +# - Touch the workflow file → full matrix. +# - Touch `.github/agents/.agent.md` → that agent only (if an eval +# directory exists). 
+# - Touch `.github/evals/agents//...` → that agent only. +# - workflow_dispatch with no input → full matrix. +# - workflow_dispatch with `agent:` input → that agent only. +# +# Notes: +# - The canonical agent list is discovered from the filesystem +# (`.github/evals/agents//eval.yaml`) — no separate manifest. +# Drop in a new agent eval directory and this workflow picks it up +# on the next PR. +# - copilot-sdk needs a Copilot-scoped token. Default GITHUB_TOKEN does +# NOT carry that scope. We use the `COPILOT_GITHUB_TOKEN` repo secret +# (already configured for waza-evals.yml). +# - Comment posting uses the default token (only needs pull-requests: write). + +on: + pull_request: + paths: + - '.github/agents/**/*.agent.md' + - '.github/evals/agents/**' + - '.github/workflows/waza-agent-evals.yml' + workflow_dispatch: + inputs: + agent: + description: 'Single agent name to run (default: all agents with an eval directory)' + required: false + type: string + +permissions: + contents: read + pull-requests: write + +concurrency: + group: waza-agent-evals-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +# Pin waza to a known-good release. Bump deliberately after validating that +# the new version's eval behavior still matches our baselines. Never resolve +# via `latest` — the microsoft/waza repo publishes the core release and the +# sibling azd-extension release at the same commit, and GitHub's +# `releases/latest` endpoint returns whichever was published last, which has +# bitten PR #109 with a 404 on the wrong asset. +env: + WAZA_VERSION: 'v0.33.0' + +jobs: + # --------------------------------------------------------------------------- + # preflight: verify that the COPILOT_GITHUB_TOKEN secret is configured. + # When absent, every downstream job is skipped cleanly (no red checks). The + # maintainer setup steps are in PR #109 / README. 
+ # --------------------------------------------------------------------------- + preflight: + name: Preflight (check secrets) + runs-on: ubuntu-latest + timeout-minutes: 2 + outputs: + enabled: ${{ steps.check.outputs.enabled }} + steps: + - name: Check COPILOT_GITHUB_TOKEN availability + id: check + env: + TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + run: | + if [ -z "${TOKEN:-}" ]; then + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza agent eval jobs. See repo README / PR #109 for setup." + exit 0 + fi + # Token is set — verify it can actually read the private microsoft/waza + # repo (release downloads need access). Reject silently if 401/403/404. + # Capture headers + body for diagnostics (no token is ever printed). + hdr_file=$(mktemp) + body_file=$(mktemp) + http_code=$(curl -sS -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/microsoft/waza/releases/latest || true) + if [ "${http_code}" = "200" ]; then + echo "enabled=true" >> "$GITHUB_OUTPUT" + echo "COPILOT_GITHUB_TOKEN can read microsoft/waza — eval jobs will run." + else + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza agent eval jobs." 
+ echo "--- diagnostic: response headers (token not included) ---" + grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true + echo "--- diagnostic: response body (first 500 bytes) ---" + head -c 500 "${body_file}" || true + echo + echo "--- diagnostic: token-user identity probe ---" + user_code=$(curl -sS -o "${body_file}.user" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/user || true) + echo "GET /user -> HTTP ${user_code}" + if [ "${user_code}" = "200" ]; then + jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user" + else + head -c 300 "${body_file}.user" || true + fi + echo + fi + rm -f "${hdr_file}" "${body_file}" "${body_file}.user" + + # --------------------------------------------------------------------------- + # prepare: discover all configured agents from the filesystem, then narrow + # to the subset affected by this PR (or run all on workflow_dispatch / a + # workflow-file change). 
Outputs: + # - agents: JSON array of selected agent names (drives comment ordering) + # - legs: JSON array of { agent } entries for matrix.include + # - mode/reason: human-readable scope info for the PR comment banner + # --------------------------------------------------------------------------- + prepare: + name: Determine matrix + runs-on: ubuntu-latest + timeout-minutes: 5 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + outputs: + agents: ${{ steps.select.outputs.agents }} + legs: ${{ steps.select.outputs.legs }} + reason: ${{ steps.select.outputs.reason }} + mode: ${{ steps.select.outputs.mode }} + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Select agents + id: select + env: + REQUESTED: ${{ inputs.agent }} + EVENT: ${{ github.event_name }} + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + set -euo pipefail + + # Canonical agent list: every directory under .github/evals/agents/ + # that contains an eval.yaml. Filesystem is the source of truth. + # The directory may not exist yet (no agent suites ported) — treat as empty. + if [ -d .github/evals/agents ]; then + ALL_AGENTS="$( + find .github/evals/agents -mindepth 2 -maxdepth 2 -name eval.yaml \ + | awk -F/ '{print $4}' \ + | sort -u \ + | jq -R -s -c 'split("\n") | map(select(length > 0))' + )" + else + ALL_AGENTS="[]" + fi + echo "ALL_AGENTS=$ALL_AGENTS" + + # emit + emit() { + local selected="$1" mode="$2" reason="$3" + local legs + legs="$(echo "$selected" | jq -c '[ .[] | { agent: . 
} ]')" + { + echo "agents=${selected}" + echo "legs=${legs}" + echo "mode=${mode}" + echo "reason=${reason}" + } >> "$GITHUB_OUTPUT" + echo "Selected agents: ${selected}" + echo "Legs: ${legs}" + echo "Mode: ${mode}" + echo "Reason: ${reason}" + } + + # --- Case 1: workflow_dispatch with single-agent input --- + if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then + if echo "$ALL_AGENTS" | jq -e --arg a "$REQUESTED" '. | index($a)' > /dev/null; then + emit "[\"$REQUESTED\"]" "single" "workflow_dispatch input ($REQUESTED)" + exit 0 + else + echo "::error::Requested agent '$REQUESTED' has no eval directory under .github/evals/agents/ (available: $ALL_AGENTS)" + exit 1 + fi + fi + + # --- Case 2: workflow_dispatch without input → full matrix --- + if [ "$EVENT" = "workflow_dispatch" ]; then + emit "$ALL_AGENTS" "full" "workflow_dispatch (no input → full matrix)" + exit 0 + fi + + # --- Case 3: pull_request — diff against base --- + if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then + emit "$ALL_AGENTS" "full" "pull_request: missing base/head SHA → full matrix" + exit 0 + fi + + git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true + + changed=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" || true) + if [ -z "$changed" ]; then + emit "[]" "none" "no files changed in PR" + exit 0 + fi + + echo "--- changed files ---" + echo "$changed" + echo "---------------------" + + # Workflow-file changes → full matrix (semantics of this workflow itself changed). + if echo "$changed" | grep -qE '^\.github/workflows/waza-agent-evals\.yml$'; then + emit "$ALL_AGENTS" "full" "workflow file changed → full matrix" + exit 0 + fi + + # Per-agent changes from both possible paths: + # .github/agents/.agent.md + # .github/evals/agents//... 
+ # shellcheck disable=SC2016 + changed_agents=$( + echo "$changed" | awk -F/ ' + /^\.github\/agents\/.+\.agent\.md$/ { + fname=$3 + sub(/\.agent\.md$/, "", fname) + print fname + } + /^\.github\/evals\/agents\// && NF >= 5 {print $4} + ' | sort -u + ) + + if [ -z "$changed_agents" ]; then + emit "[]" "none" "no per-agent files changed" + exit 0 + fi + + # Intersect with the configured (filesystem) list. + selected=$( + printf '%s\n' "$changed_agents" \ + | jq -R -s -c --argjson all "$ALL_AGENTS" \ + '[ split("\n")[] | select(length > 0) | select(IN($all[])) ]' + ) + + if [ "$selected" = "[]" ]; then + emit "[]" "none" "changed agent(s) have no eval directory: $(echo "$changed_agents" | tr '\n' ' ')" + exit 0 + fi + + count=$(echo "$selected" | jq 'length') + names=$(echo "$selected" | jq -r 'join(", ")') + emit "$selected" "subset" "diff-scoped: ${count} changed agent(s) — ${names}" + + # --------------------------------------------------------------------------- + # tokens: token comparison vs main for `.agent.md` files. Runs once (not + # per-matrix) and uploads a single JSON artifact consumed by the comment + # job. `waza tokens compare` is local computation only — no LLM, no quota + # cost. Advisory — never fails the workflow. + # --------------------------------------------------------------------------- + tokens: + name: Agent file token comparison vs main (advisory) + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + continue-on-error: true + env: + # Only used for the release-API lookup (public-repo read). Keeps the + # secret list consistent across all jobs in this workflow. 
+      GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+      WAZA_NO_UPDATE_CHECK: '1'
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Install waza (pinned release)
+        run: |
+          set -euo pipefail
+          waza_version="${WAZA_VERSION}"
+          if [ -z "${waza_version}" ]; then
+            echo "::error::WAZA_VERSION env var is not set"
+            exit 1
+          fi
+          # Map `uname` output onto the release asset naming scheme.
+          os="$(uname -s | tr '[:upper:]' '[:lower:]')"
+          arch="$(uname -m)"
+          case "${arch}" in
+            x86_64|amd64) arch=amd64 ;;
+            aarch64|arm64) arch=arm64 ;;
+          esac
+          asset="waza-${os}-${arch}"
+          base="https://github.com/microsoft/waza/releases/download/${waza_version}"
+          tmp="$(mktemp -d)"
+          curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}"
+          curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt"
+          # Verify the download against the published checksum before installing.
+          ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - )
+          sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza
+          rm -rf "${tmp}"
+          waza --version
+
+      - name: Compare .agent.md token counts vs origin/main
+        # Advisory step — never gate the workflow on filter quirks. Disable
+        # `-e` (GitHub injects `bash -e {0}`) so a single jq failure can't
+        # kill the step before the recovery branches run.
+        shell: bash {0}
+        run: |
+          set -uo pipefail
+          mkdir -p .waza-results
+          # `waza tokens compare` without --skills walks every .md file
+          # in the repo. We post-filter to .github/agents/*.agent.md
+          # entries only. --threshold 0 keeps the exit code clean
+          # (advisory, never gates).
+          #
+          # stderr goes to its own log: merging it into the JSON file
+          # (`2>&1`) would make the payload unparseable as soon as waza
+          # prints a single warning, and the token table would silently
+          # vanish from the PR comment.
+          waza tokens compare origin/main --threshold 0 --format json \
+            > .waza-results/tokens-compare-raw.json \
+            2> .waza-results/tokens-compare.stderr.log || true
+          # Keep the diagnostics visible in the step log.
+          cat .waza-results/tokens-compare.stderr.log >&2 || true
+
+          # Filter to .agent.md files in the agents directory. Tolerate
+          # multiple top-level schemas across waza versions — if the JSON
+          # has a top-level `files` array, filter that; otherwise pass
+          # the raw payload through and let the comment script decide.
+          # `.path // ""` makes the regex test null-safe (some waza
+          # versions emit summary/totals entries with a null path).
+          if jq -e 'type == "object" and has("files")' \
+               .waza-results/tokens-compare-raw.json > /dev/null 2>&1; then
+            jq '{
+                  base: .base,
+                  head: .head,
+                  files: [ .files[]
+                           | select((.path // "") | test("^\\.github/agents/.+\\.agent\\.md$")) ]
+                }' .waza-results/tokens-compare-raw.json \
+              > .waza-results/tokens-compare.json \
+              || cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json
+          else
+            cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json
+          fi
+
+          echo "--- filtered agent token comparison ---"
+          cat .waza-results/tokens-compare.json || true
+          exit 0
+
+      - name: Upload token comparison artifact
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: waza-agent-tokens-compare
+          path: .waza-results/tokens-compare.json
+          retention-days: 14
+          if-no-files-found: warn
+          include-hidden-files: true
+
+  # ---------------------------------------------------------------------------
+  # eval: matrix (agent). Each leg runs `waza run` on the agent's compound
+  # eval and produces a markdown snippet for the PR comment. Single-model
+  # (claude-sonnet-4.6) to cap quota cost — each leg averages ~5 premium reqs.
+  # ---------------------------------------------------------------------------
+  eval:
+    name: "${{ matrix.agent || 'eval (skipped — no agent changes)' }}"
+    needs: [preflight, prepare]
+    if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != ''
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      # Throttle concurrent SDK sessions to keep us under the Copilot models
+      # API rate-limit ceiling. Without this cap, bursting 8 agent legs in
+      # parallel reliably trips `Failed to list models: 429` on a subset of
+      # legs — they fail in <2s without consuming any premium requests and
+      # surface as fake low scores. 3 concurrent SDK sessions has empirically
+      # stayed under the limit; raise cautiously.
+      max-parallel: 3
+      matrix:
+        include: ${{ fromJSON(needs.prepare.outputs.legs) }}
+    env:
+      # copilot-sdk authenticates with this token. Default GITHUB_TOKEN does
+      # not carry Copilot scope, so we use a dedicated PAT in repo secrets.
+      GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+      WAZA_NO_UPDATE_CHECK: '1'
+      # The matrix value reaches the scripts through the environment instead
+      # of being expanded into the script text, so an unusual directory name
+      # can never be parsed as shell syntax.
+      AGENT: ${{ matrix.agent }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Install waza (pinned release)
+        run: |
+          set -euo pipefail
+          waza_version="${WAZA_VERSION}"
+          if [ -z "${waza_version}" ]; then
+            echo "::error::WAZA_VERSION env var is not set"
+            exit 1
+          fi
+          echo "Installing waza ${waza_version}"
+
+          os="$(uname -s | tr '[:upper:]' '[:lower:]')"
+          arch="$(uname -m)"
+          case "${arch}" in
+            x86_64|amd64) arch=amd64 ;;
+            aarch64|arm64) arch=arm64 ;;
+          esac
+          asset="waza-${os}-${arch}"
+          base="https://github.com/microsoft/waza/releases/download/${waza_version}"
+          tmp="$(mktemp -d)"
+
+          curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}"
+          curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt"
+          ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - )
+          sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza
+          rm -rf "${tmp}"
+          waza --version
+
+      - name: Sync mirrored .agent.md from canonical .github/agents/
+        # The eval's `skill_directories: ["."]` loads the sibling .agent.md
+        # mirror; the canonical source lives in .github/agents/. Copy on
+        # every run so the eval always reflects the canonical agent file
+        # under test, without requiring contributors to keep them in sync
+        # by hand.
+        run: |
+          set -euo pipefail
+          agent="${AGENT}"
+          src=".github/agents/${agent}.agent.md"
+          dst=".github/evals/agents/${agent}/${agent}.agent.md"
+          if [ -f "$src" ] && [ -d ".github/evals/agents/${agent}" ]; then
+            cp "$src" "$dst"
+            echo "Synced ${src} -> ${dst}"
+          else
+            echo "::warning::Missing canonical agent file or eval dir for ${agent}: src=${src}, dst-dir=.github/evals/agents/${agent}"
+          fi
+
+      - name: Run waza eval (advisory)
+        id: run
+        run: |
+          # GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT
+          # disable -e, so a non-zero exit from `waza run` (e.g. metric below
+          # threshold) kills the script before `rc=$?` runs. Explicitly
+          # disable errexit so we can capture the code and surface it in the
+          # PR comment instead of failing the leg silently.
+          set +e
+          set -uo pipefail
+          mkdir -p .waza-results
+          agent="${AGENT}"
+          spec=".github/evals/agents/${agent}/eval.yaml"
+
+          # ---- Retry-on-infra-failure wrapper -------------------------------
+          # Three infra-failure classes can corrupt a leg WITHOUT being model
+          # quality signal — see the same pattern in waza-evals.yml and
+          # waza-trends.yml. Detect ALL three classes per attempt, retry on
+          # any of them (with longer backoff on quota), and INFRA_FAILED
+          # the leg if retries exhaust so we don't blend fake low scores
+          # into the PR comment:
+          #   1. `Session not found` (JSON-RPC -32603): the Copilot SDK
+          #      dropped the session before waza's `prompt` grader could
+          #      resume it (continue_session: true). Validations get wiped
+          #      to null on affected tasks, dragging the leg aggregate down.
+          #   2. `failed to run grader`: the judge LLM backend itself
+          #      crashed during a grader call. Status=error, empty
+          #      validations, fake low score.
+          #   3. `Failed to list models: 429`: Copilot models API rate-limit
+          #      hit BEFORE the agent could start. Worst case: all tasks
+          #      return status=error in <2s with deterministic 0-ish scores.
+          #
+          # All three are transient. We retry up to 2 times (3 total
+          # attempts). On exhaustion, we delete the corrupt JSON and write
+          # an INFRA_FAILED sidecar + markdown notice; the aggregator's
+          # fallback path (no JSON → use rawMd) will surface that notice
+          # instead of polluting the score table.
+          #
+          # --judge-model is decoupled from the executor model so quality
+          # scores are always judged by claude-sonnet-4.6 even if we ever
+          # add per-agent model overrides.
+
+          # Print "<session> <grader> <quota>" — the number of run-level
+          # error messages in result file $1 that match each infra-failure
+          # class. Prints "0 0 0" when the file cannot be parsed.
+          count_infra_errors() {
+            jq -r '
+              [.tasks[]?.runs[]? | (.error_msg // "")] as $errs
+              | { session: ([$errs[] | select(contains("Session not found"))] | length),
+                  grader:  ([$errs[] | select(contains("failed to run grader"))] | length),
+                  quota:   ([$errs[] | select(contains("Failed to list models: 429"))] | length) }
+              | "\(.session) \(.grader) \(.quota)"
+            ' "$1" 2>/dev/null || echo "0 0 0"
+          }
+
+          max_attempts=3
+          attempt=0
+          rc=0
+          while [ $attempt -lt $max_attempts ]; do
+            attempt=$((attempt + 1))
+            echo "::group::waza run attempt ${attempt}/${max_attempts} for ${agent}"
+            rc=0
+            waza run "${spec}" \
+              --model "claude-sonnet-4.6" \
+              --judge-model "claude-sonnet-4.6" \
+              --suggest \
+              --recommend \
+              --format "github-comment" \
+              --output ".waza-results/${agent}.json" \
+              --reporter "junit:.waza-results/${agent}.junit.xml" \
+              > ".waza-results/${agent}.md"
+            rc=$?
+            echo "::endgroup::"
+
+            if [ ! -f ".waza-results/${agent}.json" ]; then
+              echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})"
+              if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi
+              break
+            fi
+
+            # Count each infra-failure class in this attempt's artifact.
+            read -r session_errs grader_errs quota_errs \
+              <<< "$(count_infra_errors ".waza-results/${agent}.json")"
+            session_errs="${session_errs:-0}"
+            grader_errs="${grader_errs:-0}"
+            quota_errs="${quota_errs:-0}"
+            total_infra=$((session_errs + grader_errs + quota_errs))
+
+            if [ "${total_infra}" = "0" ]; then
+              echo "::notice::${agent} attempt ${attempt} clean (no infra-failure errors)"
+              break
+            fi
+
+            echo "::warning::${agent} attempt ${attempt} hit ${session_errs} session-not-found + ${grader_errs} grader-infra + ${quota_errs} quota-429 error(s)"
+            if [ $attempt -lt $max_attempts ]; then
+              # Discard partial artifacts so the next attempt is independent.
+              rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.md" ".waza-results/${agent}.junit.xml"
+              # Quota errors need longer backoff than session/grader to let
+              # the Copilot models API window reset.
+              if [ "${quota_errs}" != "0" ]; then sleep 30; else sleep 5; fi
+            fi
+          done
+
+          # Final classification: if any infra errors remain after all
+          # attempts, treat the leg as INFRA_FAILED and discard the corrupt
+          # JSON so it doesn't pollute the score table.
+          final_session=0
+          final_grader=0
+          final_quota=0
+          if [ -f ".waza-results/${agent}.json" ]; then
+            read -r final_session final_grader final_quota \
+              <<< "$(count_infra_errors ".waza-results/${agent}.json")"
+            final_session="${final_session:-0}"
+            final_grader="${final_grader:-0}"
+            final_quota="${final_quota:-0}"
+          fi
+          final_infra=$((final_session + final_grader + final_quota))
+
+          if [ "${final_infra}" != "0" ]; then
+            echo "::error::${agent} still has ${final_session} session-not-found + ${final_grader} grader-infra + ${final_quota} quota-429 error(s) after ${max_attempts} attempts — discarding corrupt artifact"
+            printf 'session_not_found_errors=%s\ngrader_failed_errors=%s\nquota_429_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \
+              "${final_session}" "${final_grader}" "${final_quota}" "${max_attempts}" "${rc}" \
+              > ".waza-results/${agent}.infra-failed"
+            rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.junit.xml"
+            # Replace the markdown with a clear INFRA_FAILED notice. Use
+            # printf (no heredoc) because heredoc EOF terminators clash
+            # with YAML block-scalar indentation rules in `run: |` steps.
+            {
+              printf '### `%s` — INFRA_FAILED\n\n' "${agent}"
+              printf 'waza run hit infra-level error(s) from the Copilot SDK '
+              printf 'after **%s attempt(s)**:\n\n' "${max_attempts}"
+              printf -- '- `Session not found` (JSON-RPC -32603): **%s**\n' "${final_session}"
+              printf -- '- `failed to run grader` (judge backend crash): **%s**\n' "${final_grader}"
+              printf -- '- `Failed to list models: 429` (Copilot quota): **%s**\n\n' "${final_quota}"
+              printf 'These error classes are transient infrastructure issues, '
+              printf 'not model-quality signal. **No score is reported for this leg** '
+              printf '— treating a corrupted run as a low score would be misleading. '
+              printf 'See the workflow logs and the `waza-agent-results-%s` artifact for details.\n' "${agent}"
+            } > ".waza-results/${agent}.md"
+          fi
+          # ---- end retry wrapper --------------------------------------------
+
+          echo "exit_code=${rc}" >> "$GITHUB_OUTPUT"
+          echo
+          echo "--- captured PR-comment markdown ---"
+          cat ".waza-results/${agent}.md" || true
+          # Never fail the step itself — surface the code in the comment.
+          exit 0
+
+      - name: Agent signal — tokens profile + quality (advisory)
+        # Parity with `waza-evals.yml`: surface `waza tokens profile` and
+        # `waza quality` output for `.agent.md` files. Both commands target
+        # `SKILL.md` only, so we stage a temporary copy of the agent file
+        # named `SKILL.md` in a NON-DOT directory ('.waza-results/...' or
+        # any other dotted path is silently skipped by waza's workspace
+        # walker). The stage dir is named with the agent slug so judge
+        # output ('📊 <agent>: ...') and table headers display the agent
+        # name instead of a random tmp suffix.
+        #
+        # `waza check` is intentionally skipped: it validates the
+        # agentskills.io SKILL spec, which rejects agent-specific
+        # frontmatter fields ('agents', 'argument-hint', 'model',
+        # 'tools', 'user-invocable') as invalid. Running it would
+        # surface confusing "spec failures" that aren't real agent
+        # quality signal.
+        #
+        # `waza quality` consumes ~1 premium Copilot request per leg via
+        # its LLM judge (claude-sonnet-4.6 by default). Failures are
+        # tolerated with `|| true` so a flaky judge call doesn't tank
+        # the whole leg.
+        if: always()
+        run: |
+          set -uo pipefail
+          mkdir -p .waza-results
+          agent="${AGENT}"
+          src=".github/agents/${agent}.agent.md"
+          if [ ! -f "$src" ]; then
+            echo "::warning::canonical agent file missing for ${agent}: ${src} — skipping signal steps"
+            exit 0
+          fi
+
+          # Stage as SKILL.md in a non-dotted path so waza's workspace
+          # walker (which skips hidden/dotted dirs) finds it.
+          stage_root="waza-agent-stage"
+          stage_dir="${stage_root}/${agent}"
+          rm -rf "$stage_dir"
+          mkdir -p "$stage_dir"
+          cp "$src" "${stage_dir}/SKILL.md"
+
+          echo "::group::waza tokens profile (${agent})"
+          waza tokens profile "$stage_dir" \
+            > ".waza-results/${agent}-tokens-profile.txt" 2>&1 || true
+          # Strip the temp stage_root prefix from the human-readable output
+          # so the display reads "agent-name:" instead of
+          # "waza-agent-stage/agent-name:".
+          sed -i "s|${stage_root}/||g" ".waza-results/${agent}-tokens-profile.txt" || true
+          cat ".waza-results/${agent}-tokens-profile.txt" || true
+          echo "::endgroup::"
+
+          echo "::group::waza quality (${agent}) — LLM judge, ~1 premium req"
+          waza quality "$stage_dir" --format table \
+            > ".waza-results/${agent}-quality.txt" 2>&1 || true
+          sed -i "s|${stage_root}/||g" ".waza-results/${agent}-quality.txt" || true
+          cat ".waza-results/${agent}-quality.txt" || true
+          echo "::endgroup::"
+
+          # Clean up stage so it doesn't end up in the artifact.
+          rm -rf "$stage_root"
+
+      - name: Upload eval artifacts
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: waza-agent-results-${{ matrix.agent }}
+          path: .waza-results/
+          retention-days: 14
+          if-no-files-found: warn
+          # `.waza-results/` starts with a dot, and upload-artifact treats
+          # any path segment starting with `.` as hidden by default. Without
+          # this, the artifact is silently empty.
+          include-hidden-files: true
+
+  # ---------------------------------------------------------------------------
+  # comment: fan-in. Downloads all artifacts and posts one aggregated comment.
+  # Idempotent — uses an HTML marker to update the same comment on subsequent
+  # pushes instead of stacking new ones.
+  # ---------------------------------------------------------------------------
+  comment:
+    name: Post advisory comment on PR
+    needs: [preflight, prepare, eval, tokens]
+    if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always()
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - name: Download all eval artifacts
+        uses: actions/download-artifact@v8
+        with:
+          path: artifacts
+          pattern: waza-agent-results-*
+          merge-multiple: false
+
+      - name: Download token comparison artifact
+        uses: actions/download-artifact@v8
+        with:
+          name: waza-agent-tokens-compare
+          path: artifacts/waza-agent-tokens-compare
+        continue-on-error: true
+
+      # `actions/download-artifact@v8` is documented as creating a per-artifact
+      # subdirectory under `path` when `pattern` is used with `merge-multiple:
+      # false`. In practice, when only ONE artifact matches the pattern
+      # (typical for diff-scoped PR runs with a single changed agent), v8
+      # extracts the artifact contents directly into `path` with no
+      # subdirectory — the same behavior as a single-name download. The
+      # downstream aggregator script expects the nested layout, so this step
+      # normalizes the flat case back into nested. Idempotent: a no-op when
+      # the layout is already nested (multi-agent runs).
+      - name: Normalize artifact layout (handle v8 single-match flattening)
+        shell: bash
+        run: |
+          set -euo pipefail
+          if [ ! -d artifacts ]; then
+            echo "artifacts/ does not exist — nothing to normalize"
+            exit 0
+          fi
+
+          echo "--- artifact layout BEFORE normalization ---"
+          find artifacts -maxdepth 3 -mindepth 1 | sort
+
+          shopt -s nullglob
+          cd artifacts
+          for f in *.json *.md *.junit.xml *-quality.txt *-tokens-profile.txt *.infra-failed; do
+            [ -f "$f" ] || continue
+            # Derive the agent slug by stripping whichever known suffix matches.
+            agent="$f"
+            for suf in -quality.txt -tokens-profile.txt .junit.xml .json .md .infra-failed; do
+              agent="${agent%"$suf"}"
+            done
+            if [ -z "$agent" ] || [ "$agent" = "$f" ]; then
+              echo "::warning::Could not derive agent slug from filename '$f' — leaving in place"
+              continue
+            fi
+            mkdir -p "waza-agent-results-${agent}"
+            mv -- "$f" "waza-agent-results-${agent}/"
+            echo "  moved: $f -> waza-agent-results-${agent}/"
+          done
+          cd -
+
+          echo "--- artifact layout AFTER normalization ---"
+          find artifacts -maxdepth 3 -mindepth 1 | sort
+
+      - name: Aggregate and post comment
+        uses: actions/github-script@v9
+        env:
+          PREPARE_MODE: ${{ needs.prepare.outputs.mode }}
+          PREPARE_REASON: ${{ needs.prepare.outputs.reason }}
+          PREPARE_AGENTS: ${{ needs.prepare.outputs.agents }}
+        with:
+          # Default GITHUB_TOKEN — has `pull-requests: write` and is the
+          # right identity for bot-style comments.
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Hidden HTML marker that identifies this workflow's comment so
+            // later pushes update it in place. It must be non-empty:
+            // `includes('')` is true for every comment body.
+            // NOTE(review): marker text is a reconstructed value — confirm it
+            // matches whatever already-posted comments carry, if any.
+            const MARKER = '<!-- waza-agent-evals -->';
+
+            const agents = JSON.parse(process.env.PREPARE_AGENTS || '[]');
+            const root = 'artifacts';
+            const allDirs = fs.existsSync(root)
+              ? fs.readdirSync(root)
+                  .filter((d) => d.startsWith('waza-agent-results-'))
+                  .sort()
+              : [];
+
+            // ---------------- helpers ----------------
+            // Returns the file's text, or null when missing, empty or unreadable.
+            function readFileOrNull(filePath) {
+              try {
+                if (!fs.existsSync(filePath)) return null;
+                const c = fs.readFileSync(filePath, 'utf8');
+                return c.length > 0 ? c : null;
+              } catch (e) {
+                core.debug(`readFileOrNull: ${filePath} -> ${e.message}`);
+                return null;
+              }
+            }
+
+            // Returns the parsed JSON, or null when missing or malformed.
+            function readJsonOrNull(filePath) {
+              const raw = readFileOrNull(filePath);
+              if (!raw) return null;
+              try {
+                return JSON.parse(raw);
+              } catch (e) {
+                core.debug(`readJsonOrNull: parse failed for ${filePath}: ${e.message}`);
+                return null;
+              }
+            }
+
+            // Formats a millisecond duration as `Nms`, `N.Ns` or `NmNs`.
+            function fmtMs(ms) {
+              if (typeof ms !== 'number' || !isFinite(ms)) return '—';
+              if (ms < 1000) return `${ms}ms`;
+              const s = ms / 1000;
+              // 59.95s and above would print as "60.0s"; hand it to the minutes branch.
+              if (s < 59.95) return `${s.toFixed(1)}s`;
+              // Round to whole seconds BEFORE splitting into minutes/seconds,
+              // otherwise 119.6s renders as "1m60s" instead of "2m0s".
+              const whole = Math.round(s);
+              return `${Math.floor(whole / 60)}m${whole % 60}s`;
+            }
+
+            // Formats a token count as a plain number, `N.NK` or `N.NNM`.
+            function fmtTokens(n) {
+              if (typeof n !== 'number' || !isFinite(n) || n === 0) return '—';
+              if (n < 1000) return String(n);
+              // Values that would round up to "1000.0K" are shown in M instead.
+              if (n < 999_950) return `${(n / 1000).toFixed(1)}K`;
+              return `${(n / 1_000_000).toFixed(2)}M`;
+            }
+
+            function scoreEmoji(score, succeeded, total) {
+              if (typeof score !== 'number') return '⚠️';
+              if (succeeded === total && total > 0) return '✅';
+              if (succeeded > 0) return '⚠️';
+              return '❌';
+            }
+
+            // Keeps the first `maxLines` lines and notes how many were dropped.
+            function truncateText(text, maxLines) {
+              if (!text) return '';
+              const lines = text.split('\n');
+              if (lines.length <= maxLines) return text;
+              return lines.slice(0, maxLines).join('\n')
+                + `\n… (${lines.length - maxLines} more lines truncated)`;
+            }
+
+            // ---------------- top-level: agent token compare ----------------
+            const tcPath = path.join(root, 'waza-agent-tokens-compare', 'tokens-compare.json');
+            const tc = readJsonOrNull(tcPath);
+            let tokenCompareSection = '';
+            if (tc) {
+              const files = Array.isArray(tc.files) ? tc.files : [];
+              if (files.length > 0) {
+                // Field names differ across waza versions; take the first one present.
+                const rows = files
+                  .map((f) => {
+                    const before = (f.base_tokens != null) ? f.base_tokens
+                      : (f.before != null) ? f.before : '—';
+                    const after = (f.head_tokens != null) ? f.head_tokens
+                      : (f.after != null) ? f.after
+                      : (f.tokens != null) ? f.tokens : '—';
+                    const delta = (f.delta != null) ? f.delta
+                      : (typeof before === 'number' && typeof after === 'number')
+                        ? (after - before) : '—';
+                    // Only call toFixed on real numbers; a string here would throw.
+                    const pct = (typeof f.percent_change === 'number') ? `${f.percent_change.toFixed(1)}%`
+                      : (typeof before === 'number' && before > 0 && typeof delta === 'number')
+                        ? `${(delta * 100 / before).toFixed(1)}%`
+                        : '—';
+                    const sign = (typeof delta === 'number' && delta > 0) ? '+' : '';
+                    return `| \`${f.path}\` | ${before} | ${after} | ${sign}${delta} | ${pct} |`;
+                  })
+                  .join('\n');
+                tokenCompareSection = [
+                  '<details><summary>📊 Agent file token comparison vs main (advisory)</summary>',
+                  '',
+                  `| File | Base | Head | Δ | % |`,
+                  `|---|---|---|---|---|`,
+                  rows,
+                  '',
+                  '</details>',
+                ].join('\n');
+              } else {
+                tokenCompareSection = [
+                  '<details><summary>📊 Agent file token comparison vs main (advisory)</summary>',
+                  '',
+                  '_No `.agent.md` files changed vs `main` (or token-compare returned no entries)._',
+                  '',
+                  '</details>',
+                ].join('\n');
+              }
+            }
+
+            // ---------------- per-agent sections ----------------
+            const byAgent = new Map();
+            for (const d of allDirs) {
+              const agent = d.replace(/^waza-agent-results-/, '');
+              byAgent.set(agent, d);
+            }
+
+            const sections = [];
+            for (const agent of agents) {
+              const dir = byAgent.get(agent);
+              if (!dir) {
+                sections.push([
+                  `### Agent: \`${agent}\``,
+                  '',
+                  '_No artifact produced. See workflow logs._',
+                ].join('\n'));
+                continue;
+              }
+
+              const jsonPath = path.join(root, dir, `${agent}.json`);
+              const rawMd = readFileOrNull(path.join(root, dir, `${agent}.md`));
+              const json = readJsonOrNull(jsonPath);
+
+              if (!json) {
+                // Fall back to raw github-comment markdown if JSON is unavailable.
+                sections.push([
+                  `### Agent: \`${agent}\``,
+                  '',
+                  rawMd || '_No output captured. See workflow logs._',
+                ].join('\n'));
+                continue;
+              }
+
+              const summary = json.summary || {};
+              const usage = summary.usage || {};
+              const total = summary.total_tests || 0;
+              const ok = summary.succeeded || 0;
+              const failed = summary.failed || 0;
+              const score = (typeof summary.aggregate_score === 'number')
+                ? summary.aggregate_score.toFixed(2) : '—';
+              const emoji = scoreEmoji(summary.aggregate_score, ok, total);
+
+              const headline =
+                `**Score:** ${emoji} ${score} (${ok}/${total} tasks) | ` +
+                `**Duration:** ${fmtMs(summary.duration_ms)} | ` +
+                `**Cost:** ${usage.premium_requests ?? '—'} premium req${usage.premium_requests === 1 ? '' : 's'}, ` +
+                `${usage.turns ?? '—'} turns | ` +
+                `**Tokens:** ${fmtTokens(usage.input_tokens)} in / ${fmtTokens(usage.output_tokens)} out` +
+                (usage.cache_read_tokens ? ` / ${fmtTokens(usage.cache_read_tokens)} cache-read` : '');
+
+              // Per-task table.
+              const tasks = Array.isArray(json.tasks) ? json.tasks : [];
+              const taskRows = tasks.map((t) => {
+                const run0 = (t.runs && t.runs[0]) || {};
+                const sd = run0.session_digest || {};
+                const taskScore = (typeof t.score === 'number') ? t.score.toFixed(2)
+                  : (typeof run0.score === 'number') ? run0.score.toFixed(2)
+                  : '—';
+                const passed = run0.status === 'passed' || run0.status === 'pass';
+                const statusEmoji = passed ? '✅' : (run0.status === 'error' ? '⚠️' : '❌');
+                const toolCalls = sd.tool_call_count ?? '—';
+                const graders = (run0.validations)
+                  ? Object.keys(run0.validations).join(', ')
+                  : '—';
+                // Escape pipes so a task name cannot break the markdown table.
+                const name = (t.display_name || t.name || t.id || '(unnamed)')
+                  .replace(/\|/g, '\\|');
+                return `| ${name} | ${taskScore} | ${statusEmoji} | ${toolCalls} | ${graders} |`;
+              }).join('\n');
+              const taskTable = tasks.length > 0
+                ? [
+                    '| Task | Score | Status | Tool calls | Graders |',
+                    '|---|---|---|---|---|',
+                    taskRows,
+                  ].join('\n')
+                : '_No task data in JSON output._';
+
+              // Per-agent signal sections (parity with skills workflow):
+              // model-independent, fed by per-leg `waza tokens profile`
+              // and `waza quality` output. Both files live alongside the
+              // eval JSON/markdown in the same artifact dir.
+              const tokensProfilePath = path.join(root, dir, `${agent}-tokens-profile.txt`);
+              const tokensProfileRaw = readFileOrNull(tokensProfilePath);
+              const tokensSection = tokensProfileRaw
+                ? [
+                    '<details><summary>🔢 Tokens (count + profile)</summary>',
+                    '',
+                    '```',
+                    tokensProfileRaw.trim(),
+                    '```',
+                    '',
+                    '</details>',
+                  ].join('\n')
+                : '';
+
+              const qualityPath = path.join(root, dir, `${agent}-quality.txt`);
+              const qualityRaw = readFileOrNull(qualityPath);
+              const qualitySection = qualityRaw
+                ? [
+                    '<details><summary>🎯 Quality (5-dim table)</summary>',
+                    '',
+                    '```',
+                    qualityRaw.trim(),
+                    '```',
+                    '',
+                    '_Scored by `waza quality` with `claude-sonnet-4.6` as LLM judge. The agent\'s `.agent.md` is staged as `SKILL.md` for analysis; treat dimensions as advisory signal (the rubric was authored for skills)._',
+                    '',
+                    '</details>',
+                  ].join('\n')
+                : '';
+
+              // Failure details (only when something failed).
+              const failureDetails = [];
+              for (const t of tasks) {
+                const run0 = (t.runs && t.runs[0]) || {};
+                if (run0.status === 'passed' || run0.status === 'pass') continue;
+                const name = t.display_name || t.name || t.id || '(unnamed)';
+                const lines = [`#### Task: ${name}`, ''];
+                const validations = run0.validations || {};
+                for (const [gname, v] of Object.entries(validations)) {
+                  // Infra failures can leave a grader entry null; skip those too.
+                  if (!v || v.passed) continue;
+                  const fb = (v.feedback || '_no feedback_').replace(/\n/g, ' ').slice(0, 400);
+                  const vScore = (typeof v.score === 'number') ? v.score : 0;
+                  lines.push(`- ❌ **${gname}** (${vScore.toFixed(2)}): ${fb}`);
+                }
+                const out = run0.final_output;
+                if (out && typeof out === 'string') {
+                  lines.push('', '<details><summary>Agent output (truncated)</summary>', '', '```', truncateText(out, 30), '```', '', '</details>');
+                }
+                failureDetails.push(lines.join('\n'));
+              }
+              const failurePanel = (failed > 0 && failureDetails.length > 0)
+                ? [
+                    '<details><summary>🐛 Failure details</summary>',
+                    '',
+                    failureDetails.join('\n\n---\n\n'),
+                    '',
+                    '</details>',
+                  ].join('\n')
+                : '';
+
+              // Suggestion / recommendation report (--suggest --recommend).
+              const sug = (json.metadata && json.metadata.suggestion_report) || null;
+              const rec = (json.metadata && json.metadata.recommendation_report) || null;
+              const suggestionParts = [];
+              if (sug && typeof sug === 'string' && sug.trim().length > 0) {
+                suggestionParts.push(sug.trim());
+              }
+              if (rec && typeof rec === 'string' && rec.trim().length > 0) {
+                if (suggestionParts.length > 0) suggestionParts.push('\n\n---\n\n');
+                suggestionParts.push(rec.trim());
+              }
+              const suggestionPanel = suggestionParts.length > 0
+                ? [
+                    failed > 0
+                      ? '<details><summary>💡 Suggestions / root-cause analysis</summary>'
+                      : '<details><summary>💡 Suggestions / recommendations</summary>',
+                    '',
+                    suggestionParts.join(''),
+                    '',
+                    '</details>',
+                  ].join('\n')
+                : '';
+
+              // Raw eval output (closed by default — fallback / drill-down).
+              const rawPanel = rawMd
+                ? [
+                    '<details><summary>📄 Full eval output (raw --format github-comment markdown)</summary>',
+                    '',
+                    rawMd.trim(),
+                    '',
+                    '</details>',
+                  ].join('\n')
+                : '';
+
+              const parts = [
+                `### Agent: \`${agent}\``,
+                '',
+                headline,
+                '',
+                taskTable,
+              ];
+              if (tokensSection) { parts.push('', tokensSection); }
+              if (qualitySection) { parts.push('', qualitySection); }
+              if (failurePanel) { parts.push('', failurePanel); }
+              if (suggestionPanel) { parts.push('', suggestionPanel); }
+              if (rawPanel) { parts.push('', rawPanel); }
+              sections.push(parts.join('\n'));
+            }
+
+            const totalLegs = allDirs.length;
+
+            const prepareMode = (process.env.PREPARE_MODE || '').trim();
+            const prepareReason = (process.env.PREPARE_REASON || '').trim();
+            let scopeBanner = '';
+            if (prepareMode === 'none') {
+              scopeBanner =
+                '> ℹ️ **No agents evaluated.** ' + (prepareReason || 'No relevant changes detected.');
+            } else if (prepareMode === 'subset') {
+              scopeBanner =
+                '> 🎯 **Diff-scoped run.** ' + (prepareReason || 'Only changed agents evaluated.') +
+                ' Touch `.github/workflows/waza-agent-evals.yml` or trigger `workflow_dispatch` to run all agents.';
+            } else if (prepareMode === 'single') {
+              scopeBanner =
+                '> 🎯 **Single-agent run.** ' + (prepareReason || 'workflow_dispatch input.');
+            } else if (prepareMode === 'full') {
+              scopeBanner =
+                '> 🔁 **Full matrix run.** ' + (prepareReason || 'All configured agents evaluated.');
+            }
+
+            const header = [
+              MARKER,
+              '## 🤖 Waza agent evals (advisory)',
+              '',
+              scopeBanner,
+              scopeBanner ? '' : null,
+              'Ran ' + totalLegs + ' agent eval' + (totalLegs === 1 ? '' : 's') +
+                ' against `claude-sonnet-4.6`. Each eval consumes ~5 premium Copilot requests; results are non-blocking — investigate failures via the workflow logs and the per-agent `waza-agent-results-*` artifacts.',
+              '',
+              '> **How this works:** This workflow auto-syncs the canonical `.github/agents/<agent>.agent.md` into the sibling mirror inside `.github/evals/agents/<agent>/` before each run, so the score below reflects the version of the agent in this PR — not whatever was committed when the eval was first wired up.',
+              '',
+            ].filter((line) => line !== null).join('\n');
+
+            const sectionsBlock = sections.length > 0
+              ? sections.join('\n\n---\n\n')
+              : '_No agents in scope for this PR._';
+            const body = [
+              header.replace(/\s+$/, ''),
+              tokenCompareSection.replace(/\s+$/, ''),
+              sectionsBlock,
+            ].filter((s) => s.length > 0).join('\n\n') + '\n';
+
+            const { owner, repo } = context.repo;
+            const issue_number = context.payload.pull_request.number;
+            // Paginate: a single listComments call returns one page only, so
+            // on a busy PR the existing comment would be missed and a
+            // duplicate posted on every push.
+            const comments = await github.paginate(
+              github.rest.issues.listComments,
+              { owner, repo, issue_number, per_page: 100 },
+            );
+            const existing = comments.find((c) => c.body && c.body.includes(MARKER));
+            if (existing) {
+              await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
+            } else {
+              await github.rest.issues.createComment({ owner, repo, issue_number, body });
+            }
diff --git a/.github/workflows/waza-evals.yml b/.github/workflows/waza-evals.yml
new file mode 100644
index 0000000..c0b3f18
--- /dev/null
+++ b/.github/workflows/waza-evals.yml
@@ -0,0 +1,875 @@
+name: Waza skill evals
+
+# Advisory-mode evaluation of agent skills.
+# Runs on PRs that touch SKILL.md or any eval file. Posts a comment with results.
+# Always non-blocking — eval failures never gate merges.
+#
+# Single source of truth: .github/evals/manifest.yaml
+#   Lists configured skills, tier classification, and per-tier model fan-out.
+#   Edit that file to add/remove a skill or promote a tier — this workflow
+#   reads it dynamically and needs no changes.
+#
+# Architecture:
+#   - `prepare` job: reads the manifest, then diffs base..head (or honors a
+#     workflow_dispatch input) to determine which subset of skills to
+#     evaluate. Builds the matrix.include payload (one entry per
+#     skill × tier-model). Project-wide config changes (.waza.yaml, this
+#     workflow file, the manifest) trigger the full matrix.
+#   - `tokens` job: runs once, compares token counts across all skills vs main
+#     and uploads the result for the comment job. Cheap, always runs.
+#   - `eval` job: matrix expanded purely from `prepare.outputs.legs`. Each
+#     leg runs the eval suite plus per-skill signal steps (tokens profile,
+#     quality, check). Uploads per-leg artifacts. Skipped entirely if no
+#     skills changed.
+#   - `comment` job: fan-in. Downloads all artifacts and posts a single
+#     PR comment (tagged with a hidden HTML marker so that later pushes
+#     update it in place) with one section per skill
+#     (in manifest order) plus a header noting which skills ran and why.
+#
+# Per-PR scoping:
+#   - Skill-only changes → only changed skills run (saves Copilot quota).
+#   - .waza.yaml, manifest, or workflow file changes → full matrix.
+#   - .github/evals/<skill>/ changes → that skill only.
+#   - workflow_dispatch with no input → full matrix.
+#   - workflow_dispatch with `skill:` input → that skill only.
+#
+# Notes:
+#   - waza's eval schema only supports `skill:`. Custom agents under
+#     .github/agents/*.agent.md are *not* evaluable by this workflow. See
+#     docs/WAZA.md "Agent evals" for the upstream limitation.
+#   - copilot-sdk needs a Copilot-scoped token. Default GITHUB_TOKEN does
+#     NOT carry that scope. We use the `COPILOT_GITHUB_TOKEN` repo secret.
+#     Comment posting uses the default token (only needs pull-requests: write).
+ +on: + pull_request: + paths: + - '.github/skills/**/SKILL.md' + - '.github/skills/**/eval.yaml' + - '.github/skills/**/tasks/**' + - '.github/skills/**/fixtures/**' + - '.github/evals/**' + - '.waza.yaml' + - '.github/workflows/waza-evals.yml' + workflow_dispatch: + inputs: + skill: + description: 'Single skill name to run (default: all configured pilot evals)' + required: false + type: string + +permissions: + contents: read + pull-requests: write + +concurrency: + group: waza-evals-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +# Pin waza to a known-good release. Bump deliberately after validating that +# the new version's eval behavior still matches our baselines. Never resolve +# via `latest` — the microsoft/waza repo publishes the core release and the +# sibling azd-extension release at the same commit, and GitHub's +# `releases/latest` endpoint returns whichever was published last, which has +# bitten PR #109 with a 404 on the wrong asset. +env: + WAZA_VERSION: 'v0.33.0' + +# Note: there is no top-level skill list. The canonical list lives in +# .github/evals/manifest.yaml and is read by the `prepare` job below. + +jobs: + # --------------------------------------------------------------------------- + # preflight: verify that the COPILOT_GITHUB_TOKEN secret is configured. + # When absent, every downstream job is skipped cleanly (no red checks). The + # maintainer setup steps are in PR #109 / README. + # --------------------------------------------------------------------------- + preflight: + name: Preflight (check secrets) + runs-on: ubuntu-latest + timeout-minutes: 2 + outputs: + enabled: ${{ steps.check.outputs.enabled }} + steps: + - name: Check COPILOT_GITHUB_TOKEN availability + id: check + env: + TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + run: | + if [ -z "${TOKEN:-}" ]; then + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza skill eval jobs. 
See repo README / PR #109 for setup." + exit 0 + fi + # Token is set — verify it can actually read the private microsoft/waza + # repo (release downloads need access). Reject silently if 401/403/404. + # Capture headers + body for diagnostics (no token is ever printed). + hdr_file=$(mktemp) + body_file=$(mktemp) + http_code=$(curl -sS -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/microsoft/waza/releases/latest || true) + if [ "${http_code}" = "200" ]; then + echo "enabled=true" >> "$GITHUB_OUTPUT" + echo "COPILOT_GITHUB_TOKEN can read microsoft/waza — eval jobs will run." + else + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza skill eval jobs." + echo "--- diagnostic: response headers (token not included) ---" + grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true + echo "--- diagnostic: response body (first 500 bytes) ---" + head -c 500 "${body_file}" || true + echo + echo "--- diagnostic: token-user identity probe ---" + user_code=$(curl -sS -o "${body_file}.user" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/user || true) + echo "GET /user -> HTTP ${user_code}" + if [ "${user_code}" = "200" ]; then + # Print only the login + token type, never the token itself. 
+ jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user" + else + head -c 300 "${body_file}.user" || true + fi + echo + fi + rm -f "${hdr_file}" "${body_file}" "${body_file}.user" + + # --------------------------------------------------------------------------- + # prepare: read .github/evals/manifest.yaml and decide which skills to + # evaluate based on the diff / dispatch input. Outputs: + # - skills: JSON array of selected skill names (drives comment ordering) + # - legs: JSON array of {skill, model, baseline} for matrix.include + # - baseline_models: JSON array of model names that run with --baseline + # - mode/reason: human-readable scope info for the PR comment banner + # --------------------------------------------------------------------------- + prepare: + name: Determine matrix + runs-on: ubuntu-latest + timeout-minutes: 5 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + outputs: + skills: ${{ steps.select.outputs.skills }} + legs: ${{ steps.select.outputs.legs }} + baseline_models: ${{ steps.select.outputs.baseline_models }} + reason: ${{ steps.select.outputs.reason }} + mode: ${{ steps.select.outputs.mode }} + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Select skills + id: select + env: + REQUESTED: ${{ inputs.skill }} + EVENT: ${{ github.event_name }} + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + set -euo pipefail + + # Source of truth for skills + tier + per-tier model fan-out. + manifest=".github/evals/manifest.yaml" + if [ ! -f "$manifest" ]; then + echo "::error::manifest not found: $manifest" + exit 1 + fi + + # Convert YAML -> JSON once; everything else is jq. + manifest_json="$(yq -o=json '.' 
"$manifest")" + + ALL_SKILLS="$(echo "$manifest_json" | jq -c '[.skills[].name]')" + BASELINE_MODELS="$(echo "$manifest_json" | jq -c ' + [ .tiers[].models[] | select(.baseline == true) | .name ] | unique + ')" + echo "ALL_SKILLS=$ALL_SKILLS" + echo "BASELINE_MODELS=$BASELINE_MODELS" + + # emit + # Computes `legs` from selected skills + manifest tiers and writes + # all four outputs (skills, legs, baseline_models, mode, reason). + emit() { + local selected="$1" mode="$2" reason="$3" + local legs + legs="$(echo "$manifest_json" | jq -c --argjson sel "$selected" ' + . as $root + | [ $root.skills[] + | .name as $sname + | select($sel | index($sname)) + | .tier as $tier + | $root.tiers[$tier].models[] + | { skill: $sname, model: .name, baseline: (.baseline == true) } + ] + ')" + { + echo "skills=${selected}" + echo "legs=${legs}" + echo "baseline_models=${BASELINE_MODELS}" + echo "mode=${mode}" + echo "reason=${reason}" + } >> "$GITHUB_OUTPUT" + echo "Selected skills: ${selected}" + echo "Legs: ${legs}" + echo "Mode: ${mode}" + echo "Reason: ${reason}" + } + + # --- Case 1: workflow_dispatch with single-skill input --- + if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then + if echo "$ALL_SKILLS" | jq -e --arg s "$REQUESTED" '. 
| index($s)' > /dev/null; then + emit "[\"$REQUESTED\"]" "single" "workflow_dispatch input ($REQUESTED)" + exit 0 + else + echo "::error::Requested skill '$REQUESTED' is not in the manifest ($ALL_SKILLS)" + exit 1 + fi + fi + + # --- Case 2: workflow_dispatch without input → full matrix --- + if [ "$EVENT" = "workflow_dispatch" ]; then + emit "$ALL_SKILLS" "full" "workflow_dispatch (no input → full matrix)" + exit 0 + fi + + # --- Case 3: pull_request — diff against base --- + if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then + emit "$ALL_SKILLS" "full" "pull_request: missing base/head SHA → full matrix" + exit 0 + fi + + # Make sure the base commit is fetched (checkout fetched everything + # via fetch-depth: 0, but be defensive in case of shallow merges). + git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true + + changed=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" || true) + if [ -z "$changed" ]; then + emit "[]" "none" "no files changed in PR" + exit 0 + fi + + echo "--- changed files ---" + echo "$changed" + echo "---------------------" + + # Project-wide config changes → full matrix. + # Includes the manifest itself: changing tiers / model fan-out + # affects every skill, so re-run everything. + if echo "$changed" | grep -qE '^(\.waza\.yaml|\.github/workflows/waza-evals\.yml|\.github/evals/manifest\.yaml)$'; then + emit "$ALL_SKILLS" "full" "project-wide config change (.waza.yaml, manifest, or workflow file) → full matrix" + exit 0 + fi + + # Per-skill changes: collect skill names from both layouts. + # .github/skills//... → SKILL.md, references, etc. + # .github/evals//... → eval.yaml, tasks, fixtures. + # NF >= 4 filter excludes files at the root of evals/ (like + # manifest.yaml) which are handled by the config-wide check above. 
+ changed_skills=$( + echo "$changed" | awk -F/ ' + /^\.github\/skills\// && NF >= 4 {print $3} + /^\.github\/evals\// && NF >= 4 {print $3} + ' | sort -u + ) + + if [ -z "$changed_skills" ]; then + emit "[]" "none" "no per-skill files changed" + exit 0 + fi + + # Intersect with the canonical configured list. + selected=$( + printf '%s\n' "$changed_skills" \ + | jq -R -s -c --argjson all "$ALL_SKILLS" \ + '[ split("\n")[] | select(length > 0) | select(IN($all[])) ]' + ) + + if [ "$selected" = "[]" ]; then + emit "[]" "none" "changed skill(s) not in the manifest: $(echo "$changed_skills" | tr '\n' ' ')" + exit 0 + fi + + count=$(echo "$selected" | jq 'length') + names=$(echo "$selected" | jq -r 'join(", ")') + emit "$selected" "subset" "diff-scoped: ${count} changed skill(s) — ${names}" + + # --------------------------------------------------------------------------- + # tokens: compare token counts across all SKILL.md files vs main. + # Runs once (not per-matrix) and uploads a single JSON artifact consumed + # by the comment job. Advisory — never fails the workflow. 
+ # --------------------------------------------------------------------------- + tokens: + name: Token comparison vs main (advisory) + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (pinned release) + run: | + set -euo pipefail + waza_version="${WAZA_VERSION}" + if [ -z "${waza_version}" ]; then + echo "::error::WAZA_VERSION env var is not set" + exit 1 + fi + echo "Installing waza ${waza_version}" + + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Token comparison vs main (advisory) + id: tokens-compare + run: | + set -uo pipefail + mkdir -p .waza-results + # Advisory: no --strict so the step never fails the workflow. + # --format json produces machine-readable output for the comment job. + waza tokens compare main --skills --threshold 10 --format json \ + > .waza-results/tokens-compare.json 2>&1 || true + echo "--- token comparison output ---" + cat .waza-results/tokens-compare.json || true + # Always exit cleanly — advisory only. 
+ exit 0 + + - name: Upload token comparison artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-tokens-compare + path: .waza-results/tokens-compare.json + retention-days: 14 + if-no-files-found: warn + include-hidden-files: true + + # --------------------------------------------------------------------------- + # eval: matrix (skill x model). Each leg runs the eval suite plus per-skill + # signal steps (tokens profile, quality, check). All steps are advisory. + # + # The skill axis is supplied by the `prepare` job — only changed skills run + # on per-PR events; the full list runs on workflow_dispatch (no input) and + # on PRs that touch project-wide config. + # --------------------------------------------------------------------------- + eval: + name: "${{ matrix.skill || 'eval' }} / ${{ matrix.model || 'skipped (no skill changes)' }}" + needs: [preflight, prepare] + if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != '' + runs-on: ubuntu-latest + timeout-minutes: 25 + # Per-job continue-on-error so a single matrix leg failure doesn't fail + # the whole workflow. Combined with `if: always()` on the comment job, + # this guarantees the PR comment is posted even when some legs fail. + continue-on-error: true + strategy: + fail-fast: false + matrix: + # Matrix is sourced entirely from the manifest via the prepare job. + # Each include entry is `{ skill, model, baseline }`. Adding a skill + # or promoting a tier means editing .github/evals/manifest.yaml — + # never this workflow. + include: ${{ fromJSON(needs.prepare.outputs.legs) }} + env: + # copilot-sdk authenticates with this token. Default GITHUB_TOKEN does + # not carry Copilot scope, so we use a dedicated PAT in repo secrets. + # Also reused for the release-API lookup (only needs public-repo read). 
+ GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (pinned release) + run: | + set -euo pipefail + waza_version="${WAZA_VERSION}" + if [ -z "${waza_version}" ]; then + echo "::error::WAZA_VERSION env var is not set" + exit 1 + fi + echo "Installing waza ${waza_version}" + + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Run waza eval (advisory) + id: run + run: | + # GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT + # disable -e, so a non-zero exit from `waza run` (e.g. metric below + # threshold) kills the script before `rc=$?` runs. Explicitly + # disable errexit so we can capture the code and surface it in the + # PR comment instead of failing the leg silently. + set +e + set -uo pipefail + mkdir -p .waza-results + spec=".github/evals/${{ matrix.skill }}/eval.yaml" + # Slug used for filenames + artifact suffix; harmless when the + # model has dots (gpt-5.4) since GH Actions allows them. + slug="${{ matrix.skill }}-${{ matrix.model }}" + + # gpt-5.4 (and any other model flagged `baseline: true` in the + # manifest) runs in --baseline (A/B) mode to cap quota cost while + # still providing a reference point for cross-model comparison. All + # other models run standard. The PR comment labels each leg. 
+ extra_flags="" + if [ "${{ matrix.baseline }}" = "true" ]; then + extra_flags="--baseline" + fi + + # ---- Retry-on-session-not-found wrapper ----------------------------- + # The Copilot SDK occasionally drops the agent's session before + # waza's `prompt` grader can resume it (`continue_session: true`), + # producing JSON-RPC -32603 "Session not found" errors. When this + # fires, the run is marked status=error with `validations: null` — + # ALL graders' verdicts for that task are wiped, dragging the leg + # aggregate down ~50–80% even when the agent's actual response was + # correct. + # + # The error is purely transient (server-side session GC). Retrying + # the leg with a fresh waza process consistently recovers. We retry + # up to 2 times (3 total attempts) on session-not-found ONLY — other + # errors (rate-limit 429, below-threshold scores, network) are NOT + # retried since they have different recovery characteristics. + # + # Stdout (--format github-comment) is the markdown for the PR + # comment; capture it cleanly to its own file. Stderr (progress, + # task results, "Running benchmark:") streams to the runner log. + # --model overrides the spec's config.model so we can fan out the + # same eval suite across multiple models. + # --judge-model decouples the LLM-as-judge from the executor model + # so quality scores are always judged by claude-sonnet-4.6. + # --suggest --recommend appends outcome-tied recommendations. + max_attempts=3 + attempt=0 + rc=0 + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + echo "::group::waza run attempt ${attempt}/${max_attempts} for ${slug}" + rc=0 + # shellcheck disable=SC2086 + waza run "${spec}" \ + --model "${{ matrix.model }}" \ + --judge-model "claude-sonnet-4.6" \ + --suggest \ + --recommend \ + ${extra_flags} \ + --format "github-comment" \ + --output ".waza-results/${slug}.json" \ + --reporter "junit:.waza-results/${slug}.junit.xml" \ + --parallel \ + > ".waza-results/${slug}.md" + rc=$? 
+ echo "::endgroup::" + + if [ ! -f ".waza-results/${slug}.json" ]; then + echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})" + if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi + break + fi + + session_errs=$(jq -r ' + [.tasks[]?.runs[]? | select(.error_msg // "" | contains("Session not found"))] | length + ' ".waza-results/${slug}.json" 2>/dev/null || echo 0) + + if [ "${session_errs}" = "0" ]; then + echo "::notice::${slug} attempt ${attempt} clean (no session-not-found errors)" + break + fi + + echo "::warning::${slug} attempt ${attempt} hit ${session_errs} session-not-found error(s)" + if [ $attempt -lt $max_attempts ]; then + # Discard partial artifacts so the next attempt is independent. + rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.md" ".waza-results/${slug}.junit.xml" + sleep 5 + fi + done + # If retries exhausted and the artifact STILL has session-not-found + # errors, the data is corrupt (validations: null on affected runs + # would drag the leg aggregate down 50–80% as a fake "low score"). + # Discard it so the PR comment surfaces this as INFRA_FAILED rather + # than a misleading low score. + final_session_errs=0 + if [ -f ".waza-results/${slug}.json" ]; then + final_session_errs=$(jq -r ' + [.tasks[]?.runs[]? 
| select(.error_msg // "" | contains("Session not found"))] | length + ' ".waza-results/${slug}.json" 2>/dev/null || echo 0) + fi + if [ "${final_session_errs}" != "0" ]; then + echo "::error::${slug} still has ${final_session_errs} session-not-found error(s) after ${max_attempts} attempts — discarding corrupt artifact" + printf 'session_not_found_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \ + "${final_session_errs}" "${max_attempts}" "${rc}" \ + > ".waza-results/${slug}.infra-failed" + rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.junit.xml" + # Replace the markdown summary with a clear INFRA_FAILED notice + # so the PR comment shows the actual problem instead of stale + # markdown from one of the failed attempts. Use printf (no heredoc) + # because heredoc EOF terminators clash with YAML block-scalar + # indentation rules in `run: |` steps. + { + printf '### `%s` — INFRA_FAILED\n\n' "${slug}" + printf 'waza run hit `%s` `Session not found` JSON-RPC error(s) ' "${final_session_errs}" + printf 'from the Copilot SDK after **%s attempt(s)**. ' "${max_attempts}" + printf 'The session-resume path used by `prompt` graders with ' + printf '`continue_session: true` is intermittently flaky in CI; ' + printf 'retries did not recover. **No score is reported for this leg** ' + printf '— treating a corrupted run as a low score would be misleading.\n' + } > ".waza-results/${slug}.md" + fi + # ---- end retry wrapper ---------------------------------------------- + + echo "exit_code=${rc}" >> "$GITHUB_OUTPUT" + echo + echo "--- captured PR-comment markdown ---" + cat ".waza-results/${slug}.md" || true + # Never fail the step itself — surface the code in the comment. 
+ exit 0 + + - name: Tokens profile (advisory) + id: tokens-profile + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + waza tokens profile ".github/skills/${{ matrix.skill }}" \ + > ".waza-results/${slug}-tokens-profile.txt" 2>&1 || true + cat ".waza-results/${slug}-tokens-profile.txt" || true + exit 0 + + - name: Quality signal (advisory) + id: quality + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + # --judge-model omitted: this step uses the project default judge model + # (claude-sonnet-4.6 from .waza.yaml) for consistent quality scoring + # regardless of which executor model is running in this matrix leg. + waza quality ".github/skills/${{ matrix.skill }}" --format table \ + > ".waza-results/${slug}-quality.txt" 2>&1 || true + cat ".waza-results/${slug}-quality.txt" || true + exit 0 + + - name: Compliance check (advisory) + id: check + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + waza check ".github/skills/${{ matrix.skill }}" \ + > ".waza-results/${slug}-check.txt" 2>&1 || true + cat ".waza-results/${slug}-check.txt" || true + exit 0 + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-results-${{ matrix.skill }}-${{ matrix.model }} + path: .waza-results/ + retention-days: 14 + if-no-files-found: warn + # `.waza-results/` starts with a dot, and upload-artifact treats + # any path segment starting with `.` as hidden by default. Without + # this, the artifact is silently empty. + include-hidden-files: true + + # --------------------------------------------------------------------------- + # comment: fan-in. Downloads all artifacts and posts one aggregated comment. 
+ # --------------------------------------------------------------------------- + comment: + name: Post advisory comment on PR + needs: [preflight, prepare, eval, tokens] + if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always() + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Download all eval artifacts + uses: actions/download-artifact@v8 + with: + path: artifacts + pattern: waza-results-* + merge-multiple: false + + - name: Download token comparison artifact + uses: actions/download-artifact@v8 + with: + name: waza-tokens-compare + path: artifacts/waza-tokens-compare + continue-on-error: true + + - name: Aggregate and post comment + uses: actions/github-script@v9 + env: + PREPARE_MODE: ${{ needs.prepare.outputs.mode }} + PREPARE_REASON: ${{ needs.prepare.outputs.reason }} + PREPARE_SKILLS: ${{ needs.prepare.outputs.skills }} + PREPARE_BASELINES: ${{ needs.prepare.outputs.baseline_models }} + with: + # Default GITHUB_TOKEN — has `pull-requests: write` and is the + # right identity for bot-style comments. + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const path = require('path'); + + // Each matrix job uploads `waza-results--` + // containing per-leg files (slug.md, slug-tokens-profile.txt, + // slug-quality.txt, slug-check.txt). + // + // Skill ordering and baseline-model classification are sourced + // from .github/evals/manifest.yaml via the prepare job — no + // hardcoded lists in this workflow. + const skills = JSON.parse(process.env.PREPARE_SKILLS || '[]'); + const baselineModels = new Set( + JSON.parse(process.env.PREPARE_BASELINES || '[]') + ); + const root = 'artifacts'; + const allDirs = fs.existsSync(root) + ? fs.readdirSync(root) + .filter((d) => d.startsWith('waza-results-')) + .sort() + : []; + + // Helper: read a file, return trimmed content or fallback string. 
+ // Logs a debug note (shown only when step debug logging is enabled) when returning the fallback, so missing artifacts + // can be traced in the Actions log without failing the step. + function readArtifact(filePath, fallback) { + if (fs.existsSync(filePath)) { + const c = fs.readFileSync(filePath, 'utf8').trim(); + if (c) return c; + core.debug(`readArtifact: file exists but is empty — ${filePath}`); + } else { + core.debug(`readArtifact: file not found — ${filePath}`); + } + return fallback; + } + + // Helper: wrap content in a
<details> block if it exceeds threshold. + function maybeCollapse(summary, content, threshold) { + const limit = threshold || 50; + const lines = content.split('\n').length; + if (lines > limit) { + return `
<details><summary>${summary} (${lines} lines — click to expand)</summary>\n\n${content}\n\n</details>
`; + } + return `**${summary}**\n\n${content}`; + } + + // Group artifacts by skill. + const bySkill = new Map(); + for (const d of allDirs) { + const rest = d.replace(/^waza-results-/, ''); + const skill = skills.find((s) => rest === s || rest.startsWith(s + '-')); + if (!skill) continue; + const model = rest === skill ? '(default)' : rest.slice(skill.length + 1); + if (!bySkill.has(skill)) bySkill.set(skill, []); + bySkill.get(skill).push({ model, dir: d, slug: rest }); + } + + // Token comparison section (top-level, from tokens job). + let tokenCompareSection = ''; + const tcPath = path.join(root, 'waza-tokens-compare', 'tokens-compare.json'); + const tcRaw = readArtifact(tcPath, ''); + if (tcRaw) { + const tcBlock = '```json\n' + tcRaw + '\n```'; + tokenCompareSection = [ + '
<details><summary>📊 Token comparison vs main (advisory)</summary>', + '', + tcBlock, + '', + '</details>
', + '', + ].join('\n'); + } + + // Build per-skill sections. + const sections = []; + for (const skill of skills) { + if (!bySkill.has(skill)) continue; + const legs = bySkill.get(skill).sort((a, b) => a.model.localeCompare(b.model)); + + // Score (per model) + Suggestions/Recommendations + const scoreParts = []; + for (const leg of legs) { + const isBaseline = baselineModels.has(leg.model); + const modelLabel = isBaseline + ? leg.model + ' *(baseline — A/B mode)*' + : leg.model; + const mdPath = path.join(root, leg.dir, leg.slug + '.md'); + const body = readArtifact(mdPath, + '_No output captured. See workflow logs and the `' + leg.dir + '` artifact._'); + scoreParts.push('
<details><summary>Model: ' + modelLabel + + '</summary>\n\n' + body + '\n\n</details>
'); + } + const scoreSection = '
<details><summary>📈 Score (per model) + Suggestions/Recommendations</summary>\n\n' + + scoreParts.join('\n\n') + '\n\n</details>
'; + + // Tokens (count + profile) — model-independent, use first available leg. + let tokenBody = '_Not available._'; + for (const leg of legs) { + const tp = path.join(root, leg.dir, leg.slug + '-tokens-profile.txt'); + const c = readArtifact(tp, ''); + if (c) { tokenBody = '```\n' + c + '\n```'; break; } + } + const tokenSection = maybeCollapse('🔢 Tokens (count + profile)', tokenBody); + + // Quality (5-dim table) — model-independent, use first available leg. + let qualityBody = '_Not available._'; + for (const leg of legs) { + const qp = path.join(root, leg.dir, leg.slug + '-quality.txt'); + const c = readArtifact(qp, ''); + if (c) { qualityBody = '```\n' + c + '\n```'; break; } + } + const qualitySection = maybeCollapse('🎯 Quality (5-dim table)', qualityBody); + + // Check (compliance summary) — model-independent, use first available leg. + let checkBody = '_Not available._'; + for (const leg of legs) { + const cp = path.join(root, leg.dir, leg.slug + '-check.txt'); + const c = readArtifact(cp, ''); + if (c) { checkBody = '```\n' + c + '\n```'; break; } + } + // `waza check` expects `eval.yaml` colocated with `SKILL.md`. This + // repo separates them (`.github/skills//SKILL.md` vs + // `.github/evals//eval.yaml`), so the "Evaluation Suite: + // Not Found" line is a false negative — the eval actually ran + // (see the "Score" section above). Prepend a note so reviewers + // are not misled. 
+ const checkNote = + '> ℹ️ **`waza check` expects `eval.yaml` colocated with `SKILL.md`.** ' + + 'This repo separates them into `.github/evals/' + skill + '/eval.yaml`, ' + + 'so the "Evaluation Suite: Not Found" line below is a false negative — ' + + 'the eval actually ran (see the **Score** section above).\n\n'; + const checkSection = maybeCollapse('✅ Check (compliance summary)', checkNote + checkBody); + + sections.push([ + '### Skill: `' + skill + '`', + '', + scoreSection, + '', + tokenSection, + '', + qualitySection, + '', + checkSection, + ].join('\n')); + } + + const totalLegs = allDirs.length; + + // Selection-mode banner from the prepare job. + const prepareMode = (process.env.PREPARE_MODE || '').trim(); + const prepareReason = (process.env.PREPARE_REASON || '').trim(); + let scopeBanner = ''; + if (prepareMode === 'none') { + scopeBanner = + '> ℹ️ **No skills evaluated.** ' + (prepareReason || 'No relevant changes detected.') + + ' The token comparison below (if any) is the only signal for this PR.'; + } else if (prepareMode === 'subset') { + scopeBanner = + '> 🎯 **Diff-scoped run.** ' + (prepareReason || 'Only changed skills evaluated.') + + ' Touch `.waza.yaml` or trigger `workflow_dispatch` to run the full matrix.'; + } else if (prepareMode === 'single') { + scopeBanner = + '> 🎯 **Single-skill run.** ' + (prepareReason || 'workflow_dispatch input.'); + } else if (prepareMode === 'full') { + scopeBanner = + '> 🔁 **Full matrix run.** ' + (prepareReason || 'All configured skills evaluated.'); + } + + const header = [ + '', + '## 🧪 Waza skill evals (advisory)', + '', + scopeBanner, + scopeBanner ? '' : null, + 'Ran ' + totalLegs + ' matrix leg' + (totalLegs === 1 ? '' : 's') + + ' in parallel (skills × models).
Results are non-blocking — investigate failures via the workflow logs and the per-leg `waza-results-*` artifacts.', + '', + '> **Legend:** Models flagged `baseline: true` in `.github/evals/manifest.yaml` (currently: `' + + (Array.from(baselineModels).join('`, `') || 'none') + + '`) run with `--baseline` (A/B mode) to cap quota. All other models run standard. Judge model is fixed at `claude-sonnet-4.6` across all legs.', + '', + ].filter((line) => line !== null).join('\n'); + + // Assemble body. Each major block is separated by a blank line so + // that GitHub Flavored Markdown correctly recognizes the per-skill + // `### Skill: ...` headings (without a blank line after the + // preceding `
` they get rendered as plain text). + const sectionsBlock = sections.length > 0 + ? sections.join('\n\n---\n\n') + : '_No artifacts produced. See workflow logs._'; + const body = [ + header.replace(/\s+$/, ''), + tokenCompareSection.replace(/\s+$/, ''), + sectionsBlock, + ].filter((s) => s.length > 0).join('\n\n') + '\n'; + + const { owner, repo } = context.repo; + const issue_number = context.payload.pull_request.number; + + // Paginate to find our marker comment — listComments defaults to + // 30 per page and our comment may be beyond that on busy PRs. + let existing = null; + for await (const response of github.paginate.iterator( + github.rest.issues.listComments, + { owner, repo, issue_number, per_page: 100 } + )) { + const found = response.data.find((c) => c.body && c.body.includes('')); + if (found) { existing = found; break; } + } + + if (existing) { + await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body }); + } else { + await github.rest.issues.createComment({ owner, repo, issue_number, body }); + } diff --git a/.gitignore b/.gitignore index fe355f1..75e4660 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,8 @@ docs/docusaurus/ # Playwright MCP test artifacts .playwright-mcp/ + +# Waza skill/agent evals (microsoft/waza) +.waza-cache/ +.waza-results/ +*.waza-results.json diff --git a/.waza.yaml b/.waza.yaml new file mode 100644 index 0000000..79fab8c --- /dev/null +++ b/.waza.yaml @@ -0,0 +1,47 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/config.schema.json + +# Waza configuration for Azure/git-ape. +# +# Skills live under .github/skills/ (existing layout — not touched). +# Eval suites live under .github/evals/ alongside them. +# Run results land under .waza-results/ (gitignored). 
+ +paths: + skills: .github/skills/ + evals: .github/evals/ + results: .waza-results/ + +defaults: + engine: copilot-sdk + model: claude-sonnet-4.6 + timeout: 300 + parallel: true + workers: 4 + verbose: false + sessionLog: false + +cache: + enabled: true + dir: .waza-cache + +dev: + model: claude-sonnet-4.6 + target: medium-high + maxIterations: 5 + +tokens: + # Aspirational token caps for new SKILL.md files. These are intentionally + # tighter than today's corpus (13 skills as of 2026-05, p75 ≈ 3.2k tokens) — + # the goal is to push NEW skills to fit comfortably in agent context, while + # existing oversize skills are gradually trimmed via /skill-improve. + # Re-evaluate once the skill corpus stabilises: `waza tokens count + # .github/skills/ --format json | jq '[.files|to_entries[]| + # select(.key|endswith("/SKILL.md"))|.value.tokens]|sort'`. + warningThreshold: 1000 + # Hard fallback ceiling enforced by `waza tokens compare --strict`. + # Tightened in Wave 3b from 1500 → 1300 (1.3× warningThreshold) to narrow + # the gap between "warn" and "block" for new skills. + fallbackLimit: 1300 + +graders: + programTimeout: 30 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2f21d88..ea92bd1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -91,6 +91,28 @@ Agent files live in `.github/agents/` and require: - YAML frontmatter with `description` field. - A `## Warning` section (experimental disclaimer). +## Adding an Eval Suite + +Every skill and agent in this repo can have a companion behavioral eval +under `.github/evals/`. Evals are scored on PRs via the +[`waza-evals`](.github/workflows/waza-evals.yml) and +[`waza-agent-evals`](.github/workflows/waza-agent-evals.yml) workflows. + +To scaffold an eval for an existing skill or agent, use the slash +commands in VS Code (Copilot Chat): + +- `/skill-onboard skillName=` — bootstraps `.github/evals//` + and appends a `{ name, tier: expanded }` entry to `manifest.yaml`. 
+- `/agent-onboard agentName=<agent-name>` — bootstraps + `.github/evals/agents/<agent-name>/`. No `manifest.yaml` edit (agent evals + are auto-discovered). + +The full lifecycle (`onboard` → `bench` → `improve` → `promote`) and the +authoring framework are documented under +[Authoring](https://azure.github.io/git-ape/docs/authoring/) on the docs +site. Decision rationale for the harness choice lives in +[`.github/evals/README.md`](.github/evals/README.md). + ## Pull Request Process 1. **Fork and branch** — Create a feature branch from `main`. diff --git a/website/docs/authoring/_category_.json b/website/docs/authoring/_category_.json new file mode 100644 index 0000000..bf1af05 --- /dev/null +++ b/website/docs/authoring/_category_.json @@ -0,0 +1,9 @@ +{ + "label": "Authoring", + "position": 7, + "link": { + "type": "generated-index", + "description": "Add, evolve, and ship skills, agents, eval suites, and the prompts that optimize them in Git-Ape." + }, + "collapsed": false +} diff --git a/website/docs/authoring/agents.md b/website/docs/authoring/agents.md new file mode 100644 index 0000000..7d75013 --- /dev/null +++ b/website/docs/authoring/agents.md @@ -0,0 +1,183 @@ +--- +title: "Authoring Agents" +sidebar_label: "Agents" +sidebar_position: 4 +description: "How to add a new agent: persona, tools allowlist, sub-agent wiring, and the dual tool taxonomy." +--- + +# Authoring Agents + +An **agent** is a persona with a `tools:` allowlist that orchestrates one or more skills to deliver a complete workflow. Agents are single `.agent.md` files under [`.github/agents/`](https://github.com/Azure/git-ape/tree/main/.github/agents). + +Where a [skill](./skills) is a runbook, an agent is the character that executes it — it owns scope, refusals, tool selection, and inter-skill orchestration. 
+ +## Quick start + +```bash +AGENT=my-new-agent +$EDITOR .github/agents/"$AGENT".agent.md +``` + +No further registration is needed — `plugin.json` declares `"agents": ".github/agents/"` and Copilot auto-discovers every `*.agent.md` file in that directory. + +> **Optimize your agent from the start.** Don't ship an `.agent.md` blind — use the prompts listed in [Prompts](./prompts) to evaluate and harden it as you write: +> +> - [`/agent-onboard`](./prompts#agent-onboard) — scaffolds `.github/evals/agents//` with positive tasks, negative tasks, and an off-topic persona-lock check, then runs a smoke trial so you can watch your tool allowlist, refusals, and orchestration in action. +> - [`/agent-bench`](./prompts#agent-bench) — benchmarks the agent across models so you know which ones honor the persona. +> - [`/agent-improve`](./prompts#agent-improve) — diagnoses failing tasks (leaked persona, wrong tool, missed skill) and proposes targeted edits to your `.agent.md`. +> - [`/agent-promote`](./prompts#agent-promote) — locks the agent in once it's stable. +> +> Run `/agent-onboard` as soon as your first draft is readable. Agent evals are auto-discovered from the filesystem; no manifest edit is required. + +## File template + +```markdown +--- +name: "My New Agent" +description: "One sentence summarising the agent's job and when to invoke it." +tools: ["read", "search", "execute/runInTerminal", "execute/awaitTerminal"] +user-invocable: true +argument-hint: "Optional free-text hint shown in the invocation picker" +--- + +## Warning + +This agent is experimental and not production-ready. + +You are **My New Agent**, responsible for . + +**Always identify yourself as "My New Agent" in your responses.** Never describe +yourself as a generic "software engineering assistant", "GitHub Copilot CLI", or +any other persona — this agent has a single, narrow purpose and your identity is +part of its contract. 
+ +## Non-goals + +This agent does **not**: + +- Deploy Azure resources — that is `/git-ape`'s job. +- Onboard repositories — that is `/git-ape-onboarding`'s job. +- Answer questions unrelated to . + +If a request is unrelated to , identify yourself as **My New Agent**, +decline in one sentence, and redirect the user to the appropriate agent. + +## Your Role + +Describe what this agent does in two or three sentences. + +## Use Skill + +Always use the `/` skill for procedure and output format. + +## Workflow + +1. Ask the user what they want to do. +2. Read any configuration or context files needed (e.g. `copilot-instructions.md`). +3. Execute the `/` skill procedure end-to-end. +4. Present the result. + +## Output Requirements + +- Concrete bullet about format (tables, JSON, fenced code blocks) +- Concrete bullet about anything that MUST appear in the response + +## Key Principle + +One paragraph stating the non-negotiable rule the agent enforces. +``` + +## Frontmatter reference + +| Field | Required | Purpose | +|-------|:--------:|---------| +| `name` | ✅ | Display name shown in invocation menus. Use Title Case (`"Azure Policy Advisor"`). | +| `description` | ✅ | One sentence used by routers and surfaced in tooling. | +| `tools` | ✅ | Allowlist of tool IDs the agent can call. See [Dual tool taxonomy](#dual-tool-taxonomy) below. | +| `user-invocable` | ⚪ | Defaults to `true`. Set `false` for sub-agents (e.g. `azure-template-generator`) that only run as a step inside another agent's workflow. | +| `argument-hint` | ⚪ | Free-text hint. | +| `agents` | ⚪ | List of sub-agent file basenames this agent delegates to. Sub-agents must also exist under `.github/agents/`. | + +## Persona-lock (non-negotiable) + +Without a persona-lock paragraph, models default to a generic "GitHub Copilot CLI assistant" persona on off-topic prompts. 
Every agent in this repo includes: + +```markdown +**Always identify yourself as "" in your responses.** Never describe +yourself as a generic "software engineering assistant", "GitHub Copilot CLI", or +any other persona — this agent has a single, narrow purpose and your identity is +part of its contract. +``` + +Plus a `## Non-goals` section listing out-of-scope domains and the agent each should be redirected to. + +Persona-lock is enforced by the agent's eval `tasks/off-topic.yaml` task — a negative test that confirms the agent declines unrelated requests in its own voice. Treat persona-lock as best-effort: the model's built-in CLI refusal sometimes fires before the agent rewrite, so do not rely on it being respected 100% of the time. The eval grader accepts both clean-refusal markers and agent-name mentions to avoid false negatives. + +## Wiring sub-agents + +If your agent calls other agents, declare them in frontmatter: + +```yaml +agents: + - azure-requirements-gatherer + - azure-template-generator + - azure-resource-deployer +``` + +Each sub-agent should set `user-invocable: false` so it only runs through the parent. See [`git-ape.agent.md`](https://github.com/Azure/git-ape/blob/main/.github/agents/git-ape.agent.md) for a multi-stage deployment pipeline that chains six sub-agents. + +## Dual tool taxonomy + +The `tools:` field lists **VS Code Copilot Chat** tool IDs — the surface where the agent runs in production. Common values: + +| Tool ID | What it grants | +|---------|----------------| +| `read` | File reads in the workspace | +| `search` | Workspace search (grep/semantic) | +| `execute/runInTerminal` | Run a terminal command | +| `execute/awaitTerminal` | Wait for an async terminal command | +| `execute/getTerminalOutput` | Read terminal output | +| `execute/createAndRunTask` | Create and run a VS Code task | +| `microsoftdocs/mcp/*` | Microsoft Learn MCP server | +| `azure-mcp/` | Azure MCP server scoped to a service (`cosmos`, `keyvault`, etc.) 
| +| `todo` | The todo list tool | +| `vscode` | VS Code commands | + +**Important:** the [eval harness](./evals) runs agents under the `copilot-sdk` executor, which emits a **different taxonomy** (SDK CLI short names: `bash`, `view`, `edit`, `create`, `sql`, `task`). Per-task `tool_constraint` graders in eval suites must target SDK names. Do **not** rewrite the production `tools:` field to satisfy a grader — fix the grader instead. See [Eval suites → Dual tool taxonomy](./evals#dual-tool-taxonomy) for the bridging pattern. + +## Always delegate to a skill + +If procedural detail (steps, output format, classification tables) belongs anywhere in the agent file, move it into a skill and have the agent reference it: + +```markdown +## Use Skill + +Always use the `/azure-policy-advisor` skill for procedure, classification tiers, and output format. +``` + +This keeps agents thin (persona + orchestration) and skills reusable (procedure). When the procedure changes, only the skill is edited; the agent picks up the new behaviour automatically. + +## Local validation + +```bash +# Lint the agent file (waza accepts .agent.md, but flags spec gaps) +waza check .github/agents/my-new-agent.agent.md + +# If you wrote an eval suite: +waza run .github/evals/agents/my-new-agent/eval.yaml -v +``` + +> `waza check` is built around the SKILL.md spec, so `.agent.md` files will surface frontmatter warnings even when well-formed. The signal that matters is the eval pass rate, not `waza check` clean exit. + +## Common pitfalls + +- **Missing persona-lock** — agent leaks "I'm a GitHub Copilot CLI assistant" on off-topic prompts. Add the standard paragraph above. +- **`tools:` field rewritten for the eval executor** — breaks the production agent. Keep `tools:` in VS Code Chat taxonomy and write per-task `tool_constraint` graders in SDK taxonomy. +- **Agent embeds procedure** — move it into a skill, keep the agent thin. 
+- **No `## Non-goals`** — without explicit redirects, off-topic refusals are generic and fail the off-topic eval task. + +## Read next + +- [Eval suites](./evals) — score the agent across models +- [Prompts](./prompts#agent-improve) — local audit + edit loop +- [Authoring skills](./skills) — the runbooks agents delegate to diff --git a/website/docs/authoring/evals.md b/website/docs/authoring/evals.md new file mode 100644 index 0000000..abf6370 --- /dev/null +++ b/website/docs/authoring/evals.md @@ -0,0 +1,231 @@ +--- +title: "Eval Suites" +sidebar_label: "Eval suites" +sidebar_position: 5 +description: "How to scaffold an eval suite for a skill or agent, what each grader scores, and how CI picks it up." +--- + +# Eval Suites + +Eval suites score skills and agents across multiple models so quality changes are caught at PR time, not after release. Git-Ape uses [microsoft/waza](https://github.com/microsoft/waza) as the eval runner; suites live under [`.github/evals/`](https://github.com/Azure/git-ape/tree/main/.github/evals). + +``` +.github/evals/ +├── manifest.yaml # CI matrix (tiers + models) +├── <skill-name>/eval.yaml # Skill evals +└── agents/<agent-name>/eval.yaml # Agent evals +``` + +## Skill eval scaffold + +Every skill eval needs an `eval.yaml` and at least two tasks (one positive, one negative): + +``` +.github/evals/my-skill/ +├── eval.yaml +└── tasks/ + ├── positive-001-typical-use.yaml + └── negative-001-off-topic.yaml +``` + +### `eval.yaml` + +```yaml +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json + +name: my-skill-eval +description: "What this suite scores." 
+skill: my-skill +version: "0.1" + +config: + trials_per_task: 2 # 2 = expanded-tier default (flake detection); bump to 3 for pilot/promotion + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 # Default — CI overrides via --model per matrix leg + +metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: Skill should activate on relevant prompts and stay quiet otherwise. + +graders: + # Cap runaway loops / unexpected plan expansion. + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + +tasks: + - "tasks/*.yaml" +``` + +### Task file + +```yaml +# tasks/positive-001-typical-use.yaml +name: positive-001-typical-use +description: "User asks for the canonical thing this skill does" +prompt: | + <a realistic user request that should trigger the skill> + +expect: + trigger: skill:my-skill + behavior: + - "describes what the skill should do procedurally" + - "second behavioral assertion" + prompt: + must_include: + - "literal substring that MUST appear in the reply" +``` + +Negative tasks replace the positive `expect.trigger` with its negated form (`!skill:my-skill`), asserting that the agent does **not** invoke the skill: + +```yaml +# tasks/negative-001-off-topic.yaml +name: negative-001-off-topic +description: "Unrelated request — skill should NOT fire" +prompt: | + Help me write a sonnet about ducks. + +expect: + trigger: !skill:my-skill # ! = negation +``` + +## Agent eval scaffold + +Agent evals live under `.github/evals/agents/<agent-name>/`. They use the same waza schema with two extra pieces: + +1. A **mirrored copy** of the agent file as a sibling, so waza's discovery picks it up. +2. `config.skill_directories` listing both the eval directory (for the agent mirror) and the real skill directory. 
+ +``` +.github/evals/agents/my-agent/ +├── eval.yaml +├── my-agent.agent.md # Mirror of .github/agents/my-agent.agent.md +└── tasks/ + ├── positive-001-happy-path.yaml + └── negative-001-off-topic.yaml +``` + +```yaml +name: my-agent-agent-eval +description: "End-to-end eval for the my-agent agent." +skill: my-agent +version: "0.1" + +config: + trials_per_task: 2 + timeout_seconds: 480 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + skill_directories: + - "." # picks up the .agent.md mirror + - "../../../skills/my-skill-the-agent-uses" # picks up the real skill +``` + +The eval-directory copy of `*.agent.md` must be kept in sync with the production agent. The [`/agent-improve`](./prompts#agent-improve) and [`/agent-bench`](./prompts#agent-bench) prompts do this resync automatically (`cp .github/agents/<agent-name>.agent.md .github/evals/agents/<agent-name>/`). Add a CI lint or pre-commit hook if you want hard enforcement. + +## Graders + +| Grader | Scores | When to include | +|--------|--------|-----------------| +| `trigger` (implicit via `metrics.trigger_precision`) | Did the agent route to the right skill on positives, and stay quiet on negatives? | Always | +| `behavior` | Tool-call budget, duration, or step-order assertions | Always (at minimum a `budget` config) | +| `prompt` (LLM-as-judge) | Did the reply text meet a documented quality bar? Scored by `claude-sonnet-4.6`. | Always for positive tasks | +| `skill_invocation` | Did the agent invoke the specific tool calls the skill expects? | Skills that wrap MCP tools or specific CLIs | +| `tool_constraint` | Did the agent stay within the declared tool allowlist? | Agent evals (waza auto-injects this from `.agent.md` `tools:` unless suppressed) | + +### Per-task vs eval-root graders + +Some graders should not fire on every task. For example, `prompt` quality scoring on a refusal task produces low scores because the agent (correctly) gave a short refusal — there's nothing to evaluate against a quality rubric. 
+ +Pattern: declare per-task graders inside each `tasks/*.yaml`, and only include eval-root graders that genuinely apply to every task (typically `budget`). + +## Dual tool taxonomy + +The production `tools:` field on an `.agent.md` uses **VS Code Copilot Chat IDs** (`read`, `search`, `execute/*`, `microsoftdocs/mcp/*`, `azure-mcp/*`). The waza `copilot-sdk` executor emits **SDK CLI short names** (`bash`, `view`, `edit`, `create`, `sql`, `task`). + +If you write per-task `tool_constraint` graders, target the SDK taxonomy. To stop waza from auto-injecting an eval-root `tool_constraint` that would fail against VS Code IDs the SDK never emits, declare a no-op suppressor: + +```yaml +graders: + - type: tool_constraint + name: _suppress_auto_inject + config: + reject_tools: + - tool: "^___never_matches___$" +``` + +Real assertions then live per-task: + +```yaml +# tasks/positive-001.yaml +expect_tools: + - tool: "^(bash|view|edit|create|sql|task)$" +``` + +## How CI picks up your eval + +The matrix is driven by [`.github/evals/manifest.yaml`](https://github.com/Azure/git-ape/blob/main/.github/evals/manifest.yaml): + +```yaml +skills: + - name: prereq-check + tier: pilot + # Add your skill here: + - name: my-skill + tier: expanded # Start in expanded; promote to pilot via /skill-promote + +tiers: + pilot: # Full 4-model fan-out + models: + - name: claude-sonnet-4.6 + - name: gpt-5.4 + baseline: true + - name: gpt-5-codex + - name: claude-opus-4.6 + expanded: # 2-model fan-out (lower cost) + models: + - name: claude-sonnet-4.6 + - name: gpt-5-codex +``` + +| Tier | Models | Use when | +|------|--------|----------| +| `pilot` | 4 (claude-sonnet-4.6, gpt-5.4 baseline, gpt-5-codex, claude-opus-4.6) | Skill is stable; you want full cross-model signal | +| `expanded` | 2 (claude-sonnet-4.6, gpt-5-codex) | Skill is new; cap quota cost while it stabilises | + +The PR workflows that consume the manifest: + +- 
[`.github/workflows/waza-evals.yml`](https://github.com/Azure/git-ape/blob/main/.github/workflows/waza-evals.yml) — runs skill evals per PR +- [`.github/workflows/waza-agent-evals.yml`](https://github.com/Azure/git-ape/blob/main/.github/workflows/waza-agent-evals.yml) — runs agent evals per PR + +Agent evals are discovered from the filesystem (every directory under `.github/evals/agents/` with an `eval.yaml` runs); they do not need a manifest entry. + +> **Preflight gating:** both workflows include a `preflight` job that probes the `COPILOT_GITHUB_TOKEN` secret. If the token is missing or lacks access to the private `microsoft/waza` repo (where waza releases live), downstream jobs are skipped cleanly instead of failing red. Maintainers can validate the secret end-to-end by checking that the matrix actually runs on the next PR. + +## Local validation + +```bash +# Single run, single model +waza run .github/evals/my-skill/eval.yaml --no-cache + +# Verbose with debug +waza run .github/evals/my-skill/eval.yaml -v --debug + +# Cross-model bench (uses the bench prompt) +# In VS Code: /skill-bench skillName=my-skill +``` + +The [`/skill-bench`](./prompts#skill-bench) and [`/agent-bench`](./prompts#agent-bench) prompts wrap the cross-model run + `waza compare` for you and print a one-line winner summary. 
+ +## Read next + +- [Prompts](./prompts) — onboard / bench / improve / promote loops +- [Authoring skills](./skills) — what an evaluable skill looks like +- [Authoring agents](./agents) — agent surface specifics diff --git a/website/docs/authoring/framework.md b/website/docs/authoring/framework.md new file mode 100644 index 0000000..237db88 --- /dev/null +++ b/website/docs/authoring/framework.md @@ -0,0 +1,385 @@ +--- +title: "Authoring framework for skills, agents, prompts, and instructions" +sidebar_label: "Framework spec" +sidebar_position: 2 +description: "The contract every skill and agent in this repo follows: anatomy templates, grounding policy, eval-as-contract enforcement, and the closed-loop authoring lifecycle." +--- + +## Purpose + +This is the framework spec for every prompt-engineering artifact in `.github/`. It is generic — domain examples are illustrative, the rules apply to any skill or agent regardless of domain. + +The skill format conforms to the **Agent Skills open standard** at [agentskills.io](https://agentskills.io/) [\[1\]](#refs). This document extends that standard with three repo-specific contracts: a third grounding layer for live tool calls, an eval-as-contract policy enforced in CI, and an agent file format for VS Code Copilot custom agents. + +Read this once before adding a new skill, agent, prompt, or shared reference. Every file under `.github/skills/**`, `.github/agents/**`, `.github/prompts/**`, `.github/instructions/**`, and `.github/references/**` MUST conform to the contracts below. + +## TL;DR + +Four primitives. One grounding policy. One lifecycle. The eval suite enforces the contract; the skill author does not "train" anything. 
+ +```text +PROMPT (.prompt.md) user-invokable command, triggers a workflow + │ + ▼ +AGENT (.agent.md) persona + workflow + curated skill set + │ + ▼ +SKILL (SKILL.md) one narrow procedure + tool mandates + sources + │ + ▼ +REFERENCES (references/*) authoritative snapshots, schemas, regex tables + +CROSS-CUTTING: INSTRUCTIONS (.instructions.md, applyTo: glob) — repo-wide policy. +``` + +Decision rule: + +* **Skill** — *"how do I do X step by step, with what sources?"* +* **Agent** — *"who am I, what is my workflow, which skills do I own?"* +* **Prompt** — *"the user's verb that boots a workflow"* +* **Instructions** — *"rules that apply regardless of which skill or agent fired"* + +## The four primitives + +### Skill + +Atomic, reusable procedure. One job, well grounded. Follows `templates/SKILL.template.md`. + +A skill is a directory containing a `SKILL.md` with YAML frontmatter (required: `name`, `description`) plus optional `scripts/`, `references/`, and `assets/` subdirectories — the canonical layout defined by the Agent Skills spec [\[1\]](#refs)[\[2\]](#refs). + +Lives at `.github/skills/<skill-name>/SKILL.md`. Discovered by the runtime via `plugin.json` and by the eval harness via `.github/evals/manifest.yaml`. + +### Agent + +Thin orchestrator. Owns a persona, a workflow, and a curated list of skills. Follows `templates/AGENT.template.md`. + +Lives at `.github/agents/<agent-name>.agent.md`. Calls skills for domain knowledge. Carries no how-to detail of its own; if an agent file is becoming long, extract the procedural content into one or more skills. + +### Prompt + +User-invokable verb. Boots a workflow with arguments. Often delegates to an agent via `agent:` frontmatter. + +Lives at `.github/prompts/<name>.prompt.md`. + +### Instructions + +Cross-cutting policy applied by glob. Used for conventions that span many files (commit style, markdown linting, naming standards). + +Lives at `.github/instructions/<name>.instructions.md`. Required frontmatter: `description` and `applyTo`. 
+ +## The grounding contract + +This is the centerpiece. Every skill obeys it. Every agent inherits it from its skills. + +### Progressive disclosure, plus a third layer + +The Agent Skills standard defines a three-stage **progressive disclosure** model: Discovery (metadata only), Activation (full `SKILL.md` body), and Execution (bundled scripts and reference files loaded on demand) [\[1\]](#refs)[\[2\]](#refs). This framework adopts that model verbatim and adds an L3 layer for live tool-mediated fetches when freshness matters. + +| Layer | Agent Skills stage [\[1\]](#refs) | Where it lives | When it is used | Cost | +|---|---|---|---|---| +| **L1 — Inline canon** | Activation (SKILL.md body) | `SKILL.md` body, up to roughly 20 facts | Hot path; common, stable answers | 0 tool calls | +| **L2 — References corpus** | Execution (bundled files) | `references/*.md` next to the skill, or shared `.github/references//` | When L1 misses | 1 file read | +| **L3 — Live fetch** *(framework extension)* | n/a (not in baseline spec) | MCP tool (`microsoft_docs_*`, etc.), `curl`, REST API | When L2 misses or freshness is required | 1 or more tool calls | + +The upstream spec recommends keeping `SKILL.md` under roughly 500 lines and 5,000 tokens [\[1\]](#refs)[\[6\]](#refs) — only the core instructions the agent needs on every run. When a skill legitimately needs more, move detail to L2 references and tell the agent *when* to load each file: `Read references/api-errors.md if the API returns a non-200 status code` is more useful than a bare `see references/ for details`. That conditional load instruction is what makes progressive disclosure work in practice. + +### Citation policy (cite-or-fail) + +Every factual claim in a skill's output MUST cite one of: + +1. A snapshot date from an L1 or L2 source. +2. A live URL from an L3 fetch result. +3. The literal token `unknown — out of corpus`. + +Skills never recite from model memory without grounding. 
Where memory-only answers are explicitly acceptable, the SKILL.md says so under a `## Stop conditions` section. + +### Snapshot refresh policy + +Each L2 file carries this header block: + +```yaml +--- +source: +snapshot: YYYY-MM-DD +refresh_command: +--- +``` + +A repo-level `scripts/refresh-snapshots.sh` (or per-canon script) re-fetches and diffs. `waza check` (or the equivalent quality gate) flags snapshots older than a per-canon threshold. + +### Shared references convention + +When two or more skills consume the same canon, the canon moves up to `.github/references//`. Skills reference it by relative path. Never copy-paste a snapshot table between skills. + +```text +.github/references/ +├── README.md ← framework conventions for shared canon +├── azure-caf/ +│ ├── abbreviations.md +│ └── naming-rules.md +└── ... +``` + +## The skill anatomy contract + +The following sections are the recommended structure for `SKILL.md`, in this order. The full scaffold lives at `templates/SKILL.template.md` — copy it when authoring a new skill. **Anatomy is documentation, not a runtime gate**: the eval (`eval.yaml`) is what graders run against, and it is where the load-bearing checks live. + +The Agent Skills spec only requires `name` and `description` in frontmatter [\[1\]](#refs). This framework keeps those two fields and adds repo-specific anatomy sections below — none of these sections are part of the open standard; they exist to make our skills self-contained and eval-gradable. + +| Section | Purpose | +|---|---| +| Frontmatter `name`, `description` | Routing metadata (per Agent Skills spec [\[1\]](#refs)). `description` ends with `USE FOR:`, `DO NOT USE FOR:`, `INVOKES:` clauses. | +| `## Purpose` | One paragraph; why this skill exists. | +| `## When to use` | Bullets mirroring `USE FOR`. | +| `## When NOT to use` | Bullets mirroring `DO NOT USE FOR`. | +| `## Procedure` | Numbered, deterministic steps. 
| +| `## Authoritative sources` | Table: source name, URL, snapshot date. | +| `## Inline canonical data` | L1 content — small, stable facts. | +| `## References` | Pointers to L2 files (relative paths). | +| `## Tool mandates` | L3 — required tool calls and how to cite their results. | +| `## Output schema` | JSON shape or markdown sections callers can rely on. | +| `## Anti-patterns` | Negative examples. | +| `## Stop conditions` | When to escalate, refuse, or ask the user. | + +Two checks decide whether a SKILL.md is framework-compliant: + +1. **Self-contained.** A fresh model with only this file in context can execute the procedure without guessing. This mirrors Anthropic's "think from Claude's perspective" guidance [\[2\]](#refs): the `name` and `description` are the trigger surface; the body is the operational manual. +2. **Auditable.** Every claim in its output traces back to a source listed under `## Authoritative sources` — inline, references, or a tool result. + +## The agent anatomy contract + +The following sections are the recommended structure for `.agent.md`. The full scaffold lives at `templates/AGENT.template.md` — copy it when authoring a new agent. **Anatomy is documentation, not a runtime gate**: the eval (`eval.yaml`) is what graders run against. Agents are thin (domain knowledge belongs in the skills they call), so the load-bearing minimums are the frontmatter (`name`, `description`, `tools`) and a `## Workflow` section the eval can anchor task prompts against. + +| Section | Purpose | +|---|---| +| Frontmatter `name`, `description`, `argument-hint`, `tools` | Routing metadata + tool allowlist. | +| `## Identity (non-negotiable)` | Persona lock-in. The agent never identifies as anything else, including on off-topic prompts. | +| `## Mission` | One sentence. | +| `## Skills I own` | Ordered list of skills with load priority. | +| `## Workflow` | Phases with explicit hand-offs to skills. 
| +| `## State management` | Where session state lives; recovery rules. | +| `## Interaction contract` | Question cadence; headless mode hook (CI-safe). | +| `## Non-goals` | Refusal redirect script for off-topic prompts. | +| `## Hand-off contracts` | Inputs and outputs when calling other agents. | + +Rule of thumb: agents are thin. Domain knowledge belongs in the skills they call. If an agent file grows fat with how-to detail, that detail is mis-located — extract it into a skill. + +## Authoring practices + +Anatomy is necessary but not sufficient. These are the editorial principles every skill in this repo applies. They come from the official Agent Skills authoring guide [\[6\]](#refs) and Anthropic's design rationale [\[2\]](#refs). + +### Ground every skill in real expertise + +Skills extracted from a hands-on agent session (with corrections you made along the way) and skills synthesized from project artifacts (runbooks, schemas, code-review comments, PR history) consistently outperform skills generated cold from generic best-practices articles [\[6\]](#refs). Two viable paths: + +* **Extract from a hands-on task.** Run the task with an agent, note the corrections you made, then distil the reusable pattern into a skill. +* **Synthesize from existing project artifacts.** Feed internal documentation, schemas, runbooks, and the version-control history (especially patches and fixes) into an LLM as source material — not generic references. + +Generic skills with no project-specific context produce vague procedures (`handle errors appropriately`) and are the most common first-author failure mode [\[6\]](#refs). + +### Spend context wisely + +Once a skill activates, its full `SKILL.md` body loads into the agent's context window alongside conversation history, system context, and other active skills. Every token competes for attention. + +* **Add what the agent lacks; omit what it knows.** Skip explanations of well-known concepts (what a PDF is, how HTTP works). 
Jump to the project-specific decision (which library, which flag, which gotcha). +* **Design coherent units.** A skill is like a function: encapsulate one job that composes with others. Too narrow → multiple skills load for one task; too broad → activation precision suffers. +* **Aim for moderate detail.** Concise stepwise guidance plus one worked example outperforms exhaustive enumeration. Trust the agent's own judgment for routine edge cases. + +### Calibrate control to fragility + +Not every instruction needs the same prescriptiveness. Match specificity to how fragile the task is [\[6\]](#refs). + +* **Give freedom where multiple approaches work.** Code review, analysis, design — describe what to look for, explain *why*, let the model choose how. +* **Be prescriptive where operations are fragile.** Database migrations, destructive ops, a specific deploy sequence — give the exact command and forbid variation. +* **Provide defaults, not menus.** When several tools could work, pick one and mention alternatives as escape hatches. Avoid `you can use A, B, C, or D` lists. +* **Favor procedures over declarations.** Teach the agent how to approach a class of problems, not the answer to one instance. + +### Reusable patterns + +Pick the ones that fit; not every skill needs all of them [\[6\]](#refs). + +| Pattern | When to use | +|---|---| +| **Gotchas section** | Concrete corrections to mistakes the agent will make without being told (soft deletes, field-name aliases, misleading health endpoints). Add a gotcha every time you have to correct the agent. | +| **Output templates** | When the caller expects a specific format. Pattern-matching against a concrete structure beats describing the format in prose. Inline if short; in `assets/` if long. | +| **Checklists** | Multi-step workflows with dependencies. Helps the agent track progress and skip nothing. | +| **Validation loops** | Do the work → run a validator → fix → repeat until validation passes. 
The validator can be a script, a checklist, or a reference document. | +| **Plan-validate-execute** | For batch or destructive ops: emit an intermediate plan as structured data, validate it against a source of truth, only then execute. | +| **Bundled scripts** | When trace analysis shows the agent reinventing the same logic across runs — extract it into `scripts/` once. See the next section. | + +### Refine with real execution + +Even a single execute-then-revise pass noticeably improves quality; complex skills often need several iterations [\[6\]](#refs). Read execution *traces*, not just final outputs — if the agent wastes time on unproductive steps, the cause is usually a vague instruction, an instruction that doesn't apply, or too many options without a default. The eval-as-contract section below codifies the systematic iteration loop. + +## Bundled scripts contract + +When a skill bundles executables in `scripts/`, those scripts MUST be designed for agentic invocation. These rules come from the upstream Using Scripts guide [\[7\]](#refs). + +| Rule | Why | +|---|---| +| **Non-interactive — hard requirement.** | Agents run in non-TTY shells. Any script that blocks on `read`, password prompt, or confirmation menu hangs the run indefinitely. Accept input via flags, env vars, or stdin only. | +| **Implement `--help` with a usage line, flag list, and at least one example.** | `--help` is the primary surface through which the agent learns the interface. Keep it concise — it enters the context window. | +| **Write actionable error messages.** | An opaque `Error: invalid input` wastes a turn. Say what went wrong, what was expected, and what to try (e.g., `Error: --format must be one of: json, csv, table. Received: "xml"`). | +| **Use structured output (JSON / CSV / TSV) by default.** | Composable with `jq`, `cut`, `awk`. Send data to stdout, diagnostics and progress to stderr. | +| **Document distinct exit codes.** | The agent reads exit codes to decide next steps. 
Reserve `0` for success, distinct non-zero codes for invocation errors vs. domain failures, and document each code in `--help`. | +| **Be idempotent.** | The harness may retry. `Create if not exists` is safer than `create and fail on duplicate`. | +| **Offer `--dry-run` for destructive or stateful ops.** | Lets the agent preview the effect before committing. | +| **Predictable output size.** | Many agent harnesses truncate tool output beyond 10–30K characters [\[7\]](#refs). Default to a summary; support `--offset` / pagination for full results; or require `--output FILE` for large dumps. | +| **Pin versions for one-off command invocations.** | When `SKILL.md` instructs `uvx ruff@0.8.0 …` or `npx eslint@9.0.0 …`, pin the version so behaviour stays stable over time [\[7\]](#refs). | +| **Declare inline dependencies for self-contained scripts.** | Python: PEP 723 inline metadata, run with `uv run`. Deno, Bun, Ruby, and Go all have analogous patterns [\[7\]](#refs). No separate `requirements.txt` for skill-local scripts. | +| **List bundled scripts in `SKILL.md` and tell the agent when to call them.** | A `## Available scripts` block followed by procedural references (`bash scripts/validate.sh "$INPUT"`) is more discoverable than naming the file once, buried in prose. | + +Reference paths in `SKILL.md` are relative to the skill directory root — the agent runs commands from there. + +## Eval-as-contract + +Each layer of the framework has a matching grader family. The eval enforces the framework; the framework does not enforce itself. This aligns with Anthropic's "start with evaluation" guidance for skill authors [\[2\]](#refs): identify capability gaps via representative tasks, then build skills to address them. + +Grader types in the table below are from the waza harness [\[3\]](#refs). 
+ +| Property | Grader type [\[3\]](#refs) | Failure means | +|---|---|---| +| Trigger precision (positive + negative) | `type: trigger` | Description block is wrong (USE FOR / DO NOT USE FOR). | +| Tool mandates (L3) | `behavior` / `tool_constraint` with `expect_tools` | Skill did not call the required tool. | +| Citation policy | `prompt` judge + regex on output | Answer claims fact without source. | +| Content correctness | `prompt` judge with PASS criteria | Canonical value missing or wrong. | +| Schema compliance | `json_schema` / `program` / regex | Output does not match declared `## Output schema`. | +| Refusal cleanliness | `prompt` + regex with refusal markers | Off-topic produced an answer instead of redirect. | +| Budget | `behavior` with `max_tool_calls`, `max_duration_ms` | Skill ran away or thrashed. | +| Persona-lock (agents) | `prompt` + regex | Agent identified as a generic assistant. | + +Hard-won grader rules from this repo: + +* Prompt graders are **binary**. The waza prompt grader gives the judge LLM exactly two tools: `set_waza_grade_pass` (score `1.0`) and `set_waza_grade_fail` (score `0.0`) [\[3\]](#refs). Do not write 1-to-5 rubrics — the judge collapses them to 0 or 1. +* Prompt graders require `continue_session: true` or the judge has no view of the agent's output [\[3\]](#refs). +* LLM-as-judge is reliable but biased. Strong judges agree with humans about 80% of the time but exhibit position, verbosity, and self-enhancement biases [\[4\]](#refs) — mitigate by pairing prompt graders with deterministic graders (`text`, `file`, `json_schema`). +* Eval-level `skill_invocation` graders with aspirational `required_skills` fire on every task and produce deterministic 0.0 noise. Scope content graders to positive tasks only. + +Shared grader blocks live at `.github/evals/_lib/graders/*.yaml` (when created). Per-skill evals `extends:` them rather than copy-pasting. 
+ +### Test case design and iteration + +Graders only check what you thought to assert. Test-case design is the other half — taken from the upstream evaluation guide [\[8\]](#refs). + +* **Start with 2–3 cases.** Don't over-invest before you've seen your first round of results. Expand later. Each case is a realistic prompt + a human-readable expected output + (optional) input files. +* **Vary prompt phrasing.** Some cases should be casual (`hey can you clean up this csv`), others precise (`Parse the CSV at data/input.csv, drop rows where column B is null, …`). Cover at least one edge case (malformed input, ambiguous request, refusal). +* **Compare with-skill against without-skill (baseline).** A skill that doesn't beat the no-skill baseline is not adding value. When iterating on an existing skill, snapshot the previous version and use it as the baseline. +* **Write assertions only after seeing the first outputs.** Good assertions are specific, observable, and programmatically verifiable (`Both axes are labeled`, `The output is valid JSON`, `The report includes at least 3 recommendations`). Weak assertions are vague (`looks good`) or too brittle (`exactly the phrase 'Total Revenue: $X'`). +* **Use scripts for mechanical checks, LLM judges for narrative quality.** Scripts are reliable and reusable across iterations; LLM judges complement them for organization, formatting, and overall usability. +* **Capture timing and tokens.** A skill that improves quality but triples token usage is a different trade-off than one that's both better and cheaper. Track `total_tokens` and `duration_ms` per run; the waza harness records both [\[3\]](#refs). +* **Analyse patterns, not just averages.** Remove assertions that always pass in both configurations (no signal). Investigate assertions that always fail in both (broken assertion, too-hard task, or wrong check). Focus iteration effort on assertions that pass with the skill and fail without — that is where the skill is adding value. 
+* **High variance ≠ bad skill.** If the same case sometimes passes and sometimes fails, either the assertion is sensitive to model randomness or the skill's instructions are ambiguous enough that the model interprets them differently each run. Add an example or tighten the wording. +* **Keep a human in the review loop.** Assertion grading and pattern analysis only catch what you thought to write assertions for. A reviewer catches issues you didn't anticipate. + +The iteration loop itself: run → grade → review → propose changes (give failed assertions, human feedback, and execution transcripts to an LLM) → apply → re-run [\[8\]](#refs). Stop when feedback is consistently empty or improvements plateau. In this repo, `/skill-improve` automates the run-grade-propose-re-run cycle on top of the waza harness. + +## The authoring lifecycle + +Four commands per surface (skills and agents), same pattern: onboard → bench → improve → promote. See the [Prompts](./prompts) catalogue for full argument lists and cost notes. + +```text + ┌──────────────────┐ + │ /skill-onboard │ scaffold SKILL + eval + manifest entry; + │ scaffold + smoke│ runs quality check + smoke trial on 1 model + └────────┬─────────┘ + │ pass smoke? + ┌────────▼─────────┐ + │ /skill-bench │ run across N models (pilot tier); + │ multi-model │ identify winning model + weak tasks + └────────┬─────────┘ + │ score >= threshold? + ┌────────▼─────────┐ + │ /skill-improve │ read failures, propose SKILL edits, + │ failure-driven │ verify with re-run; baseline-vs-after diff + └────────┬─────────┘ + │ regression-free? + ┌────────▼─────────┐ + │ /skill-promote │ move from expanded tier to pilot tier; + │ tier gate │ requires evidence (improvement + bench) + └────────┬─────────┘ + │ live + ┌────────▼─────────┐ + │ nightly trend │ trend report; alert on regression + │ workflow │ + └──────────────────┘ +``` + +The same four commands exist for agents (`/agent-onboard`, `/agent-bench`, `/agent-improve`, `/agent-promote`). 
Agents add one extra phase: persona-lock verification — the eval asserts the agent identifies as itself, never as "GitHub Copilot CLI" or "software engineering assistant." + +## Repository layout + +```text +.github/ +├── copilot-instructions.md ← workspace-wide rules +├── templates/ +│ ├── SKILL.template.md ← skill scaffold (use as starting point) +│ └── AGENT.template.md ← agent scaffold +├── skills/<skill-name>/ +│ ├── SKILL.md +│ └── references/*.md ← skill-local L2 corpus +├── agents/<agent-name>.agent.md +├── prompts/<prompt-name>.prompt.md +├── instructions/*.instructions.md +├── references/ ← shared L2 corpus across skills +│ └── <topic>/*.md +├── evals/ +│ ├── manifest.yaml ← tier registration +│ ├── _lib/graders/ ← shared grader blocks +│ └── <skill-name>/ +│ ├── eval.yaml +│ └── tasks/ +│ ├── positive-*.yaml +│ └── negative-*.yaml +└── workflows/ + ├── waza-evals.yml ← per-PR grading (currently non-blocking) + └── waza-trends.yml ← nightly trend +``` + +## Authoring checklist + +Before opening a PR that adds or modifies a skill or agent: + +* [ ] File starts from the matching template under `.github/templates/`. +* [ ] Frontmatter `description` includes `USE FOR:`, `DO NOT USE FOR:`, `INVOKES:` (skills) or persona + tools (agents). +* [ ] Recommended sections from the anatomy guidance above are present where they add value (anatomy is documentation, not a runtime gate — the eval is what graders run against). +* [ ] Every factual claim traces to L1 inline, an L2 file, or a mandated L3 tool call. +* [ ] L2 files (if any) carry the `source` + `snapshot` + `refresh_command` header. +* [ ] At least one positive task and one negative task exist in the eval; prompt phrasing is varied (casual + precise) and at least one edge case is covered. +* [ ] Assertions are specific, observable, and programmatically verifiable; vague or brittle assertions removed. +* [ ] A without-skill (or previous-version) baseline run exists for comparison. +* [ ] Tool-use and citation graders are configured. 
+* [ ] `SKILL.md` body stays within roughly 500 lines / 5,000 tokens; overflow lives in L2 with explicit *when-to-load* instructions. +* [ ] Any bundled `scripts/*` are non-interactive, support `--help`, document distinct exit codes, and produce structured output (data to stdout, diagnostics to stderr). +* [ ] A `## Available scripts` block in `SKILL.md` lists every bundled script with a one-line summary. +* [ ] Manifest entry added to `.github/evals/manifest.yaml` (skills only — agent evals are auto-discovered from the filesystem). +* [ ] Smoke trial ran cleanly on the configured smoke model. +* [ ] If a shared canon was duplicated, it has been hoisted to `.github/references/<topic>/`. + +## What is intentionally out of scope + +* **Model fine-tuning.** Skills are not training data; they are runtime context. There is no weight-level "training" step in this framework. This matches the Agent Skills design: skills extend agent capabilities at runtime via files and folders, not via model weights [\[2\]](#refs). +* **Vector search / RAG infrastructure.** The L2 layer is a curated, dated snapshot — a human-readable corpus, not an embedding index. +* **Multi-tenant skill packaging.** Sharing skills across repos is handled by the host platform (plugin manifest), not by this framework. + +## Security note + +Skills can include instructions and executable code, so a malicious skill is a real attack surface — install only from trusted sources, and audit unfamiliar skills before use [\[2\]](#refs). When this repo accepts a skill contribution, the review must cover bundled scripts, network endpoints, and any tool mandates that could exfiltrate data. + +## Versioning + +This spec is v0.1. Breaking changes (renaming required sections, changing the anatomy contracts) bump the minor version and require a migration note pinned at the top of this file. + +
+ +## References + +Snapshot dates reflect the date each source was last verified against upstream. + +1. **Agent Skills open standard.** Agent Skills Overview. — defines the SKILL.md folder format, the required `name` + `description` frontmatter, and the three-stage progressive disclosure model (Discovery, Activation, Execution). Snapshot: 2026-05-21. +2. **Zhang B., Lazuka K., Murag M.** "Equipping agents for the real world with Agent Skills." Anthropic Engineering, Oct 16 2025. — origin article for the Agent Skills design; defines progressive disclosure, best practices ("start with evaluation," "structure for scale," "think from Claude's perspective," "iterate with Claude"), and security guidance. Snapshot: 2026-05-21. +3. **Microsoft waza.** "Graders" — Validators and Graders reference. — defines the grader taxonomy (`text`, `file`, `diff`, `json_schema`, `prompt`, `behavior`, `action_sequence`, `skill_invocation`, `tool_constraint`, `tool_calls`, `program`, `trigger`), the binary `set_waza_grade_pass` / `set_waza_grade_fail` contract for prompt graders, and the `continue_session: true` mechanism. Snapshot: 2026-05-21. +4. **Zheng L. et al.** "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena." NeurIPS 2023 Datasets and Benchmarks. arXiv:2306.05685. — establishes that strong LLM judges reach ~80% agreement with humans (matching inter-human agreement) but exhibit position, verbosity, and self-enhancement biases that mixed grading layers must mitigate. Snapshot: 2026-05-21. +5. **anthropics/skills.** Reference skill repository. — canonical SKILL.md examples (document-skills, example-skills) and the upstream `template/` scaffold. Snapshot: 2026-05-21. +6. **Agent Skills.** "Best practices for skill creators." 
— upstream editorial guide; defines *start from real expertise*, *spend context wisely* (≤500 lines / ≤5,000 tokens), *calibrate control to fragility*, and the reusable patterns (gotchas, templates, checklists, validation loops, plan-validate-execute, bundling scripts). Snapshot: 2026-05-21. +7. **Agent Skills.** "Using scripts in skills." — design rules for bundled scripts: non-interactive, `--help`, structured output, meaningful exit codes, idempotency, `--dry-run`, predictable output size, inline-dependency declarations (PEP 723 et al.), and pinned versions for one-off commands. Snapshot: 2026-05-21. +8. **Agent Skills.** "Evaluating skill output quality." — eval-driven iteration: test cases (`evals/evals.json`), with-skill vs. without-skill baseline, assertion design, grading (LLM judge + scripts), aggregation (`benchmark.json`), pattern analysis, human review, and the iteration loop. Snapshot: 2026-05-21. diff --git a/website/docs/authoring/overview.md b/website/docs/authoring/overview.md new file mode 100644 index 0000000..759caf9 --- /dev/null +++ b/website/docs/authoring/overview.md @@ -0,0 +1,84 @@ +--- +title: "Authoring Overview" +sidebar_label: "Overview" +sidebar_position: 1 +description: "Where skills, agents, prompts, and evals live in the repo and how Copilot discovers them." +--- + +# Authoring Overview + +Git-Ape is a [GitHub Copilot agent plugin](https://docs.github.com/en/copilot/reference/copilot-cli-reference/cli-plugin-reference). The plugin manifest at the repo root declares two directories that Copilot discovers automatically: + +```json title="plugin.json" +{ + "name": "git-ape", + "agents": ".github/agents/", + "skills": ".github/skills/" +} +``` + +Everything else — prompts, eval suites, the CI matrix — supports the contents of those two directories. 
+ +## Vocabulary + +| Term | File pattern | Discovered by | Purpose | +|------|--------------|---------------|---------| +| **Skill** | `.github/skills/<skill-name>/SKILL.md` | Plugin manifest (`skills:`) | A focused, callable capability with a documented procedure. Invoked by agents or directly with `/<skill-name>`. | +| **Agent** | `.github/agents/<agent-name>.agent.md` | Plugin manifest (`agents:`) | A persona with a `tools:` allowlist that orchestrates one or more skills to deliver a workflow. Invoked with `@<agent-name>` in Copilot Chat or `/<agent-name>` in the CLI. | +| **Prompt** | `.github/prompts/<prompt-name>.prompt.md` | VS Code Chat prompt picker | A scripted authoring workflow (onboard, benchmark, improve, promote) you run while writing skills and agents. Not shipped to end users. Invoked with `/<prompt-name>`. | +| **Eval suite** | `.github/evals/<skill-name>/eval.yaml` and `.github/evals/agents/<agent-name>/eval.yaml` | [microsoft/waza](https://github.com/microsoft/waza) | A spec + tasks that score a skill or agent across models. | + +## Repo layout + +``` +.github/ +├── agents/ +│ ├── azure-policy-advisor.agent.md +│ ├── git-ape.agent.md +│ └── ... +├── skills/ +│ ├── azure-cost-estimator/SKILL.md +│ ├── prereq-check/SKILL.md +│ └── ... 
+├── prompts/ +│ ├── skill-onboard.prompt.md +│ ├── skill-bench.prompt.md +│ ├── skill-improve.prompt.md +│ ├── skill-promote.prompt.md +│ ├── agent-onboard.prompt.md +│ ├── agent-bench.prompt.md +│ ├── agent-improve.prompt.md +│ └── agent-promote.prompt.md +├── evals/ +│ ├── manifest.yaml # CI matrix configuration +│ ├── /eval.yaml # Skill evals +│ └── agents//eval.yaml # Agent evals +└── workflows/ + ├── waza-evals.yml # Per-PR skill evals + └── waza-agent-evals.yml # Per-PR agent evals + +plugin.json # Plugin manifest +.waza.yaml # Project-level waza config +``` + +## When to add what + +| You want to… | Add a… | +|--------------|--------| +| Wrap a single API or workflow step (cost lookup, policy query, naming rule check) | [Skill](./skills) | +| Coordinate several skills behind a persona (deployment, advisory, onboarding) | [Agent](./agents) | +| Score quality of a skill or agent across models | [Eval suite](./evals) | +| Scaffold, benchmark, or harden the skill or agent you just wrote | Reuse an existing [prompt](./prompts) — onboard → bench → improve → promote covers the loop. New prompts are rarely needed. | + +## Naming and registration + +* Skill directory names and agent file basenames use lowercase kebab-case. The skill's `SKILL.md` frontmatter `name:` field must match the directory name; the agent's `.agent.md` frontmatter `name:` is a display name and is separate from the file basename. +* Adding a new file under `.github/skills//` or `.github/agents/.agent.md` is all you need to register it — there is no separate index to update. The plugin manifest scans the directories on load. +* For a new skill to appear in CI evals, append a `{ name, tier }` entry to `.github/evals/manifest.yaml` and create `.github/evals//eval.yaml`. See [Eval suites](./evals). 
+ +## Read next + +- [Authoring skills](./skills) — frontmatter, structure, and minimum bar +- [Authoring agents](./agents) — persona-lock, `tools:` taxonomy, sub-agent wiring +- [Eval suites](./evals) — what graders score, how tasks are structured +- [Prompts](./prompts) — onboard, bench, improve, and promote your skill or agent from creation diff --git a/website/docs/authoring/prompts.md b/website/docs/authoring/prompts.md new file mode 100644 index 0000000..65ad0ed --- /dev/null +++ b/website/docs/authoring/prompts.md @@ -0,0 +1,144 @@ +--- +title: "Prompts" +sidebar_label: "Prompts" +sidebar_position: 6 +description: "The onboard / bench / improve / promote prompts you use to scaffold, evaluate, and harden your skills and agents from creation onward." +--- + +# Prompts + +Prompts are short, parametric commands shipped under [`.github/prompts/`](https://github.com/Azure/git-ape/tree/main/.github/prompts) that wrap the authoring loop: scaffolding eval suites at creation, cross-model benchmarking, iterative quality improvement, and readiness assessment for promotion. + +**Use them while authoring, not after.** They exist so that every skill and agent you write is grounded in measurable evals from the first commit. Drafting a `SKILL.md` or `.agent.md` without running `/skill-onboard` or `/agent-onboard` is shipping blind — the prompts are how you turn a rough draft into something a model can reliably execute. + +## When to use which + +| Goal | Prompt | +|------|--------| +| "This skill has no eval suite yet — scaffold one." | [`/skill-onboard`](#skill-onboard) | +| "This agent has no eval suite yet — scaffold one." | [`/agent-onboard`](#agent-onboard) | +| "Which model handles this skill best?" | [`/skill-bench`](#skill-bench) | +| "Which model handles this agent best?" | [`/agent-bench`](#agent-bench) | +| "This skill scored low — help me fix it." | [`/skill-improve`](#skill-improve) | +| "This agent scored low — help me fix it." 
| [`/agent-improve`](#agent-improve) | +| "Is this skill ready for the `pilot` tier?" | [`/skill-promote`](#skill-promote) | +| "Is this agent production-ready?" | [`/agent-promote`](#agent-promote) | + +> **Cost notice:** every prompt invokes `waza run` one or more times. Each leg consumes premium model requests. Bench and promote prompts run across multiple models (default four); improve loops can run up to three rounds. Plan your quota before invoking. + +## File format + +All prompts share the same shape: + +```markdown +--- +agent: 'agent' +description: 'One-sentence description' +argument-hint: '[paramA=...] [paramB=...]' +--- + +# Prompt body + +Procedural steps the prompt's wrapping agent will execute, +typically a sequence of `bash` blocks and decision points. +``` + +The `agent: 'agent'` value pins execution to VS Code's generic chat agent (no specific persona). Add or edit prompt files directly under `.github/prompts/`; no further registration is needed. + +## skill-onboard + +**Description.** Stage 0 of the eval lifecycle — bootstrap a brand-new eval suite for a skill that currently has none. Scaffolds `eval.yaml` + positive / negative / off-topic task files, patches them to repo conventions (hybrid graders, concrete prompts, schema headers), registers the skill at the `expanded` tier in `manifest.yaml`, and runs a single-model smoke trial to confirm the suite is wired correctly. + +**Arguments.** `[skillName=...] [positiveTasks={2|3|4}] [negativeTasks={1|2}] [smokeModel=claude-sonnet-4.6]` + +**Interactivity.** **Interactive.** Pauses for approval before appending to `manifest.yaml` and before running the smoke trial. + +**Output.** A new `.github/evals//` directory containing `eval.yaml`, positive tasks, a trigger-only negative task, and an off-topic refusal task, plus a `{ name: , tier: expanded }` entry in `manifest.yaml`. The smoke trial prints per-task pass / fail and aggregate score. 
+ +**Out of scope.** Does **not** edit `SKILL.md` (use [`/skill-improve`](#skill-improve) for that) and does **not** promote the skill to the `pilot` tier (use [`/skill-promote`](#skill-promote) after the skill has matured in `expanded`). + +**Cost.** ≈ 5–8 premium requests per invocation: 1 for the `waza suggest --apply` scaffold + `1 × len(tasks)` for the smoke trial (default 4) plus per-task LLM-judge calls. + +**Use when.** You've authored or refactored a `SKILL.md` that has no companion eval suite and you want a guarded path from zero to a working `expanded`-tier entry without hand-writing every task YAML. + +## agent-onboard + +**Description.** Stage 0 of the agent eval lifecycle — bootstrap a brand-new eval suite for a custom agent that currently has no evaluation. Scaffolds `.github/evals/agents//` with `eval.yaml`, a mirror copy of the `.agent.md` (waza walks the directory under `skill_directories: ["."]`), positive and negative tasks, and an off-topic task with a `clean_refusal` grader that asserts the agent identifies itself and redirects to its specialty. Runs a single-model smoke trial. No edits to the canonical `.agent.md` or to `manifest.yaml` (agent evals are auto-discovered from the filesystem). + +**Arguments.** `[agentName=...] [positiveTasks={2|3|4}] [negativeTasks={1|2}] [smokeModel=claude-sonnet-4.6]` + +**Interactivity.** **Interactive.** Pauses for approval before writing the eval directory and before running the smoke trial. + +**Output.** A new `.github/evals/agents//` directory with `eval.yaml`, a mirrored `.agent.md`, positive tasks (hybrid `trigger` + `answer_quality` graders), a trigger-only negative task, and an off-topic refusal task. The smoke trial prints per-task pass / fail and an aggregate score. 
+ +**Out of scope.** Does **not** edit the canonical `.github/agents/.agent.md` (use [`/agent-improve`](#agent-improve) for that), does **not** run readiness checks (use [`/agent-promote`](#agent-promote) after the agent has matured), and does **not** touch `manifest.yaml`. + +**Cost.** ≈ 6–9 premium requests per invocation: `1 × len(tasks)` for the smoke trial (default 4) plus per-task LLM-judge calls. + +**Use when.** You've authored or refactored an `.agent.md` that has no companion eval suite and you want a guarded path from zero to a working agent eval directory. + +## skill-bench + +**Description.** Cross-model benchmark for a single skill: runs `waza` eval once per model, captures results, compares with `waza compare`, and prints a one-line winner summary. + +**Arguments.** `[skillName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]` + +**Interactivity.** Non-interactive once `skillName` is supplied. Prompts for the name if omitted. + +**Output.** A `waza compare` table (per-model aggregate score, success rate, latency, premium requests) plus a one-line winner. + +**Use when.** You want to know which model handles a skill best — for example, before promoting a skill or after editing the SKILL.md substantially. + +## agent-bench + +**Description.** Same as `skill-bench` but targets a custom agent (under `.github/evals/agents//`). + +**Arguments.** `[agentName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]` + +**Use when.** Sweeping the model field for an agent. Pair with `/agent-promote` once the winner is clear. + +## skill-improve + +**Description.** Local feedback loop for a single skill: baseline → audit → propose edits → apply (with approval) → re-rank via `waza compare`. Optionally loops up to 3 rounds for deeper refinement. + +**Arguments.** `[skillName=...] [iterations={1|2|3}] [rescoreQuality={true|false}]` + +**Interactivity.** **Interactive.** After each proposed edit you approve, reject, or amend. 
+ +**Output.** A per-round diff plus updated comparison table. The skill file is modified in place when you approve. + +**Use when.** A skill scored below the pilot promotion bar and you want a guided revision loop instead of hand-editing. + +## agent-improve + +**Description.** Same as `skill-improve`, applied to `.github/agents/.agent.md`. Also re-syncs the eval-directory mirror (`.github/evals/agents//.agent.md`) after every approved edit. + +**Arguments.** `[agentName=...] [iterations={1|2|3}] [rescoreQuality={true|false}]` + +**Use when.** An agent's persona-lock leaks, off-topic refusals are weak, or trigger precision is below threshold. + +## skill-promote + +**Description.** Assess whether a skill in the `expanded` eval tier is ready to graduate to `pilot` (full 4-model fan-out). Runs the eval suite, checks against numeric promotion criteria, and prints a graduation report. + +**Arguments.** `[skillName=...]` + +**Output.** A `PROMOTE` / `BLOCK` verdict with the specific criterion that gated the decision (e.g. `success_rate < 0.85 on gpt-5.4`). When `PROMOTE`, it suggests the `manifest.yaml` patch. + +**Use when.** A skill has been stable in `expanded` for a few PRs and you're considering moving it to `pilot`. + +## agent-promote + +**Description.** Assess whether a custom agent is production-ready: runs the eval suite across pilot-tier models, checks against numeric readiness criteria, and prints a graduation report. + +**Arguments.** `[agentName=...] [models=claude-sonnet-4.6,gpt-5.4,gpt-5-codex,claude-opus-4.6]` + +**Output.** A readiness verdict per criterion (persona-lock, off-topic refusal rate, trigger precision, budget compliance, answer-quality threshold) and an overall `GO` / `NO-GO`. + +**Use when.** Before flipping an agent's `user-invocable: true` for the first time, or before announcing it to users. 
+ +## Read next + +- [Eval suites](./evals) — what the prompts actually run +- [Authoring skills](./skills) — content the improve loop edits and the onboard prompt consumes as-is +- [Authoring agents](./agents) — agent surface specifics diff --git a/website/docs/authoring/skills.md b/website/docs/authoring/skills.md new file mode 100644 index 0000000..0a68055 --- /dev/null +++ b/website/docs/authoring/skills.md @@ -0,0 +1,207 @@ +--- +title: "Authoring Skills" +sidebar_label: "Skills" +sidebar_position: 3 +description: "How to add a new skill: directory layout, SKILL.md frontmatter, structure, and registration." +--- + +# Authoring Skills + +A **skill** is a focused, callable capability with a documented procedure. Skills are the atomic unit Git-Ape composes into agent workflows. Each skill is a directory under [`.github/skills/`](https://github.com/Azure/git-ape/tree/main/.github/skills) containing one `SKILL.md` file (plus optional helper files). + +## Quick start + +```bash +SKILL=my-new-skill +mkdir -p .github/skills/"$SKILL" +$EDITOR .github/skills/"$SKILL"/SKILL.md +``` + +Then write the frontmatter and body following the template below. No further registration is needed — `plugin.json` declares `"skills": ".github/skills/"` and Copilot auto-discovers every subdirectory containing a `SKILL.md`. + +> **Optimize your skill from the start.** Don't ship a `SKILL.md` blind — use the prompts listed in [Prompts](./prompts) to evaluate and harden it as you write: +> +> - [`/skill-onboard`](./prompts#skill-onboard) — scaffolds `.github/evals//` with positive and negative tasks and runs a smoke trial so you see how the skill behaves before you commit it. +> - [`/skill-bench`](./prompts#skill-bench) — benchmarks the skill across models so you know which ones it works on. +> - [`/skill-improve`](./prompts#skill-improve) — diagnoses failing tasks and proposes targeted edits to your `SKILL.md`. 
+> - [`/skill-promote`](./prompts#skill-promote) — assesses whether the skill is ready to graduate from the `expanded` tier to the `pilot` tier once it's stable. +> +> Run `/skill-onboard` as soon as your first draft is readable — even rough drafts surface gaps faster through evals than through re-reads. + +## File layout + +``` +.github/skills/my-new-skill/ +├── SKILL.md # Required: frontmatter + procedure +├── scripts/ # Optional: helper scripts the skill shells out to +└── templates/ # Optional: text/config templates referenced by the skill +``` + +The directory name **must** match the `name:` field in frontmatter. + +## SKILL.md template + +````markdown +--- +name: my-new-skill +description: "One sentence describing what the skill does and when it should fire. USE FOR: trigger phrase 1, trigger phrase 2, trigger phrase 3. DO NOT USE FOR: scope boundary 1, scope boundary 2." +argument-hint: "Free-text hint shown to users when they invoke the skill" +user-invocable: true +license: MIT +metadata: + author: Git-Ape + version: "1.0.0" +--- + +# Display Title + +One paragraph describing what the skill does and the value it delivers. + +## Quick Reference + +| Property | Value | +|----------|-------| +| Best for | One-line summary of the primary use case | +| MCP tools | Tool names, or `None — runs locally via shell` | +| CLI | Primary commands, e.g. `az policy assignment list` | +| Related skills | Sibling skills to call before/after | +| Side effects | `Read-only`, or list what gets created / modified | + +## When to Use + +- Bullet describing trigger condition 1 (user's voice) +- Bullet describing trigger condition 2 +- Bullet describing trigger condition 3 + +## Rules + +1. Numbered, blocking constraints the agent must follow. +2. Use `⛔` or `❌` prefixes for hard rules and reference them later when steps depend on them. 
+ +## Steps + +| # | Action | Reference | +|---|--------|-----------| +| 1 | **Verify Prerequisites** — what to check first | inline | +| 2 | **Do the Thing** — short imperative | [references/foo.md](references/foo.md) | +| 3 | **Report Results** — produce the output contract | See [Outputs](#outputs) | + +### Step 1: Verify Prerequisites + +```bash +command -v az >/dev/null || { echo "az not found"; exit 1; } +``` + +### Step 2: Do the Thing + +Describe the action. Use fenced code blocks for any shell or API calls so they can be reused verbatim. Push long examples into `references/*.md` to stay under the token budget. + +### Step 3: Report Results + +Link to the **Outputs** section. + +## Outputs + +Show the literal structure the skill is contracted to produce — table, JSON shape, or file path. Eval graders score against this contract, so make it concrete. + +## Error Handling + +| Error | Cause | Fix | +|-------|-------|-----| +| `<error message>` | Why it happens | What to run | + +## Constraints + +**Always:** + +- ✅ Concrete do-this behavior + +**Never:** + +- ❌ Concrete don't-do-this behavior + +## Next + +What the user (or agent) should invoke after a successful run. Use a clickable chip: + +> Next: **@Some Agent** — or run `/some-skill` to continue. + +`@AgentName` and `/skill-name` render as clickable chips in VS Code Copilot Chat. +```` + +## Frontmatter reference + +| Field | Required | Purpose | +|-------|:--------:|---------| +| `name` | ✅ | Kebab-case skill identifier. Must match directory name. | +| `description` | ✅ | Used by the Copilot router. Encode trigger phrases with `USE FOR:` and scope boundaries with `DO NOT USE FOR:` — specific verbs and nouns improve routing precision. (`WHEN:` is also accepted but `USE FOR:` matches the wider skill ecosystem.) | +| `argument-hint` | ⚪ | Free-text hint displayed in the prompt picker. | +| `user-invocable` | ⚪ | Defaults to `true`. 
Set `false` for skills that only run as a sub-step of an agent and should not be surfaced standalone. | +| `license` | ⚪ | Recommended `MIT` for skills shipped with this repo — keeps redistribution rights explicit. | +| `metadata.author` | ⚪ | Free-text author or team name (e.g. `Git-Ape`, `Microsoft`). | +| `metadata.version` | ⚪ | Semver string. Bump on every behavior change — eval suites and CI can pin to a version. | + +## Anatomy of a good skill + +Look at [`prereq-check/SKILL.md`](https://github.com/Azure/git-ape/blob/main/.github/skills/prereq-check/SKILL.md) for the canonical reference. The twelve principles below are the patterns Git-Ape skills follow — they are distilled from the Microsoft `azure-skills` package and apply to every skill in `.github/skills/`. + +### Twelve principles + +1. **Frontmatter is metadata, not decoration.** Add `license`, `metadata.author`, and `metadata.version` so skills are versionable and reproducible. Encode trigger boundaries in `description` with `USE FOR:` and `DO NOT USE FOR:` markers — vague descriptions hurt router precision. +2. **Open with a `## Quick Reference` table.** One scannable block (`Best for`, `MCP tools`, `CLI`, `Related skills`, `Side effects`) before any prose. Cuts time-to-orient for both the model and a human reviewer. +3. **`## When to Use` is the trigger contract.** Concrete, user-voice bullets. The router and eval graders both grade against this list. +4. **Hard-block guardrails as callouts.** Use `> **⛔ STOP**` / `> **⚠️ MANDATORY**` blockquotes for non-negotiables; numbered `## Rules` for everything else. +5. **Steps as a table, body as expansion.** A `# | Action | Reference` table at the top, then per-step detail underneath. Lets the agent skim and dispatch without re-reading the whole body. +6. **MCP-first, CLI-fallback.** Every Azure-touching skill lists MCP tools in a table with an explicit CLI fallback when MCP is not enabled. Discover before you act — never assume names or schemas. +7. 
**Explicit `## Outputs` contract.** What files, tables, or JSON the skill is contracted to produce. Eval graders score against this. +8. **`## Error Handling` table.** Rows of `Error | Cause | Fix` for the top failure modes. Cheap, high-signal documentation. +9. **`## Constraints` as Always / Never sections.** Explicit do/don't lists at the bottom catch drift. Use `**Always:**` and `**Never:**` headers — polarity is already conveyed, so do **not** prefix each bullet with ✅/❌ (each emoji costs 1-3 tokens and adds no information). Reserve emoji semantics for *status output* only: ⛔ blocking, ⚠️ warn, ❌ misconfigured, ✅ applied, 🔄 platform default, ❔ unknown. +10. **Cross-skill chains are explicit — emit a handoff chip.** Document `A → B → C` flows and end with a `## Next` pointer. VS Code Copilot Chat renders `@AgentName` mentions and `/skill-name` slash commands as clickable chips — the closest thing to a button in the chat surface. Always include at least one in `## Next` (e.g. `Next: **@Git-Ape Onboarding** — or run /git-ape-onboarding`) so the user can dispatch the follow-up with one click. Add `⛔ MANDATORY NEXT STEP` when the hand-off is required. +11. **Push depth into `references/`.** Keep `SKILL.md` close to the 1,300-token budget; long CLI examples, schema tables, and provider-specific patterns belong in `references/*.md` linked from the steps table. Bash one-shots can live in `scripts/`. +12. **No persona language.** Skills read like runbooks. Persona-lock belongs in the `.agent.md` that calls the skill, not in the skill itself. + +## Token budget + +`waza` runs a token audit on every skill. The thresholds live in [`.waza.yaml`](https://github.com/Azure/git-ape/blob/main/.waza.yaml): + +```yaml +tokens: + warningThreshold: 1000 # warn above this + fallbackLimit: 1300 # hard fail above this (waza tokens compare --strict) +``` + +Run `waza tokens count .github/skills/my-new-skill/SKILL.md` while iterating to keep the skill within budget. 
The [`/skill-improve`](./prompts#skill-improve) prompt automates the audit + edit loop and shows a before/after delta. + +## Local validation + +Before opening a PR, run: + +```bash +# Lint frontmatter and structure +waza check .github/skills/my-new-skill + +# (Optional) Estimate token count +waza tokens count .github/skills/my-new-skill + +# If you also wrote an eval suite (see "Eval suites"): +waza run .github/evals/my-new-skill/eval.yaml --no-cache +``` + +`waza check` validates the skill against the [agentskills.io](https://agentskills.io) frontmatter spec. + +## CI integration + +Adding the skill file is enough to ship it as a runtime capability. To opt the skill into the [PR-time eval matrix](./evals#how-ci-picks-up-your-eval), add a row to [`.github/evals/manifest.yaml`](https://github.com/Azure/git-ape/blob/main/.github/evals/manifest.yaml) **and** create `.github/evals/my-new-skill/eval.yaml`. The matrix runs the suite against every model in the selected tier on each PR that touches relevant files. + +## Common pitfalls + +- **Vague `description:` text** — the trigger grader will catch this. Specific verbs/nouns improve routing. +- **Directory name doesn't match `name:`** — `waza check` will flag it but the plugin loader silently skips the skill in some clients. Always match exactly. +- **Skill embeds persona** — move "you are X" framing into an `.agent.md` and have the agent call the skill. Skills should read like runbooks, not personas. +- **No output contract** — the `behavior` grader needs something concrete to verify. Document the literal output shape. 
+ +## Read next + +- [Eval suites](./evals) — score the skill across models +- [Prompts](./prompts#skill-improve) — local audit + edit loop +- [Authoring agents](./agents) — wrap the skill in a persona diff --git a/website/docs/skills/overview.md b/website/docs/skills/overview.md index e892f93..9753a8b 100644 --- a/website/docs/skills/overview.md +++ b/website/docs/skills/overview.md @@ -21,7 +21,7 @@ Skills are focused capabilities invoked by agents at specific stages of the depl | [Azure Naming Research](./azure-naming-research) | Research Azure naming constraints and CAF abbreviations for a given resource type. Use when you need to look up the official CAF slug, naming rules (length, scope, valid characters), and derive validation/cleaning regex patterns for an Azure resource. Triggers on: CAF abbreviation lookup, Azure naming rules research, resource naming constraints. | ✅ | | [Azure Policy Advisor](./azure-policy-advisor) | Assess Azure Policy compliance for ARM template resources. Queries existing subscription assignments and unassigned custom/built-in definitions, cross-references with Microsoft Learn recommendations. Produces per-resource policy recommendations with implementation options. | ✅ | | [Azure Resource Availability](./azure-resource-availability) | Query live Azure APIs to validate resource availability before template generation or deployment. Checks VM SKU restrictions, Kubernetes/runtime version support, API version compatibility, and subscription quota. Use during requirements gathering and preflight to catch deployment failures early. | ✅ | -| [Azure REST API Reference](./azure-rest-api-reference) | Look up Azure REST API and ARM template reference documentation for any resource type. Returns exact property schemas, required fields, valid values, and latest stable API versions. Use BEFORE generating or modifying ARM templates to ensure correctness. No Azure connection required. 
| ✅ | +| [Azure Rest Api Reference](./azure-rest-api-reference) | Look up Azure REST API and ARM template reference documentation for any resource type. Returns exact property schemas, required fields, valid values, and latest stable API versions. Use BEFORE generating or modifying ARM templates to ensure correctness. No Azure connection required. | ✅ | | [Azure Role Selector](./azure-role-selector) | Recommend least-privilege Azure RBAC roles for deployed resources. Finds minimal built-in roles matching desired permissions or creates custom role definitions. Use during security analysis or when configuring access for service principals and managed identities. | ✅ | | [Azure Security Analyzer](./azure-security-analyzer) | Analyze Azure resource configurations against security best practices using Azure MCP bestpractices service. Produces per-resource security assessment with severity ratings and recommendations. Use during template generation before deployment confirmation. | ✅ | | [Prereq Check](./prereq-check) | Check that all required CLI tools are installed, meet minimum versions, and have active auth sessions. Shows platform-specific install commands for anything missing. | ✅ | diff --git a/website/docs/skills/prereq-check.md b/website/docs/skills/prereq-check.md index 0e139a6..680db90 100644 --- a/website/docs/skills/prereq-check.md +++ b/website/docs/skills/prereq-check.md @@ -43,6 +43,25 @@ Validates the local environment has the CLI tools and auth sessions needed to ru | jq | `jq` | 1.6 | JSON parsing in scripts and workflows | | git | `git` | any | Version control (usually pre-installed) | +## Reported Command-Not-Found Errors + +Before running checks, inspect the user's prompt for explicit missing-command +reports such as `az: command not found`, `command not found: gh`, or "jq is not +found". Track any matching binaries (`az`, `gh`, `jq`, `git`) as +**reported missing tools**. + +A reported missing tool is actionable even if this terminal can find it. 
The +user may be in a different shell, PATH, dev container, or machine than the +agent. For each reported missing tool: + +- State what this terminal detected separately from what the user reported. +- Always include install/reinstall or PATH repair guidance for that tool. +- Always include verification commands, such as `command -v az` and + `az --version`. +- If this terminal finds the tool, explain that the likely issue is + shell-specific PATH/configuration drift and recommend reopening the shell or + reloading the shell profile after install/PATH changes. + ## Execution Playbook Run the steps below in order. Present results as a table. Stop at the first blocking failure. @@ -56,6 +75,7 @@ echo "Platform: $OS / $ARCH" ``` Map the result for install instructions: + - `Darwin` → macOS - `Linux` → Linux (check for `apt-get` vs `yum`/`dnf` to narrow distro) - `MINGW*` / `MSYS*` → Windows (git-bash) @@ -109,11 +129,15 @@ Show a table with pass/fail status: Mark a tool ❌ if it is missing OR below the minimum version. -### Step 4: Show Install Commands (only if something is missing) +### Step 4: Show Install Commands and PATH Repair Guidance -Show install commands only for missing or outdated tools, matching the detected platform. +Show install commands for any tool that is missing, outdated, or reported by +the user as "command not found", matching the detected platform. If a reported +tool is present in this terminal, frame the guidance as reinstall/PATH repair +rather than claiming the user's report was wrong. 
**macOS (Homebrew):** + ```bash brew install azure-cli # az brew install gh # GitHub CLI @@ -122,6 +146,7 @@ brew install git # git (if missing) ``` **Ubuntu / Debian:** + ```bash # az — Microsoft repository curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash @@ -140,6 +165,7 @@ sudo apt-get install -y jq ``` **RHEL / Fedora:** + ```bash # az sudo rpm --import https://packages.microsoft.com/keys/microsoft.asc @@ -153,6 +179,7 @@ sudo dnf install -y jq ``` **Windows (PowerShell with winget):** + ```powershell winget install Microsoft.AzureCLI winget install GitHub.cli @@ -161,6 +188,19 @@ winget install jqlang.jq > **Windows note:** Git-Ape skills require a BASH shell. Install [Git for Windows](https://gitforwindows.org/) and use git-bash. +**PATH verification and shell refresh (all platforms):** + +```bash +command -v az && az --version +command -v gh && gh --version +command -v jq && jq --version +command -v git && git --version +``` + +If a command is installed but still not found in the user's shell, close and +reopen the terminal, then reload the shell profile (`source ~/.bashrc`, +`source ~/.zshrc`, or equivalent) and run the verification commands again. + ### Step 5: Check Auth Sessions Only run this step if all tools passed Step 3. @@ -185,11 +225,13 @@ Present a final verdict: - **✅ READY** — All tools installed, versions OK, auth sessions active. Proceed with any Git-Ape skill. - **⚠️ TOOLS MISSING** — List what to install. Do not proceed until resolved. +- **⚠️ REPORTED COMMAND NOT FOUND** — This terminal can find the tool, but the user's shell reported it missing. Provide install/PATH repair guidance and verification commands before proceeding. - **⚠️ AUTH MISSING** — Tools OK but user needs to run `az login` and/or `gh auth login`. ## Agent Behavior 1. Run Steps 1–5 by executing the commands in the terminal. 2. Present the results table and install commands (if needed). -3. 
Do NOT install anything automatically — show the commands and let the user run them. -4. If everything passes, tell the user they're ready and suggest next steps (e.g., `/git-ape-onboarding`). +3. If the user reported "command not found", do NOT omit install/PATH guidance just because this terminal finds the tool. +4. Do NOT install anything automatically — show the commands and let the user run them. +5. If everything passes and no command-not-found issue was reported, tell the user they're ready and suggest next steps (e.g., `/git-ape-onboarding`). diff --git a/website/docs/workflows/overview.md b/website/docs/workflows/overview.md index fcea474..393bda1 100644 --- a/website/docs/workflows/overview.md +++ b/website/docs/workflows/overview.md @@ -33,6 +33,8 @@ Workflows ship as **`*.exampleyml`** files in `.github/workflows/` so they are i | [Git-Ape: Verify Setup](./git-ape-verify) | `git-ape-verify.exampleyml` | workflow_dispatch | verify | | [Issue Triage Agent](./issue-triage-agent-lock) | `issue-triage-agent.lock.yml` | schedule, workflow_dispatch | activation, agent, conclusion, detection, safe_outputs | | [PR Validation](./pr-validation) | `pr-validation.yml` | pull_request | structure-check, markdownlint | +| [Waza agent evals](./waza-agent-evals) | `waza-agent-evals.yml` | pull_request, workflow_dispatch | preflight, prepare, tokens, eval, comment | +| [Waza skill evals](./waza-evals) | `waza-evals.yml` | pull_request, workflow_dispatch | preflight, prepare, tokens, eval, comment | ## Pipeline Architecture diff --git a/website/docs/workflows/waza-agent-evals.md b/website/docs/workflows/waza-agent-evals.md new file mode 100644 index 0000000..96acc76 --- /dev/null +++ b/website/docs/workflows/waza-agent-evals.md @@ -0,0 +1,1178 @@ +--- +title: "Waza agent evals" +sidebar_label: "Waza agent evals" +description: "GitHub Actions workflow: Waza agent evals" +--- + + + + +# Waza agent evals + +**Workflow file:** `.github/workflows/waza-agent-evals.yml` + +## 
Triggers + +- **`pull_request`** — paths: `.github/agents/**/*.agent.md, .github/evals/agents/**, .github/workflows/waza-agent-evals.yml` +- **`workflow_dispatch`** + + +## Permissions + +- `contents: read` +- `pull-requests: write` + +## Jobs + +### `preflight` + +| Property | Value | +|----------|-------| +| **Display Name** | Preflight (check secrets) | +| **Runs On** | `ubuntu-latest` | +| **Steps** | 1 | + +### `prepare` + +| Property | Value | +|----------|-------| +| **Display Name** | Determine matrix | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight` | +| **Steps** | 2 | + +### `tokens` + +| Property | Value | +|----------|-------| +| **Display Name** | Agent file token comparison vs main (advisory) | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight` | +| **Steps** | 4 | + +### `eval` + +| Property | Value | +|----------|-------| +| **Display Name** | `${{ matrix.agent \|\| 'eval (skipped — no agent changes)' }}` | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight`, `prepare` | +| **Steps** | 6 | + +### `comment` + +| Property | Value | +|----------|-------| +| **Display Name** | Post advisory comment on PR | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight`, `prepare`, `eval`, `tokens` | +| **Steps** | 4 | + + + +## Source + +
+Click to view full workflow YAML + +```yaml +name: Waza agent evals + +# Advisory-mode evaluation of custom Git-Ape agents. +# Runs on PRs that touch a `.agent.md` or its eval directory. Posts a comment +# with results. Always non-blocking — eval failures never gate merges. +# +# Why a parallel workflow (vs. extending waza-evals.yml): +# - Different cost profile: agent evals are compound (agent + auto-loaded +# skills via plugin.json) and cost ~5 premium reqs each. No tier-based +# multi-model fan-out — single model (claude-sonnet-4.6) to cap quota. +# - Different artifacts: agents share `waza tokens profile` and `waza +# quality` parity with the skills workflow (each agent's `.agent.md` +# is staged as a temporary `SKILL.md` to satisfy waza's skill-walker); +# `waza check` is skipped because the agentskills.io spec it enforces +# rejects agent-specific frontmatter fields ('agents', 'argument-hint', +# 'model', 'tools', 'user-invocable') as invalid. +# - Different layout: agent evals live at `.github/evals/agents//`, +# not `.github/evals//`. The eval consumes a mirrored +# `.agent.md` next to `eval.yaml` via `skill_directories: ["."]`, +# which this workflow re-syncs from the canonical `.github/agents/` copy +# before running. +# +# Per-PR scoping: +# - Touch the workflow file → full matrix. +# - Touch `.github/agents/.agent.md` → that agent only (if an eval +# directory exists). +# - Touch `.github/evals/agents//...` → that agent only. +# - workflow_dispatch with no input → full matrix. +# - workflow_dispatch with `agent:` input → that agent only. +# +# Notes: +# - The canonical agent list is discovered from the filesystem +# (`.github/evals/agents//eval.yaml`) — no separate manifest. +# Drop in a new agent eval directory and this workflow picks it up +# on the next PR. +# - copilot-sdk needs a Copilot-scoped token. Default GITHUB_TOKEN does +# NOT carry that scope. We use the `COPILOT_GITHUB_TOKEN` repo secret +# (already configured for waza-evals.yml). 
+# - Comment posting uses the default token (only needs pull-requests: write). + +on: + pull_request: + paths: + - '.github/agents/**/*.agent.md' + - '.github/evals/agents/**' + - '.github/workflows/waza-agent-evals.yml' + workflow_dispatch: + inputs: + agent: + description: 'Single agent name to run (default: all agents with an eval directory)' + required: false + type: string + +permissions: + contents: read + pull-requests: write + +concurrency: + group: waza-agent-evals-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + # --------------------------------------------------------------------------- + # preflight: verify that the COPILOT_GITHUB_TOKEN secret is configured. + # When absent, every downstream job is skipped cleanly (no red checks). The + # maintainer setup steps are in PR #109 / README. + # --------------------------------------------------------------------------- + preflight: + name: Preflight (check secrets) + runs-on: ubuntu-latest + timeout-minutes: 2 + outputs: + enabled: ${{ steps.check.outputs.enabled }} + steps: + - name: Check COPILOT_GITHUB_TOKEN availability + id: check + env: + TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + run: | + if [ -z "${TOKEN:-}" ]; then + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza agent eval jobs. See repo README / PR #109 for setup." + exit 0 + fi + # Token is set — verify it can actually read the private microsoft/waza + # repo (release downloads need access). Reject silently if 401/403/404. + # Capture headers + body for diagnostics (no token is ever printed). 
+ hdr_file=$(mktemp) + body_file=$(mktemp) + http_code=$(curl -sS -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/microsoft/waza/releases/latest || true) + if [ "${http_code}" = "200" ]; then + echo "enabled=true" >> "$GITHUB_OUTPUT" + echo "COPILOT_GITHUB_TOKEN can read microsoft/waza — eval jobs will run." + else + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza agent eval jobs." + echo "--- diagnostic: response headers (token not included) ---" + grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true + echo "--- diagnostic: response body (first 500 bytes) ---" + head -c 500 "${body_file}" || true + echo + echo "--- diagnostic: token-user identity probe ---" + user_code=$(curl -sS -o "${body_file}.user" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/user || true) + echo "GET /user -> HTTP ${user_code}" + if [ "${user_code}" = "200" ]; then + jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user" + else + head -c 300 "${body_file}.user" || true + fi + echo + fi + rm -f "${hdr_file}" "${body_file}" "${body_file}.user" + + # --------------------------------------------------------------------------- + # prepare: discover all configured agents from the filesystem, then narrow + # to the subset affected by this PR (or run all on workflow_dispatch / a + # workflow-file change). 
Outputs: + # - agents: JSON array of selected agent names (drives comment ordering) + # - legs: JSON array of { agent } entries for matrix.include + # - mode/reason: human-readable scope info for the PR comment banner + # --------------------------------------------------------------------------- + prepare: + name: Determine matrix + runs-on: ubuntu-latest + timeout-minutes: 5 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + outputs: + agents: ${{ steps.select.outputs.agents }} + legs: ${{ steps.select.outputs.legs }} + reason: ${{ steps.select.outputs.reason }} + mode: ${{ steps.select.outputs.mode }} + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Select agents + id: select + env: + REQUESTED: ${{ inputs.agent }} + EVENT: ${{ github.event_name }} + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + set -euo pipefail + + # Canonical agent list: every directory under .github/evals/agents/ + # that contains an eval.yaml. Filesystem is the source of truth. + # The directory may not exist yet (no agent suites ported) — treat as empty. + if [ -d .github/evals/agents ]; then + ALL_AGENTS="$( + find .github/evals/agents -mindepth 2 -maxdepth 2 -name eval.yaml \ + | awk -F/ '{print $4}' \ + | sort -u \ + | jq -R -s -c 'split("\n") | map(select(length > 0))' + )" + else + ALL_AGENTS="[]" + fi + echo "ALL_AGENTS=$ALL_AGENTS" + + # emit + emit() { + local selected="$1" mode="$2" reason="$3" + local legs + legs="$(echo "$selected" | jq -c '[ .[] | { agent: . 
} ]')" + { + echo "agents=${selected}" + echo "legs=${legs}" + echo "mode=${mode}" + echo "reason=${reason}" + } >> "$GITHUB_OUTPUT" + echo "Selected agents: ${selected}" + echo "Legs: ${legs}" + echo "Mode: ${mode}" + echo "Reason: ${reason}" + } + + # --- Case 1: workflow_dispatch with single-agent input --- + if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then + if echo "$ALL_AGENTS" | jq -e --arg a "$REQUESTED" '. | index($a)' > /dev/null; then + emit "[\"$REQUESTED\"]" "single" "workflow_dispatch input ($REQUESTED)" + exit 0 + else + echo "::error::Requested agent '$REQUESTED' has no eval directory under .github/evals/agents/ (available: $ALL_AGENTS)" + exit 1 + fi + fi + + # --- Case 2: workflow_dispatch without input → full matrix --- + if [ "$EVENT" = "workflow_dispatch" ]; then + emit "$ALL_AGENTS" "full" "workflow_dispatch (no input → full matrix)" + exit 0 + fi + + # --- Case 3: pull_request — diff against base --- + if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then + emit "$ALL_AGENTS" "full" "pull_request: missing base/head SHA → full matrix" + exit 0 + fi + + git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true + + changed=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" || true) + if [ -z "$changed" ]; then + emit "[]" "none" "no files changed in PR" + exit 0 + fi + + echo "--- changed files ---" + echo "$changed" + echo "---------------------" + + # Workflow-file changes → full matrix (semantics of this workflow itself changed). + if echo "$changed" | grep -qE '^\.github/workflows/waza-agent-evals\.yml$'; then + emit "$ALL_AGENTS" "full" "workflow file changed → full matrix" + exit 0 + fi + + # Per-agent changes from both possible paths: + # .github/agents/.agent.md + # .github/evals/agents//... 
+ # shellcheck disable=SC2016 + changed_agents=$( + echo "$changed" | awk -F/ ' + /^\.github\/agents\/.+\.agent\.md$/ { + fname=$3 + sub(/\.agent\.md$/, "", fname) + print fname + } + /^\.github\/evals\/agents\// && NF >= 5 {print $4} + ' | sort -u + ) + + if [ -z "$changed_agents" ]; then + emit "[]" "none" "no per-agent files changed" + exit 0 + fi + + # Intersect with the configured (filesystem) list. + selected=$( + printf '%s\n' "$changed_agents" \ + | jq -R -s -c --argjson all "$ALL_AGENTS" \ + '[ split("\n")[] | select(length > 0) | select(IN($all[])) ]' + ) + + if [ "$selected" = "[]" ]; then + emit "[]" "none" "changed agent(s) have no eval directory: $(echo "$changed_agents" | tr '\n' ' ')" + exit 0 + fi + + count=$(echo "$selected" | jq 'length') + names=$(echo "$selected" | jq -r 'join(", ")') + emit "$selected" "subset" "diff-scoped: ${count} changed agent(s) — ${names}" + + # --------------------------------------------------------------------------- + # tokens: token comparison vs main for `.agent.md` files. Runs once (not + # per-matrix) and uploads a single JSON artifact consumed by the comment + # job. `waza tokens compare` is local computation only — no LLM, no quota + # cost. Advisory — never fails the workflow. + # --------------------------------------------------------------------------- + tokens: + name: Agent file token comparison vs main (advisory) + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + continue-on-error: true + env: + # Only used for the release-API lookup (public-repo read). Keeps the + # secret list consistent across all jobs in this workflow. 
+ GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (latest GitHub release) + run: | + set -euo pipefail + waza_version="$(curl -fsSL \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + https://api.github.com/repos/microsoft/waza/releases/latest \ + | jq -r '.tag_name')" + if [ -z "${waza_version}" ] || [ "${waza_version}" = "null" ]; then + echo "::error::could not resolve latest waza release tag" + exit 1 + fi + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Compare .agent.md token counts vs origin/main + # Advisory step — never gate the workflow on filter quirks. Disable + # `-e` (GitHub injects `bash -e {0}`) so a single jq failure can't + # kill the step before the recovery branches run. + shell: bash {0} + run: | + set -uo pipefail + mkdir -p .waza-results + # `waza tokens compare` without --skills walks every .md file + # in the repo. We post-filter to .github/agents/*.agent.md + # entries only. --threshold 0 keeps the exit code clean + # (advisory, never gates). + waza tokens compare origin/main --threshold 0 --format json \ + > .waza-results/tokens-compare-raw.json 2>&1 || true + + # Filter to .agent.md files in the agents directory. 
Tolerate + # multiple top-level schemas across waza versions — if the JSON + # has a top-level `files` array, filter that; otherwise pass + # the raw payload through and let the comment script decide. + # `.path // ""` makes the regex test null-safe (some waza + # versions emit summary/totals entries with a null path). + if jq -e 'type == "object" and has("files")' \ + .waza-results/tokens-compare-raw.json > /dev/null 2>&1; then + jq '{ + base: .base, + head: .head, + files: [ .files[] + | select((.path // "") | test("^\\.github/agents/.+\\.agent\\.md$")) ] + }' .waza-results/tokens-compare-raw.json \ + > .waza-results/tokens-compare.json \ + || cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json + else + cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json + fi + + echo "--- filtered agent token comparison ---" + cat .waza-results/tokens-compare.json || true + exit 0 + + - name: Upload token comparison artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-agent-tokens-compare + path: .waza-results/tokens-compare.json + retention-days: 14 + if-no-files-found: warn + include-hidden-files: true + + # --------------------------------------------------------------------------- + # eval: matrix (agent). Each leg runs `waza run` on the agent's compound + # eval and produces a markdown snippet for the PR comment. Single-model + # (claude-sonnet-4.6) to cap quota cost — each leg averages ~5 premium reqs. 
+ # --------------------------------------------------------------------------- + eval: + name: "${{ matrix.agent || 'eval (skipped — no agent changes)' }}" + needs: [preflight, prepare] + if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != '' + runs-on: ubuntu-latest + timeout-minutes: 20 + continue-on-error: true + strategy: + fail-fast: false + # Throttle concurrent SDK sessions to keep us under the Copilot models + # API rate-limit ceiling. Without this cap, bursting 8 agent legs in + # parallel reliably trips `Failed to list models: 429` on a subset of + # legs — they fail in <2s without consuming any premium requests and + # surface as fake low scores. 3 concurrent SDK sessions has empirically + # stayed under the limit; raise cautiously. + max-parallel: 3 + matrix: + include: ${{ fromJSON(needs.prepare.outputs.legs) }} + env: + # copilot-sdk authenticates with this token. Default GITHUB_TOKEN does + # not carry Copilot scope, so we use a dedicated PAT in repo secrets. 
+ GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (latest GitHub release) + run: | + set -euo pipefail + waza_version="$(curl -fsSL \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + https://api.github.com/repos/microsoft/waza/releases/latest \ + | jq -r '.tag_name')" + if [ -z "${waza_version}" ] || [ "${waza_version}" = "null" ]; then + echo "::error::could not resolve latest waza release tag" + exit 1 + fi + echo "Installing waza ${waza_version}" + + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Sync mirrored .agent.md from canonical .github/agents/ + # The eval's `skill_directories: ["."]` loads the sibling .agent.md + # mirror; the canonical source lives in .github/agents/. Copy on + # every run so the eval always reflects the canonical agent file + # under test, without requiring contributors to keep them in sync + # by hand. 
+ run: | + set -euo pipefail + agent="${{ matrix.agent }}" + src=".github/agents/${agent}.agent.md" + dst=".github/evals/agents/${agent}/${agent}.agent.md" + if [ -f "$src" ] && [ -d ".github/evals/agents/${agent}" ]; then + cp "$src" "$dst" + echo "Synced ${src} -> ${dst}" + else + echo "::warning::Missing canonical agent file or eval dir for ${agent}: src=${src}, dst-dir=.github/evals/agents/${agent}" + fi + + - name: Run waza eval (advisory) + id: run + run: | + # GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT + # disable -e, so a non-zero exit from `waza run` (e.g. metric below + # threshold) kills the script before `rc=$?` runs. Explicitly + # disable errexit so we can capture the code and surface it in the + # PR comment instead of failing the leg silently. + set +e + set -uo pipefail + mkdir -p .waza-results + agent="${{ matrix.agent }}" + spec=".github/evals/agents/${agent}/eval.yaml" + + # ---- Retry-on-infra-failure wrapper ------------------------------- + # Three infra-failure classes can corrupt a leg WITHOUT being model + # quality signal — see the same pattern in waza-evals.yml and + # waza-trends.yml. Detect ALL three classes per attempt, retry on + # any of them (with longer backoff on quota), and INFRA_FAILED + # the leg if retries exhaust so we don't blend fake low scores + # into the PR comment: + # 1. `Session not found` (JSON-RPC -32603): the Copilot SDK + # dropped the session before waza's `prompt` grader could + # resume it (continue_session: true). Validations get wiped + # to null on affected tasks, dragging the leg aggregate down. + # 2. `failed to run grader`: the judge LLM backend itself + # crashed during a grader call. Status=error, empty + # validations, fake low score. + # 3. `Failed to list models: 429`: Copilot models API rate-limit + # hit BEFORE the agent could start. Worst case: all tasks + # return status=error in <2s with deterministic 0-ish scores. + # + # All three are transient. 
We retry up to 2 times (3 total + # attempts). On exhaustion, we delete the corrupt JSON and write + # an INFRA_FAILED sidecar + markdown notice; the aggregator's + # fallback path (no JSON → use rawMd) will surface that notice + # instead of polluting the score table. + # + # --judge-model is decoupled from the executor model so quality + # scores are always judged by claude-sonnet-4.6 even if we ever + # add per-agent model overrides. + max_attempts=3 + attempt=0 + rc=0 + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + echo "::group::waza run attempt ${attempt}/${max_attempts} for ${agent}" + rc=0 + waza run "${spec}" \ + --model "claude-sonnet-4.6" \ + --judge-model "claude-sonnet-4.6" \ + --suggest \ + --recommend \ + --format "github-comment" \ + --output ".waza-results/${agent}.json" \ + --reporter "junit:.waza-results/${agent}.junit.xml" \ + > ".waza-results/${agent}.md" + rc=$? + echo "::endgroup::" + + if [ ! -f ".waza-results/${agent}.json" ]; then + echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})" + if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi + break + fi + + # Count each infra-failure class in this attempt's artifact. + infra_counts=$(jq -r ' + [.tasks[]?.runs[]? 
| (.error_msg // "")] as $errs + | { session: ([$errs[] | select(contains("Session not found"))] | length), + grader: ([$errs[] | select(contains("failed to run grader"))] | length), + quota: ([$errs[] | select(contains("Failed to list models: 429"))] | length) } + | "\(.session) \(.grader) \(.quota)" + ' ".waza-results/${agent}.json" 2>/dev/null || echo "0 0 0") + session_errs=$(echo "${infra_counts}" | awk '{print $1}') + grader_errs=$(echo "${infra_counts}" | awk '{print $2}') + quota_errs=$(echo "${infra_counts}" | awk '{print $3}') + total_infra=$((session_errs + grader_errs + quota_errs)) + + if [ "${total_infra}" = "0" ]; then + echo "::notice::${agent} attempt ${attempt} clean (no infra-failure errors)" + break + fi + + echo "::warning::${agent} attempt ${attempt} hit ${session_errs} session-not-found + ${grader_errs} grader-infra + ${quota_errs} quota-429 error(s)" + if [ $attempt -lt $max_attempts ]; then + # Discard partial artifacts so the next attempt is independent. + rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.md" ".waza-results/${agent}.junit.xml" + # Quota errors need longer backoff than session/grader to let + # the Copilot models API window reset. + if [ "${quota_errs}" != "0" ]; then sleep 30; else sleep 5; fi + fi + done + + # Final classification: if any infra errors remain after all + # attempts, treat the leg as INFRA_FAILED and discard the corrupt + # JSON so it doesn't pollute the score table. + final_session=0 + final_grader=0 + final_quota=0 + if [ -f ".waza-results/${agent}.json" ]; then + infra_counts=$(jq -r ' + [.tasks[]?.runs[]? 
| (.error_msg // "")] as $errs + | { session: ([$errs[] | select(contains("Session not found"))] | length), + grader: ([$errs[] | select(contains("failed to run grader"))] | length), + quota: ([$errs[] | select(contains("Failed to list models: 429"))] | length) } + | "\(.session) \(.grader) \(.quota)" + ' ".waza-results/${agent}.json" 2>/dev/null || echo "0 0 0") + final_session=$(echo "${infra_counts}" | awk '{print $1}') + final_grader=$(echo "${infra_counts}" | awk '{print $2}') + final_quota=$(echo "${infra_counts}" | awk '{print $3}') + fi + final_infra=$((final_session + final_grader + final_quota)) + + if [ "${final_infra}" != "0" ]; then + echo "::error::${agent} still has ${final_session} session-not-found + ${final_grader} grader-infra + ${final_quota} quota-429 error(s) after ${max_attempts} attempts — discarding corrupt artifact" + printf 'session_not_found_errors=%s\ngrader_failed_errors=%s\nquota_429_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \ + "${final_session}" "${final_grader}" "${final_quota}" "${max_attempts}" "${rc}" \ + > ".waza-results/${agent}.infra-failed" + rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.junit.xml" + # Replace the markdown with a clear INFRA_FAILED notice. Use + # printf (no heredoc) because heredoc EOF terminators clash + # with YAML block-scalar indentation rules in `run: |` steps. + { + printf '### `%s` — INFRA_FAILED\n\n' "${agent}" + printf 'waza run hit infra-level error(s) from the Copilot SDK ' + printf 'after **%s attempt(s)**:\n\n' "${max_attempts}" + printf -- '- `Session not found` (JSON-RPC -32603): **%s**\n' "${final_session}" + printf -- '- `failed to run grader` (judge backend crash): **%s**\n' "${final_grader}" + printf -- '- `Failed to list models: 429` (Copilot quota): **%s**\n\n' "${final_quota}" + printf 'These error classes are transient infrastructure issues, ' + printf 'not model-quality signal. 
**No score is reported for this leg** ' + printf '— treating a corrupted run as a low score would be misleading. ' + printf 'See the workflow logs and the `waza-agent-results-%s` artifact for details.\n' "${agent}" + } > ".waza-results/${agent}.md" + fi + # ---- end retry wrapper -------------------------------------------- + + echo "exit_code=${rc}" >> "$GITHUB_OUTPUT" + echo + echo "--- captured PR-comment markdown ---" + cat ".waza-results/${agent}.md" || true + # Never fail the step itself — surface the code in the comment. + exit 0 + + - name: Agent signal — tokens profile + quality (advisory) + # Parity with `waza-evals.yml`: surface `waza tokens profile` and + # `waza quality` output for `.agent.md` files. Both commands target + # `SKILL.md` only, so we stage a temporary copy of the agent file + # named `SKILL.md` in a NON-DOT directory ('.waza-results/...' or + # any other dotted path is silently skipped by waza's workspace + # walker). The stage dir is named with the agent slug so judge + # output ('📊 : ...') and table headers display the agent + # name instead of a random tmp suffix. + # + # `waza check` is intentionally skipped: it validates the + # agentskills.io SKILL spec, which rejects agent-specific + # frontmatter fields ('agents', 'argument-hint', 'model', + # 'tools', 'user-invocable') as invalid. Running it would + # surface confusing "spec failures" that aren't real agent + # quality signal. + # + # `waza quality` consumes ~1 premium Copilot request per leg via + # its LLM judge (claude-sonnet-4.6 by default). Failures are + # tolerated with `|| true` so a flaky judge call doesn't tank + # the whole leg. + if: always() + run: | + set -uo pipefail + mkdir -p .waza-results + agent="${{ matrix.agent }}" + src=".github/agents/${agent}.agent.md" + if [ ! 
-f "$src" ]; then + echo "::warning::canonical agent file missing for ${agent}: ${src} — skipping signal steps" + exit 0 + fi + + # Stage as SKILL.md in a non-dotted path so waza's workspace + # walker (which skips hidden/dotted dirs) finds it. + stage_root="waza-agent-stage" + stage_dir="${stage_root}/${agent}" + rm -rf "$stage_dir" + mkdir -p "$stage_dir" + cp "$src" "${stage_dir}/SKILL.md" + + echo "::group::waza tokens profile (${agent})" + waza tokens profile "$stage_dir" \ + > ".waza-results/${agent}-tokens-profile.txt" 2>&1 || true + # Strip the temp stage_root prefix from the human-readable output + # so the display reads "agent-name:" instead of + # "waza-agent-stage/agent-name:". + sed -i "s|${stage_root}/||g" ".waza-results/${agent}-tokens-profile.txt" || true + cat ".waza-results/${agent}-tokens-profile.txt" || true + echo "::endgroup::" + + echo "::group::waza quality (${agent}) — LLM judge, ~1 premium req" + waza quality "$stage_dir" --format table \ + > ".waza-results/${agent}-quality.txt" 2>&1 || true + sed -i "s|${stage_root}/||g" ".waza-results/${agent}-quality.txt" || true + cat ".waza-results/${agent}-quality.txt" || true + echo "::endgroup::" + + # Clean up stage so it doesn't end up in the artifact. + rm -rf "$stage_root" + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-agent-results-${{ matrix.agent }} + path: .waza-results/ + retention-days: 14 + if-no-files-found: warn + # `.waza-results/` starts with a dot, and upload-artifact treats + # any path segment starting with `.` as hidden by default. Without + # this, the artifact is silently empty. + include-hidden-files: true + + # --------------------------------------------------------------------------- + # comment: fan-in. Downloads all artifacts and posts one aggregated comment. + # Idempotent — uses an HTML marker to update the same comment on subsequent + # pushes instead of stacking new ones. 
+ # --------------------------------------------------------------------------- + comment: + name: Post advisory comment on PR + needs: [preflight, prepare, eval, tokens] + if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always() + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Download all eval artifacts + uses: actions/download-artifact@v8 + with: + path: artifacts + pattern: waza-agent-results-* + merge-multiple: false + + - name: Download token comparison artifact + uses: actions/download-artifact@v8 + with: + name: waza-agent-tokens-compare + path: artifacts/waza-agent-tokens-compare + continue-on-error: true + + # `actions/download-artifact@v8` is documented as creating a per-artifact + # subdirectory under `path` when `pattern` is used with `merge-multiple: + # false`. In practice, when only ONE artifact matches the pattern + # (typical for diff-scoped PR runs with a single changed agent), v8 + # extracts the artifact contents directly into `path` with no + # subdirectory — the same behavior as a single-name download. The + # downstream aggregator script expects the nested layout, so this step + # normalizes the flat case back into nested. Idempotent: a no-op when + # the layout is already nested (multi-agent runs). + - name: Normalize artifact layout (handle v8 single-match flattening) + shell: bash + run: | + set -euo pipefail + if [ ! 
-d artifacts ]; then + echo "artifacts/ does not exist — nothing to normalize" + exit 0 + fi + + echo "--- artifact layout BEFORE normalization ---" + find artifacts -maxdepth 3 -mindepth 1 | sort + + shopt -s nullglob + cd artifacts + for f in *.json *.md *.junit.xml *-quality.txt *-tokens-profile.txt *.infra-failed; do + [ -f "$f" ] || continue + agent="$f" + for suf in -quality.txt -tokens-profile.txt .junit.xml .json .md .infra-failed; do + agent="${agent%"$suf"}" + done + if [ -z "$agent" ] || [ "$agent" = "$f" ]; then + echo "::warning::Could not derive agent slug from filename '$f' — leaving in place" + continue + fi + mkdir -p "waza-agent-results-${agent}" + mv -- "$f" "waza-agent-results-${agent}/" + echo " moved: $f -> waza-agent-results-${agent}/" + done + cd - + + echo "--- artifact layout AFTER normalization ---" + find artifacts -maxdepth 3 -mindepth 1 | sort + + - name: Aggregate and post comment + uses: actions/github-script@v9 + env: + PREPARE_MODE: ${{ needs.prepare.outputs.mode }} + PREPARE_REASON: ${{ needs.prepare.outputs.reason }} + PREPARE_AGENTS: ${{ needs.prepare.outputs.agents }} + with: + # Default GITHUB_TOKEN — has `pull-requests: write` and is the + # right identity for bot-style comments. + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const path = require('path'); + + const agents = JSON.parse(process.env.PREPARE_AGENTS || '[]'); + const root = 'artifacts'; + const allDirs = fs.existsSync(root) + ? fs.readdirSync(root) + .filter((d) => d.startsWith('waza-agent-results-')) + .sort() + : []; + + // ---------------- helpers ---------------- + function readFileOrNull(filePath) { + try { + if (!fs.existsSync(filePath)) return null; + const c = fs.readFileSync(filePath, 'utf8'); + return c.length > 0 ? 
c : null; + } catch (e) { + core.debug(`readFileOrNull: ${filePath} -> ${e.message}`); + return null; + } + } + + function readJsonOrNull(filePath) { + const raw = readFileOrNull(filePath); + if (!raw) return null; + try { + return JSON.parse(raw); + } catch (e) { + core.debug(`readJsonOrNull: parse failed for ${filePath}: ${e.message}`); + return null; + } + } + + function fmtMs(ms) { + if (typeof ms !== 'number' || !isFinite(ms)) return '—'; + if (ms < 1000) return `${ms}ms`; + const s = ms / 1000; + if (s < 60) return `${s.toFixed(1)}s`; + const m = Math.floor(s / 60); + return `${m}m${(s - m * 60).toFixed(0)}s`; + } + + function fmtTokens(n) { + if (typeof n !== 'number' || !isFinite(n) || n === 0) return '—'; + if (n < 1000) return String(n); + if (n < 1_000_000) return `${(n / 1000).toFixed(1)}K`; + return `${(n / 1_000_000).toFixed(2)}M`; + } + + function scoreEmoji(score, succeeded, total) { + if (typeof score !== 'number') return '⚠️'; + if (succeeded === total && total > 0) return '✅'; + if (succeeded > 0) return '⚠️'; + return '❌'; + } + + function truncateText(text, maxLines) { + if (!text) return ''; + const lines = text.split('\n'); + if (lines.length <= maxLines) return text; + return lines.slice(0, maxLines).join('\n') + + `\n… (${lines.length - maxLines} more lines truncated)`; + } + + // ---------------- top-level: agent token compare ---------------- + const tcPath = path.join(root, 'waza-agent-tokens-compare', 'tokens-compare.json'); + const tc = readJsonOrNull(tcPath); + let tokenCompareSection = ''; + if (tc) { + const files = Array.isArray(tc.files) ? tc.files : []; + if (files.length > 0) { + const rows = files + .map((f) => { + const before = (f.base_tokens != null) ? f.base_tokens + : (f.before != null) ? f.before : '—'; + const after = (f.head_tokens != null) ? f.head_tokens + : (f.after != null) ? f.after + : (f.tokens != null) ? f.tokens : '—'; + const delta = (f.delta != null) ? 
f.delta + : (typeof before === 'number' && typeof after === 'number') + ? (after - before) : '—'; + const pct = (f.percent_change != null) ? `${f.percent_change.toFixed(1)}%` + : (typeof before === 'number' && before > 0 && typeof delta === 'number') + ? `${(delta * 100 / before).toFixed(1)}%` + : '—'; + const sign = (typeof delta === 'number' && delta > 0) ? '+' : ''; + return `| \`${f.path}\` | ${before} | ${after} | ${sign}${delta} | ${pct} |`; + }) + .join('\n'); + tokenCompareSection = [ + '
<details><summary>📊 Agent file token comparison vs main (advisory)</summary>', + '', + `| File | Base | Head | Δ | % |`, + `|---|---|---|---|---|`, + rows, + '', + '</details>
', + ].join('\n'); + } else { + tokenCompareSection = [ + '
<details><summary>📊 Agent file token comparison vs main (advisory)</summary>', + '', + '_No `.agent.md` files changed vs `main` (or token-compare returned no entries)._', + '', + '</details>
', + ].join('\n'); + } + } + + // ---------------- per-agent sections ---------------- + const byAgent = new Map(); + for (const d of allDirs) { + const agent = d.replace(/^waza-agent-results-/, ''); + byAgent.set(agent, d); + } + + const sections = []; + for (const agent of agents) { + const dir = byAgent.get(agent); + if (!dir) { + sections.push([ + `### Agent: \`${agent}\``, + '', + '_No artifact produced. See workflow logs._', + ].join('\n')); + continue; + } + + const jsonPath = path.join(root, dir, `${agent}.json`); + const rawMd = readFileOrNull(path.join(root, dir, `${agent}.md`)); + const json = readJsonOrNull(jsonPath); + + if (!json) { + // Fall back to raw github-comment markdown if JSON is unavailable. + sections.push([ + `### Agent: \`${agent}\``, + '', + rawMd || '_No output captured. See workflow logs._', + ].join('\n')); + continue; + } + + const summary = json.summary || {}; + const usage = summary.usage || {}; + const total = summary.total_tests || 0; + const ok = summary.succeeded || 0; + const failed = summary.failed || 0; + const score = (typeof summary.aggregate_score === 'number') + ? summary.aggregate_score.toFixed(2) : '—'; + const emoji = scoreEmoji(summary.aggregate_score, ok, total); + + const headline = + `**Score:** ${emoji} ${score} (${ok}/${total} tasks) | ` + + `**Duration:** ${fmtMs(summary.duration_ms)} | ` + + `**Cost:** ${usage.premium_requests ?? '—'} premium req${usage.premium_requests === 1 ? '' : 's'}, ` + + `${usage.turns ?? '—'} turns | ` + + `**Tokens:** ${fmtTokens(usage.input_tokens)} in / ${fmtTokens(usage.output_tokens)} out` + + (usage.cache_read_tokens ? ` / ${fmtTokens(usage.cache_read_tokens)} cache-read` : ''); + + // Per-task table. + const tasks = Array.isArray(json.tasks) ? json.tasks : []; + const taskRows = tasks.map((t) => { + const run0 = (t.runs && t.runs[0]) || {}; + const sd = run0.session_digest || {}; + const taskScore = (typeof t.score === 'number') ? 
t.score.toFixed(2) + : (typeof run0.score === 'number') ? run0.score.toFixed(2) + : '—'; + const passed = run0.status === 'passed' || run0.status === 'pass'; + const statusEmoji = passed ? '✅' : (run0.status === 'error' ? '⚠️' : '❌'); + const toolCalls = sd.tool_call_count ?? '—'; + const graders = (run0.validations) + ? Object.keys(run0.validations).join(', ') + : '—'; + const name = (t.display_name || t.name || t.id || '(unnamed)') + .replace(/\|/g, '\\|'); + return `| ${name} | ${taskScore} | ${statusEmoji} | ${toolCalls} | ${graders} |`; + }).join('\n'); + const taskTable = tasks.length > 0 + ? [ + '| Task | Score | Status | Tool calls | Graders |', + '|---|---|---|---|---|', + taskRows, + ].join('\n') + : '_No task data in JSON output._'; + + // Per-agent signal sections (parity with skills workflow): + // model-independent, fed by per-leg `waza tokens profile` + // and `waza quality` output. Both files live alongside the + // eval JSON/markdown in the same artifact dir. + const tokensProfilePath = path.join(root, dir, `${agent}-tokens-profile.txt`); + const tokensProfileRaw = readFileOrNull(tokensProfilePath); + const tokensSection = tokensProfileRaw + ? [ + '
<details><summary>🔢 Tokens (count + profile)</summary>', + '', + '```', + tokensProfileRaw.trim(), + '```', + '', + '</details>
', + ].join('\n') + : ''; + + const qualityPath = path.join(root, dir, `${agent}-quality.txt`); + const qualityRaw = readFileOrNull(qualityPath); + const qualitySection = qualityRaw + ? [ + '
<details><summary>🎯 Quality (5-dim table)</summary>', + '', + '```', + qualityRaw.trim(), + '```', + '', + '_Scored by `waza quality` with `claude-sonnet-4.6` as LLM judge. The agent\'s `.agent.md` is staged as `SKILL.md` for analysis; treat dimensions as advisory signal (the rubric was authored for skills)._', + '', + '</details>
', + ].join('\n') + : ''; + + // Failure details (only when something failed). + const failureDetails = []; + for (const t of tasks) { + const run0 = (t.runs && t.runs[0]) || {}; + if (run0.status === 'passed' || run0.status === 'pass') continue; + const name = t.display_name || t.name || t.id || '(unnamed)'; + const lines = [`#### Task: ${name}`, '']; + const validations = run0.validations || {}; + for (const [gname, v] of Object.entries(validations)) { + if (v.passed) continue; + const fb = (v.feedback || '_no feedback_').replace(/\n/g, ' ').slice(0, 400); + lines.push(`- ❌ **${gname}** (${(v.score ?? 0).toFixed(2)}): ${fb}`); + } + const out = run0.final_output; + if (out && typeof out === 'string') { + lines.push('', '
<details><summary>Agent output (truncated)</summary>', '', '```', truncateText(out, 30), '```', '', '</details>
'); + } + failureDetails.push(lines.join('\n')); + } + const failurePanel = (failed > 0 && failureDetails.length > 0) + ? [ + '
<details><summary>🐛 Failure details</summary>', + '', + failureDetails.join('\n\n---\n\n'), + '', + '</details>
', + ].join('\n') + : ''; + + // Suggestion / recommendation report (--suggest --recommend). + const sug = (json.metadata && json.metadata.suggestion_report) || null; + const rec = (json.metadata && json.metadata.recommendation_report) || null; + const suggestionParts = []; + if (sug && typeof sug === 'string' && sug.trim().length > 0) { + suggestionParts.push(sug.trim()); + } + if (rec && typeof rec === 'string' && rec.trim().length > 0) { + if (suggestionParts.length > 0) suggestionParts.push('\n\n---\n\n'); + suggestionParts.push(rec.trim()); + } + const suggestionPanel = suggestionParts.length > 0 + ? [ + failed > 0 + ? '
<details><summary>💡 Suggestions / root-cause analysis</summary>' + : '
<details><summary>💡 Suggestions / recommendations</summary>', + '', + suggestionParts.join(''), + '', + '</details>
', + ].join('\n') + : ''; + + // Raw eval output (closed by default — fallback / drill-down). + const rawPanel = rawMd + ? [ + '
<details><summary>📄 Full eval output (raw --format github-comment markdown)</summary>', + '', + rawMd.trim(), + '', + '</details>
', + ].join('\n') + : ''; + + const parts = [ + `### Agent: \`${agent}\``, + '', + headline, + '', + taskTable, + ]; + if (tokensSection) { parts.push('', tokensSection); } + if (qualitySection) { parts.push('', qualitySection); } + if (failurePanel) { parts.push('', failurePanel); } + if (suggestionPanel) { parts.push('', suggestionPanel); } + if (rawPanel) { parts.push('', rawPanel); } + sections.push(parts.join('\n')); + } + + const totalLegs = allDirs.length; + + const prepareMode = (process.env.PREPARE_MODE || '').trim(); + const prepareReason = (process.env.PREPARE_REASON || '').trim(); + let scopeBanner = ''; + if (prepareMode === 'none') { + scopeBanner = + '> ℹ️ **No agents evaluated.** ' + (prepareReason || 'No relevant changes detected.'); + } else if (prepareMode === 'subset') { + scopeBanner = + '> 🎯 **Diff-scoped run.** ' + (prepareReason || 'Only changed agents evaluated.') + + ' Touch `.github/workflows/waza-agent-evals.yml` or trigger `workflow_dispatch` to run all agents.'; + } else if (prepareMode === 'single') { + scopeBanner = + '> 🎯 **Single-agent run.** ' + (prepareReason || 'workflow_dispatch input.'); + } else if (prepareMode === 'full') { + scopeBanner = + '> 🔁 **Full matrix run.** ' + (prepareReason || 'All configured agents evaluated.'); + } + + const header = [ + '', + '## 🤖 Waza agent evals (advisory)', + '', + scopeBanner, + scopeBanner ? '' : null, + 'Ran ' + totalLegs + ' agent eval' + (totalLegs === 1 ? '' : 's') + + ' against `claude-sonnet-4.6`. 
Each eval consumes ~5 premium Copilot requests; results are non-blocking — investigate failures via the workflow logs and the per-agent `waza-agent-results-*` artifacts.', + '', + '> **How this works:** This workflow auto-syncs the canonical `.github/agents/.agent.md` into the sibling mirror inside `.github/evals/agents//` before each run, so the score below reflects the version of the agent in this PR — not whatever was committed when the eval was first wired up.', + '', + ].filter((line) => line !== null).join('\n'); + + const sectionsBlock = sections.length > 0 + ? sections.join('\n\n---\n\n') + : '_No agents in scope for this PR._'; + const body = [ + header.replace(/\s+$/, ''), + tokenCompareSection.replace(/\s+$/, ''), + sectionsBlock, + ].filter((s) => s.length > 0).join('\n\n') + '\n'; + + const { owner, repo } = context.repo; + const issue_number = context.payload.pull_request.number; + const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number }); + const existing = comments.find((c) => c.body && c.body.includes('')); + if (existing) { + await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body }); + } else { + await github.rest.issues.createComment({ owner, repo, issue_number, body }); + } + +``` + +
diff --git a/website/docs/workflows/waza-evals.md b/website/docs/workflows/waza-evals.md new file mode 100644 index 0000000..e6f0e09 --- /dev/null +++ b/website/docs/workflows/waza-evals.md @@ -0,0 +1,954 @@ +--- +title: "Waza skill evals" +sidebar_label: "Waza skill evals" +description: "GitHub Actions workflow: Waza skill evals" +--- + + + + +# Waza skill evals + +**Workflow file:** `.github/workflows/waza-evals.yml` + +## Triggers + +- **`pull_request`** — paths: `.github/skills/**/SKILL.md, .github/skills/**/eval.yaml, .github/skills/**/tasks/**...` +- **`workflow_dispatch`** + + +## Permissions + +- `contents: read` +- `pull-requests: write` + +## Jobs + +### `preflight` + +| Property | Value | +|----------|-------| +| **Display Name** | Preflight (check secrets) | +| **Runs On** | `ubuntu-latest` | +| **Steps** | 1 | + +### `prepare` + +| Property | Value | +|----------|-------| +| **Display Name** | Determine matrix | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight` | +| **Steps** | 2 | + +### `tokens` + +| Property | Value | +|----------|-------| +| **Display Name** | Token comparison vs main (advisory) | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight` | +| **Steps** | 4 | + +### `eval` + +| Property | Value | +|----------|-------| +| **Display Name** | ${{ matrix.skill || 'eval' }} / ${{ matrix.model || 'skipped (no skill changes)' }} | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight`, `prepare` | +| **Steps** | 7 | + +### `comment` + +| Property | Value | +|----------|-------| +| **Display Name** | Post advisory comment on PR | +| **Runs On** | `ubuntu-latest` | +| **Depends On** | `preflight`, `prepare`, `eval`, `tokens` | +| **Steps** | 3 | + + + +## Source + +
+Click to view full workflow YAML + +```yaml +name: Waza skill evals + +# Advisory-mode evaluation of agent skills. +# Runs on PRs that touch SKILL.md or any eval file. Posts a comment with results. +# Always non-blocking — eval failures never gate merges. +# +# Single source of truth: .github/evals/manifest.yaml +# Lists configured skills, tier classification, and per-tier model fan-out. +# Edit that file to add/remove a skill or promote a tier — this workflow +# reads it dynamically and needs no changes. +# +# Architecture: +# - `prepare` job: reads the manifest, then diffs base..head (or honors a +# workflow_dispatch input) to determine which subset of skills to +# evaluate. Builds the matrix.include payload (one entry per +# skill × tier-model). Project-wide config changes (.waza.yaml, this +# workflow file, the manifest) trigger the full matrix. +# - `tokens` job: runs once, compares token counts across all skills vs main +# and uploads the result for the comment job. Cheap, always runs. +# - `eval` job: matrix expanded purely from `prepare.outputs.legs`. Each +# leg runs the eval suite plus per-skill signal steps (tokens profile, +# quality, check). Uploads per-leg artifacts. Skipped entirely if no +# skills changed. +# - `comment` job: fan-in. Downloads all artifacts and posts a single +# `` PR comment with one section per skill +# (in manifest order) plus a header noting which skills ran and why. +# +# Per-PR scoping: +# - Skill-only changes → only changed skills run (saves Copilot quota). +# - .waza.yaml, manifest, or workflow file changes → full matrix. +# - .github/evals// changes → that skill only. +# - workflow_dispatch with no input → full matrix. +# - workflow_dispatch with `skill:` input → that skill only. +# +# Notes: +# - waza's eval schema only supports `skill:`. Custom agents under +# .github/agents/*.agent.md are *not* evaluable by this workflow. See +# docs/WAZA.md "Agent evals" for the upstream limitation. 
+# - copilot-sdk needs a Copilot-scoped token. Default GITHUB_TOKEN does +# NOT carry that scope. We use the `COPILOT_GITHUB_TOKEN` repo secret. +# Comment posting uses the default token (only needs pull-requests: write). + +on: + pull_request: + paths: + - '.github/skills/**/SKILL.md' + - '.github/skills/**/eval.yaml' + - '.github/skills/**/tasks/**' + - '.github/skills/**/fixtures/**' + - '.github/evals/**' + - '.waza.yaml' + - '.github/workflows/waza-evals.yml' + workflow_dispatch: + inputs: + skill: + description: 'Single skill name to run (default: all configured pilot evals)' + required: false + type: string + +permissions: + contents: read + pull-requests: write + +concurrency: + group: waza-evals-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +# Note: there is no top-level skill list. The canonical list lives in +# .github/evals/manifest.yaml and is read by the `prepare` job below. + +jobs: + # --------------------------------------------------------------------------- + # preflight: verify that the COPILOT_GITHUB_TOKEN secret is configured. + # When absent, every downstream job is skipped cleanly (no red checks). The + # maintainer setup steps are in PR #109 / README. + # --------------------------------------------------------------------------- + preflight: + name: Preflight (check secrets) + runs-on: ubuntu-latest + timeout-minutes: 2 + outputs: + enabled: ${{ steps.check.outputs.enabled }} + steps: + - name: Check COPILOT_GITHUB_TOKEN availability + id: check + env: + TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + run: | + if [ -z "${TOKEN:-}" ]; then + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza skill eval jobs. See repo README / PR #109 for setup." + exit 0 + fi + # Token is set — verify it can actually read the private microsoft/waza + # repo (release downloads need access). Reject silently if 401/403/404. 
+ # Capture headers + body for diagnostics (no token is ever printed). + hdr_file=$(mktemp) + body_file=$(mktemp) + http_code=$(curl -sS -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/microsoft/waza/releases/latest || true) + if [ "${http_code}" = "200" ]; then + echo "enabled=true" >> "$GITHUB_OUTPUT" + echo "COPILOT_GITHUB_TOKEN can read microsoft/waza — eval jobs will run." + else + echo "enabled=false" >> "$GITHUB_OUTPUT" + echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza skill eval jobs." + echo "--- diagnostic: response headers (token not included) ---" + grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true + echo "--- diagnostic: response body (first 500 bytes) ---" + head -c 500 "${body_file}" || true + echo + echo "--- diagnostic: token-user identity probe ---" + user_code=$(curl -sS -o "${body_file}.user" -w "%{http_code}" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/user || true) + echo "GET /user -> HTTP ${user_code}" + if [ "${user_code}" = "200" ]; then + # Print only the login + token type, never the token itself. + jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user" + else + head -c 300 "${body_file}.user" || true + fi + echo + fi + rm -f "${hdr_file}" "${body_file}" "${body_file}.user" + + # --------------------------------------------------------------------------- + # prepare: read .github/evals/manifest.yaml and decide which skills to + # evaluate based on the diff / dispatch input. 
Outputs:
+  #   - skills: JSON array of selected skill names (drives comment ordering)
+  #   - legs: JSON array of {skill, model, baseline} for matrix.include
+  #   - baseline_models: JSON array of model names that run with --baseline
+  #   - mode/reason: human-readable scope info for the PR comment banner
+  # ---------------------------------------------------------------------------
+  prepare:
+    name: Determine matrix
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    needs: preflight
+    if: needs.preflight.outputs.enabled == 'true'
+    outputs:
+      skills: ${{ steps.select.outputs.skills }}
+      legs: ${{ steps.select.outputs.legs }}
+      baseline_models: ${{ steps.select.outputs.baseline_models }}
+      reason: ${{ steps.select.outputs.reason }}
+      mode: ${{ steps.select.outputs.mode }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          # Full history so the merge base of base/head is available below.
+          fetch-depth: 0
+
+      - name: Select skills
+        id: select
+        env:
+          REQUESTED: ${{ inputs.skill }}
+          EVENT: ${{ github.event_name }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -euo pipefail
+
+          # Source of truth for skills + tier + per-tier model fan-out.
+          manifest=".github/evals/manifest.yaml"
+          if [ ! -f "$manifest" ]; then
+            echo "::error::manifest not found: $manifest"
+            exit 1
+          fi
+
+          # Convert YAML -> JSON once; everything else is jq.
+          manifest_json="$(yq -o=json '.' "$manifest")"
+
+          ALL_SKILLS="$(echo "$manifest_json" | jq -c '[.skills[].name]')"
+          BASELINE_MODELS="$(echo "$manifest_json" | jq -c '
+            [ .tiers[].models[] | select(.baseline == true) | .name ] | unique
+          ')"
+          echo "ALL_SKILLS=$ALL_SKILLS"
+          echo "BASELINE_MODELS=$BASELINE_MODELS"
+
+          # emit <selected-json> <mode> <reason>
+          # Computes `legs` from the selected skills + manifest tiers and
+          # writes all five step outputs (skills, legs, baseline_models,
+          # mode, reason). <selected-json> must be a compact JSON array of
+          # skill names; <reason> must be a single line because it is
+          # written to $GITHUB_OUTPUT in key=value form.
+          emit() {
+            local selected="$1" mode="$2" reason="$3"
+            local legs
+            legs="$(echo "$manifest_json" | jq -c --argjson sel "$selected" '
+              . as $root
+              | [ $root.skills[]
+                  | .name as $sname
+                  | select($sel | index($sname))
+                  | .tier as $tier
+                  | $root.tiers[$tier].models[]
+                  | { skill: $sname, model: .name, baseline: (.baseline == true) }
+                ]
+            ')"
+            {
+              echo "skills=${selected}"
+              echo "legs=${legs}"
+              echo "baseline_models=${BASELINE_MODELS}"
+              echo "mode=${mode}"
+              echo "reason=${reason}"
+            } >> "$GITHUB_OUTPUT"
+            echo "Selected skills: ${selected}"
+            echo "Legs: ${legs}"
+            echo "Mode: ${mode}"
+            echo "Reason: ${reason}"
+          }
+
+          # --- Case 1: workflow_dispatch with single-skill input ---
+          if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then
+            if echo "$ALL_SKILLS" | jq -e --arg s "$REQUESTED" '. | index($s)' > /dev/null; then
+              # Build the one-element array with jq so the name is always
+              # correctly JSON-quoted, whatever characters it contains.
+              emit "$(jq -cn --arg s "$REQUESTED" '[$s]')" "single" "workflow_dispatch input ($REQUESTED)"
+              exit 0
+            else
+              echo "::error::Requested skill '$REQUESTED' is not in the manifest ($ALL_SKILLS)"
+              exit 1
+            fi
+          fi
+
+          # --- Case 2: workflow_dispatch without input → full matrix ---
+          if [ "$EVENT" = "workflow_dispatch" ]; then
+            emit "$ALL_SKILLS" "full" "workflow_dispatch (no input → full matrix)"
+            exit 0
+          fi
+
+          # --- Case 3: pull_request — diff against base ---
+          if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then
+            emit "$ALL_SKILLS" "full" "pull_request: missing base/head SHA → full matrix"
+            exit 0
+          fi
+
+          # Make sure the base commit is fetched (checkout fetched everything
+          # via fetch-depth: 0, but be defensive in case of shallow merges).
+          git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true
+
+          # Three-dot diff = changes on HEAD since its merge base with BASE,
+          # i.e. only what this PR touches. A two-dot diff would also list
+          # files that changed on the base branch after the PR branched off,
+          # pulling unrelated skills into the matrix and burning quota.
+          if ! changed="$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" 2>/dev/null)"; then
+            echo "::warning::merge-base diff failed; falling back to a two-dot diff"
+            if ! changed="$(git diff --name-only "$BASE_SHA" "$HEAD_SHA")"; then
+              # Unknown scope is not the same as "nothing changed": run
+              # everything rather than silently skipping the evals.
+              emit "$ALL_SKILLS" "full" "pull_request: could not diff base/head → full matrix"
+              exit 0
+            fi
+          fi
+          if [ -z "$changed" ]; then
+            emit "[]" "none" "no files changed in PR"
+            exit 0
+          fi
+
+          echo "--- changed files ---"
+          echo "$changed"
+          echo "---------------------"
+
+          # Project-wide config changes → full matrix.
+          # Includes the manifest itself: changing tiers / model fan-out
+          # affects every skill, so re-run everything.
+          # Here-string instead of `echo | grep -q`: with pipefail, grep -q
+          # exiting early can SIGPIPE the writer on a very large file list
+          # and turn a match into a failed pipeline.
+          if grep -qE '^(\.waza\.yaml|\.github/workflows/waza-evals\.yml|\.github/evals/manifest\.yaml)$' <<<"$changed"; then
+            emit "$ALL_SKILLS" "full" "project-wide config change (.waza.yaml, manifest, or workflow file) → full matrix"
+            exit 0
+          fi
+
+          # Per-skill changes: collect skill names from both layouts.
+          #   .github/skills/<skill>/...  → SKILL.md, references, etc.
+          #   .github/evals/<skill>/...   → eval.yaml, tasks, fixtures.
+          # NF >= 4 filter excludes files at the root of evals/ (like
+          # manifest.yaml) which are handled by the config-wide check above.
+          changed_skills=$(
+            echo "$changed" | awk -F/ '
+              /^\.github\/skills\// && NF >= 4 {print $3}
+              /^\.github\/evals\// && NF >= 4 {print $3}
+            ' | sort -u
+          )
+
+          if [ -z "$changed_skills" ]; then
+            emit "[]" "none" "no per-skill files changed"
+            exit 0
+          fi
+
+          # Intersect with the canonical configured list.
+          selected=$(
+            printf '%s\n' "$changed_skills" \
+              | jq -R -s -c --argjson all "$ALL_SKILLS" \
+                  '[ split("\n")[] | select(length > 0) | select(IN($all[])) ]'
+          )
+
+          if [ "$selected" = "[]" ]; then
+            emit "[]" "none" "changed skill(s) not in the manifest: $(echo "$changed_skills" | tr '\n' ' ')"
+            exit 0
+          fi
+
+          count=$(echo "$selected" | jq 'length')
+          names=$(echo "$selected" | jq -r 'join(", ")')
+          emit "$selected" "subset" "diff-scoped: ${count} changed skill(s) — ${names}"
+
+  # ---------------------------------------------------------------------------
+  # tokens: compare token counts across all SKILL.md files vs main.
+  # Runs once (not per-matrix) and uploads a single JSON artifact consumed
+  # by the comment job. Advisory — never fails the workflow.
+ # --------------------------------------------------------------------------- + tokens: + name: Token comparison vs main (advisory) + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: preflight + if: needs.preflight.outputs.enabled == 'true' + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (latest GitHub release) + run: | + set -euo pipefail + waza_version="$(curl -fsSL \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + https://api.github.com/repos/microsoft/waza/releases/latest \ + | jq -r '.tag_name')" + if [ -z "${waza_version}" ] || [ "${waza_version}" = "null" ]; then + echo "::error::could not resolve latest waza release tag" + exit 1 + fi + echo "Installing waza ${waza_version}" + + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Token comparison vs main (advisory) + id: tokens-compare + run: | + set -uo pipefail + mkdir -p .waza-results + # Advisory: no --strict so the step never fails the workflow. + # --format json produces machine-readable output for the comment job. + waza tokens compare main --skills --threshold 10 --format json \ + > .waza-results/tokens-compare.json 2>&1 || true + echo "--- token comparison output ---" + cat .waza-results/tokens-compare.json || true + # Always exit cleanly — advisory only. 
+ exit 0 + + - name: Upload token comparison artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-tokens-compare + path: .waza-results/tokens-compare.json + retention-days: 14 + if-no-files-found: warn + include-hidden-files: true + + # --------------------------------------------------------------------------- + # eval: matrix (skill x model). Each leg runs the eval suite plus per-skill + # signal steps (tokens profile, quality, check). All steps are advisory. + # + # The skill axis is supplied by the `prepare` job — only changed skills run + # on per-PR events; the full list runs on workflow_dispatch (no input) and + # on PRs that touch project-wide config. + # --------------------------------------------------------------------------- + eval: + name: "${{ matrix.skill || 'eval' }} / ${{ matrix.model || 'skipped (no skill changes)' }}" + needs: [preflight, prepare] + if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != '' + runs-on: ubuntu-latest + timeout-minutes: 25 + # Per-job continue-on-error so a single matrix leg failure doesn't fail + # the whole workflow. Combined with `if: always()` on the comment job, + # this guarantees the PR comment is posted even when some legs fail. + continue-on-error: true + strategy: + fail-fast: false + matrix: + # Matrix is sourced entirely from the manifest via the prepare job. + # Each include entry is `{ skill, model, baseline }`. Adding a skill + # or promoting a tier means editing .github/evals/manifest.yaml — + # never this workflow. + include: ${{ fromJSON(needs.prepare.outputs.legs) }} + env: + # copilot-sdk authenticates with this token. Default GITHUB_TOKEN does + # not carry Copilot scope, so we use a dedicated PAT in repo secrets. + # Also reused for the release-API lookup (only needs public-repo read). 
+ GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + WAZA_NO_UPDATE_CHECK: '1' + + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install waza (latest GitHub release) + run: | + set -euo pipefail + waza_version="$(curl -fsSL \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + https://api.github.com/repos/microsoft/waza/releases/latest \ + | jq -r '.tag_name')" + if [ -z "${waza_version}" ] || [ "${waza_version}" = "null" ]; then + echo "::error::could not resolve latest waza release tag" + exit 1 + fi + echo "Installing waza ${waza_version}" + + os="$(uname -s | tr '[:upper:]' '[:lower:]')" + arch="$(uname -m)" + case "${arch}" in + x86_64|amd64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + esac + asset="waza-${os}-${arch}" + base="https://github.com/microsoft/waza/releases/download/${waza_version}" + tmp="$(mktemp -d)" + + curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}" + curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt" + ( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - ) + sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza + rm -rf "${tmp}" + waza --version + + - name: Run waza eval (advisory) + id: run + run: | + # GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT + # disable -e, so a non-zero exit from `waza run` (e.g. metric below + # threshold) kills the script before `rc=$?` runs. Explicitly + # disable errexit so we can capture the code and surface it in the + # PR comment instead of failing the leg silently. + set +e + set -uo pipefail + mkdir -p .waza-results + spec=".github/evals/${{ matrix.skill }}/eval.yaml" + # Slug used for filenames + artifact suffix; harmless when the + # model has dots (gpt-5.4) since GH Actions allows them. 
+ slug="${{ matrix.skill }}-${{ matrix.model }}" + + # gpt-5.4 (and any other model flagged `baseline: true` in the + # manifest) runs in --baseline (A/B) mode to cap quota cost while + # still providing a reference point for cross-model comparison. All + # other models run standard. The PR comment labels each leg. + extra_flags="" + if [ "${{ matrix.baseline }}" = "true" ]; then + extra_flags="--baseline" + fi + + # ---- Retry-on-session-not-found wrapper ----------------------------- + # The Copilot SDK occasionally drops the agent's session before + # waza's `prompt` grader can resume it (`continue_session: true`), + # producing JSON-RPC -32603 "Session not found" errors. When this + # fires, the run is marked status=error with `validations: null` — + # ALL graders' verdicts for that task are wiped, dragging the leg + # aggregate down ~50–80% even when the agent's actual response was + # correct. + # + # The error is purely transient (server-side session GC). Retrying + # the leg with a fresh waza process consistently recovers. We retry + # up to 2 times (3 total attempts) on session-not-found ONLY — other + # errors (rate-limit 429, below-threshold scores, network) are NOT + # retried since they have different recovery characteristics. + # + # Stdout (--format github-comment) is the markdown for the PR + # comment; capture it cleanly to its own file. Stderr (progress, + # task results, "Running benchmark:") streams to the runner log. + # --model overrides the spec's config.model so we can fan out the + # same eval suite across multiple models. + # --judge-model decouples the LLM-as-judge from the executor model + # so quality scores are always judged by claude-sonnet-4.6. + # --suggest --recommend appends outcome-tied recommendations. 
+ max_attempts=3 + attempt=0 + rc=0 + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + echo "::group::waza run attempt ${attempt}/${max_attempts} for ${slug}" + rc=0 + # shellcheck disable=SC2086 + waza run "${spec}" \ + --model "${{ matrix.model }}" \ + --judge-model "claude-sonnet-4.6" \ + --suggest \ + --recommend \ + ${extra_flags} \ + --format "github-comment" \ + --output ".waza-results/${slug}.json" \ + --reporter "junit:.waza-results/${slug}.junit.xml" \ + --parallel \ + > ".waza-results/${slug}.md" + rc=$? + echo "::endgroup::" + + if [ ! -f ".waza-results/${slug}.json" ]; then + echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})" + if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi + break + fi + + session_errs=$(jq -r ' + [.tasks[]?.runs[]? | select(.error_msg // "" | contains("Session not found"))] | length + ' ".waza-results/${slug}.json" 2>/dev/null || echo 0) + + if [ "${session_errs}" = "0" ]; then + echo "::notice::${slug} attempt ${attempt} clean (no session-not-found errors)" + break + fi + + echo "::warning::${slug} attempt ${attempt} hit ${session_errs} session-not-found error(s)" + if [ $attempt -lt $max_attempts ]; then + # Discard partial artifacts so the next attempt is independent. + rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.md" ".waza-results/${slug}.junit.xml" + sleep 5 + fi + done + # If retries exhausted and the artifact STILL has session-not-found + # errors, the data is corrupt (validations: null on affected runs + # would drag the leg aggregate down 50–80% as a fake "low score"). + # Discard it so the PR comment surfaces this as INFRA_FAILED rather + # than a misleading low score. + final_session_errs=0 + if [ -f ".waza-results/${slug}.json" ]; then + final_session_errs=$(jq -r ' + [.tasks[]?.runs[]? 
| select(.error_msg // "" | contains("Session not found"))] | length + ' ".waza-results/${slug}.json" 2>/dev/null || echo 0) + fi + if [ "${final_session_errs}" != "0" ]; then + echo "::error::${slug} still has ${final_session_errs} session-not-found error(s) after ${max_attempts} attempts — discarding corrupt artifact" + printf 'session_not_found_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \ + "${final_session_errs}" "${max_attempts}" "${rc}" \ + > ".waza-results/${slug}.infra-failed" + rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.junit.xml" + # Replace the markdown summary with a clear INFRA_FAILED notice + # so the PR comment shows the actual problem instead of stale + # markdown from one of the failed attempts. Use printf (no heredoc) + # because heredoc EOF terminators clash with YAML block-scalar + # indentation rules in `run: |` steps. + { + printf '### `%s` — INFRA_FAILED\n\n' "${slug}" + printf 'waza run hit `%s` `Session not found` JSON-RPC error(s) ' "${final_session_errs}" + printf 'from the Copilot SDK after **%s attempt(s)**. ' "${max_attempts}" + printf 'The session-resume path used by `prompt` graders with ' + printf '`continue_session: true` is intermittently flaky in CI; ' + printf 'retries did not recover. **No score is reported for this leg** ' + printf '— treating a corrupted run as a low score would be misleading.\n' + } > ".waza-results/${slug}.md" + fi + # ---- end retry wrapper ---------------------------------------------- + + echo "exit_code=${rc}" >> "$GITHUB_OUTPUT" + echo + echo "--- captured PR-comment markdown ---" + cat ".waza-results/${slug}.md" || true + # Never fail the step itself — surface the code in the comment. 
+ exit 0 + + - name: Tokens profile (advisory) + id: tokens-profile + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + waza tokens profile ".github/skills/${{ matrix.skill }}" \ + > ".waza-results/${slug}-tokens-profile.txt" 2>&1 || true + cat ".waza-results/${slug}-tokens-profile.txt" || true + exit 0 + + - name: Quality signal (advisory) + id: quality + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + # --judge-model omitted: this step uses the project default judge model + # (claude-sonnet-4.6 from .waza.yaml) for consistent quality scoring + # regardless of which executor model is running in this matrix leg. + waza quality ".github/skills/${{ matrix.skill }}" --format table \ + > ".waza-results/${slug}-quality.txt" 2>&1 || true + cat ".waza-results/${slug}-quality.txt" || true + exit 0 + + - name: Compliance check (advisory) + id: check + continue-on-error: true + run: | + set -uo pipefail + slug="${{ matrix.skill }}-${{ matrix.model }}" + mkdir -p .waza-results + waza check ".github/skills/${{ matrix.skill }}" \ + > ".waza-results/${slug}-check.txt" 2>&1 || true + cat ".waza-results/${slug}-check.txt" || true + exit 0 + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v7 + with: + name: waza-results-${{ matrix.skill }}-${{ matrix.model }} + path: .waza-results/ + retention-days: 14 + if-no-files-found: warn + # `.waza-results/` starts with a dot, and upload-artifact treats + # any path segment starting with `.` as hidden by default. Without + # this, the artifact is silently empty. + include-hidden-files: true + + # --------------------------------------------------------------------------- + # comment: fan-in. Downloads all artifacts and posts one aggregated comment. 
+  # ---------------------------------------------------------------------------
+  comment:
+    name: Post advisory comment on PR
+    needs: [preflight, prepare, eval, tokens]
+    if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always()
+    runs-on: ubuntu-latest
+    # Bounded like every other job in this workflow.
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - name: Download all eval artifacts
+        uses: actions/download-artifact@v8
+        with:
+          path: artifacts
+          pattern: waza-results-*
+          merge-multiple: false
+        # The eval job is skipped when no skills changed, so there may be
+        # nothing to download. The comment must still be posted.
+        continue-on-error: true
+
+      - name: Download token comparison artifact
+        uses: actions/download-artifact@v8
+        with:
+          name: waza-tokens-compare
+          path: artifacts/waza-tokens-compare
+        continue-on-error: true
+
+      - name: Aggregate and post comment
+        uses: actions/github-script@v9
+        env:
+          PREPARE_MODE: ${{ needs.prepare.outputs.mode }}
+          PREPARE_REASON: ${{ needs.prepare.outputs.reason }}
+          PREPARE_SKILLS: ${{ needs.prepare.outputs.skills }}
+          PREPARE_BASELINES: ${{ needs.prepare.outputs.baseline_models }}
+        with:
+          # Default GITHUB_TOKEN — has `pull-requests: write` and is the
+          # right identity for bot-style comments.
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Hidden HTML marker identifying the one comment this workflow
+            // owns. It is written as the first line of the body and searched
+            // for on later runs so the comment is updated in place. It must
+            // be non-empty: `includes('')` is true for every comment and
+            // would overwrite an unrelated one.
+            // NOTE(review): keep this value stable and distinct from the
+            // marker used by the agent-evals workflow — confirm against it.
+            const MARKER = '<!-- waza-evals -->';
+
+            // Each matrix job uploads `waza-results-<skill>-<model>`
+            // containing per-leg files (slug.md, slug-tokens-profile.txt,
+            // slug-quality.txt, slug-check.txt).
+            //
+            // Skill ordering and baseline-model classification are sourced
+            // from .github/evals/manifest.yaml via the prepare job — no
+            // hardcoded lists in this workflow.
+            const skills = JSON.parse(process.env.PREPARE_SKILLS || '[]');
+            const baselineModels = new Set(
+              JSON.parse(process.env.PREPARE_BASELINES || '[]')
+            );
+            const root = 'artifacts';
+            const allDirs = fs.existsSync(root)
+              ? fs.readdirSync(root)
+                  .filter((d) => d.startsWith('waza-results-'))
+                  .sort()
+              : [];
+
+            // Helper: read a file, return trimmed content or fallback string.
+            // Logs a debug note when returning the fallback so missing artifacts
+            // are visible in the Actions log without failing the step.
+            function readArtifact(filePath, fallback) {
+              if (fs.existsSync(filePath)) {
+                const c = fs.readFileSync(filePath, 'utf8').trim();
+                if (c) return c;
+                core.debug(`readArtifact: file exists but is empty — ${filePath}`);
+              } else {
+                core.debug(`readArtifact: file not found — ${filePath}`);
+              }
+              return fallback;
+            }
+
+            // Helper: wrap content in a <details> block if it has more than
+            // `threshold` lines (default 50); otherwise render it inline
+            // under a bold title.
+            function maybeCollapse(summary, content, threshold = 50) {
+              const lines = content.split('\n').length;
+              if (lines > threshold) {
+                return `<details>\n<summary>${summary} (${lines} lines — click to expand)</summary>\n\n${content}\n\n</details>`;
+              }
+              return `**${summary}**\n\n${content}`;
+            }
+
+            // Group artifacts by skill. Artifact names are
+            // `waza-results-<skill>-<model>` and both parts may contain
+            // dashes, so try the longest skill name first: otherwise a skill
+            // whose name is a prefix of another (`foo` vs `foo-bar`) would
+            // claim the longer skill's artifacts.
+            const skillsLongestFirst = [...skills].sort((a, b) => b.length - a.length);
+            const bySkill = new Map();
+            for (const d of allDirs) {
+              const rest = d.replace(/^waza-results-/, '');
+              const skill = skillsLongestFirst.find(
+                (s) => rest === s || rest.startsWith(s + '-')
+              );
+              if (!skill) continue;
+              const model = rest === skill ? '(default)' : rest.slice(skill.length + 1);
+              if (!bySkill.has(skill)) bySkill.set(skill, []);
+              bySkill.get(skill).push({ model, dir: d, slug: rest });
+            }
+
+            // Token comparison section (top-level, from tokens job).
+            let tokenCompareSection = '';
+            const tcPath = path.join(root, 'waza-tokens-compare', 'tokens-compare.json');
+            const tcRaw = readArtifact(tcPath, '');
+            if (tcRaw) {
+              const tcBlock = '```json\n' + tcRaw + '\n```';
+              tokenCompareSection = [
+                '<details>',
+                '<summary>📊 Token comparison vs main (advisory)</summary>',
+                '',
+                tcBlock,
+                '',
+                '</details>',
+                '',
+              ].join('\n');
+            }
+
+            // Build per-skill sections (manifest order, as given by `skills`).
+            const sections = [];
+            for (const skill of skills) {
+              if (!bySkill.has(skill)) continue;
+              const legs = bySkill.get(skill).sort((a, b) => a.model.localeCompare(b.model));
+
+              // Score (per model) + Suggestions/Recommendations
+              const scoreParts = [];
+              for (const leg of legs) {
+                const isBaseline = baselineModels.has(leg.model);
+                const modelLabel = isBaseline
+                  ? leg.model + ' *(baseline — A/B mode)*'
+                  : leg.model;
+                const mdPath = path.join(root, leg.dir, leg.slug + '.md');
+                const body = readArtifact(mdPath,
+                  '_No output captured. See workflow logs and the `' + leg.dir + '` artifact._');
+                scoreParts.push('<details>\n<summary>Model: ' + modelLabel +
+                  '</summary>\n\n' + body + '\n\n</details>');
+              }
+              const scoreSection = '<details>\n<summary>📈 Score (per model) + Suggestions/Recommendations</summary>\n\n' +
+                scoreParts.join('\n\n') + '\n\n</details>';
+
+              // Tokens (count + profile) — model-independent, use first available leg.
+              let tokenBody = '_Not available._';
+              for (const leg of legs) {
+                const tp = path.join(root, leg.dir, leg.slug + '-tokens-profile.txt');
+                const c = readArtifact(tp, '');
+                if (c) { tokenBody = '```\n' + c + '\n```'; break; }
+              }
+              const tokenSection = maybeCollapse('🔢 Tokens (count + profile)', tokenBody);
+
+              // Quality (5-dim table) — model-independent, use first available leg.
+              let qualityBody = '_Not available._';
+              for (const leg of legs) {
+                const qp = path.join(root, leg.dir, leg.slug + '-quality.txt');
+                const c = readArtifact(qp, '');
+                if (c) { qualityBody = '```\n' + c + '\n```'; break; }
+              }
+              const qualitySection = maybeCollapse('🎯 Quality (5-dim table)', qualityBody);
+
+              // Check (compliance summary) — model-independent, use first available leg.
+              let checkBody = '_Not available._';
+              for (const leg of legs) {
+                const cp = path.join(root, leg.dir, leg.slug + '-check.txt');
+                const c = readArtifact(cp, '');
+                if (c) { checkBody = '```\n' + c + '\n```'; break; }
+              }
+              // `waza check` expects `eval.yaml` colocated with `SKILL.md`. This
+              // repo separates them (`.github/skills/<skill>/SKILL.md` vs
+              // `.github/evals/<skill>/eval.yaml`), so the "Evaluation Suite:
+              // Not Found" line is a false negative — the eval actually ran
+              // (see the "Score" section above). Prepend a note so reviewers
+              // are not misled.
+              const checkNote =
+                '> ℹ️ **`waza check` expects `eval.yaml` colocated with `SKILL.md`.** ' +
+                'This repo separates them into `.github/evals/' + skill + '/eval.yaml`, ' +
+                'so the "Evaluation Suite: Not Found" line below is a false negative — ' +
+                'the eval actually ran (see the **Score** section above).\n\n';
+              const checkSection = maybeCollapse('✅ Check (compliance summary)', checkNote + checkBody);
+
+              sections.push([
+                '### Skill: `' + skill + '`',
+                '',
+                scoreSection,
+                '',
+                tokenSection,
+                '',
+                qualitySection,
+                '',
+                checkSection,
+              ].join('\n'));
+            }
+
+            const totalLegs = allDirs.length;
+
+            // Selection-mode banner from the prepare job.
+            const prepareMode = (process.env.PREPARE_MODE || '').trim();
+            const prepareReason = (process.env.PREPARE_REASON || '').trim();
+            let scopeBanner = '';
+            if (prepareMode === 'none') {
+              // The token comparison is rendered after the header, hence "below".
+              scopeBanner =
+                '> ℹ️ **No skills evaluated.** ' + (prepareReason || 'No relevant changes detected.') +
+                ' The token comparison below (if any) is the only signal for this PR.';
+            } else if (prepareMode === 'subset') {
+              scopeBanner =
+                '> 🎯 **Diff-scoped run.** ' + (prepareReason || 'Only changed skills evaluated.') +
+                ' Touch `.waza.yaml` or trigger `workflow_dispatch` to run the full matrix.';
+            } else if (prepareMode === 'single') {
+              scopeBanner =
+                '> 🎯 **Single-skill run.** ' + (prepareReason || 'workflow_dispatch input.');
+            } else if (prepareMode === 'full') {
+              scopeBanner =
+                '> 🔁 **Full matrix run.** ' + (prepareReason || 'All configured skills evaluated.');
+            }
+
+            const header = [
+              MARKER,
+              '## 🧪 Waza skill evals (advisory)',
+              '',
+              scopeBanner,
+              scopeBanner ? '' : null,
+              'Ran ' + totalLegs + ' matrix leg' + (totalLegs === 1 ? '' : 's') +
+                ' in parallel (skills × models). Results are non-blocking — investigate failures via the workflow logs and the per-leg `waza-results-*` artifacts.',
+              '',
+              '> **Legend:** Models flagged `baseline: true` in `.github/evals/manifest.yaml` (currently: `' +
+                (Array.from(baselineModels).join('`, `') || 'none') +
+                '`) run with `--baseline` (A/B mode) to cap quota. All other models run standard. Judge model is fixed at `claude-sonnet-4.6` across all legs.',
+              '',
+            ].filter((line) => line !== null).join('\n');
+
+            // Assemble body. Each major block is separated by a blank line so
+            // that GitHub Flavored Markdown correctly recognizes the per-skill
+            // `### Skill: ...` headings (without a blank line after the
+            // preceding `</details>` they get rendered as plain text).
+            // When prepare deliberately selected nothing, the banner already
+            // explains why, so the "no artifacts" fallback (which reads like
+            // a failure) is omitted.
+            let sectionsBlock = '';
+            if (sections.length > 0) {
+              sectionsBlock = sections.join('\n\n---\n\n');
+            } else if (prepareMode !== 'none') {
+              sectionsBlock = '_No artifacts produced. See workflow logs._';
+            }
+            const body = [
+              header.replace(/\s+$/, ''),
+              tokenCompareSection.replace(/\s+$/, ''),
+              sectionsBlock,
+            ].filter((s) => s.length > 0).join('\n\n') + '\n';
+
+            const { owner, repo } = context.repo;
+            const issue_number = context.payload.pull_request.number;
+
+            // Paginate to find our marker comment — listComments defaults to
+            // 30 per page and our comment may be beyond that on busy PRs.
+            let existing = null;
+            for await (const response of github.paginate.iterator(
+              github.rest.issues.listComments,
+              { owner, repo, issue_number, per_page: 100 }
+            )) {
+              const found = response.data.find((c) => c.body && c.body.includes(MARKER));
+              if (found) { existing = found; break; }
+            }
+
+            if (existing) {
+              await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
+            } else {
+              await github.rest.issues.createComment({ owner, repo, issue_number, body });
+            }
+
+```
+
+
diff --git a/website/sidebars.ts b/website/sidebars.ts index 5dffaa2..62e0e9a 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -60,6 +60,18 @@ const sidebars: SidebarsConfig = { {type: 'autogenerated', dirName: 'skills'}, ], }, + { + type: 'category', + label: 'Authoring', + collapsed: false, + items: [ + 'authoring/overview', + 'authoring/skills', + 'authoring/agents', + 'authoring/evals', + 'authoring/prompts', + ], + }, { type: 'category', label: 'CI/CD Workflows',