From 10fef041ee4b5a7d910c5927d92e443be92c005e Mon Sep 17 00:00:00 2001 From: Aaron Goldsmith Date: Sat, 21 Mar 2026 09:26:15 -0700 Subject: [PATCH 1/3] Add /mobius-evolve skill for free Opus-powered agent evolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evaluator-optimizer loop that uses Claude Code Opus (free via Pro) to analyze judge feedback and refine underperforming agents' system prompts — same quality as the API evolve command at zero cost. Includes load_underperformers.py helper script to surface agents with low win rates and their loss feedback. Co-Authored-By: Claude Opus 4.6 --- .claude/skills/mobius-evolve/SKILL.md | 124 +++++++++++++++++ .../scripts/load_underperformers.py | 131 ++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 .claude/skills/mobius-evolve/SKILL.md create mode 100644 .claude/skills/mobius-evolve/scripts/load_underperformers.py diff --git a/.claude/skills/mobius-evolve/SKILL.md b/.claude/skills/mobius-evolve/SKILL.md new file mode 100644 index 0000000..a4bb174 --- /dev/null +++ b/.claude/skills/mobius-evolve/SKILL.md @@ -0,0 +1,124 @@ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 + + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 + +diff --git a/.claude/skills/mobius-evolve/SKILL.md b/.claude/skills/mobius-evolve/SKILL.md +new file mode 100644 +index 0000000..6fbbeb9 +--- /dev/null ++++ b/.claude/skills/mobius-evolve/SKILL.md +@@ -0,0 +1,101 @@ ++--- ++name: mobius-evolve ++description: Use when the user says "evolve", "mobius evolve", "improve agents", or wants to refine underperforming agents without API costs. ++user-invocable: true ++argument-hint: "[specialization] [--threshold 0.4]" ++--- ++ ++# Mobius Evolve (Local Opus — Evaluator-Optimizer Loop) ++ ++**You ARE the evaluator-optimizer.** You are Claude Opus running locally — this costs $0 in API calls. Instead of calling the API to refine agents, YOU analyze the judge feedback, critique the current system prompt, and craft an improved version using the agentic-eval reflection pattern. ++ ++## Why this matters ++ ++The `mobius evolve` CLI command calls the Opus API to refine agents ($$$). But you (Claude Code Opus) are already running and are the same model. So: ++- **This skill** = Opus refining agents for FREE (Pro subscription) with multi-pass self-critique ++- **mobius evolve** = Opus refining agents for $$$ (API calls), single or multi-pass ++- Same quality, zero cost, and you have full conversation context. ++ ++## What to do ++ ++### Step 1: Load underperformers ++ ++```bash ++python .claude/skills/mobius-evolve/scripts/load_underperformers.py [specialization] [--threshold 0.4] ++``` ++ ++This shows agents with low win rates, their current system prompts, and judge feedback from their losses. ++ ++### Step 2: Analyze and refine (YOU are the evaluator-optimizer loop) ++ ++For each underperformer, apply the **reflection pattern**: ++ ++**Evaluate:** Read the agent's current system prompt and the judge feedback. Identify the specific weaknesses the judges called out. ++ ++**Critique:** Ask yourself: ++- What specific failure patterns does the feedback reveal? ++- Is the system prompt too generic? Too narrow? Missing edge cases? ++- Does it lack clear quality criteria or output format expectations? ++- Would a different problem-solving approach help? ++ ++**Refine:** Write an improved system prompt that directly addresses each criticism. Be substantive — cosmetic rewording doesn't help. ++ ++**Self-check:** Before registering, verify your refinement: ++- Does it address EVERY piece of judge feedback? ++- Is it specific and opinionated (not generic)? ++- Does it include quality criteria, methodology, and output format? ++- Is it meaningfully different from the original, not just reworded? ++ ++If your self-check fails, iterate — refine again before registering. ++ ++### Step 3: Register the improved agent ++ ++Use the create_agent script to register the evolved version: ++ ++```bash ++python .claude/skills/mobius-seed/scripts/create_agent.py '{ ++ "name": "Agent Name v2", ++ "slug": "original-slug-v2", ++ "description": "Updated description", ++ "system_prompt": "Your improved system prompt here...", ++ "provider": "anthropic", ++ "model": "claude-haiku-4-5-20251001", ++ "tools": ["Read", "Grep", "Glob", "Bash", "Write", "Edit"], ++ "specializations": ["coding", "refactoring"], ++ "is_champion": false ++}' ++``` ++ ++Important: ++- Set `is_champion` to `false` — the evolved agent must earn its rank through competition ++- Keep the same provider/model unless there's a strong reason to change ++- Use a slug like `original-slug-v2` or `original-slug-gen2` to show lineage ++- Keep the same specializations ++ ++### Step 4: Show results ++ ++```bash ++python -m mobius.cli agent list ++``` ++ ++## The Agentic-Eval Pattern You're Running ++ ++``` ++Load underperformers → Read judge feedback → Critique prompt ++ ↑ ↓ ++ └──── Self-check fails? ← Refine prompt ←───┘ ++ ↓ ++ Self-check passes ++ ↓ ++ Register agent ++``` ++ ++This is the Evaluator-Optimizer pattern from agentic-eval, with YOU as both evaluator and optimizer. The key advantage over the CLI version: you can reason about the feedback in full context, consider the agent's match history, and make nuanced improvements that a single API call might miss. ++ ++## Pro Tips ++ ++- **Don't just reword** — change the agent's methodology, add specific techniques, restructure the approach ++- **Study the winners** — if the judge praised a winning agent's approach, consider incorporating similar strategies ++- **Vary approaches** — if an agent keeps losing with approach X, try a fundamentally different approach Y ++- **Keep prompts focused** — under 2000 tokens but dense with specific guidance ++- **Consider the model** — Haiku benefits from very explicit instructions; Gemini Flash may need different prompt styles diff --git a/.claude/skills/mobius-evolve/scripts/load_underperformers.py b/.claude/skills/mobius-evolve/scripts/load_underperformers.py new file mode 100644 index 0000000..7ff6197 --- /dev/null +++ b/.claude/skills/mobius-evolve/scripts/load_underperformers.py @@ -0,0 +1,131 @@ +commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad +Author: Aaron Goldsmith +Date: Sat Mar 21 09:13:05 2026 -0700 + + Add agentic competition tasks, agent definitions, and skills + + - Agent definitions: competition-tasks, depth-test, tree-solver + - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) + - Competition tasks: standard + agentic (tool-heavy, multi-tier) + - Cleanup script for dead-weight agents + - Fix hardcoded paths in agentic tasks to use relative paths + - Make system monitoring task cross-platform (Unix tools) + - Remove unused import in cleanup_agents.py + - Add .tree-workspace/ to .gitignore + + Co-Authored-By: Claude Opus 4.6 + +diff --git a/.claude/skills/mobius-evolve/scripts/load_underperformers.py b/.claude/skills/mobius-evolve/scripts/load_underperformers.py +new file mode 100644 +index 0000000..797ee6d +--- /dev/null ++++ b/.claude/skills/mobius-evolve/scripts/load_underperformers.py +@@ -0,0 +1,108 @@ ++"""Load underperforming agents with their loss feedback for evolution. ++ ++Usage: ++ python load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3] ++ ++Outputs agent details, win rates, and judge feedback from their losses ++so that Opus can craft improved system prompts. ++""" ++ ++import json ++import sys ++ ++sys.path.insert(0, "src") ++ ++from mobius.config import get_config ++from mobius.db import init_db, row_to_dict ++from mobius.registry import Registry ++from mobius.tournament import Tournament ++ ++ ++def main(): ++ args = sys.argv[1:] ++ ++ # Parse flags ++ specialization = None ++ threshold = 0.4 ++ min_matches = 3 ++ ++ i = 0 ++ while i < len(args): ++ if args[i] == "--threshold" and i + 1 < len(args): ++ threshold = float(args[i + 1]) ++ i += 2 ++ elif args[i] == "--min-matches" and i + 1 < len(args): ++ min_matches = int(args[i + 1]) ++ i += 2 ++ elif not args[i].startswith("--"): ++ specialization = args[i] ++ i += 1 ++ else: ++ i += 1 ++ ++ config = get_config() ++ conn, _ = init_db(config) ++ registry = Registry(conn, config) ++ tournament = Tournament(conn, config, registry) ++ ++ agents = registry.list_agents(specialization=specialization) ++ if not agents: ++ print(f"No agents found{' for ' + specialization if specialization else ''}.") ++ sys.exit(1) ++ ++ underperformers = [] ++ for agent in agents: ++ if agent.total_matches < min_matches: ++ continue ++ win_rate = tournament.get_agent_recent_win_rate( ++ agent.id, window=config.underperformer_window ++ ) ++ if win_rate < threshold: ++ underperformers.append((agent, win_rate)) ++ ++ if not underperformers: ++ print(f"No underperformers below {threshold:.0%} win rate (min {min_matches} matches).") ++ print("\nAll agents:") ++ for agent in agents: ++ wr = tournament.get_agent_recent_win_rate(agent.id, window=config.underperformer_window) ++ print(f" {agent.name} ({agent.slug}) — {wr:.0%} win rate, {agent.total_matches} matches") ++ sys.exit(0) ++ ++ print(f"UNDERPERFORMERS (below {threshold:.0%} win rate, min {min_matches} matches)") ++ print(f"Found: {len(underperformers)}") ++ print() ++ ++ for agent, win_rate in underperformers: ++ matches = tournament.get_agent_matches(agent.id, limit=10) ++ losses = [m for m in matches if m.winner_id != agent.id and not m.voided] ++ ++ print(f"{'='*70}") ++ print(f"AGENT: {agent.name}") ++ print(f" Slug: {agent.slug}") ++ print(f" ID: {agent.id}") ++ print(f" Provider: {agent.provider}/{agent.model}") ++ print(f" Win Rate: {win_rate:.0%} (Elo: {agent.elo_rating:.0f})") ++ print(f" Generation: {agent.generation}") ++ print(f" Specializations: {', '.join(agent.specializations)}") ++ print(f" Losses: {len(losses)}") ++ print() ++ print(" CURRENT SYSTEM PROMPT:") ++ print(f" {agent.system_prompt}") ++ print() ++ ++ if losses: ++ print(" JUDGE FEEDBACK FROM LOSSES:") ++ for j, m in enumerate(losses[:5], 1): ++ print(f" --- Loss {j} ---") ++ print(f" Task: {m.task_description[:150]}") ++ if m.judge_reasoning: ++ print(f" Judge: {m.judge_reasoning[:300]}") ++ print() ++ ++ print() ++ ++ conn.close() ++ ++ ++if __name__ == "__main__": ++ main() From 4a11c4f51e01d814165459fffae0f8da94758bde Mon Sep 17 00:00:00 2001 From: Aaron Goldsmith Date: Sat, 21 Mar 2026 10:07:55 -0700 Subject: [PATCH 2/3] Fix broken files (strip git metadata), clean imports, filter unjudged matches - Strip raw `git show` output (commit metadata, diff headers, leading +/- chars) from SKILL.md and load_underperformers.py so they parse correctly - Remove unused `json` and `row_to_dict` imports from load_underperformers.py - Filter out unjudged matches (winner_id is None) from loss counting - Update SKILL.md argument-hint to include --min-matches Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/skills/mobius-evolve/SKILL.md | 225 ++++++++--------- .../scripts/load_underperformers.py | 238 ++++++++---------- 2 files changed, 208 insertions(+), 255 deletions(-) diff --git a/.claude/skills/mobius-evolve/SKILL.md b/.claude/skills/mobius-evolve/SKILL.md index a4bb174..3761dec 100644 --- a/.claude/skills/mobius-evolve/SKILL.md +++ b/.claude/skills/mobius-evolve/SKILL.md @@ -1,124 +1,101 @@ -commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad -Author: Aaron Goldsmith -Date: Sat Mar 21 09:13:05 2026 -0700 - - Add agentic competition tasks, agent definitions, and skills - - - Agent definitions: competition-tasks, depth-test, tree-solver - - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) - - Competition tasks: standard + agentic (tool-heavy, multi-tier) - - Cleanup script for dead-weight agents - - Fix hardcoded paths in agentic tasks to use relative paths - - Make system monitoring task cross-platform (Unix tools) - - Remove unused import in cleanup_agents.py - - Add .tree-workspace/ to .gitignore - - Co-Authored-By: Claude Opus 4.6 - -diff --git a/.claude/skills/mobius-evolve/SKILL.md b/.claude/skills/mobius-evolve/SKILL.md -new file mode 100644 -index 0000000..6fbbeb9 ---- /dev/null -+++ b/.claude/skills/mobius-evolve/SKILL.md -@@ -0,0 +1,101 @@ -+--- -+name: mobius-evolve -+description: Use when the user says "evolve", "mobius evolve", "improve agents", or wants to refine underperforming agents without API costs. -+user-invocable: true -+argument-hint: "[specialization] [--threshold 0.4]" -+--- -+ -+# Mobius Evolve (Local Opus — Evaluator-Optimizer Loop) -+ -+**You ARE the evaluator-optimizer.** You are Claude Opus running locally — this costs $0 in API calls. Instead of calling the API to refine agents, YOU analyze the judge feedback, critique the current system prompt, and craft an improved version using the agentic-eval reflection pattern. -+ -+## Why this matters -+ -+The `mobius evolve` CLI command calls the Opus API to refine agents ($$$). But you (Claude Code Opus) are already running and are the same model. So: -+- **This skill** = Opus refining agents for FREE (Pro subscription) with multi-pass self-critique -+- **mobius evolve** = Opus refining agents for $$$ (API calls), single or multi-pass -+- Same quality, zero cost, and you have full conversation context. -+ -+## What to do -+ -+### Step 1: Load underperformers -+ -+```bash -+python .claude/skills/mobius-evolve/scripts/load_underperformers.py [specialization] [--threshold 0.4] -+``` -+ -+This shows agents with low win rates, their current system prompts, and judge feedback from their losses. -+ -+### Step 2: Analyze and refine (YOU are the evaluator-optimizer loop) -+ -+For each underperformer, apply the **reflection pattern**: -+ -+**Evaluate:** Read the agent's current system prompt and the judge feedback. Identify the specific weaknesses the judges called out. -+ -+**Critique:** Ask yourself: -+- What specific failure patterns does the feedback reveal? -+- Is the system prompt too generic? Too narrow? Missing edge cases? -+- Does it lack clear quality criteria or output format expectations? -+- Would a different problem-solving approach help? -+ -+**Refine:** Write an improved system prompt that directly addresses each criticism. Be substantive — cosmetic rewording doesn't help. -+ -+**Self-check:** Before registering, verify your refinement: -+- Does it address EVERY piece of judge feedback? -+- Is it specific and opinionated (not generic)? -+- Does it include quality criteria, methodology, and output format? -+- Is it meaningfully different from the original, not just reworded? -+ -+If your self-check fails, iterate — refine again before registering. -+ -+### Step 3: Register the improved agent -+ -+Use the create_agent script to register the evolved version: -+ -+```bash -+python .claude/skills/mobius-seed/scripts/create_agent.py '{ -+ "name": "Agent Name v2", -+ "slug": "original-slug-v2", -+ "description": "Updated description", -+ "system_prompt": "Your improved system prompt here...", -+ "provider": "anthropic", -+ "model": "claude-haiku-4-5-20251001", -+ "tools": ["Read", "Grep", "Glob", "Bash", "Write", "Edit"], -+ "specializations": ["coding", "refactoring"], -+ "is_champion": false -+}' -+``` -+ -+Important: -+- Set `is_champion` to `false` — the evolved agent must earn its rank through competition -+- Keep the same provider/model unless there's a strong reason to change -+- Use a slug like `original-slug-v2` or `original-slug-gen2` to show lineage -+- Keep the same specializations -+ -+### Step 4: Show results -+ -+```bash -+python -m mobius.cli agent list -+``` -+ -+## The Agentic-Eval Pattern You're Running -+ -+``` -+Load underperformers → Read judge feedback → Critique prompt -+ ↑ ↓ -+ └──── Self-check fails? ← Refine prompt ←───┘ -+ ↓ -+ Self-check passes -+ ↓ -+ Register agent -+``` -+ -+This is the Evaluator-Optimizer pattern from agentic-eval, with YOU as both evaluator and optimizer. The key advantage over the CLI version: you can reason about the feedback in full context, consider the agent's match history, and make nuanced improvements that a single API call might miss. -+ -+## Pro Tips -+ -+- **Don't just reword** — change the agent's methodology, add specific techniques, restructure the approach -+- **Study the winners** — if the judge praised a winning agent's approach, consider incorporating similar strategies -+- **Vary approaches** — if an agent keeps losing with approach X, try a fundamentally different approach Y -+- **Keep prompts focused** — under 2000 tokens but dense with specific guidance -+- **Consider the model** — Haiku benefits from very explicit instructions; Gemini Flash may need different prompt styles +--- +name: mobius-evolve +description: Use when the user says "evolve", "mobius evolve", "improve agents", or wants to refine underperforming agents without API costs. +user-invocable: true +argument-hint: "[specialization] [--threshold 0.4] [--min-matches 3]" +--- + +# Mobius Evolve (Local Opus — Evaluator-Optimizer Loop) + +**You ARE the evaluator-optimizer.** You are Claude Opus running locally — this costs $0 in API calls. Instead of calling the API to refine agents, YOU analyze the judge feedback, critique the current system prompt, and craft an improved version using the agentic-eval reflection pattern. + +## Why this matters + +The `mobius evolve` CLI command calls the Opus API to refine agents ($$$). But you (Claude Code Opus) are already running and are the same model. So: +- **This skill** = Opus refining agents for FREE (Pro subscription) with multi-pass self-critique +- **mobius evolve** = Opus refining agents for $$$ (API calls), single or multi-pass +- Same quality, zero cost, and you have full conversation context. + +## What to do + +### Step 1: Load underperformers + +```bash +python .claude/skills/mobius-evolve/scripts/load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3] +``` + +This shows agents with low win rates, their current system prompts, and judge feedback from their losses. + +### Step 2: Analyze and refine (YOU are the evaluator-optimizer loop) + +For each underperformer, apply the **reflection pattern**: + +**Evaluate:** Read the agent's current system prompt and the judge feedback. Identify the specific weaknesses the judges called out. + +**Critique:** Ask yourself: +- What specific failure patterns does the feedback reveal? +- Is the system prompt too generic? Too narrow? Missing edge cases? +- Does it lack clear quality criteria or output format expectations? +- Would a different problem-solving approach help? + +**Refine:** Write an improved system prompt that directly addresses each criticism. Be substantive — cosmetic rewording doesn't help. + +**Self-check:** Before registering, verify your refinement: +- Does it address EVERY piece of judge feedback? +- Is it specific and opinionated (not generic)? +- Does it include quality criteria, methodology, and output format? +- Is it meaningfully different from the original, not just reworded? + +If your self-check fails, iterate — refine again before registering. + +### Step 3: Register the improved agent + +Use the create_agent script to register the evolved version: + +```bash +python .claude/skills/mobius-seed/scripts/create_agent.py '{ + "name": "Agent Name v2", + "slug": "original-slug-v2", + "description": "Updated description", + "system_prompt": "Your improved system prompt here...", + "provider": "anthropic", + "model": "claude-haiku-4-5-20251001", + "tools": ["Read", "Grep", "Glob", "Bash", "Write", "Edit"], + "specializations": ["coding", "refactoring"], + "is_champion": false +}' +``` + +Important: +- Set `is_champion` to `false` — the evolved agent must earn its rank through competition +- Keep the same provider/model unless there's a strong reason to change +- Use a slug like `original-slug-v2` or `original-slug-gen2` to show lineage +- Keep the same specializations + +### Step 4: Show results + +```bash +python -m mobius.cli agent list +``` + +## The Agentic-Eval Pattern You're Running + +``` +Load underperformers → Read judge feedback → Critique prompt + ↑ ↓ + └──── Self-check fails? ← Refine prompt ←───┘ + ↓ + Self-check passes + ↓ + Register agent +``` + +This is the Evaluator-Optimizer pattern from agentic-eval, with YOU as both evaluator and optimizer. The key advantage over the CLI version: you can reason about the feedback in full context, consider the agent's match history, and make nuanced improvements that a single API call might miss. + +## Pro Tips + +- **Don't just reword** — change the agent's methodology, add specific techniques, restructure the approach +- **Study the winners** — if the judge praised a winning agent's approach, consider incorporating similar strategies +- **Vary approaches** — if an agent keeps losing with approach X, try a fundamentally different approach Y +- **Keep prompts focused** — under 2000 tokens but dense with specific guidance +- **Consider the model** — Haiku benefits from very explicit instructions; Gemini Flash may need different prompt styles diff --git a/.claude/skills/mobius-evolve/scripts/load_underperformers.py b/.claude/skills/mobius-evolve/scripts/load_underperformers.py index 7ff6197..c7a9842 100644 --- a/.claude/skills/mobius-evolve/scripts/load_underperformers.py +++ b/.claude/skills/mobius-evolve/scripts/load_underperformers.py @@ -1,131 +1,107 @@ -commit fe66e75cc7f79b4ed77b2c8490f4e19862924bad -Author: Aaron Goldsmith -Date: Sat Mar 21 09:13:05 2026 -0700 - - Add agentic competition tasks, agent definitions, and skills - - - Agent definitions: competition-tasks, depth-test, tree-solver - - Skills: mobius-evolve (free Opus evolution), tree-solve (recursive decomposition) - - Competition tasks: standard + agentic (tool-heavy, multi-tier) - - Cleanup script for dead-weight agents - - Fix hardcoded paths in agentic tasks to use relative paths - - Make system monitoring task cross-platform (Unix tools) - - Remove unused import in cleanup_agents.py - - Add .tree-workspace/ to .gitignore - - Co-Authored-By: Claude Opus 4.6 - -diff --git a/.claude/skills/mobius-evolve/scripts/load_underperformers.py b/.claude/skills/mobius-evolve/scripts/load_underperformers.py -new file mode 100644 -index 0000000..797ee6d ---- /dev/null -+++ b/.claude/skills/mobius-evolve/scripts/load_underperformers.py -@@ -0,0 +1,108 @@ -+"""Load underperforming agents with their loss feedback for evolution. -+ -+Usage: -+ python load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3] -+ -+Outputs agent details, win rates, and judge feedback from their losses -+so that Opus can craft improved system prompts. -+""" -+ -+import json -+import sys -+ -+sys.path.insert(0, "src") -+ -+from mobius.config import get_config -+from mobius.db import init_db, row_to_dict -+from mobius.registry import Registry -+from mobius.tournament import Tournament -+ -+ -+def main(): -+ args = sys.argv[1:] -+ -+ # Parse flags -+ specialization = None -+ threshold = 0.4 -+ min_matches = 3 -+ -+ i = 0 -+ while i < len(args): -+ if args[i] == "--threshold" and i + 1 < len(args): -+ threshold = float(args[i + 1]) -+ i += 2 -+ elif args[i] == "--min-matches" and i + 1 < len(args): -+ min_matches = int(args[i + 1]) -+ i += 2 -+ elif not args[i].startswith("--"): -+ specialization = args[i] -+ i += 1 -+ else: -+ i += 1 -+ -+ config = get_config() -+ conn, _ = init_db(config) -+ registry = Registry(conn, config) -+ tournament = Tournament(conn, config, registry) -+ -+ agents = registry.list_agents(specialization=specialization) -+ if not agents: -+ print(f"No agents found{' for ' + specialization if specialization else ''}.") -+ sys.exit(1) -+ -+ underperformers = [] -+ for agent in agents: -+ if agent.total_matches < min_matches: -+ continue -+ win_rate = tournament.get_agent_recent_win_rate( -+ agent.id, window=config.underperformer_window -+ ) -+ if win_rate < threshold: -+ underperformers.append((agent, win_rate)) -+ -+ if not underperformers: -+ print(f"No underperformers below {threshold:.0%} win rate (min {min_matches} matches).") -+ print("\nAll agents:") -+ for agent in agents: -+ wr = tournament.get_agent_recent_win_rate(agent.id, window=config.underperformer_window) -+ print(f" {agent.name} ({agent.slug}) — {wr:.0%} win rate, {agent.total_matches} matches") -+ sys.exit(0) -+ -+ print(f"UNDERPERFORMERS (below {threshold:.0%} win rate, min {min_matches} matches)") -+ print(f"Found: {len(underperformers)}") -+ print() -+ -+ for agent, win_rate in underperformers: -+ matches = tournament.get_agent_matches(agent.id, limit=10) -+ losses = [m for m in matches if m.winner_id != agent.id and not m.voided] -+ -+ print(f"{'='*70}") -+ print(f"AGENT: {agent.name}") -+ print(f" Slug: {agent.slug}") -+ print(f" ID: {agent.id}") -+ print(f" Provider: {agent.provider}/{agent.model}") -+ print(f" Win Rate: {win_rate:.0%} (Elo: {agent.elo_rating:.0f})") -+ print(f" Generation: {agent.generation}") -+ print(f" Specializations: {', '.join(agent.specializations)}") -+ print(f" Losses: {len(losses)}") -+ print() -+ print(" CURRENT SYSTEM PROMPT:") -+ print(f" {agent.system_prompt}") -+ print() -+ -+ if losses: -+ print(" JUDGE FEEDBACK FROM LOSSES:") -+ for j, m in enumerate(losses[:5], 1): -+ print(f" --- Loss {j} ---") -+ print(f" Task: {m.task_description[:150]}") -+ if m.judge_reasoning: -+ print(f" Judge: {m.judge_reasoning[:300]}") -+ print() -+ -+ print() -+ -+ conn.close() -+ -+ -+if __name__ == "__main__": -+ main() +"""Load underperforming agents with their loss feedback for evolution. + +Usage: + python load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3] + +Outputs agent details, win rates, and judge feedback from their losses +so that Opus can craft improved system prompts. +""" + +import sys + +sys.path.insert(0, "src") + +from mobius.config import get_config +from mobius.db import init_db +from mobius.registry import Registry +from mobius.tournament import Tournament + + +def main(): + args = sys.argv[1:] + + # Parse flags + specialization = None + threshold = 0.4 + min_matches = 3 + + i = 0 + while i < len(args): + if args[i] == "--threshold" and i + 1 < len(args): + threshold = float(args[i + 1]) + i += 2 + elif args[i] == "--min-matches" and i + 1 < len(args): + min_matches = int(args[i + 1]) + i += 2 + elif not args[i].startswith("--"): + specialization = args[i] + i += 1 + else: + i += 1 + + config = get_config() + conn, _ = init_db(config) + registry = Registry(conn, config) + tournament = Tournament(conn, config, registry) + + agents = registry.list_agents(specialization=specialization) + if not agents: + print(f"No agents found{' for ' + specialization if specialization else ''}.") + sys.exit(1) + + underperformers = [] + for agent in agents: + if agent.total_matches < min_matches: + continue + win_rate = tournament.get_agent_recent_win_rate( + agent.id, window=config.underperformer_window + ) + if win_rate < threshold: + underperformers.append((agent, win_rate)) + + if not underperformers: + print(f"No underperformers below {threshold:.0%} win rate (min {min_matches} matches).") + print("\nAll agents:") + for agent in agents: + wr = tournament.get_agent_recent_win_rate(agent.id, window=config.underperformer_window) + print(f" {agent.name} ({agent.slug}) — {wr:.0%} win rate, {agent.total_matches} matches") + sys.exit(0) + + print(f"UNDERPERFORMERS (below {threshold:.0%} win rate, min {min_matches} matches)") + print(f"Found: {len(underperformers)}") + print() + + for agent, win_rate in underperformers: + matches = tournament.get_agent_matches(agent.id, limit=10) + losses = [m for m in matches if m.winner_id is not None and m.winner_id != agent.id and not m.voided] + + print(f"{'='*70}") + print(f"AGENT: {agent.name}") + print(f" Slug: {agent.slug}") + print(f" ID: {agent.id}") + print(f" Provider: {agent.provider}/{agent.model}") + print(f" Win Rate: {win_rate:.0%} (Elo: {agent.elo_rating:.0f})") + print(f" Generation: {agent.generation}") + print(f" Specializations: {', '.join(agent.specializations)}") + print(f" Losses: {len(losses)}") + print() + print(" CURRENT SYSTEM PROMPT:") + print(f" {agent.system_prompt}") + print() + + if losses: + print(" JUDGE FEEDBACK FROM LOSSES:") + for j, m in enumerate(losses[:5], 1): + print(f" --- Loss {j} ---") + print(f" Task: {m.task_description[:150]}") + if m.judge_reasoning: + print(f" Judge: {m.judge_reasoning[:300]}") + print() + + print() + + conn.close() + + +if __name__ == "__main__": + main() From 5a31c0a0a311ae053f74960630e77ac8bf3a002c Mon Sep 17 00:00:00 2001 From: Aaron Goldsmith Date: Sat, 21 Mar 2026 10:18:12 -0700 Subject: [PATCH 3/3] Exclude voided matches from underperformer win-rate calculation Co-Authored-By: Claude Opus 4.6 (1M context) --- .../scripts/load_underperformers.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/.claude/skills/mobius-evolve/scripts/load_underperformers.py b/.claude/skills/mobius-evolve/scripts/load_underperformers.py index c7a9842..f78fdc9 100644 --- a/.claude/skills/mobius-evolve/scripts/load_underperformers.py +++ b/.claude/skills/mobius-evolve/scripts/load_underperformers.py @@ -17,6 +17,16 @@ from mobius.tournament import Tournament +def _win_rate_excluding_voided(tournament, agent_id: str, window: int) -> tuple[float, int]: + """Calculate win rate excluding voided matches. Returns (rate, valid_count).""" + matches = tournament.get_agent_matches(agent_id, limit=window) + valid = [m for m in matches if not m.voided] + if not valid: + return 0.0, 0 + wins = sum(1 for m in valid if m.winner_id == agent_id) + return wins / len(valid), len(valid) + + def main(): args = sys.argv[1:] @@ -51,11 +61,11 @@ def main(): underperformers = [] for agent in agents: - if agent.total_matches < min_matches: - continue - win_rate = tournament.get_agent_recent_win_rate( - agent.id, window=config.underperformer_window + win_rate, valid_count = _win_rate_excluding_voided( + tournament, agent.id, config.underperformer_window ) + if valid_count < min_matches: + continue if win_rate < threshold: underperformers.append((agent, win_rate)) @@ -63,7 +73,9 @@ def main(): print(f"No underperformers below {threshold:.0%} win rate (min {min_matches} matches).") print("\nAll agents:") for agent in agents: - wr = tournament.get_agent_recent_win_rate(agent.id, window=config.underperformer_window) + wr, _ = _win_rate_excluding_voided( + tournament, agent.id, config.underperformer_window + ) print(f" {agent.name} ({agent.slug}) — {wr:.0%} win rate, {agent.total_matches} matches") sys.exit(0)