diff --git a/.claude/skills/mobius-evolve/SKILL.md b/.claude/skills/mobius-evolve/SKILL.md new file mode 100644 index 0000000..3761dec --- /dev/null +++ b/.claude/skills/mobius-evolve/SKILL.md @@ -0,0 +1,101 @@ +--- +name: mobius-evolve +description: Use when the user says "evolve", "mobius evolve", "improve agents", or wants to refine underperforming agents without API costs. +user-invocable: true +argument-hint: "[specialization] [--threshold 0.4] [--min-matches 3]" +--- + +# Mobius Evolve (Local Opus — Evaluator-Optimizer Loop) + +**You ARE the evaluator-optimizer.** You are Claude Opus running locally — this costs $0 in API calls. Instead of calling the API to refine agents, YOU analyze the judge feedback, critique the current system prompt, and craft an improved version using the agentic-eval reflection pattern. + +## Why this matters + +The `mobius evolve` CLI command calls the Opus API to refine agents ($$$). But you (Claude Code Opus) are already running and are the same model. So: +- **This skill** = Opus refining agents for FREE (Pro subscription) with multi-pass self-critique +- **mobius evolve** = Opus refining agents for $$$ (API calls), single or multi-pass +- Same quality, zero cost, and you have full conversation context. + +## What to do + +### Step 1: Load underperformers + +```bash +python .claude/skills/mobius-evolve/scripts/load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3] +``` + +This shows agents with low win rates, their current system prompts, and judge feedback from their losses. + +### Step 2: Analyze and refine (YOU are the evaluator-optimizer loop) + +For each underperformer, apply the **reflection pattern**: + +**Evaluate:** Read the agent's current system prompt and the judge feedback. Identify the specific weaknesses the judges called out. + +**Critique:** Ask yourself: +- What specific failure patterns does the feedback reveal? +- Is the system prompt too generic? Too narrow? Missing edge cases? 
+- Does it lack clear quality criteria or output format expectations? +- Would a different problem-solving approach help? + +**Refine:** Write an improved system prompt that directly addresses each criticism. Be substantive — cosmetic rewording doesn't help. + +**Self-check:** Before registering, verify your refinement: +- Does it address EVERY piece of judge feedback? +- Is it specific and opinionated (not generic)? +- Does it include quality criteria, methodology, and output format? +- Is it meaningfully different from the original, not just reworded? + +If your self-check fails, iterate — refine again before registering. + +### Step 3: Register the improved agent + +Use the create_agent script to register the evolved version: + +```bash +python .claude/skills/mobius-seed/scripts/create_agent.py '{ + "name": "Agent Name v2", + "slug": "original-slug-v2", + "description": "Updated description", + "system_prompt": "Your improved system prompt here...", + "provider": "anthropic", + "model": "claude-haiku-4-5-20251001", + "tools": ["Read", "Grep", "Glob", "Bash", "Write", "Edit"], + "specializations": ["coding", "refactoring"], + "is_champion": false +}' +``` + +Important: +- Set `is_champion` to `false` — the evolved agent must earn its rank through competition +- Keep the same provider/model unless there's a strong reason to change +- Use a slug like `original-slug-v2` or `original-slug-gen2` to show lineage +- Keep the same specializations + +### Step 4: Show results + +```bash +python -m mobius.cli agent list +``` + +## The Agentic-Eval Pattern You're Running + +``` +Load underperformers → Read judge feedback → Critique prompt + ↑ ↓ + └──── Self-check fails? ← Refine prompt ←───┘ + ↓ + Self-check passes + ↓ + Register agent +``` + +This is the Evaluator-Optimizer pattern from agentic-eval, with YOU as both evaluator and optimizer. 
"""Load underperforming agents with their loss feedback for evolution.

Usage:
    python load_underperformers.py [specialization] [--threshold 0.4] [--min-matches 3]

Outputs agent details, win rates, and judge feedback from their losses
so that Opus can craft improved system prompts.
"""

import argparse
import sys

sys.path.insert(0, "src")

# Number of matches requested from the tournament when collecting loss feedback.
FEEDBACK_MATCH_LIMIT = 10
# Maximum number of losses whose feedback is printed per agent.
MAX_LOSSES_SHOWN = 5
# Truncation lengths for the task description and the judge reasoning.
TASK_PREVIEW_CHARS = 150
REASONING_PREVIEW_CHARS = 300


def _win_rate_excluding_voided(tournament, agent_id: str, window: int) -> tuple[float, int]:
    """Calculate an agent's win rate, ignoring voided matches.

    Args:
        tournament: Object exposing ``get_agent_matches(agent_id, limit=...)``.
        agent_id: ID of the agent whose matches are inspected.
        window: Passed as ``limit`` to ``get_agent_matches``. Voided matches are
            dropped *after* the limit is applied, so fewer than ``window``
            matches may end up being counted.

    Returns:
        ``(rate, valid_count)``. ``rate`` is wins divided by non-voided matches,
        or ``0.0`` when there are none. Any non-voided match whose ``winner_id``
        differs from ``agent_id`` (including one without a winner) counts
        against the rate.
    """
    matches = tournament.get_agent_matches(agent_id, limit=window)
    valid = [m for m in matches if not m.voided]
    if not valid:
        return 0.0, 0
    wins = sum(1 for m in valid if m.winner_id == agent_id)
    return wins / len(valid), len(valid)


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list without the program name. ``None`` makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Namespace with ``specialization`` (str or None), ``threshold`` (float)
        and ``min_matches`` (int).

    Raises:
        SystemExit: On unknown flags, missing flag values, or non-numeric
            values (argparse prints a usage message and exits with status 2).
    """
    parser = argparse.ArgumentParser(
        description="Load underperforming agents with their loss feedback for evolution.",
    )
    parser.add_argument(
        "specialization",
        nargs="?",
        default=None,
        help="only consider agents with this specialization",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.4,
        help="win rate below which an agent counts as underperforming (default: 0.4)",
    )
    parser.add_argument(
        "--min-matches",
        type=int,
        default=3,
        help="minimum number of non-voided matches required (default: 3)",
    )
    return parser.parse_args(argv)


def rate_agents(agents, tournament, window: int) -> list:
    """Return ``(agent, win_rate, valid_count)`` for every agent, in input order."""
    return [
        (agent, *_win_rate_excluding_voided(tournament, agent.id, window))
        for agent in agents
    ]


def select_underperformers(rated, threshold: float, min_matches: int) -> list:
    """Pick the agents worth evolving from the output of ``rate_agents``.

    An agent is selected when it has at least ``min_matches`` non-voided
    matches and its win rate is strictly below ``threshold``.

    Returns:
        List of ``(agent, win_rate)`` tuples, in input order.
    """
    return [
        (agent, win_rate)
        for agent, win_rate, valid_count in rated
        if valid_count >= min_matches and win_rate < threshold
    ]


def format_agent_report(agent, win_rate: float, matches) -> str:
    """Build the text report for one underperforming agent.

    Args:
        agent: Agent record; its name, slug, id, provider, model, elo_rating,
            generation, specializations and system_prompt are read.
        win_rate: Win rate to display for the agent.
        matches: Matches to mine for loss feedback. A loss is a non-voided
            match that has a winner other than ``agent``.

    Returns:
        The report as a single string; printing it reproduces the layout of
        one agent section, including the trailing blank line.
    """
    losses = [
        m
        for m in matches
        if not m.voided and m.winner_id is not None and m.winner_id != agent.id
    ]

    lines = [
        "=" * 70,
        f"AGENT: {agent.name}",
        f" Slug: {agent.slug}",
        f" ID: {agent.id}",
        f" Provider: {agent.provider}/{agent.model}",
        f" Win Rate: {win_rate:.0%} (Elo: {agent.elo_rating:.0f})",
        f" Generation: {agent.generation}",
        f" Specializations: {', '.join(agent.specializations)}",
        f" Losses: {len(losses)}",
        "",
        " CURRENT SYSTEM PROMPT:",
        f" {agent.system_prompt}",
        "",
    ]

    if losses:
        lines.append(" JUDGE FEEDBACK FROM LOSSES:")
        for number, match in enumerate(losses[:MAX_LOSSES_SHOWN], 1):
            lines.append(f" --- Loss {number} ---")
            # A match without a task description must not crash the report.
            task = match.task_description or ""
            lines.append(f" Task: {task[:TASK_PREVIEW_CHARS]}")
            if match.judge_reasoning:
                lines.append(f" Judge: {match.judge_reasoning[:REASONING_PREVIEW_CHARS]}")
            lines.append("")

    lines.append("")
    return "\n".join(lines)


def main(argv=None):
    """Print every underperforming agent with its prompt and loss feedback.

    Args:
        argv: Optional argument list (without the program name); defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when no agents match the specialization,
            status 0 when no agent is below the threshold, and status 2 on
            invalid command-line arguments.
    """
    args = parse_args(argv)

    # Imported here rather than at module level so that the helpers above can
    # be imported without the mobius package being importable; the sys.path
    # tweak at the top of the module has already run by this point.
    from mobius.config import get_config
    from mobius.db import init_db
    from mobius.registry import Registry
    from mobius.tournament import Tournament

    config = get_config()
    conn, _ = init_db(config)
    # try/finally so the connection is closed on the sys.exit() paths and on
    # errors as well, not only when the full report was printed.
    try:
        registry = Registry(conn, config)
        tournament = Tournament(conn, config, registry)

        agents = registry.list_agents(specialization=args.specialization)
        if not agents:
            scope = f" for {args.specialization}" if args.specialization else ""
            print(f"No agents found{scope}.")
            sys.exit(1)

        rated = rate_agents(agents, tournament, config.underperformer_window)
        underperformers = select_underperformers(rated, args.threshold, args.min_matches)

        if not underperformers:
            print(
                f"No underperformers below {args.threshold:.0%} win rate "
                f"(min {args.min_matches} matches)."
            )
            print("\nAll agents:")
            for agent, win_rate, _valid_count in rated:
                print(
                    f" {agent.name} ({agent.slug}) — {win_rate:.0%} win rate, "
                    f"{agent.total_matches} matches"
                )
            sys.exit(0)

        print(
            f"UNDERPERFORMERS (below {args.threshold:.0%} win rate, "
            f"min {args.min_matches} matches)"
        )
        print(f"Found: {len(underperformers)}")
        print()

        for agent, win_rate in underperformers:
            matches = tournament.get_agent_matches(agent.id, limit=FEEDBACK_MATCH_LIMIT)
            print(format_agent_report(agent, win_rate, matches))
    finally:
        conn.close()


if __name__ == "__main__":
    main()