Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions src/mobius/agent_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,71 @@ async def refine_agent(
logger.error("Invalid refined agent from builder: %s", e)
return None

async def critique_refinement(
    self,
    original: AgentRecord,
    refined: AgentRecord,
    feedback: str,
) -> dict | None:
    """Self-critique: evaluate whether a refinement addresses the judge feedback.

    Asks the builder model to compare the original and refined system prompts
    against the judge feedback that prompted the refinement.

    Args:
        original: Agent record before refinement.
        refined: Candidate agent record produced by the builder.
        feedback: Judge feedback the refinement was supposed to address.

    Returns:
        {"pass": bool, "summary": str} or None on failure.
    """
    prompt = f"""Evaluate whether this agent refinement actually addresses the judge feedback.

## Original Agent
Name: {original.name}
System prompt:
```
{original.system_prompt}
```

## Refined Agent
Name: {refined.name}
System prompt:
```
{refined.system_prompt}
```

## Judge Feedback That Prompted the Refinement
{feedback}

## Task
Evaluate whether the refined prompt meaningfully addresses the criticism.
Return JSON: {{"pass": true/false, "summary": "1-sentence explanation"}}
Only pass if the refinement makes substantive changes that address the feedback. Cosmetic rewording is a fail."""

    result = await run_judge(
        prompt=prompt,
        system_prompt="You are a critical evaluator of agent refinements. Return only valid JSON.",
        provider_name=self.builder_provider,
        model=self.builder_model,
    )

    if not result.success:
        logger.error("Critique failed: %s", result.error)
        return None

    # Try json.loads directly first — _parse_agent_json discards non-agent JSON.
    # json.JSONDecodeError is a subclass of ValueError, so one catch suffices.
    try:
        data = json.loads(result.output.strip())
    except ValueError:
        data = _parse_agent_json(result.output)

    if isinstance(data, dict) and "pass" in data:
        # Normalize: bool("false") is True, so handle string booleans properly
        pass_val = data["pass"]
        if isinstance(pass_val, str):
            pass_val = pass_val.strip().lower() not in ("false", "0", "no", "")
        return {
            "pass": bool(pass_val),
            "summary": str(data.get("summary", "")),
        }

    logger.warning("Critique returned unparseable result: %s", result.output[:200])
    return None

async def crossbreed(
self, agent_a: AgentRecord, agent_b: AgentRecord
) -> AgentRecord | None:
Expand Down
113 changes: 96 additions & 17 deletions src/mobius/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,44 +289,123 @@ def scout(
@app.command()
def evolve(
    specialization: str = typer.Argument(..., help="Specialization to evolve"),
    iterations: int = typer.Option(1, "--iterations", "-i", help="Refinement iterations per agent (agentic-eval loop)"),
    threshold: float = typer.Option(0.4, "--threshold", "-t", help="Win rate threshold — agents below this get evolved"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
):
    """Evolve underperforming agents for a specialization using judge feedback.

    Finds agents with win rates below the threshold and refines them
    using an evaluator-optimizer loop: each iteration generates an improved
    agent, then self-critiques the refinement before registering it.
    """
    if iterations < 1:
        console.print("[red]Error: --iterations must be >= 1[/red]")
        raise typer.Exit(1)
    if not (0 <= threshold <= 1):
        console.print("[red]Error: --threshold must be between 0 and 1[/red]")
        raise typer.Exit(1)

    _setup_logging(verbose)
    config, conn, registry, tournament = _get_components()[:4]
    from mobius.agent_builder import AgentBuilder

    # Target underperformers, not champions
    all_agents = registry.list_agents(specialization=specialization)
    if not all_agents:
        console.print(f"[red]No agents for '{specialization}'. Run 'mobius bootstrap' first.[/red]")
        raise typer.Exit(1)

    # Query each agent's recent win rate exactly once and carry it with the
    # record, so the display value matches the value used for eligibility.
    underperformers = []
    for a in all_agents:
        if a.total_matches < 3:
            continue
        rate = tournament.get_agent_recent_win_rate(a.id, window=config.underperformer_window)
        if rate < threshold:
            underperformers.append((a, rate))

    if not underperformers:
        console.print(f"[yellow]No underperformers below {threshold:.0%} win rate for '{specialization}'.[/yellow]")
        console.print("[dim]Agents need at least 3 matches to be eligible.[/dim]")
        raise typer.Exit(0)

    builder = AgentBuilder(config)
    evolved_count = 0

    for agent, win_rate in underperformers:
        matches = tournament.get_agent_matches(agent.id, limit=10)
        losses = [m for m in matches if m.winner_id != agent.id and not m.voided]

        if not losses:
            console.print(f"[yellow]{agent.name} has no recorded losses — skipping.[/yellow]")
            continue

        # Summarize up to 5 recent losses as feedback for the builder.
        feedback = "\n\n".join(
            f"Task: {m.task_description[:100]}\nJudge: {m.judge_reasoning[:200]}"
            for m in losses[:5]
        )

        console.print(
            f"\n[bold]Evolving {agent.name}[/bold] "
            f"(win rate: {win_rate:.0%}, gen {agent.generation})"
        )

        # Evaluator-optimizer loop: refine, self-critique, repeat
        candidate = agent
        candidate_feedback = feedback
        best_candidate = None
        original_parent_id = agent.id  # preserve lineage across iterations

        for iteration in range(iterations):
            if iterations > 1:
                console.print(f"  [dim]Iteration {iteration + 1}/{iterations}[/dim]")

            improved = asyncio.run(builder.refine_agent(candidate, candidate_feedback))
            if not improved:
                console.print(f"  [red]Refinement failed at iteration {iteration + 1}[/red]")
                break

            # Always point lineage back to the original agent, not intermediate candidates
            improved.parent_id = original_parent_id

            if iterations == 1:
                # Single-shot mode: no self-critique pass.
                best_candidate = improved
                continue

            # Self-critique: evaluate if the refinement actually addresses the feedback
            critique = asyncio.run(builder.critique_refinement(
                original=agent,
                refined=improved,
                feedback=feedback,
            ))
            if critique and critique.get("pass"):
                console.print(f"  [green]Self-critique passed: {critique.get('summary', '')}[/green]")
                best_candidate = improved
                break
            elif critique:
                console.print(f"  [yellow]Self-critique: {critique.get('summary', '')}[/yellow]")
                # Feed only the critique summary for next iteration (not original feedback)
                candidate = improved
                candidate_feedback = (
                    f"Address this critique of your previous attempt:\n"
                    f"{critique.get('summary', '')}"
                )
                best_candidate = improved  # keep latest even if not perfect
            else:
                # Critique itself failed — accept the refinement and stop iterating.
                best_candidate = improved
                break

        if best_candidate:
            # Avoid slug collisions with already-registered agents.
            if registry.get_agent_by_slug(best_candidate.slug):
                best_candidate.slug = f"{best_candidate.slug}-{best_candidate.id[:6]}"
            registry.create_agent(best_candidate)
            evolved_count += 1
            console.print(
                f"  [green]Created: {best_candidate.name} (gen {best_candidate.generation})[/green]"
            )
        else:
            console.print(f"  [red]Failed to create improved version of {agent.name}[/red]")

    console.print(f"\n[bold]Evolved {evolved_count}/{len(underperformers)} underperformers.[/bold]")
    conn.close()


Expand Down
1 change: 1 addition & 0 deletions src/mobius/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class MobiusConfig(BaseModel):
agent_timeout_seconds: int = 120
agent_max_turns: int = 10
agent_budget_usd: float = 0.05
agent_max_output_tokens: int = 16384
Comment thread
AaronGoldsmith marked this conversation as resolved.

# Judge
judge_models: list[dict[str, str]] = [
Expand Down
22 changes: 18 additions & 4 deletions src/mobius/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import json
import logging
import random
Expand Down Expand Up @@ -129,7 +130,10 @@ async def evaluate(
labels = list(string.ascii_uppercase[: len(agent_ids)])

verdicts: list[tuple[JudgeVerdict, str]] = [] # (verdict, model)
judge_models_used: list[str] = []

# Prepare per-judge data (independent shuffle per judge)
judge_tasks = []
judge_meta = [] # (provider, model, label_to_agent)

for judge_config in self.config.judge_models:
provider = judge_config["provider"]
Expand All @@ -143,14 +147,23 @@ async def evaluate(

prompt = _build_judge_prompt(task, outputs, label_map)

result = await run_judge(
judge_tasks.append(run_judge(
prompt=prompt,
system_prompt=JUDGE_SYSTEM_PROMPT,
provider_name=provider,
model=model,
)
))
judge_meta.append((provider, model, label_to_agent))

# Run all judges in parallel
results = await asyncio.gather(*judge_tasks, return_exceptions=True)

judge_models_used.append(f"{provider}/{model}")
judge_models_used = []

for result, (provider, model, label_to_agent) in zip(results, judge_meta):
if isinstance(result, Exception):
logger.warning("Judge %s/%s raised: %s", provider, model, result)
continue

if not result.success:
logger.warning("Judge %s/%s failed: %s", provider, model, result.error)
Expand All @@ -159,6 +172,7 @@ async def evaluate(
verdict = _parse_verdict(result.output, label_to_agent)
if verdict:
verdicts.append((verdict, f"{provider}/{model}"))
judge_models_used.append(f"{provider}/{model}")
logger.info(
"Judge %s/%s picked winner: %s (mapped to agent)",
provider,
Expand Down
13 changes: 7 additions & 6 deletions src/mobius/providers/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Run via Anthropic messages API, with tool loop if tools requested."""
api_key = _get_api_key()
Expand All @@ -60,22 +61,22 @@ async def run_agent(
if use_tools:
return await self._run_with_tools(
client, prompt, system_prompt, model,
max_turns, timeout_seconds, working_dir,
max_turns, timeout_seconds, working_dir, max_tokens,
)
else:
return await self._run_simple(
client, prompt, system_prompt, model, timeout_seconds,
client, prompt, system_prompt, model, timeout_seconds, max_tokens,
)

async def _run_simple(
self, client, prompt: str, system_prompt: str,
model: str, timeout_seconds: int,
model: str, timeout_seconds: int, max_tokens: int = 16384,
) -> ProviderResult:
"""Single-shot message, same as Google/OpenAI providers."""
try:
response = await asyncio.wait_for(
client.messages.create(
model=model, max_tokens=4096,
model=model, max_tokens=max_tokens,
system=system_prompt,
messages=[{"role": "user", "content": prompt}],
),
Expand Down Expand Up @@ -103,7 +104,7 @@ async def _run_simple(
async def _run_with_tools(
self, client, prompt: str, system_prompt: str,
model: str, max_turns: int, timeout_seconds: int,
working_dir: str | None = None,
working_dir: str | None = None, max_tokens: int = 16384,
) -> ProviderResult:
"""Agentic loop with bash tool use."""
messages = [{"role": "user", "content": prompt}]
Expand All @@ -115,7 +116,7 @@ async def _run_with_tools(
for turn in range(max_turns):
response = await asyncio.wait_for(
client.messages.create(
model=model, max_tokens=4096,
model=model, max_tokens=max_tokens,
system=system_prompt,
messages=messages,
tools=[ANTHROPIC_BASH_TOOL],
Expand Down
1 change: 1 addition & 0 deletions src/mobius/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Execute an agent and return its output."""
...
Expand Down
7 changes: 4 additions & 3 deletions src/mobius/providers/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Execute via Google GenAI SDK, with tool loop if requested."""
api_key = _get_api_key()
Expand All @@ -62,7 +63,7 @@ async def run_agent(
if use_tools:
return await self._run_with_tools(
client, types, prompt, system_prompt, model,
max_turns, timeout_seconds, working_dir,
max_turns, timeout_seconds, working_dir, max_tokens,
)
else:
return await self._run_simple(
Expand Down Expand Up @@ -108,7 +109,7 @@ async def _run_simple(
async def _run_with_tools(
self, client, types, prompt: str, system_prompt: str,
model: str, max_turns: int, timeout_seconds: int,
working_dir: str | None = None,
working_dir: str | None = None, max_tokens: int = 16384,
) -> ProviderResult:
"""Agentic loop with function calling."""
# Build the tool declaration
Expand All @@ -123,7 +124,7 @@ async def _run_with_tools(
config = types.GenerateContentConfig(
system_instruction=system_prompt,
tools=[bash_tool],
max_output_tokens=4096,
max_output_tokens=max_tokens,
)
Comment thread
AaronGoldsmith marked this conversation as resolved.

contents = [types.Content(
Expand Down
Loading
Loading