async def critique_refinement(
    self,
    original: AgentRecord,
    refined: AgentRecord,
    feedback: str,
) -> dict | None:
    """Self-critique: evaluate whether a refinement addresses the judge feedback.

    Builds a prompt containing both system prompts plus the judge feedback,
    sends it through ``run_judge`` using the builder provider/model, and
    parses the JSON verdict out of the reply.

    Args:
        original: The agent before refinement (``name`` and ``system_prompt``
            are read).
        refined: The candidate produced by the refinement step (``name`` and
            ``system_prompt`` are read).
        feedback: Judge feedback text that prompted the refinement.

    Returns:
        ``{"pass": bool, "summary": str}``, or ``None`` when the judge call
        fails or no JSON object with a ``"pass"`` key can be found in the
        reply.
    """
    prompt = f"""Evaluate whether this agent refinement actually addresses the judge feedback.

## Original Agent
Name: {original.name}
System prompt:
```
{original.system_prompt}
```

## Refined Agent
Name: {refined.name}
System prompt:
```
{refined.system_prompt}
```

## Judge Feedback That Prompted the Refinement
{feedback}

## Task
Evaluate whether the refined prompt meaningfully addresses the criticism.
Return JSON: {{"pass": true/false, "summary": "1-sentence explanation"}}
Only pass if the refinement makes substantive changes that address the feedback. Cosmetic rewording is a fail."""

    result = await run_judge(
        prompt=prompt,
        system_prompt="You are a critical evaluator of agent refinements. Return only valid JSON.",
        provider_name=self.builder_provider,
        model=self.builder_model,
    )

    if not result.success:
        logger.error("Critique failed: %s", result.error)
        return None

    # Guard against an empty/None output so parsing and the log slice below
    # cannot raise.
    raw = result.output or ""

    # Scan for the first JSON object that carries a "pass" key. Decoding from
    # each "{" (instead of json.loads on the whole reply) also accepts
    # verdicts wrapped in Markdown code fences or preceded by prose, which
    # would otherwise fall through to _parse_agent_json and be discarded as
    # non-agent JSON.
    decoder = json.JSONDecoder()
    data = None
    pos = raw.find("{")
    while pos != -1:
        try:
            candidate, end = decoder.raw_decode(raw, pos)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            pos = raw.find("{", pos + 1)
            continue
        if isinstance(candidate, dict) and "pass" in candidate:
            data = candidate
            break
        # Skip past the decoded value so nested objects are not re-scanned.
        pos = raw.find("{", end)

    if data is None:
        # Last resort: keep the original fallback parser.
        data = _parse_agent_json(raw)

    if isinstance(data, dict) and "pass" in data:
        pass_val = data["pass"]
        if isinstance(pass_val, str):
            # bool("false") is True, so string verdicts are matched against an
            # explicit allow-list; anything unrecognized ("fail", "n/a", ...)
            # counts as a fail rather than silently passing.
            pass_val = pass_val.strip().lower() in ("true", "1", "yes", "y", "pass", "passed")
        return {
            "pass": bool(pass_val),
            "summary": str(data.get("summary", "")),
        }

    logger.warning("Critique returned unparseable result: %s", raw[:200])
    return None
+ """ + if iterations < 1: + console.print("[red]Error: --iterations must be >= 1[/red]") + raise typer.Exit(1) + if not (0 <= threshold <= 1): + console.print("[red]Error: --threshold must be between 0 and 1[/red]") + raise typer.Exit(1) + _setup_logging(verbose) config, conn, registry, tournament, *_ = _get_components()[:4] from mobius.agent_builder import AgentBuilder - champions = registry.get_champions(specialization=specialization) - if not champions: - console.print(f"[red]No champions for '{specialization}'. Run more competitions first.[/red]") + # Target underperformers, not champions + all_agents = registry.list_agents(specialization=specialization) + if not all_agents: + console.print(f"[red]No agents for '{specialization}'. Run 'mobius bootstrap' first.[/red]") raise typer.Exit(1) + underperformers = [ + a for a in all_agents + if a.total_matches >= 3 + and tournament.get_agent_recent_win_rate(a.id, window=config.underperformer_window) < threshold + ] + + if not underperformers: + console.print(f"[yellow]No underperformers below {threshold:.0%} win rate for '{specialization}'.[/yellow]") + console.print("[dim]Agents need at least 3 matches to be eligible.[/dim]") + raise typer.Exit(0) + builder = AgentBuilder(config) - for champ in champions: - # Gather recent judge feedback from losses - matches = tournament.get_agent_matches(champ.id, limit=10) - losses = [m for m in matches if m.winner_id != champ.id and not m.voided] + evolved_count = 0 + + for agent in underperformers: + win_rate = tournament.get_agent_recent_win_rate(agent.id, window=config.underperformer_window) + matches = tournament.get_agent_matches(agent.id, limit=10) + losses = [m for m in matches if m.winner_id != agent.id and not m.voided] if not losses: - console.print(f"[yellow]{champ.name} has no recent losses — nothing to improve.[/yellow]") + console.print(f"[yellow]{agent.name} has no recorded losses — skipping.[/yellow]") continue feedback = "\n\n".join( @@ -316,17 +344,68 @@ def 
evolve( for m in losses[:5] ) - console.print(f"[bold]Evolving {champ.name} based on {len(losses)} losses...[/bold]") - improved = asyncio.run(builder.refine_agent(champ, feedback)) + console.print( + f"\n[bold]Evolving {agent.name}[/bold] " + f"(win rate: {win_rate:.0%}, gen {agent.generation})" + ) - if improved: - if registry.get_agent_by_slug(improved.slug): - improved.slug = f"{improved.slug}-{improved.id[:6]}" - registry.create_agent(improved) - console.print(f"[green]Created challenger: {improved.name} (gen {improved.generation})[/green]") + # Evaluator-optimizer loop: refine, self-critique, repeat + candidate = agent + candidate_feedback = feedback + best_candidate = None + original_parent_id = agent.id # preserve lineage across iterations + + for iteration in range(iterations): + if iterations > 1: + console.print(f" [dim]Iteration {iteration + 1}/{iterations}[/dim]") + + improved = asyncio.run(builder.refine_agent(candidate, candidate_feedback)) + if not improved: + console.print(f" [red]Refinement failed at iteration {iteration + 1}[/red]") + break + + # Always point lineage back to the original agent, not intermediate candidates + improved.parent_id = original_parent_id + + # Self-critique: evaluate if the refinement actually addresses the feedback + if iterations > 1: + critique = asyncio.run(builder.critique_refinement( + original=agent, + refined=improved, + feedback=feedback, + )) + if critique and critique.get("pass"): + console.print(f" [green]Self-critique passed: {critique.get('summary', '')}[/green]") + best_candidate = improved + break + elif critique: + console.print(f" [yellow]Self-critique: {critique.get('summary', '')}[/yellow]") + # Feed only the critique summary for next iteration (not original feedback) + candidate = improved + candidate_feedback = ( + f"Address this critique of your previous attempt:\n" + f"{critique.get('summary', '')}" + ) + best_candidate = improved # keep latest even if not perfect + continue + else: + 
best_candidate = improved + break + else: + best_candidate = improved + + if best_candidate: + if registry.get_agent_by_slug(best_candidate.slug): + best_candidate.slug = f"{best_candidate.slug}-{best_candidate.id[:6]}" + registry.create_agent(best_candidate) + evolved_count += 1 + console.print( + f" [green]Created: {best_candidate.name} (gen {best_candidate.generation})[/green]" + ) else: - console.print(f"[red]Failed to create improved version of {champ.name}[/red]") + console.print(f" [red]Failed to create improved version of {agent.name}[/red]") + console.print(f"\n[bold]Evolved {evolved_count}/{len(underperformers)} underperformers.[/bold]") conn.close() diff --git a/src/mobius/config.py b/src/mobius/config.py index f90fbba..d7be313 100644 --- a/src/mobius/config.py +++ b/src/mobius/config.py @@ -23,6 +23,7 @@ class MobiusConfig(BaseModel): agent_timeout_seconds: int = 120 agent_max_turns: int = 10 agent_budget_usd: float = 0.05 + agent_max_output_tokens: int = 16384 # Judge judge_models: list[dict[str, str]] = [ diff --git a/src/mobius/judge.py b/src/mobius/judge.py index 6e60c9b..7a43ec4 100644 --- a/src/mobius/judge.py +++ b/src/mobius/judge.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import json import logging import random @@ -129,7 +130,10 @@ async def evaluate( labels = list(string.ascii_uppercase[: len(agent_ids)]) verdicts: list[tuple[JudgeVerdict, str]] = [] # (verdict, model) - judge_models_used: list[str] = [] + + # Prepare per-judge data (independent shuffle per judge) + judge_tasks = [] + judge_meta = [] # (provider, model, label_to_agent) for judge_config in self.config.judge_models: provider = judge_config["provider"] @@ -143,14 +147,23 @@ async def evaluate( prompt = _build_judge_prompt(task, outputs, label_map) - result = await run_judge( + judge_tasks.append(run_judge( prompt=prompt, system_prompt=JUDGE_SYSTEM_PROMPT, provider_name=provider, model=model, - ) + )) + judge_meta.append((provider, model, 
label_to_agent)) + + # Run all judges in parallel + results = await asyncio.gather(*judge_tasks, return_exceptions=True) - judge_models_used.append(f"{provider}/{model}") + judge_models_used = [] + + for result, (provider, model, label_to_agent) in zip(results, judge_meta): + if isinstance(result, Exception): + logger.warning("Judge %s/%s raised: %s", provider, model, result) + continue if not result.success: logger.warning("Judge %s/%s failed: %s", provider, model, result.error) @@ -159,6 +172,7 @@ async def evaluate( verdict = _parse_verdict(result.output, label_to_agent) if verdict: verdicts.append((verdict, f"{provider}/{model}")) + judge_models_used.append(f"{provider}/{model}") logger.info( "Judge %s/%s picked winner: %s (mapped to agent)", provider, diff --git a/src/mobius/providers/anthropic.py b/src/mobius/providers/anthropic.py index 62e1fa1..2e36a4a 100644 --- a/src/mobius/providers/anthropic.py +++ b/src/mobius/providers/anthropic.py @@ -37,6 +37,7 @@ async def run_agent( max_budget_usd: float = 0.05, timeout_seconds: int = 120, working_dir: str | None = None, + max_tokens: int = 16384, ) -> ProviderResult: """Run via Anthropic messages API, with tool loop if tools requested.""" api_key = _get_api_key() @@ -60,22 +61,22 @@ async def run_agent( if use_tools: return await self._run_with_tools( client, prompt, system_prompt, model, - max_turns, timeout_seconds, working_dir, + max_turns, timeout_seconds, working_dir, max_tokens, ) else: return await self._run_simple( - client, prompt, system_prompt, model, timeout_seconds, + client, prompt, system_prompt, model, timeout_seconds, max_tokens, ) async def _run_simple( self, client, prompt: str, system_prompt: str, - model: str, timeout_seconds: int, + model: str, timeout_seconds: int, max_tokens: int = 16384, ) -> ProviderResult: """Single-shot message, same as Google/OpenAI providers.""" try: response = await asyncio.wait_for( client.messages.create( - model=model, max_tokens=4096, + model=model, 
max_tokens=max_tokens, system=system_prompt, messages=[{"role": "user", "content": prompt}], ), @@ -103,7 +104,7 @@ async def _run_simple( async def _run_with_tools( self, client, prompt: str, system_prompt: str, model: str, max_turns: int, timeout_seconds: int, - working_dir: str | None = None, + working_dir: str | None = None, max_tokens: int = 16384, ) -> ProviderResult: """Agentic loop with bash tool use.""" messages = [{"role": "user", "content": prompt}] @@ -115,7 +116,7 @@ async def _run_with_tools( for turn in range(max_turns): response = await asyncio.wait_for( client.messages.create( - model=model, max_tokens=4096, + model=model, max_tokens=max_tokens, system=system_prompt, messages=messages, tools=[ANTHROPIC_BASH_TOOL], diff --git a/src/mobius/providers/base.py b/src/mobius/providers/base.py index 75f8ab9..4123fa5 100644 --- a/src/mobius/providers/base.py +++ b/src/mobius/providers/base.py @@ -39,6 +39,7 @@ async def run_agent( max_budget_usd: float = 0.05, timeout_seconds: int = 120, working_dir: str | None = None, + max_tokens: int = 16384, ) -> ProviderResult: """Execute an agent and return its output.""" ... 
diff --git a/src/mobius/providers/google.py b/src/mobius/providers/google.py index 2afa995..7aed169 100644 --- a/src/mobius/providers/google.py +++ b/src/mobius/providers/google.py @@ -38,6 +38,7 @@ async def run_agent( max_budget_usd: float = 0.05, timeout_seconds: int = 120, working_dir: str | None = None, + max_tokens: int = 16384, ) -> ProviderResult: """Execute via Google GenAI SDK, with tool loop if requested.""" api_key = _get_api_key() @@ -62,7 +63,7 @@ async def run_agent( if use_tools: return await self._run_with_tools( client, types, prompt, system_prompt, model, - max_turns, timeout_seconds, working_dir, + max_turns, timeout_seconds, working_dir, max_tokens, ) else: return await self._run_simple( @@ -108,7 +109,7 @@ async def _run_simple( async def _run_with_tools( self, client, types, prompt: str, system_prompt: str, model: str, max_turns: int, timeout_seconds: int, - working_dir: str | None = None, + working_dir: str | None = None, max_tokens: int = 16384, ) -> ProviderResult: """Agentic loop with function calling.""" # Build the tool declaration @@ -123,7 +124,7 @@ async def _run_with_tools( config = types.GenerateContentConfig( system_instruction=system_prompt, tools=[bash_tool], - max_output_tokens=4096, + max_output_tokens=max_tokens, ) contents = [types.Content( diff --git a/src/mobius/providers/openai.py b/src/mobius/providers/openai.py index 4800e9a..220dc00 100644 --- a/src/mobius/providers/openai.py +++ b/src/mobius/providers/openai.py @@ -29,6 +29,7 @@ async def run_agent( max_budget_usd: float = 0.05, timeout_seconds: int = 120, working_dir: str | None = None, + max_tokens: int = 16384, ) -> ProviderResult: """Execute via OpenAI chat completions, with tool loop if requested.""" try: @@ -45,16 +46,16 @@ async def run_agent( if use_tools: return await self._run_with_tools( client, prompt, system_prompt, model, - max_turns, timeout_seconds, working_dir, + max_turns, timeout_seconds, working_dir, max_tokens, ) else: return await 
self._run_simple( - client, prompt, system_prompt, model, timeout_seconds, + client, prompt, system_prompt, model, timeout_seconds, max_tokens, ) async def _run_simple( self, client, prompt: str, system_prompt: str, - model: str, timeout_seconds: int, + model: str, timeout_seconds: int, max_tokens: int = 16384, ) -> ProviderResult: """Single-shot completion, no tools.""" try: @@ -65,7 +66,7 @@ async def _run_simple( {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}, ], - max_tokens=4096, + max_tokens=max_tokens, ), timeout=timeout_seconds, ) @@ -92,7 +93,7 @@ async def _run_simple( async def _run_with_tools( self, client, prompt: str, system_prompt: str, model: str, max_turns: int, timeout_seconds: int, - working_dir: str | None = None, + working_dir: str | None = None, max_tokens: int = 16384, ) -> ProviderResult: """Agentic loop with function calling.""" messages = [ @@ -110,7 +111,7 @@ async def _run_with_tools( model=model, messages=messages, tools=[OPENAI_BASH_TOOL], - max_tokens=4096, + max_tokens=max_tokens, ), timeout=timeout_seconds, ) diff --git a/src/mobius/providers/openrouter.py b/src/mobius/providers/openrouter.py index e12b697..3be0e86 100644 --- a/src/mobius/providers/openrouter.py +++ b/src/mobius/providers/openrouter.py @@ -35,6 +35,7 @@ async def run_agent( max_budget_usd: float = 0.05, timeout_seconds: int = 120, working_dir: str | None = None, + max_tokens: int = 16384, ) -> ProviderResult: """Execute via OpenRouter (OpenAI-compatible endpoint).""" api_key = self._get_api_key() @@ -58,16 +59,16 @@ async def run_agent( if use_tools: return await self._run_with_tools( client, prompt, system_prompt, model, - max_turns, timeout_seconds, working_dir, + max_turns, timeout_seconds, working_dir, max_tokens, ) else: return await self._run_simple( - client, prompt, system_prompt, model, timeout_seconds, + client, prompt, system_prompt, model, timeout_seconds, max_tokens, ) async def _run_simple( self, client, prompt: str, 
system_prompt: str, - model: str, timeout_seconds: int, + model: str, timeout_seconds: int, max_tokens: int = 16384, ) -> ProviderResult: """Single-shot completion, no tools.""" try: @@ -78,7 +79,7 @@ async def _run_simple( {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}, ], - max_tokens=4096, + max_tokens=max_tokens, ), timeout=timeout_seconds, ) @@ -105,7 +106,7 @@ async def _run_simple( async def _run_with_tools( self, client, prompt: str, system_prompt: str, model: str, max_turns: int, timeout_seconds: int, - working_dir: str | None = None, + working_dir: str | None = None, max_tokens: int = 16384, ) -> ProviderResult: """Agentic loop with function calling (OpenAI-compatible).""" messages = [ @@ -123,7 +124,7 @@ async def _run_with_tools( model=model, messages=messages, tools=[OPENAI_BASH_TOOL], - max_tokens=4096, + max_tokens=max_tokens, ), timeout=timeout_seconds, )