Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions src/mobius/agent_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,71 @@ async def refine_agent(
logger.error("Invalid refined agent from builder: %s", e)
return None

async def critique_refinement(
    self,
    original: AgentRecord,
    refined: AgentRecord,
    feedback: str,
) -> dict | None:
    """Self-critique: evaluate whether a refinement addresses the judge feedback.

    Asks the builder model to compare the original and refined system prompts
    against the judge feedback that prompted the refinement.

    Args:
        original: Agent record before refinement.
        refined: Candidate agent record produced by the builder.
        feedback: Judge feedback the refinement was supposed to address.

    Returns:
        {"pass": bool, "summary": str} or None on failure.
    """
    prompt = f"""Evaluate whether this agent refinement actually addresses the judge feedback.

## Original Agent
Name: {original.name}
System prompt:
```
{original.system_prompt}
```

## Refined Agent
Name: {refined.name}
System prompt:
```
{refined.system_prompt}
```

## Judge Feedback That Prompted the Refinement
{feedback}

## Task
Evaluate whether the refined prompt meaningfully addresses the criticism.
Return JSON: {{"pass": true/false, "summary": "1-sentence explanation"}}
Only pass if the refinement makes substantive changes that address the feedback. Cosmetic rewording is a fail."""

    result = await run_judge(
        prompt=prompt,
        system_prompt="You are a critical evaluator of agent refinements. Return only valid JSON.",
        provider_name=self.builder_provider,
        model=self.builder_model,
    )

    if not result.success:
        logger.error("Critique failed: %s", result.error)
        return None

    # Try json.loads directly first — _parse_agent_json discards non-agent JSON.
    # json.JSONDecodeError is a subclass of ValueError, so one catch suffices.
    try:
        data = json.loads(result.output.strip())
    except ValueError:
        data = _parse_agent_json(result.output)

    if isinstance(data, dict) and "pass" in data:
        # Normalize: bool("false") is True, so handle string booleans properly
        pass_val = data["pass"]
        if isinstance(pass_val, str):
            pass_val = pass_val.strip().lower() not in ("false", "0", "no", "")
        return {
            "pass": bool(pass_val),
            "summary": str(data.get("summary", "")),
        }

    logger.warning("Critique returned unparseable result: %s", result.output[:200])
    return None

async def crossbreed(
self, agent_a: AgentRecord, agent_b: AgentRecord
) -> AgentRecord | None:
Expand Down
113 changes: 96 additions & 17 deletions src/mobius/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,44 +289,123 @@ def scout(
@app.command()
def evolve(
    specialization: str = typer.Argument(..., help="Specialization to evolve"),
    iterations: int = typer.Option(1, "--iterations", "-i", help="Refinement iterations per agent (agentic-eval loop)"),
    threshold: float = typer.Option(0.4, "--threshold", "-t", help="Win rate threshold — agents below this get evolved"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
):
    """Evolve underperforming agents for a specialization using judge feedback.

    Finds agents with win rates below the threshold and refines them
    using an evaluator-optimizer loop: each iteration generates an improved
    agent, then self-critiques the refinement before registering it.
    """
    if iterations < 1:
        console.print("[red]Error: --iterations must be >= 1[/red]")
        raise typer.Exit(1)
    if not (0 <= threshold <= 1):
        console.print("[red]Error: --threshold must be between 0 and 1[/red]")
        raise typer.Exit(1)

    _setup_logging(verbose)
    config, conn, registry, tournament = _get_components()[:4]
    from mobius.agent_builder import AgentBuilder

    # Target underperformers, not champions
    all_agents = registry.list_agents(specialization=specialization)
    if not all_agents:
        console.print(f"[red]No agents for '{specialization}'. Run 'mobius bootstrap' first.[/red]")
        raise typer.Exit(1)

    # Query each agent's recent win rate exactly once and carry it with the
    # record, so the display value matches the value used for eligibility.
    underperformers = []
    for a in all_agents:
        if a.total_matches < 3:
            continue
        rate = tournament.get_agent_recent_win_rate(a.id, window=config.underperformer_window)
        if rate < threshold:
            underperformers.append((a, rate))

    if not underperformers:
        console.print(f"[yellow]No underperformers below {threshold:.0%} win rate for '{specialization}'.[/yellow]")
        console.print("[dim]Agents need at least 3 matches to be eligible.[/dim]")
        raise typer.Exit(0)

    builder = AgentBuilder(config)
    evolved_count = 0

    for agent, win_rate in underperformers:
        matches = tournament.get_agent_matches(agent.id, limit=10)
        losses = [m for m in matches if m.winner_id != agent.id and not m.voided]

        if not losses:
            console.print(f"[yellow]{agent.name} has no recorded losses — skipping.[/yellow]")
            continue

        # Summarize up to 5 recent losses as feedback for the builder.
        feedback = "\n\n".join(
            f"Task: {m.task_description[:100]}\nJudge: {m.judge_reasoning[:200]}"
            for m in losses[:5]
        )

        console.print(
            f"\n[bold]Evolving {agent.name}[/bold] "
            f"(win rate: {win_rate:.0%}, gen {agent.generation})"
        )

        # Evaluator-optimizer loop: refine, self-critique, repeat
        candidate = agent
        candidate_feedback = feedback
        best_candidate = None
        original_parent_id = agent.id  # preserve lineage across iterations

        for iteration in range(iterations):
            if iterations > 1:
                console.print(f"  [dim]Iteration {iteration + 1}/{iterations}[/dim]")

            improved = asyncio.run(builder.refine_agent(candidate, candidate_feedback))
            if not improved:
                console.print(f"  [red]Refinement failed at iteration {iteration + 1}[/red]")
                break

            # Always point lineage back to the original agent, not intermediate candidates
            improved.parent_id = original_parent_id

            if iterations == 1:
                # Single-shot mode: no self-critique pass.
                best_candidate = improved
                continue

            # Self-critique: evaluate if the refinement actually addresses the feedback
            critique = asyncio.run(builder.critique_refinement(
                original=agent,
                refined=improved,
                feedback=feedback,
            ))
            if critique and critique.get("pass"):
                console.print(f"  [green]Self-critique passed: {critique.get('summary', '')}[/green]")
                best_candidate = improved
                break
            elif critique:
                console.print(f"  [yellow]Self-critique: {critique.get('summary', '')}[/yellow]")
                # Feed only the critique summary for next iteration (not original feedback)
                candidate = improved
                candidate_feedback = (
                    f"Address this critique of your previous attempt:\n"
                    f"{critique.get('summary', '')}"
                )
                best_candidate = improved  # keep latest even if not perfect
            else:
                # Critique itself failed — accept the refinement and stop iterating.
                best_candidate = improved
                break

        if best_candidate:
            # Avoid slug collisions with already-registered agents.
            if registry.get_agent_by_slug(best_candidate.slug):
                best_candidate.slug = f"{best_candidate.slug}-{best_candidate.id[:6]}"
            registry.create_agent(best_candidate)
            evolved_count += 1
            console.print(
                f"  [green]Created: {best_candidate.name} (gen {best_candidate.generation})[/green]"
            )
        else:
            console.print(f"  [red]Failed to create improved version of {agent.name}[/red]")

    console.print(f"\n[bold]Evolved {evolved_count}/{len(underperformers)} underperformers.[/bold]")
    conn.close()


Expand Down
1 change: 1 addition & 0 deletions src/mobius/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class MobiusConfig(BaseModel):
agent_timeout_seconds: int = 120
agent_max_turns: int = 10
agent_budget_usd: float = 0.05
agent_max_output_tokens: int = 16384
Comment thread
AaronGoldsmith marked this conversation as resolved.

# Judge
judge_models: list[dict[str, str]] = [
Expand Down
22 changes: 18 additions & 4 deletions src/mobius/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import json
import logging
import random
Expand Down Expand Up @@ -129,7 +130,10 @@ async def evaluate(
labels = list(string.ascii_uppercase[: len(agent_ids)])

verdicts: list[tuple[JudgeVerdict, str]] = [] # (verdict, model)
judge_models_used: list[str] = []

# Prepare per-judge data (independent shuffle per judge)
judge_tasks = []
judge_meta = [] # (provider, model, label_to_agent)

for judge_config in self.config.judge_models:
provider = judge_config["provider"]
Expand All @@ -143,14 +147,23 @@ async def evaluate(

prompt = _build_judge_prompt(task, outputs, label_map)

result = await run_judge(
judge_tasks.append(run_judge(
prompt=prompt,
system_prompt=JUDGE_SYSTEM_PROMPT,
provider_name=provider,
model=model,
)
))
judge_meta.append((provider, model, label_to_agent))

# Run all judges in parallel
results = await asyncio.gather(*judge_tasks, return_exceptions=True)

judge_models_used.append(f"{provider}/{model}")
judge_models_used = []

for result, (provider, model, label_to_agent) in zip(results, judge_meta):
if isinstance(result, Exception):
logger.warning("Judge %s/%s raised: %s", provider, model, result)
continue

if not result.success:
logger.warning("Judge %s/%s failed: %s", provider, model, result.error)
Expand All @@ -159,6 +172,7 @@ async def evaluate(
verdict = _parse_verdict(result.output, label_to_agent)
if verdict:
verdicts.append((verdict, f"{provider}/{model}"))
judge_models_used.append(f"{provider}/{model}")
logger.info(
"Judge %s/%s picked winner: %s (mapped to agent)",
provider,
Expand Down
13 changes: 7 additions & 6 deletions src/mobius/providers/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Run via Anthropic messages API, with tool loop if tools requested."""
api_key = _get_api_key()
Expand All @@ -60,22 +61,22 @@ async def run_agent(
if use_tools:
return await self._run_with_tools(
client, prompt, system_prompt, model,
max_turns, timeout_seconds, working_dir,
max_turns, timeout_seconds, working_dir, max_tokens,
)
else:
return await self._run_simple(
client, prompt, system_prompt, model, timeout_seconds,
client, prompt, system_prompt, model, timeout_seconds, max_tokens,
)

async def _run_simple(
self, client, prompt: str, system_prompt: str,
model: str, timeout_seconds: int,
model: str, timeout_seconds: int, max_tokens: int = 16384,
) -> ProviderResult:
"""Single-shot message, same as Google/OpenAI providers."""
try:
response = await asyncio.wait_for(
client.messages.create(
model=model, max_tokens=4096,
model=model, max_tokens=max_tokens,
system=system_prompt,
messages=[{"role": "user", "content": prompt}],
),
Expand Down Expand Up @@ -103,7 +104,7 @@ async def _run_simple(
async def _run_with_tools(
self, client, prompt: str, system_prompt: str,
model: str, max_turns: int, timeout_seconds: int,
working_dir: str | None = None,
working_dir: str | None = None, max_tokens: int = 16384,
) -> ProviderResult:
"""Agentic loop with bash tool use."""
messages = [{"role": "user", "content": prompt}]
Expand All @@ -115,7 +116,7 @@ async def _run_with_tools(
for turn in range(max_turns):
response = await asyncio.wait_for(
client.messages.create(
model=model, max_tokens=4096,
model=model, max_tokens=max_tokens,
system=system_prompt,
messages=messages,
tools=[ANTHROPIC_BASH_TOOL],
Expand Down
1 change: 1 addition & 0 deletions src/mobius/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Execute an agent and return its output."""
...
Expand Down
7 changes: 4 additions & 3 deletions src/mobius/providers/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ async def run_agent(
max_budget_usd: float = 0.05,
timeout_seconds: int = 120,
working_dir: str | None = None,
max_tokens: int = 16384,
) -> ProviderResult:
"""Execute via Google GenAI SDK, with tool loop if requested."""
api_key = _get_api_key()
Expand All @@ -62,7 +63,7 @@ async def run_agent(
if use_tools:
return await self._run_with_tools(
client, types, prompt, system_prompt, model,
max_turns, timeout_seconds, working_dir,
max_turns, timeout_seconds, working_dir, max_tokens,
)
else:
return await self._run_simple(
Expand Down Expand Up @@ -108,7 +109,7 @@ async def _run_simple(
async def _run_with_tools(
self, client, types, prompt: str, system_prompt: str,
model: str, max_turns: int, timeout_seconds: int,
working_dir: str | None = None,
working_dir: str | None = None, max_tokens: int = 16384,
) -> ProviderResult:
"""Agentic loop with function calling."""
# Build the tool declaration
Expand All @@ -123,7 +124,7 @@ async def _run_with_tools(
config = types.GenerateContentConfig(
system_instruction=system_prompt,
tools=[bash_tool],
max_output_tokens=4096,
max_output_tokens=max_tokens,
)
Comment thread
AaronGoldsmith marked this conversation as resolved.

contents = [types.Content(
Expand Down
Loading
Loading