Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/mobius/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def run(
task: str = typer.Argument(..., help="The task for agents to compete on"),
n: int = typer.Option(None, "--agents", "-n", help="Number of competing agents"),
no_ui: bool = typer.Option(False, "--no-ui", help="Disable live terminal UI"),
sandbox: bool = typer.Option(False, "--sandbox", help="Run agents in Docker sandbox"),
verbose: bool = typer.Option(False, "--verbose", "-v"),
):
"""Run a competition: select agents, execute in parallel, judge outputs."""
Expand All @@ -111,6 +112,8 @@ def run(

if n:
config.swarm_size = n
if sandbox:
config.sandbox_enabled = True

agent_count = registry.count_agents()
if agent_count == 0:
Expand Down
10 changes: 10 additions & 0 deletions src/mobius/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ class MobiusConfig(BaseModel):
agent_max_turns: int = 10
agent_budget_usd: float = 0.05

# Sandbox
sandbox_enabled: bool = False
sandbox_image: str = "python:3.12-slim"
sandbox_memory_limit: str = "512m"
sandbox_network: bool = False # no network access by default

# Judge
judge_models: list[dict[str, str]] = [
{"provider": "anthropic", "model": "claude-opus-4-6"},
Expand Down Expand Up @@ -89,5 +95,9 @@ def get_config() -> MobiusConfig:
config.swarm_size = int(val)
if val := os.environ.get("MOBIUS_BUDGET_USD"):
config.global_budget_usd = float(val)
if os.environ.get("MOBIUS_SANDBOX", "").lower() in ("1", "true", "yes"):
config.sandbox_enabled = True
if val := os.environ.get("MOBIUS_SANDBOX_IMAGE"):
config.sandbox_image = val

return config
27 changes: 26 additions & 1 deletion src/mobius/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
from __future__ import annotations

import logging
import os

from mobius.config import MobiusConfig
from mobius.db import vec_to_blob
from mobius.embedder import embed
from mobius.judge import JudgePanel
from mobius.memory import Memory
from mobius.models import AgentRecord, JudgeVerdict, MatchRecord, MemoryEntry
from mobius.providers.tools import create_sandbox, destroy_sandbox, set_sandbox
from mobius.selector import Selector
from mobius.swarm import Swarm, SwarmResult
from mobius.tournament import Tournament
Expand Down Expand Up @@ -125,7 +127,27 @@ async def run_competition(
[a.slug for a in agents],
)

# 2. Run swarm
# 2. Set up sandbox if enabled
sandbox_name = None
if self.config.sandbox_enabled:
if working_dir is None:
working_dir = os.getcwd()
try:
sandbox_name = create_sandbox(
image=self.config.sandbox_image,
memory_limit=self.config.sandbox_memory_limit,
network=self.config.sandbox_network,
working_dir=working_dir,
Comment thread
AaronGoldsmith marked this conversation as resolved.
)
set_sandbox(sandbox_name)
logger.info("Sandbox active: %s", sandbox_name)
except Exception as e:
raise RuntimeError(
f"Sandbox creation failed and sandbox_enabled=True, "
f"refusing to run on host: {e}"
) from e

# 3. Run swarm
ui = SwarmUI() if show_ui else None
if ui:
# Register agents for UI display
Expand All @@ -151,6 +173,9 @@ async def run_competition(
finally:
if ui:
ui.stop()
if sandbox_name:
set_sandbox(None)
destroy_sandbox(sandbox_name)

# 3. Check if we have enough outputs to judge
successful = swarm_result.successful_outputs
Expand Down
129 changes: 124 additions & 5 deletions src/mobius/providers/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,144 @@

Each provider has its own format for declaring tools, but the underlying
execution is identical: run a shell command, return the output.

When sandbox mode is enabled, commands run inside a disposable Docker
container instead of on the host.
"""

from __future__ import annotations

import logging
import os
import subprocess
import uuid

logger = logging.getLogger(__name__)


def run_command(command: str, timeout: int = 30, working_dir: str | None = None) -> str:
"""Execute a shell command and return output."""
# ---------------------------------------------------------------------------
# Sandbox container lifecycle
# ---------------------------------------------------------------------------

_active_containers: dict[str, str] = {} # name -> container id
_current_sandbox: str | None = None # set by orchestrator for current competition


def create_sandbox(
    image: str = "python:3.12-slim",
    memory_limit: str = "512m",
    network: bool = False,
    working_dir: str | None = None,
) -> str:
    """Create and start a warm sandbox container.

    The container runs ``sleep infinity`` so that later ``docker exec`` calls
    can reuse it. It is limited to one CPU and ``memory_limit`` of memory and
    uses ``/workspace`` as its working directory.

    Args:
        image: Docker image to create the container from.
        memory_limit: Value passed to ``docker create --memory``.
        network: When False, the container is created with ``--network none``.
        working_dir: Host directory to bind-mount at ``/workspace``. Relative
            paths are resolved against the current working directory. When
            falsy, nothing is mounted.

    Returns:
        The generated container name (``mobius-sandbox-<8 hex chars>``).

    Raises:
        RuntimeError: If ``docker create`` or ``docker start`` exits non-zero.
        subprocess.TimeoutExpired: If either docker call exceeds its timeout.
    """
    name = f"mobius-sandbox-{uuid.uuid4().hex[:8]}"
    cmd = [
        "docker", "create",
        "--name", name,
        "--memory", memory_limit,
        "--cpus", "1",
        "--workdir", "/workspace",
    ]
    if working_dir:
        # Docker interprets a non-absolute host part of -v as a *named volume*,
        # which would silently mount an empty volume instead of the project
        # directory. Always hand it an absolute path.
        cmd += ["-v", f"{os.path.abspath(working_dir)}:/workspace"]
    if not network:
        cmd += ["--network", "none"]
    cmd += [image, "sleep", "infinity"]

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    if result.returncode != 0:
        raise RuntimeError(f"Failed to create sandbox: {result.stderr.strip()}")

    try:
        start_result = subprocess.run(
            ["docker", "start", name],
            capture_output=True, text=True, timeout=10,
        )
        if start_result.returncode != 0:
            raise RuntimeError(
                f"Failed to start sandbox: {start_result.stderr.strip()}"
            )
    except (RuntimeError, subprocess.TimeoutExpired):
        # The container was created but never came up (non-zero exit or a
        # timeout); remove it so it does not linger on the host.
        subprocess.run(
            ["docker", "rm", "-f", name],
            capture_output=True, text=True, timeout=10,
        )
        raise

    # `docker create` prints the new container id on stdout.
    _active_containers[name] = result.stdout.strip()
    logger.info("Sandbox created: %s (image=%s, network=%s)", name, image, network)
    return name


def destroy_sandbox(name: str) -> None:
    """Stop and remove a sandbox container.

    This is best-effort cleanup: failures are logged as warnings and never
    raised. The container is dropped from the active-container registry only
    when ``docker rm -f`` exits with status 0, so a failed removal stays
    registered.

    Args:
        name: Name of the container to remove.
    """
    try:
        result = subprocess.run(
            ["docker", "rm", "-f", name],
            capture_output=True, text=True, timeout=15,
        )
        if result.returncode == 0:
            _active_containers.pop(name, None)
            logger.info("Sandbox destroyed: %s", name)
        else:
            logger.warning(
                "Failed to destroy sandbox %s: docker rm returned %d: %s",
                name, result.returncode, result.stderr.strip(),
            )
    except Exception as e:
        # Deliberately broad: cleanup must not mask the caller's own error
        # (e.g. docker binary missing, or the rm call timing out).
        logger.warning("Failed to destroy sandbox %s: %s", name, e)


def destroy_all_sandboxes() -> None:
    """Remove every sandbox container that is still registered as active."""
    # Snapshot the names first: destroy_sandbox() mutates the registry on
    # success, and a dict must not change size while being iterated.
    pending = list(_active_containers)
    for container_name in pending:
        destroy_sandbox(container_name)


def set_sandbox(name: str | None) -> None:
    """Select the sandbox that later run_command calls use by default.

    Args:
        name: Container name to make current, or None to clear the selection.
    """
    global _current_sandbox
    _current_sandbox = name


def get_current_sandbox() -> str | None:
    """Report which sandbox is currently selected.

    Returns:
        The container name last passed to set_sandbox, or None when no
        sandbox is selected.
    """
    return _current_sandbox


# ---------------------------------------------------------------------------
# Command execution
# ---------------------------------------------------------------------------

def run_command(
command: str,
timeout: int = 30,
working_dir: str | None = None,
sandbox: str | None = None,
) -> str:
"""Execute a shell command and return output.

Args:
command: The shell command to run.
timeout: Max seconds before killing the command.
working_dir: Working directory (host mode only).
sandbox: Container name to exec into. If None, uses current sandbox.
"""
sandbox = sandbox or _current_sandbox
try:
if sandbox:
if sandbox not in _active_containers:
raise RuntimeError(
f"Sandbox '{sandbox}' is not in active containers. "
"Refusing to fall back to host execution."
)
result = subprocess.run(
["docker", "exec", sandbox, "sh", "-lc", command],
capture_output=True, text=True, timeout=timeout,
encoding="utf-8", errors="replace",
)
else:
result = subprocess.run(
command, shell=True, capture_output=True, text=True,
timeout=timeout, cwd=working_dir or os.getcwd(),
encoding="utf-8", errors="replace",
)
output = result.stdout
if result.returncode != 0 and result.stderr:
output += f"\n[stderr]: {result.stderr}"
Expand Down
15 changes: 11 additions & 4 deletions src/mobius/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from mobius.providers.google import GoogleProvider
from mobius.providers.openai import OpenAIProvider
from mobius.providers.openrouter import OpenRouterProvider
from mobius.providers.tools import get_current_sandbox

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,10 +67,16 @@ def get_provider(provider_name: ProviderType) -> Provider:

def _build_context_prefix(agent: AgentRecord, working_dir: str) -> str:
"""Build an environment context string so agents know what they can do."""
lines = [
f"Working directory: {os.path.basename(working_dir)}",
_PLATFORM_LINE,
]
if get_current_sandbox():
lines = [
"Working directory: /workspace",
"Platform: Linux (sandboxed Docker container)",
]
else:
lines = [
f"Working directory: {os.path.basename(working_dir)}",
_PLATFORM_LINE,
]

# Only advertise tools that are actually wired up in providers.
tools = [t for t in (agent.tools or []) if t in _IMPLEMENTED_TOOLS]
Expand Down
Loading