diff --git a/src/mobius/cli.py b/src/mobius/cli.py index 84e4ddc..6dc3108 100644 --- a/src/mobius/cli.py +++ b/src/mobius/cli.py @@ -103,6 +103,7 @@ def run( task: str = typer.Argument(..., help="The task for agents to compete on"), n: int = typer.Option(None, "--agents", "-n", help="Number of competing agents"), no_ui: bool = typer.Option(False, "--no-ui", help="Disable live terminal UI"), + sandbox: bool = typer.Option(False, "--sandbox", help="Run agents in Docker sandbox"), verbose: bool = typer.Option(False, "--verbose", "-v"), ): """Run a competition: select agents, execute in parallel, judge outputs.""" @@ -111,6 +112,8 @@ def run( if n: config.swarm_size = n + if sandbox: + config.sandbox_enabled = True agent_count = registry.count_agents() if agent_count == 0: diff --git a/src/mobius/config.py b/src/mobius/config.py index f90fbba..126fe60 100644 --- a/src/mobius/config.py +++ b/src/mobius/config.py @@ -24,6 +24,12 @@ class MobiusConfig(BaseModel): agent_max_turns: int = 10 agent_budget_usd: float = 0.05 + # Sandbox + sandbox_enabled: bool = False + sandbox_image: str = "python:3.12-slim" + sandbox_memory_limit: str = "512m" + sandbox_network: bool = False # no network access by default + # Judge judge_models: list[dict[str, str]] = [ {"provider": "anthropic", "model": "claude-opus-4-6"}, @@ -89,5 +95,9 @@ def get_config() -> MobiusConfig: config.swarm_size = int(val) if val := os.environ.get("MOBIUS_BUDGET_USD"): config.global_budget_usd = float(val) + if os.environ.get("MOBIUS_SANDBOX", "").lower() in ("1", "true", "yes"): + config.sandbox_enabled = True + if val := os.environ.get("MOBIUS_SANDBOX_IMAGE"): + config.sandbox_image = val return config diff --git a/src/mobius/orchestrator.py b/src/mobius/orchestrator.py index bbff38f..91d4253 100644 --- a/src/mobius/orchestrator.py +++ b/src/mobius/orchestrator.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import os from mobius.config import MobiusConfig from mobius.db import vec_to_blob @@ -10,6 +11,7 @@ from mobius.judge import JudgePanel from mobius.memory import Memory from mobius.models import AgentRecord, JudgeVerdict, MatchRecord, MemoryEntry +from mobius.providers.tools import create_sandbox, destroy_sandbox, set_sandbox from mobius.selector import Selector from mobius.swarm import Swarm, SwarmResult from mobius.tournament import Tournament @@ -125,7 +127,27 @@ async def run_competition( [a.slug for a in agents], ) - # 2. Run swarm + # 2. Set up sandbox if enabled + sandbox_name = None + if self.config.sandbox_enabled: + if working_dir is None: + working_dir = os.getcwd() + try: + sandbox_name = create_sandbox( + image=self.config.sandbox_image, + memory_limit=self.config.sandbox_memory_limit, + network=self.config.sandbox_network, + working_dir=working_dir, + ) + set_sandbox(sandbox_name) + logger.info("Sandbox active: %s", sandbox_name) + except Exception as e: + raise RuntimeError( + f"Sandbox creation failed and sandbox_enabled=True, " + f"refusing to run on host: {e}" + ) from e + + # 3. Run swarm ui = SwarmUI() if show_ui else None if ui: # Register agents for UI display @@ -151,6 +173,9 @@ async def run_competition( finally: if ui: ui.stop() + if sandbox_name: + set_sandbox(None) + destroy_sandbox(sandbox_name) # 3. Check if we have enough outputs to judge successful = swarm_result.successful_outputs diff --git a/src/mobius/providers/tools.py b/src/mobius/providers/tools.py index e853284..1de7dbb 100644 --- a/src/mobius/providers/tools.py +++ b/src/mobius/providers/tools.py @@ -2,6 +2,9 @@ Each provider has its own format for declaring tools, but the underlying execution is identical: run a shell command, return the output. + +When sandbox mode is enabled, commands run inside a disposable Docker +container instead of on the host. """ from __future__ import annotations @@ -9,18 +12,134 @@ import logging import os import subprocess +import uuid logger = logging.getLogger(__name__) -def run_command(command: str, timeout: int = 30, working_dir: str | None = None) -> str: - """Execute a shell command and return output.""" +# --------------------------------------------------------------------------- +# Sandbox container lifecycle +# --------------------------------------------------------------------------- + +_active_containers: dict[str, str] = {} # name -> container id +_current_sandbox: str | None = None # set by orchestrator for current competition + + +def create_sandbox( + image: str = "python:3.12-slim", + memory_limit: str = "512m", + network: bool = False, + working_dir: str | None = None, +) -> str: + """Create and start a warm sandbox container. Returns container name.""" + name = f"mobius-sandbox-{uuid.uuid4().hex[:8]}" + cmd = [ + "docker", "create", + "--name", name, + "--memory", memory_limit, + "--cpus", "1", + "--workdir", "/workspace", + ] + if working_dir: + cmd += ["-v", f"{working_dir}:/workspace"] + if not network: + cmd += ["--network", "none"] + cmd += [image, "sleep", "infinity"] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + raise RuntimeError(f"Failed to create sandbox: {result.stderr.strip()}") + + start_result = subprocess.run( + ["docker", "start", name], + capture_output=True, text=True, timeout=10, + ) + if start_result.returncode != 0: + # Cleanup the created-but-not-started container + subprocess.run( + ["docker", "rm", "-f", name], + capture_output=True, text=True, timeout=10, + ) + raise RuntimeError(f"Failed to start sandbox: {start_result.stderr.strip()}") + + _active_containers[name] = result.stdout.strip() + logger.info("Sandbox created: %s (image=%s, network=%s)", name, image, network) + return name + + +def destroy_sandbox(name: str) -> None: + """Stop and remove a sandbox container.""" try: result = subprocess.run( - command, shell=True, capture_output=True, text=True, - timeout=timeout, cwd=working_dir or os.getcwd(), - encoding="utf-8", errors="replace", + ["docker", "rm", "-f", name], + capture_output=True, text=True, timeout=15, ) + if result.returncode == 0: + _active_containers.pop(name, None) + logger.info("Sandbox destroyed: %s", name) + else: + logger.warning( + "Failed to destroy sandbox %s: docker rm returned %d: %s", + name, result.returncode, result.stderr.strip(), + ) + except Exception as e: + logger.warning("Failed to destroy sandbox %s: %s", name, e) + + +def destroy_all_sandboxes() -> None: + """Clean up all active sandbox containers.""" + for name in list(_active_containers): + destroy_sandbox(name) + + +def set_sandbox(name: str | None) -> None: + """Set the active sandbox for all subsequent run_command calls.""" + global _current_sandbox + _current_sandbox = name + + +def get_current_sandbox() -> str | None: + """Return the name of the currently active sandbox, or None.""" + return _current_sandbox + + +# --------------------------------------------------------------------------- +# Command execution +# --------------------------------------------------------------------------- + +def run_command( + command: str, + timeout: int = 30, + working_dir: str | None = None, + sandbox: str | None = None, +) -> str: + """Execute a shell command and return output. + + Args: + command: The shell command to run. + timeout: Max seconds before killing the command. + working_dir: Working directory (host mode only). + sandbox: Container name to exec into. If None, uses current sandbox. + """ + sandbox = sandbox or _current_sandbox + try: + if sandbox: + if sandbox not in _active_containers: + raise RuntimeError( + f"Sandbox '{sandbox}' is not in active containers. " + "Refusing to fall back to host execution." + ) + result = subprocess.run( + ["docker", "exec", sandbox, "sh", "-lc", command], + capture_output=True, text=True, timeout=timeout, + encoding="utf-8", errors="replace", + ) + else: + result = subprocess.run( + command, shell=True, capture_output=True, text=True, + timeout=timeout, cwd=working_dir or os.getcwd(), + encoding="utf-8", errors="replace", + ) output = result.stdout if result.returncode != 0 and result.stderr: output += f"\n[stderr]: {result.stderr}" diff --git a/src/mobius/runner.py b/src/mobius/runner.py index 1749e3f..2ca46f9 100644 --- a/src/mobius/runner.py +++ b/src/mobius/runner.py @@ -12,6 +12,7 @@ from mobius.providers.google import GoogleProvider from mobius.providers.openai import OpenAIProvider from mobius.providers.openrouter import OpenRouterProvider +from mobius.providers.tools import get_current_sandbox logger = logging.getLogger(__name__) @@ -66,10 +67,16 @@ def get_provider(provider_name: ProviderType) -> Provider: def _build_context_prefix(agent: AgentRecord, working_dir: str) -> str: """Build an environment context string so agents know what they can do.""" - lines = [ - f"Working directory: {os.path.basename(working_dir)}", - _PLATFORM_LINE, - ] + if get_current_sandbox(): + lines = [ + "Working directory: /workspace", + "Platform: Linux (sandboxed Docker container)", + ] + else: + lines = [ + f"Working directory: {os.path.basename(working_dir)}", + _PLATFORM_LINE, + ] # Only advertise tools that are actually wired up in providers. tools = [t for t in (agent.tools or []) if t in _IMPLEMENTED_TOOLS]