AaronGoldsmith · AaronGoldsmith · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/src/mobius/cli.py b/src/mobius/cli.py
@@ -103,6 +103,7 @@ def run(
     task: str = typer.Argument(..., help="The task for agents to compete on"),
     n: int = typer.Option(None, "--agents", "-n", help="Number of competing agents"),
     no_ui: bool = typer.Option(False, "--no-ui", help="Disable live terminal UI"),
+    sandbox: bool = typer.Option(False, "--sandbox", help="Run agents in Docker sandbox"),
     verbose: bool = typer.Option(False, "--verbose", "-v"),
 ):
     """Run a competition: select agents, execute in parallel, judge outputs."""
@@ -111,6 +112,8 @@ def run(
 
     if n:
         config.swarm_size = n
+    if sandbox:
+        config.sandbox_enabled = True
 
     agent_count = registry.count_agents()
     if agent_count == 0:

diff --git a/src/mobius/config.py b/src/mobius/config.py
@@ -24,6 +24,12 @@ class MobiusConfig(BaseModel):
     agent_max_turns: int = 10
     agent_budget_usd: float = 0.05
 
+    # Sandbox
+    sandbox_enabled: bool = False
+    sandbox_image: str = "python:3.12-slim"
+    sandbox_memory_limit: str = "512m"
+    sandbox_network: bool = False  # no network access by default
+
     # Judge
     judge_models: list[dict[str, str]] = [
         {"provider": "anthropic", "model": "claude-opus-4-6"},
@@ -89,5 +95,9 @@ def get_config() -> MobiusConfig:
         config.swarm_size = int(val)
     if val := os.environ.get("MOBIUS_BUDGET_USD"):
         config.global_budget_usd = float(val)
+    if os.environ.get("MOBIUS_SANDBOX", "").lower() in ("1", "true", "yes"):
+        config.sandbox_enabled = True
+    if val := os.environ.get("MOBIUS_SANDBOX_IMAGE"):
+        config.sandbox_image = val
 
     return config
diff --git a/src/mobius/orchestrator.py b/src/mobius/orchestrator.py
@@ -3,13 +3,15 @@
 from __future__ import annotations
 
 import logging
+import os
 
 from mobius.config import MobiusConfig
 from mobius.db import vec_to_blob
 from mobius.embedder import embed
 from mobius.judge import JudgePanel
 from mobius.memory import Memory
 from mobius.models import AgentRecord, JudgeVerdict, MatchRecord, MemoryEntry
+from mobius.providers.tools import create_sandbox, destroy_sandbox, set_sandbox
 from mobius.selector import Selector
 from mobius.swarm import Swarm, SwarmResult
 from mobius.tournament import Tournament
@@ -125,7 +127,27 @@ async def run_competition(
             [a.slug for a in agents],
         )
 
-        # 2. Run swarm
+        # 2. Set up sandbox if enabled
+        sandbox_name = None
+        if self.config.sandbox_enabled:
+            if working_dir is None:
+                working_dir = os.getcwd()
+            try:
+                sandbox_name = create_sandbox(
+                    image=self.config.sandbox_image,
+                    memory_limit=self.config.sandbox_memory_limit,
+                    network=self.config.sandbox_network,
+                    working_dir=working_dir,
+                )
+                set_sandbox(sandbox_name)
+                logger.info("Sandbox active: %s", sandbox_name)
+            except Exception as e:
+                raise RuntimeError(
+                    f"Sandbox creation failed and sandbox_enabled=True, "
+                    f"refusing to run on host: {e}"
+                ) from e
+
+        # 2b. Run swarm
         ui = SwarmUI() if show_ui else None
         if ui:
             # Register agents for UI display
@@ -151,6 +173,9 @@ async def run_competition(
         finally:
             if ui:
                 ui.stop()
+            if sandbox_name:
+                set_sandbox(None)
+                destroy_sandbox(sandbox_name)
 
         # 3. Check if we have enough outputs to judge
         successful = swarm_result.successful_outputs

diff --git a/src/mobius/providers/tools.py b/src/mobius/providers/tools.py
@@ -2,25 +2,144 @@
 
 Each provider has its own format for declaring tools, but the underlying
 execution is identical: run a shell command, return the output.
+
+When sandbox mode is enabled, commands run inside a disposable Docker
+container instead of on the host.
 """
 
 from __future__ import annotations
 
 import logging
 import os
 import subprocess
+import uuid
 
 logger = logging.getLogger(__name__)
 
 
-def run_command(command: str, timeout: int = 30, working_dir: str | None = None) -> str:
-    """Execute a shell command and return output."""
+# ---------------------------------------------------------------------------
+# Sandbox container lifecycle
+# ---------------------------------------------------------------------------
+
+_active_containers: dict[str, str] = {}  # name -> container id
+_current_sandbox: str | None = None  # set by orchestrator for current competition
+
+
+def create_sandbox(
+    image: str = "python:3.12-slim",
+    memory_limit: str = "512m",
+    network: bool = False,
+    working_dir: str | None = None,
+) -> str:
+    """Create and start an idle sandbox container and return its name.
+
+    Raises RuntimeError if `docker create` or `docker start` exits non-zero.
+    """
+    name = f"mobius-sandbox-{uuid.uuid4().hex[:8]}"
+    cmd = [
+        "docker", "create", "--name", name,
+        "--memory", memory_limit, "--cpus", "1", "--workdir", "/workspace",
+    ]
+    if working_dir:
+        # Docker reads a non-absolute -v source as a named volume, not a path.
+        cmd += ["-v", f"{os.path.abspath(working_dir)}:/workspace"]
+    if not network:
+        cmd += ["--network", "none"]
+    cmd += [image, "sleep", "infinity"]
+
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    if result.returncode != 0:
+        raise RuntimeError(f"Failed to create sandbox: {result.stderr.strip()}")
+
+    start_result = subprocess.run(
+        ["docker", "start", name], capture_output=True, text=True, timeout=10,
+    )
+    if start_result.returncode != 0:
+        # Cleanup the created-but-not-started container
+        subprocess.run(
+            ["docker", "rm", "-f", name],
+            capture_output=True, text=True, timeout=10,
+        )
+        raise RuntimeError(f"Failed to start sandbox: {start_result.stderr.strip()}")
+
+    _active_containers[name] = result.stdout.strip()
+    logger.info("Sandbox created: %s (image=%s, network=%s)", name, image, network)
+    return name
+
+
+def destroy_sandbox(name: str) -> None:
+    """Force-remove a sandbox container; errors are logged as warnings, not raised."""
     try:
         result = subprocess.run(
-            command, shell=True, capture_output=True, text=True,
-            timeout=timeout, cwd=working_dir or os.getcwd(),
-            encoding="utf-8", errors="replace",
+            ["docker", "rm", "-f", name],
+            capture_output=True, text=True, timeout=15,
         )
+        if result.returncode == 0:
+            _active_containers.pop(name, None)  # forget it only once removal succeeded
+            logger.info("Sandbox destroyed: %s", name)
+        else:
+            logger.warning(
+                "Failed to destroy sandbox %s: docker rm returned %d: %s",
+                name, result.returncode, result.stderr.strip(),
+            )
+    except Exception as e:
+        logger.warning("Failed to destroy sandbox %s: %s", name, e)
+
+
+def destroy_all_sandboxes() -> None:
+    """Call destroy_sandbox on every container currently tracked as active."""
+    for container_name in tuple(_active_containers):  # snapshot: entries get popped
+        destroy_sandbox(container_name)
+
+
+def set_sandbox(name: str | None) -> None:
+    """Set the module-wide default sandbox used by run_command (None clears it)."""
+    global _current_sandbox
+    _current_sandbox = name
+
+
+def get_current_sandbox() -> str | None:
+    """Return the sandbox name last passed to set_sandbox, or None if unset."""
+    return _current_sandbox
+
+
+# ---------------------------------------------------------------------------
+# Command execution
+# ---------------------------------------------------------------------------
+
+def run_command(
+    command: str,
+    timeout: int = 30,
+    working_dir: str | None = None,
+    sandbox: str | None = None,
+) -> str:
+    """Execute a shell command and return output.
+
+    Args:
+        command: The shell command to run.
+        timeout: Max seconds before killing the command.
+        working_dir: Working directory (host mode only).
+        sandbox: Container name to exec into. If None, uses current sandbox.
+    """
+    sandbox = sandbox or _current_sandbox
+    try:
+        if sandbox:
+            if sandbox not in _active_containers:
+                raise RuntimeError(
+                    f"Sandbox '{sandbox}' is not in active containers. "
+                    "Refusing to fall back to host execution."
+                )
+            result = subprocess.run(
+                ["docker", "exec", sandbox, "sh", "-lc", command],
+                capture_output=True, text=True, timeout=timeout,
+                encoding="utf-8", errors="replace",
+            )
+        else:
+            result = subprocess.run(
+                command, shell=True, capture_output=True, text=True,
+                timeout=timeout, cwd=working_dir or os.getcwd(),
+                encoding="utf-8", errors="replace",
+            )
         output = result.stdout
         if result.returncode != 0 and result.stderr:
             output += f"\n[stderr]: {result.stderr}"

diff --git a/src/mobius/runner.py b/src/mobius/runner.py
@@ -12,6 +12,7 @@
 from mobius.providers.google import GoogleProvider
 from mobius.providers.openai import OpenAIProvider
 from mobius.providers.openrouter import OpenRouterProvider
+from mobius.providers.tools import get_current_sandbox
 
 logger = logging.getLogger(__name__)
 
@@ -66,10 +67,16 @@ def get_provider(provider_name: ProviderType) -> Provider:
 
 def _build_context_prefix(agent: AgentRecord, working_dir: str) -> str:
     """Build an environment context string so agents know what they can do."""
-    lines = [
-        f"Working directory: {os.path.basename(working_dir)}",
-        _PLATFORM_LINE,
-    ]
+    if get_current_sandbox():
+        lines = [
+            "Working directory: /workspace",
+            "Platform: Linux (sandboxed Docker container)",
+        ]
+    else:
+        lines = [
+            f"Working directory: {os.path.basename(working_dir)}",
+            _PLATFORM_LINE,
+        ]
 
     # Only advertise tools that are actually wired up in providers.
     tools = [t for t in (agent.tools or []) if t in _IMPLEMENTED_TOOLS]