From a64b5b70e0ef49b4360b1530ea62ea4039517ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:39:22 +0900 Subject: [PATCH 01/12] feat: add interface contracts for harness/adapter boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AgentWorkflow / EvaluatorContract Protocol 정의로 editable harness와 fixed adapter 사이의 경계를 추상화함. AST 기반 테스트로 heavy runtime dependency 없이 agent.py 호환성 검증. --- contracts.py | 48 +++++++++++++++++++++++ tests/__init__.py | 0 tests/test_contracts.py | 86 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 contracts.py create mode 100644 tests/__init__.py create mode 100644 tests/test_contracts.py diff --git a/contracts.py b/contracts.py new file mode 100644 index 0000000..843f94c --- /dev/null +++ b/contracts.py @@ -0,0 +1,48 @@ +"""Interface contracts for the harness/adapter boundary. + +These Protocol classes define the shape that the editable harness (agent.py) +must satisfy. The fixed adapter (AutoAgent) depends on these abstractions, not +on concrete implementations. +""" + +from __future__ import annotations + +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class AgentWorkflow(Protocol): + """Contract for the editable harness layer. + + Implementors provide tool construction, agent construction, and + task-execution orchestration. The fixed adapter calls these three + entry-points only — nothing else crosses the boundary. + """ + + def create_tools(self, environment: Any) -> list: + """Return a list of tools configured for *environment*.""" + ... + + def create_agent(self, environment: Any) -> Any: + """Build and return a configured agent instance.""" + ... + + async def run_task(self, environment: Any, instruction: str) -> dict: + """Execute a task and return a result mapping.""" + ... 
+ + +@runtime_checkable +class EvaluatorContract(Protocol): + """Contract for trajectory evaluators. + + Evaluators receive a trajectory dict and an expected-outcome dict and + return a scalar score in the range [0.0, 1.0]. + """ + + def score(self, trajectory: dict, expected: dict) -> float: + """Score *trajectory* against *expected*. Returns a float in [0, 1].""" + ... + + +__all__ = ["AgentWorkflow", "EvaluatorContract"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_contracts.py b/tests/test_contracts.py new file mode 100644 index 0000000..4f61cb5 --- /dev/null +++ b/tests/test_contracts.py @@ -0,0 +1,86 @@ +"""Tests for contracts.py — interface contract definitions.""" + +from __future__ import annotations + +import importlib +import inspect + + +def test_contracts_module_exists() -> None: + """contracts module must be importable.""" + contracts = importlib.import_module("contracts") + assert contracts is not None + + +def test_agent_workflow_protocol_exists() -> None: + """contracts module must expose AgentWorkflow.""" + contracts = importlib.import_module("contracts") + assert hasattr(contracts, "AgentWorkflow"), "AgentWorkflow not found in contracts" + + +def test_evaluator_contract_protocol_exists() -> None: + """contracts module must expose EvaluatorContract.""" + contracts = importlib.import_module("contracts") + assert hasattr(contracts, "EvaluatorContract"), "EvaluatorContract not found in contracts" + + +def test_agent_workflow_has_required_methods() -> None: + """AgentWorkflow Protocol must declare run_task, create_tools, create_agent.""" + contracts = importlib.import_module("contracts") + AgentWorkflow = contracts.AgentWorkflow + + required = {"run_task", "create_tools", "create_agent"} + # Protocol methods appear as annotations or as actual members + members = set(dir(AgentWorkflow)) + assert required <= members, f"Missing methods: {required - members}" + + +def 
test_evaluator_contract_has_score_method() -> None: + """EvaluatorContract Protocol must declare score.""" + contracts = importlib.import_module("contracts") + EvaluatorContract = contracts.EvaluatorContract + + assert "score" in dir(EvaluatorContract), "score method not found in EvaluatorContract" + + +def test_agent_py_functions_are_callable() -> None: + """agent.py must define create_tools, create_agent, run_task as top-level functions. + + Verified via AST to avoid importing heavy runtime dependencies (harbor, openai-agents). + """ + import ast + from pathlib import Path # noqa: PLC0415 + + source = (Path(__file__).parent.parent / "agent.py").read_text() + tree = ast.parse(source) + + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and isinstance(node.col_offset, int) + and node.col_offset == 0 # module-level only + } + + for name in ("create_tools", "create_agent", "run_task"): + assert name in top_level_funcs, f"{name} not found as a top-level function in agent.py" + + +def test_run_task_is_coroutine_function() -> None: + """run_task in agent.py must be declared as async to satisfy the async contract. + + Verified via AST to avoid importing heavy runtime dependencies. 
+ """ + import ast + from pathlib import Path # noqa: PLC0415 + + source = (Path(__file__).parent.parent / "agent.py").read_text() + tree = ast.parse(source) + + async_top_level = { + node.name + for node in ast.walk(tree) + if isinstance(node, ast.AsyncFunctionDef) and node.col_offset == 0 + } + + assert "run_task" in async_top_level, "run_task must be an async def in agent.py" From f83c79bafaf411cb8103f7e7b89eac564272ca81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:42:34 +0900 Subject: [PATCH 02/12] refactor: extract fixed adapter boundary into adapter.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harbor 통합 코드(to_atif, AutoAgent)를 별도 adapter.py로 분리해 editable harness(agent.py)와 고정 어댑터 경계를 물리적으로 구분함. agent.py 하단에서 re-export해 Harbor의 agent:AutoAgent 진입점 호환성 유지. --- adapter.py | 191 ++++++++++++++++++++++++++++++++++++++++++ agent.py | 180 +-------------------------------------- tests/test_adapter.py | 123 +++++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 179 deletions(-) create mode 100644 adapter.py create mode 100644 tests/test_adapter.py diff --git a/adapter.py b/adapter.py new file mode 100644 index 0000000..5eaae1d --- /dev/null +++ b/adapter.py @@ -0,0 +1,191 @@ +"""Fixed Harbor adapter — DO NOT MODIFY. Read-only in production.""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone + +from agents.items import ( + ItemHelpers, + MessageOutputItem, + ReasoningItem, + ToolCallItem, + ToolCallOutputItem, +) +from agents.usage import Usage +from harbor.agents.base import BaseAgent +from harbor.environments.base import BaseEnvironment +from harbor.models.agent.context import AgentContext + +# Import editable harness entry-points (run_task, MODEL) from agent.py. 
+# agent.py imports *this* module at the bottom, so by the time Python resolves +# this import agent.py's module-level symbols are already defined — no cycle. +from agent import MODEL, run_task # noqa: E402 + + +def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: + """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.""" + steps: list[dict] = [] + step_id = 0 + now = datetime.now(timezone.utc).isoformat() + + def _step(source: str, message: str, **extra: object) -> dict: + nonlocal step_id + step_id += 1 + step = { + "step_id": step_id, + "timestamp": now, + "source": source, + "message": message, + } + step.update({key: value for key, value in extra.items() if value is not None}) + return step + + pending_tool_call = None + for item in result.new_items: + if isinstance(item, MessageOutputItem): + text = ItemHelpers.text_message_output(item) + if text: + steps.append(_step("agent", text, model_name=model)) + elif isinstance(item, ReasoningItem): + summaries = getattr(item.raw_item, "summary", None) + reasoning = ( + "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None + ) + if reasoning: + steps.append( + _step( + "agent", + "(thinking)", + reasoning_content=reasoning, + model_name=model, + ) + ) + elif isinstance(item, ToolCallItem): + raw = item.raw_item + if hasattr(raw, "name"): + pending_tool_call = raw + elif isinstance(item, ToolCallOutputItem) and pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + output_str = str(item.output) if item.output else "" + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ + { + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, + } + ], + observation={ + "results": [ + { + "source_call_id": pending_tool_call.call_id, + "content": output_str, + } + ] + }, + ) 
+ ) + pending_tool_call = None + + if pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ + { + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, + } + ], + ) + ) + + if not steps: + steps.append(_step("user", "(empty)")) + + usage = Usage() + for response in result.raw_responses: + usage.add(response.usage) + + return { + "schema_version": "ATIF-v1.6", + "session_id": getattr(result, "last_response_id", None) or "unknown", + "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model}, + "steps": steps, + "final_metrics": { + "total_prompt_tokens": usage.input_tokens, + "total_completion_tokens": usage.output_tokens, + "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0, + "total_cost_usd": None, + "total_steps": len(steps), + "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)}, + }, + } + + +class AutoAgent(BaseAgent): + """Harbor agent adapter. 
Runs the OpenAI agent host-side and proxies shell into the container.""" + + SUPPORTS_ATIF = True + + def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs): + super().__init__(*args, **kwargs) + self._extra_env = dict(extra_env) if extra_env else {} + + @staticmethod + def name() -> str: + return "autoagent" + + def version(self) -> str | None: + return "0.1.0" + + async def setup(self, environment: BaseEnvironment) -> None: + pass + + async def run( + self, instruction: str, environment: BaseEnvironment, context: AgentContext + ) -> None: + await environment.exec(command="mkdir -p /task") + instr_file = self.logs_dir / "instruction.md" + instr_file.write_text(instruction) + await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") + + result, duration_ms = await run_task(environment, instruction) + + atif = to_atif(result, model=MODEL, duration_ms=duration_ms) + traj_path = self.logs_dir / "trajectory.json" + traj_path.write_text(json.dumps(atif, indent=2)) + + try: + final_metrics = atif.get("final_metrics", {}) + context.n_input_tokens = final_metrics.get("total_prompt_tokens", 0) + context.n_output_tokens = final_metrics.get("total_completion_tokens", 0) + context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0) + except Exception: + pass + + usage = Usage() + for response in result.raw_responses: + usage.add(response.usage) + print( + f"turns={len(result.raw_responses)} duration_ms={duration_ms} " + f"input={usage.input_tokens} output={usage.output_tokens}" + ) + + +__all__ = ["AutoAgent", "to_atif"] diff --git a/agent.py b/agent.py index d155db4..168e2c4 100644 --- a/agent.py +++ b/agent.py @@ -2,23 +2,11 @@ from __future__ import annotations -import json import time -from datetime import datetime, timezone from agents import Agent, Runner, function_tool -from agents.items import ( - ItemHelpers, - MessageOutputItem, - ReasoningItem, - ToolCallItem, - ToolCallOutputItem, -) from agents.tool 
import FunctionTool -from agents.usage import Usage -from harbor.agents.base import BaseAgent from harbor.environments.base import BaseEnvironment -from harbor.models.agent.context import AgentContext # ============================================================================ @@ -73,170 +61,4 @@ async def run_task( return result, duration_ms -# ============================================================================ -# FIXED ADAPTER BOUNDARY: do not modify unless the human explicitly asks. -# Harbor integration and trajectory serialization live here. -# ============================================================================ - -def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: - """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.""" - steps: list[dict] = [] - step_id = 0 - now = datetime.now(timezone.utc).isoformat() - - def _step(source: str, message: str, **extra: object) -> dict: - nonlocal step_id - step_id += 1 - step = { - "step_id": step_id, - "timestamp": now, - "source": source, - "message": message, - } - step.update({key: value for key, value in extra.items() if value is not None}) - return step - - pending_tool_call = None - for item in result.new_items: - if isinstance(item, MessageOutputItem): - text = ItemHelpers.text_message_output(item) - if text: - steps.append(_step("agent", text, model_name=model)) - elif isinstance(item, ReasoningItem): - summaries = getattr(item.raw_item, "summary", None) - reasoning = "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None - if reasoning: - steps.append( - _step( - "agent", - "(thinking)", - reasoning_content=reasoning, - model_name=model, - ) - ) - elif isinstance(item, ToolCallItem): - raw = item.raw_item - if hasattr(raw, "name"): - pending_tool_call = raw - elif isinstance(item, ToolCallOutputItem) and pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else 
pending_tool_call.arguments - ) - output_str = str(item.output) if item.output else "" - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - observation={ - "results": [ - { - "source_call_id": pending_tool_call.call_id, - "content": output_str, - } - ] - }, - ) - ) - pending_tool_call = None - - if pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else pending_tool_call.arguments - ) - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - ) - ) - - if not steps: - steps.append(_step("user", "(empty)")) - - usage = Usage() - for response in result.raw_responses: - usage.add(response.usage) - - return { - "schema_version": "ATIF-v1.6", - "session_id": getattr(result, "last_response_id", None) or "unknown", - "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model}, - "steps": steps, - "final_metrics": { - "total_prompt_tokens": usage.input_tokens, - "total_completion_tokens": usage.output_tokens, - "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0, - "total_cost_usd": None, - "total_steps": len(steps), - "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)}, - }, - } - - -class AutoAgent(BaseAgent): - """Harbor agent adapter. 
Runs the OpenAI agent host-side and proxies shell into the container.""" - - SUPPORTS_ATIF = True - - def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs): - super().__init__(*args, **kwargs) - self._extra_env = dict(extra_env) if extra_env else {} - - @staticmethod - def name() -> str: - return "autoagent" - - def version(self) -> str | None: - return "0.1.0" - - async def setup(self, environment: BaseEnvironment) -> None: - pass - - async def run(self, instruction: str, environment: BaseEnvironment, context: AgentContext) -> None: - await environment.exec(command="mkdir -p /task") - instr_file = self.logs_dir / "instruction.md" - instr_file.write_text(instruction) - await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") - - result, duration_ms = await run_task(environment, instruction) - - atif = to_atif(result, model=MODEL, duration_ms=duration_ms) - traj_path = self.logs_dir / "trajectory.json" - traj_path.write_text(json.dumps(atif, indent=2)) - - try: - final_metrics = atif.get("final_metrics", {}) - context.n_input_tokens = final_metrics.get("total_prompt_tokens", 0) - context.n_output_tokens = final_metrics.get("total_completion_tokens", 0) - context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0) - except Exception: - pass - - usage = Usage() - for response in result.raw_responses: - usage.add(response.usage) - print( - f"turns={len(result.raw_responses)} duration_ms={duration_ms} " - f"input={usage.input_tokens} output={usage.output_tokens}" - ) - - -__all__ = ["AutoAgent"] +from adapter import AutoAgent, to_atif # noqa: F401 — Harbor entrypoint diff --git a/tests/test_adapter.py b/tests/test_adapter.py new file mode 100644 index 0000000..595ca1a --- /dev/null +++ b/tests/test_adapter.py @@ -0,0 +1,123 @@ +"""Tests for adapter.py — fixed Harbor adapter boundary.""" + +from __future__ import annotations + +import ast +import hashlib +from pathlib import Path + + +ADAPTER_PATH = 
Path(__file__).parent.parent / "adapter.py" +AGENT_PATH = Path(__file__).parent.parent / "agent.py" + + +def _parse_adapter() -> ast.Module: + return ast.parse(ADAPTER_PATH.read_text()) + + +def test_adapter_file_exists() -> None: + """adapter.py must exist on disk.""" + assert ADAPTER_PATH.exists(), "adapter.py not found" + + +def test_adapter_exposes_autoagent() -> None: + """adapter.py must define AutoAgent class at module level. + + Verified via AST to avoid importing heavy runtime dependencies. + """ + tree = _parse_adapter() + top_level_classes = { + node.name + for node in ast.walk(tree) + if isinstance(node, ast.ClassDef) and node.col_offset == 0 + } + assert "AutoAgent" in top_level_classes, "AutoAgent class not found in adapter.py" + + +def test_adapter_exposes_to_atif() -> None: + """adapter.py must define to_atif function at module level. + + Verified via AST to avoid importing heavy runtime dependencies. + """ + tree = _parse_adapter() + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and node.col_offset == 0 + } + assert "to_atif" in top_level_funcs, "to_atif function not found in adapter.py" + + +def test_autoagent_supports_atif() -> None: + """AutoAgent must declare SUPPORTS_ATIF = True as a class-level assignment. + + Verified via AST to avoid importing heavy runtime dependencies. 
+ """ + tree = _parse_adapter() + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == "AutoAgent": + for stmt in node.body: + # SUPPORTS_ATIF = True → ast.Assign or ast.AnnAssign + if isinstance(stmt, ast.Assign): + for target in stmt.targets: + if isinstance(target, ast.Name) and target.id == "SUPPORTS_ATIF": + value = stmt.value + assert isinstance(value, ast.Constant) and value.value is True, ( + "AutoAgent.SUPPORTS_ATIF must be True" + ) + return + raise AssertionError("AutoAgent.SUPPORTS_ATIF = True not found in adapter.py") + + +def test_adapter_source_hash_is_stable() -> None: + """adapter.py source hash must be a valid SHA-256 hex digest (tamper detection baseline). + + This test does NOT assert a specific hash value — it verifies that the file + exists and produces a well-formed digest. Pin the expected value in CI/CD if + immutability enforcement is required. + """ + source = ADAPTER_PATH.read_bytes() + digest = hashlib.sha256(source).hexdigest() + assert len(digest) == 64, "SHA-256 digest must be 64 hex characters" + assert all(c in "0123456789abcdef" for c in digest), "Digest must be lowercase hex" + + +def test_agent_imports_autoagent_for_harbor_compat() -> None: + """agent.py must re-export AutoAgent via 'from adapter import AutoAgent'. + + Harbor uses --agent-import-path agent:AutoAgent, so agent.py must expose it. + Verified via AST to avoid importing heavy runtime dependencies. + """ + source = AGENT_PATH.read_text() + tree = ast.parse(source) + + found = False + for node in ast.walk(tree): + if ( + isinstance(node, ast.ImportFrom) + and node.module == "adapter" + and any(alias.name == "AutoAgent" for alias in node.names) + ): + found = True + break + + assert found, "agent.py must re-export AutoAgent via 'from adapter import AutoAgent'" + + +def test_adapter_does_not_define_run_task() -> None: + """run_task must live in agent.py (editable harness), not adapter.py. + + Verified via AST to keep the boundary clean. 
+ """ + tree = _parse_adapter() + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and node.col_offset == 0 + } + assert "run_task" not in top_level_funcs, ( + "run_task must not be defined in adapter.py — it belongs in the editable harness (agent.py)" + ) From 0d5c29736758e75650e7d3b5ffb9724759c3d309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:45:13 +0900 Subject: [PATCH 03/12] feat: Docker read-only + network isolation for eval containers --- Dockerfile.base | 9 +++------ scripts/run_eval.sh | 12 ++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) create mode 100755 scripts/run_eval.sh diff --git a/Dockerfile.base b/Dockerfile.base index 705ad73..599e69c 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -5,13 +5,10 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* WORKDIR /app - -# Python deps — only what the agent needs (harbor excluded via .dockerignore) +COPY contracts.py adapter.py /app/fixed/ +COPY agent.py /app/editable/ COPY pyproject.toml ./ RUN uv pip install --system . 
- -# Agent code -COPY agent.py ./ - +ENV PYTHONPATH=/app/fixed:/app/editable:/app RUN ln -sf $(which python3) /usr/local/bin/python RUN mkdir -p /logs /app/output diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh new file mode 100755 index 0000000..b322419 --- /dev/null +++ b/scripts/run_eval.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +NETWORK="${EVAL_NETWORK:-none}" +docker run --rm --read-only \ + --network="$NETWORK" \ + --tmpfs /tmp:size=512M \ + --mount type=bind,source="$(pwd)/adapter.py",target=/app/fixed/adapter.py,readonly \ + --mount type=bind,source="$(pwd)/contracts.py",target=/app/fixed/contracts.py,readonly \ + -v "$(pwd)/agent.py:/app/editable/agent.py:rw" \ + -e PYTHONPATH=/app/fixed:/app/editable:/app \ + --security-opt no-new-privileges:true \ + autoagent-base "$@" From f2db2a144992d80657d6beca35a6341c8b65b180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:45:46 +0900 Subject: [PATCH 04/12] feat: preflight policy gate for mutation validation --- preflight.py | 49 ++++++++++++++++++++++++++++++++ tests/test_preflight.py | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 preflight.py create mode 100644 tests/test_preflight.py diff --git a/preflight.py b/preflight.py new file mode 100644 index 0000000..e19c6ac --- /dev/null +++ b/preflight.py @@ -0,0 +1,49 @@ +"""Preflight policy gate — rule-based diff checker for mutation validation.""" + +import re +from dataclasses import dataclass + +FIXED_FILES = {"adapter.py", "contracts.py", "__init__.py"} + +FORBIDDEN_PATTERNS = [ + r'\bimport\s+importlib\b', + r'\bimport\s+ctypes\b', + r'\bsys\.modules\b', +] + + +@dataclass +class PreflightResult: + rejected: bool + reason: str + + +def check_diff(diff_text: str) -> PreflightResult: + """Check a unified diff for policy violations. + + Args: + diff_text: Unified diff string to validate. 
+ + Returns: + PreflightResult with rejected=True and a reason if any rule is violated. + """ + for line in diff_text.splitlines(): + # Check if any fixed file is being modified (appears in diff --git header) + if line.startswith("diff --git"): + for fixed_file in FIXED_FILES: + if f"/{fixed_file}" in line or f" {fixed_file}" in line: + return PreflightResult( + rejected=True, + reason=f"modification of fixed file detected: {fixed_file}", + ) + + # Check forbidden patterns only in added lines + if line.startswith("+") and not line.startswith("+++"): + for pattern in FORBIDDEN_PATTERNS: + if re.search(pattern, line): + return PreflightResult( + rejected=True, + reason=f"forbidden pattern found: {pattern}", + ) + + return PreflightResult(rejected=False, reason="") diff --git a/tests/test_preflight.py b/tests/test_preflight.py new file mode 100644 index 0000000..f48a443 --- /dev/null +++ b/tests/test_preflight.py @@ -0,0 +1,63 @@ +"""Tests for preflight policy gate.""" + +from preflight import check_diff + + +def test_reject_fixed_modification(): + diff = """\ +diff --git a/adapter.py b/adapter.py +--- a/adapter.py ++++ b/adapter.py +@@ -1,3 +1,4 @@ ++# modified + import autoagent +""" + result = check_diff(diff) + assert result.rejected is True + assert "adapter.py" in result.reason + + +def test_reject_forbidden_import(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++import importlib + async def run_task(task): + pass +""" + result = check_diff(diff) + assert result.rejected is True + assert result.reason != "" + + +def test_allow_clean_change(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,5 @@ ++import os ++ + async def run_task(task): +- pass ++ return {"score": 1.0} +""" + result = check_diff(diff) + assert result.rejected is False + + +def test_reject_sys_modules(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ 
++sys.modules["os"] = None + async def run_task(task): + pass +""" + result = check_diff(diff) + assert result.rejected is True + assert result.reason != "" From 5c6ad96a4801300844adf4c0493c9aa8b05f5114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:46:24 +0900 Subject: [PATCH 05/12] feat: structured experiment logging with ATIF sidecar index --- experiment_log.py | 60 +++++++++++++++++++++++++++++++++ tests/test_experiment_log.py | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 experiment_log.py create mode 100644 tests/test_experiment_log.py diff --git a/experiment_log.py b/experiment_log.py new file mode 100644 index 0000000..bc168cd --- /dev/null +++ b/experiment_log.py @@ -0,0 +1,60 @@ +"""Structured experiment logging with ATIF sidecar index.""" + +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Optional + + +@dataclass +class ExperimentEntry: + version: str + parent: Optional[str] + schema_version: int + editable_tree_hash: str + fixed_tree_hash: str + contract_version: str + container_image_digest: str + scores: dict[str, Any] + cost_usd: float + tokens_used: int + duration_sec: float + trace_id: str + atif_version: str + trajectory_uri: str + delta: dict[str, Any] + root_cause: str + meta_reasoning: str + network_profile: str + evaluator_digest: str + timestamp: str + meta: dict[str, Any] = field(default_factory=dict) + + +class ExperimentLogger: + """Append-only JSONL logger for experiment entries.""" + + def __init__(self, path: Path) -> None: + self._path = path + + def append(self, entry: ExperimentEntry) -> None: + """Append an entry to the JSONL log file.""" + with self._path.open("a", encoding="utf-8") as f: + f.write(json.dumps(asdict(entry)) + "\n") + + def read_all(self) -> list[ExperimentEntry]: + """Read all entries from the log file. 
+ + Returns: + List of ExperimentEntry objects, or empty list if file does not exist. + """ + if not self._path.exists(): + return [] + entries = [] + with self._path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + data = json.loads(line) + entries.append(ExperimentEntry(**data)) + return entries diff --git a/tests/test_experiment_log.py b/tests/test_experiment_log.py new file mode 100644 index 0000000..5dfb873 --- /dev/null +++ b/tests/test_experiment_log.py @@ -0,0 +1,65 @@ +"""Tests for structured experiment logging.""" + +import json +import tempfile +from pathlib import Path + +from experiment_log import ExperimentEntry, ExperimentLogger + + +def _make_entry(**kwargs) -> ExperimentEntry: + defaults = dict( + version="v0.1.0", + parent=None, + schema_version=1, + editable_tree_hash="abc123", + fixed_tree_hash="def456", + contract_version="1.0", + container_image_digest="sha256:deadbeef", + scores={"pass@1": 0.85}, + cost_usd=0.05, + tokens_used=1500, + duration_sec=12.3, + trace_id="trace-001", + atif_version="0.3.0", + trajectory_uri="s3://bucket/traj/001.jsonl", + delta={"agent.py": "+10 -2"}, + root_cause="improved retry logic", + meta_reasoning="higher pass@1 expected from retry", + network_profile="none", + evaluator_digest="sha256:cafebabe", + timestamp="2026-04-04T00:00:00Z", + ) + defaults.update(kwargs) + return ExperimentEntry(**defaults) + + +def test_entry_has_atif_connection_keys(): + entry = _make_entry() + assert hasattr(entry, "trace_id") + assert hasattr(entry, "trajectory_uri") + assert entry.trace_id == "trace-001" + assert entry.trajectory_uri == "s3://bucket/traj/001.jsonl" + + +def test_append_and_read_back(): + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + path = Path(f.name) + + try: + logger = ExperimentLogger(path) + entry = _make_entry() + logger.append(entry) + + entries = logger.read_all() + assert len(entries) == 1 + assert entries[0].version == "v0.1.0" + assert 
entries[0].scores == {"pass@1": 0.85} + assert entries[0].trace_id == "trace-001" + finally: + path.unlink(missing_ok=True) + + +def test_read_all_returns_empty_list_for_missing_file(): + logger = ExperimentLogger(Path("/tmp/nonexistent_experiment_log_xyz.jsonl")) + assert logger.read_all() == [] From b95dc8a2f45547137d700db224f70099aa56a805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:48:47 +0900 Subject: [PATCH 06/12] feat: add 5 smoke test tasks for Level 1 evaluation --- scripts/run_smoke.sh | 5 ++++ tasks/smoke/csv-analysis/instruction.md | 1 + tasks/smoke/csv-analysis/task.toml | 4 +++ tasks/smoke/csv-analysis/tests/test.sh | 31 +++++++++++++++++++++ tasks/smoke/fibonacci/instruction.md | 1 + tasks/smoke/fibonacci/task.toml | 4 +++ tasks/smoke/fibonacci/tests/test.sh | 17 ++++++++++++ tasks/smoke/git-log/instruction.md | 1 + tasks/smoke/git-log/task.toml | 4 +++ tasks/smoke/git-log/tests/test.sh | 32 ++++++++++++++++++++++ tasks/smoke/hello-world/instruction.md | 1 + tasks/smoke/hello-world/task.toml | 4 +++ tasks/smoke/hello-world/tests/test.sh | 17 ++++++++++++ tasks/smoke/text-processing/instruction.md | 1 + tasks/smoke/text-processing/task.toml | 4 +++ tasks/smoke/text-processing/tests/test.sh | 17 ++++++++++++ 16 files changed, 144 insertions(+) create mode 100755 scripts/run_smoke.sh create mode 100644 tasks/smoke/csv-analysis/instruction.md create mode 100644 tasks/smoke/csv-analysis/task.toml create mode 100755 tasks/smoke/csv-analysis/tests/test.sh create mode 100644 tasks/smoke/fibonacci/instruction.md create mode 100644 tasks/smoke/fibonacci/task.toml create mode 100755 tasks/smoke/fibonacci/tests/test.sh create mode 100644 tasks/smoke/git-log/instruction.md create mode 100644 tasks/smoke/git-log/task.toml create mode 100755 tasks/smoke/git-log/tests/test.sh create mode 100644 tasks/smoke/hello-world/instruction.md create mode 100644 tasks/smoke/hello-world/task.toml create mode 100755 
tasks/smoke/hello-world/tests/test.sh create mode 100644 tasks/smoke/text-processing/instruction.md create mode 100644 tasks/smoke/text-processing/task.toml create mode 100755 tasks/smoke/text-processing/tests/test.sh diff --git a/scripts/run_smoke.sh b/scripts/run_smoke.sh new file mode 100755 index 0000000..37cbdd2 --- /dev/null +++ b/scripts/run_smoke.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail +echo "=== Smoke Test (Level 1) ===" +uv run harbor run -p tasks/smoke/ --agent-import-path agent:AutoAgent -o jobs/smoke +echo "=== Smoke Complete ===" diff --git a/tasks/smoke/csv-analysis/instruction.md b/tasks/smoke/csv-analysis/instruction.md new file mode 100644 index 0000000..485cbff --- /dev/null +++ b/tasks/smoke/csv-analysis/instruction.md @@ -0,0 +1 @@ +Create a CSV file at /task/output/data.csv with headers "name,score" and 3 rows: Alice,85 Bob,92 Charlie,78. Then write the average score to /task/output/average.txt diff --git a/tasks/smoke/csv-analysis/task.toml b/tasks/smoke/csv-analysis/task.toml new file mode 100644 index 0000000..60ccecd --- /dev/null +++ b/tasks/smoke/csv-analysis/task.toml @@ -0,0 +1,4 @@ +[task] +name = "csv-analysis" +description = "Create a CSV file and compute the average score" +timeout_sec = 60 diff --git a/tasks/smoke/csv-analysis/tests/test.sh b/tasks/smoke/csv-analysis/tests/test.sh new file mode 100755 index 0000000..3b341e2 --- /dev/null +++ b/tasks/smoke/csv-analysis/tests/test.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +CSV_FILE="/task/output/data.csv" +AVG_FILE="/task/output/average.txt" + +if [ ! -f "$CSV_FILE" ]; then + echo "FAIL: $CSV_FILE does not exist" + exit 1 +fi + +if [ ! 
-f "$AVG_FILE" ]; then + echo "FAIL: $AVG_FILE does not exist" + exit 1 +fi + +# Count data rows (excluding header) +DATA_ROWS=$(tail -n +2 "$CSV_FILE" | grep -c '[^[:space:]]' || true) +if [ "$DATA_ROWS" -ne 3 ]; then + echo "FAIL: expected 3 data rows but got $DATA_ROWS" + exit 1 +fi + +# Check average (integer: (85+92+78)/3 = 85) +AVG_CONTENT=$(cat "$AVG_FILE" | tr -d '[:space:]') +if [ "$AVG_CONTENT" != "85" ]; then + echo "FAIL: expected average '85' but got '$AVG_CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/fibonacci/instruction.md b/tasks/smoke/fibonacci/instruction.md new file mode 100644 index 0000000..ce5b28a --- /dev/null +++ b/tasks/smoke/fibonacci/instruction.md @@ -0,0 +1 @@ +Calculate the 10th Fibonacci number and write it to /task/output/fib.txt diff --git a/tasks/smoke/fibonacci/task.toml b/tasks/smoke/fibonacci/task.toml new file mode 100644 index 0000000..59a3c26 --- /dev/null +++ b/tasks/smoke/fibonacci/task.toml @@ -0,0 +1,4 @@ +[task] +name = "fibonacci" +description = "Calculate the 10th Fibonacci number and write it to output" +timeout_sec = 60 diff --git a/tasks/smoke/fibonacci/tests/test.sh b/tasks/smoke/fibonacci/tests/test.sh new file mode 100755 index 0000000..9ba948e --- /dev/null +++ b/tasks/smoke/fibonacci/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/fib.txt" + +if [ ! 
-f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE" | tr -d '[:space:]') +if [ "$CONTENT" != "55" ]; then + echo "FAIL: expected '55' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/git-log/instruction.md b/tasks/smoke/git-log/instruction.md new file mode 100644 index 0000000..6478715 --- /dev/null +++ b/tasks/smoke/git-log/instruction.md @@ -0,0 +1 @@ +Initialize a git repo in /tmp/test-repo, create 3 commits with messages "first", "second", "third", then write the output of "git log --oneline" to /task/output/log.txt diff --git a/tasks/smoke/git-log/task.toml b/tasks/smoke/git-log/task.toml new file mode 100644 index 0000000..cfdf8a9 --- /dev/null +++ b/tasks/smoke/git-log/task.toml @@ -0,0 +1,4 @@ +[task] +name = "git-log" +description = "Initialize a git repo, create 3 commits, and write git log to output" +timeout_sec = 120 diff --git a/tasks/smoke/git-log/tests/test.sh b/tasks/smoke/git-log/tests/test.sh new file mode 100755 index 0000000..c5d9299 --- /dev/null +++ b/tasks/smoke/git-log/tests/test.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/log.txt" + +if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +LINE_COUNT=$(wc -l < "$OUTPUT_FILE") +if [ "$LINE_COUNT" -ne 3 ]; then + echo "FAIL: expected 3 lines but got $LINE_COUNT" + exit 1 +fi + +if ! grep -q "first" "$OUTPUT_FILE"; then + echo "FAIL: 'first' not found in log" + exit 1 +fi + +if ! grep -q "second" "$OUTPUT_FILE"; then + echo "FAIL: 'second' not found in log" + exit 1 +fi + +if ! grep -q "third" "$OUTPUT_FILE"; then + echo "FAIL: 'third' not found in log" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/hello-world/instruction.md b/tasks/smoke/hello-world/instruction.md new file mode 100644 index 0000000..a3f6479 --- /dev/null +++ b/tasks/smoke/hello-world/instruction.md @@ -0,0 +1 @@ +Write the text "Hello, World!" 
to /task/output/hello.txt diff --git a/tasks/smoke/hello-world/task.toml b/tasks/smoke/hello-world/task.toml new file mode 100644 index 0000000..23e6b43 --- /dev/null +++ b/tasks/smoke/hello-world/task.toml @@ -0,0 +1,4 @@ +[task] +name = "hello-world" +description = "Write hello world to output file" +timeout_sec = 60 diff --git a/tasks/smoke/hello-world/tests/test.sh b/tasks/smoke/hello-world/tests/test.sh new file mode 100755 index 0000000..b8b925d --- /dev/null +++ b/tasks/smoke/hello-world/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/hello.txt" + +if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE") +if [ "$CONTENT" != "Hello, World!" ]; then + echo "FAIL: expected 'Hello, World!' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/text-processing/instruction.md b/tasks/smoke/text-processing/instruction.md new file mode 100644 index 0000000..b9fcbb8 --- /dev/null +++ b/tasks/smoke/text-processing/instruction.md @@ -0,0 +1 @@ +Count the number of words in the following text and write the count to /task/output/count.txt: "The quick brown fox jumps over the lazy dog" diff --git a/tasks/smoke/text-processing/task.toml b/tasks/smoke/text-processing/task.toml new file mode 100644 index 0000000..58b7bf2 --- /dev/null +++ b/tasks/smoke/text-processing/task.toml @@ -0,0 +1,4 @@ +[task] +name = "text-processing" +description = "Count words in a given text and write the count to output" +timeout_sec = 60 diff --git a/tasks/smoke/text-processing/tests/test.sh b/tasks/smoke/text-processing/tests/test.sh new file mode 100755 index 0000000..9ae004a --- /dev/null +++ b/tasks/smoke/text-processing/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/count.txt" + +if [ ! 
-f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE" | tr -d '[:space:]') +if [ "$CONTENT" != "9" ]; then + echo "FAIL: expected '9' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" From 0ff372816aeec33c4136d90cef890e711cdbb383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:56:29 +0900 Subject: [PATCH 07/12] fix: address code review findings - adapter.py: lazy import to avoid circular dependency on standalone import - adapter.py: track pending tool calls by call_id (dict) instead of single slot - preflight.py: exact root-relative path matching to avoid false positives - contracts.py: fix run_task return type to tuple[Any, int] - tests: add test for nested __init__.py not being flagged as fixed file --- adapter.py | 76 ++++++++++++++++++++++------------------- contracts.py | 4 +-- preflight.py | 18 ++++++---- tests/test_preflight.py | 13 +++++++ 4 files changed, 67 insertions(+), 44 deletions(-) diff --git a/adapter.py b/adapter.py index 5eaae1d..21778e6 100644 --- a/adapter.py +++ b/adapter.py @@ -17,10 +17,10 @@ from harbor.environments.base import BaseEnvironment from harbor.models.agent.context import AgentContext -# Import editable harness entry-points (run_task, MODEL) from agent.py. -# agent.py imports *this* module at the bottom, so by the time Python resolves -# this import agent.py's module-level symbols are already defined — no cycle. 
-from agent import MODEL, run_task # noqa: E402 +def _load_harness(): + """Lazy import to avoid circular dependency with agent.py.""" + from agent import MODEL, run_task + return MODEL, run_task def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: @@ -41,7 +41,7 @@ def _step(source: str, message: str, **extra: object) -> dict: step.update({key: value for key, value in extra.items() if value is not None}) return step - pending_tool_call = None + pending_tool_calls: dict[str, object] = {} for item in result.new_items: if isinstance(item, MessageOutputItem): text = ItemHelpers.text_message_output(item) @@ -63,39 +63,44 @@ def _step(source: str, message: str, **extra: object) -> dict: ) elif isinstance(item, ToolCallItem): raw = item.raw_item - if hasattr(raw, "name"): - pending_tool_call = raw - elif isinstance(item, ToolCallOutputItem) and pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else pending_tool_call.arguments + if hasattr(raw, "call_id") and hasattr(raw, "name"): + pending_tool_calls[raw.call_id] = raw + elif isinstance(item, ToolCallOutputItem): + output_call_id = ( + getattr(item.raw_item, "call_id", None) + or getattr(item.raw_item, "tool_call_id", None) ) - output_str = str(item.output) if item.output else "" - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - observation={ - "results": [ + pending_tool_call = pending_tool_calls.pop(output_call_id, None) if output_call_id else None + if pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + output_str = str(item.output) if item.output else "" + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ { - 
"source_call_id": pending_tool_call.call_id, - "content": output_str, + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, } - ] - }, + ], + observation={ + "results": [ + { + "source_call_id": pending_tool_call.call_id, + "content": output_str, + } + ] + }, + ) ) - ) - pending_tool_call = None - if pending_tool_call: + for pending_tool_call in pending_tool_calls.values(): arguments = ( json.loads(pending_tool_call.arguments) if isinstance(pending_tool_call.arguments, str) @@ -165,9 +170,10 @@ async def run( instr_file.write_text(instruction) await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") - result, duration_ms = await run_task(environment, instruction) + model, run_task_fn = _load_harness() + result, duration_ms = await run_task_fn(environment, instruction) - atif = to_atif(result, model=MODEL, duration_ms=duration_ms) + atif = to_atif(result, model=model, duration_ms=duration_ms) traj_path = self.logs_dir / "trajectory.json" traj_path.write_text(json.dumps(atif, indent=2)) diff --git a/contracts.py b/contracts.py index 843f94c..42f54bd 100644 --- a/contracts.py +++ b/contracts.py @@ -27,8 +27,8 @@ def create_agent(self, environment: Any) -> Any: """Build and return a configured agent instance.""" ... - async def run_task(self, environment: Any, instruction: str) -> dict: - """Execute a task and return a result mapping.""" + async def run_task(self, environment: Any, instruction: str) -> tuple[Any, int]: + """Execute a task and return (result, duration_ms).""" ... diff --git a/preflight.py b/preflight.py index e19c6ac..b78b3c4 100644 --- a/preflight.py +++ b/preflight.py @@ -28,14 +28,18 @@ def check_diff(diff_text: str) -> PreflightResult: PreflightResult with rejected=True and a reason if any rule is violated. 
""" for line in diff_text.splitlines(): - # Check if any fixed file is being modified (appears in diff --git header) + # Check if any fixed file is being modified (exact root-relative path) if line.startswith("diff --git"): - for fixed_file in FIXED_FILES: - if f"/{fixed_file}" in line or f" {fixed_file}" in line: - return PreflightResult( - rejected=True, - reason=f"modification of fixed file detected: {fixed_file}", - ) + parts = line.split() + if len(parts) >= 4: + left = parts[2].removeprefix("a/") + right = parts[3].removeprefix("b/") + for path in (left, right): + if path in FIXED_FILES: + return PreflightResult( + rejected=True, + reason=f"modification of fixed file detected: {path}", + ) # Check forbidden patterns only in added lines if line.startswith("+") and not line.startswith("+++"): diff --git a/tests/test_preflight.py b/tests/test_preflight.py index f48a443..3f4a864 100644 --- a/tests/test_preflight.py +++ b/tests/test_preflight.py @@ -48,6 +48,19 @@ async def run_task(task): assert result.rejected is False +def test_allow_nested_init_py(): + """tests/__init__.py should NOT be rejected (only root __init__.py is fixed).""" + diff = """\ +diff --git a/tests/__init__.py b/tests/__init__.py +--- a/tests/__init__.py ++++ b/tests/__init__.py +@@ -0,0 +1 @@ ++# test package +""" + result = check_diff(diff) + assert result.rejected is False + + def test_reject_sys_modules(): diff = """\ diff --git a/agent.py b/agent.py From 83370c2395ba7ab10d7a4a20f8ca112a2cd7e032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:01:13 +0900 Subject: [PATCH 08/12] fix: address round 2 code review findings - preflight: broaden forbidden patterns (from/import variants, __import__) - run_eval.sh: add writable tmpfs for /task, /logs, /app/output - tests: add cases for from importlib and __import__ bypass --- preflight.py | 5 +++-- scripts/run_eval.sh | 3 +++ tests/test_preflight.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 
insertions(+), 2 deletions(-) diff --git a/preflight.py b/preflight.py index b78b3c4..ad81db5 100644 --- a/preflight.py +++ b/preflight.py @@ -6,9 +6,10 @@ FIXED_FILES = {"adapter.py", "contracts.py", "__init__.py"} FORBIDDEN_PATTERNS = [ - r'\bimport\s+importlib\b', - r'\bimport\s+ctypes\b', + r'\b(?:from|import)\s+importlib\b', + r'\b(?:from|import)\s+ctypes\b', r'\bsys\.modules\b', + r'\b__import__\s*\(', ] diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh index b322419..5ffa3a8 100755 --- a/scripts/run_eval.sh +++ b/scripts/run_eval.sh @@ -4,6 +4,9 @@ NETWORK="${EVAL_NETWORK:-none}" docker run --rm --read-only \ --network="$NETWORK" \ --tmpfs /tmp:size=512M \ + --tmpfs /task:size=512M \ + --tmpfs /logs:size=128M \ + --tmpfs /app/output:size=128M \ --mount type=bind,source="$(pwd)/adapter.py",target=/app/fixed/adapter.py,readonly \ --mount type=bind,source="$(pwd)/contracts.py",target=/app/fixed/contracts.py,readonly \ -v "$(pwd)/agent.py:/app/editable/agent.py:rw" \ diff --git a/tests/test_preflight.py b/tests/test_preflight.py index 3f4a864..d356f21 100644 --- a/tests/test_preflight.py +++ b/tests/test_preflight.py @@ -61,6 +61,30 @@ def test_allow_nested_init_py(): assert result.rejected is False +def test_reject_from_importlib(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++from importlib import import_module +""" + result = check_diff(diff) + assert result.rejected is True + + +def test_reject_dunder_import(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++mod = __import__("os") +""" + result = check_diff(diff) + assert result.rejected is True + + def test_reject_sys_modules(): diff = """\ diff --git a/agent.py b/agent.py From 5e750b42066f0062650cc0adaa27e641d0a90831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:12:27 +0900 Subject: [PATCH 09/12] feat: evolutionary archive with exploit/explore pools 
--- archive_manager.py | 201 ++++++++++++++++++++++++++++++++++ tests/test_archive_manager.py | 160 +++++++++++++++++++++++++++ 2 files changed, 361 insertions(+) create mode 100644 archive_manager.py create mode 100644 tests/test_archive_manager.py diff --git a/archive_manager.py b/archive_manager.py new file mode 100644 index 0000000..bdc03c1 --- /dev/null +++ b/archive_manager.py @@ -0,0 +1,201 @@ +"""Evolutionary archive with exploit/explore pools.""" +import hashlib +import json +import shutil +import tarfile +from pathlib import Path + + +class ArchiveManager: + """Manages versioned snapshots of an editable directory in exploit/explore pools. + + Args: + archive_dir: Directory where tarballs and index.jsonl are stored. + editable_dir: Directory being snapshotted/restored. + cap: Total max versions across both pools. + exploit_cap: Max versions in exploit pool. + explore_cap: Max versions in explore pool. + explore_protected_min: Minimum explore entries that survive cap enforcement. + """ + + def __init__( + self, + archive_dir: Path, + editable_dir: Path, + cap: int = 50, + exploit_cap: int = 30, + explore_cap: int = 20, + explore_protected_min: int = 10, + ) -> None: + self.archive_dir = Path(archive_dir) + self.editable_dir = Path(editable_dir) + self.cap = cap + self.exploit_cap = exploit_cap + self.explore_cap = explore_cap + self.explore_protected_min = explore_protected_min + self.archive_dir.mkdir(parents=True, exist_ok=True) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def snapshot( + self, + version: str, + scores: dict, + pool: str = "exploit", + generation: int = 0, + stale_generations: int = 0, + ) -> str: + """Tarball editable_dir, record metadata, return tree_hash. + + Args: + version: Unique version identifier. + scores: Dict of suite→score floats. + pool: "exploit" or "explore". + generation: Current generation counter. 
+ stale_generations: How many generations without improvement. + + Returns: + SHA-256 hex digest of directory contents. + """ + tree_hash = self._hash_dir(self.editable_dir) + tarball_path = self.archive_dir / f"{version}.tar.gz" + with tarfile.open(tarball_path, "w:gz") as tar: + tar.add(self.editable_dir, arcname=".") + + entry = { + "version": version, + "pool": pool, + "scores": scores, + "tree_hash": tree_hash, + "generation": generation, + "stale_generations": stale_generations, + } + index_path = self.archive_dir / "index.jsonl" + with index_path.open("a") as f: + f.write(json.dumps(entry) + "\n") + + self._enforce_cap() + return tree_hash + + def restore(self, version: str) -> None: + """Extract the tarball for version back into editable_dir. + + Args: + version: Version identifier to restore. + """ + tarball_path = self.archive_dir / f"{version}.tar.gz" + if not tarball_path.exists(): + raise FileNotFoundError(f"No tarball for version {version!r}") + + # Clear editable_dir then extract + shutil.rmtree(self.editable_dir) + self.editable_dir.mkdir(parents=True) + + with tarfile.open(tarball_path, "r:gz") as tar: + tar.extractall(self.editable_dir, filter="data") + + def list_versions(self, pool: str | None = None) -> list[str]: + """Return list of archived version identifiers. + + Args: + pool: If given, filter to "exploit" or "explore". Otherwise return all. + + Returns: + List of version strings in insertion order. + """ + entries = self._read_index() + if pool is not None: + entries = [e for e in entries if e["pool"] == pool] + return [e["version"] for e in entries] + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _enforce_cap(self) -> None: + """Remove oldest entries when per-pool or total cap is exceeded. + + For the explore pool, never remove entries below explore_protected_min. 
+ """ + entries = self._read_index() + + # Enforce per-pool caps first + exploit_entries = [e for e in entries if e["pool"] == "exploit"] + explore_entries = [e for e in entries if e["pool"] == "explore"] + + # Trim exploit (oldest first, no protected minimum) + while len(exploit_entries) > self.exploit_cap: + removed = exploit_entries.pop(0) + self._remove_version(removed["version"]) + + # Trim explore (respect protected minimum) + while len(explore_entries) > self.explore_cap and len(explore_entries) > self.explore_protected_min: + removed = explore_entries.pop(0) + self._remove_version(removed["version"]) + + # Enforce global cap on combined list (oldest first, explore protected) + combined = exploit_entries + explore_entries + combined.sort(key=lambda e: entries.index(e) if e in entries else 0) + + while len(combined) > self.cap: + # Find oldest non-protected entry to remove + explore_count = sum(1 for e in combined if e["pool"] == "explore") + removed = None + for e in combined: + if e["pool"] == "exploit": + removed = e + break + if e["pool"] == "explore" and explore_count > self.explore_protected_min: + removed = e + break + if removed is None: + break + combined.remove(removed) + self._remove_version(removed["version"]) + + # Rewrite index with surviving entries + surviving = {e["version"] for e in combined} + all_entries = self._read_index() + kept = [e for e in all_entries if e["version"] in surviving] + self._write_index(kept) + + def _remove_version(self, version: str) -> None: + """Delete tarball for a version (index rewrite is done by caller).""" + tarball = self.archive_dir / f"{version}.tar.gz" + if tarball.exists(): + tarball.unlink() + + def _hash_dir(self, path: Path) -> str: + """Compute deterministic SHA-256 hash over all files in a directory. + + Args: + path: Directory to hash. + + Returns: + 64-character hex digest. 
+ """ + hasher = hashlib.sha256() + # Sort for determinism + for file_path in sorted(Path(path).rglob("*")): + if file_path.is_file(): + rel = file_path.relative_to(path) + hasher.update(str(rel).encode()) + hasher.update(file_path.read_bytes()) + return hasher.hexdigest() + + def _read_index(self) -> list[dict]: + """Read and parse index.jsonl into a list of entry dicts.""" + index_path = self.archive_dir / "index.jsonl" + if not index_path.exists(): + return [] + lines = index_path.read_text().splitlines() + return [json.loads(line) for line in lines if line.strip()] + + def _write_index(self, entries: list[dict]) -> None: + """Overwrite index.jsonl with given entries.""" + index_path = self.archive_dir / "index.jsonl" + with index_path.open("w") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") diff --git a/tests/test_archive_manager.py b/tests/test_archive_manager.py new file mode 100644 index 0000000..68b82ce --- /dev/null +++ b/tests/test_archive_manager.py @@ -0,0 +1,160 @@ +"""Tests for ArchiveManager: evolutionary archive with exploit/explore pools.""" +import json +import tempfile +from pathlib import Path + +import pytest + +from archive_manager import ArchiveManager + + +@pytest.fixture +def tmp_dirs(): + """Provide temporary archive_dir and editable_dir.""" + with tempfile.TemporaryDirectory() as base: + archive_dir = Path(base) / "archive" + editable_dir = Path(base) / "editable" + archive_dir.mkdir() + editable_dir.mkdir() + yield archive_dir, editable_dir + + +def _seed_editable(editable_dir: Path, content: dict[str, str]) -> None: + """Write {filename: text} into editable_dir.""" + for name, text in content.items(): + (editable_dir / name).write_text(text) + + +def _read_index(archive_dir: Path) -> list[dict]: + index_path = archive_dir / "index.jsonl" + if not index_path.exists(): + return [] + return [json.loads(line) for line in index_path.read_text().splitlines() if line.strip()] + + +# 
--------------------------------------------------------------------------- +# test_snapshot_and_restore +# --------------------------------------------------------------------------- + +def test_snapshot_and_restore(tmp_dirs): + """snapshot v1 → modify editable → restore v1 → original content back.""" + archive_dir, editable_dir = tmp_dirs + _seed_editable(editable_dir, {"a.txt": "hello", "b.txt": "world"}) + + mgr = ArchiveManager(archive_dir, editable_dir) + tree_hash = mgr.snapshot("v1", scores={"smoke": 1.0}) + + # Mutate editable_dir + (editable_dir / "a.txt").write_text("mutated") + (editable_dir / "c.txt").write_text("new file") + + mgr.restore("v1") + + assert (editable_dir / "a.txt").read_text() == "hello" + assert (editable_dir / "b.txt").read_text() == "world" + assert not (editable_dir / "c.txt").exists() + + # Index should have one entry + entries = _read_index(archive_dir) + assert len(entries) == 1 + assert entries[0]["version"] == "v1" + assert entries[0]["tree_hash"] == tree_hash + + +# --------------------------------------------------------------------------- +# test_archive_cap +# --------------------------------------------------------------------------- + +def test_archive_cap(tmp_dirs): + """Create 5 snapshots with cap=3 → only 3 remain.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir, cap=3, exploit_cap=3, explore_cap=3) + + for i in range(1, 6): + _seed_editable(editable_dir, {"f.txt": f"v{i}"}) + mgr.snapshot(f"v{i}", scores={"smoke": 1.0}, pool="exploit") + + versions = mgr.list_versions() + assert len(versions) == 3 + + +# --------------------------------------------------------------------------- +# test_exploit_explore_pools +# --------------------------------------------------------------------------- + +def test_exploit_explore_pools(tmp_dirs): + """Snapshot to different pools → list_versions filters correctly.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, 
editable_dir, cap=50, exploit_cap=30, explore_cap=20) + + _seed_editable(editable_dir, {"x.txt": "exploit1"}) + mgr.snapshot("e1", scores={"smoke": 1.0}, pool="exploit") + + _seed_editable(editable_dir, {"x.txt": "exploit2"}) + mgr.snapshot("e2", scores={"smoke": 0.9}, pool="exploit") + + _seed_editable(editable_dir, {"x.txt": "explore1"}) + mgr.snapshot("x1", scores={"smoke": 0.5}, pool="explore") + + all_versions = mgr.list_versions() + exploit_versions = mgr.list_versions(pool="exploit") + explore_versions = mgr.list_versions(pool="explore") + + assert set(all_versions) == {"e1", "e2", "x1"} + assert set(exploit_versions) == {"e1", "e2"} + assert set(explore_versions) == {"x1"} + + +# --------------------------------------------------------------------------- +# test_explore_protected_min +# --------------------------------------------------------------------------- + +def test_explore_protected_min(tmp_dirs): + """With explore_protected_min=2 and explore_cap=2, adding 3 explore entries + must keep at least 2 explore entries after cap enforcement.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager( + archive_dir, + editable_dir, + cap=50, + exploit_cap=30, + explore_cap=2, + explore_protected_min=2, + ) + + for i in range(1, 4): + _seed_editable(editable_dir, {"g.txt": f"explore{i}"}) + mgr.snapshot(f"xp{i}", scores={"smoke": 0.5}, pool="explore") + + explore_versions = mgr.list_versions(pool="explore") + assert len(explore_versions) >= 2 + + +# --------------------------------------------------------------------------- +# test_hash_dir_deterministic +# --------------------------------------------------------------------------- + +def test_hash_dir_deterministic(tmp_dirs): + """Same directory content → same hash regardless of call order.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir) + + _seed_editable(editable_dir, {"a.txt": "foo", "b.txt": "bar"}) + h1 = mgr._hash_dir(editable_dir) + h2 = 
mgr._hash_dir(editable_dir) + assert h1 == h2 + assert len(h1) == 64 # sha256 hex digest + + +def test_hash_dir_changes_on_content_change(tmp_dirs): + """Different content → different hash.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir) + + _seed_editable(editable_dir, {"a.txt": "foo"}) + h1 = mgr._hash_dir(editable_dir) + + (editable_dir / "a.txt").write_text("bar") + h2 = mgr._hash_dir(editable_dir) + + assert h1 != h2 From 83956b4f7079e69630d1b484c8ce08ab27d762e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:12:33 +0900 Subject: [PATCH 10/12] feat: per-suite promotion gates and migration rules --- promotion.py | 191 ++++++++++++++++++++++++++++++++++++++++ tests/test_promotion.py | 143 ++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 promotion.py create mode 100644 tests/test_promotion.py diff --git a/promotion.py b/promotion.py new file mode 100644 index 0000000..235dc35 --- /dev/null +++ b/promotion.py @@ -0,0 +1,191 @@ +"""Per-suite promotion gates and exploit/explore migration rules.""" +from dataclasses import dataclass, field + + +# --------------------------------------------------------------------------- +# Gate definitions +# --------------------------------------------------------------------------- + +@dataclass +class SuiteGate: + """Promotion threshold for a single evaluation suite. + + Args: + min_absolute: Minimum absolute score required to pass. + max_regression_pct: Maximum allowed regression from best score (0.0 = 0%). + """ + + min_absolute: float + max_regression_pct: float + + +@dataclass +class PromotionGates: + """Collection of per-suite gates used to decide promotion. + + Args: + gates: Mapping of suite name → SuiteGate. 
+ """ + + gates: dict[str, SuiteGate] + + @classmethod + def defaults(cls) -> "PromotionGates": + """Return default gates: smoke(1.0, 0%), spreadsheet(0.80, 5%), terminal(0.40, 5%).""" + return cls( + gates={ + "smoke": SuiteGate(min_absolute=1.0, max_regression_pct=0.0), + "spreadsheet": SuiteGate(min_absolute=0.80, max_regression_pct=0.05), + "terminal": SuiteGate(min_absolute=0.40, max_regression_pct=0.05), + } + ) + + +# --------------------------------------------------------------------------- +# Promotion result +# --------------------------------------------------------------------------- + +@dataclass +class PromotionResult: + """Result of a promotion gate check. + + Args: + promoted: True if all gates passed. + reason: Human-readable explanation (empty string when promoted=True). + """ + + promoted: bool + reason: str = "" + + +def check_promotion( + scores: dict[str, float], + gates: PromotionGates, + best_scores: dict[str, float], +) -> PromotionResult: + """Check all promotion gates and return result. + + A candidate passes only if every defined suite gate is satisfied: + - score >= gate.min_absolute + - score >= best_score * (1 - gate.max_regression_pct) + + Args: + scores: Current candidate scores per suite. + gates: PromotionGates defining thresholds. + best_scores: Historical best scores per suite for regression check. + + Returns: + PromotionResult with promoted=True if all gates pass. 
+ """ + for suite, gate in gates.gates.items(): + current = scores.get(suite, 0.0) + best = best_scores.get(suite, 0.0) + + if current < gate.min_absolute: + return PromotionResult( + promoted=False, + reason=f"{suite} score {current:.3f} below minimum {gate.min_absolute:.3f}", + ) + + regression_floor = best * (1.0 - gate.max_regression_pct) + if current < regression_floor: + return PromotionResult( + promoted=False, + reason=( + f"{suite} score {current:.3f} regresses more than " + f"{gate.max_regression_pct*100:.0f}% from best {best:.3f}" + ), + ) + + return PromotionResult(promoted=True) + + +# --------------------------------------------------------------------------- +# Migration config & plan +# --------------------------------------------------------------------------- + +@dataclass +class MigrationConfig: + """Configuration for periodic pool migration. + + Args: + interval_generations: How many generations between migration runs. + explore_to_exploit_top_k: Top-k explore entries (by score) promoted each interval. + exploit_to_explore_bottom_k: Bottom-k exploit entries demoted each interval. + cross_domain_fast_track_threshold: cross_domain_delta threshold for immediate promotion. + """ + + interval_generations: int = 3 + explore_to_exploit_top_k: int = 3 + exploit_to_explore_bottom_k: int = 5 + cross_domain_fast_track_threshold: float = 0.10 + + +@dataclass +class MigrationPlan: + """Versions to move between pools. + + Args: + promote_to_exploit: Explore versions to move into exploit pool. + demote_to_explore: Exploit versions to move into explore pool. 
+ """ + + promote_to_exploit: list[str] = field(default_factory=list) + demote_to_explore: list[str] = field(default_factory=list) + + +def _mean_score(scores: dict[str, float]) -> float: + """Compute mean over all suite scores.""" + if not scores: + return 0.0 + return sum(scores.values()) / len(scores) + + +def compute_migration( + archive_index: list[dict], + generation: int, + config: MigrationConfig, +) -> MigrationPlan: + """Determine which versions should move between exploit and explore pools. + + Rules: + - Cross-domain fast track: any explore entry with cross_domain_delta above + threshold is promoted immediately regardless of generation. + - Interval migration: at multiples of interval_generations, top-k explore + entries (by mean score) are promoted and bottom-k exploit entries are demoted. + + Args: + archive_index: List of entry dicts from index.jsonl. + generation: Current generation number. + config: MigrationConfig thresholds. + + Returns: + MigrationPlan listing versions to promote and demote. 
+ """ + plan = MigrationPlan() + + explore_entries = [e for e in archive_index if e.get("pool") == "explore"] + exploit_entries = [e for e in archive_index if e.get("pool") == "exploit"] + + # Cross-domain fast track (always active, independent of interval) + fast_tracked = set() + for entry in explore_entries: + delta = entry.get("cross_domain_delta", 0.0) + if delta > config.cross_domain_fast_track_threshold: + plan.promote_to_exploit.append(entry["version"]) + fast_tracked.add(entry["version"]) + + # Interval-based migration + if generation > 0 and generation % config.interval_generations == 0: + # Promote top-k explore by mean score (skip already fast-tracked) + eligible_explore = [e for e in explore_entries if e["version"] not in fast_tracked] + eligible_explore.sort(key=lambda e: _mean_score(e.get("scores", {})), reverse=True) + for entry in eligible_explore[: config.explore_to_exploit_top_k]: + plan.promote_to_exploit.append(entry["version"]) + + # Demote bottom-k exploit by mean score + exploit_entries.sort(key=lambda e: _mean_score(e.get("scores", {}))) + for entry in exploit_entries[: config.exploit_to_explore_bottom_k]: + plan.demote_to_explore.append(entry["version"]) + + return plan diff --git a/tests/test_promotion.py b/tests/test_promotion.py new file mode 100644 index 0000000..52ed4da --- /dev/null +++ b/tests/test_promotion.py @@ -0,0 +1,143 @@ +"""Tests for promotion gates and explore→exploit migration rules.""" +from promotion import ( + MigrationConfig, + MigrationPlan, + PromotionGates, + PromotionResult, + check_promotion, + compute_migration, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_index(entries: list[dict]) -> list[dict]: + """Build a minimal archive index from shorthand dicts.""" + defaults = { + "pool": "explore", + "scores": {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40}, + 
"tree_hash": "abc", + "generation": 0, + "stale_generations": 0, + "cross_domain_delta": 0.0, + } + return [{**defaults, **e} for e in entries] + + +# --------------------------------------------------------------------------- +# test_smoke_gate_pass +# --------------------------------------------------------------------------- + +def test_smoke_gate_pass(): + """All gates met → promoted=True.""" + scores = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + best = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert isinstance(result, PromotionResult) + assert result.promoted is True + + +# --------------------------------------------------------------------------- +# test_smoke_gate_fail +# --------------------------------------------------------------------------- + +def test_smoke_gate_fail(): + """smoke < 1.0 → promoted=False.""" + scores = {"smoke": 0.9, "spreadsheet": 0.85, "terminal": 0.45} + best = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert result.promoted is False + assert "smoke" in result.reason.lower() + + +# --------------------------------------------------------------------------- +# test_regression_gate_fail +# --------------------------------------------------------------------------- + +def test_regression_gate_fail(): + """spreadsheet regresses > 5% from best → promoted=False.""" + best = {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40} + # 0.80 * (1 - 0.05) = 0.76; go below that + scores = {"smoke": 1.0, "spreadsheet": 0.70, "terminal": 0.40} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert result.promoted is False + assert "spreadsheet" in result.reason.lower() + + +# --------------------------------------------------------------------------- +# 
test_migration_at_interval +# --------------------------------------------------------------------------- + +def test_migration_at_interval(): + """At generation 3 (== interval), top-k explore entries get promoted to exploit.""" + config = MigrationConfig( + interval_generations=3, + explore_to_exploit_top_k=2, + exploit_to_explore_bottom_k=5, + cross_domain_fast_track_threshold=0.10, + ) + index = _make_index([ + {"version": "xp1", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.90, "terminal": 0.50}}, + {"version": "xp2", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.70, "terminal": 0.30}}, + {"version": "xp3", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40}}, + {"version": "ex1", "pool": "exploit", "scores": {"smoke": 1.0, "spreadsheet": 0.95, "terminal": 0.60}}, + ]) + + plan = compute_migration(index, generation=3, config=config) + + assert isinstance(plan, MigrationPlan) + # Top-2 explore by score should be promoted + assert len(plan.promote_to_exploit) == 2 + assert "xp1" in plan.promote_to_exploit # highest spreadsheet + + +# --------------------------------------------------------------------------- +# test_migration_not_at_interval +# --------------------------------------------------------------------------- + +def test_migration_not_at_interval(): + """At generation 2 (not an interval), no migration happens.""" + config = MigrationConfig(interval_generations=3) + index = _make_index([ + {"version": "xp1", "pool": "explore"}, + {"version": "ex1", "pool": "exploit"}, + ]) + + plan = compute_migration(index, generation=2, config=config) + + assert plan.promote_to_exploit == [] + assert plan.demote_to_explore == [] + + +# --------------------------------------------------------------------------- +# test_cross_domain_fast_track +# --------------------------------------------------------------------------- + +def test_cross_domain_fast_track(): + """Entry with cross_domain_delta > 10% gets 
immediate shadow priority (fast-tracked).""" + config = MigrationConfig( + interval_generations=3, + cross_domain_fast_track_threshold=0.10, + ) + # Generation 1 — not an interval, but cross_domain_delta > threshold + index = _make_index([ + {"version": "xp_novel", "pool": "explore", "cross_domain_delta": 0.15}, + {"version": "xp_norm", "pool": "explore", "cross_domain_delta": 0.02}, + {"version": "ex1", "pool": "exploit"}, + ]) + + plan = compute_migration(index, generation=1, config=config) + + assert "xp_novel" in plan.promote_to_exploit + assert "xp_norm" not in plan.promote_to_exploit From 9df339fe4ca0eb61865eda43d6b60ff8015b36c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:13:50 +0900 Subject: [PATCH 11/12] feat: split program.md into fixed rules and editable strategy --- program-fixed.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ program-strategy.md | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 program-fixed.md create mode 100644 program-strategy.md diff --git a/program-fixed.md b/program-fixed.md new file mode 100644 index 0000000..54ae92e --- /dev/null +++ b/program-fixed.md @@ -0,0 +1,45 @@ +# Program Fixed Rules (DO NOT MODIFY) + +These rules are immutable. The meta-agent must not alter this file. + +## Safety Rules + +1. Never modify files in the `fixed/` directory (`adapter.py`, `contracts.py`). +2. Never use `importlib`, `ctypes`, `sys.modules`, or `__import__()` directly. +3. Always log results via `ExperimentLogger` to `experiments.jsonl`. +4. Never skip the preflight policy gate before evaluation. +5. Respect per-suite promotion gates — do not bypass thresholds. +6. Do not modify evaluator, promotion, or scoreboard logic. + +## Experiment Protocol + +1. Read the latest experiments.jsonl and recent task-level results. +2. Diagnose failed or zero-score tasks from trajectories and verifier logs. +3. Group failures by root cause. +4. 
Choose one general harness improvement. +5. Run preflight check on the diff. +6. Execute smoke test (Level 1) — all 5 must pass. +7. If smoke passes, execute domain suite (Level 2). +8. Log results to experiments.jsonl with full metadata. +9. If improved: snapshot to archive, commit. +10. If regressed: restore previous best, log root cause. + +## Model Constraint + +Do not change `MODEL` from `gpt-5` without explicit human approval. + +## Overfitting Rule + +Do not add task-specific hacks. Use this test: +"If this exact task disappeared, would this still be a worthwhile improvement?" + +## Keep / Discard Rules + +- If `passed` improved → keep. +- If `passed` stayed same and harness is simpler → keep. +- Otherwise → discard. Record root cause in experiments.jsonl. + +## Termination + +Continue iterating until the human explicitly stops. +Never pause to ask whether to continue. diff --git a/program-strategy.md b/program-strategy.md new file mode 100644 index 0000000..4bbbc47 --- /dev/null +++ b/program-strategy.md @@ -0,0 +1,33 @@ +# Program Strategy + +> **Stage 1: READ-ONLY.** This file becomes editable after Stage 2 gate passes +> (per-suite non-regression 10 consecutive runs + human approval). + +## Current Strategy + +- Focus on tool addition over prompt tuning (high-leverage). +- Prefer specialized tools (e.g., openpyxl for Excel) over raw shell. +- Test one change at a time for clear attribution. +- Prioritize tasks with highest failure rate. + +## Tool Design Guidelines + +- Each tool should do one thing well. +- Include input validation and clear error messages. +- Return structured output, not raw stdout. +- Match model's name-based priors (models pattern-match tool names). + +## Agent Architecture Strategy + +- Start with single agent + specialized tools. +- Consider `agent.as_tool()` for verification sub-agent when: + - Many tasks fail silently (agent thinks it succeeded but output is wrong). 
+  - Verification logic is complex enough to benefit from a separate agent.
+
+## Simplicity Criterion
+
+All else being equal, simpler is better:
+- Fewer components
+- Less brittle logic
+- Cleaner tool interfaces
+- Less code for the same outcome

From 08c68d6f4eb9ad145ae57a91a2746cd8d80a489f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?=
Date: Sat, 4 Apr 2026 11:14:18 +0900
Subject: [PATCH 12/12] docs: add CONTRIBUTING.md with adapter, task, and
 evaluation guidelines

---
 CONTRIBUTING.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..5456150
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,78 @@
+# Contributing to AutoAgent
+
+## Adding a New Backend Adapter
+
+1. Copy `agent.py` as `agent-<backend>.py`.
+2. Implement the `AgentWorkflow` protocol from `contracts.py`:
+   - `create_tools(environment) -> list`
+   - `create_agent(environment) -> agent`
+   - `async run_task(environment, instruction) -> tuple[result, duration_ms]` (NOTE(review): `contracts.py` declares `run_task -> dict` — confirm which signature is current)
+3. Import `AutoAgent` from `adapter.py` — do not modify it.
+4. Run smoke tests: `bash scripts/run_smoke.sh`
+5. Log results using `ExperimentLogger` from `experiment_log.py`.
+
+See `agent-claude.py` for a Claude SDK reference implementation.
+
+## Adding New Tasks
+
+1. Create `tasks/<domain>/<task-name>/` with:
+   - `task.toml` — task metadata (name, description, timeout_sec)
+   - `instruction.md` — what the agent should do
+   - `tests/test.sh` — verification script (exit 0 = pass, exit 1 = fail)
+2. Follow existing patterns in `tasks/smoke/`.
+3. Test scripts should use `set -euo pipefail`.
+
+## Evaluation Levels
+
+| Level | Location | Purpose |
+|-------|----------|---------|
+| Smoke | `tasks/smoke/` | Basic sanity (< 1 min) |
+| Domain | `tasks/<domain>/` | Domain-specific suite |
+| Cross-domain | External benchmarks | Generalization test |
+
+## Reporting Benchmark Results
+
+Use `experiments.jsonl` format via `ExperimentLogger`:
+
+```python
+from experiment_log import ExperimentLogger, ExperimentEntry
+
+logger = ExperimentLogger("experiments.jsonl")
+logger.append(ExperimentEntry(
+    version="v1",
+    scores={"smoke": 1.0, "spreadsheet": 0.85},
+    trace_id="trace-001",
+    trajectory_uri="jobs/v1/trajectory.json",
+    # ... other fields
+))
+```
+
+## Project Structure
+
+```
+autoagent/
+├── agent.py              # Editable harness (meta-agent modifies this)
+├── adapter.py            # Fixed Harbor adapter (read-only)
+├── contracts.py          # Interface protocols (read-only)
+├── preflight.py          # Mutation validation gate
+├── experiment_log.py     # ATIF sidecar experiment logger
+├── archive_manager.py    # Evolutionary archive (exploit/explore)
+├── promotion.py          # Promotion gates and migration rules
+├── program.md            # Original meta-agent directive
+├── program-fixed.md      # Immutable safety rules
+├── program-strategy.md   # Editable strategy (Stage 2)
+├── Dockerfile.base       # Container base image
+├── scripts/
+│   ├── run_eval.sh       # Docker eval runner (read-only + network isolation)
+│   └── run_smoke.sh      # Smoke test runner
+├── tasks/smoke/          # Level 1 smoke tests (5 tasks)
+└── tests/                # Unit tests
+```
+
+## Safety Boundary
+
+Files in the **fixed boundary** must not be modified by the meta-agent:
+- `adapter.py`, `contracts.py` — enforced via Docker read-only mount
+- Evaluator logic, promotion gates — enforced via preflight policy gate
+
+The `preflight.py` gate automatically rejects diffs that touch fixed files or use forbidden imports.