From a64b5b70e0ef49b4360b1530ea62ea4039517ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:39:22 +0900 Subject: [PATCH 01/12] feat: add interface contracts for harness/adapter boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AgentWorkflow / EvaluatorContract Protocol 정의로 editable harness와 fixed adapter 사이의 경계를 추상화함. AST 기반 테스트로 heavy runtime dependency 없이 agent.py 호환성 검증. --- contracts.py | 48 +++++++++++++++++++++++ tests/__init__.py | 0 tests/test_contracts.py | 86 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 contracts.py create mode 100644 tests/__init__.py create mode 100644 tests/test_contracts.py diff --git a/contracts.py b/contracts.py new file mode 100644 index 0000000..843f94c --- /dev/null +++ b/contracts.py @@ -0,0 +1,48 @@ +"""Interface contracts for the harness/adapter boundary. + +These Protocol classes define the shape that the editable harness (agent.py) +must satisfy. The fixed adapter (AutoAgent) depends on these abstractions, not +on concrete implementations. +""" + +from __future__ import annotations + +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class AgentWorkflow(Protocol): + """Contract for the editable harness layer. + + Implementors provide tool construction, agent construction, and + task-execution orchestration. The fixed adapter calls these three + entry-points only — nothing else crosses the boundary. + """ + + def create_tools(self, environment: Any) -> list: + """Return a list of tools configured for *environment*.""" + ... + + def create_agent(self, environment: Any) -> Any: + """Build and return a configured agent instance.""" + ... + + async def run_task(self, environment: Any, instruction: str) -> dict: + """Execute a task and return a result mapping.""" + ... 
+ + +@runtime_checkable +class EvaluatorContract(Protocol): + """Contract for trajectory evaluators. + + Evaluators receive a trajectory dict and an expected-outcome dict and + return a scalar score in the range [0.0, 1.0]. + """ + + def score(self, trajectory: dict, expected: dict) -> float: + """Score *trajectory* against *expected*. Returns a float in [0, 1].""" + ... + + +__all__ = ["AgentWorkflow", "EvaluatorContract"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_contracts.py b/tests/test_contracts.py new file mode 100644 index 0000000..4f61cb5 --- /dev/null +++ b/tests/test_contracts.py @@ -0,0 +1,86 @@ +"""Tests for contracts.py — interface contract definitions.""" + +from __future__ import annotations + +import importlib +import inspect + + +def test_contracts_module_exists() -> None: + """contracts module must be importable.""" + contracts = importlib.import_module("contracts") + assert contracts is not None + + +def test_agent_workflow_protocol_exists() -> None: + """contracts module must expose AgentWorkflow.""" + contracts = importlib.import_module("contracts") + assert hasattr(contracts, "AgentWorkflow"), "AgentWorkflow not found in contracts" + + +def test_evaluator_contract_protocol_exists() -> None: + """contracts module must expose EvaluatorContract.""" + contracts = importlib.import_module("contracts") + assert hasattr(contracts, "EvaluatorContract"), "EvaluatorContract not found in contracts" + + +def test_agent_workflow_has_required_methods() -> None: + """AgentWorkflow Protocol must declare run_task, create_tools, create_agent.""" + contracts = importlib.import_module("contracts") + AgentWorkflow = contracts.AgentWorkflow + + required = {"run_task", "create_tools", "create_agent"} + # Protocol methods appear as annotations or as actual members + members = set(dir(AgentWorkflow)) + assert required <= members, f"Missing methods: {required - members}" + + +def 
test_evaluator_contract_has_score_method() -> None: + """EvaluatorContract Protocol must declare score.""" + contracts = importlib.import_module("contracts") + EvaluatorContract = contracts.EvaluatorContract + + assert "score" in dir(EvaluatorContract), "score method not found in EvaluatorContract" + + +def test_agent_py_functions_are_callable() -> None: + """agent.py must define create_tools, create_agent, run_task as top-level functions. + + Verified via AST to avoid importing heavy runtime dependencies (harbor, openai-agents). + """ + import ast + from pathlib import Path # noqa: PLC0415 + + source = (Path(__file__).parent.parent / "agent.py").read_text() + tree = ast.parse(source) + + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and isinstance(node.col_offset, int) + and node.col_offset == 0 # module-level only + } + + for name in ("create_tools", "create_agent", "run_task"): + assert name in top_level_funcs, f"{name} not found as a top-level function in agent.py" + + +def test_run_task_is_coroutine_function() -> None: + """run_task in agent.py must be declared as async to satisfy the async contract. + + Verified via AST to avoid importing heavy runtime dependencies. 
+ """ + import ast + from pathlib import Path # noqa: PLC0415 + + source = (Path(__file__).parent.parent / "agent.py").read_text() + tree = ast.parse(source) + + async_top_level = { + node.name + for node in ast.walk(tree) + if isinstance(node, ast.AsyncFunctionDef) and node.col_offset == 0 + } + + assert "run_task" in async_top_level, "run_task must be an async def in agent.py" From f83c79bafaf411cb8103f7e7b89eac564272ca81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:42:34 +0900 Subject: [PATCH 02/12] refactor: extract fixed adapter boundary into adapter.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harbor 통합 코드(to_atif, AutoAgent)를 별도 adapter.py로 분리해 editable harness(agent.py)와 고정 어댑터 경계를 물리적으로 구분함. agent.py 하단에서 re-export해 Harbor의 agent:AutoAgent 진입점 호환성 유지. --- adapter.py | 191 ++++++++++++++++++++++++++++++++++++++++++ agent.py | 180 +-------------------------------------- tests/test_adapter.py | 123 +++++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 179 deletions(-) create mode 100644 adapter.py create mode 100644 tests/test_adapter.py diff --git a/adapter.py b/adapter.py new file mode 100644 index 0000000..5eaae1d --- /dev/null +++ b/adapter.py @@ -0,0 +1,191 @@ +"""Fixed Harbor adapter — DO NOT MODIFY. Read-only in production.""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone + +from agents.items import ( + ItemHelpers, + MessageOutputItem, + ReasoningItem, + ToolCallItem, + ToolCallOutputItem, +) +from agents.usage import Usage +from harbor.agents.base import BaseAgent +from harbor.environments.base import BaseEnvironment +from harbor.models.agent.context import AgentContext + +# Import editable harness entry-points (run_task, MODEL) from agent.py. 
+# agent.py imports *this* module at the bottom, so by the time Python resolves +# this import agent.py's module-level symbols are already defined — no cycle. +from agent import MODEL, run_task # noqa: E402 + + +def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: + """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.""" + steps: list[dict] = [] + step_id = 0 + now = datetime.now(timezone.utc).isoformat() + + def _step(source: str, message: str, **extra: object) -> dict: + nonlocal step_id + step_id += 1 + step = { + "step_id": step_id, + "timestamp": now, + "source": source, + "message": message, + } + step.update({key: value for key, value in extra.items() if value is not None}) + return step + + pending_tool_call = None + for item in result.new_items: + if isinstance(item, MessageOutputItem): + text = ItemHelpers.text_message_output(item) + if text: + steps.append(_step("agent", text, model_name=model)) + elif isinstance(item, ReasoningItem): + summaries = getattr(item.raw_item, "summary", None) + reasoning = ( + "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None + ) + if reasoning: + steps.append( + _step( + "agent", + "(thinking)", + reasoning_content=reasoning, + model_name=model, + ) + ) + elif isinstance(item, ToolCallItem): + raw = item.raw_item + if hasattr(raw, "name"): + pending_tool_call = raw + elif isinstance(item, ToolCallOutputItem) and pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + output_str = str(item.output) if item.output else "" + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ + { + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, + } + ], + observation={ + "results": [ + { + "source_call_id": pending_tool_call.call_id, + "content": output_str, + } + ] + }, + ) 
+ ) + pending_tool_call = None + + if pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ + { + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, + } + ], + ) + ) + + if not steps: + steps.append(_step("user", "(empty)")) + + usage = Usage() + for response in result.raw_responses: + usage.add(response.usage) + + return { + "schema_version": "ATIF-v1.6", + "session_id": getattr(result, "last_response_id", None) or "unknown", + "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model}, + "steps": steps, + "final_metrics": { + "total_prompt_tokens": usage.input_tokens, + "total_completion_tokens": usage.output_tokens, + "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0, + "total_cost_usd": None, + "total_steps": len(steps), + "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)}, + }, + } + + +class AutoAgent(BaseAgent): + """Harbor agent adapter. 
Runs the OpenAI agent host-side and proxies shell into the container.""" + + SUPPORTS_ATIF = True + + def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs): + super().__init__(*args, **kwargs) + self._extra_env = dict(extra_env) if extra_env else {} + + @staticmethod + def name() -> str: + return "autoagent" + + def version(self) -> str | None: + return "0.1.0" + + async def setup(self, environment: BaseEnvironment) -> None: + pass + + async def run( + self, instruction: str, environment: BaseEnvironment, context: AgentContext + ) -> None: + await environment.exec(command="mkdir -p /task") + instr_file = self.logs_dir / "instruction.md" + instr_file.write_text(instruction) + await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") + + result, duration_ms = await run_task(environment, instruction) + + atif = to_atif(result, model=MODEL, duration_ms=duration_ms) + traj_path = self.logs_dir / "trajectory.json" + traj_path.write_text(json.dumps(atif, indent=2)) + + try: + final_metrics = atif.get("final_metrics", {}) + context.n_input_tokens = final_metrics.get("total_prompt_tokens", 0) + context.n_output_tokens = final_metrics.get("total_completion_tokens", 0) + context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0) + except Exception: + pass + + usage = Usage() + for response in result.raw_responses: + usage.add(response.usage) + print( + f"turns={len(result.raw_responses)} duration_ms={duration_ms} " + f"input={usage.input_tokens} output={usage.output_tokens}" + ) + + +__all__ = ["AutoAgent", "to_atif"] diff --git a/agent.py b/agent.py index d155db4..168e2c4 100644 --- a/agent.py +++ b/agent.py @@ -2,23 +2,11 @@ from __future__ import annotations -import json import time -from datetime import datetime, timezone from agents import Agent, Runner, function_tool -from agents.items import ( - ItemHelpers, - MessageOutputItem, - ReasoningItem, - ToolCallItem, - ToolCallOutputItem, -) from agents.tool 
import FunctionTool -from agents.usage import Usage -from harbor.agents.base import BaseAgent from harbor.environments.base import BaseEnvironment -from harbor.models.agent.context import AgentContext # ============================================================================ @@ -73,170 +61,4 @@ async def run_task( return result, duration_ms -# ============================================================================ -# FIXED ADAPTER BOUNDARY: do not modify unless the human explicitly asks. -# Harbor integration and trajectory serialization live here. -# ============================================================================ - -def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: - """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.""" - steps: list[dict] = [] - step_id = 0 - now = datetime.now(timezone.utc).isoformat() - - def _step(source: str, message: str, **extra: object) -> dict: - nonlocal step_id - step_id += 1 - step = { - "step_id": step_id, - "timestamp": now, - "source": source, - "message": message, - } - step.update({key: value for key, value in extra.items() if value is not None}) - return step - - pending_tool_call = None - for item in result.new_items: - if isinstance(item, MessageOutputItem): - text = ItemHelpers.text_message_output(item) - if text: - steps.append(_step("agent", text, model_name=model)) - elif isinstance(item, ReasoningItem): - summaries = getattr(item.raw_item, "summary", None) - reasoning = "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None - if reasoning: - steps.append( - _step( - "agent", - "(thinking)", - reasoning_content=reasoning, - model_name=model, - ) - ) - elif isinstance(item, ToolCallItem): - raw = item.raw_item - if hasattr(raw, "name"): - pending_tool_call = raw - elif isinstance(item, ToolCallOutputItem) and pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else 
pending_tool_call.arguments - ) - output_str = str(item.output) if item.output else "" - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - observation={ - "results": [ - { - "source_call_id": pending_tool_call.call_id, - "content": output_str, - } - ] - }, - ) - ) - pending_tool_call = None - - if pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else pending_tool_call.arguments - ) - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - ) - ) - - if not steps: - steps.append(_step("user", "(empty)")) - - usage = Usage() - for response in result.raw_responses: - usage.add(response.usage) - - return { - "schema_version": "ATIF-v1.6", - "session_id": getattr(result, "last_response_id", None) or "unknown", - "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model}, - "steps": steps, - "final_metrics": { - "total_prompt_tokens": usage.input_tokens, - "total_completion_tokens": usage.output_tokens, - "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0, - "total_cost_usd": None, - "total_steps": len(steps), - "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)}, - }, - } - - -class AutoAgent(BaseAgent): - """Harbor agent adapter. 
Runs the OpenAI agent host-side and proxies shell into the container.""" - - SUPPORTS_ATIF = True - - def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs): - super().__init__(*args, **kwargs) - self._extra_env = dict(extra_env) if extra_env else {} - - @staticmethod - def name() -> str: - return "autoagent" - - def version(self) -> str | None: - return "0.1.0" - - async def setup(self, environment: BaseEnvironment) -> None: - pass - - async def run(self, instruction: str, environment: BaseEnvironment, context: AgentContext) -> None: - await environment.exec(command="mkdir -p /task") - instr_file = self.logs_dir / "instruction.md" - instr_file.write_text(instruction) - await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") - - result, duration_ms = await run_task(environment, instruction) - - atif = to_atif(result, model=MODEL, duration_ms=duration_ms) - traj_path = self.logs_dir / "trajectory.json" - traj_path.write_text(json.dumps(atif, indent=2)) - - try: - final_metrics = atif.get("final_metrics", {}) - context.n_input_tokens = final_metrics.get("total_prompt_tokens", 0) - context.n_output_tokens = final_metrics.get("total_completion_tokens", 0) - context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0) - except Exception: - pass - - usage = Usage() - for response in result.raw_responses: - usage.add(response.usage) - print( - f"turns={len(result.raw_responses)} duration_ms={duration_ms} " - f"input={usage.input_tokens} output={usage.output_tokens}" - ) - - -__all__ = ["AutoAgent"] +from adapter import AutoAgent, to_atif # noqa: F401 — Harbor entrypoint diff --git a/tests/test_adapter.py b/tests/test_adapter.py new file mode 100644 index 0000000..595ca1a --- /dev/null +++ b/tests/test_adapter.py @@ -0,0 +1,123 @@ +"""Tests for adapter.py — fixed Harbor adapter boundary.""" + +from __future__ import annotations + +import ast +import hashlib +from pathlib import Path + + +ADAPTER_PATH = 
Path(__file__).parent.parent / "adapter.py" +AGENT_PATH = Path(__file__).parent.parent / "agent.py" + + +def _parse_adapter() -> ast.Module: + return ast.parse(ADAPTER_PATH.read_text()) + + +def test_adapter_file_exists() -> None: + """adapter.py must exist on disk.""" + assert ADAPTER_PATH.exists(), "adapter.py not found" + + +def test_adapter_exposes_autoagent() -> None: + """adapter.py must define AutoAgent class at module level. + + Verified via AST to avoid importing heavy runtime dependencies. + """ + tree = _parse_adapter() + top_level_classes = { + node.name + for node in ast.walk(tree) + if isinstance(node, ast.ClassDef) and node.col_offset == 0 + } + assert "AutoAgent" in top_level_classes, "AutoAgent class not found in adapter.py" + + +def test_adapter_exposes_to_atif() -> None: + """adapter.py must define to_atif function at module level. + + Verified via AST to avoid importing heavy runtime dependencies. + """ + tree = _parse_adapter() + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and node.col_offset == 0 + } + assert "to_atif" in top_level_funcs, "to_atif function not found in adapter.py" + + +def test_autoagent_supports_atif() -> None: + """AutoAgent must declare SUPPORTS_ATIF = True as a class-level assignment. + + Verified via AST to avoid importing heavy runtime dependencies. 
+ """ + tree = _parse_adapter() + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == "AutoAgent": + for stmt in node.body: + # SUPPORTS_ATIF = True → ast.Assign or ast.AnnAssign + if isinstance(stmt, ast.Assign): + for target in stmt.targets: + if isinstance(target, ast.Name) and target.id == "SUPPORTS_ATIF": + value = stmt.value + assert isinstance(value, ast.Constant) and value.value is True, ( + "AutoAgent.SUPPORTS_ATIF must be True" + ) + return + raise AssertionError("AutoAgent.SUPPORTS_ATIF = True not found in adapter.py") + + +def test_adapter_source_hash_is_stable() -> None: + """adapter.py source hash must be a valid SHA-256 hex digest (tamper detection baseline). + + This test does NOT assert a specific hash value — it verifies that the file + exists and produces a well-formed digest. Pin the expected value in CI/CD if + immutability enforcement is required. + """ + source = ADAPTER_PATH.read_bytes() + digest = hashlib.sha256(source).hexdigest() + assert len(digest) == 64, "SHA-256 digest must be 64 hex characters" + assert all(c in "0123456789abcdef" for c in digest), "Digest must be lowercase hex" + + +def test_agent_imports_autoagent_for_harbor_compat() -> None: + """agent.py must re-export AutoAgent via 'from adapter import AutoAgent'. + + Harbor uses --agent-import-path agent:AutoAgent, so agent.py must expose it. + Verified via AST to avoid importing heavy runtime dependencies. + """ + source = AGENT_PATH.read_text() + tree = ast.parse(source) + + found = False + for node in ast.walk(tree): + if ( + isinstance(node, ast.ImportFrom) + and node.module == "adapter" + and any(alias.name == "AutoAgent" for alias in node.names) + ): + found = True + break + + assert found, "agent.py must re-export AutoAgent via 'from adapter import AutoAgent'" + + +def test_adapter_does_not_define_run_task() -> None: + """run_task must live in agent.py (editable harness), not adapter.py. + + Verified via AST to keep the boundary clean. 
+ """ + tree = _parse_adapter() + top_level_funcs = { + node.name + for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + and node.col_offset == 0 + } + assert "run_task" not in top_level_funcs, ( + "run_task must not be defined in adapter.py — it belongs in the editable harness (agent.py)" + ) From 0d5c29736758e75650e7d3b5ffb9724759c3d309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:45:13 +0900 Subject: [PATCH 03/12] feat: Docker read-only + network isolation for eval containers --- Dockerfile.base | 9 +++------ scripts/run_eval.sh | 12 ++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) create mode 100755 scripts/run_eval.sh diff --git a/Dockerfile.base b/Dockerfile.base index 705ad73..599e69c 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -5,13 +5,10 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* WORKDIR /app - -# Python deps — only what the agent needs (harbor excluded via .dockerignore) +COPY contracts.py adapter.py /app/fixed/ +COPY agent.py /app/editable/ COPY pyproject.toml ./ RUN uv pip install --system . 
- -# Agent code -COPY agent.py ./ - +ENV PYTHONPATH=/app/fixed:/app/editable:/app RUN ln -sf $(which python3) /usr/local/bin/python RUN mkdir -p /logs /app/output diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh new file mode 100755 index 0000000..b322419 --- /dev/null +++ b/scripts/run_eval.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +NETWORK="${EVAL_NETWORK:-none}" +docker run --rm --read-only \ + --network="$NETWORK" \ + --tmpfs /tmp:size=512M \ + --mount type=bind,source="$(pwd)/adapter.py",target=/app/fixed/adapter.py,readonly \ + --mount type=bind,source="$(pwd)/contracts.py",target=/app/fixed/contracts.py,readonly \ + -v "$(pwd)/agent.py:/app/editable/agent.py:rw" \ + -e PYTHONPATH=/app/fixed:/app/editable:/app \ + --security-opt no-new-privileges:true \ + autoagent-base "$@" From f2db2a144992d80657d6beca35a6341c8b65b180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:45:46 +0900 Subject: [PATCH 04/12] feat: preflight policy gate for mutation validation --- preflight.py | 49 ++++++++++++++++++++++++++++++++ tests/test_preflight.py | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 preflight.py create mode 100644 tests/test_preflight.py diff --git a/preflight.py b/preflight.py new file mode 100644 index 0000000..e19c6ac --- /dev/null +++ b/preflight.py @@ -0,0 +1,49 @@ +"""Preflight policy gate — rule-based diff checker for mutation validation.""" + +import re +from dataclasses import dataclass + +FIXED_FILES = {"adapter.py", "contracts.py", "__init__.py"} + +FORBIDDEN_PATTERNS = [ + r'\bimport\s+importlib\b', + r'\bimport\s+ctypes\b', + r'\bsys\.modules\b', +] + + +@dataclass +class PreflightResult: + rejected: bool + reason: str + + +def check_diff(diff_text: str) -> PreflightResult: + """Check a unified diff for policy violations. + + Args: + diff_text: Unified diff string to validate. 
+ + Returns: + PreflightResult with rejected=True and a reason if any rule is violated. + """ + for line in diff_text.splitlines(): + # Check if any fixed file is being modified (appears in diff --git header) + if line.startswith("diff --git"): + for fixed_file in FIXED_FILES: + if f"/{fixed_file}" in line or f" {fixed_file}" in line: + return PreflightResult( + rejected=True, + reason=f"modification of fixed file detected: {fixed_file}", + ) + + # Check forbidden patterns only in added lines + if line.startswith("+") and not line.startswith("+++"): + for pattern in FORBIDDEN_PATTERNS: + if re.search(pattern, line): + return PreflightResult( + rejected=True, + reason=f"forbidden pattern found: {pattern}", + ) + + return PreflightResult(rejected=False, reason="") diff --git a/tests/test_preflight.py b/tests/test_preflight.py new file mode 100644 index 0000000..f48a443 --- /dev/null +++ b/tests/test_preflight.py @@ -0,0 +1,63 @@ +"""Tests for preflight policy gate.""" + +from preflight import check_diff + + +def test_reject_fixed_modification(): + diff = """\ +diff --git a/adapter.py b/adapter.py +--- a/adapter.py ++++ b/adapter.py +@@ -1,3 +1,4 @@ ++# modified + import autoagent +""" + result = check_diff(diff) + assert result.rejected is True + assert "adapter.py" in result.reason + + +def test_reject_forbidden_import(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++import importlib + async def run_task(task): + pass +""" + result = check_diff(diff) + assert result.rejected is True + assert result.reason != "" + + +def test_allow_clean_change(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,5 @@ ++import os ++ + async def run_task(task): +- pass ++ return {"score": 1.0} +""" + result = check_diff(diff) + assert result.rejected is False + + +def test_reject_sys_modules(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ 
++sys.modules["os"] = None + async def run_task(task): + pass +""" + result = check_diff(diff) + assert result.rejected is True + assert result.reason != "" From 5c6ad96a4801300844adf4c0493c9aa8b05f5114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:46:24 +0900 Subject: [PATCH 05/12] feat: structured experiment logging with ATIF sidecar index --- experiment_log.py | 60 +++++++++++++++++++++++++++++++++ tests/test_experiment_log.py | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 experiment_log.py create mode 100644 tests/test_experiment_log.py diff --git a/experiment_log.py b/experiment_log.py new file mode 100644 index 0000000..bc168cd --- /dev/null +++ b/experiment_log.py @@ -0,0 +1,60 @@ +"""Structured experiment logging with ATIF sidecar index.""" + +import json +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Optional + + +@dataclass +class ExperimentEntry: + version: str + parent: Optional[str] + schema_version: int + editable_tree_hash: str + fixed_tree_hash: str + contract_version: str + container_image_digest: str + scores: dict[str, Any] + cost_usd: float + tokens_used: int + duration_sec: float + trace_id: str + atif_version: str + trajectory_uri: str + delta: dict[str, Any] + root_cause: str + meta_reasoning: str + network_profile: str + evaluator_digest: str + timestamp: str + meta: dict[str, Any] = field(default_factory=dict) + + +class ExperimentLogger: + """Append-only JSONL logger for experiment entries.""" + + def __init__(self, path: Path) -> None: + self._path = path + + def append(self, entry: ExperimentEntry) -> None: + """Append an entry to the JSONL log file.""" + with self._path.open("a", encoding="utf-8") as f: + f.write(json.dumps(asdict(entry)) + "\n") + + def read_all(self) -> list[ExperimentEntry]: + """Read all entries from the log file. 
+ + Returns: + List of ExperimentEntry objects, or empty list if file does not exist. + """ + if not self._path.exists(): + return [] + entries = [] + with self._path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + data = json.loads(line) + entries.append(ExperimentEntry(**data)) + return entries diff --git a/tests/test_experiment_log.py b/tests/test_experiment_log.py new file mode 100644 index 0000000..5dfb873 --- /dev/null +++ b/tests/test_experiment_log.py @@ -0,0 +1,65 @@ +"""Tests for structured experiment logging.""" + +import json +import tempfile +from pathlib import Path + +from experiment_log import ExperimentEntry, ExperimentLogger + + +def _make_entry(**kwargs) -> ExperimentEntry: + defaults = dict( + version="v0.1.0", + parent=None, + schema_version=1, + editable_tree_hash="abc123", + fixed_tree_hash="def456", + contract_version="1.0", + container_image_digest="sha256:deadbeef", + scores={"pass@1": 0.85}, + cost_usd=0.05, + tokens_used=1500, + duration_sec=12.3, + trace_id="trace-001", + atif_version="0.3.0", + trajectory_uri="s3://bucket/traj/001.jsonl", + delta={"agent.py": "+10 -2"}, + root_cause="improved retry logic", + meta_reasoning="higher pass@1 expected from retry", + network_profile="none", + evaluator_digest="sha256:cafebabe", + timestamp="2026-04-04T00:00:00Z", + ) + defaults.update(kwargs) + return ExperimentEntry(**defaults) + + +def test_entry_has_atif_connection_keys(): + entry = _make_entry() + assert hasattr(entry, "trace_id") + assert hasattr(entry, "trajectory_uri") + assert entry.trace_id == "trace-001" + assert entry.trajectory_uri == "s3://bucket/traj/001.jsonl" + + +def test_append_and_read_back(): + with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as f: + path = Path(f.name) + + try: + logger = ExperimentLogger(path) + entry = _make_entry() + logger.append(entry) + + entries = logger.read_all() + assert len(entries) == 1 + assert entries[0].version == "v0.1.0" + assert 
entries[0].scores == {"pass@1": 0.85} + assert entries[0].trace_id == "trace-001" + finally: + path.unlink(missing_ok=True) + + +def test_read_all_returns_empty_list_for_missing_file(): + logger = ExperimentLogger(Path("/tmp/nonexistent_experiment_log_xyz.jsonl")) + assert logger.read_all() == [] From b95dc8a2f45547137d700db224f70099aa56a805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:48:47 +0900 Subject: [PATCH 06/12] feat: add 5 smoke test tasks for Level 1 evaluation --- scripts/run_smoke.sh | 5 ++++ tasks/smoke/csv-analysis/instruction.md | 1 + tasks/smoke/csv-analysis/task.toml | 4 +++ tasks/smoke/csv-analysis/tests/test.sh | 31 +++++++++++++++++++++ tasks/smoke/fibonacci/instruction.md | 1 + tasks/smoke/fibonacci/task.toml | 4 +++ tasks/smoke/fibonacci/tests/test.sh | 17 ++++++++++++ tasks/smoke/git-log/instruction.md | 1 + tasks/smoke/git-log/task.toml | 4 +++ tasks/smoke/git-log/tests/test.sh | 32 ++++++++++++++++++++++ tasks/smoke/hello-world/instruction.md | 1 + tasks/smoke/hello-world/task.toml | 4 +++ tasks/smoke/hello-world/tests/test.sh | 17 ++++++++++++ tasks/smoke/text-processing/instruction.md | 1 + tasks/smoke/text-processing/task.toml | 4 +++ tasks/smoke/text-processing/tests/test.sh | 17 ++++++++++++ 16 files changed, 144 insertions(+) create mode 100755 scripts/run_smoke.sh create mode 100644 tasks/smoke/csv-analysis/instruction.md create mode 100644 tasks/smoke/csv-analysis/task.toml create mode 100755 tasks/smoke/csv-analysis/tests/test.sh create mode 100644 tasks/smoke/fibonacci/instruction.md create mode 100644 tasks/smoke/fibonacci/task.toml create mode 100755 tasks/smoke/fibonacci/tests/test.sh create mode 100644 tasks/smoke/git-log/instruction.md create mode 100644 tasks/smoke/git-log/task.toml create mode 100755 tasks/smoke/git-log/tests/test.sh create mode 100644 tasks/smoke/hello-world/instruction.md create mode 100644 tasks/smoke/hello-world/task.toml create mode 100755 
tasks/smoke/hello-world/tests/test.sh create mode 100644 tasks/smoke/text-processing/instruction.md create mode 100644 tasks/smoke/text-processing/task.toml create mode 100755 tasks/smoke/text-processing/tests/test.sh diff --git a/scripts/run_smoke.sh b/scripts/run_smoke.sh new file mode 100755 index 0000000..37cbdd2 --- /dev/null +++ b/scripts/run_smoke.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail +echo "=== Smoke Test (Level 1) ===" +uv run harbor run -p tasks/smoke/ --agent-import-path agent:AutoAgent -o jobs/smoke +echo "=== Smoke Complete ===" diff --git a/tasks/smoke/csv-analysis/instruction.md b/tasks/smoke/csv-analysis/instruction.md new file mode 100644 index 0000000..485cbff --- /dev/null +++ b/tasks/smoke/csv-analysis/instruction.md @@ -0,0 +1 @@ +Create a CSV file at /task/output/data.csv with headers "name,score" and 3 rows: Alice,85 Bob,92 Charlie,78. Then write the average score to /task/output/average.txt diff --git a/tasks/smoke/csv-analysis/task.toml b/tasks/smoke/csv-analysis/task.toml new file mode 100644 index 0000000..60ccecd --- /dev/null +++ b/tasks/smoke/csv-analysis/task.toml @@ -0,0 +1,4 @@ +[task] +name = "csv-analysis" +description = "Create a CSV file and compute the average score" +timeout_sec = 60 diff --git a/tasks/smoke/csv-analysis/tests/test.sh b/tasks/smoke/csv-analysis/tests/test.sh new file mode 100755 index 0000000..3b341e2 --- /dev/null +++ b/tasks/smoke/csv-analysis/tests/test.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +CSV_FILE="/task/output/data.csv" +AVG_FILE="/task/output/average.txt" + +if [ ! -f "$CSV_FILE" ]; then + echo "FAIL: $CSV_FILE does not exist" + exit 1 +fi + +if [ ! 
-f "$AVG_FILE" ]; then + echo "FAIL: $AVG_FILE does not exist" + exit 1 +fi + +# Count data rows (excluding header) +DATA_ROWS=$(tail -n +2 "$CSV_FILE" | grep -c '[^[:space:]]' || true) +if [ "$DATA_ROWS" -ne 3 ]; then + echo "FAIL: expected 3 data rows but got $DATA_ROWS" + exit 1 +fi + +# Check average (integer: (85+92+78)/3 = 85) +AVG_CONTENT=$(cat "$AVG_FILE" | tr -d '[:space:]') +if [ "$AVG_CONTENT" != "85" ]; then + echo "FAIL: expected average '85' but got '$AVG_CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/fibonacci/instruction.md b/tasks/smoke/fibonacci/instruction.md new file mode 100644 index 0000000..ce5b28a --- /dev/null +++ b/tasks/smoke/fibonacci/instruction.md @@ -0,0 +1 @@ +Calculate the 10th Fibonacci number and write it to /task/output/fib.txt diff --git a/tasks/smoke/fibonacci/task.toml b/tasks/smoke/fibonacci/task.toml new file mode 100644 index 0000000..59a3c26 --- /dev/null +++ b/tasks/smoke/fibonacci/task.toml @@ -0,0 +1,4 @@ +[task] +name = "fibonacci" +description = "Calculate the 10th Fibonacci number and write it to output" +timeout_sec = 60 diff --git a/tasks/smoke/fibonacci/tests/test.sh b/tasks/smoke/fibonacci/tests/test.sh new file mode 100755 index 0000000..9ba948e --- /dev/null +++ b/tasks/smoke/fibonacci/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/fib.txt" + +if [ ! 
-f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE" | tr -d '[:space:]') +if [ "$CONTENT" != "55" ]; then + echo "FAIL: expected '55' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/git-log/instruction.md b/tasks/smoke/git-log/instruction.md new file mode 100644 index 0000000..6478715 --- /dev/null +++ b/tasks/smoke/git-log/instruction.md @@ -0,0 +1 @@ +Initialize a git repo in /tmp/test-repo, create 3 commits with messages "first", "second", "third", then write the output of "git log --oneline" to /task/output/log.txt diff --git a/tasks/smoke/git-log/task.toml b/tasks/smoke/git-log/task.toml new file mode 100644 index 0000000..cfdf8a9 --- /dev/null +++ b/tasks/smoke/git-log/task.toml @@ -0,0 +1,4 @@ +[task] +name = "git-log" +description = "Initialize a git repo, create 3 commits, and write git log to output" +timeout_sec = 120 diff --git a/tasks/smoke/git-log/tests/test.sh b/tasks/smoke/git-log/tests/test.sh new file mode 100755 index 0000000..c5d9299 --- /dev/null +++ b/tasks/smoke/git-log/tests/test.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/log.txt" + +if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +LINE_COUNT=$(wc -l < "$OUTPUT_FILE") +if [ "$LINE_COUNT" -ne 3 ]; then + echo "FAIL: expected 3 lines but got $LINE_COUNT" + exit 1 +fi + +if ! grep -q "first" "$OUTPUT_FILE"; then + echo "FAIL: 'first' not found in log" + exit 1 +fi + +if ! grep -q "second" "$OUTPUT_FILE"; then + echo "FAIL: 'second' not found in log" + exit 1 +fi + +if ! grep -q "third" "$OUTPUT_FILE"; then + echo "FAIL: 'third' not found in log" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/hello-world/instruction.md b/tasks/smoke/hello-world/instruction.md new file mode 100644 index 0000000..a3f6479 --- /dev/null +++ b/tasks/smoke/hello-world/instruction.md @@ -0,0 +1 @@ +Write the text "Hello, World!" 
to /task/output/hello.txt diff --git a/tasks/smoke/hello-world/task.toml b/tasks/smoke/hello-world/task.toml new file mode 100644 index 0000000..23e6b43 --- /dev/null +++ b/tasks/smoke/hello-world/task.toml @@ -0,0 +1,4 @@ +[task] +name = "hello-world" +description = "Write hello world to output file" +timeout_sec = 60 diff --git a/tasks/smoke/hello-world/tests/test.sh b/tasks/smoke/hello-world/tests/test.sh new file mode 100755 index 0000000..b8b925d --- /dev/null +++ b/tasks/smoke/hello-world/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/hello.txt" + +if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE") +if [ "$CONTENT" != "Hello, World!" ]; then + echo "FAIL: expected 'Hello, World!' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" diff --git a/tasks/smoke/text-processing/instruction.md b/tasks/smoke/text-processing/instruction.md new file mode 100644 index 0000000..b9fcbb8 --- /dev/null +++ b/tasks/smoke/text-processing/instruction.md @@ -0,0 +1 @@ +Count the number of words in the following text and write the count to /task/output/count.txt: "The quick brown fox jumps over the lazy dog" diff --git a/tasks/smoke/text-processing/task.toml b/tasks/smoke/text-processing/task.toml new file mode 100644 index 0000000..58b7bf2 --- /dev/null +++ b/tasks/smoke/text-processing/task.toml @@ -0,0 +1,4 @@ +[task] +name = "text-processing" +description = "Count words in a given text and write the count to output" +timeout_sec = 60 diff --git a/tasks/smoke/text-processing/tests/test.sh b/tasks/smoke/text-processing/tests/test.sh new file mode 100755 index 0000000..9ae004a --- /dev/null +++ b/tasks/smoke/text-processing/tests/test.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +OUTPUT_FILE="/task/output/count.txt" + +if [ ! 
-f "$OUTPUT_FILE" ]; then + echo "FAIL: $OUTPUT_FILE does not exist" + exit 1 +fi + +CONTENT=$(cat "$OUTPUT_FILE" | tr -d '[:space:]') +if [ "$CONTENT" != "9" ]; then + echo "FAIL: expected '9' but got '$CONTENT'" + exit 1 +fi + +echo "PASS" From 0ff372816aeec33c4136d90cef890e711cdbb383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 10:56:29 +0900 Subject: [PATCH 07/12] fix: address code review findings - adapter.py: lazy import to avoid circular dependency on standalone import - adapter.py: track pending tool calls by call_id (dict) instead of single slot - preflight.py: exact root-relative path matching to avoid false positives - contracts.py: fix run_task return type to tuple[Any, int] - tests: add test for nested __init__.py not being flagged as fixed file --- adapter.py | 76 ++++++++++++++++++++++------------------- contracts.py | 4 +-- preflight.py | 18 ++++++---- tests/test_preflight.py | 13 +++++++ 4 files changed, 67 insertions(+), 44 deletions(-) diff --git a/adapter.py b/adapter.py index 5eaae1d..21778e6 100644 --- a/adapter.py +++ b/adapter.py @@ -17,10 +17,10 @@ from harbor.environments.base import BaseEnvironment from harbor.models.agent.context import AgentContext -# Import editable harness entry-points (run_task, MODEL) from agent.py. -# agent.py imports *this* module at the bottom, so by the time Python resolves -# this import agent.py's module-level symbols are already defined — no cycle. 
-from agent import MODEL, run_task # noqa: E402 +def _load_harness(): + """Lazy import to avoid circular dependency with agent.py.""" + from agent import MODEL, run_task + return MODEL, run_task def to_atif(result: object, model: str, duration_ms: int = 0) -> dict: @@ -41,7 +41,7 @@ def _step(source: str, message: str, **extra: object) -> dict: step.update({key: value for key, value in extra.items() if value is not None}) return step - pending_tool_call = None + pending_tool_calls: dict[str, object] = {} for item in result.new_items: if isinstance(item, MessageOutputItem): text = ItemHelpers.text_message_output(item) @@ -63,39 +63,44 @@ def _step(source: str, message: str, **extra: object) -> dict: ) elif isinstance(item, ToolCallItem): raw = item.raw_item - if hasattr(raw, "name"): - pending_tool_call = raw - elif isinstance(item, ToolCallOutputItem) and pending_tool_call: - arguments = ( - json.loads(pending_tool_call.arguments) - if isinstance(pending_tool_call.arguments, str) - else pending_tool_call.arguments + if hasattr(raw, "call_id") and hasattr(raw, "name"): + pending_tool_calls[raw.call_id] = raw + elif isinstance(item, ToolCallOutputItem): + output_call_id = ( + getattr(item.raw_item, "call_id", None) + or getattr(item.raw_item, "tool_call_id", None) ) - output_str = str(item.output) if item.output else "" - steps.append( - _step( - "agent", - f"Tool: {pending_tool_call.name}", - tool_calls=[ - { - "tool_call_id": pending_tool_call.call_id, - "function_name": pending_tool_call.name, - "arguments": arguments, - } - ], - observation={ - "results": [ + pending_tool_call = pending_tool_calls.pop(output_call_id, None) if output_call_id else None + if pending_tool_call: + arguments = ( + json.loads(pending_tool_call.arguments) + if isinstance(pending_tool_call.arguments, str) + else pending_tool_call.arguments + ) + output_str = str(item.output) if item.output else "" + steps.append( + _step( + "agent", + f"Tool: {pending_tool_call.name}", + tool_calls=[ { - 
"source_call_id": pending_tool_call.call_id, - "content": output_str, + "tool_call_id": pending_tool_call.call_id, + "function_name": pending_tool_call.name, + "arguments": arguments, } - ] - }, + ], + observation={ + "results": [ + { + "source_call_id": pending_tool_call.call_id, + "content": output_str, + } + ] + }, + ) ) - ) - pending_tool_call = None - if pending_tool_call: + for pending_tool_call in pending_tool_calls.values(): arguments = ( json.loads(pending_tool_call.arguments) if isinstance(pending_tool_call.arguments, str) @@ -165,9 +170,10 @@ async def run( instr_file.write_text(instruction) await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md") - result, duration_ms = await run_task(environment, instruction) + model, run_task_fn = _load_harness() + result, duration_ms = await run_task_fn(environment, instruction) - atif = to_atif(result, model=MODEL, duration_ms=duration_ms) + atif = to_atif(result, model=model, duration_ms=duration_ms) traj_path = self.logs_dir / "trajectory.json" traj_path.write_text(json.dumps(atif, indent=2)) diff --git a/contracts.py b/contracts.py index 843f94c..42f54bd 100644 --- a/contracts.py +++ b/contracts.py @@ -27,8 +27,8 @@ def create_agent(self, environment: Any) -> Any: """Build and return a configured agent instance.""" ... - async def run_task(self, environment: Any, instruction: str) -> dict: - """Execute a task and return a result mapping.""" + async def run_task(self, environment: Any, instruction: str) -> tuple[Any, int]: + """Execute a task and return (result, duration_ms).""" ... diff --git a/preflight.py b/preflight.py index e19c6ac..b78b3c4 100644 --- a/preflight.py +++ b/preflight.py @@ -28,14 +28,18 @@ def check_diff(diff_text: str) -> PreflightResult: PreflightResult with rejected=True and a reason if any rule is violated. 
""" for line in diff_text.splitlines(): - # Check if any fixed file is being modified (appears in diff --git header) + # Check if any fixed file is being modified (exact root-relative path) if line.startswith("diff --git"): - for fixed_file in FIXED_FILES: - if f"/{fixed_file}" in line or f" {fixed_file}" in line: - return PreflightResult( - rejected=True, - reason=f"modification of fixed file detected: {fixed_file}", - ) + parts = line.split() + if len(parts) >= 4: + left = parts[2].removeprefix("a/") + right = parts[3].removeprefix("b/") + for path in (left, right): + if path in FIXED_FILES: + return PreflightResult( + rejected=True, + reason=f"modification of fixed file detected: {path}", + ) # Check forbidden patterns only in added lines if line.startswith("+") and not line.startswith("+++"): diff --git a/tests/test_preflight.py b/tests/test_preflight.py index f48a443..3f4a864 100644 --- a/tests/test_preflight.py +++ b/tests/test_preflight.py @@ -48,6 +48,19 @@ async def run_task(task): assert result.rejected is False +def test_allow_nested_init_py(): + """tests/__init__.py should NOT be rejected (only root __init__.py is fixed).""" + diff = """\ +diff --git a/tests/__init__.py b/tests/__init__.py +--- a/tests/__init__.py ++++ b/tests/__init__.py +@@ -0,0 +1 @@ ++# test package +""" + result = check_diff(diff) + assert result.rejected is False + + def test_reject_sys_modules(): diff = """\ diff --git a/agent.py b/agent.py From 83370c2395ba7ab10d7a4a20f8ca112a2cd7e032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:01:13 +0900 Subject: [PATCH 08/12] fix: address round 2 code review findings - preflight: broaden forbidden patterns (from/import variants, __import__) - run_eval.sh: add writable tmpfs for /task, /logs, /app/output - tests: add cases for from importlib and __import__ bypass --- preflight.py | 5 +++-- scripts/run_eval.sh | 3 +++ tests/test_preflight.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 
insertions(+), 2 deletions(-) diff --git a/preflight.py b/preflight.py index b78b3c4..ad81db5 100644 --- a/preflight.py +++ b/preflight.py @@ -6,9 +6,10 @@ FIXED_FILES = {"adapter.py", "contracts.py", "__init__.py"} FORBIDDEN_PATTERNS = [ - r'\bimport\s+importlib\b', - r'\bimport\s+ctypes\b', + r'\b(?:from|import)\s+importlib\b', + r'\b(?:from|import)\s+ctypes\b', r'\bsys\.modules\b', + r'\b__import__\s*\(', ] diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh index b322419..5ffa3a8 100755 --- a/scripts/run_eval.sh +++ b/scripts/run_eval.sh @@ -4,6 +4,9 @@ NETWORK="${EVAL_NETWORK:-none}" docker run --rm --read-only \ --network="$NETWORK" \ --tmpfs /tmp:size=512M \ + --tmpfs /task:size=512M \ + --tmpfs /logs:size=128M \ + --tmpfs /app/output:size=128M \ --mount type=bind,source="$(pwd)/adapter.py",target=/app/fixed/adapter.py,readonly \ --mount type=bind,source="$(pwd)/contracts.py",target=/app/fixed/contracts.py,readonly \ -v "$(pwd)/agent.py:/app/editable/agent.py:rw" \ diff --git a/tests/test_preflight.py b/tests/test_preflight.py index 3f4a864..d356f21 100644 --- a/tests/test_preflight.py +++ b/tests/test_preflight.py @@ -61,6 +61,30 @@ def test_allow_nested_init_py(): assert result.rejected is False +def test_reject_from_importlib(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++from importlib import import_module +""" + result = check_diff(diff) + assert result.rejected is True + + +def test_reject_dunder_import(): + diff = """\ +diff --git a/agent.py b/agent.py +--- a/agent.py ++++ b/agent.py +@@ -1,3 +1,4 @@ ++mod = __import__("os") +""" + result = check_diff(diff) + assert result.rejected is True + + def test_reject_sys_modules(): diff = """\ diff --git a/agent.py b/agent.py From 5e750b42066f0062650cc0adaa27e641d0a90831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:12:27 +0900 Subject: [PATCH 09/12] feat: evolutionary archive with exploit/explore pools 
--- archive_manager.py | 201 ++++++++++++++++++++++++++++++++++ tests/test_archive_manager.py | 160 +++++++++++++++++++++++++++ 2 files changed, 361 insertions(+) create mode 100644 archive_manager.py create mode 100644 tests/test_archive_manager.py diff --git a/archive_manager.py b/archive_manager.py new file mode 100644 index 0000000..bdc03c1 --- /dev/null +++ b/archive_manager.py @@ -0,0 +1,201 @@ +"""Evolutionary archive with exploit/explore pools.""" +import hashlib +import json +import shutil +import tarfile +from pathlib import Path + + +class ArchiveManager: + """Manages versioned snapshots of an editable directory in exploit/explore pools. + + Args: + archive_dir: Directory where tarballs and index.jsonl are stored. + editable_dir: Directory being snapshotted/restored. + cap: Total max versions across both pools. + exploit_cap: Max versions in exploit pool. + explore_cap: Max versions in explore pool. + explore_protected_min: Minimum explore entries that survive cap enforcement. + """ + + def __init__( + self, + archive_dir: Path, + editable_dir: Path, + cap: int = 50, + exploit_cap: int = 30, + explore_cap: int = 20, + explore_protected_min: int = 10, + ) -> None: + self.archive_dir = Path(archive_dir) + self.editable_dir = Path(editable_dir) + self.cap = cap + self.exploit_cap = exploit_cap + self.explore_cap = explore_cap + self.explore_protected_min = explore_protected_min + self.archive_dir.mkdir(parents=True, exist_ok=True) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def snapshot( + self, + version: str, + scores: dict, + pool: str = "exploit", + generation: int = 0, + stale_generations: int = 0, + ) -> str: + """Tarball editable_dir, record metadata, return tree_hash. + + Args: + version: Unique version identifier. + scores: Dict of suite→score floats. + pool: "exploit" or "explore". + generation: Current generation counter. 
+ stale_generations: How many generations without improvement. + + Returns: + SHA-256 hex digest of directory contents. + """ + tree_hash = self._hash_dir(self.editable_dir) + tarball_path = self.archive_dir / f"{version}.tar.gz" + with tarfile.open(tarball_path, "w:gz") as tar: + tar.add(self.editable_dir, arcname=".") + + entry = { + "version": version, + "pool": pool, + "scores": scores, + "tree_hash": tree_hash, + "generation": generation, + "stale_generations": stale_generations, + } + index_path = self.archive_dir / "index.jsonl" + with index_path.open("a") as f: + f.write(json.dumps(entry) + "\n") + + self._enforce_cap() + return tree_hash + + def restore(self, version: str) -> None: + """Extract the tarball for version back into editable_dir. + + Args: + version: Version identifier to restore. + """ + tarball_path = self.archive_dir / f"{version}.tar.gz" + if not tarball_path.exists(): + raise FileNotFoundError(f"No tarball for version {version!r}") + + # Clear editable_dir then extract + shutil.rmtree(self.editable_dir) + self.editable_dir.mkdir(parents=True) + + with tarfile.open(tarball_path, "r:gz") as tar: + tar.extractall(self.editable_dir, filter="data") + + def list_versions(self, pool: str | None = None) -> list[str]: + """Return list of archived version identifiers. + + Args: + pool: If given, filter to "exploit" or "explore". Otherwise return all. + + Returns: + List of version strings in insertion order. + """ + entries = self._read_index() + if pool is not None: + entries = [e for e in entries if e["pool"] == pool] + return [e["version"] for e in entries] + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _enforce_cap(self) -> None: + """Remove oldest entries when per-pool or total cap is exceeded. + + For the explore pool, never remove entries below explore_protected_min. 
+ """ + entries = self._read_index() + + # Enforce per-pool caps first + exploit_entries = [e for e in entries if e["pool"] == "exploit"] + explore_entries = [e for e in entries if e["pool"] == "explore"] + + # Trim exploit (oldest first, no protected minimum) + while len(exploit_entries) > self.exploit_cap: + removed = exploit_entries.pop(0) + self._remove_version(removed["version"]) + + # Trim explore (respect protected minimum) + while len(explore_entries) > self.explore_cap and len(explore_entries) > self.explore_protected_min: + removed = explore_entries.pop(0) + self._remove_version(removed["version"]) + + # Enforce global cap on combined list (oldest first, explore protected) + combined = exploit_entries + explore_entries + combined.sort(key=lambda e: entries.index(e) if e in entries else 0) + + while len(combined) > self.cap: + # Find oldest non-protected entry to remove + explore_count = sum(1 for e in combined if e["pool"] == "explore") + removed = None + for e in combined: + if e["pool"] == "exploit": + removed = e + break + if e["pool"] == "explore" and explore_count > self.explore_protected_min: + removed = e + break + if removed is None: + break + combined.remove(removed) + self._remove_version(removed["version"]) + + # Rewrite index with surviving entries + surviving = {e["version"] for e in combined} + all_entries = self._read_index() + kept = [e for e in all_entries if e["version"] in surviving] + self._write_index(kept) + + def _remove_version(self, version: str) -> None: + """Delete tarball for a version (index rewrite is done by caller).""" + tarball = self.archive_dir / f"{version}.tar.gz" + if tarball.exists(): + tarball.unlink() + + def _hash_dir(self, path: Path) -> str: + """Compute deterministic SHA-256 hash over all files in a directory. + + Args: + path: Directory to hash. + + Returns: + 64-character hex digest. 
+ """ + hasher = hashlib.sha256() + # Sort for determinism + for file_path in sorted(Path(path).rglob("*")): + if file_path.is_file(): + rel = file_path.relative_to(path) + hasher.update(str(rel).encode()) + hasher.update(file_path.read_bytes()) + return hasher.hexdigest() + + def _read_index(self) -> list[dict]: + """Read and parse index.jsonl into a list of entry dicts.""" + index_path = self.archive_dir / "index.jsonl" + if not index_path.exists(): + return [] + lines = index_path.read_text().splitlines() + return [json.loads(line) for line in lines if line.strip()] + + def _write_index(self, entries: list[dict]) -> None: + """Overwrite index.jsonl with given entries.""" + index_path = self.archive_dir / "index.jsonl" + with index_path.open("w") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") diff --git a/tests/test_archive_manager.py b/tests/test_archive_manager.py new file mode 100644 index 0000000..68b82ce --- /dev/null +++ b/tests/test_archive_manager.py @@ -0,0 +1,160 @@ +"""Tests for ArchiveManager: evolutionary archive with exploit/explore pools.""" +import json +import tempfile +from pathlib import Path + +import pytest + +from archive_manager import ArchiveManager + + +@pytest.fixture +def tmp_dirs(): + """Provide temporary archive_dir and editable_dir.""" + with tempfile.TemporaryDirectory() as base: + archive_dir = Path(base) / "archive" + editable_dir = Path(base) / "editable" + archive_dir.mkdir() + editable_dir.mkdir() + yield archive_dir, editable_dir + + +def _seed_editable(editable_dir: Path, content: dict[str, str]) -> None: + """Write {filename: text} into editable_dir.""" + for name, text in content.items(): + (editable_dir / name).write_text(text) + + +def _read_index(archive_dir: Path) -> list[dict]: + index_path = archive_dir / "index.jsonl" + if not index_path.exists(): + return [] + return [json.loads(line) for line in index_path.read_text().splitlines() if line.strip()] + + +# 
--------------------------------------------------------------------------- +# test_snapshot_and_restore +# --------------------------------------------------------------------------- + +def test_snapshot_and_restore(tmp_dirs): + """snapshot v1 → modify editable → restore v1 → original content back.""" + archive_dir, editable_dir = tmp_dirs + _seed_editable(editable_dir, {"a.txt": "hello", "b.txt": "world"}) + + mgr = ArchiveManager(archive_dir, editable_dir) + tree_hash = mgr.snapshot("v1", scores={"smoke": 1.0}) + + # Mutate editable_dir + (editable_dir / "a.txt").write_text("mutated") + (editable_dir / "c.txt").write_text("new file") + + mgr.restore("v1") + + assert (editable_dir / "a.txt").read_text() == "hello" + assert (editable_dir / "b.txt").read_text() == "world" + assert not (editable_dir / "c.txt").exists() + + # Index should have one entry + entries = _read_index(archive_dir) + assert len(entries) == 1 + assert entries[0]["version"] == "v1" + assert entries[0]["tree_hash"] == tree_hash + + +# --------------------------------------------------------------------------- +# test_archive_cap +# --------------------------------------------------------------------------- + +def test_archive_cap(tmp_dirs): + """Create 5 snapshots with cap=3 → only 3 remain.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir, cap=3, exploit_cap=3, explore_cap=3) + + for i in range(1, 6): + _seed_editable(editable_dir, {"f.txt": f"v{i}"}) + mgr.snapshot(f"v{i}", scores={"smoke": 1.0}, pool="exploit") + + versions = mgr.list_versions() + assert len(versions) == 3 + + +# --------------------------------------------------------------------------- +# test_exploit_explore_pools +# --------------------------------------------------------------------------- + +def test_exploit_explore_pools(tmp_dirs): + """Snapshot to different pools → list_versions filters correctly.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, 
editable_dir, cap=50, exploit_cap=30, explore_cap=20) + + _seed_editable(editable_dir, {"x.txt": "exploit1"}) + mgr.snapshot("e1", scores={"smoke": 1.0}, pool="exploit") + + _seed_editable(editable_dir, {"x.txt": "exploit2"}) + mgr.snapshot("e2", scores={"smoke": 0.9}, pool="exploit") + + _seed_editable(editable_dir, {"x.txt": "explore1"}) + mgr.snapshot("x1", scores={"smoke": 0.5}, pool="explore") + + all_versions = mgr.list_versions() + exploit_versions = mgr.list_versions(pool="exploit") + explore_versions = mgr.list_versions(pool="explore") + + assert set(all_versions) == {"e1", "e2", "x1"} + assert set(exploit_versions) == {"e1", "e2"} + assert set(explore_versions) == {"x1"} + + +# --------------------------------------------------------------------------- +# test_explore_protected_min +# --------------------------------------------------------------------------- + +def test_explore_protected_min(tmp_dirs): + """With explore_protected_min=2 and explore_cap=2, adding 3 explore entries + must keep at least 2 explore entries after cap enforcement.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager( + archive_dir, + editable_dir, + cap=50, + exploit_cap=30, + explore_cap=2, + explore_protected_min=2, + ) + + for i in range(1, 4): + _seed_editable(editable_dir, {"g.txt": f"explore{i}"}) + mgr.snapshot(f"xp{i}", scores={"smoke": 0.5}, pool="explore") + + explore_versions = mgr.list_versions(pool="explore") + assert len(explore_versions) >= 2 + + +# --------------------------------------------------------------------------- +# test_hash_dir_deterministic +# --------------------------------------------------------------------------- + +def test_hash_dir_deterministic(tmp_dirs): + """Same directory content → same hash regardless of call order.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir) + + _seed_editable(editable_dir, {"a.txt": "foo", "b.txt": "bar"}) + h1 = mgr._hash_dir(editable_dir) + h2 = 
mgr._hash_dir(editable_dir) + assert h1 == h2 + assert len(h1) == 64 # sha256 hex digest + + +def test_hash_dir_changes_on_content_change(tmp_dirs): + """Different content → different hash.""" + archive_dir, editable_dir = tmp_dirs + mgr = ArchiveManager(archive_dir, editable_dir) + + _seed_editable(editable_dir, {"a.txt": "foo"}) + h1 = mgr._hash_dir(editable_dir) + + (editable_dir / "a.txt").write_text("bar") + h2 = mgr._hash_dir(editable_dir) + + assert h1 != h2 From 83956b4f7079e69630d1b484c8ce08ab27d762e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:12:33 +0900 Subject: [PATCH 10/12] feat: per-suite promotion gates and migration rules --- promotion.py | 191 ++++++++++++++++++++++++++++++++++++++++ tests/test_promotion.py | 143 ++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 promotion.py create mode 100644 tests/test_promotion.py diff --git a/promotion.py b/promotion.py new file mode 100644 index 0000000..235dc35 --- /dev/null +++ b/promotion.py @@ -0,0 +1,191 @@ +"""Per-suite promotion gates and exploit/explore migration rules.""" +from dataclasses import dataclass, field + + +# --------------------------------------------------------------------------- +# Gate definitions +# --------------------------------------------------------------------------- + +@dataclass +class SuiteGate: + """Promotion threshold for a single evaluation suite. + + Args: + min_absolute: Minimum absolute score required to pass. + max_regression_pct: Maximum allowed regression from best score (0.0 = 0%). + """ + + min_absolute: float + max_regression_pct: float + + +@dataclass +class PromotionGates: + """Collection of per-suite gates used to decide promotion. + + Args: + gates: Mapping of suite name → SuiteGate. 
+ """ + + gates: dict[str, SuiteGate] + + @classmethod + def defaults(cls) -> "PromotionGates": + """Return default gates: smoke(1.0, 0%), spreadsheet(0.80, 5%), terminal(0.40, 5%).""" + return cls( + gates={ + "smoke": SuiteGate(min_absolute=1.0, max_regression_pct=0.0), + "spreadsheet": SuiteGate(min_absolute=0.80, max_regression_pct=0.05), + "terminal": SuiteGate(min_absolute=0.40, max_regression_pct=0.05), + } + ) + + +# --------------------------------------------------------------------------- +# Promotion result +# --------------------------------------------------------------------------- + +@dataclass +class PromotionResult: + """Result of a promotion gate check. + + Args: + promoted: True if all gates passed. + reason: Human-readable explanation (empty string when promoted=True). + """ + + promoted: bool + reason: str = "" + + +def check_promotion( + scores: dict[str, float], + gates: PromotionGates, + best_scores: dict[str, float], +) -> PromotionResult: + """Check all promotion gates and return result. + + A candidate passes only if every defined suite gate is satisfied: + - score >= gate.min_absolute + - score >= best_score * (1 - gate.max_regression_pct) + + Args: + scores: Current candidate scores per suite. + gates: PromotionGates defining thresholds. + best_scores: Historical best scores per suite for regression check. + + Returns: + PromotionResult with promoted=True if all gates pass. 
+ """ + for suite, gate in gates.gates.items(): + current = scores.get(suite, 0.0) + best = best_scores.get(suite, 0.0) + + if current < gate.min_absolute: + return PromotionResult( + promoted=False, + reason=f"{suite} score {current:.3f} below minimum {gate.min_absolute:.3f}", + ) + + regression_floor = best * (1.0 - gate.max_regression_pct) + if current < regression_floor: + return PromotionResult( + promoted=False, + reason=( + f"{suite} score {current:.3f} regresses more than " + f"{gate.max_regression_pct*100:.0f}% from best {best:.3f}" + ), + ) + + return PromotionResult(promoted=True) + + +# --------------------------------------------------------------------------- +# Migration config & plan +# --------------------------------------------------------------------------- + +@dataclass +class MigrationConfig: + """Configuration for periodic pool migration. + + Args: + interval_generations: How many generations between migration runs. + explore_to_exploit_top_k: Top-k explore entries (by score) promoted each interval. + exploit_to_explore_bottom_k: Bottom-k exploit entries demoted each interval. + cross_domain_fast_track_threshold: cross_domain_delta threshold for immediate promotion. + """ + + interval_generations: int = 3 + explore_to_exploit_top_k: int = 3 + exploit_to_explore_bottom_k: int = 5 + cross_domain_fast_track_threshold: float = 0.10 + + +@dataclass +class MigrationPlan: + """Versions to move between pools. + + Args: + promote_to_exploit: Explore versions to move into exploit pool. + demote_to_explore: Exploit versions to move into explore pool. 
+ """ + + promote_to_exploit: list[str] = field(default_factory=list) + demote_to_explore: list[str] = field(default_factory=list) + + +def _mean_score(scores: dict[str, float]) -> float: + """Compute mean over all suite scores.""" + if not scores: + return 0.0 + return sum(scores.values()) / len(scores) + + +def compute_migration( + archive_index: list[dict], + generation: int, + config: MigrationConfig, +) -> MigrationPlan: + """Determine which versions should move between exploit and explore pools. + + Rules: + - Cross-domain fast track: any explore entry with cross_domain_delta above + threshold is promoted immediately regardless of generation. + - Interval migration: at multiples of interval_generations, top-k explore + entries (by mean score) are promoted and bottom-k exploit entries are demoted. + + Args: + archive_index: List of entry dicts from index.jsonl. + generation: Current generation number. + config: MigrationConfig thresholds. + + Returns: + MigrationPlan listing versions to promote and demote. 
+ """ + plan = MigrationPlan() + + explore_entries = [e for e in archive_index if e.get("pool") == "explore"] + exploit_entries = [e for e in archive_index if e.get("pool") == "exploit"] + + # Cross-domain fast track (always active, independent of interval) + fast_tracked = set() + for entry in explore_entries: + delta = entry.get("cross_domain_delta", 0.0) + if delta > config.cross_domain_fast_track_threshold: + plan.promote_to_exploit.append(entry["version"]) + fast_tracked.add(entry["version"]) + + # Interval-based migration + if generation > 0 and generation % config.interval_generations == 0: + # Promote top-k explore by mean score (skip already fast-tracked) + eligible_explore = [e for e in explore_entries if e["version"] not in fast_tracked] + eligible_explore.sort(key=lambda e: _mean_score(e.get("scores", {})), reverse=True) + for entry in eligible_explore[: config.explore_to_exploit_top_k]: + plan.promote_to_exploit.append(entry["version"]) + + # Demote bottom-k exploit by mean score + exploit_entries.sort(key=lambda e: _mean_score(e.get("scores", {}))) + for entry in exploit_entries[: config.exploit_to_explore_bottom_k]: + plan.demote_to_explore.append(entry["version"]) + + return plan diff --git a/tests/test_promotion.py b/tests/test_promotion.py new file mode 100644 index 0000000..52ed4da --- /dev/null +++ b/tests/test_promotion.py @@ -0,0 +1,143 @@ +"""Tests for promotion gates and explore→exploit migration rules.""" +from promotion import ( + MigrationConfig, + MigrationPlan, + PromotionGates, + PromotionResult, + check_promotion, + compute_migration, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_index(entries: list[dict]) -> list[dict]: + """Build a minimal archive index from shorthand dicts.""" + defaults = { + "pool": "explore", + "scores": {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40}, + 
"tree_hash": "abc", + "generation": 0, + "stale_generations": 0, + "cross_domain_delta": 0.0, + } + return [{**defaults, **e} for e in entries] + + +# --------------------------------------------------------------------------- +# test_smoke_gate_pass +# --------------------------------------------------------------------------- + +def test_smoke_gate_pass(): + """All gates met → promoted=True.""" + scores = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + best = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert isinstance(result, PromotionResult) + assert result.promoted is True + + +# --------------------------------------------------------------------------- +# test_smoke_gate_fail +# --------------------------------------------------------------------------- + +def test_smoke_gate_fail(): + """smoke < 1.0 → promoted=False.""" + scores = {"smoke": 0.9, "spreadsheet": 0.85, "terminal": 0.45} + best = {"smoke": 1.0, "spreadsheet": 0.85, "terminal": 0.45} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert result.promoted is False + assert "smoke" in result.reason.lower() + + +# --------------------------------------------------------------------------- +# test_regression_gate_fail +# --------------------------------------------------------------------------- + +def test_regression_gate_fail(): + """spreadsheet regresses > 5% from best → promoted=False.""" + best = {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40} + # 0.80 * (1 - 0.05) = 0.76; go below that + scores = {"smoke": 1.0, "spreadsheet": 0.70, "terminal": 0.40} + gates = PromotionGates.defaults() + + result = check_promotion(scores, gates, best_scores=best) + + assert result.promoted is False + assert "spreadsheet" in result.reason.lower() + + +# --------------------------------------------------------------------------- +# 
test_migration_at_interval +# --------------------------------------------------------------------------- + +def test_migration_at_interval(): + """At generation 3 (== interval), top-k explore entries get promoted to exploit.""" + config = MigrationConfig( + interval_generations=3, + explore_to_exploit_top_k=2, + exploit_to_explore_bottom_k=5, + cross_domain_fast_track_threshold=0.10, + ) + index = _make_index([ + {"version": "xp1", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.90, "terminal": 0.50}}, + {"version": "xp2", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.70, "terminal": 0.30}}, + {"version": "xp3", "pool": "explore", "scores": {"smoke": 1.0, "spreadsheet": 0.80, "terminal": 0.40}}, + {"version": "ex1", "pool": "exploit", "scores": {"smoke": 1.0, "spreadsheet": 0.95, "terminal": 0.60}}, + ]) + + plan = compute_migration(index, generation=3, config=config) + + assert isinstance(plan, MigrationPlan) + # Top-2 explore by score should be promoted + assert len(plan.promote_to_exploit) == 2 + assert "xp1" in plan.promote_to_exploit # highest spreadsheet + + +# --------------------------------------------------------------------------- +# test_migration_not_at_interval +# --------------------------------------------------------------------------- + +def test_migration_not_at_interval(): + """At generation 2 (not an interval), no migration happens.""" + config = MigrationConfig(interval_generations=3) + index = _make_index([ + {"version": "xp1", "pool": "explore"}, + {"version": "ex1", "pool": "exploit"}, + ]) + + plan = compute_migration(index, generation=2, config=config) + + assert plan.promote_to_exploit == [] + assert plan.demote_to_explore == [] + + +# --------------------------------------------------------------------------- +# test_cross_domain_fast_track +# --------------------------------------------------------------------------- + +def test_cross_domain_fast_track(): + """Entry with cross_domain_delta > 10% gets 
immediate shadow priority (fast-tracked).""" + config = MigrationConfig( + interval_generations=3, + cross_domain_fast_track_threshold=0.10, + ) + # Generation 1 — not an interval, but cross_domain_delta > threshold + index = _make_index([ + {"version": "xp_novel", "pool": "explore", "cross_domain_delta": 0.15}, + {"version": "xp_norm", "pool": "explore", "cross_domain_delta": 0.02}, + {"version": "ex1", "pool": "exploit"}, + ]) + + plan = compute_migration(index, generation=1, config=config) + + assert "xp_novel" in plan.promote_to_exploit + assert "xp_norm" not in plan.promote_to_exploit From 9df339fe4ca0eb61865eda43d6b60ff8015b36c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?= Date: Sat, 4 Apr 2026 11:13:50 +0900 Subject: [PATCH 11/12] feat: split program.md into fixed rules and editable strategy --- program-fixed.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ program-strategy.md | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 program-fixed.md create mode 100644 program-strategy.md diff --git a/program-fixed.md b/program-fixed.md new file mode 100644 index 0000000..54ae92e --- /dev/null +++ b/program-fixed.md @@ -0,0 +1,45 @@ +# Program Fixed Rules (DO NOT MODIFY) + +These rules are immutable. The meta-agent must not alter this file. + +## Safety Rules + +1. Never modify files in the `fixed/` directory (`adapter.py`, `contracts.py`). +2. Never use `importlib`, `ctypes`, `sys.modules`, or `__import__()` directly. +3. Always log results via `ExperimentLogger` to `experiments.jsonl`. +4. Never skip the preflight policy gate before evaluation. +5. Respect per-suite promotion gates — do not bypass thresholds. +6. Do not modify evaluator, promotion, or scoreboard logic. + +## Experiment Protocol + +1. Read the latest experiments.jsonl and recent task-level results. +2. Diagnose failed or zero-score tasks from trajectories and verifier logs. +3. Group failures by root cause. +4. 
Choose one general harness improvement. +5. Run preflight check on the diff. +6. Execute smoke test (Level 1) — all 5 must pass. +7. If smoke passes, execute domain suite (Level 2). +8. Log results to experiments.jsonl with full metadata. +9. If improved: snapshot to archive, commit. +10. If regressed: restore previous best, log root cause. + +## Model Constraint + +Do not change `MODEL` from `gpt-5` without explicit human approval. + +## Overfitting Rule + +Do not add task-specific hacks. Use this test: +"If this exact task disappeared, would this still be a worthwhile improvement?" + +## Keep / Discard Rules + +- If `passed` improved → keep. +- If `passed` stayed same and harness is simpler → keep. +- Otherwise → discard. Record root cause in experiments.jsonl. + +## Termination + +Continue iterating until the human explicitly stops. +Never pause to ask whether to continue. diff --git a/program-strategy.md b/program-strategy.md new file mode 100644 index 0000000..4bbbc47 --- /dev/null +++ b/program-strategy.md @@ -0,0 +1,33 @@ +# Program Strategy + +> **Stage 1: READ-ONLY.** This file becomes editable after Stage 2 gate passes +> (per-suite non-regression 10 consecutive runs + human approval). + +## Current Strategy + +- Focus on tool addition over prompt tuning (high-leverage). +- Prefer specialized tools (e.g., openpyxl for Excel) over raw shell. +- Test one change at a time for clear attribution. +- Prioritize tasks with highest failure rate. + +## Tool Design Guidelines + +- Each tool should do one thing well. +- Include input validation and clear error messages. +- Return structured output, not raw stdout. +- Match model's name-based priors (models pattern-match tool names). + +## Agent Architecture Strategy + +- Start with single agent + specialized tools. +- Consider `agent.as_tool()` for verification sub-agent when: + - Many tasks fail silently (agent thinks it succeeded but output is wrong). 
+  - Verification logic is complex enough to benefit from a separate agent.
+
+## Simplicity Criterion
+
+All else being equal, simpler is better:
+- Fewer components
+- Less brittle logic
+- Cleaner tool interfaces
+- Less code for the same outcome

From 08c68d6f4eb9ad145ae57a91a2746cd8d80a489f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=9E=AC=EC=98=81?=
Date: Sat, 4 Apr 2026 11:14:18 +0900
Subject: [PATCH 12/12] docs: add CONTRIBUTING.md with adapter, task, and
 evaluation guidelines

---
 CONTRIBUTING.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..5456150
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,78 @@
+# Contributing to AutoAgent
+
+## Adding a New Backend Adapter
+
+1. Copy `agent.py` as `agent-<backend>.py`.
+2. Implement the `AgentWorkflow` protocol from `contracts.py`:
+   - `create_tools(environment) -> list`
+   - `create_agent(environment) -> agent`
+   - `async run_task(environment, instruction) -> tuple[result, duration_ms]` (NOTE(review): `contracts.py` declares `run_task -> dict` — confirm which signature is current)
+3. Import `AutoAgent` from `adapter.py` — do not modify it.
+4. Run smoke tests: `bash scripts/run_smoke.sh`
+5. Log results using `ExperimentLogger` from `experiment_log.py`.
+
+See `agent-claude.py` for a Claude SDK reference implementation.
+
+## Adding New Tasks
+
+1. Create `tasks/<domain>/<task-name>/` with:
+   - `task.toml` — task metadata (name, description, timeout_sec)
+   - `instruction.md` — what the agent should do
+   - `tests/test.sh` — verification script (exit 0 = pass, exit 1 = fail)
+2. Follow existing patterns in `tasks/smoke/`.
+3. Test scripts should use `set -euo pipefail`.
+
+## Evaluation Levels
+
+| Level | Location | Purpose |
+|-------|----------|---------|
+| Smoke | `tasks/smoke/` | Basic sanity (< 1 min) |
+| Domain | `tasks/<domain>/` | Domain-specific suite |
+| Cross-domain | External benchmarks | Generalization test |
+
+## Reporting Benchmark Results
+
+Use `experiments.jsonl` format via `ExperimentLogger`:
+
+```python
+from experiment_log import ExperimentLogger, ExperimentEntry
+
+logger = ExperimentLogger("experiments.jsonl")
+logger.append(ExperimentEntry(
+    version="v1",
+    scores={"smoke": 1.0, "spreadsheet": 0.85},
+    trace_id="trace-001",
+    trajectory_uri="jobs/v1/trajectory.json",
+    # ... other fields
+))
+```
+
+## Project Structure
+
+```
+autoagent/
+├── agent.py              # Editable harness (meta-agent modifies this)
+├── adapter.py            # Fixed Harbor adapter (read-only)
+├── contracts.py          # Interface protocols (read-only)
+├── preflight.py          # Mutation validation gate
+├── experiment_log.py     # ATIF sidecar experiment logger
+├── archive_manager.py    # Evolutionary archive (exploit/explore)
+├── promotion.py          # Promotion gates and migration rules
+├── program.md            # Original meta-agent directive
+├── program-fixed.md      # Immutable safety rules
+├── program-strategy.md   # Editable strategy (Stage 2)
+├── Dockerfile.base       # Container base image
+├── scripts/
+│   ├── run_eval.sh       # Docker eval runner (read-only + network isolation)
+│   └── run_smoke.sh      # Smoke test runner
+├── tasks/smoke/          # Level 1 smoke tests (5 tasks)
+└── tests/                # Unit tests
+```
+
+## Safety Boundary
+
+Files in the **fixed boundary** must not be modified by the meta-agent:
+- `adapter.py`, `contracts.py` — enforced via Docker read-only mount
+- Evaluator logic, promotion gates — enforced via preflight policy gate
+
+The `preflight.py` gate automatically rejects diffs that touch fixed files or use forbidden imports.