Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Contributing to AutoAgent

## Adding a New Backend Adapter

1. Copy `agent.py` as `agent-<backend>.py`.
2. Implement the `AgentWorkflow` protocol from `contracts.py`:
- `create_tools(environment) -> list`
- `create_agent(environment) -> agent`
- `async run_task(environment, instruction) -> tuple[result, duration_ms]`
3. Import `AutoAgent` from `adapter.py` — do not modify it.
4. Run smoke tests: `bash scripts/run_smoke.sh`
5. Log results using `ExperimentLogger` from `experiment_log.py`.

See `agent-claude.py` for a Claude SDK reference implementation.

## Adding New Tasks

1. Create `tasks/<category>/<name>/` with:
- `task.toml` — task metadata (name, description, timeout_sec)
- `instruction.md` — what the agent should do
   - `tests/test.sh` — verification script (exit 0 = pass, any non-zero exit = fail)
2. Follow existing patterns in `tasks/smoke/`.
3. Test scripts should use `set -euo pipefail`.

## Evaluation Levels

| Level | Location | Purpose |
|-------|----------|---------|
| Smoke | `tasks/smoke/` | Basic sanity (< 1 min) |
| Domain | `tasks/<domain>/` | Domain-specific suite |
| Cross-domain | External benchmarks | Generalization test |

## Reporting Benchmark Results

Use `experiments.jsonl` format via `ExperimentLogger`:

```python
from experiment_log import ExperimentLogger, ExperimentEntry

logger = ExperimentLogger("experiments.jsonl")
logger.append(ExperimentEntry(
version="v1",
scores={"smoke": 1.0, "spreadsheet": 0.85},
trace_id="trace-001",
trajectory_uri="jobs/v1/trajectory.json",
# ... other fields
))
```

## Project Structure

```
autoagent/
├── agent.py # Editable harness (meta-agent modifies this)
├── adapter.py # Fixed Harbor adapter (read-only)
├── contracts.py # Interface protocols (read-only)
├── preflight.py # Mutation validation gate
├── experiment_log.py # ATIF sidecar experiment logger
├── archive_manager.py # Evolutionary archive (exploit/explore)
├── promotion.py # Promotion gates and migration rules
├── program.md # Original meta-agent directive
├── program-fixed.md # Immutable safety rules
├── program-strategy.md # Editable strategy (Stage 2)
├── Dockerfile.base # Container base image
├── scripts/
│ ├── run_eval.sh # Docker eval runner (read-only + network isolation)
│ └── run_smoke.sh # Smoke test runner
├── tasks/smoke/ # Level 1 smoke tests (5 tasks)
└── tests/ # Unit tests
```

## Safety Boundary

Files in the **fixed boundary** must not be modified by the meta-agent:
- `adapter.py`, `contracts.py` — enforced via Docker read-only mount
- Evaluator logic, promotion gates — enforced via preflight policy gate

The `preflight.py` gate automatically rejects diffs that touch fixed files or use forbidden imports.
9 changes: 3 additions & 6 deletions Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python deps — only what the agent needs (harbor excluded via .dockerignore)
COPY contracts.py adapter.py /app/fixed/
COPY agent.py /app/editable/
COPY pyproject.toml ./
RUN uv pip install --system .

# Agent code
COPY agent.py ./

ENV PYTHONPATH=/app/fixed:/app/editable:/app
RUN ln -sf $(which python3) /usr/local/bin/python
RUN mkdir -p /logs /app/output
197 changes: 197 additions & 0 deletions adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""Fixed Harbor adapter — DO NOT MODIFY. Read-only in production."""

from __future__ import annotations

import json
from datetime import datetime, timezone

from agents.items import (
ItemHelpers,
MessageOutputItem,
ReasoningItem,
ToolCallItem,
ToolCallOutputItem,
)
from agents.usage import Usage
from harbor.agents.base import BaseAgent
from harbor.environments.base import BaseEnvironment
from harbor.models.agent.context import AgentContext

def _load_harness():
    """Import the editable harness lazily, avoiding a circular import with agent.py.

    Returns:
        Tuple of (model identifier, async run_task callable) from ``agent``.
    """
    import agent

    return agent.MODEL, agent.run_task


def _parse_tool_arguments(raw_call: object) -> object:
    """Decode a raw tool call's arguments, JSON-parsing them when given as a string."""
    arguments = raw_call.arguments
    if isinstance(arguments, str):
        return json.loads(arguments)
    return arguments


def _tool_call_payload(raw_call: object) -> list[dict]:
    """Build the ATIF ``tool_calls`` list entry for a single raw tool call."""
    return [
        {
            "tool_call_id": raw_call.call_id,
            "function_name": raw_call.name,
            "arguments": _parse_tool_arguments(raw_call),
        }
    ]


def to_atif(result: object, model: str, duration_ms: int = 0) -> dict:
    """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.

    Args:
        result: An SDK ``RunResult`` exposing ``new_items`` and ``raw_responses``.
        model: Model name recorded on every agent-authored step.
        duration_ms: Wall-clock task duration, stored in ``final_metrics.extra``.

    Returns:
        A dict conforming to the ATIF-v1.6 trajectory schema.
    """
    steps: list[dict] = []
    step_id = 0
    # One timestamp for the whole conversion; per-item times are not available here.
    now = datetime.now(timezone.utc).isoformat()

    def _step(source: str, message: str, **extra: object) -> dict:
        """Append-ready step dict with a monotonically increasing step_id."""
        nonlocal step_id
        step_id += 1
        step = {
            "step_id": step_id,
            "timestamp": now,
            "source": source,
            "message": message,
        }
        # Drop None-valued extras so optional fields are simply absent.
        step.update({key: value for key, value in extra.items() if value is not None})
        return step

    # Tool calls whose output item has not been seen yet, keyed by call_id.
    pending_tool_calls: dict[str, object] = {}
    for item in result.new_items:
        if isinstance(item, MessageOutputItem):
            text = ItemHelpers.text_message_output(item)
            if text:
                steps.append(_step("agent", text, model_name=model))
        elif isinstance(item, ReasoningItem):
            summaries = getattr(item.raw_item, "summary", None)
            reasoning = (
                "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None
            )
            if reasoning:
                steps.append(
                    _step(
                        "agent",
                        "(thinking)",
                        reasoning_content=reasoning,
                        model_name=model,
                    )
                )
        elif isinstance(item, ToolCallItem):
            raw = item.raw_item
            # Only track calls we can later pair with their output and describe.
            if hasattr(raw, "call_id") and hasattr(raw, "name"):
                pending_tool_calls[raw.call_id] = raw
        elif isinstance(item, ToolCallOutputItem):
            output_call_id = (
                getattr(item.raw_item, "call_id", None)
                or getattr(item.raw_item, "tool_call_id", None)
            )
            pending_tool_call = pending_tool_calls.pop(output_call_id, None) if output_call_id else None
            if pending_tool_call:
                # Only None means "no output"; falsy values like 0/False are real output.
                output_str = str(item.output) if item.output is not None else ""
                steps.append(
                    _step(
                        "agent",
                        f"Tool: {pending_tool_call.name}",
                        tool_calls=_tool_call_payload(pending_tool_call),
                        observation={
                            "results": [
                                {
                                    "source_call_id": pending_tool_call.call_id,
                                    "content": output_str,
                                }
                            ]
                        },
                    )
                )

    # Calls that never produced an output item are still recorded, sans observation.
    for pending_tool_call in pending_tool_calls.values():
        steps.append(
            _step(
                "agent",
                f"Tool: {pending_tool_call.name}",
                tool_calls=_tool_call_payload(pending_tool_call),
            )
        )

    # ATIF requires at least one step.
    if not steps:
        steps.append(_step("user", "(empty)"))

    # Aggregate token usage across all raw model responses.
    usage = Usage()
    for response in result.raw_responses:
        usage.add(response.usage)

    return {
        "schema_version": "ATIF-v1.6",
        "session_id": getattr(result, "last_response_id", None) or "unknown",
        "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model},
        "steps": steps,
        "final_metrics": {
            "total_prompt_tokens": usage.input_tokens,
            "total_completion_tokens": usage.output_tokens,
            "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0,
            "total_cost_usd": None,
            "total_steps": len(steps),
            "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)},
        },
    }


class AutoAgent(BaseAgent):
    """Harbor agent adapter. Runs the OpenAI agent host-side and proxies shell into the container."""

    # Advertise ATIF trajectory support to the Harbor runner.
    SUPPORTS_ATIF = True

    def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs):
        """Store an optional mapping of extra environment variables for the harness."""
        super().__init__(*args, **kwargs)
        # Copy defensively so later caller mutation cannot leak into the agent.
        self._extra_env = dict(extra_env) if extra_env else {}

    @staticmethod
    def name() -> str:
        """Registry name under which Harbor looks up this adapter."""
        return "autoagent"

    def version(self) -> str | None:
        """Adapter version string reported to Harbor."""
        return "0.1.0"

    async def setup(self, environment: BaseEnvironment) -> None:
        """No container-side setup needed; the agent itself runs host-side."""
        pass

    async def run(
        self, instruction: str, environment: BaseEnvironment, context: AgentContext
    ) -> None:
        """Execute one task end-to-end.

        Uploads the instruction into the container, runs the editable harness,
        writes the ATIF trajectory to the logs directory, and reports token
        totals to the Harbor context.
        """
        # Stage the instruction inside the container so in-container tooling can read it.
        await environment.exec(command="mkdir -p /task")
        instr_file = self.logs_dir / "instruction.md"
        instr_file.write_text(instruction)
        await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md")

        model, run_task_fn = _load_harness()
        result, duration_ms = await run_task_fn(environment, instruction)

        atif = to_atif(result, model=model, duration_ms=duration_ms)
        traj_path = self.logs_dir / "trajectory.json"
        traj_path.write_text(json.dumps(atif, indent=2))

        # Reuse the totals to_atif already aggregated rather than re-summing
        # result.raw_responses a second time.
        final_metrics = atif.get("final_metrics", {})
        input_tokens = final_metrics.get("total_prompt_tokens", 0)
        output_tokens = final_metrics.get("total_completion_tokens", 0)
        try:
            # Best-effort: tolerate AgentContext versions lacking these fields.
            context.n_input_tokens = input_tokens
            context.n_output_tokens = output_tokens
            context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0)
        except Exception:
            pass

        num_turns = final_metrics.get("extra", {}).get("num_turns", 0)
        print(
            f"turns={num_turns} duration_ms={duration_ms} "
            f"input={input_tokens} output={output_tokens}"
        )


__all__ = ["AutoAgent", "to_atif"]
Loading