Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Contributing to AutoAgent

## Adding a New Backend Adapter

1. Copy `agent.py` as `agent-<backend>.py`.
2. Implement the `AgentWorkflow` protocol from `contracts.py`:
- `create_tools(environment) -> list`
- `create_agent(environment) -> agent`
- `async run_task(environment, instruction) -> tuple[result, duration_ms]`
3. Import `AutoAgent` from `adapter.py` — do not modify it.
4. Run smoke tests: `bash scripts/run_smoke.sh`
5. Log results using `ExperimentLogger` from `experiment_log.py`.

See `agent-claude.py` for a Claude SDK reference implementation.

## Adding New Tasks

1. Create `tasks/<category>/<name>/` with:
- `task.toml` — task metadata (name, description, timeout_sec)
- `instruction.md` — what the agent should do
   - `tests/test.sh` — verification script (exit 0 = pass, any non-zero exit = fail)
2. Follow existing patterns in `tasks/smoke/`.
3. Test scripts should use `set -euo pipefail`.

## Evaluation Levels

| Level | Location | Purpose |
|-------|----------|---------|
| Smoke | `tasks/smoke/` | Basic sanity (< 1 min) |
| Domain | `tasks/<domain>/` | Domain-specific suite |
| Cross-domain | External benchmarks | Generalization test |

## Reporting Benchmark Results

Use `experiments.jsonl` format via `ExperimentLogger`:

```python
from experiment_log import ExperimentLogger, ExperimentEntry

logger = ExperimentLogger("experiments.jsonl")
logger.append(ExperimentEntry(
version="v1",
scores={"smoke": 1.0, "spreadsheet": 0.85},
trace_id="trace-001",
trajectory_uri="jobs/v1/trajectory.json",
# ... other fields
))
```

## Project Structure

```
autoagent/
├── agent.py # Editable harness (meta-agent modifies this)
├── adapter.py # Fixed Harbor adapter (read-only)
├── contracts.py # Interface protocols (read-only)
├── preflight.py # Mutation validation gate
├── experiment_log.py # ATIF sidecar experiment logger
├── archive_manager.py # Evolutionary archive (exploit/explore)
├── promotion.py # Promotion gates and migration rules
├── program.md # Original meta-agent directive
├── program-fixed.md # Immutable safety rules
├── program-strategy.md # Editable strategy (Stage 2)
├── Dockerfile.base # Container base image
├── scripts/
│ ├── run_eval.sh # Docker eval runner (read-only + network isolation)
│ └── run_smoke.sh # Smoke test runner
├── tasks/smoke/ # Level 1 smoke tests (5 tasks)
└── tests/ # Unit tests
```

## Safety Boundary

Files in the **fixed boundary** must not be modified by the meta-agent:
- `adapter.py`, `contracts.py` — enforced via Docker read-only mount
- Evaluator logic, promotion gates — enforced via preflight policy gate

The `preflight.py` gate automatically rejects diffs that touch fixed files or use forbidden imports.
9 changes: 3 additions & 6 deletions Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python deps — only what the agent needs (harbor excluded via .dockerignore)
COPY contracts.py adapter.py /app/fixed/
COPY agent.py /app/editable/
COPY pyproject.toml ./
RUN uv pip install --system .

# Agent code
COPY agent.py ./

ENV PYTHONPATH=/app/fixed:/app/editable:/app
RUN ln -sf $(which python3) /usr/local/bin/python
RUN mkdir -p /logs /app/output
197 changes: 197 additions & 0 deletions adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""Fixed Harbor adapter — DO NOT MODIFY. Read-only in production."""

from __future__ import annotations

import json
from datetime import datetime, timezone

from agents.items import (
ItemHelpers,
MessageOutputItem,
ReasoningItem,
ToolCallItem,
ToolCallOutputItem,
)
from agents.usage import Usage
from harbor.agents.base import BaseAgent
from harbor.environments.base import BaseEnvironment
from harbor.models.agent.context import AgentContext

def _load_harness():
    """Import the editable harness lazily, avoiding a circular import with agent.py.

    Returns:
        Tuple of (model identifier, async run_task callable) from ``agent``.
    """
    import agent

    return agent.MODEL, agent.run_task


def _parse_tool_arguments(raw_call: object) -> object:
    """Decode a raw tool call's arguments, JSON-parsing them when given as a string."""
    arguments = raw_call.arguments
    if isinstance(arguments, str):
        return json.loads(arguments)
    return arguments


def _tool_call_payload(raw_call: object) -> list[dict]:
    """Build the ATIF ``tool_calls`` list entry for a single raw tool call."""
    return [
        {
            "tool_call_id": raw_call.call_id,
            "function_name": raw_call.name,
            "arguments": _parse_tool_arguments(raw_call),
        }
    ]


def to_atif(result: object, model: str, duration_ms: int = 0) -> dict:
    """Convert OpenAI Agents SDK RunResult to an ATIF trajectory dict.

    Args:
        result: An SDK ``RunResult`` exposing ``new_items`` and ``raw_responses``.
        model: Model name recorded on every agent-authored step.
        duration_ms: Wall-clock task duration, stored in ``final_metrics.extra``.

    Returns:
        A dict conforming to the ATIF-v1.6 trajectory schema.
    """
    steps: list[dict] = []
    step_id = 0
    # One timestamp for the whole conversion; per-item times are not available here.
    now = datetime.now(timezone.utc).isoformat()

    def _step(source: str, message: str, **extra: object) -> dict:
        """Append-ready step dict with a monotonically increasing step_id."""
        nonlocal step_id
        step_id += 1
        step = {
            "step_id": step_id,
            "timestamp": now,
            "source": source,
            "message": message,
        }
        # Drop None-valued extras so optional fields are simply absent.
        step.update({key: value for key, value in extra.items() if value is not None})
        return step

    # Tool calls whose output item has not been seen yet, keyed by call_id.
    pending_tool_calls: dict[str, object] = {}
    for item in result.new_items:
        if isinstance(item, MessageOutputItem):
            text = ItemHelpers.text_message_output(item)
            if text:
                steps.append(_step("agent", text, model_name=model))
        elif isinstance(item, ReasoningItem):
            summaries = getattr(item.raw_item, "summary", None)
            reasoning = (
                "\n".join(s.text for s in summaries if hasattr(s, "text")) if summaries else None
            )
            if reasoning:
                steps.append(
                    _step(
                        "agent",
                        "(thinking)",
                        reasoning_content=reasoning,
                        model_name=model,
                    )
                )
        elif isinstance(item, ToolCallItem):
            raw = item.raw_item
            # Only track calls we can later pair with their output and describe.
            if hasattr(raw, "call_id") and hasattr(raw, "name"):
                pending_tool_calls[raw.call_id] = raw
        elif isinstance(item, ToolCallOutputItem):
            output_call_id = (
                getattr(item.raw_item, "call_id", None)
                or getattr(item.raw_item, "tool_call_id", None)
            )
            pending_tool_call = pending_tool_calls.pop(output_call_id, None) if output_call_id else None
            if pending_tool_call:
                # Only None means "no output"; falsy values like 0/False are real output.
                output_str = str(item.output) if item.output is not None else ""
                steps.append(
                    _step(
                        "agent",
                        f"Tool: {pending_tool_call.name}",
                        tool_calls=_tool_call_payload(pending_tool_call),
                        observation={
                            "results": [
                                {
                                    "source_call_id": pending_tool_call.call_id,
                                    "content": output_str,
                                }
                            ]
                        },
                    )
                )

    # Calls that never produced an output item are still recorded, sans observation.
    for pending_tool_call in pending_tool_calls.values():
        steps.append(
            _step(
                "agent",
                f"Tool: {pending_tool_call.name}",
                tool_calls=_tool_call_payload(pending_tool_call),
            )
        )

    # ATIF requires at least one step.
    if not steps:
        steps.append(_step("user", "(empty)"))

    # Aggregate token usage across all raw model responses.
    usage = Usage()
    for response in result.raw_responses:
        usage.add(response.usage)

    return {
        "schema_version": "ATIF-v1.6",
        "session_id": getattr(result, "last_response_id", None) or "unknown",
        "agent": {"name": "autoagent", "version": "0.1.0", "model_name": model},
        "steps": steps,
        "final_metrics": {
            "total_prompt_tokens": usage.input_tokens,
            "total_completion_tokens": usage.output_tokens,
            "total_cached_tokens": getattr(usage.input_tokens_details, "cached_tokens", 0) or 0,
            "total_cost_usd": None,
            "total_steps": len(steps),
            "extra": {"duration_ms": duration_ms, "num_turns": len(result.raw_responses)},
        },
    }


class AutoAgent(BaseAgent):
    """Harbor agent adapter. Runs the OpenAI agent host-side and proxies shell into the container."""

    # Advertise ATIF trajectory support to the Harbor runner.
    SUPPORTS_ATIF = True

    def __init__(self, *args, extra_env: dict[str, str] | None = None, **kwargs):
        """Store an optional mapping of extra environment variables for the harness."""
        super().__init__(*args, **kwargs)
        # Copy defensively so later caller mutation cannot leak into the agent.
        self._extra_env = dict(extra_env) if extra_env else {}

    @staticmethod
    def name() -> str:
        """Registry name under which Harbor looks up this adapter."""
        return "autoagent"

    def version(self) -> str | None:
        """Adapter version string reported to Harbor."""
        return "0.1.0"

    async def setup(self, environment: BaseEnvironment) -> None:
        """No container-side setup needed; the agent itself runs host-side."""
        pass

    async def run(
        self, instruction: str, environment: BaseEnvironment, context: AgentContext
    ) -> None:
        """Execute one task end-to-end.

        Uploads the instruction into the container, runs the editable harness,
        writes the ATIF trajectory to the logs directory, and reports token
        totals to the Harbor context.
        """
        # Stage the instruction inside the container so in-container tooling can read it.
        await environment.exec(command="mkdir -p /task")
        instr_file = self.logs_dir / "instruction.md"
        instr_file.write_text(instruction)
        await environment.upload_file(source_path=instr_file, target_path="/task/instruction.md")

        model, run_task_fn = _load_harness()
        result, duration_ms = await run_task_fn(environment, instruction)

        atif = to_atif(result, model=model, duration_ms=duration_ms)
        traj_path = self.logs_dir / "trajectory.json"
        traj_path.write_text(json.dumps(atif, indent=2))

        # Reuse the totals to_atif already aggregated rather than re-summing
        # result.raw_responses a second time.
        final_metrics = atif.get("final_metrics", {})
        input_tokens = final_metrics.get("total_prompt_tokens", 0)
        output_tokens = final_metrics.get("total_completion_tokens", 0)
        try:
            # Best-effort: tolerate AgentContext versions lacking these fields.
            context.n_input_tokens = input_tokens
            context.n_output_tokens = output_tokens
            context.n_cache_tokens = final_metrics.get("total_cached_tokens", 0)
        except Exception:
            pass

        num_turns = final_metrics.get("extra", {}).get("num_turns", 0)
        print(
            f"turns={num_turns} duration_ms={duration_ms} "
            f"input={input_tokens} output={output_tokens}"
        )


__all__ = ["AutoAgent", "to_atif"]
Loading