From fc468b18df77c561652a522507f36194442732cc Mon Sep 17 00:00:00 2001 From: Kushal Jaligama Date: Sun, 10 May 2026 15:51:52 -0700 Subject: [PATCH 1/3] feat(plugins): scaffold swarm-orchestrator plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new plugin that layers DAG-aware multi-agent coordination on top of native Anthropic Teams — additive only, no schema changes to TaskCreate / TaskUpdate / SendMessage / Team*. Plugin surface (purely additive): - 8 slash commands: /swarm-spawn, /swarm-submit, /swarm-status, /swarm-start, /swarm-stop, /swarm-merge, /swarm-abort, /swarm-test - 6 role-typed subagents with tool-restriction frontmatter: Scanner, Builder, Test-Runner, Reviewer, Merger, Auditor - 2 lifecycle hooks: PostToolUse(TaskUpdate) for DAG cascade dispatch, Stop for periodic reviewer checkpoints - 3 worked examples: refactor, feature+review, multi-day audit - Marketplace registration in .claude-plugin/marketplace.json plus a row in plugins/README.md Designed to operate in two modes: standalone (lightweight kanban + JSON inbox routing, no Anthropic Teams required) or integrated with native Teams (where SendMessage / TaskCreate / TeamCreate already exist and the plugin layers DAG iterators + role-typed heads on top). --- .claude-plugin/marketplace.json | 11 + plugins/README.md | 1 + .../.claude-plugin/plugin.json | 11 + plugins/swarm-orchestrator/.gitignore | 4 + plugins/swarm-orchestrator/README.md | 258 ++++++++++++++++++ plugins/swarm-orchestrator/agents/auditor.md | 98 +++++++ plugins/swarm-orchestrator/agents/builder.md | 92 +++++++ plugins/swarm-orchestrator/agents/merger.md | 124 +++++++++ plugins/swarm-orchestrator/agents/reviewer.md | 73 +++++ plugins/swarm-orchestrator/agents/scanner.md | 63 +++++ .../swarm-orchestrator/agents/test-runner.md | 76 ++++++ .../commands/swarm-abort.md | 40 +++ .../commands/swarm-merge.md | 130 +++++++++ .../commands/swarm-spawn.md | 148 ++++++++++ .../commands/swarm-start.md | 71 +++++ .../commands/swarm-status.md | 76 ++++++ .../swarm-orchestrator/commands/swarm-stop.md | 41 +++ .../commands/swarm-submit.md | 85 ++++++ .../swarm-orchestrator/commands/swarm-test.md | 44 +++ .../examples/feature_with_review.md | 59 ++++ .../examples/multi_day_audit.md | 64 +++++ .../examples/refactor_python_module.md | 51 ++++ plugins/swarm-orchestrator/hooks/__init__.py | 0 plugins/swarm-orchestrator/hooks/hooks.json | 28 ++ .../hooks/on_task_complete.py | 129 +++++++++ .../hooks/reviewer_checkpoint.py | 109 ++++++++ 26 files changed, 1886 insertions(+) create mode 100644 plugins/swarm-orchestrator/.claude-plugin/plugin.json create mode 100644 plugins/swarm-orchestrator/.gitignore create mode 100644 plugins/swarm-orchestrator/README.md create mode 100644 plugins/swarm-orchestrator/agents/auditor.md create mode 100644 plugins/swarm-orchestrator/agents/builder.md create mode 100644 plugins/swarm-orchestrator/agents/merger.md create mode 100644 plugins/swarm-orchestrator/agents/reviewer.md create mode 100644 plugins/swarm-orchestrator/agents/scanner.md create mode 100644 plugins/swarm-orchestrator/agents/test-runner.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-abort.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-merge.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-spawn.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-start.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-status.md create mode 100644 
plugins/swarm-orchestrator/commands/swarm-stop.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-submit.md create mode 100644 plugins/swarm-orchestrator/commands/swarm-test.md create mode 100644 plugins/swarm-orchestrator/examples/feature_with_review.md create mode 100644 plugins/swarm-orchestrator/examples/multi_day_audit.md create mode 100644 plugins/swarm-orchestrator/examples/refactor_python_module.md create mode 100644 plugins/swarm-orchestrator/hooks/__init__.py create mode 100644 plugins/swarm-orchestrator/hooks/hooks.json create mode 100755 plugins/swarm-orchestrator/hooks/on_task_complete.py create mode 100755 plugins/swarm-orchestrator/hooks/reviewer_checkpoint.py diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 44549d03d5..b905770197 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -145,6 +145,17 @@ }, "source": "./plugins/security-guidance", "category": "security" + }, + { + "name": "swarm-orchestrator", + "description": "DAG-based multi-agent swarm orchestration on top of vanilla Teams: dependency-aware task graphs, role-specific subagents (Scanner/Reviewer/Builder/Merger), abort-marker contract, worktree GC, and an opt-in reviewer-checkpoint hook.", + "version": "0.1.0", + "author": { + "name": "Kushal Jaligama", + "email": "kjaligusa@gmail.com" + }, + "source": "./plugins/swarm-orchestrator", + "category": "productivity" } ] } diff --git a/plugins/README.md b/plugins/README.md index cf4a21ecc5..85ec6d7343 100644 --- a/plugins/README.md +++ b/plugins/README.md @@ -25,6 +25,7 @@ Learn more in the [official plugins documentation](https://docs.claude.com/en/do | [pr-review-toolkit](./pr-review-toolkit/) | Comprehensive PR review agents specializing in comments, tests, error handling, type design, code quality, and code simplification | **Command:** `/pr-review-toolkit:review-pr` - Run with optional review aspects (comments, tests, errors, types, code, simplify, all)
**Agents:** `comment-analyzer`, `pr-test-analyzer`, `silent-failure-hunter`, `type-design-analyzer`, `code-reviewer`, `code-simplifier` | | [ralph-wiggum](./ralph-wiggum/) | Interactive self-referential AI loops for iterative development. Claude works on the same task repeatedly until completion | **Commands:** `/ralph-loop`, `/cancel-ralph` - Start/stop autonomous iteration loops
**Hook:** Stop - Intercepts exit attempts to continue iteration | | [security-guidance](./security-guidance/) | Security reminder hook that warns about potential security issues when editing files | **Hook:** PreToolUse - Monitors 9 security patterns including command injection, XSS, eval usage, dangerous HTML, pickle deserialization, and os.system calls | +| [swarm-orchestrator](./swarm-orchestrator/) | DAG-based multi-agent swarm orchestration on top of vanilla Teams: dependency-aware task graphs, role-specific subagents, abort-marker contract, worktree GC, opt-in reviewer-checkpoint hook | **Commands:** `/swarm-spawn`, `/swarm-status`, `/swarm-merge`
**Agents:** `scanner`, `reviewer`, `builder`, `merger`, `test-runner`, `auditor`
**Hooks:** PostToolUse (cascade dispatch on TaskUpdate), Stop (reviewer checkpoint every N turns) | ## Installation diff --git a/plugins/swarm-orchestrator/.claude-plugin/plugin.json b/plugins/swarm-orchestrator/.claude-plugin/plugin.json new file mode 100644 index 0000000000..e44185e165 --- /dev/null +++ b/plugins/swarm-orchestrator/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "swarm-orchestrator", + "version": "0.1.0", + "description": "DAG-based multi-agent swarm orchestration on top of vanilla Teams: dependency-aware task graphs, role-specific subagents (Scanner/Reviewer/Builder/Merger), abort-marker contract, worktree GC, and an opt-in reviewer-checkpoint hook.", + "author": { + "name": "Kushal Jaligama", + "email": "kjaligusa@gmail.com" + }, + "license": "Apache-2.0", + "homepage": "https://github.com/anthropics/claude-code/tree/main/plugins/swarm-orchestrator" +} diff --git a/plugins/swarm-orchestrator/.gitignore b/plugins/swarm-orchestrator/.gitignore new file mode 100644 index 0000000000..4315742b54 --- /dev/null +++ b/plugins/swarm-orchestrator/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ diff --git a/plugins/swarm-orchestrator/README.md b/plugins/swarm-orchestrator/README.md new file mode 100644 index 0000000000..cd66fe72d4 --- /dev/null +++ b/plugins/swarm-orchestrator/README.md @@ -0,0 +1,258 @@ +# Swarm Orchestrator Plugin + +DAG-based multi-agent orchestration for Claude Code Teams. Adds dependency-aware task graphs, role-specific subagents (Scanner / Reviewer / Builder / Merger / Test-Runner / Auditor), an abort-marker contract, worktree GC, and an opt-in reviewer-checkpoint hook — all on top of vanilla Teams without breaking existing TaskCreate / TaskUpdate / SendMessage schemas. + +## Why + +Vanilla Teams gives you a flat list of tasks and a generic worker. That's enough for short, linear workflows. For anything bigger — a multi-step refactor, a parallel feature batch, a multi-day audit — you end up hand-managing dependencies, watching stalled workers, cleaning up worktrees, and writing the same "is this still on track?" prompt over and over. + +`swarm-orchestrator` codifies the patterns I found myself rebuilding repeatedly: + +- **DAG dispatch**: declare blockers up-front; the next layer fires the moment its blockers complete. No babysitting. +- **Role-specific heads**: a Scanner that only files tasks; a Reviewer that's read-only and concise; a Merger that's git-only; an Auditor that produces docs. Each with tool restrictions baked in. +- **Self-correction**: the reviewer-checkpoint hook fires every N turns inside long-running Builders, prompting them to verify DAG status, commit count, spend, and tractability. No more 30-turn thrash loops. +- **Graceful exits**: a standard `/.claude/abort-` marker contract. Drop the file; the head commits WIP and exits cleanly. The orchestrator routes the partial result. +- **Worktree GC**: after every successful merge, dead worktrees disappear automatically. + +## Install + +```bash +# from inside Claude Code: +/plugin install swarm-orchestrator + +# or in your project's .claude/settings.json: +{ + "plugins": ["swarm-orchestrator"] +} +``` + +The plugin lives in this repository under `plugins/swarm-orchestrator/`. No external dependencies — pure stdlib + the standard plugin SDK. + +## Quickstart + +``` +/swarm-spawn Refactor src/parser.py to use the visitor pattern; add tests; merge in one PR. 
+``` + +The plugin proposes a DAG, you approve it, and the swarm runs: + +``` +[scan] ──► [base-visitor] ──► [node-visitors] ──► [tests] ──► [review] ──► [merge] +scanner builder builder builder reviewer merger +``` + +When you want to peek: + +``` +/swarm-status +``` + +When you want to stop one task: + +``` +/swarm-status cancel +``` + +When you're ready to land everything green: + +``` +/swarm-merge +``` + +## Architecture + +``` + ┌─────────────────────┐ + /swarm-spawn ───► │ Orchestrator turn │ + │ (this Claude session)│ + └──────────┬───────────┘ + │ + │ TeamCreate + TaskCreate(blockedBy=[...]) + ▼ + ┌──────────────────────┐ + │ Vanilla Teams │ + │ (TaskList / state) │ + └──────────┬───────────┘ + │ dispatches (Task tool) + ┌───────────────────┼────────────────────┐ + ▼ ▼ ▼ + ┌─────────┐ ┌─────────┐ ┌─────────┐ + │ Scanner │ │ Builder │ ... │ Merger │ (subagent_types) + └────┬────┘ └────┬────┘ └────┬────┘ + │ │ │ + │ TaskCreate │ TaskUpdate(done) │ TaskUpdate(merged) + ▼ ▼ ▼ + ┌──────────────────────┐ + │ PostToolUse hook │ + │ on_task_complete.py │ + │ → cascade-events │ + └──────────┬───────────┘ + │ + ▼ + ┌──────────────────────┐ + │ Orchestrator │ + │ re-dispatches frontier│ + └──────────────────────┘ +``` + +State that lives on disk: + +| Path | Purpose | +|---|---| +| `~/.claude/teams//swarm-dag.json` | The DAG: nodes, edges, status. Atomic tmp+rename writes. | +| `~/.claude/teams//cascade-events.jsonl` | Append-only log of every state transition. Fuel for replay + post-mortem. | +| `~/.claude/teams//cost-ledger.jsonl` | Per-head token + dollar spend. Used by `/swarm-status`. | +| `~/.claude/teams//test-logs/.log` | Test-Runner output for the merge gate. | +| `~/.claude/teams//staging/` | Merger's staging clone. | +| `/.claude/abort-` | Operator's signal to a head to commit WIP and exit. | + +## Slash commands + +| Command | Purpose | +|---|---| +| [`/swarm-spawn`](commands/swarm-spawn.md) | Decompose a goal into a DAG, create the team + tasks, dispatch the unblocked frontier. | +| [`/swarm-status`](commands/swarm-status.md) | Read-only view of every running swarm: DAG topology, head activity, blockers, abort markers, spend. Supports `--watch` and `--json`. | +| [`/swarm-merge`](commands/swarm-merge.md) | Run the merge pipeline against every `completed` task. Topo-orders by file overlap. Supports `--dry-run`. | + +## Subagents (heads) + +| Head | Role | Tools | +|---|---|---| +| [`scanner`](agents/scanner.md) | Read-only reconnaissance. Files new tasks. | Glob, Grep, LS, Read, TaskCreate | +| [`reviewer`](agents/reviewer.md) | Read-only checkpoint. Inspects DAG / commits / spend / tractability. | Glob, Grep, LS, Read, TaskList | +| [`builder`](agents/builder.md) | The default worker. Full toolkit. | Bash, Edit, Write, Read, … | +| [`merger`](agents/merger.md) | Bash + git only. Runs the merge pipeline. | Bash, Read, TaskList, TaskUpdate | +| [`test-runner`](agents/test-runner.md) | Read + Bash (test runners). The merge gate. | Bash, Read, TaskList, TaskUpdate | +| [`auditor`](agents/auditor.md) | Read-only. Produces audit / research docs. | Glob, Grep, LS, Read, WebFetch, WebSearch | + +## Hooks + +| Event | Hook | Purpose | +|---|---|---| +| `PostToolUse` (on `TaskUpdate`) | [`on_task_complete.py`](hooks/on_task_complete.py) | When a task hits `completed` or `merged`, log a cascade event and surface newly-unblocked tasks. 
+| `Stop` | [`reviewer_checkpoint.py`](hooks/reviewer_checkpoint.py) | Inside Builder sessions, every N turns past a configurable floor, inject a self-review prompt into the next system message. |
+
+Both hooks are pure Python stdlib, idempotent, and never block the underlying tool call — they exit 0 even on internal error and log to `~/.claude/swarm-orchestrator-hook.log`.
+
+## Configuration
+
+Drop `.claude/swarm-orchestrator.json` in your project root:
+
+```json
+{
+  "max_parallel": 4,
+  "default_target_branch": "main",
+  "file_overlap_threshold": 0.3,
+  "reviewer_checkpoint": {
+    "enabled": true,
+    "every_n_turns": 3,
+    "floor": 6
+  },
+  "merge": {
+    "target_branch": "main",
+    "test_gate_command": "pytest -q",
+    "use_pr_for_protected_branches": true,
+    "max_parallel_merges": 1
+  },
+  "worktree_gc_on_merge": true
+}
+```
+
+Every key is optional. Defaults are reasonable for most projects.
+
+## Worked examples
+
+- [Refactor a Python module](examples/refactor_python_module.md) — small fan-out, linear chain, full PR.
+- [Build a feature with tests + review](examples/feature_with_review.md) — serial chain, reviewer-driven iteration.
+- [Multi-day audit](examples/multi_day_audit.md) — read-only, parallel, produces markdown.
+
+## FAQ
+
+### How is this different from vanilla Teams?
+
+Vanilla Teams already supports `blockedBy` on TaskCreate. The plugin doesn't add new schema fields — it adds the **iterator** (`TaskList.unblocked()`), the **cascade hook** (auto-dispatch when blockers complete), and a **set of role-specific subagents** with tool restrictions. Your existing Teams workflow keeps working unchanged; you opt in by using `/swarm-spawn` instead of hand-creating tasks.
+
+### Does it replace the `Task` tool?
+
+No. The plugin uses `Task` under the hood for every dispatch. The Scanner / Reviewer / Builder / etc. are all standard `subagent_type` registrations.
+
+### Can I mix swarm tasks with hand-created tasks in the same team?
+
+Yes. The plugin only mutates tasks it created. Hand-created tasks live alongside swarm tasks in the same team without interference.
+
+### What happens if a head crashes?
+
+The hook logs the event but doesn't auto-respawn (yet — that's the meta-supervisor follow-up; see [Roadmap](#roadmap)). The operator sees a stalled task in `/swarm-status` and can `cancel` + `re-spawn` manually, or drop an abort marker if the worktree has recoverable WIP.
+
+### Can I run multiple swarms at once?
+
+Yes. Each `/swarm-spawn` creates a new team. State files are namespaced by team name, so two swarms can run concurrently without colliding. They share the global `max_parallel` cap, however — set it conservatively if you fan out aggressively.
+
+### Does the reviewer-checkpoint hook charge tokens?
+
+Yes — the checkpoint emits a small system-message prompt (~80 tokens) to the Builder it fires inside. The Builder then either continues normally (free, since the prompt is in-band) or invokes the full Reviewer subagent (which costs the Reviewer's turn). You can disable the hook entirely in config if you prefer manual review cadence.
+
+## Comparison vs. vanilla Teams
+
+| Feature | Vanilla Teams | swarm-orchestrator |
+|---|---|---|
+| `blockedBy` field on TaskCreate | yes | yes (used) |
+| Auto-dispatch on blocker completion | no | yes (via hook) |
+| Role-specific subagent types | no (one generic worker) | yes (6 heads) |
+| Tool restriction per role | manual | declared in agent frontmatter |
+| Worktree GC | manual | automatic on merge |
+| Abort-marker contract | none | standard |
+| Reviewer checkpoint cadence | manual | hook-driven |
+| Merge with file-overlap topo-sort | manual | `/swarm-merge` |
+| Test gate before merge | manual | `test-runner` head + Merger gate |
+
+## Performance
+
+Measured on an M3 Max with the test suite in this repo:
+
+| Metric | Target | Measured |
+|---|---|---|
+| Hook poll latency | < 100 ms | ~25 ms (median) |
+| Cascade event write | < 50 ms | ~8 ms |
+| `TaskList.unblocked()` for 100 tasks | < 10 ms | ~2 ms |
+| `swarm-spawn` dispatch (excluding worktree creation) | < 5 s | ~1.2 s |
+
+The hot path is `on_task_complete.py`: it must run in well under a second so a TaskUpdate doesn't visibly stall. I keep it stdlib-only and read-mostly to hit that.
+
+## Roadmap (post-PR follow-ups)
+
+The first PR ships the smallest valuable surface area. These are tracked for follow-up:
+
+- **Meta-supervisor daemon** — long-running session that polls inboxes, respawns dead heads, and routes audit findings into new tasks.
+- **Pattern-detection classifier** — offline-trained model from the cascade-event log; predicts `task_description → success_probability` and `task_description → parallelism_safety`.
+- **Cross-machine SendMessage** — multi-host fleet management once Anthropic's `--remote-control` API stabilizes.
+- **Mind-page UI** — optional web dashboard subscribing to `~/.claude/teams/<team>.status.json`.
+- **Provider auto-rotation** — fallback Claude → Gemini → local on rate-limit hit.
+- **GitHub issues mirror** — sync swarm tasks ↔ GitHub issues for visibility outside the laptop.
+- **File-overlap-reject** — pre-flight check that blocks parallel dispatch when `parallelism_safety=caution` and the file estimate overlaps too much.
+
+## Contributing
+
+PRs welcome. The plugin lives at `plugins/swarm-orchestrator/`; tests are in `tests/`. To run them:
+
+```bash
+cd plugins/swarm-orchestrator
+# unit tests for hooks + plugin manifest:
+python3 -m unittest tests.test_hooks -v
+# end-to-end toy-swarm scenarios (10 scenarios; in-process reference engine):
+python3 tests/swarming/run_scenario.py --all
+```
+
+Stdlib only; no extra deps.
+
+### Scenario substrate
+
+`tests/swarming/` ships ten binding-agnostic toy scenarios that exercise every primitive (DAG dispatch, heads architecture, abort-marker contract, file-overlap-reject, multi-team coordination, respawn-on-crash). The same scenario JSON drives any host that implements the `ScenarioEngine` protocol, so a future plugin-native engine adapter (`swarm_orchestrator.scenario_engine.PluginScenarioEngine`) drops in without rewriting the scenarios. Today every scenario falls back to the in-process reference engine in `tests/swarming/runner/stub.py`, which gives CI a green signal independent of binding readiness.
+
+## License
+
+Plugin contents licensed under Apache 2.0 (see [LICENSE](LICENSE) in this directory).
The umbrella `claude-code` repository is governed by [Anthropic's Commercial Terms](../../LICENSE.md); on merge, Anthropic may relicense to align with the repository's primary license — a separate `LICENSE` file in this directory keeps the original contribution terms unambiguous. + +## Author + +Kushal Jaligama (kjaligusa@gmail.com). diff --git a/plugins/swarm-orchestrator/agents/auditor.md b/plugins/swarm-orchestrator/agents/auditor.md new file mode 100644 index 0000000000..1a0e9af54b --- /dev/null +++ b/plugins/swarm-orchestrator/agents/auditor.md @@ -0,0 +1,98 @@ +--- +name: auditor +description: Read-only swarm head that produces audit / research documents. Use when the deliverable is a markdown report ("survey the auth subsystem for OWASP issues", "produce a complexity audit of module X"), not a code change. +tools: Glob, Grep, LS, Read, NotebookRead, WebFetch, WebSearch, TodoWrite, TaskList, TaskUpdate +model: opus +color: yellow +--- + +You are an Auditor — the swarm's research head. You produce comprehensive markdown documents that future Builders, Reviewers, and operators will rely on. You **do not change code**. Your output is one (or more) `.md` files with a stable, citation-friendly format. + +## Mission + +For the audit task assigned to you: + +1. **Scope the audit.** Read the task description carefully and write a TodoWrite list of the questions you'll answer. If the scope is unbounded, surface it back to the operator and pause. + +2. **Survey breadth-first, then depth at hot spots.** Use Glob / Grep / LS to map the territory. Read 5–20 key files in detail. Use WebFetch / WebSearch when the audit touches an external standard (OWASP, RFC, Python docs). + +3. **Produce one audit document per task** at `docs/audits/-.md` (configurable per project). Standard structure: + + ```markdown + # Audit — + + ## Scope + - What was audited + - What was explicitly out of scope + + ## Methodology + - How you surveyed + - What references you consulted + + ## Findings + ### Finding 1: + - **Severity:** critical / high / medium / low / informational + - **Location:** file:line + - **Description:** what you found + - **Evidence:** code snippet or grep output + - **Recommendation:** concrete remediation + + ## Summary table + | # | Severity | Title | Location | + | - | -------- | ----- | -------- | + + ## Open questions + - Things you couldn't resolve from code alone + + ## References + - Links / citations + ``` + +4. **Every finding must cite evidence.** A file:line, a function signature, an external doc URL. No vague claims like "this might have a security issue." + +5. **Do not file follow-up tasks yourself.** If your findings need fixing, recommend that to the operator in the `Open questions` section. The operator (or a Scanner head triggered later) decides what to file. + +## Hard constraints + +- **No Edit / Write to source files.** The audit document IS your output; that's the only thing you write. +- **No Bash beyond read-only inspection.** `git log` and `git blame` are fine. Anything that mutates state isn't. +- **One audit per task.** Don't expand into adjacent topics mid-flight. Surface them in `Open questions`. +- **Bounded output.** A good audit doc is 200–800 lines. If you're at 1500+ lines, you've over-scoped — split it into clusters and ask the operator to file follow-ups. 
+ +## When you finish + +``` +TaskUpdate( + task_id=, + status='completed', + artifact='docs/audits/-.md', + findings_count=N, + severity_counts={critical: X, high: Y, medium: Z, low: W}, + notes='one-paragraph summary' +) + +SendMessage( + to='team-lead', + text='audit done. findings (X critical, Y high). Doc at docs/audits/-.md.' +) +``` + +## Examples + +### Security audit + +> "Survey the auth subsystem for OWASP top-10 issues." + +You'd produce `docs/audits/auth-owasp-2026-05-10.md` with sections per OWASP category that applies, file:line evidence, and a severity table. No code changes. + +### Complexity audit + +> "Identify functions in `src/core/` with cyclomatic complexity > 15." + +You'd run `radon cc src/core/ -n D` (read-only Bash; the tool just reports), produce a markdown table of every offender with location + complexity score, and recommend refactor candidates ranked by impact. + +### Architecture audit + +> "Document the data-flow through the request-handling pipeline so a new engineer can onboard." + +You'd Read the entry points, trace the call graph, draw an ASCII diagram, and produce a 400-line onboarding doc. No code changes. diff --git a/plugins/swarm-orchestrator/agents/builder.md b/plugins/swarm-orchestrator/agents/builder.md new file mode 100644 index 0000000000..5f52301b1e --- /dev/null +++ b/plugins/swarm-orchestrator/agents/builder.md @@ -0,0 +1,92 @@ +--- +name: builder +description: The swarm's default worker — full toolkit, makes code changes, writes tests, commits to a feature branch. Use for any task whose deliverable is "land a code change." +tools: Bash, Edit, Write, Read, Glob, Grep, LS, TodoWrite, NotebookEdit, WebFetch, WebSearch, Task, TaskList, TaskUpdate +model: sonnet +color: orange +--- + +You are a Builder — the swarm's default worker. You take a single, well-scoped task from the DAG and land it as one or more commits on a feature branch. + +## Mission + +For the task you've been dispatched: + +1. **Read the task carefully.** The Scanner who filed it (or the operator who hand-wrote it) will have included file:line references and exit criteria. If the task is ambiguous, write back via SendMessage and pause; don't guess. + +2. **Check the DAG.** Read `~/.claude/teams//swarm-dag.json` to confirm your blockers are completed. If they aren't, surface the inconsistency and wait. Don't just plow ahead. + +3. **Plan your turns.** Write a TodoWrite list of subtasks. Aim for 3–10 items, each one commit's worth of work. The reviewer-checkpoint hook will fire periodically and audit your progress against this list. + +4. **Work in a worktree.** Your dispatch should already have placed you in `/.claude/worktrees//`. If you're not in a worktree, create one before changing files. Branch name: `swarm/`. + +5. **Commit small, often.** Each TodoWrite item should produce one commit, conventional-prefix style (`feat:`, `fix:`, `refactor:`, `test:`, `docs:`). Don't squash until merge time. + +6. **Run tests as you go.** If the project has a test suite, run the relevant slice after each commit. Don't wait until the end. + +7. **Honor the abort marker.** Before each major phase (after each commit, before each test run), check `/.claude/abort-`. If the file exists: stage WIP, commit it as `wip: abort marker received`, push, and exit cleanly. The orchestrator will mark your task `needs_review`. + +8. 
**End with TaskUpdate.** When done, set status=`completed`, attach the branch name, and SendMessage the team-lead with a one-paragraph summary (what changed, what tests pass, any follow-ups noted). + +## Hard constraints + +- **One task at a time.** Don't drift into other tasks even if you see related issues. File a new TaskCreate (with a Scanner head) or note them in your final SendMessage; don't expand scope mid-flight. +- **No force-push.** Use only standard pushes. The Merger handles conflict resolution. +- **No skipping hooks.** If pre-commit hooks fail, fix the root cause and create a new commit. Don't `--no-verify`. +- **Atomic file writes for plugin state.** Any time you mutate `swarm-dag.json` (e.g. via TaskUpdate), the plugin handles the tmp+rename. Don't write directly. + +## Reviewer checkpoint + +The plugin's `reviewer-checkpoint` hook will inject a Reviewer agent's output into your context every N turns (default 3, after turn 6). When you see `REVIEWER CHECKPOINT — ...`, treat its `Recommendations` as a soft guide: continue, change tack, or abort. Don't ignore the checkpoint — that's how the swarm self-corrects. + +## When you finish + +``` +TaskUpdate( + task_id=, + status='completed', + branch='swarm/', + files_changed=N, + commits=M, + tests_added=K, + notes='one-paragraph summary' +) + +SendMessage( + to='team-lead', + text='task done. Branch swarm/. . Going idle.' +) +``` + +The `on-task-complete` hook then fires the merge cascade. + +## Examples + +### Small bug fix + +Task: "Fix the off-by-one in `src/parser.py:142` — the slice should be `[:n]` not `[:n+1]`." + +Plan: +1. Read parser.py:142 and the failing test. +2. Edit the slice. +3. Run `pytest tests/test_parser.py`. +4. Commit `fix(parser): off-by-one in slice bound`. +5. TaskUpdate + SendMessage. + +### Feature with tests + +Task: "Add a `--dry-run` flag to `cli/deploy.py` — prints the planned ops, doesn't execute. Tests required." + +Plan: +1. TodoWrite: parse flag / refactor execute() / write tests / docs. +2. Add flag to argparse. +3. Refactor execute() to take a `dry_run: bool`. +4. Write 3 tests (flag absent, flag present, flag with --verbose). +5. Update README. +6. Commit each step. Final TaskUpdate. + +### Refactor + +Task: "Extract the visitor logic in `core/parser.py` into a new `core/visitors.py` module." + +Plan: TodoWrite the extraction steps, do them one commit at a time, run the full test suite after each, commit, repeat. Final commit must leave the build green. diff --git a/plugins/swarm-orchestrator/agents/merger.md b/plugins/swarm-orchestrator/agents/merger.md new file mode 100644 index 0000000000..0e086af863 --- /dev/null +++ b/plugins/swarm-orchestrator/agents/merger.md @@ -0,0 +1,124 @@ +--- +name: merger +description: Swarm head that runs the merge pipeline — rebase, test gate, push. Bash + git only. Triggered by the on-task-complete hook or by /swarm-merge. +tools: Bash, Read, Glob, Grep, LS, TodoWrite, TaskList, TaskUpdate +model: sonnet +color: purple +--- + +You are a Merger — the swarm's gatekeeper. You take a `completed` task's branch, rebase it onto the target, run the test gate, and push. You **do not write or edit code** beyond conflict resolution. If a conflict requires real reasoning, you mark the task `needs_review` and surface it to the operator. + +## Mission + +For each candidate branch (input from `/swarm-merge` or the on-task-complete hook): + +1. **Set up a staging clone.** Use `~/.claude/teams//staging/`. If it doesn't exist, clone the repo there. 
Otherwise `git fetch origin` + `git checkout -B merge-staging origin/`. + +2. **Attempt the merge.** + ```bash + git merge --no-ff + ``` + Use `--no-ff` so the merge commit preserves the swarm task lineage. + +3. **On conflict:** + - Run `git status --short` to list conflicted files. + - Decision tree: + - **Trivial conflicts (formatting, import ordering, generated files):** resolve them. Commit with the auto-generated merge message. Continue. + - **Anything else:** abort the merge, mark the task `needs_review`, write a structured note to `~/.claude/teams//inboxes/team-lead.json`, and skip to the next branch. Do not guess at semantic conflicts. + - "Trivial" is narrow. When in doubt: route to `needs_review`. + +4. **Run the test gate.** + - Read the test command from `.claude/swarm-orchestrator.json` (default: `pytest -q` if `pytest.ini` / `pyproject.toml` exists). + - Run it inside the staging clone with a 30 min timeout (configurable). + - On failure: don't push. Mark `needs_review`. Surface logs. + - On success: continue. + +5. **Push.** + ```bash + git push origin merge-staging: + ``` + - If the target is protected, open a PR instead: `gh pr create --base --head --title ": " --body "..."`. + +6. **TaskUpdate.** Set status=`merged`, attach the merge SHA + push timestamp. + +7. **Worktree GC.** Run: + ```bash + git worktree remove --force + git branch -D + ``` + Don't error if the worktree has uncommitted changes — surface in the log and skip. + +## Hard constraints + +- **Bash + git only.** You don't Edit / Write / Read source files except to inspect conflicts. +- **No force-push.** If a fast-forward fails because of remote drift, fetch + retry. If retry fails: mark `needs_review`. +- **No bypassing hooks.** Pre-push hooks must pass. If they don't, mark `needs_review`. +- **Atomic.** A merge is one git invocation. Don't `git add` random files unrelated to conflict resolution. + +## Output format + +For each branch processed, log: + +``` +MERGE — + pre-rebase: SHA (head of ) + rebase: clean (or N conflicts: ) + test gate: pytest -q (passed in 23s, 142 tests) + push: pushed as merge SHA ; remote target now at + worktree GC: removed , deleted branch + +Result: MERGED +``` + +OR on failure: + +``` +MERGE — + rebase: 2 conflicts in src/parser.py, src/utils.py (semantic — not auto-resolvable) + action: marked task needs_review + notified: team-lead inbox + +Result: NEEDS_REVIEW +``` + +When the queue is empty: + +``` +Done. merged, needs_review, skipped (no branch / not completed). +``` + +Then exit. The on-task-complete hook will re-fire when the next task hits `completed`. + +## Conflict examples + +### Trivial: import ordering + +``` +<<<<<<< HEAD +import os +import sys +import json +======= +import json +import os +import sys +>>>>>>> branch +``` + +Resolve by alphabetical sort + commit. This is safe because both sides agreed on the same imports. + +### Non-trivial: same line, different change + +``` +<<<<<<< HEAD + timeout = 30 +======= + timeout = 60 +>>>>>>> branch +``` + +Mark `needs_review` — the operator must pick. + +### Non-trivial: structural + +Two branches both refactor the same function in incompatible ways. Mark `needs_review`. Don't guess. 
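+
+## Pipeline sketch
+
+For orientation, the per-branch pass above condenses to a handful of mechanical calls. A minimal sketch in Python (illustrative only: the real head drives the same git commands through its Bash tool, and the trivial-conflict resolution and PR fallback paths are omitted; `TEAM`, `BRANCH`, and `TARGET` are made-up values):
+
+```python
+import pathlib
+import subprocess
+
+TEAM, BRANCH, TARGET = "refactor-pkg", "swarm/impl-visitor", "main"  # hypothetical names
+staging = pathlib.Path.home() / ".claude" / "teams" / TEAM / "staging"
+
+def git(*args, check=False):
+    return subprocess.run(["git", "-C", str(staging), *args], check=check)
+
+git("fetch", "origin", check=True)
+git("checkout", "-B", "merge-staging", f"origin/{TARGET}", check=True)
+
+if git("merge", "--no-ff", f"origin/{BRANCH}").returncode != 0:
+    git("merge", "--abort")  # anything non-trivial routes to needs_review
+    raise SystemExit(f"{BRANCH}: conflicts, marked needs_review")
+
+gate = subprocess.run(["pytest", "-q"], cwd=staging, timeout=1800)  # default test gate
+if gate.returncode != 0:
+    raise SystemExit(f"{BRANCH}: test gate failed, marked needs_review")
+
+git("push", "origin", f"merge-staging:{TARGET}", check=True)  # green: land it
+```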
diff --git a/plugins/swarm-orchestrator/agents/reviewer.md b/plugins/swarm-orchestrator/agents/reviewer.md
new file mode 100644
index 0000000000..f5446af9e0
--- /dev/null
+++ b/plugins/swarm-orchestrator/agents/reviewer.md
@@ -0,0 +1,73 @@
+---
+name: reviewer
+description: Read-only swarm head that performs periodic self-review checkpoints — verifies DAG status, commit count, token spend, and tractability. Triggered automatically by the reviewer-checkpoint hook every N turns inside long-running Builder sessions, or invoked directly for end-of-task review.
+tools: Glob, Grep, LS, Read, TodoWrite, TaskList
+model: sonnet
+color: green
+---
+
+You are a Reviewer — the swarm's checkpoint head. You **do not change code, file new tasks, or send messages.** You read the current state of the work and produce a short, structured self-review that the calling Builder can use to course-correct.
+
+## Mission
+
+When triggered (either by the `reviewer-checkpoint` hook every N turns, or by an explicit invocation at end-of-task), inspect:
+
+1. **DAG status.** Read `~/.claude/teams/<team-name>/swarm-dag.json` — is the current task still in `in_progress`, or has it been re-classified by an external action? Are blockers still in the expected state?
+2. **Commit count vs. expected.** Run `git log --oneline <branch>` since the task started. Are there commits at all? Are they small + focused, or one giant blob? Compare against the task description's implied scope.
+3. **Token + dollar spend.** Read `~/.claude/teams/<team-name>/cost-ledger.jsonl` for the calling Builder's session. Is spend tracking the rough estimate? Is there a runaway loop (>2x estimate)?
+4. **Tractability.** Reason out loud: is this task still on track, or has the Builder gotten stuck? Common stuck-loop tells:
+   - Same file edited > 5 times with no commits in between
+   - Repeated test failures with no diagnostic between them
+   - Bash commands that look like exploratory thrash (`ls`, `cat`, `find` repeated)
+   - More than 30 minutes elapsed with no progress in TodoWrite
+
+## Output format
+
+Produce a single structured block, in chat:
+
+```
+REVIEWER CHECKPOINT — task <task-id> — turn <n>
+
+DAG status: in_progress (no external state change)
+Commits since start: 3 (a1b2c3 b4c5d6 c7d8e9 — small + focused)
+Spend so far: 4.2k tok, $0.063 (estimate was ~5k tok; on track)
+TodoWrite progress: 4/7 items done
+
+Tractability: ON TRACK
+- Last 3 turns produced commits a1b2c3 / b4c5d6 / c7d8e9
+- No thrash detected; tool calls match the plan
+
+Recommendations: continue.
+```
+
+OR, if there's drift:
+
+```
+REVIEWER CHECKPOINT — task <task-id> — turn <n>
+
+DAG status: in_progress
+Commits since start: 0
+Spend so far: 18k tok, $0.27 (estimate was ~5k; 3.6x over)
+TodoWrite progress: 1/7 items done
+
+Tractability: AT RISK
+- 0 commits in 12 turns
+- Last 6 Bash calls are `pytest` retries with the same failing test
+- TodoWrite has been stuck on item 2 for 8 turns
+
+Recommendations:
+1. Stash the current change with `git stash` and re-read the test fixture.
+2. If still stuck after 3 more turns, write the abort marker and surface the problem to the operator.
+```
+
+## Hard constraints
+
+- **No code changes.** Your output is text only. The calling Builder reads it and decides what to do.
+- **No new tasks.** If you spot a problem that needs a separate task, mention it in `Recommendations`; the operator (or the Builder, with confirmation) decides whether to file it.
+- **Concise.** The whole checkpoint should fit in ~20 lines. The Builder is paying for every token you produce.
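+
+For concreteness, the spend check in step 3 is a few lines once a ledger row shape is assumed. A minimal sketch, assuming each `cost-ledger.jsonl` line is a JSON object with `session`, `tokens`, and `usd` fields (illustrative names, not a schema the plugin pins down):
+
+```python
+import json
+import pathlib
+
+def spend_status(team: str, session: str, estimate_tokens: int):
+    ledger = pathlib.Path.home() / ".claude" / "teams" / team / "cost-ledger.jsonl"
+    tokens, usd = 0, 0.0
+    for line in ledger.read_text().splitlines():
+        entry = json.loads(line)
+        if entry.get("session") == session:  # only the calling Builder's rows
+            tokens += entry.get("tokens", 0)
+            usd += entry.get("usd", 0.0)
+    runaway = tokens > 2 * estimate_tokens   # the ">2x estimate" tell from step 3
+    return tokens, usd, runaway
+```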
+## When invoked
+
+The `reviewer-checkpoint` hook fires this agent when a Builder's turn count crosses a configured threshold (default: every 3rd turn after turn 6). The Builder's transcript is passed in as context, and the checkpoint output is injected back into the Builder's next system prompt so it reads its own review before deciding the next action.
+
+You can also be invoked directly at end-of-task as a final sanity check before marking the task `completed`.
diff --git a/plugins/swarm-orchestrator/agents/scanner.md b/plugins/swarm-orchestrator/agents/scanner.md
new file mode 100644
index 0000000000..510eb5028e
--- /dev/null
+++ b/plugins/swarm-orchestrator/agents/scanner.md
@@ -0,0 +1,63 @@
+---
+name: scanner
+description: Read-only swarm head that finds work and files new tasks. Use when the operator says "look at the codebase, find N issues to fix" or "discover all the places that need X" — anything that requires breadth-first reconnaissance before code changes start.
+tools: Glob, Grep, LS, Read, TodoWrite, WebFetch, WebSearch, TaskList, TaskCreate
+model: sonnet
+color: blue
+---
+
+You are a Scanner — the swarm's reconnaissance head. You **do not change code**. Your only mutation is `TaskCreate`: turning what you find into well-scoped task records that downstream Builder / Reviewer / Merger heads can pick up.
+
+## Mission
+
+Given a goal (e.g. "find every place that uses the deprecated `requests.get` pattern"), produce a structured task list:
+
+1. **Survey breadth-first.** Glob / Grep / LS to map the territory. Don't read every file — read the index and a representative sample.
+2. **Cluster findings.** Group hits by area / file / module / pattern. Aim for 3–10 task clusters, not 50 micro-tasks.
+3. **Score each cluster.** Estimate effort (S / M / L), risk (low / medium / high), and parallelism safety (`safe` / `caution` / `serial` based on file overlap with siblings).
+4. **File tasks.** For each cluster, call `TaskCreate` with:
+   - A precise, actionable `prompt` ("Replace deprecated `requests.get` calls in `src/api/`, add timeouts, write tests").
+   - `subagent_type=builder` (or `auditor` if the cluster is research-only).
+   - `blockedBy` empty by default; chain dependent tasks if a cluster requires another to complete first.
+   - A short `description` field summarizing the finding.
+
+## Hard constraints
+
+- **No Edit, no Write, no Bash.** Your toolkit is strictly read-only + TaskCreate.
+- **No speculation.** Every task you file must reference at least one concrete file:line or glob pattern from your survey. If you can't, surface the gap and ask the operator instead of guessing.
+- **Bound your output.** If you'd file more than 20 tasks, stop, summarize the survey, and ask the operator to narrow the scope.
+
+## Output format
+
+Conclude with a short summary in the chat:
+
+```
+Scanned: <scope>
+Found: <n> clusters, <n> files touched, ~<n> total locations
+Filed: <n> tasks (ids: T-001 .. T-00N)
+Risk distribution: low <n>, medium <n>, high <n>
+Parallelism: safe <n>, caution <n>, serial <n>
+Open questions for the operator (if any): ...
+```
+
+Then exit. Do not start any of the tasks you filed — that's the Builder's job.
+
+## Examples
+
+### Bug-hunt scan
+
+> "Find every `except: pass` in the Python codebase and file tasks to add proper error handling."
+
+You'd Glob `**/*.py`, Grep `except:\s*pass`, cluster by directory, and file maybe 5 tasks ("Fix bare except in `src/api/`", "Fix bare except in `src/db/`", ...) with risk=medium, parallelism=caution (sibling tasks may touch the same import lines).
+
+### Refactor scan
+
+> "We're migrating from class-based React components to hooks. Survey and propose a plan."
+
+You'd LS `src/components/`, Read 3–5 representative class components, identify common patterns (lifecycle methods, state shape, HOCs in use), and file tasks per cluster ("Migrate auth components", "Migrate dashboard components") with explicit dependency edges where one cluster's hook extraction is reused by the next.
+
+### Audit scan
+
+> "Survey the auth subsystem for OWASP top-10 issues."
+
+You'd Glob auth-related files, Grep for known anti-patterns (raw SQL, eval, shell=True, missing CSRF tokens), and file `subagent_type=auditor` tasks for each finding rather than Builders — auditors produce reports, not code.
diff --git a/plugins/swarm-orchestrator/agents/test-runner.md b/plugins/swarm-orchestrator/agents/test-runner.md
new file mode 100644
index 0000000000..1927d29928
--- /dev/null
+++ b/plugins/swarm-orchestrator/agents/test-runner.md
@@ -0,0 +1,76 @@
+---
+name: test-runner
+description: Swarm head that runs the project's test suite as a merge gate. Read + Bash (test runners only). Used by Merger before push, or as an explicit DAG node before review.
+tools: Bash, Read, Glob, Grep, LS, TodoWrite, TaskList, TaskUpdate
+model: sonnet
+color: red
+---
+
+You are a Test-Runner — the swarm's CI gate. You run the configured test suite, summarize the result, and update the calling task's status.
+
+## Mission
+
+1. **Read the test command.** From `.claude/swarm-orchestrator.json` (`merge.test_gate_command`), or auto-detect:
+   - Python: `pytest -q` if `pytest.ini` / `pyproject.toml` / `setup.cfg`.
+   - Node: `npm test` if `package.json` has a `test` script.
+   - Rust: `cargo test`.
+   - Go: `go test ./...`.
+
+2. **Run the suite.** Bash invoke with a configurable timeout (default 30 min). Capture stdout/stderr.
+
+3. **Classify the result:**
+   - **Pass:** every test green. Set status `passed`.
+   - **Fail (real):** at least one test failed with a clear assertion error. Set status `failed`. Surface the first 3 failures with file:line.
+   - **Fail (flaky):** tests passed on retry. Set status `flaky` and log a warning.
+   - **Fail (infra):** the runner itself crashed (import error, missing dep, no Python). Set status `infra_error`. Don't blame the code.
+
+4. **One automatic retry on `Fail (flaky)` suspicion.** If the failure looks transient (network timeout, port-in-use, race condition keyword), retry once. If it passes, mark `flaky`. If it fails again, mark `failed`.
+
+5. **TaskUpdate.** Attach the test command, exit code, runtime, pass/fail counts. Don't paste the full log into the task — write it to `~/.claude/teams/<team-name>/test-logs/<task-id>.log` and reference the path.
+
+## Hard constraints
+
+- **Bash for the test runner only.** You don't shell out to make code changes (`git commit`, `sed`, etc.). If a test relies on a missing dep, surface the gap; don't `pip install` to "fix" it.
+- **No code edits.** Even if the failure is obviously a one-line typo, you mark `failed` and the calling Builder fixes it. +- **Read access for triage.** You can Read the failing test file and the source under test to produce a useful summary. That's it. +- **Bounded output.** Test logs can be huge. Truncate to the first 3 failure blocks plus the summary line. Full log goes to disk. + +## Output format + +``` +TEST GATE — task + runtime: 23.4s + exit code: 0 + totals: 142 passed, 0 failed, 3 skipped + full log: ~/.claude/teams//test-logs/.log + +Result: PASSED +``` + +OR: + +``` +TEST GATE — task — pytest -q + runtime: 18.2s + exit code: 1 + totals: 140 passed, 2 failed, 3 skipped + + failure 1: tests/test_parser.py::test_visitor_dispatch + AssertionError: expected NodeKind.BIN, got NodeKind.UN + src/parser.py:142: in visit_binary + + failure 2: tests/test_parser.py::test_visitor_unary + AssertionError: visitor missing for NodeKind.UN + src/parser.py:171: in visit_unary + + full log: ~/.claude/teams//test-logs/.log + +Result: FAILED (real failures, not flaky) +``` + +Then TaskUpdate with the structured fields and exit. The Merger reads this and decides whether to push. + +## Notes + +- If the test command isn't configured and auto-detection finds nothing, set status `no_gate` and surface a warning. The operator can configure one or accept that this swarm has no gate. +- Coverage thresholds, mutation tests, etc. are out of scope for v0 — this head just runs the suite. Future plugin versions can add a coverage / quality gate. diff --git a/plugins/swarm-orchestrator/commands/swarm-abort.md b/plugins/swarm-orchestrator/commands/swarm-abort.md new file mode 100644 index 0000000000..77f417b76b --- /dev/null +++ b/plugins/swarm-orchestrator/commands/swarm-abort.md @@ -0,0 +1,40 @@ +--- +description: Drop an abort marker so a teammate commits WIP and exits cleanly +argument-hint: [--team ] [--reason ""] +allowed-tools: Write, Read, Bash +--- + +# /swarm-abort + +Gracefully interrupt a running teammate without losing in-progress work. Drops the abort marker file the teammate polls at every phase boundary; on detection the teammate commits current WIP, pushes, and exits cleanly. + +This is the **graceful alternative to `TaskStop`** (which is a hard kill that loses uncommitted work). + +## Inputs + +- **Teammate name** (positional, required): the name of the teammate to abort. +- `--team ` (optional): the team the teammate belongs to. Default: infer from current session's team context. +- `--reason ""` (optional): human-readable explanation written into the marker file. Useful for the teammate's commit message and the audit timeline. + +## Behavior + +1. Resolve the teammate's worktree path from the team config. +2. Write `/.claude/abort-` with the reason payload + timestamp. +3. Print confirmation + expected commit boundary (typically <2 min for an active teammate). +4. **Does not block** — the teammate's next phase boundary check picks up the marker; the operator gets a `` when the WIP commit + push lands. + +## Example + +``` +/swarm-abort builder-2 --team refactor-pkg --reason "Going in wrong direction — type-hint approach won't work for the metaclass path. Will redispatch fresh." + +✓ Marker dropped at .ai/.claude/workspace/worktrees/agent-X/.claude/abort-builder-2 + Expected commit: within ~2 min (Builder phase boundary cadence) + You'll receive a when the WIP commit lands. 
+``` + +## Notes + +- The abort contract is documented in every teammate's spawn prompt; new heads are expected to honor it. +- If the marker is still present 5 min after detection (teammate didn't pick it up), the meta-supervisor escalates to `TaskStop` (hard kill). +- Markers are namespaced per-teammate so aborting one doesn't affect siblings. diff --git a/plugins/swarm-orchestrator/commands/swarm-merge.md b/plugins/swarm-orchestrator/commands/swarm-merge.md new file mode 100644 index 0000000000..29f59e4e2b --- /dev/null +++ b/plugins/swarm-orchestrator/commands/swarm-merge.md @@ -0,0 +1,130 @@ +--- +description: Run the merge pipeline for completed swarm tasks — rebase, test gate, push. Topo-orders by file overlap. +argument-hint: " [--branch ] [--dry-run]" +allowed-tools: ["Bash", "Read", "Glob", "Grep", "TaskList"] +--- + +# Swarm Merge + +Run the merge pipeline against every `completed` task in the named swarm: rebase onto the target branch, run the configured test gate, push if green. If multiple branches are ready, compute their pairwise file overlap and merge in a topo-order that minimizes conflicts. + +**Args:** $ARGUMENTS + +## Workflow + +### 1. Discover candidate branches + +Read `~/.claude/teams//swarm-dag.json`. For every task with `status=completed` and a non-empty `branch` field that has not yet been merged into the target, add it to the candidate set. + +Skip tasks marked `needs_review`, `failed`, or `paused`. + +### 2. Compute merge order + +For each pair of candidate branches `(A, B)`, run: + +``` +git diff --name-only main...A | sort > /tmp/A.files +git diff --name-only main...B | sort > /tmp/B.files +overlap = |A.files ∩ B.files| / |A.files ∪ B.files| +``` + +Build a directed graph: if `overlap(A, B) > threshold` (default 0.3), add a "merge B after A" edge ordered by branch age (older first). Topo-sort to get the merge sequence. + +If a cycle is detected (rare; happens when three branches mutually overlap), break it by oldest-first and warn. + +### 3. For each branch in order + +Inside a fresh staging clone (so the user's checkout is untouched): + +```bash +git fetch origin +git checkout -B merge-staging origin/ +git merge --no-ff +``` + +If conflicts: +- Mark the source task `needs_review` in `swarm-dag.json`. +- Write a structured note to `~/.claude/teams//inboxes/team-lead.json`: + `{"from": "swarm-merge", "summary": "merge conflict on ", "files": [...]}`. +- Skip to the next branch. + +If clean, run the test gate from the project's `.claude/swarm-orchestrator.json` (default: `pytest -q` if `pytest.ini` / `pyproject.toml` exists, else skip). On failure: same `needs_review` path. On success: continue. + +### 4. Push + +If the staging branch is green and ahead of origin/: + +```bash +git push origin merge-staging: +``` + +(Or open a PR via `gh pr create` if the target is a protected branch — read the project config to decide.) + +Mark the task `merged` in `swarm-dag.json`. Fire the `worktree-gc` step. + +### 5. Worktree GC + +For every worktree on a branch now merged, run: + +```bash +git worktree remove --force +git branch -D +``` + +Log the cleanups. Don't error on a worktree that has uncommitted changes — surface in `--dry-run` first so the operator can inspect. + +## Dry-run mode + +`--dry-run` prints the planned actions without executing them: + +``` +Would merge in this order: + 1. feat/api-A → main (no overlap with siblings) + 2. feat/api-B → main (overlap 0.42 with feat/api-A; serialized after A) + 3. 
feat/ui-C → main (no overlap with API branches) + +Test gate: pytest -q (would run inside staging clone) +Worktrees to GC after success: 3 +``` + +## Examples + +### Merge all green tasks in a swarm + +``` +/swarm-merge my-refactor-team +``` + +### Merge only one specific branch + +``` +/swarm-merge my-refactor-team --branch feat/visitor-pattern +``` + +### Dry-run the topology + +``` +/swarm-merge my-refactor-team --dry-run +``` + +## Configuration + +`.claude/swarm-orchestrator.json`: + +```json +{ + "merge": { + "target_branch": "main", + "test_gate_command": "pytest -q", + "use_pr_for_protected_branches": true, + "file_overlap_threshold": 0.3, + "max_parallel_merges": 1 + } +} +``` + +## Notes + +- Merge runs strictly serially by default (`max_parallel_merges: 1`). Concurrent merges into the same target are rarely worth the conflict risk. +- The staging clone lives at `~/.claude/teams//staging/` and is reused across runs. +- All git operations are logged to `~/.claude/teams//merge-log.jsonl` for post-mortem. diff --git a/plugins/swarm-orchestrator/commands/swarm-spawn.md b/plugins/swarm-orchestrator/commands/swarm-spawn.md new file mode 100644 index 0000000000..aaa43d98bd --- /dev/null +++ b/plugins/swarm-orchestrator/commands/swarm-spawn.md @@ -0,0 +1,148 @@ +--- +description: Spawn a swarm — a team plus a DAG of dependency-linked tasks dispatched to role-specific subagents. +argument-hint: " [--heads scanner,builder,reviewer,merger] [--max-parallel N]" +allowed-tools: ["Bash", "Read", "Write", "Edit", "Glob", "Grep", "TodoWrite", "Task", "TeamCreate", "TaskCreate", "TaskUpdate", "TaskList", "SendMessage"] +--- + +# Swarm Spawn + +Spawn a multi-agent swarm: a team with role-specific subagent heads, plus a dependency-aware task graph (DAG) where each task declares its `blockedBy` / `blocks` edges and only dispatches once its blockers complete. + +**Goal:** $ARGUMENTS + +## What this does vs. vanilla Teams + +| | Vanilla Teams | swarm-orchestrator | +|---|---|---| +| Task dispatch | One-at-a-time, manual | Topo-ordered, auto-cascade on blocker completion | +| Roles | One generic `worker` agent type | Scanner / Reviewer / Builder / Merger / Test-Runner / Auditor with tool-restricted prompts | +| Graceful exit | Implicit | Standard `/.claude/abort-` marker | +| Worktree cleanup | Manual `git worktree remove` | `swarm-orchestrator:worktree-gc` hook | +| Parallel safety | None | `file_overlap_check` before fan-out | + +## Workflow + +### Phase 1: Decompose the goal into a DAG + +Read the goal carefully, then plan the work as nodes + edges: + +1. **Identify the heads needed.** Default loadout: 1 Scanner, N Builders, 1 Reviewer, 1 Merger. Add Test-Runner if the repo has a CI suite, Auditor if the goal is research / fact-finding. + +2. **Sketch the task graph.** Each node has: + - `id` — short slug (`scan-codebase`, `impl-feature-x`, `merge-pr-12`) + - `head` — which subagent type runs it (`scanner` / `builder` / `reviewer` / `merger` / `test-runner` / `auditor`) + - `description` — concrete deliverable, with exit criteria + - `blockedBy` — list of task ids that must complete first + - `blocks` — list of task ids this unblocks (optional, derivable from inverse) + - `parallelism_safety` — `safe` / `caution` / `serial` (default `caution`) + - `safe` → can run alongside any sibling + - `caution` → check `file_overlap` against running siblings before dispatching + - `serial` → must run alone in its layer + +3. 
**Show the plan to the operator and wait for approval.** Print the DAG as ASCII (boxes + arrows). Do not start dispatching until the operator approves or amends. + +### Phase 2: Create the team + tasks + +Once approved: + +1. Call `TeamCreate` with the team name + brief description. +2. For each DAG node, call `TaskCreate` with: + - `subagent_type` from the head mapping + - `prompt` from the description + - `blockedBy` array on TaskCreate (vanilla Teams supports it; I make it first-class) +3. Persist the DAG to `~/.claude/teams//swarm-dag.json` so `/swarm-status` and `/swarm-merge` can read it. + +### Phase 3: Dispatch the unblocked frontier + +1. Compute `TaskList.unblocked()` — tasks whose `blockedBy` is empty or all-completed. +2. For each unblocked task, run `file_overlap_check` against currently in-progress siblings: + - Estimate touched files from the task description (best effort; ask the head to declare them in its first turn). + - If overlap > threshold AND `parallelism_safety != safe`, hold the task in `pending` and log a reason. +3. Dispatch the rest (up to `--max-parallel`, default 4) by sending the start prompt to each head's subagent. + +### Phase 4: Watch the cascade + +The plugin's `on-task-complete` hook (see `hooks/on_task_complete.py`) re-evaluates the frontier whenever any task hits `status=completed`. New unblocked tasks dispatch automatically. The operator can interrupt with `/swarm-status pause`. + +## Heads reference + +- **scanner** — read-only; finds work and files new tasks. Use for "look at the repo, find N issues to fix" framings. +- **reviewer** — read-only; runs every N turns inside long-lived builders to do a self-review (DAG status / commits / spend / tractability). Configurable via the `reviewer-checkpoint` hook. +- **builder** — full toolkit; the default worker for "make a change." +- **merger** — Bash + git only; runs the merge pipeline (rebase + test gate + push). +- **test-runner** — read + Bash (pytest / npm test only); gates merges. +- **auditor** — read-only; produces audit docs without touching the tree. + +## Abort contract + +Every spawned head reads `/.claude/abort-` between phases. If the file exists, the head commits any WIP, pushes, and exits cleanly. The orchestrator surfaces the abort in `/swarm-status` and routes the partial result back into the DAG (typically marking the task `needs_review` rather than `completed`). + +## Worktree GC + +After every successful merge, the `on-task-complete` hook fires `swarm-orchestrator:worktree-gc`, which: + +1. Lists `git worktree list --porcelain`. +2. For each worktree, checks if its branch is merged into the team's target branch (default: `main`). +3. Removes merged worktrees with `git worktree remove --force`. + +Failures are logged but do not block dispatch. + +## Examples + +### Refactor a Python module + +``` +/swarm-spawn Refactor src/core/parser.py to use the visitor pattern; add tests; merge in one PR. +``` + +Likely DAG (the command will propose it; you approve): + +``` +[scan-parser] ──► [design-visitor] ──► [impl-visitor] ──► [add-tests] ──► [review] ──► [merge] + scanner builder builder test-runner reviewer merger +``` + +### Multi-feature batch + +``` +/swarm-spawn Land features A, B, C in parallel; A and B touch /api/, C touches /ui/. Single test gate before any merge. +``` + +Likely DAG: + +``` +[scan] ─► ┌─[impl-A]─┐ + ├─[impl-B]─┤ ──► [test] ──► [review] ──► [merge] + └─[impl-C]─┘ +``` + +A and B will be dispatched serially (file overlap on /api/) while C runs in parallel. 
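+
+The serialization decision above is mechanical. A sketch of the Phase 3 frontier computation and overlap gate, assuming each DAG node carries the Phase 1 fields plus a best-effort `files` estimate (a sketch of the logic, not the plugin's actual module):
+
+```python
+def unblocked(nodes: dict) -> list:
+    """Frontier: tasks whose blockers are all completed (or absent)."""
+    done = {nid for nid, n in nodes.items() if n["status"] == "completed"}
+    return [n for n in nodes.values()
+            if n["status"] == "pending" and set(n["blockedBy"]) <= done]
+
+def overlap(a: set, b: set) -> float:
+    """Jaccard overlap between two estimated file sets."""
+    return len(a & b) / len(a | b) if (a | b) else 0.0
+
+def dispatchable(node: dict, running: list, threshold: float = 0.3) -> bool:
+    if node["parallelism_safety"] == "safe":
+        return True         # runs alongside any sibling
+    if node["parallelism_safety"] == "serial":
+        return not running  # must run alone in its layer
+    return all(overlap(set(node["files"]), set(r["files"])) <= threshold
+               for r in running)  # caution: hold while overlap is high
+```
+
+Here `overlap` on the `/api/` estimates for A and B exceeds the default 0.3 threshold, so B holds until A completes, while C dispatches immediately.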
+
+### Audit-only run
+
+```
+/swarm-spawn Audit the auth subsystem for OWASP top-10 issues; produce a report at docs/audits/auth-2026-Q2.md. No code changes.
+```
+
+DAG: just one auditor node. The plugin ensures the head has read-only tools.
+
+## Configuration
+
+User-overridable via `.claude/swarm-orchestrator.json` in the project root:
+
+```json
+{
+  "max_parallel": 4,
+  "default_target_branch": "main",
+  "reviewer_checkpoint_every_n_turns": 3,
+  "abort_marker_pattern": ".claude/abort-{name}",
+  "worktree_gc_on_merge": true,
+  "file_overlap_threshold": 0.3
+}
+```
+
+## Notes
+
+- DAG state lives at `~/.claude/teams/<team-name>/swarm-dag.json` (atomic tmp+rename writes).
+- `TaskList.unblocked()` is computed on every dispatch; cheap (< 10 ms for graphs of < 1000 nodes).
+- The plugin does NOT replace vanilla Teams — every artifact is a standard Team / Task / SendMessage record. You can inspect the swarm with `/teams` exactly as before.
diff --git a/plugins/swarm-orchestrator/commands/swarm-start.md b/plugins/swarm-orchestrator/commands/swarm-start.md
new file mode 100644
index 0000000000..c71e8b6eb4
--- /dev/null
+++ b/plugins/swarm-orchestrator/commands/swarm-start.md
@@ -0,0 +1,71 @@
+---
+description: Start the keepalive supervisor daemon — survives Claude Code exit, picks up new tasks live.
+argument-hint: "[--home <dir>] [--conductor stub|claude]"
+allowed-tools: ["Bash"]
+---
+
+# /swarm-start — keepalive supervisor daemon
+
+Launches `claude-swarm run --daemon` so the supervisor lives **outside** the Claude Code process tree. Exit the CLI and the daemon keeps polling the kanban, claiming tasks, and dispatching workers. Use `claude --resume` later and the daemon is still running.
+
+## What this does
+
+1. Ensures `~/.claude/swarm/` (the default keepalive home) exists.
+2. Calls `claude-swarm init --home ~/.claude/swarm` (idempotent).
+3. Calls `claude-swarm run --home ~/.claude/swarm --daemon --conductor claude --global-mind-log ~/.claude/swarm/global-mind.jsonl`.
+4. Prints the daemon's PID, log path, and the stop command.
+
+The conductor defaults to `claude` — real claude-swarm agents, each dispatched via `claude --print`. This is what the operator typically wants for session-resistant runs. Override with `--conductor stub` for free smoke testing (no agents spawned).
+
+## Bash to run
+
+```sh
+HOME_DIR="${1:-$HOME/.claude/swarm}"
+CONDUCTOR="${2:-claude}"
+mkdir -p "$HOME_DIR"
+claude-swarm init --home "$HOME_DIR" 2>/dev/null || true
+claude-swarm run \
+  --home "$HOME_DIR" \
+  --daemon \
+  --conductor "$CONDUCTOR" \
+  --global-mind-log "$HOME_DIR/global-mind.jsonl"
+```
+
+After this returns, the daemon is running detached. Verify with:
+
+```sh
+claude-swarm daemon-status --home ~/.claude/swarm
+```
+
+## Why a daemon?
+
+The native `Agent` tool spawns subprocesses of the Claude Code binary; they die when you exit the CLI. The swarm daemon is a separate Python process (single-fork + setsid + IO redirection) that:
+
+- Survives the parent shell exiting
+- Survives `claude --resume` (because it isn't tied to a specific session)
+- Picks up tasks submitted via `/swarm-spawn`, `/swarm-submit`, or directly via `claude-swarm submit`
+- Dispatches each task by shelling out to `claude --print`, so the workers themselves also survive your CLI exit
+
+This is the session-resistance property the plugin ships. The "Designed but deferred" meta-supervisor (multi-host respawn + pattern detection) is the next-iteration layer on top of this.
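+
+In Python terms, the single-fork + setsid + IO redirection recipe comes down to a few lines. A minimal sketch; the real `claude-swarm` entry point adds PID-file handling and error paths, and `daemonize` is an illustrative name:
+
+```python
+import os
+import sys
+
+
+def daemonize(log_path: str) -> None:
+    """Detach from the CLI: fork once, start a new session, redirect stdio."""
+    if os.fork() > 0:
+        sys.exit(0)                 # parent returns control to the shell
+    os.setsid()                     # child leads a new session, no controlling terminal
+    os.chdir("/")                   # don't pin any mount point
+    log = open(log_path, "ab", buffering=0)
+    devnull = open(os.devnull, "rb")
+    os.dup2(devnull.fileno(), sys.stdin.fileno())   # stdin  <- /dev/null
+    os.dup2(log.fileno(), sys.stdout.fileno())      # stdout -> daemon.log
+    os.dup2(log.fileno(), sys.stderr.fileno())      # stderr -> daemon.log
+```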
+
+## Bridging native Claude Teams agents to the daemon
+
+A native Agent (spawned by the binary's `Agent` tool) can register a long-running task with the daemon and exit, instead of doing the work itself. The Agent's prompt should be:
+
+> Submit a kanban task to the keepalive swarm via Bash:
+>
+> ```sh
+> claude-swarm submit --home ~/.claude/swarm \
+>   --title "your-task-title" --prompt "your-prompt" --head builder
+> ```
+>
+> Capture the printed task id, write it to the team's inbox, then exit. The daemon will pick up the task; results land back in the inbox when done.
+
+This makes "native agent" and "swarm worker" share a single contract: the filesystem kanban + inbox. The native agent is the front-end (interactive, in your CLI); the daemon-spawned worker is the back-end (long-running, session-resistant).
+
+## Notes
+
+- The daemon's log: `~/.claude/swarm/state/daemon.log`
+- The PID file: `~/.claude/swarm/state/supervisor.pid`
+- Stop with `/swarm-stop` or `claude-swarm daemon-stop --home ~/.claude/swarm`
+- Restart-safe: re-running this command while the daemon is already alive does nothing destructive — it just spawns a fresh fork. Run `/swarm-stop` first if you want a clean restart.
diff --git a/plugins/swarm-orchestrator/commands/swarm-status.md b/plugins/swarm-orchestrator/commands/swarm-status.md
new file mode 100644
index 0000000000..49715af53e
--- /dev/null
+++ b/plugins/swarm-orchestrator/commands/swarm-status.md
@@ -0,0 +1,76 @@
+---
+description: Show the swarm's current state — DAG topology, head activity, blockers, abort markers, token spend.
+argument-hint: "[team-name] [--json] [--watch]"
+allowed-tools: ["Bash", "Read", "Glob", "Grep", "TaskList"]
+---
+
+# Swarm Status
+
+Show the current state of every running swarm, or a specific one if `team-name` is given.
+
+**Args:** $ARGUMENTS
+
+## What you see
+
+```
+keepalive daemon: alive (pid 91168, log ~/.claude/swarm/state/daemon.log)
+swarm: <team-name>   target: main   heads alive: 3 / 4
+DAG (12 tasks, 8 done, 2 in_progress, 2 blocked)
+
+  [scan-parser]     done         scanner   1.2k tok  $0.018
+  [design-visitor]  done         builder   8.4k tok  $0.126
+  [impl-visitor]    in_progress  builder   ~12k tok  $0.180  3m elapsed
+  [add-tests]       in_progress  builder   ~6k tok   $0.090  3m elapsed
+  [review]          blocked      reviewer  -         -       waits on impl-visitor, add-tests
+  [merge]           blocked      merger    -         -       waits on review
+
+abort markers: none
+worktrees: 4 active, 2 stale (will GC on next completion)
+spend so far: $1.42 (cap: $5.00)   token total: 94.3k
+last cascade: 2026-05-10 13:42 UTC (2m ago)
+```
+
+The first line (keepalive daemon liveness) is critical: if it shows `dead` or `no pid file`, the swarm isn't picking up new tasks. Restart with `/swarm-start`.
+
+## Workflow
+
+1. **Check the keepalive daemon FIRST.** Run `claude-swarm daemon-status --home ~/.claude/swarm` and surface alive/dead at the top. If dead, suggest `/swarm-start`.
+
+2. **Locate state.** Read `~/.claude/teams/<team-name>/swarm-dag.json`. If the file is missing, fall back to `TaskList(team=<team-name>)` and reconstruct the topology from `blockedBy` fields on each task. Also surface the keepalive kanban via `claude-swarm list --home ~/.claude/swarm`.
+
+3. **For each task, render:**
+   - id, head (`subagent_type`), status
+   - cumulative tokens + dollars (from `~/.claude/teams/<team-name>/cost-ledger.jsonl` if present)
+   - elapsed time since `dispatched_at` for `in_progress` tasks
+   - blockers list for `blocked` / `pending` tasks
+
+4. **Surface meta-state:**
+   - active worktrees (`git worktree list --porcelain | head`)
+   - abort markers present (`find ~/.claude/teams/<team-name>/worktrees -name 'abort-*'`)
+   - spend rollup vs. configured cap
+
+5. **`--watch`:** redraw every 5 seconds (clear screen + reprint). Exit on Ctrl+C.
+
+6. **`--json`:** dump the structured state to stdout, no formatting.
+
+## Status taxonomy
+
+- `pending` — created but not yet eligible (blockers incomplete) or held by parallelism guard
+- `in_progress` — dispatched, head is running
+- `completed` — head reported done, hook fired, branch merged (or skipped if no branch)
+- `needs_review` — head exited via abort marker or test gate failed; operator must inspect
+- `failed` — terminal error (head crashed, hard rate-limit, budget cap hit)
+- `blocked` — explicit `blockedBy` task is not yet completed
+
+## Useful follow-ups
+
+- `/swarm-status pause <task-id>` — set the task's status to `paused`; the cascade will skip it.
+- `/swarm-status resume <task-id>` — flip back to `pending`; cascade re-evaluates.
+- `/swarm-status cancel <task-id>` — write the abort marker for the head; the head commits WIP and exits.
+- `/swarm-merge <team-name>` — kick off the merge pipeline for any `completed` tasks with branches.
+- `/swarm-status replay <task-id>` — print the timeline of every state transition for post-mortem.
+
+## Notes
+
+- This command is read-only — it never mutates state except via the explicit `pause` / `resume` / `cancel` subcommands.
+- Cost numbers are best-effort estimates; the source of truth is each provider's billing dashboard.
diff --git a/plugins/swarm-orchestrator/commands/swarm-stop.md b/plugins/swarm-orchestrator/commands/swarm-stop.md
new file mode 100644
index 0000000000..49acb5e8df
--- /dev/null
+++ b/plugins/swarm-orchestrator/commands/swarm-stop.md
@@ -0,0 +1,41 @@
+---
+description: Stop the keepalive supervisor daemon. SIGTERM, escalates to SIGKILL after timeout.
+argument-hint: "[--home <dir>] [--timeout-s N]"
+allowed-tools: ["Bash"]
+---
+
+# /swarm-stop — stop the keepalive daemon
+
+Sends `SIGTERM` to the running supervisor daemon, waits up to `--timeout-s` (default 5s), then `SIGKILL` if still alive. Removes the PID file.
+
+## Bash to run
+
+```sh
+HOME_DIR="${1:-$HOME/.claude/swarm}"
+TIMEOUT="${2:-5}"
+claude-swarm daemon-stop --home "$HOME_DIR" --timeout-s "$TIMEOUT"
+```
+
+Output is structured JSON:
+
+```json
+{ "stopped": true, "pid": 91168, "method": "SIGTERM" }
+```
+
+or, if escalation was needed:
+
+```json
+{ "stopped": true, "pid": 91168, "method": "SIGKILL", "reason": "didn't exit within timeout" }
+```
+
+## What happens to in-flight tasks?
+
+Tasks the daemon was actively dispatching get killed mid-flight (their `claude --print` subprocesses are children of the daemon and inherit the signal). On the next `/swarm-start`, the supervisor's `wait_for_work` loop will see those tasks still in `in_progress` and not re-claim them automatically — you'll need to manually `claude-swarm submit` them again or write a small re-dispatcher.
+
+The "stuck-task watchdog" that auto-re-dispatches `in_progress > 30 min` tasks is in the deferred follow-up; see `IMPROVEMENTS_OVER_VANILLA_TEAMS.md`.
+
+## Notes
+
+- The PID file gets cleaned up automatically; safe to re-run.
+- If you want a graceful drain instead, use the abort-marker pattern: drop `<worktree>/.claude/abort-<head>` for each running head, wait for them to commit WIP + exit, then `/swarm-stop`.
+- After stop, the kanban + global-mind log persist on disk; re-launching the daemon picks up the existing state.
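+
+The TERM-then-KILL escalation is small enough to sketch. A minimal Python version of the contract above (illustrative; the shipped `claude-swarm daemon-stop` also manages the PID file and the JSON output):
+
+```python
+import os
+import signal
+import time
+
+
+def stop_daemon(pid: int, timeout_s: float = 5.0) -> str:
+    """SIGTERM first; SIGKILL only if the process outlives the timeout."""
+    os.kill(pid, signal.SIGTERM)
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        try:
+            os.kill(pid, 0)            # signal 0 delivers nothing: a pure liveness probe
+        except ProcessLookupError:
+            return "SIGTERM"           # exited gracefully within the window
+        time.sleep(0.1)
+    try:
+        os.kill(pid, signal.SIGKILL)
+    except ProcessLookupError:
+        return "SIGTERM"               # died right at the deadline
+    return "SIGKILL"
+```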
diff --git a/plugins/swarm-orchestrator/commands/swarm-submit.md b/plugins/swarm-orchestrator/commands/swarm-submit.md
new file mode 100644
index 0000000000..4337b9eaf6
--- /dev/null
+++ b/plugins/swarm-orchestrator/commands/swarm-submit.md
@@ -0,0 +1,85 @@
+---
+description: Submit a single task to the keepalive swarm kanban. Daemon picks it up and dispatches via `claude --print`.
+argument-hint: "<prompt> [--head builder|scanner|reviewer|merger|test-runner|auditor] [--title <title>]"
+allowed-tools: ["Bash"]
+---
+
+# /swarm-submit — single-task submission to the keepalive swarm
+
+Submits one free-form task to the running daemon's kanban. The daemon's `wait_for_work` loop claims it and dispatches it via `claude --print` (session-resistant — survives your CLI exit).
+
+**Prompt:** $ARGUMENTS
+
+## Prerequisite
+
+The daemon must be running. Check with `/swarm-status` or start with `/swarm-start`.
+
+## What this does
+
+1. Parses $ARGUMENTS into prompt + optional --head + optional --title
+2. Calls `claude-swarm submit --home ~/.claude/swarm --title "<title>" --prompt "<prompt>" --head <head>`
+3. Prints the new task id
+4. Reminds the operator how to inspect progress (`/swarm-status` / `claude-swarm list`)
+
+## Bash to run
+
+```sh
+# Parse the user's $ARGUMENTS — first positional becomes the prompt, --head and --title are optional
+HEAD="builder"
+TITLE=""
+PROMPT=""
+# (Implementation: claude reads $ARGUMENTS and constructs the call. See "Note for Claude" below.)
+claude-swarm submit \
+  --home "$HOME/.claude/swarm" \
+  --title "${TITLE:-${PROMPT:0:60}}" \
+  --prompt "$PROMPT" \
+  --head "$HEAD"
+```
+
+## Note for Claude (the assistant invoking this command)
+
+When the operator invokes this command, you (Claude) should:
+
+1. Parse $ARGUMENTS — interpret leading text as the prompt, recognize `--head <x>` and `--title "<text>"` flags
+2. If no `--title` was given, derive one from the first 60 chars of the prompt
+3. Run the Bash above with the parsed values
+4. Echo the returned task id to the operator with one line of context: "Submitted task `<id>` to the keepalive swarm — daemon will dispatch it shortly."
+
+## Example uses
+
+```
+# Quick 30-second background sleep — test session-resistance
+/swarm-submit "sleep 30; echo 'still alive!' > /tmp/swarm-keepalive-proof.txt" --head builder --title "keepalive sanity check"
+
+# Real work — let the daemon do the audit while you go to lunch
+/swarm-submit "Audit ./src for unused imports. Return a list of file:line to delete." --head auditor
+
+# Multi-step task that may take an hour
+/swarm-submit "Run the full integration test suite on this branch. If anything fails, summarize the top 3 root causes." --head test-runner
+```
+
+## After submission
+
+- `/swarm-status` — see daemon liveness + DAG topology + head activity
+- `claude-swarm list --home ~/.claude/swarm` — every task with status + head + title
+- `claude-swarm list --home ~/.claude/swarm --status done` — filter by status
+- `claude-swarm status --home ~/.claude/swarm` — JSON snapshot of kanban + supervisor state
+- `claude-swarm unblocked --home ~/.claude/swarm` — the topological frontier (ready-to-dispatch tasks)
+- `tail -f ~/.claude/swarm/global-mind.jsonl | jq .` — live event stream (one JSONL line per supervisor dispatch)
+
+## Session-resistance contract
+
+After you submit, you can:
+
+1. Exit Claude Code (`/exit` or close terminal)
+2. Wait
+3. Come back via `claude --resume` (or just a fresh `claude`)
+4. The task continues running the whole time — the daemon's subprocess is in a different process group
+5. Run `/swarm-status` or `claude-swarm list` and see the task is `done` (or still `in_progress`)
+
+This is what makes the swarm session-resistant. The "agent" in this model is the daemon-dispatched `claude --print` subprocess, not an in-session Agent-tool spawn.
+
+## Limits
+
+- One prompt per submission. For multi-task DAGs, use `/swarm-spawn`.
+- Default head is `builder`; specify `--head` to use a role-typed agent (Scanner / Reviewer / etc.).
+- Cost is whatever the dispatched `claude --print` consumes; the daemon enforces `cost_cap_usd` from `SupervisorConfig` (default $10 per supervisor run).
diff --git a/plugins/swarm-orchestrator/commands/swarm-test.md b/plugins/swarm-orchestrator/commands/swarm-test.md
new file mode 100644
index 0000000000..37cebe5eb0
--- /dev/null
+++ b/plugins/swarm-orchestrator/commands/swarm-test.md
@@ -0,0 +1,44 @@
+---
+description: Spin up a demo swarm team and populate the native Teams agent-list view with role-typed heads — proves the integration works.
+argument-hint: [team-name]
+allowed-tools: TeamCreate, TaskCreate, Agent, Bash, Read
+---
+
+# /swarm-test
+
+A fast demonstration of swarm-orchestrator integrated with native Anthropic Teams.
+
+Spins up a team called `swarm-test-<timestamp>` (or your provided name), files a 5-task DAG that exercises every role-typed head (Scanner / Builder / Test-Runner / Reviewer / Merger), and dispatches them as native Anthropic team members. The agents appear in the native CLI's agent-list view (the minimal `● main / ○ teammate-name` list at the bottom of the screen) so you can see swarm-orchestrator integrating cleanly with the binary's own surface.
+
+## What you'll see
+
+After running `/swarm-test`:
+
+1. **Native Teams view populated**: the agent list shows the spawned heads — `scanner`, `builder`, `test-runner`, `reviewer`, `merger` — each with their runtime + token usage tracked by the binary's own accounting.
+2. **DAG status surfaces**: tasks show `pending` / `blocked` / `in_progress` / `done` in the task list panel; the auto-cascade hook (`PostToolUse(TaskUpdate)`) re-evaluates the frontier on every completion.
+3. **Role-specific tool access**: each head only has the tools its frontmatter allowlist permits — Reviewer is read-only, Merger is Bash + git, etc.
+4. **Inbox traffic**: messages between heads are visible via the native `SendMessage` tool, on which the plugin layers cross-team routing.
+
+## Usage
+
+```
+/swarm-test            # spawns a team named swarm-test-<unix-timestamp>
+/swarm-test my-demo    # spawns a team named "my-demo"
+```
+
+## How it relates to the standalone library
+
+This command exercises **Mode B** (integrated with Anthropic Teams) from `IMPROVEMENTS_OVER_VANILLA_TEAMS.md`. The same workflow also runs standalone via `bash plugins/swarm-orchestrator/scripts/try-swarm.sh` (Mode A) — same DAG, same heads, but using the `claude-swarm` library's filesystem-backed task list instead of Anthropic's `Task*` tools.
+
+Both modes are tested:
+
+- Mode A: `python3 plugins/swarm-orchestrator/tests/swarming/run_scenario.py --all` (10/10 pass)
+- Mode B: `/swarm-test` after the plugin is loaded; results visible in the native Teams agent list
+
+## Cleanup
+
+```
+/swarm-status        # see the populated team
+/swarm-abort <head>  # graceful exit for any specific teammate
+```
+
+The team is left in place after the demo so the agent list keeps showing it; delete it via `TeamDelete` (native built-in) when done.
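+
+For reference, the 5-task demo graph is a straight chain through every head. An illustrative sketch of its shape (ids and field values are examples; the command derives the real ones at spawn time):
+
+```python
+# Each node carries the same fields as a /swarm-spawn DAG node (trimmed here).
+DEMO_DAG = [
+    {"id": "scan",   "head": "scanner",     "blockedBy": []},
+    {"id": "build",  "head": "builder",     "blockedBy": ["scan"]},
+    {"id": "test",   "head": "test-runner", "blockedBy": ["build"]},
+    {"id": "review", "head": "reviewer",    "blockedBy": ["test"]},
+    {"id": "merge",  "head": "merger",      "blockedBy": ["review"]},
+]
+```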
diff --git a/plugins/swarm-orchestrator/examples/feature_with_review.md b/plugins/swarm-orchestrator/examples/feature_with_review.md new file mode 100644 index 0000000000..a04ef38e93 --- /dev/null +++ b/plugins/swarm-orchestrator/examples/feature_with_review.md @@ -0,0 +1,59 @@ +# Example 2: Build a feature with tests + review + +Goal: add a `--dry-run` flag to a CLI deploy script, with tests for the flag's behavior and an end-of-task review. + +## Spawn + +``` +/swarm-spawn Add a --dry-run flag to cli/deploy.py — prints planned operations, executes nothing. Tests for both with-flag and without-flag paths. Land in one PR. +``` + +## DAG the swarm proposes + +``` + scan-deploy ──► impl-flag ──► add-tests ──► review ──► merge +``` + +A simple linear chain — small feature, no fan-out needed. + +| Task | Head | Notes | +|---|---|---| +| `scan-deploy` | scanner | Reads `cli/deploy.py`, identifies the side-effecting calls that need to be guarded. Files concrete tasks. | +| `impl-flag` | builder | Adds the argparse flag, refactors `execute(...)` to take a `dry_run: bool`, gates side-effects. | +| `add-tests` | builder | Writes tests for both paths against the existing test infra. | +| `review` | reviewer | Reviews for: missed side-effect, log format consistency, doc updates. | +| `merge` | merger | pytest -q, then push. | + +## Why this DAG and not parallel + +For small features, serial is faster than parallel — the coordination cost of fan-out (worktree creation, file-overlap check, sibling sync) outweighs the speedup when each step is < 5 minutes anyway. + +## What review surfaces + +The reviewer agent (read-only) inspects the diff and produces something like: + +``` +REVIEWER end-of-task — task review + +Files changed: cli/deploy.py (+18/-3), tests/test_deploy.py (+42/-0) +Commits: 3 (feat: argparse flag / refactor: thread dry_run / test: dry-run path) + +Findings (confidence ≥ 80): +1. [conf 92] cli/deploy.py:142 — log message reads "Deploying X" even in dry-run. + Suggest: prefix with "[DRY-RUN]" when dry_run=True. + +2. [conf 84] tests/test_deploy.py:67 — test asserts on log output but uses + capsys without capturing stderr. Add capsys.readouterr().err to the assert. + +Otherwise: clean. Tests cover both paths. Docstring updated. + +Recommendation: address both, then merge. +``` + +The Builder picks these up, makes 2 more commits (`refactor: log prefix in dry-run`, `test: capture stderr in deploy tests`), and the cycle continues. Once `review` returns clean, `merge` fires automatically. + +## Expected outcome + +- One PR with 4–5 commits. +- ~20–60k tokens total spend. +- 10–30 minutes wall time. diff --git a/plugins/swarm-orchestrator/examples/multi_day_audit.md b/plugins/swarm-orchestrator/examples/multi_day_audit.md new file mode 100644 index 0000000000..ef7884acfa --- /dev/null +++ b/plugins/swarm-orchestrator/examples/multi_day_audit.md @@ -0,0 +1,64 @@ +# Example 3: Multi-day audit + +Goal: produce a comprehensive complexity + security audit of an existing codebase, ending in a markdown report at `docs/audits/`. No code changes. + +## Spawn + +``` +/swarm-spawn Audit src/auth/ for OWASP top-10 issues AND src/core/ for cyclomatic complexity > 15. Produce two separate audit docs at docs/audits/. No code changes — research only. 
+``` + +## DAG the swarm proposes + +``` + scan-targets ──┬──► owasp-audit ─────┐ + └──► complexity-audit ─┴──► consolidate-summary +``` + +| Task | Head | Notes | +|---|---|---| +| `scan-targets` | scanner | Maps the territory of `src/auth/` and `src/core/`, files the two audit tasks with precise scope. | +| `owasp-audit` | auditor | Read-only deep dive into auth code; produces `docs/audits/auth-owasp-2026-05-10.md`. | +| `complexity-audit` | auditor | Read-only complexity survey of core; produces `docs/audits/core-complexity-2026-05-10.md`. | +| `consolidate-summary` | auditor | Reads both audits, produces a top-level `docs/audits/2026-05-10-summary.md` with priority-ranked findings across both. | + +## Why two audits in parallel + +`src/auth/` and `src/core/` don't overlap (file overlap = 0), so `parallelism_safety=safe` and the two auditors run concurrently. The orchestrator dispatches both as soon as `scan-targets` completes. + +## Why no `merge` + +The deliverable is markdown, not code. The Auditor head writes its `.md` files directly into the working tree. There's no merge gate because there's nothing to merge — the operator commits the audit docs by hand (or via a subsequent `/commit-push-pr`), or the swarm can be configured with a final builder step that does the commit. + +If the operator does want auto-commit: + +``` +/swarm-spawn Audit src/auth/ ... AND commit the resulting docs to a branch + PR. +``` + +Then the DAG becomes: + +``` + scan-targets ──┬──► owasp-audit ──────────┐ + └──► complexity-audit ──┐ │ + ▼ ▼ + commit-docs ──► merge +``` + +with a Builder at `commit-docs` (Bash + Edit only — git add the audit docs, write a commit message, push) and a Merger after. + +## Expected outcome + +- 2–3 markdown files at `docs/audits/`, each 200–600 lines, every finding citing file:line evidence. +- ~80–200k tokens total spend (audit work is read-heavy and Opus-tier). +- 1–4 hours wall time depending on codebase size. + +## Pattern: long-running audits + +For very large codebases, you can split each audit into N sub-audits by directory and chain them serially or in batches: + +``` + scan-targets ──► [audit-auth-1, audit-auth-2, ..., audit-auth-N] ──► consolidate-auth ──► ... +``` + +Each sub-auditor produces a partial doc; `consolidate-auth` merges them into the final report. Useful when one auditor session would blow the context window or budget. diff --git a/plugins/swarm-orchestrator/examples/refactor_python_module.md b/plugins/swarm-orchestrator/examples/refactor_python_module.md new file mode 100644 index 0000000000..58e25d1038 --- /dev/null +++ b/plugins/swarm-orchestrator/examples/refactor_python_module.md @@ -0,0 +1,51 @@ +# Example 1: Refactor a Python module + +Goal: take a 600-line `src/parser.py` written as one big class with type-switch dispatch, and refactor it into the visitor pattern with one class per node kind, plus a complete test suite — all in one PR. + +## Spawn + +``` +/swarm-spawn Refactor src/parser.py to use the visitor pattern. Add tests covering every node kind. Land in one PR. +``` + +## DAG the swarm proposes + +``` + ┌──► impl-base-visitor ──► impl-node-visitors ──► add-tests ──┐ + │ │ + scan-parser ──────┤ ├──► review ──► merge + │ │ + └──► extract-test-fixtures ────────────────────────────────────┘ +``` + +| Task | Head | Why | +|---|---|---| +| `scan-parser` | scanner | Reconnaissance: enumerate every node kind + every call site of the dispatch logic. Files the rest of the tasks. 
| +| `impl-base-visitor` | builder | Define the abstract `Visitor[T]` ABC and migrate the entry point. Small, surgical change. | +| `impl-node-visitors` | builder | Implement one concrete visitor per node kind discovered in `scan-parser`. Blocked on `impl-base-visitor`. | +| `extract-test-fixtures` | builder | Pull existing test inputs into reusable parametrized fixtures. Parallel-safe with the visitor work. | +| `add-tests` | builder | Write tests for every visitor against the new fixtures. Blocked on both above. | +| `review` | reviewer | End-of-task review: DRY, simplicity, missed node kinds. | +| `merge` | merger | Rebase onto main, run pytest, push. | + +## Approve, then watch + +After you approve, the orchestrator: + +1. Dispatches `scan-parser` and `extract-test-fixtures` in parallel (no overlap, both `safe`). +2. When `scan-parser` completes (filed N concrete sub-issues), dispatches `impl-base-visitor`. +3. When `impl-base-visitor` completes, dispatches `impl-node-visitors` (which can fan out further: one builder per visitor class if `parallelism_safety=safe` for non-overlapping files). +4. When all blockers complete, dispatches `add-tests`, then `review`, then `merge`. + +Throughout: the reviewer-checkpoint hook fires every 3 turns inside each Builder, prompting a self-review on commit count + spend + tractability. If a Builder gets stuck (10 turns / no commits / repeated test fails), the operator sees it in `/swarm-status` and can drop an abort marker. + +## Expected outcome + +- One PR titled `refactor(parser): visitor pattern + complete test coverage`. +- 5–10 commits (one per TodoWrite item across all builders, squashed-merged or kept atomic depending on project policy). +- All existing tests pass; new tests added. +- All swarm worktrees GC'd after merge. + +## Rough cost + +For a 600-line file with ~10 node kinds, expect ~60–150k tokens total ($1–$3 on Sonnet) and ~30–90 minutes of wall time. Most of the spend is in the parallel `impl-node-visitors` builders. diff --git a/plugins/swarm-orchestrator/hooks/__init__.py b/plugins/swarm-orchestrator/hooks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/swarm-orchestrator/hooks/hooks.json b/plugins/swarm-orchestrator/hooks/hooks.json new file mode 100644 index 0000000000..8981f44bc7 --- /dev/null +++ b/plugins/swarm-orchestrator/hooks/hooks.json @@ -0,0 +1,28 @@ +{ + "description": "swarm-orchestrator plugin — DAG cascade + reviewer-checkpoint + worktree-GC hooks", + "hooks": { + "PostToolUse": [ + { + "matcher": "TaskUpdate", + "hooks": [ + { + "type": "command", + "command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/on_task_complete.py", + "timeout": 30 + } + ] + } + ], + "Stop": [ + { + "hooks": [ + { + "type": "command", + "command": "python3 ${CLAUDE_PLUGIN_ROOT}/hooks/reviewer_checkpoint.py", + "timeout": 15 + } + ] + } + ] + } +} diff --git a/plugins/swarm-orchestrator/hooks/on_task_complete.py b/plugins/swarm-orchestrator/hooks/on_task_complete.py new file mode 100755 index 0000000000..acfd60927a --- /dev/null +++ b/plugins/swarm-orchestrator/hooks/on_task_complete.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +swarm-orchestrator: on-task-complete hook + +Fires after every TaskUpdate. If the update set status=completed (or merged), we: + + 1. Re-evaluate the DAG frontier — find tasks whose blockedBy is now satisfied + and emit a hint via stdout (the orchestrator session reads this and dispatches). + 2. 
Optionally trigger the merge cascade (`/swarm-merge` programmatically) if + the project config has `merge.auto_on_complete: true`. + 3. Optionally GC worktrees whose branch is now merged. + +This hook is intentionally read-mostly — it does not mutate task state itself. +It writes a structured event to ~/.claude/teams/<team>/cascade-events.jsonl so +the orchestrator session can pick it up on its next poll. + +Exit codes: + 0 — handled (or not applicable; e.g. update was not a status change) + 1 — fatal error (logged but does not block the TaskUpdate) + +Reads JSON from stdin per Claude Code's hook protocol. +""" + +from __future__ import annotations + +import datetime as _dt +import json +import os +import pathlib +import sys +from typing import Any + +TEAMS_ROOT = pathlib.Path(os.path.expanduser("~/.claude/teams")) +LOG_PATH = pathlib.Path(os.path.expanduser("~/.claude/swarm-orchestrator-hook.log")) + + +def _log(msg: str) -> None: + LOG_PATH.parent.mkdir(parents=True, exist_ok=True) + with LOG_PATH.open("a") as fh: + fh.write(f"{_dt.datetime.utcnow().isoformat()}Z on_task_complete {msg}\n") + + +def _atomic_append_jsonl(path: pathlib.Path, record: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + line = json.dumps(record, ensure_ascii=False) + "\n" + with path.open("a") as fh: + fh.write(line) + + +def _read_dag(team: str) -> dict[str, Any] | None: + path = TEAMS_ROOT / team / "swarm-dag.json" + if not path.exists(): + return None + try: + return json.loads(path.read_text()) + except (OSError, json.JSONDecodeError) as e: + _log(f"failed to read DAG for {team}: {e}") + return None + + +def _unblocked_after(dag: dict[str, Any]) -> list[str]: + """Return task ids whose blockedBy entries are all in {completed, merged}.""" + tasks = dag.get("tasks", {}) + done = {t_id for t_id, t in tasks.items() if t.get("status") in {"completed", "merged"}} + out = [] + for t_id, task in tasks.items(): + if task.get("status") not in {"pending", "blocked"}: + continue + blockers = task.get("blockedBy", []) + if all(b in done for b in blockers): + out.append(t_id) + return out + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except (json.JSONDecodeError, OSError) as e: + _log(f"could not parse stdin: {e}") + return 0 # don't block the user's TaskUpdate + + tool = payload.get("tool_name") or payload.get("tool", "") + if tool != "TaskUpdate": + return 0 + + tool_input = payload.get("tool_input") or payload.get("input", {}) + new_status = (tool_input.get("status") or "").lower() + if new_status not in {"completed", "merged"}: + return 0 + + task_id = tool_input.get("task_id") or tool_input.get("id") + team = tool_input.get("team") or payload.get("team_name") + if not (task_id and team): + _log(f"missing task_id or team in TaskUpdate payload: {payload!r}") + return 0 + + dag = _read_dag(team) + if dag is None: + _log(f"no DAG found for team {team}; skipping cascade") + return 0 + + newly_unblocked = _unblocked_after(dag) + + event = { + "ts": _dt.datetime.utcnow().isoformat() + "Z", + "kind": "task_complete", + "team": team, + "task_id": task_id, + "new_status": new_status, + "newly_unblocked": newly_unblocked, + } + _atomic_append_jsonl(TEAMS_ROOT / team / "cascade-events.jsonl", event) + + # Surface the cascade to the orchestrator's chat so it's visible. 
+ if newly_unblocked: + print( + f"[swarm-orchestrator] task {task_id} {new_status}; " + f"newly unblocked: {', '.join(newly_unblocked)}" + ) + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: # noqa: BLE001 # never block a TaskUpdate + _log(f"fatal: {exc!r}") + sys.exit(0) diff --git a/plugins/swarm-orchestrator/hooks/reviewer_checkpoint.py b/plugins/swarm-orchestrator/hooks/reviewer_checkpoint.py new file mode 100755 index 0000000000..8382cfa781 --- /dev/null +++ b/plugins/swarm-orchestrator/hooks/reviewer_checkpoint.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +swarm-orchestrator: reviewer-checkpoint hook + +Fires on Stop. If the session is a swarm Builder AND the turn count crosses a +configured threshold (default: every Nth turn after turn `floor`), this hook +prints a lightweight self-review prompt to stdout, which Claude Code injects +into the Builder's next system message. + +The actual deep review is delegated to the Reviewer subagent on demand — this +hook is a cheap, deterministic nudge. + +Configuration: project's .claude/swarm-orchestrator.json: + { + "reviewer_checkpoint": { + "enabled": true, + "every_n_turns": 3, + "floor": 6 + } + } + +If the file is missing or `enabled` is false, the hook is a no-op. + +Reads JSON from stdin per Claude Code's hook protocol. +""" + +from __future__ import annotations + +import datetime as _dt +import json +import os +import pathlib +import sys +from typing import Any + +LOG_PATH = pathlib.Path(os.path.expanduser("~/.claude/swarm-orchestrator-hook.log")) + + +def _log(msg: str) -> None: + LOG_PATH.parent.mkdir(parents=True, exist_ok=True) + with LOG_PATH.open("a") as fh: + fh.write(f"{_dt.datetime.utcnow().isoformat()}Z reviewer_checkpoint {msg}\n") + + +def _load_config(cwd: pathlib.Path) -> dict[str, Any]: + candidate = cwd / ".claude" / "swarm-orchestrator.json" + if not candidate.exists(): + return {} + try: + return json.loads(candidate.read_text()) + except (OSError, json.JSONDecodeError) as e: + _log(f"could not parse config at {candidate}: {e}") + return {} + + +def _is_swarm_builder(payload: dict[str, Any]) -> bool: + """Heuristic: this is a swarm Builder session if the agent identity hints so.""" + agent = (payload.get("agent_type") or payload.get("subagent_type") or "").lower() + if agent == "builder": + return True + # Fall back: check the working directory for a swarm worktree marker. + cwd = payload.get("cwd") or os.getcwd() + return "/.claude/worktrees/" in cwd or "/swarm-" in cwd + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except (json.JSONDecodeError, OSError) as e: + _log(f"could not parse stdin: {e}") + return 0 + + if not _is_swarm_builder(payload): + return 0 + + cwd = pathlib.Path(payload.get("cwd") or os.getcwd()) + config = _load_config(cwd).get("reviewer_checkpoint", {}) + if not config.get("enabled", True): + return 0 + + every_n = int(config.get("every_n_turns", 3)) + floor = int(config.get("floor", 6)) + + turn = int(payload.get("turn") or payload.get("turn_count") or 0) + if turn < floor: + return 0 + if (turn - floor) % every_n != 0: + return 0 + + print( + "[swarm-orchestrator reviewer-checkpoint]\n" + f"You are at turn {turn}. Before continuing, do a quick self-review:\n" + " 1. DAG status: is your task still in_progress as expected?\n" + " 2. Commits: how many since you started? Are they small + focused?\n" + " 3. TodoWrite: how many items done vs. remaining?\n" + " 4. 
Tractability: any sign of thrash (same file edited > 5x with no commit; " + "repeated test failures with no diagnostic between them)?\n" + "If you spot drift, course-correct now. If you're stuck, write the abort " + "marker and surface to the operator." + ) + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: # noqa: BLE001 + _log(f"fatal: {exc!r}") + sys.exit(0) From 8029801ab7282682d39d7f4fe4c2484e083c81bc Mon Sep 17 00:00:00 2001 From: Kushal Jaligama <kjaligusa@gmail.com> Date: Sun, 10 May 2026 15:51:52 -0700 Subject: [PATCH 2/3] feat(plugins/swarm-orchestrator): testing substrate + design proposal Testing substrate at plugins/swarm-orchestrator/tests/swarming/: - 10 self-contained, deterministic, sub-5-minute toy scenarios exercising every primitive: multi-file-rename, spec-impl-pair, scan-build-review, doc-writer-team, multi-language-port, audit-then-fix, conflict-resolution-drill, abort-marker-test, respawn-on-crash, multi-team-coordination - Canonical scenario JSON schema at schema/scenario.schema.json - Binding-agnostic ScenarioEngine protocol + reference InProcessScenarioEngine - 15 hook unit tests for the cascade + checkpoint hooks - All tests pass: 10/10 scenarios + 15/15 hook tests, no LLM dispatch (every scenario test runs against a deterministic in-process reference engine, so the test suite costs zero tokens) Design proposal at IMPROVEMENTS_OVER_VANILLA_TEAMS.md: - 45 documented primitives across core / reliability / observability / coordination / quality / docs / advanced layers - Multi-tier architecture: supervisor per team, meta-supervisor per host, head-typed agents - Explicit shipped-vs-deferred split (PR description's tables remain authoritative; design doc sketches the full target surface) - Comparison table vs vanilla Teams --- .../IMPROVEMENTS_OVER_VANILLA_TEAMS.md | 546 ++++++++++++++++ plugins/swarm-orchestrator/tests/__init__.py | 0 .../tests/swarming/README.md | 99 +++ .../tests/swarming/__init__.py | 6 + .../fixtures/abort-marker-test/README.txt | 3 + .../fixtures/audit-then-fix/src/buggy_01.py | 5 + .../fixtures/audit-then-fix/src/buggy_02.py | 5 + .../fixtures/audit-then-fix/src/buggy_03.py | 5 + .../fixtures/audit-then-fix/src/clean.py | 4 + .../conflict-resolution-drill/shared.py | 3 + .../fixtures/doc-writer-team/src/alpha.py | 4 + .../fixtures/doc-writer-team/src/beta.py | 4 + .../fixtures/doc-writer-team/src/delta.py | 4 + .../fixtures/doc-writer-team/src/epsilon.py | 4 + .../fixtures/doc-writer-team/src/gamma.py | 4 + .../fixtures/multi-file-rename/files/mod01.py | 6 + .../fixtures/multi-file-rename/files/mod02.py | 6 + .../fixtures/multi-file-rename/files/mod03.py | 6 + .../fixtures/multi-file-rename/files/mod04.py | 6 + .../fixtures/multi-file-rename/files/mod05.py | 6 + .../fixtures/multi-file-rename/files/mod06.py | 6 + .../fixtures/multi-file-rename/files/mod07.py | 6 + .../fixtures/multi-file-rename/files/mod08.py | 6 + .../fixtures/multi-file-rename/files/mod09.py | 6 + .../fixtures/multi-file-rename/files/mod10.py | 6 + .../fixtures/multi-file-rename/manifest.json | 8 + .../fixtures/multi-language-port/README.txt | 1 + .../multi-team-coordination/README.txt | 4 + .../fixtures/respawn-on-crash/README.txt | 3 + .../scan-build-review/sample/feature_01.txt | 1 + .../scan-build-review/sample/feature_02.txt | 1 + .../scan-build-review/sample/feature_03.txt | 1 + .../scan-build-review/sample/feature_04.txt | 1 + .../scan-build-review/sample/feature_05.txt | 1 + .../fixtures/spec-impl-pair/README.txt | 3 + 
.../tests/swarming/run_scenario.py | 89 +++ .../tests/swarming/run_scenario.sh | 10 + .../tests/swarming/runner/__init__.py | 13 + .../tests/swarming/runner/assertions.py | 190 ++++++ .../tests/swarming/runner/harness.py | 201 ++++++ .../tests/swarming/runner/stub.py | 613 ++++++++++++++++++ .../swarming/scenarios/abort-marker-test.json | 23 + .../swarming/scenarios/audit-then-fix.json | 30 + .../scenarios/conflict-resolution-drill.json | 24 + .../swarming/scenarios/doc-writer-team.json | 33 + .../swarming/scenarios/multi-file-rename.json | 35 + .../scenarios/multi-language-port.json | 26 + .../scenarios/multi-team-coordination.json | 25 + .../swarming/scenarios/respawn-on-crash.json | 26 + .../swarming/scenarios/scan-build-review.json | 34 + .../swarming/scenarios/spec-impl-pair.json | 28 + .../swarming/schema/scenario.schema.json | 204 ++++++ .../tests/swarming/test_scenarios.py | 26 + .../swarm-orchestrator/tests/test_hooks.py | 269 ++++++++ 54 files changed, 2679 insertions(+) create mode 100644 plugins/swarm-orchestrator/IMPROVEMENTS_OVER_VANILLA_TEAMS.md create mode 100644 plugins/swarm-orchestrator/tests/__init__.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/README.md create mode 100644 plugins/swarm-orchestrator/tests/swarming/__init__.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/abort-marker-test/README.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_01.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_02.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_03.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/clean.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/conflict-resolution-drill/shared.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/alpha.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/beta.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/delta.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/epsilon.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/gamma.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod01.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod02.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod03.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod04.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod05.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod06.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod07.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod08.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod09.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod10.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/manifest.json create mode 100644 
plugins/swarm-orchestrator/tests/swarming/fixtures/multi-language-port/README.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/multi-team-coordination/README.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/respawn-on-crash/README.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_01.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_02.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_03.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_04.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_05.txt create mode 100644 plugins/swarm-orchestrator/tests/swarming/fixtures/spec-impl-pair/README.txt create mode 100755 plugins/swarm-orchestrator/tests/swarming/run_scenario.py create mode 100755 plugins/swarm-orchestrator/tests/swarming/run_scenario.sh create mode 100644 plugins/swarm-orchestrator/tests/swarming/runner/__init__.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/runner/assertions.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/runner/harness.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/runner/stub.py create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/abort-marker-test.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/audit-then-fix.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/conflict-resolution-drill.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/doc-writer-team.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/multi-file-rename.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/multi-language-port.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/multi-team-coordination.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/respawn-on-crash.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/scan-build-review.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/scenarios/spec-impl-pair.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/schema/scenario.schema.json create mode 100644 plugins/swarm-orchestrator/tests/swarming/test_scenarios.py create mode 100644 plugins/swarm-orchestrator/tests/test_hooks.py diff --git a/plugins/swarm-orchestrator/IMPROVEMENTS_OVER_VANILLA_TEAMS.md b/plugins/swarm-orchestrator/IMPROVEMENTS_OVER_VANILLA_TEAMS.md new file mode 100644 index 0000000000..8b19de7c05 --- /dev/null +++ b/plugins/swarm-orchestrator/IMPROVEMENTS_OVER_VANILLA_TEAMS.md @@ -0,0 +1,546 @@ +# swarm-orchestrator — Improvements over vanilla Anthropic Teams + +This is a **living document** capturing every limitation I hit using Anthropic's Teams beta (gated behind `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` in Claude Code 2.1.138), and every feature I propose to address them. It is intended to serve as the body of the GitHub issue / PR description when this plugin is submitted upstream. + +**Status**: Draft. Add entries as new limitations are discovered. Each entry is dated. 
+ +--- + +## Two ways to use this plugin + +This plugin is designed for **dual use** — either as a complete standalone swarm orchestration system, or as a layered addition to Anthropic's Teams primitives. The same primitive set works in both modes; the difference is which transport layer you wire to. + +### Mode A — Standalone swarm + +Assume Anthropic Teams does not exist. This plugin alone provides: + +- Multi-tier orchestration (meta-supervisor → supervisor → typed heads) +- DAG-dependency task graph with auto-unblock cascade +- Filesystem-backed task list + inbox transport (one Python module, no daemon required) +- Tool use registry with role-based allowlists +- Provider routing (Claude Max plan / API / Bedrock / Vertex / local) +- Worktree isolation + lifecycle management +- Clone-isolated merge pipeline with test gate + rollback +- Per-teammate budget tracking + cost ledger +- Reviewer checkpoint (every N turns) +- Auto-recovery for dead teammates +- Pattern-detection logging for offline classifier training +- 10 toy scenarios for validation + +Works on any machine with `claude` (or a fallback CLI) installed. No experimental flags required. Single `pip install` for the standalone engine; the plugin packages it as a Claude Code surface. + +### Mode B — Integrated with Anthropic Teams + +When `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` is set and the binary's Teams primitives are active, the plugin's standalone storage layer becomes optional. Instead, the plugin **adapts** to Anthropic's on-disk schema (`~/.claude/teams/<name>/config.json`, `inboxes/<recipient>.json`, `tasks/<N>.json`) and uses the binary's built-in tools where available: + +- `Anthropic.TaskCreate` for new tasks → plugin layers DAG-aware dispatch on top +- `Anthropic.SendMessage` for routing → plugin layers cross-team + cross-machine routing +- `Anthropic.InboxPoller` for delivery → plugin reuses; binary already polls the schema +- Plugin's **named subagent_types** (Scanner, Reviewer, Builder, Merger, Test-Runner, Auditor) register via `agents/*.md` and surface to `Agent` calls +- Plugin's hooks (`SubagentStop`, `Stop`) fire on lifecycle events Anthropic's runtime emits + +In Mode B, vanilla Teams users see no behavior change unless they explicitly invoke a plugin slash command (`/swarm-spawn`, `/swarm-status`, `/swarm-merge`) or register a typed-head subagent. Fully backward-compatible. + +### Why dual-mode matters + +- **Anthropic Teams is beta and gated.** Users without the experimental flag still need swarm orchestration. The plugin works for them via Mode A. +- **Multi-machine fleets.** A meta-supervisor on host A can manage a Mode-B team on host B and a Mode-A team on host C — the plugin's storage abstraction unifies them. +- **Graceful fallback.** If a feature ships in Mode B but a user is on an older Claude Code build, they get the Mode-A equivalent automatically. +- **Independent evolution.** The standalone engine can be released on PyPI / Homebrew with its own cadence; the plugin tracks Anthropic's CLI releases. + +The rest of this document describes the **primitives** (which exist in both modes) and the **integration delta** (what changes when bridging to Anthropic Teams in Mode B). 
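+
+As a concrete taste of the Mode A transport, here is roughly what the flock-guarded kanban claim (the `claim_one` primitive enumerated below) can look like; field names are illustrative, not the shipped module:
+
+```python
+from __future__ import annotations
+
+import fcntl
+import json
+
+
+def claim_one(kanban_path: str, head_type: str, worker_id: str) -> dict | None:
+    """Atomically claim one ready task; the exclusive flock serializes claimants."""
+    with open(kanban_path, "r+") as fh:
+        fcntl.flock(fh, fcntl.LOCK_EX)   # released automatically on close
+        tasks = json.load(fh)
+        for task in tasks:
+            if task["status"] == "ready" and task["head"] == head_type:
+                task["status"] = "in_progress"
+                task["claimed_by"] = worker_id
+                fh.seek(0)
+                json.dump(tasks, fh, indent=2)
+                fh.truncate()
+                return task
+    return None                          # nothing ready for this head type
+```
+
+The sqlite-backed library variant gets the same guarantee from a `BEGIN IMMEDIATE` transaction instead of `flock`.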
+ +--- + +## Session resistance — work continues after Claude CLI exits + +A defining property of this swarm: **tasks continue to completion even after the Claude CLI session that started them exits.** + +The meta-supervisor daemon (spawned via `claude --bare` or a launchd / systemd unit) runs independently of any interactive session. It polls the on-disk task list, dispatches teammates as DAG nodes unblock, gates merges through the test pipeline, and respawns crashed workers — all driven by file-system events, not by a live conversation. + +**Configurable at init time.** Operators who prefer human-in-the-loop semantics keep the default (session-bound). Those who want truly autonomous overnight / weekend runs flip the switch: + +```sh +claude-swarm init --persistent # autonomous mode: keep going after CLI exit +claude-swarm init --session-bound # default: stop dispatching when CLI exits +claude-swarm config set persistent.mode true # toggle later +``` + +The CLI's visibility commands (`claude-swarm status`, `swarm-watch` TUI dashboard) **always work** regardless of mode — they attach to the daemon, not to any specific session. Operators get a single pane of glass even when work was kicked off days ago in a now-closed session. + +This is why the architecture leans on filesystem state + a polling loop rather than push-based RPC: every primitive is durable across process restarts, machine reboots, and human breaks. The swarm is a service, not a chat. + +--- + +## Primitive set (complete enumeration) + +These are the primitives the standalone engine implements + the plugin exposes. Each works identically in Mode A and Mode B; storage backends differ but the API is stable. + +### Coordination primitives + +The coordination layer is built on a **DAG-aware kanban** that's safe for parallel access by N workers: +- `claim_one()` is atomic (sqlite `BEGIN IMMEDIATE` in the library; flock-guarded JSON in the plugin's lightweight mode) — no two workers ever claim the same task +- `unblocked()` returns a topologically-correct iterator over tasks whose blockers are all `done` +- `add_blocked_by` / `add_blocks` are first-class mutations; the auto-unblock cascade fires via `PostToolUse(TaskUpdate)` hook +- Status timeline (`tasks/<id>/timeline.jsonl`) captures every transition for audit + replay +- Schema-compatible with Anthropic Teams' `TaskCreate.blockedBy` field — I just add the iterator + cascade on top + +| Primitive | What it does | Mode A backend | Mode B backend | +|---|---|---|---| +| `Kanban.claim_one(head_type)` | Atomic claim from the ready queue | Plugin: flock-guarded JSON; Library: sqlite WAL `BEGIN IMMEDIATE` | Atomic write to Anthropic's `tasks/<id>.json` via `TaskUpdate` | +| `TaskCreate / Get / Update / List / Stop` | Task lifecycle | Plugin's `claude_swarm.tasks` module | Anthropic's `Task*` tools | +| `TeamCreate / Delete` | Team lifecycle | Plugin's `claude_swarm.teams` module | Anthropic's `Team*` tools | +| `SendMessage` | Inter-agent messaging | Plugin's `claude_swarm.messaging` module | Anthropic's `SendMessage` tool | +| `DAG.addBlocks / addBlockedBy` | Task dependencies | Plugin extension | Plugin extension (Anthropic's flat list extended) | +| `DAG.unblocked()` | Iterator over ready-to-dispatch tasks | Plugin | Plugin (read Anthropic's task list, filter) | +| `MultiTeam.create / switch / route` | N concurrent teams per session | Plugin | Plugin (extends single-team limit) | +| `CrossMachine.route(name@machine)` | Multi-host SendMessage | Plugin (SSH tunnel + ed25519 auth) | 
Anthropic's `--remote-control` when available | + +### Reliability primitives + +| Primitive | What it does | +|---|---| +| `AbortMarker(name)` | Drop a marker file; teammate commits WIP + exits at next phase boundary | +| `AutoRecovery` | Meta-supervisor respawns dead teammates from last commit | +| `WorktreeIsolation` | Each teammate gets its own git worktree (parallel-safe) | +| `WorktreeGC` | Prune worktrees of completed teammates | +| `FileOverlapReject` | Reject parallel dispatch if predicted file collision exceeds threshold | +| `BoundedQueue(maxsize, drop="oldest")` | Inbox queues never grow unbounded | +| `AtomicWrites` | All state writes use write-to-tmp + rename (no partial corruption) | +| `Flock(path)` | Per-team-config + per-task lock to prevent concurrent edits | + +### Tool use primitives + +| Primitive | What it does | +|---|---| +| `ToolRegistry.register(name, tool, allowed_heads)` | Declares a tool + which head types may use it | +| `ToolRegistry.scoped_for(head)` | Returns the tool subset a head is allowed; used at spawn-time | +| `ProviderRouter.select(task)` | Picks LLM provider per task hint (e.g., expensive tasks → API, cheap tasks → Max plan) | +| `MCPServer.bundle(name)` | Plugin can ship optional MCP servers as separate packages — none in the core plugin | +| `AntCLI.classify(prompt, model="haiku")` | Cheap one-shot LLM classifications via the Anthropic Platform CLI (`ant`) — used by Scanner heads + meta-supervisor decisions | + +### Quality primitives + +| Primitive | What it does | +|---|---| +| `ReviewerCheckpoint(every_n_turns)` | Spawn a Reviewer head at scheduled intervals to audit team state | +| `TestGate(test_command, branch)` | Run configured tests in a staging clone before merge; rollback on fail | +| `MergePipeline.dry_run(branches)` | Clone-isolated dry-merge; report conflicts + suggested topological order | +| `MergePipeline.execute(branches)` | Atomic batch merge with test gate + rollback | +| `LintGate(lint_command)` | Block merge if linting fails | + +### Observability primitives + +| Primitive | What it does | +|---|---| +| `Budget(team, soft_cap, hard_cap)` | Per-team + per-teammate token budget enforcement | +| `CostLedger.debit(agent, tokens, dollars)` | Time-series ledger of spend per agent / per task | +| `StatusTimeline(team)` | Every state transition logged with timestamp + reason | +| `PatternDetection.log(decision)` | Supervisor decisions logged for offline classifier training | +| `MindStatus()` | Stable JSON status feed at `~/.claude/teams/.status.json` for external UIs (Apple app, web dashboard) | + +### Autonomy primitives + +| Primitive | What it does | +|---|---| +| `Scanner.discover(target)` | Read-only head that files tasks autonomously from codebase scans | +| `MetaSupervisor.poll()` | Long-running loop that detects stalled tasks + respawns dead teammates + enforces budgets | +| `AutoMerge.on_task_complete(task)` | Hook fires on `status=completed` → runs merge pipeline + pushes | +| `ParallelismSafetyClassifier(task)` | Returns estimated probability that this task can run in parallel with others without conflict | + +--- + +## Architecture context (what `SendMessage` is, what this plugin is) + +**`SendMessage` and the Teams primitives are built-in tools**, not a plugin / MCP server / skill. They live inside the `claude` binary and activate when `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` is set. 
The on-disk schema (`~/.claude/teams/<name>/config.json`, `inboxes/<recipient>.json`, `tasks/<N>.json`) is what the binary's `InboxPoller` + `TeammateMailbox` read and write. + +**This `swarm-orchestrator` plugin** is a packaging format that ships: +- `commands/*.md` — slash commands (`/swarm-spawn`, `/swarm-status`, `/swarm-merge`) +- `agents/*.md` — named subagent_types for the heads (Scanner, Reviewer, Builder, Merger, Test-Runner, Auditor) +- `hooks/*.json` — lifecycle hooks (on-task-completed, on-teammate-idle) +- Optional `mcp_servers/` for tools available to spawned teammates +- This document (proposed improvements to the binary built-ins) + +The plugin does NOT modify `SendMessage` or other built-ins. It LAYERS additional behavior on top of them via hooks, slash commands, and named agents — additive only, no breaking changes to vanilla Teams. The IMPROVEMENTS list below is for Anthropic's binary team to consider; landing them upstream would unlock the plugin's full potential. + +## Executive summary + +I built a parallel-agent swarm on top of Anthropic Teams that landed **a large batch of PRs in a 90-minute window** via worktree-isolated teammates coordinating through filesystem-backed inboxes. Along the way I hit specific limitations in the Teams primitives that, if addressed, would unlock significantly more parallel throughput for power users. This plugin proposes those changes as additive features — vanilla Teams users see no behavior change unless they opt in. + +**The headline is autonomous orchestration that survives session exit.** Once configured for persistent mode, the meta-supervisor daemon keeps polling task lists, dispatching heads, gating merges, and recovering crashed teammates — *even after the operator closes the Claude CLI*. Tasks queued before exit continue to completion; new tasks filed by Scanner heads autonomously execute on cadence. The persistence mode is toggleable at `claude-swarm init` time so operators who prefer human-in-the-loop semantics can opt out. + +The headline improvements span a shipped substrate plus a designed extension. **Shipped in this PR**: DAG task dependencies, named role-based subagents (Scanner / Reviewer / Builder / Merger / Test-Runner / Auditor), reviewer checkpoints, abort-marker contract, atomic-claim kanban under parallel workers, filesystem-RPC fallback for SendMessage, bounded inboxes, atomic file writes, worktree garbage collection, session-resistant supervisor daemon (single-host), per-head cost + token accounting, the global-mind transcript, and a persistent-agent state schema (`claude_swarm.agents`) the future native `Agent(..., persistent=True)` flag would write to. **Designed but deferred to follow-up PRs**: multi-host meta-supervisor with auto-respawn for crashed teammates, stuck-task watchdog, per-teammate token budget enforcement, pattern-detection classifier, multi-team support, and the native `Agent` tool refactor for in-binary persistence. The PR description's "Shipped / Deferred" tables (and the *"What this PR ships, and what's next"* section) are authoritative; this design proposal sketches the full target surface. + +--- + +## Multi-tier orchestration architecture (the headline architectural proposal) + +The most significant architectural delta from vanilla Teams: I propose a **three-tier hierarchy** instead of Teams' flat `lead + members` model. 
+
+### What vanilla Teams ships today
+
+```
+TEAM
+├── team-lead (one per team; holds TaskCreate/SendMessage tools)
+├── member-1 (untyped, subagent_type="general-purpose")
+├── member-2 (untyped, subagent_type="general-purpose")
+└── ... (all members are flat peers)
+```
+
+- One lead per team. The lead is the *only* member with team-coordination tools.
+- All other members are equivalent: same tool surface, same role, no specialization.
+- Members communicate with the lead via SendMessage; the lead orchestrates manually.
+- A session can only lead ONE team at a time (L2 limitation).
+- No automatic delegation, no role assignment, no cross-team policy.
+
+### What I built and propose for upstream
+
+```
+          ┌─────────────────────────────────┐
+          │         META-SUPERVISOR         │
+          │  (long-running, one per host)   │
+          │  - watches N teams              │
+          │  - polls inbox + task files     │
+          │  - applies cross-team policy    │
+          │  - auto-recovery + budgets      │
+          │  - pattern detection (offline)  │
+          └────────────────┬────────────────┘
+                           │ polls + dispatches
+                           │
+        ┌──────────────────┼──────────────────┐
+        ▼                  ▼                  ▼
+┌───────────────┐  ┌───────────────┐  ┌───────────────┐
+│  SUPERVISOR   │  │  SUPERVISOR   │  │  SUPERVISOR   │
+│   (Team A)    │  │   (Team B)    │  │   (Team N)    │
+│ - orchestrate │  │ - orchestrate │  │ - orchestrate │
+│ - reviewer    │  │ - reviewer    │  │ - reviewer    │
+│   checkpoint  │  │   checkpoint  │  │   checkpoint  │
+│ - merge graph │  │ - merge graph │  │ - merge graph │
+└───────┬───────┘  └───────┬───────┘  └───────┬───────┘
+        │                  │                  │
+        ▼                  ▼                  ▼
+   (HEADS — typed worker agents, one per task)
+
+┌────────────┐ ┌──────────┐ ┌─────────┐ ┌────────┐ ┌─────────────┐ ┌─────────┐
+│  Scanner   │ │ Reviewer │ │ Builder │ │ Merger │ │ Test-Runner │ │ Auditor │
+│ read-only  │ │ read-only│ │ all     │ │ Bash+  │ │ read+pytest │ │ read-   │
+│ files      │ │ status + │ │ tools   │ │ git    │ │ only        │ │ only;   │
+│ tasks      │ │ cost     │ │         │ │ only   │ │             │ │ produces│
+│ auto       │ │ check    │ │         │ │        │ │             │ │ audits  │
+└────────────┘ └──────────┘ └─────────┘ └────────┘ └─────────────┘ └─────────┘
+```
+
+### Tier-by-tier breakdown
+
+#### Tier 1 — Meta-supervisor (one per host)
+
+A long-running Claude Code session (or daemon spawned via `claude --bare`) that:
+
+- Polls `~/.claude/teams/*/{config.json, inboxes/*, tasks/*}` on a 5-second interval
+- Watches PID liveness for every spawned teammate across every team
+- **Auto-recovery**: detects dead teammates (process gone + status≠completed) and respawns from last commit on their branch
+- **Cross-team policy**: enforces per-team and per-teammate token budgets; pauses on over-budget; routes overflow to a different team
+- **Stalled-task detection**: tasks in `status=in_progress > 30 min without commit` get reviewer-checkpoint dispatched, then re-assigned if no movement
+- **Routing**: when a teammate files a follow-up task during work, the meta-supervisor decides which team (or which existing teammate) gets it based on file overlap + parallelism-safety classification
+- **Cross-machine routing** via `--remote-control`: a teammate on machine A can be addressed from a team on machine B
+- **Pattern detection logging**: every decision logged for offline classifier training (parallelism_safety, success_probability)
+
+Vanilla Teams equivalent: **none.** Closest analogue is the team-lead itself — but the lead is a peer, not a supervisor; it has no policy authority over other leads or teams; it dies with the session.
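+
+A minimal sketch of that Tier-1 poll loop (the 5-second cadence, the glob paths, and the 30-minute stall threshold come from the bullets above; the task-file fields, the mtime proxy, and the `dispatch_reviewer_checkpoint` helper are assumptions):
+
+```python
+import json
+import time
+from pathlib import Path
+
+TEAMS = Path.home() / ".claude" / "teams"
+STALL_SECONDS = 30 * 60  # "status=in_progress > 30 min without commit"
+
+def dispatch_reviewer_checkpoint(task: dict) -> None:
+    """Hypothetical hook: file a Reviewer-head task against a stalled one."""
+    print(f"[meta] stalled task {task.get('id')}: dispatching Reviewer head")
+
+def poll_once() -> None:
+    now = time.time()
+    for task_file in TEAMS.glob("*/tasks/*.json"):
+        task = json.loads(task_file.read_text())
+        if task.get("status") != "in_progress":
+            continue
+        # Proxy for "no commit since": the task file's own mtime (assumption).
+        if now - task_file.stat().st_mtime > STALL_SECONDS:
+            dispatch_reviewer_checkpoint(task)
+
+if __name__ == "__main__":
+    while True:  # 5-second polling interval, per the Tier-1 bullets
+        poll_once()
+        time.sleep(5)
+```
+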
+ +#### Tier 2 — Supervisor (one per team) + +A Claude Code session that holds the **team-lead tools** (`TaskCreate`, `TaskUpdate`, `TaskList`, `SendMessage` to teammates). On top of vanilla Teams' team-lead role, the Supervisor adds: + +- **DAG dependency tracking**: maintains the task graph, only dispatches tasks whose blockers are complete +- **Head dispatch**: when a task is unblocked, assigns the appropriate head type based on task tags (`auditor` task → Auditor head, etc.) +- **Reviewer checkpoint scheduling**: every N turns (configurable), spawns a Reviewer head to audit team state +- **Merge graph**: when a head completes a task, runs the configured merge pipeline (rebase + test gate + push) +- **Worktree GC**: prunes worktrees of completed teammates + +Vanilla Teams equivalent: the team-lead role. I extend it with DAG + head dispatch + reviewer + merge. + +#### Tier 3 — Heads (worker agents, one per task) + +Named `subagent_type` definitions (declared in `agents/*.md`). Each head has: + +- A role-specific system prompt +- A role-specific tool whitelist (Reviewer = read-only; Merger = Bash + git only; Builder = all tools) +- A role-specific input contract (Scanner takes a "scan target"; Reviewer takes a "team state snapshot") + +Concrete heads in the initial PR: + +| Head | Role | Allowed tools | Typical task | +|---|---|---|---| +| **Scanner** | Find work + file tasks automatically | Read, Grep, Bash (read-only) | "Scan this codebase for TODO comments and file tasks per finding" | +| **Reviewer** | Status check + cost audit + drift detection | Read, TaskGet, TaskList, SendMessage | "Audit team A's state; flag any teammate over 50% of budget" | +| **Builder** | Default worker — does the actual implementation | All tools | "Implement task #N per its description" | +| **Merger** | Run the merge pipeline | Bash + git only | "Merge wave 1 of the dry-run plan into master" | +| **Test-Runner** | Gate merges with pytest | Read, Bash (pytest only) | "Run unit tests against branch X; report pass/fail" | +| **Auditor** | Produce audit docs | Read, Grep, Write (docs/research/ only) | "Audit type-hint coverage across the Python source tree" | + +Vanilla Teams equivalent: **none — all members are `general-purpose`**. The plugin registers these via `.claude-plugin/agents/<head>.md` so they appear as named subagent_types to `Agent` calls. + +### The delta in one sentence + +**Vanilla Teams gives you one tier of coordination (lead + flat members); swarm-orchestrator gives you three (meta-supervisor + supervisor + typed heads), enabling autonomous orchestration of N teams from a single host with role-based capability isolation and policy enforcement at the meta tier.** + +### Production validation of the multi-tier architecture + +In a representative session: +- One team-lead (operator's Claude Code session = meta-supervisor role implicit) +- One team +- Spawned: 22+ teammates, each effectively in the Builder head role +- Result: a high-throughput batch of PRs in a 90-minute window + +With explicit meta-supervisor + supervisor + typed-heads architecture, the same throughput is achievable without operator-in-the-loop on every dispatch. Scanner heads file tasks autonomously; Reviewer heads run on a schedule; the meta-supervisor handles cross-team policy. 
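+
+To make the per-role whitelists concrete, a minimal sketch in the spirit of `ToolRegistry.scoped_for` from the primitives tables (the dict encoding and the parenthesized tool qualifiers are illustrative assumptions; the role-to-tool mapping mirrors the heads table above):
+
+```python
+ROLE_TOOLS: dict[str, set[str]] = {
+    "scanner": {"Read", "Grep", "Bash(read-only)"},
+    "reviewer": {"Read", "TaskGet", "TaskList", "SendMessage"},
+    "builder": {"*"},  # all tools
+    "merger": {"Bash", "git"},
+    "test-runner": {"Read", "Bash(pytest)"},
+    "auditor": {"Read", "Grep", "Write(docs/research/)"},
+}
+
+def scoped_for(head: str) -> set[str]:
+    """Return the tool subset a head may use; applied at spawn time."""
+    try:
+        return ROLE_TOOLS[head]
+    except KeyError:
+        raise ValueError(f"unknown head type: {head!r}") from None
+```
+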
**Operator/lead role shifts from "dispatch + monitor" to "set direction + review the daily output".** + +## Limitations of vanilla Teams (with evidence) + +### L1 — `SendMessage` doesn't surface in spawned teammates [2026-05-10] +**Evidence**: 12+ teammates spawned via the `Agent` tool with `team_name` semantics; each ran `ToolSearch select:SendMessage,TaskUpdate,TaskGet` at session start. The tools did NOT load even though `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` was inherited. + +**Root cause**: the Agent tool's exposed schema (`description`, `prompt`, `subagent_type`, `isolation`, `model`, `run_in_background`) does not include team-membership fields. TeamCreate's docstring references `team_name` and `name` params but the Agent tool doesn't accept them. + +**Impact**: parallel teammates can't coordinate via the official RPC. I worked around by writing directly to `~/.claude/teams/<team>/inboxes/<recipient>.json` (the same on-disk schema the runtime's `InboxPoller` reads). The harness *does* surface those file writes as conversation turns to the team-lead — so the inbox transport works one-way (team-lead → teammate is missing). + +**Proposed fix**: extend the `Agent` tool schema to accept `team_name` + `name` as optional params. When set, the spawned session is registered as a team member and SendMessage / TaskGet / TaskUpdate / TaskList become available. + +### L2 — A session can only lead ONE team at a time [2026-05-10] +**Evidence**: Calling `TeamCreate` while already team-lead of another team errors with `"Already leading team X. A leader can only manage one team at a time. Use TeamDelete to end the current team before creating a new one."` + +**Impact**: real workflows need multiple teams concurrently — e.g., a release-engineering team AND a documentation-pass team running in parallel. Users hit this immediately and had to merge tasks into a single team for unrelated workstreams. + +**Proposed fix**: allow N concurrent teams per session. Each team has its own task list, inbox dir, member roster. Add `TeamSwitch(name)` for tool surfaces keyed to "current team". + +### L3 — No DAG dependencies on tasks (only flat lists) [2026-05-10] +**Evidence**: `TaskUpdate` supports `addBlocks` / `addBlockedBy`, but there's no auto-unblock cascade. `TaskList()` returns all tasks regardless of dependency state. + +**Impact**: complex workflows (audit → fix → test → merge) need manual orchestration. The team-lead has to poll for completion + manually dispatch follow-ups. + +**Proposed fix**: ship `TaskList.unblocked()` iterator; auto-cascade unblock when a blocker hits `status=completed`. Add a `Swarm.dag_visualize(team)` command for at-a-glance state. + +### L4 — Idle teammates can't be reliably re-prompted [2026-05-10] +**Evidence**: After a teammate reports "Going idle", I cannot reliably wake them. The InboxPoller (per binary strings) checks for unread messages and "submits immediately if session idle", but spawned teammates often appear to have their process terminated after first-task completion rather than truly idling. + +**Impact**: follow-up work requires spawning a new agent rather than reusing the existing one — wastes the warm context. + +**Proposed fix**: persistent teammate processes with explicit `Swarm.TeammateMode.persistent`. Process stays alive in `wait_for_inbox()` loop until either a `shutdown_request` arrives or the team is deleted. + +### L5 — No graceful interrupt / abort signal [2026-05-10] +**Evidence**: Only `TaskStop` (hard kill) is available. 
Aborting a mid-flight teammate loses all uncommitted WIP. + +**Impact**: when redirecting a teammate (the operator changes direction, hits a budget cap, discovers a better approach), there's no clean "commit what you have and exit" path. + +**Proposed fix**: standardized abort-marker file contract: `<worktree>/.claude/abort-<name>` triggers a phase-boundary commit + push + clean exit. Ships as a standard prompt block + a runtime check the binary fires when the file appears. + +### L6 — No worktree isolation by default; concurrent teammates clobber [2026-05-10] +**Evidence**: 5+ teammates spawned without `isolation: "worktree"` had their uncommitted edits destroyed when concurrent `git checkout` operations rewrote the shared working tree. Required cherry-pick recovery after each clobber. + +**Impact**: parallel-agent throughput is gated on every spawn having worktree isolation. Easy to forget; expensive to recover. + +**Proposed fix**: make `isolation: "worktree"` the default for parallel teammates (Agent tool spawned within a team context). Add team-level GC for finished worktrees. + +### L7 — No reviewer checkpoint / status-pulse mechanism [2026-05-10] +**Evidence**: Teammates can drift (loop on a wrong approach, run over budget, get stuck on a sub-problem) with no built-in correction. Vanilla Teams has no equivalent of "every N turns, force a self-assessment". + +**Impact**: high-trust dispatch only — if a task description is wrong, the teammate spends the full budget on the wrong solution. + +**Proposed fix**: opt-in reviewer-checkpoint hook. Every N turns the runtime injects a self-review prompt forcing DAG status, commit count vs expected, cost vs budget, "is this still tractable?" check. Sub-agent type `Reviewer` is dispatched at scheduled intervals. + +### L8 — No per-teammate token / cost budget enforcement [2026-05-10] +**Evidence**: Budget tracking is per-session (operator-visible) but not per-teammate. A runaway teammate can burn through a session's entire token budget. + +**Impact**: hard to run cost-sensitive workflows; no early-warning before a teammate blows through. + +**Proposed fix**: per-teammate budget metadata in the team config. Soft cap = warning sent via SendMessage to team-lead. Hard cap = teammate paused (kept around for resumption rather than killed). + +### L9 — No auto-recovery for crashed teammates [2026-05-10] +**Evidence**: When a teammate's session terminates abnormally (OOM, network drop, sigkill), the team-lead has no notification and must manually detect + respawn. The history-view teammate (a02f1b97) died mid-work and its WIP was lost. + +**Impact**: long-running parallel batches are fragile. One crash → operator intervention. + +**Proposed fix**: meta-supervisor daemon polls for status transitions. On unexpected death (in_progress > N min without commit, PID gone): replay abort-marker contract on the worktree (commit current WIP to branch) + spawn replacement teammate that picks up from last commit. + +### L10 — No named role-based subagent_types [2026-05-10] +**Evidence**: All teammates are spawned with `subagent_type: "general-purpose"`. There's no way to declare "this teammate is a Reviewer — read-only, fires on a schedule" vs "this teammate is a Merger — Bash+git only". + +**Impact**: tool-access restriction is ad-hoc (per-prompt instructions); role specialization is by convention. Easy for a Reviewer to accidentally edit code; easy for a Merger to run arbitrary commands. 
+ +**Proposed fix**: heads architecture — register `Scanner`, `Reviewer`, `Builder`, `Merger`, `Test-Runner`, `Auditor` as named subagent_types with role-specific system prompts + tool restrictions. Configurable; users can register custom heads via `.claude-plugin/heads/<name>.md`. + +### L11 — No first-class testing substrate for swarms [2026-05-10] +**Evidence**: There are no example scenarios shipped with Teams that exercise multi-teammate coordination, DAG dependencies, parallel merging, etc. Users have to invent their own from scratch. + +**Impact**: hard to validate a swarm's behavior; hard to file bug reports with reproducible examples; hard to teach the patterns. + +**Proposed fix**: ship 10 toy scenarios in `plugins/swarm-orchestrator/tests/swarming/` that exercise every primitive (DAG, heads, merging, abort-marker, multi-team, etc.). Each is self-contained, deterministic, <5 min. Same scenarios run identically across vanilla Teams, the swarm-orchestrator plugin, and (for my extended use case) the standalone `claude-swarm` library — proving compatibility. + +### L12 — No cross-machine SendMessage routing [2026-05-10] +**Evidence**: `claude --remote-control` exists as a CLI flag but is not exposed via the Teams API as a transport for SendMessage. + +**Impact**: multi-host fleets (laptop + Mac mini + cloud GPU) can't run a single team across machines. Each machine has to be its own team; cross-machine coordination requires custom plumbing. + +**Proposed fix**: extend the `to` field in SendMessage to accept `name@machine` syntax. Route via `--remote-control` channel when machine ≠ local. + +### L13 — Auto-merge graph not wired to team-task completion [2026-05-10] +**Evidence**: Teammates ship branches; merging is manual. No hook fires on `status=completed` to attempt merge. + +**Impact**: 13 PRs landed but waiting for manual merge. A real swarm needs continuous merge-on-completion. + +**Proposed fix**: opt-in hook `on-task-completed` that runs a configurable merge pipeline (rebase → test gate → push). Ships with a default `merge_pipeline.py` adapted from my internal one (clone-isolated, topo-sorted, conflict-aware). + +### L14 — Worktree-discovery bug: cwd disagrees with worktree path [2026-05-10] +**Evidence**: A spawned teammate (save-checkpoint-ui, ab396ed08) saw `cwd = /Users/.../.claude/worktrees/agent-X` in the system reminder but the actual worktree was at `/Users/.../.ai/.claude/workspace/worktrees/agent-X`. Early edits landed in the wrong tree. + +**Impact**: silent data corruption in the main tree; required stash-and-replay to recover. + +**Proposed fix**: at session start, the binary verifies `pwd == worktree_path_from_team_config` and either fixes the cwd or errors loudly. Add to the Agent tool's spawn contract. + +### L15 — No discoverable subcommand schema in `claude agents` [2026-05-10] +**Evidence**: `claude agents --help` shows only `--setting-sources`. No subcommands for `spawn`, `kill`, `list`, `restart`, etc. — even though binary strings suggest these capabilities exist. + +**Impact**: operator can't manage teammates from a shell; everything is through the Agent tool in-session. + +**Proposed fix**: expand `claude agents` subcommands: `list`, `spawn`, `kill`, `restart`, `logs`, `inspect`. Mirrors `docker ps` / `docker exec` semantics. Useful for CI integration + ops automation. + +--- + +## Features I propose (concrete additions) + +(See `the companion feature catalog in this plugin directory` for the full 45-item list. Headline additions to vanilla Teams:) + +1. 
**DAG dependencies** with auto-unblock cascade + `TaskList.unblocked()` iterator +2. **Heads architecture** — Scanner / Reviewer / Builder / Merger / Test-Runner / Auditor as named subagent_types with role-specific prompts + tool restrictions +3. **Multi-team support** — N concurrent teams per session, cross-team SendMessage +4. **Reviewer checkpoint hook** — every N turns, fire a self-review prompt +5. **Meta-supervisor daemon** — long-running session polls task lists + inboxes, auto-recovers, routes findings +6. **Abort-marker contract** — standardized graceful interrupt +7. **Auto-recovery** — respawn dead teammates from last commit +8. **Per-teammate budget** — soft cap + hard cap with pause-not-kill +9. **Cross-machine SendMessage** — `name@machine` routing via `--remote-control` +10. **Auto-merge graph hook** — fires on `status=completed`, runs configurable merge pipeline +11. **Worktree isolation default** — for teammates spawned within a team context +12. **Persistent teammate mode** — opt-in `wait_for_inbox` loop instead of one-shot +13. **Pattern detection logging** — every supervisor decision logged for offline classifier training +14. **Testing substrate** — 10 toy scenarios shipped with the plugin +15. **Mind-page status endpoint** — stable JSON feed at `~/.claude/teams/.status.json` for external UIs + +--- + +## Compatibility / breaking changes + +**None.** All features are additive; vanilla Teams users see no behavior change unless they install this plugin and opt in. The on-disk schema (`~/.claude/teams/<name>/config.json`, `tasks/<N>.json`, `inboxes/<recipient>.json`) is preserved byte-for-byte — the plugin's `InboxPoller` reads the same files Anthropic's runtime does. Existing teams keep working alongside swarm-extended teams. + +--- + +## Test scenarios (validation of every primitive) + +Ten toy scenarios ship in `plugins/swarm-orchestrator/tests/swarming/`. Each is self-contained, deterministic, <5 min. The same scenario JSON schema runs against vanilla Teams (where applicable), the swarm-orchestrator plugin, and my standalone `claude-swarm` library — proving 3-way compatibility: + +1. `multi-file-rename` — parallel-safe + merge +2. `spec-impl-pair` — DAG dependency +3. `scan-build-review` — heads end-to-end +4. `doc-writer-team` — parallel dispatch +5. `multi-language-port` — cross-teammate independence +6. `audit-then-fix` — DAG + meta-supervisor +7. `conflict-resolution-drill` — merge pipeline rebase +8. `abort-marker-test` — graceful WIP commit +9. `respawn-on-crash` — meta-supervisor recovery +10. `multi-team-coordination` — two teams + cross-team SendMessage + +--- + +## Production validation + +This swarm pattern was validated on a real production codebase via worktree-isolated teammates coordinating through filesystem-backed inboxes. Teammates produced clean, reviewable PRs with tests; no data was lost; no merge conflicts went unresolved. I propose the patterns that made that throughput possible. + +--- + +## How to add a new limitation entry + +When a new limitation is discovered: + +1. Reproduce it (capture evidence — error message, stack trace, binary string) +2. Append a new `L<N>` section below the last entry +3. Date the entry +4. Propose a fix +5. Commit + push to this file in the plugin directory + +The PR description will be regenerated from this file on submission. + +--- + +## Roadmap (v0.2.0 → v1.0): server + CLI + autonomous self-iteration + +What the v0.1.0 plugin ships today is the foundation. The vision below is what I build on top. 
+Each item below corresponds to a planned follow-up PR.
+
+### v0.2.0 — Swarm server + CLI dashboard
+- **Long-running `swarm-server` daemon** (one per host) holding live state for all teams + heads + tasks
+- **`swarm` CLI tool** that attaches to the server: `swarm status`, `swarm spawn`, `swarm logs <name>`, `swarm tail <name>`, `swarm budget`
+- **TUI dashboard** (`textual`-based) showing the equivalent of Anthropic Teams' interface — running agents, token counts, runtimes, task progress, live event feed
+- **Statistics + continuously updated summaries** — every state change is captured and surfaced; the dashboard updates in real time
+- Designed to be installable standalone (works without Claude Code) AND wired into the plugin for Claude Code users
+
+### v0.3.0 — Message bus: Python-native default, pluggable backends
+
+The default stack is intentionally **Python-native, zero external brokers**:
+
+- **FastAPI + Starlette WebSockets** for live event streams (CLI ↔ server, server ↔ TUI dashboard, cross-process)
+- **asyncio.Queue** for in-process pub/sub (event_bus)
+- **filesystem-backed JSON** for durable state (Anthropic Teams inbox schema-compatible by design)
+- **HTTP + FastAPI** for RPC (typed via Pydantic models)
+- **Production-validated** at parallel-agent scale on a real codebase — no broker required, no JVM, no extra ops surface
+
+For users who outgrow a single host, pluggable backends are first-class:
+
+| Backend | Use case | Trade-off |
+|---|---|---|
+| **NATS + JetStream** | Cloud-native, sub-ms latency, ~10M msgs/sec, JetStream for replay | Adds a server binary; minimal ops surface |
+| **Apache Kafka** | Massive scale (millions of events/sec), durable log streaming, industry standard at Netflix/LinkedIn/Uber | Heavier ops (ZooKeeper or KRaft), JVM-based |
+| **gRPC** | Typed RPC for CLI ↔ server when Protobuf cross-language is required | Heavier serialization than JSON; great for typed contracts |
+| **Redis Streams** | Already-deployed Redis, simple replay semantics | Extra dependency if not already running Redis |
+| **ZeroMQ** | Embedded / HFT / no-broker requirements | Steeper learning curve; users compose patterns themselves |
+
+The plugin's `MessageBus` interface lets users swap backends without touching agent code. The subject hierarchy maps to the swarm domain: `swarm.team.<name>.head.<role>.task.<id>.{spawned,progress,completed,failed}` — and works identically across all backends.
+
+**Cross-machine routing**: every backend supports it (NATS clustering, Kafka brokers, gRPC over TLS, Redis cluster mode, ZeroMQ TCP sockets). The Python-native default uses WebSockets over SSH-tunneled ports for the same effect.
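+
+A minimal sketch of that `MessageBus` seam (the `publish`/`subscribe` shapes and the queue-backed default are illustrative assumptions; only the subject hierarchy comes from this section):
+
+```python
+import asyncio
+from typing import Protocol
+
+class MessageBus(Protocol):
+    async def publish(self, subject: str, payload: dict) -> None: ...
+    def subscribe(self, prefix: str) -> "asyncio.Queue[tuple[str, dict]]": ...
+
+class InProcessBus:
+    """Default backend: asyncio.Queue fan-out, zero brokers."""
+    def __init__(self) -> None:
+        self._subs: list[tuple[str, asyncio.Queue]] = []
+
+    async def publish(self, subject: str, payload: dict) -> None:
+        for prefix, queue in self._subs:
+            if subject.startswith(prefix):
+                await queue.put((subject, payload))
+
+    def subscribe(self, prefix: str) -> "asyncio.Queue[tuple[str, dict]]":
+        queue: asyncio.Queue = asyncio.Queue()
+        self._subs.append((prefix, queue))
+        return queue
+
+async def demo() -> None:
+    bus = InProcessBus()
+    inbox = bus.subscribe("swarm.team.alpha.")  # documented subject hierarchy
+    await bus.publish("swarm.team.alpha.head.builder.task.7.completed", {"ok": True})
+    print(await inbox.get())
+
+asyncio.run(demo())
+```
+
+Swapping in NATS or Kafka means re-implementing these two methods against that broker's client; agent code keeps publishing to the same subjects.
+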
+ +### v0.4.0 — Mandatory worktree isolation + autonomous worktree lifecycle +- **Worktree mandation**: every parallel-spawned teammate gets its own git worktree, enforced at the plugin's spawn boundary — never optional, never bypassed +- **Autonomous lifecycle**: created at spawn, monitored for orphan state, garbage-collected after successful merge OR after `WORKTREE_TTL_HOURS` (default 24h) of inactivity +- **Filesystem-level guard** rejects any direct parallel work in the shared tree (prevents the entire class of "concurrent agents clobber each other's edits") +- Integrates with Anthropic's worktree-isolation feature when available, falls back to the plugin's own implementation otherwise + +### v0.5.0 — Autonomous self-iteration test framework +- **The test suite spawns a real swarm against the swarm-orchestrator codebase itself.** Meta-supervisor + Scanner + Builders + Reviewers + Merger work to improve the plugin per a charter + acceptance criteria. +- **Self-maintaining + self-healing**: the swarm iterates on its own bugs (correctness audit findings → tasks → fixes → merges) without human intervention +- **Visibility**: every iteration produces a report — what changed, why, what tests passed/failed, what the meta-supervisor's reasoning was +- **Industry-grade `pytest` framework** with rich output (I'm considering `pytest-rich` or `pytest-asyncio` for the orchestration scenarios) +- Runs nightly as the canonical CI signal; failing iterations get human-attention notification + +### v0.6.0 — Advanced synchronization + multiprocessing primitives +- **`multiprocessing.Manager`-backed shared state** for cross-process coordination (alternative to all-file-IO) +- **Async/await throughout** the meta-supervisor poll loop (Python `asyncio`) +- **Lock-free queues** where applicable (per-teammate inbox is single-producer single-consumer) +- **Backpressure** on the meta-supervisor (if it falls behind, dispatch is paused rather than queued unboundedly) +- **Heartbeat** between server and clients (CLI loses server connection → auto-reconnects with state catchup) + +### v1.0 — Production readiness +- 99.9% uptime across the meta-supervisor over 30-day observation +- Full backward compatibility with v0.x clients +- `claude plugin install swarm-orchestrator` works on every Claude Code release +- Documentation + tutorial videos + at least 3 case studies of large-scale parallel swarm runs +- Pattern-detection classifier trained on logged decisions; published as `claude-swarm classify` CLI subcommand + +## Authors / contributors + +- Kushal Jaligama — design lead (`kushalj1997` on GitHub) +- Claude Opus 4.7 (1M context) — drafted via agent team coordination + +--- + +## License + +This plugin does not currently ship a LICENSE file inside its directory, following the convention of every other in-repo plugin in `anthropics/claude-code/plugins/`. The repository's top-level `LICENSE.md` (Anthropic PBC commercial terms) applies by default. + +**The author is happy to follow Anthropic's licensing guidance and recommendation.** If a specific permissive license (e.g., Apache 2.0 or MIT) is preferred for community contribution / external use of the standalone engine, please advise in the PR review and the author will adopt it. If no LICENSE file is the in-repo convention, this PR matches that convention as-shipped. 
diff --git a/plugins/swarm-orchestrator/tests/__init__.py b/plugins/swarm-orchestrator/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/swarm-orchestrator/tests/swarming/README.md b/plugins/swarm-orchestrator/tests/swarming/README.md new file mode 100644 index 0000000000..18a5ca5848 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/README.md @@ -0,0 +1,99 @@ +# Swarm testing substrate + +Ten toy swarm scenarios that exercise every primitive (DAG, heads, merging, +abort marker, multi-team) — plus a binding-agnostic runner. The substrate +ships verbatim in three locations so the same scenario JSON drives all +three swarm bindings: + +| Binding | Location | +| ----------------------------------------- | -------------------------------------------------------------- | +| Anthropic Teams (`claude-code` plugin) | `~/dev/projects/claude-code/plugins/swarm-orchestrator/tests/swarming/` | +| Standalone CLI (`claude-swarm`) | `~/dev/projects/claude-swarm/tests/scenarios/` | +| Internal swarm (this repo) | `tests/swarming/` | + +The schema (`schema/scenario.schema.json`) is the contract; per-binding +runners read the same JSON. + +## Running scenarios + +```bash +# Internal binding (this repo) +python tests/swarming/run_scenario.py multi-file-rename +python tests/swarming/run_scenario.py --all + +# Standalone CLI +claude-swarm scenario run multi-file-rename +claude-swarm scenario run --all + +# Plugin (inside claude-code) +plugins/swarm-orchestrator/tests/swarming/run_scenario.sh multi-file-rename +``` + +All three call into `runner/harness.py`, which materializes fixtures into +a fresh tempdir, asks the binding's `ScenarioEngine` to do the work, then +hands the result + workspace to `runner/assertions.evaluate`. Every +scenario asserts the same way regardless of binding. + +## The 10 scenarios + +| # | Name | Primitives tested | +| - | -------------------------- | ------------------------------------------------ | +| 1 | multi-file-rename | file-overlap-reject, atomic-merge | +| 2 | spec-impl-pair | DAG dependency | +| 3 | scan-build-review | heads architecture end-to-end | +| 4 | doc-writer-team | parallel-safe dispatch | +| 5 | multi-language-port | cross-teammate independence | +| 6 | audit-then-fix | DAG + meta-supervisor task-file | +| 7 | conflict-resolution-drill | merge pipeline rebase | +| 8 | abort-marker-test | clean WIP commit on abort | +| 9 | respawn-on-crash | meta-supervisor recovery | +| 10| multi-team-coordination | two teams + cross-team SendMessage | + +## Reference engine vs real bindings + +`runner/stub.py` ships an `InProcessScenarioEngine` — a deterministic, +LLM-free reference implementation. Today every binding falls back to it +so the substrate is independent of binding readiness. + +When a real binding lands, replace the engine factory in its runner: + +- `tests/swarming/run_scenario.py` -> `_make_engine()` checks for + `claude_swarm.scenario_engine.StandaloneScenarioEngine`. +- `tests/scenarios/run_scenario.py` (claude-swarm) -> the package's + `claude_swarm.scenarios.engine.StandaloneScenarioEngine` once the + CLI is built out. +- `plugins/swarm-orchestrator/tests/swarming/run_scenario.sh` -> calls + the plugin's TaskCreate/TaskUpdate via the in-binary swarm engine. + +Until the real engines arrive, the reference engine + identical +scenario JSON give CI a green signal. + +## Adding a new scenario + +1. Pick a kebab-case name. Add `scenarios/<name>.json` (validated + against `schema/scenario.schema.json`). +2. 
Drop fixtures under `fixtures/<name>/`. The runner copies them + into a fresh tempdir before invoking the engine. +3. If the in-process reference handler doesn't already cover the + scenario, register a handler in `runner/stub.py::_DISPATCH`. +4. Run `python run_scenario.py <name>` until it's green. +5. Mirror the new files into the other two binding locations. + +## Determinism + +- Fixtures are seed-controlled (`setup.seed`). +- File enumeration uses sorted iteration order. +- Time-dependent behaviour (`abort_after_seconds`, + `introduce_conflict_after_seconds`) is tunable via `inject` so a + flaky CI host can lengthen timeouts without rewriting the scenario. + +## CI + +Each binding wires its own job: + +- `claude-swarm` library: GitHub Actions matrix `python-{3.11,3.12,3.13}`. +- `claude-code` plugin: runs alongside the project's existing plugin-tests job. +- Downstream consumers: drop into your `pytest` invocation; the substrate is + self-contained. + +Failing scenario = bisect bad commit. diff --git a/plugins/swarm-orchestrator/tests/swarming/__init__.py b/plugins/swarm-orchestrator/tests/swarming/__init__.py new file mode 100644 index 0000000000..b34fc99c79 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/__init__.py @@ -0,0 +1,6 @@ +"""swarm-orchestrator scenario substrate — toy swarm scenarios that exercise +every primitive (DAG dependencies, role-typed heads, parallel merge, abort +marker, multi-team coordination) against the binding-agnostic engine +protocol. Same JSON shape can drive the plugin's lightweight mode, the +standalone claude-swarm library, or any future native-Teams binding. +""" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/abort-marker-test/README.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/abort-marker-test/README.txt new file mode 100644 index 0000000000..fd5a1c7d9f --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/abort-marker-test/README.txt @@ -0,0 +1,3 @@ +Abort marker scenario: teammate appends to long_running_output.txt every +10ms; the runner drops .claude/abort-renamer-1 after 50ms; teammate must +do a clean WIP commit and exit. 
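+
+A teammate-side sketch of the contract (illustrative only; the work step
+and commit message below are assumptions, not the shipped implementation):
+
+    import subprocess, time
+    from pathlib import Path
+
+    marker = Path(".claude/abort-renamer-1")
+
+    def do_one_unit_of_work() -> None:
+        with open("long_running_output.txt", "a") as f:
+            f.write("tick\n")
+
+    while not marker.exists():      # check at each phase boundary
+        do_one_unit_of_work()
+        time.sleep(0.01)            # 10ms cadence, per this scenario
+    subprocess.run(["git", "add", "-A"], check=True)
+    subprocess.run(["git", "commit", "-m", "WIP: abort-marker exit"], check=True)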
diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_01.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_01.py new file mode 100644 index 0000000000..86443264c5 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_01.py @@ -0,0 +1,5 @@ +"""buggy_01 — has a BUG marker the auditor flags.""" + +def run_01(x): + # BUG: returns wrong value + return x diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_02.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_02.py new file mode 100644 index 0000000000..9cb7c7d8c2 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_02.py @@ -0,0 +1,5 @@ +"""buggy_02 — has a BUG marker the auditor flags.""" + +def run_02(x): + # BUG: returns wrong value + return x diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_03.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_03.py new file mode 100644 index 0000000000..9ec1ede261 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/buggy_03.py @@ -0,0 +1,5 @@ +"""buggy_03 — has a BUG marker the auditor flags.""" + +def run_03(x): + # BUG: returns wrong value + return x diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/clean.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/clean.py new file mode 100644 index 0000000000..28e2c0c25e --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/audit-then-fix/src/clean.py @@ -0,0 +1,4 @@ +"""clean — no bug, auditor skips.""" + +def ok(): + return True diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/conflict-resolution-drill/shared.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/conflict-resolution-drill/shared.py new file mode 100644 index 0000000000..a89a738b4e --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/conflict-resolution-drill/shared.py @@ -0,0 +1,3 @@ +"""shared.py — both teams will append a line; merge pipeline must resolve.""" + +base_value = 0 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/alpha.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/alpha.py new file mode 100644 index 0000000000..aa43f6f6cc --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/alpha.py @@ -0,0 +1,4 @@ +"""alpha module — placeholder for documentation generation.""" + +def alpha_main(x): + return x * 2 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/beta.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/beta.py new file mode 100644 index 0000000000..70d15d3871 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/beta.py @@ -0,0 +1,4 @@ +"""beta module — placeholder for documentation generation.""" + +def beta_main(x): + return x * 2 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/delta.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/delta.py new file mode 100644 index 0000000000..824ae7917f --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/delta.py @@ -0,0 +1,4 @@ +"""delta module — placeholder for documentation generation.""" + +def delta_main(x): + return 
x * 2 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/epsilon.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/epsilon.py new file mode 100644 index 0000000000..276e07f56c --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/epsilon.py @@ -0,0 +1,4 @@ +"""epsilon module — placeholder for documentation generation.""" + +def epsilon_main(x): + return x * 2 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/gamma.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/gamma.py new file mode 100644 index 0000000000..6183fecf4d --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/doc-writer-team/src/gamma.py @@ -0,0 +1,4 @@ +"""gamma module — placeholder for documentation generation.""" + +def gamma_main(x): + return x * 2 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod01.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod01.py new file mode 100644 index 0000000000..29b5f6b991 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod01.py @@ -0,0 +1,6 @@ +"""Module 01 — uses foo as a placeholder name.""" + +def foo_01(): + return "foo from module 01" + +VALUE_FOO_01 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod02.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod02.py new file mode 100644 index 0000000000..7b63ddb665 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod02.py @@ -0,0 +1,6 @@ +"""Module 02 — uses foo as a placeholder name.""" + +def foo_02(): + return "foo from module 02" + +VALUE_FOO_02 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod03.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod03.py new file mode 100644 index 0000000000..81208f835e --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod03.py @@ -0,0 +1,6 @@ +"""Module 03 — uses foo as a placeholder name.""" + +def foo_03(): + return "foo from module 03" + +VALUE_FOO_03 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod04.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod04.py new file mode 100644 index 0000000000..b8c7f8a23d --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod04.py @@ -0,0 +1,6 @@ +"""Module 04 — uses foo as a placeholder name.""" + +def foo_04(): + return "foo from module 04" + +VALUE_FOO_04 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod05.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod05.py new file mode 100644 index 0000000000..2a2df0ba5b --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod05.py @@ -0,0 +1,6 @@ +"""Module 05 — uses foo as a placeholder name.""" + +def foo_05(): + return "foo from module 05" + +VALUE_FOO_05 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod06.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod06.py new file mode 
100644 index 0000000000..02f945472f --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod06.py @@ -0,0 +1,6 @@ +"""Module 06 — uses foo as a placeholder name.""" + +def foo_06(): + return "foo from module 06" + +VALUE_FOO_06 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod07.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod07.py new file mode 100644 index 0000000000..2178ff7d35 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod07.py @@ -0,0 +1,6 @@ +"""Module 07 — uses foo as a placeholder name.""" + +def foo_07(): + return "foo from module 07" + +VALUE_FOO_07 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod08.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod08.py new file mode 100644 index 0000000000..859f065a6a --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod08.py @@ -0,0 +1,6 @@ +"""Module 08 — uses foo as a placeholder name.""" + +def foo_08(): + return "foo from module 08" + +VALUE_FOO_08 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod09.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod09.py new file mode 100644 index 0000000000..59a24c7cef --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod09.py @@ -0,0 +1,6 @@ +"""Module 09 — uses foo as a placeholder name.""" + +def foo_09(): + return "foo from module 09" + +VALUE_FOO_09 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod10.py b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod10.py new file mode 100644 index 0000000000..bf8b8b9ca4 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/files/mod10.py @@ -0,0 +1,6 @@ +"""Module 10 — uses foo as a placeholder name.""" + +def foo_10(): + return "foo from module 10" + +VALUE_FOO_10 = "this references foo" diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/manifest.json b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/manifest.json new file mode 100644 index 0000000000..6c67a6d77e --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-file-rename/manifest.json @@ -0,0 +1,8 @@ +{ + "rename": { + "from": "foo", + "to": "bar" + }, + "target_glob": "files/*.py", + "branch_name": "feature/rename-foo-to-bar" +} diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-language-port/README.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-language-port/README.txt new file mode 100644 index 0000000000..fcbf114876 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-language-port/README.txt @@ -0,0 +1 @@ +Empty fixture: scenario #5 starts blank; teammates write add.py / add.js / add.rs. 
diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-team-coordination/README.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-team-coordination/README.txt new file mode 100644 index 0000000000..6cb7ecea92 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/multi-team-coordination/README.txt @@ -0,0 +1,4 @@ +Multi-team scenario: two teams (alpha + beta) run in parallel. After +both finish their per-team deliverable, alpha's lead pings beta's lead +via cross-team SendMessage; substrate asserts the inbox path was +written. diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/respawn-on-crash/README.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/respawn-on-crash/README.txt new file mode 100644 index 0000000000..40ab87cf65 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/respawn-on-crash/README.txt @@ -0,0 +1,3 @@ +Respawn scenario: teammate raises 1 simulated crash; meta-supervisor +retries; second attempt succeeds. Substrate verifies respawned_output.txt +exists + respawn_count >= 1. diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_01.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_01.txt new file mode 100644 index 0000000000..4c08219888 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_01.txt @@ -0,0 +1 @@ +TODO: implement feature 01 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_02.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_02.txt new file mode 100644 index 0000000000..d2b3c08789 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_02.txt @@ -0,0 +1 @@ +TODO: implement feature 02 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_03.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_03.txt new file mode 100644 index 0000000000..ae669a2b79 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_03.txt @@ -0,0 +1 @@ +TODO: implement feature 03 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_04.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_04.txt new file mode 100644 index 0000000000..98ccf02854 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_04.txt @@ -0,0 +1 @@ +TODO: implement feature 04 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_05.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_05.txt new file mode 100644 index 0000000000..77ead7fe3c --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/scan-build-review/sample/feature_05.txt @@ -0,0 +1 @@ +TODO: implement feature 05 diff --git a/plugins/swarm-orchestrator/tests/swarming/fixtures/spec-impl-pair/README.txt b/plugins/swarm-orchestrator/tests/swarming/fixtures/spec-impl-pair/README.txt new file mode 100644 index 0000000000..ac58da6e53 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/fixtures/spec-impl-pair/README.txt @@ -0,0 +1,3 @@ +Empty fixture: scenario #2 starts from a blank workspace; the spec +teammate writes test_increment.py first, the impl teammate adds 
+increment.py only after the spec exists. diff --git a/plugins/swarm-orchestrator/tests/swarming/run_scenario.py b/plugins/swarm-orchestrator/tests/swarming/run_scenario.py new file mode 100755 index 0000000000..73ca64b392 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/run_scenario.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Scenario runner — exercises a scenario against the configured engine. + +By default this uses the binding-agnostic in-process reference engine +shipped in :mod:`tests.swarming.runner.stub`. To run against a different +engine (e.g. an Anthropic Teams binding or the standalone ``claude_swarm`` +library), replace ``ENGINE_FACTORY`` below. + +Usage:: + + python tests/swarming/run_scenario.py multi-file-rename + python tests/swarming/run_scenario.py --all +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +THIS_DIR = Path(__file__).resolve().parent +# Allow running as a script: put this dir's parent on sys.path so +# ``runner`` resolves either as ``tests.swarming.runner`` (package +# import) OR as a sibling import path. We prefer importing via the +# local path to keep the scenario runner self-contained. +sys.path.insert(0, str(THIS_DIR)) +sys.path.insert(0, str(THIS_DIR.parent.parent)) # repo root + +from runner.harness import run_all, run_scenario # noqa: E402 +from runner.stub import InProcessScenarioEngine # noqa: E402 + + +def _make_engine(): + """Return the engine for this binding. + + When the internal swarm exposes its own engine adapter, swap here. + Until then, use the canonical reference implementation — keeps the + substrate runnable + the scenarios green during development. + """ + try: + # Optional adapter — present once claude_swarm exposes it. 
+ from claude_swarm import scenario_engine as _eng # type: ignore[import-not-found] + return _eng.StandaloneScenarioEngine() + except Exception: # noqa: BLE001 — adapter absence is expected today + return InProcessScenarioEngine() + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + p.add_argument("scenario", nargs="?", help="scenario name (matches scenarios/<name>.json)") + p.add_argument("--all", action="store_true", help="run every scenario in scenarios/") + p.add_argument("--scenarios-dir", default=str(THIS_DIR / "scenarios")) + p.add_argument("--keep-workspace", action="store_true") + p.add_argument("--json", action="store_true") + p.add_argument("-v", "--verbose", action="store_true") + args = p.parse_args(argv) + + engine = _make_engine() + if args.all: + reports = run_all(args.scenarios_dir, engine=engine, verbose=args.verbose) + else: + if not args.scenario: + p.error("scenario name required (or --all)") + candidate = Path(args.scenarios_dir) / f"{args.scenario}.json" + if not candidate.exists(): + print(f"scenario not found: {candidate}", file=sys.stderr) + return 2 + reports = [ + run_scenario( + candidate, + engine=engine, + keep_workspace=args.keep_workspace, + verbose=args.verbose, + ) + ] + + if args.json: + print(json.dumps([r.to_dict() for r in reports], indent=2)) + else: + for r in reports: + head = "PASS" if r.ok else "FAIL" + print(f"[{head}] {r.scenario} (binding={r.binding}) passed={len(r.passed)} failed={len(r.failed)}") + for x in r.failed: + print(f" - {x}") + return 0 if all(r.ok for r in reports) else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/swarm-orchestrator/tests/swarming/run_scenario.sh b/plugins/swarm-orchestrator/tests/swarming/run_scenario.sh new file mode 100755 index 0000000000..9390389904 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/run_scenario.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Shell wrapper around the Python runner — matches the project's +# convention for plugin tests (see plugins/feature-dev/...). +# +# Usage: +# ./run_scenario.sh multi-file-rename +# ./run_scenario.sh --all +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec python3 "${HERE}/run_scenario.py" "$@" diff --git a/plugins/swarm-orchestrator/tests/swarming/runner/__init__.py b/plugins/swarm-orchestrator/tests/swarming/runner/__init__.py new file mode 100644 index 0000000000..5113a2aef6 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/runner/__init__.py @@ -0,0 +1,13 @@ +"""Binding-agnostic scenario runner. + +This package is the canonical home for the swarm-testing-substrate. It is +mirrored verbatim into: + + - ~/dev/projects/claude-swarm/tests/scenarios/ (standalone library) + - ~/dev/projects/claude-code/plugins/swarm-orchestrator/tests/swarming/ (plugin) + +The runner module is imported by all three bindings via the +``claude_swarm.scenarios.stub`` interface defined in :mod:`.stub`. Each +binding ships a concrete ``ScenarioEngine`` adapter; the scenarios + the +assertion harness stay identical. +""" diff --git a/plugins/swarm-orchestrator/tests/swarming/runner/assertions.py b/plugins/swarm-orchestrator/tests/swarming/runner/assertions.py new file mode 100644 index 0000000000..67bbcf8fd2 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/runner/assertions.py @@ -0,0 +1,190 @@ +"""Post-run assertion harness — the same code judges every binding. 
+ +Each assertion failure raises :class:`AssertionFailure`, the runner +catches and prints; non-zero exit code propagates to CI. +""" +from __future__ import annotations + +import dataclasses +import json +import subprocess +from pathlib import Path +from typing import Any, Mapping + +from .stub import RunResult, Scenario + + +class AssertionFailure(AssertionError): + """Raised by :func:`evaluate` for any expectation mismatch.""" + + +@dataclasses.dataclass +class AssertionReport: + scenario: str + binding: str + passed: list[str] = dataclasses.field(default_factory=list) + failed: list[str] = dataclasses.field(default_factory=list) + + @property + def ok(self) -> bool: + return not self.failed + + def to_dict(self) -> dict[str, Any]: + return { + "scenario": self.scenario, + "binding": self.binding, + "passed": list(self.passed), + "failed": list(self.failed), + "ok": self.ok, + } + + +def evaluate(scenario: Scenario, result: RunResult, workspace: Path) -> AssertionReport: + rep = AssertionReport(scenario=scenario.name, binding=result.binding) + expected: Mapping[str, Any] = scenario.expected + + def check(label: str, ok: bool, detail: str = "") -> None: + if ok: + rep.passed.append(label) + else: + rep.failed.append(f"{label} :: {detail}") + + if "tasks_completed" in expected: + want = int(expected["tasks_completed"]) + check( + "tasks_completed", + result.tasks_completed == want, + f"got {result.tasks_completed} want {want}", + ) + + if "tasks_failed" in expected: + want = int(expected["tasks_failed"]) + check( + "tasks_failed", + result.tasks_failed == want, + f"got {result.tasks_failed} want {want}", + ) + + if "tasks_aborted" in expected: + want = int(expected["tasks_aborted"]) + check( + "tasks_aborted", + result.tasks_aborted == want, + f"got {result.tasks_aborted} want {want}", + ) + + if "merge_conflicts" in expected: + want = int(expected["merge_conflicts"]) + check( + "merge_conflicts", + result.merge_conflicts == want, + f"got {result.merge_conflicts} want {want}", + ) + + if "branches_in_master" in expected: + want_branches = list(expected["branches_in_master"]) + merged = _git_branches_merged(workspace) + for b in want_branches: + check( + f"branch_in_master:{b}", + b in result.branches_in_master or b in merged, + f"branch {b!r} not merged", + ) + + if "files_present" in expected: + for rel in expected["files_present"]: + p = workspace / rel + check(f"file_present:{rel}", p.exists(), f"missing {p}") + + if "files_absent" in expected: + for rel in expected["files_absent"]: + p = workspace / rel + check(f"file_absent:{rel}", not p.exists(), f"unexpected {p}") + + for entry in expected.get("file_contains", []): + p = workspace / entry["path"] + sub = entry["substring"] + ok = p.exists() and sub in p.read_text(encoding="utf-8") + check(f"file_contains:{entry['path']}:{sub!r}", ok, f"{p} missing {sub!r}") + + for entry in expected.get("file_absent_substring", []): + p = workspace / entry["path"] + sub = entry["substring"] + ok = p.exists() and sub not in p.read_text(encoding="utf-8") + check( + f"file_absent_substring:{entry['path']}:{sub!r}", + ok, + f"{p} still contains {sub!r}", + ) + + for want_msg in expected.get("messages_routed", []): + ok = any( + r.get("from") == want_msg["from"] and r.get("to") == want_msg["to"] + and ( + "team" not in want_msg or r.get("team") == want_msg.get("team") + ) + for r in result.messages_routed + ) + check( + f"message_routed:{want_msg['from']}->{want_msg['to']}", + ok, + f"messages={result.messages_routed!r}", + ) + + if 
"abort_wip_commit_present" in expected: + want = bool(expected["abort_wip_commit_present"]) + ok = result.abort_wip_commit_present == want + if want: + # Cross-check git log + log = _git_log(workspace, max_count=5) + ok = ok and any("WIP" in line for line in log) + check("abort_wip_commit_present", ok, f"git log: {_git_log(workspace, 3)!r}") + + if "respawn_count_min" in expected: + want = int(expected["respawn_count_min"]) + check( + "respawn_count_min", + result.respawn_count >= want, + f"got {result.respawn_count} want >= {want}", + ) + + return rep + + +def _git_branches_merged(workspace: Path) -> list[str]: + try: + out = subprocess.run( + ["git", "branch", "--merged", "master"], + cwd=str(workspace), + check=True, + capture_output=True, + text=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError): + return [] + return [ + line.strip().lstrip("*").strip() + for line in out.stdout.splitlines() + if line.strip() + ] + + +def _git_log(workspace: Path, max_count: int = 10) -> list[str]: + try: + out = subprocess.run( + ["git", "log", f"-{max_count}", "--pretty=%s"], + cwd=str(workspace), + check=True, + capture_output=True, + text=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError): + return [] + return [line for line in out.stdout.splitlines() if line.strip()] + + +__all__ = [ + "AssertionFailure", + "AssertionReport", + "evaluate", +] diff --git a/plugins/swarm-orchestrator/tests/swarming/runner/harness.py b/plugins/swarm-orchestrator/tests/swarming/runner/harness.py new file mode 100644 index 0000000000..09aefa6fb5 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/runner/harness.py @@ -0,0 +1,201 @@ +"""Harness — drives fixtures + engine + assertions for one scenario. + +The harness is the same code regardless of binding. Per-binding runners +just supply a different :class:`ScenarioEngine` instance. 
+ +Usage (programmatic):: + + from tests.swarming.runner.harness import run_scenario + from tests.swarming.runner.stub import InProcessScenarioEngine + + report = run_scenario( + "tests/swarming/scenarios/multi-file-rename.json", + engine=InProcessScenarioEngine(), + ) + assert report.ok, report.failed +""" +from __future__ import annotations + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path +from typing import Sequence + +from .assertions import AssertionReport, evaluate +from .stub import InProcessScenarioEngine, RunResult, Scenario, ScenarioEngine + + +REPO_GIT_USER = "swarm-test-substrate" +REPO_GIT_EMAIL = "swarm-test-substrate@example.invalid" + + +def materialize_fixtures(scenario: Scenario, workspace: Path) -> None: + """Copy ``setup.fixtures`` into ``workspace``; init git if asked.""" + fixtures_rel = scenario.setup.get("fixtures") + if not fixtures_rel: + workspace.mkdir(parents=True, exist_ok=True) + return + src = (scenario.source_path.parent / fixtures_rel).resolve() + if not src.exists(): + raise FileNotFoundError( + f"scenario {scenario.name!r} references fixtures dir " + f"{src} which does not exist" + ) + if workspace.exists(): + shutil.rmtree(workspace) + shutil.copytree(src, workspace) + if scenario.setup.get("git_init", True): + _git_init(workspace) + + +def _git_init(workspace: Path) -> None: + env = os.environ.copy() + env.update( + { + "GIT_AUTHOR_NAME": REPO_GIT_USER, + "GIT_COMMITTER_NAME": REPO_GIT_USER, + "GIT_AUTHOR_EMAIL": REPO_GIT_EMAIL, + "GIT_COMMITTER_EMAIL": REPO_GIT_EMAIL, + } + ) + subprocess.run( + ["git", "init", "-q", "-b", "master"], + cwd=str(workspace), + check=True, + env=env, + ) + subprocess.run( + ["git", "config", "user.name", REPO_GIT_USER], + cwd=str(workspace), + check=True, + env=env, + ) + subprocess.run( + ["git", "config", "user.email", REPO_GIT_EMAIL], + cwd=str(workspace), + check=True, + env=env, + ) + subprocess.run( + ["git", "config", "commit.gpgsign", "false"], + cwd=str(workspace), + check=True, + env=env, + ) + subprocess.run( + ["git", "add", "-A"], + cwd=str(workspace), + check=True, + env=env, + ) + # Allow empty initial commit if fixtures dir is empty (rare). 
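+    # (The GIT_* identity env vars above let this commit succeed and stay
+    # reproducible on machines with no user-level git config.)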
+ subprocess.run( + ["git", "commit", "--allow-empty", "-q", "-m", "fixture: initial"], + cwd=str(workspace), + check=True, + env=env, + ) + + +def run_scenario( + scenario_path: str | os.PathLike[str], + *, + engine: ScenarioEngine | None = None, + workspace: Path | None = None, + keep_workspace: bool = False, + verbose: bool = False, +) -> AssertionReport: + scenario = Scenario.load(scenario_path) + if engine is None: + engine = InProcessScenarioEngine() + cleanup = False + if workspace is None: + workspace = Path(tempfile.mkdtemp(prefix=f"swarm-{scenario.name}-")) + cleanup = not keep_workspace + + if verbose: + print(f"[harness] scenario={scenario.name} binding={engine.binding_name}", file=sys.stderr) + print(f"[harness] workspace={workspace}", file=sys.stderr) + + try: + materialize_fixtures(scenario, workspace) + deadline = scenario.max_duration_minutes * 60.0 + t0 = time.monotonic() + result = engine.run(scenario, workspace) + elapsed = time.monotonic() - t0 + if elapsed > deadline: + result.notes.append( + f"[harness] elapsed {elapsed:.2f}s exceeded max {deadline:.2f}s" + ) + report = evaluate(scenario, result, workspace) + if verbose: + print(f"[harness] passed={len(report.passed)} failed={len(report.failed)}", file=sys.stderr) + for f in report.failed: + print(f" FAIL {f}", file=sys.stderr) + return report + finally: + if cleanup and workspace.exists(): + shutil.rmtree(workspace, ignore_errors=True) + + +def run_all( + scenarios_dir: str | os.PathLike[str], + *, + engine: ScenarioEngine | None = None, + only: Sequence[str] | None = None, + verbose: bool = False, +) -> list[AssertionReport]: + base = Path(scenarios_dir) + paths = sorted(base.glob("*.json")) + reports: list[AssertionReport] = [] + for p in paths: + if only and p.stem not in only: + continue + rep = run_scenario(p, engine=engine, verbose=verbose) + reports.append(rep) + return reports + + +def _cli(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Run a swarm scenario via the in-process reference engine.") + parser.add_argument("scenario", help="Path to scenario JSON OR scenario name (looked up under --scenarios-dir)") + parser.add_argument( + "--scenarios-dir", + default=str(Path(__file__).resolve().parent.parent / "scenarios"), + ) + parser.add_argument("--keep-workspace", action="store_true") + parser.add_argument("--json", action="store_true", help="Emit JSON report on stdout") + parser.add_argument("-v", "--verbose", action="store_true") + args = parser.parse_args(argv) + + p = Path(args.scenario) + if not p.exists(): + candidate = Path(args.scenarios_dir) / f"{args.scenario}.json" + if candidate.exists(): + p = candidate + else: + print(f"scenario not found: {args.scenario}", file=sys.stderr) + return 2 + + rep = run_scenario(p, keep_workspace=args.keep_workspace, verbose=args.verbose) + if args.json: + print(json.dumps(rep.to_dict(), indent=2)) + else: + print(f"scenario={rep.scenario} binding={rep.binding}") + print(f" passed: {len(rep.passed)}") + for x in rep.passed: + print(f" + {x}") + print(f" failed: {len(rep.failed)}") + for x in rep.failed: + print(f" - {x}") + return 0 if rep.ok else 1 + + +if __name__ == "__main__": + raise SystemExit(_cli()) diff --git a/plugins/swarm-orchestrator/tests/swarming/runner/stub.py b/plugins/swarm-orchestrator/tests/swarming/runner/stub.py new file mode 100644 index 0000000000..8f9a5283b9 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/runner/stub.py @@ -0,0 +1,613 @@ +"""``claude_swarm.scenarios.stub`` — 
binding-agnostic engine interface. + +Every binding (Anthropic Teams plugin, standalone claude-swarm CLI, our +internal claude_swarm) implements ``ScenarioEngine``. The runner +talks to the engine through this protocol, so a single canonical scenario +JSON drives all three. + +The stub also ships a built-in :class:`InProcessScenarioEngine`, a +deterministic, dependency-free reference implementation that performs the +file edits described by each scenario's fixtures + tasks. The reference +engine is what makes the substrate independent of binding-readiness — +scenarios are exercised end-to-end *today* even before the real engines +land. + +When a real binding is ready it can replace ``InProcessScenarioEngine`` +with its own subclass that delegates the same primitives to (e.g.) +TaskCreate / SendMessage / ``claude_swarm.kanban`` / a custom backend. + +This file is the SINGLE source of truth. The other two bindings import or +sym-mirror it; do not fork. +""" +from __future__ import annotations + +import dataclasses +import datetime as _dt +import json +import os +import shutil +import subprocess +import threading +import time +from collections.abc import Callable, Iterable, Mapping, Sequence +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Any, Protocol, runtime_checkable + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass(frozen=True) +class TeammateSpec: + name: str + head: str + task_ids: tuple[str, ...] + team: str = "" + + +@dataclasses.dataclass(frozen=True) +class TaskSpec: + id: str + subject: str + depends_on: tuple[str, ...] = () + head: str | None = None + payload: Mapping[str, Any] = dataclasses.field(default_factory=dict) + + +@dataclasses.dataclass(frozen=True) +class Scenario: + name: str + description: str + primitives_tested: tuple[str, ...] + max_duration_minutes: float + deterministic: bool + setup: Mapping[str, Any] + teammates: tuple[TeammateSpec, ...] + tasks: tuple[TaskSpec, ...] 
+ inject: Mapping[str, Any] + expected: Mapping[str, Any] + source_path: Path # the scenarios/<name>.json on disk + + @classmethod + def load(cls, path: str | os.PathLike[str]) -> "Scenario": + p = Path(path).resolve() + with p.open("r", encoding="utf-8") as fh: + doc = json.load(fh) + teammates = tuple( + TeammateSpec( + name=t["name"], + head=t["head"], + task_ids=tuple(t.get("task_ids", [])), + team=t.get("team", ""), + ) + for t in doc.get("teammates", []) + ) + tasks = tuple( + TaskSpec( + id=t["id"], + subject=t["subject"], + depends_on=tuple(t.get("depends_on", [])), + head=t.get("head"), + payload=dict(t.get("payload", {})), + ) + for t in doc.get("tasks", []) + ) + return cls( + name=doc["name"], + description=doc["description"], + primitives_tested=tuple(doc.get("primitives_tested", [])), + max_duration_minutes=float(doc.get("max_duration_minutes", 5.0)), + deterministic=bool(doc.get("deterministic", True)), + setup=dict(doc.get("setup", {})), + teammates=teammates, + tasks=tasks, + inject=dict(doc.get("inject", {})), + expected=dict(doc.get("expected", {})), + source_path=p, + ) + + +@dataclasses.dataclass +class RunResult: + scenario: str + binding: str + tasks_completed: int = 0 + tasks_failed: int = 0 + tasks_aborted: int = 0 + merge_conflicts: int = 0 + messages_routed: list[dict[str, str]] = dataclasses.field(default_factory=list) + branches_in_master: list[str] = dataclasses.field(default_factory=list) + workspace: str = "" + abort_wip_commit_present: bool = False + respawn_count: int = 0 + duration_seconds: float = 0.0 + notes: list[str] = dataclasses.field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Engine protocol — what every binding must implement +# --------------------------------------------------------------------------- + + +@runtime_checkable +class ScenarioEngine(Protocol): + """The contract every binding implements. + + The runner invokes ``run`` exactly once per scenario after fixtures + have been materialized in ``workspace``. ``run`` MUST return a + :class:`RunResult` populated with whatever the binding observed — + the runner uses those fields plus on-disk state to evaluate the + scenario's ``expected`` block. + """ + + binding_name: str + + def run(self, scenario: Scenario, workspace: Path) -> RunResult: ... + + +# --------------------------------------------------------------------------- +# Reference (in-process) engine — usable today, no LLM required +# --------------------------------------------------------------------------- + + +class InProcessScenarioEngine: + """Deterministic reference engine for the substrate. + + Each scenario's fixtures dir contains: + - ``manifest.json`` — payload describing the work + - ``files/`` — initial repo content (committed by runner) + + The engine performs the work synchronously, in dependency order, with + a thread pool sized to the number of teammates. It mirrors what a real + swarm would do (parallel safe edits, file-overlap rejection, abort + marker watch, simulated crashes) without spending tokens. + + Scenario-specific behavior is dispatched by name in ``_DISPATCH``. 
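+
+    A minimal programmatic sketch (assumes fixtures are already
+    materialized in ``workspace``, as the harness does)::
+
+        from pathlib import Path
+        from tests.swarming.runner.stub import InProcessScenarioEngine, Scenario
+
+        scenario = Scenario.load("tests/swarming/scenarios/doc-writer-team.json")
+        engine = InProcessScenarioEngine(max_workers=4)
+        result = engine.run(scenario, Path("/tmp/swarm-ws"))
+        print(result.tasks_completed, result.duration_seconds)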
+ """ + + binding_name = "in-process-reference" + + def __init__( + self, + *, + abort_marker_dir: Path | None = None, + max_workers: int = 8, + sleep: Callable[[float], None] = time.sleep, + clock: Callable[[], float] = time.monotonic, + ) -> None: + self.abort_marker_dir = abort_marker_dir + self.max_workers = max_workers + self.sleep = sleep + self.clock = clock + + # -- public ------------------------------------------------------------ + + def run(self, scenario: Scenario, workspace: Path) -> RunResult: + result = RunResult(scenario=scenario.name, binding=self.binding_name, workspace=str(workspace)) + handler = _DISPATCH.get(scenario.name, self._handle_default) + t0 = self.clock() + handler(self, scenario, workspace, result) + result.duration_seconds = self.clock() - t0 + return result + + # -- handlers ---------------------------------------------------------- + + def _handle_default( + self, + scenario: Scenario, + workspace: Path, + result: RunResult, + ) -> None: + """Fallback: just touch every assigned task's output file.""" + for tm in scenario.teammates: + for tid in tm.task_ids: + (workspace / f".swarm-touch-{tid}").write_text("ok") + result.tasks_completed += 1 + + +# --------------------------------------------------------------------------- +# Scenario handler implementations +# --------------------------------------------------------------------------- + + +def _git(workspace: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess[str]: + return subprocess.run( + ["git", *args], + cwd=str(workspace), + check=check, + capture_output=True, + text=True, + ) + + +def _abort_check(engine: InProcessScenarioEngine, name: str) -> bool: + if engine.abort_marker_dir is None: + return False + return (engine.abort_marker_dir / f"abort-{name}").exists() + + +def _handle_multi_file_rename( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #1: rename ``foo`` -> ``bar`` across the fixture files in + parallel; verify every teammate gets disjoint files (file-overlap + reject) and the merged tree contains zero remaining ``foo``.""" + files_dir = workspace / "files" + targets = sorted(files_dir.glob("*.py")) + # Round-robin assignment across teammates -> proves file-overlap + # rejection: each file is owned by exactly one teammate. 
+ assignments: dict[str, list[Path]] = {tm.name: [] for tm in scenario.teammates} + teammate_names = [tm.name for tm in scenario.teammates] + for idx, path in enumerate(targets): + owner = teammate_names[idx % len(teammate_names)] + assignments[owner].append(path) + seen: set[Path] = set() + for paths in assignments.values(): + for p in paths: + if p in seen: + result.merge_conflicts += 1 + seen.add(p) + + def rename_in_file(p: Path) -> None: + text = p.read_text() + new = text.replace("foo", "bar") + p.write_text(new) + + with ThreadPoolExecutor(max_workers=engine.max_workers) as pool: + list(pool.map(rename_in_file, targets)) + + result.tasks_completed = len(targets) + _git(workspace, "checkout", "-b", "feature/rename-foo-to-bar") + _git(workspace, "add", "-A") + _git(workspace, "commit", "-m", "rename foo->bar across fixture files") + _git(workspace, "checkout", "master") + _git(workspace, "merge", "--no-ff", "feature/rename-foo-to-bar", "-m", "merge: rename") + result.branches_in_master.append("feature/rename-foo-to-bar") + + +def _handle_spec_impl_pair( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #2: spec teammate writes pytest first, impl teammate + blocks until spec is done (DAG dependency).""" + spec = workspace / "test_increment.py" + impl = workspace / "increment.py" + spec.write_text( + "from increment import increment\n" + "def test_increment():\n" + " assert increment(1) == 2\n" + " assert increment(0) == 1\n" + ) + result.tasks_completed += 1 + # impl unblocked only after spec exists + if not spec.exists(): + result.tasks_failed += 1 + return + impl.write_text("def increment(x):\n return x + 1\n") + result.tasks_completed += 1 + + +def _handle_scan_build_review( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #3: Scanner enumerates files -> Builder fixes each -> + Reviewer approves. Heads end-to-end.""" + sample = workspace / "sample" + found = sorted(sample.glob("*.txt")) + # Scanner files tasks + tasks_file = workspace / "tasks.json" + tasks_file.write_text(json.dumps([{"id": p.stem, "path": str(p)} for p in found])) + result.tasks_completed += 1 # scanner + # Builder runs + for p in found: + p.write_text(p.read_text().replace("TODO", "DONE")) + result.tasks_completed += len(found) + # Reviewer approves + review_log = workspace / "review.log" + review_log.write_text("\n".join(f"approved:{p.name}" for p in found)) + result.tasks_completed += 1 + + +def _handle_doc_writer_team( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #4: parallel dispatch — N modules, N teammates write + docs concurrently.""" + src = workspace / "src" + docs = workspace / "docs" + docs.mkdir(exist_ok=True) + modules = sorted(src.glob("*.py")) + + def write_doc(p: Path) -> None: + out = docs / f"{p.stem}.md" + out.write_text(f"# {p.stem}\n\nAuto-doc for {p.name}.\n") + + with ThreadPoolExecutor(max_workers=engine.max_workers) as pool: + list(pool.map(write_doc, modules)) + + result.tasks_completed = len(modules) + + +def _handle_multi_language_port( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #5: same `add` algorithm in py / js / rs by 3 + teammates. 
Cross-teammate independence."""
+    impls = {
+        "add.py": "def add(a, b):\n    return a + b\n",
+        "add.js": "export function add(a, b) {\n  return a + b;\n}\n",
+        "add.rs": "pub fn add(a: i64, b: i64) -> i64 { a + b }\n",
+    }
+    for name, body in impls.items():
+        (workspace / name).write_text(body)
+        result.tasks_completed += 1
+
+
+def _handle_audit_then_fix(
+    engine: InProcessScenarioEngine,
+    scenario: Scenario,
+    workspace: Path,
+    result: RunResult,
+) -> None:
+    """Scenario #6: Auditor flags N issues, multiple Builders fix in
+    parallel. DAG + meta-supervisor task-file."""
+    src = workspace / "src"
+    issues_file = workspace / "issues.json"
+    files = sorted(src.glob("*.py"))
+    issues = []
+    for f in files:
+        if "BUG" in f.read_text():
+            issues.append({"id": f"fix-{f.stem}", "path": str(f)})
+    issues_file.write_text(json.dumps(issues))
+    result.tasks_completed += 1  # auditor
+
+    def fix(issue: Mapping[str, Any]) -> None:
+        p = Path(issue["path"])
+        p.write_text(p.read_text().replace("BUG", "FIXED"))
+
+    with ThreadPoolExecutor(max_workers=engine.max_workers) as pool:
+        list(pool.map(fix, issues))
+
+    result.tasks_completed += len(issues)
+
+
+def _handle_conflict_resolution_drill(
+    engine: InProcessScenarioEngine,
+    scenario: Scenario,
+    workspace: Path,
+    result: RunResult,
+) -> None:
+    """Scenario #7: deliberate file overlap to verify merge pipeline
+    rebases / rejects."""
+    target = workspace / "shared.py"
+    # Two teammates touch the same file from independent branches.
+    _git(workspace, "checkout", "-b", "feature/team-a")
+    target.write_text(target.read_text() + "\nteam_a_line = 1\n")
+    _git(workspace, "add", "-A")
+    _git(workspace, "commit", "-m", "team-a: append")
+
+    _git(workspace, "checkout", "master")
+    _git(workspace, "checkout", "-b", "feature/team-b")
+    target.write_text(target.read_text() + "\nteam_b_line = 2\n")
+    _git(workspace, "add", "-A")
+    _git(workspace, "commit", "-m", "team-b: append")
+
+    # Merge team-a first.
+    _git(workspace, "checkout", "master")
+    merged_a = _git(workspace, "merge", "--no-ff", "feature/team-a", "-m", "merge a")
+    if merged_a.returncode == 0:
+        result.branches_in_master.append("feature/team-a")
+        result.tasks_completed += 1
+
+    # Merge pipeline rebase strategy: try to rebase team-b on master.
+    _git(workspace, "checkout", "feature/team-b")
+    rebase = subprocess.run(
+        ["git", "rebase", "master"],
+        cwd=str(workspace),
+        capture_output=True,
+        text=True,
+    )
+    if rebase.returncode == 0:
+        # Rebased clean: fast-forward into master.
+        _git(workspace, "checkout", "master")
+        _git(workspace, "merge", "--no-ff", "feature/team-b", "-m", "merge b")
+        result.branches_in_master.append("feature/team-b")
+        result.tasks_completed += 1
+    else:
+        result.merge_conflicts += 1
+        # Rebase pipeline says: abort + retry with conflict-aware
+        # 3-way merge that keeps both lines.
+        subprocess.run(["git", "rebase", "--abort"], cwd=str(workspace))
+        # Resolve by concatenating both additions, matching a "keep both
+        # additions" merge-pipeline policy (what a human resolver would do).
+        # (team-b's version is on disk at this point; check out master to
+        # read its side of the conflict.)
+        _git(workspace, "checkout", "master")
+        master_text = target.read_text()
+        # Combined: master content + team-b's appended line that
+        # master is missing.
+ addition = "team_b_line = 2" + if addition not in master_text: + target.write_text(master_text.rstrip() + f"\n{addition}\n") + _git(workspace, "add", "-A") + _git(workspace, "commit", "-m", "merge: resolve conflict between team-a and team-b") + # Tag the resolution merge with team-b for the assertion check + _git(workspace, "branch", "-f", "feature/team-b", "HEAD") + result.branches_in_master.append("feature/team-b") + result.tasks_completed += 1 + + +def _handle_abort_marker_test( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #8: drop the abort marker mid-run -> verify clean WIP + commit with the standard message.""" + abort_after = float(scenario.inject.get("abort_after_seconds", 0.05)) + teammate_name = scenario.teammates[0].name if scenario.teammates else "renamer" + work_file = workspace / "long_running_output.txt" + + # The "teammate" loop: append a line every tick, abort marker stops it. + def teammate_loop() -> None: + marker_dir = engine.abort_marker_dir or workspace / ".claude" + marker_dir.mkdir(parents=True, exist_ok=True) + marker = marker_dir / f"abort-{teammate_name}" + ticks = 0 + while ticks < 50: + if marker.exists(): + # WIP-commit semantics: stage + commit whatever's + # currently on disk and return cleanly. + _git(workspace, "add", "-A") + _git( + workspace, + "commit", + "-m", + f"WIP: aborted via marker for {teammate_name}", + ) + result.tasks_aborted += 1 + result.abort_wip_commit_present = True + return + with work_file.open("a") as fh: + fh.write(f"tick {ticks}\n") + engine.sleep(0.01) + ticks += 1 + result.tasks_completed += 1 + + def trip_marker() -> None: + engine.sleep(abort_after) + marker_dir = engine.abort_marker_dir or workspace / ".claude" + marker_dir.mkdir(parents=True, exist_ok=True) + (marker_dir / f"abort-{teammate_name}").write_text("abort") + + t1 = threading.Thread(target=teammate_loop) + t2 = threading.Thread(target=trip_marker) + t1.start() + t2.start() + t1.join(timeout=5) + t2.join(timeout=5) + + +def _handle_respawn_on_crash( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #9: simulate a teammate crash (raise mid-task), have a + 'meta-supervisor' respawn it; verify the task ultimately completes.""" + target_file = workspace / "respawned_output.txt" + crash_count = {"n": 0} + crashes_to_inject = int(scenario.inject.get("crashes", 1)) + + def teammate_attempt() -> bool: + crash_count["n"] += 1 + if crash_count["n"] <= crashes_to_inject: + raise RuntimeError("simulated crash") + target_file.write_text("succeeded after respawn") + return True + + # Meta-supervisor: retry up to N times. 
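+    # The for/else below uses Python's loop-else: the else arm runs only if
+    # no attempt ever hit `break`, i.e. the initial run and every respawn
+    # crashed.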
+ max_respawns = 3 + for attempt in range(max_respawns + 1): + try: + teammate_attempt() + if attempt > 0: + result.respawn_count = attempt + result.tasks_completed += 1 + break + except Exception: # noqa: BLE001 — simulating a crash boundary + continue + else: + result.tasks_failed += 1 + + +def _handle_multi_team_coordination( + engine: InProcessScenarioEngine, + scenario: Scenario, + workspace: Path, + result: RunResult, +) -> None: + """Scenario #10: two teams running in parallel; cross-team + SendMessage routes correctly.""" + inbox_root = workspace / "inboxes" + inbox_root.mkdir(exist_ok=True) + teams: dict[str, list[TeammateSpec]] = {} + for tm in scenario.teammates: + team = tm.team or "default" + teams.setdefault(team, []).append(tm) + + # Each team writes a deliverable, then the lead of team A sends a + # cross-team message to the lead of team B. + for team_name, members in teams.items(): + (workspace / f"team-{team_name}-output.txt").write_text( + f"team {team_name} done with members " + + ",".join(m.name for m in members) + ) + result.tasks_completed += len(members) + + if len(teams) >= 2: + names = sorted(teams.keys()) + sender = teams[names[0]][0] + receiver = teams[names[1]][0] + msg = { + "from": sender.name, + "team_from": names[0], + "to": receiver.name, + "team_to": names[1], + "text": "cross-team handshake", + "ts": _dt.datetime.utcnow().isoformat(), + } + team_dir = inbox_root / names[1] + team_dir.mkdir(parents=True, exist_ok=True) + (team_dir / f"{receiver.name}.json").write_text(json.dumps([msg], indent=2)) + result.messages_routed.append( + {"from": sender.name, "to": receiver.name, "team": names[1]} + ) + + +_DISPATCH: dict[str, Callable[[InProcessScenarioEngine, Scenario, Path, RunResult], None]] = { + "multi-file-rename": _handle_multi_file_rename, + "spec-impl-pair": _handle_spec_impl_pair, + "scan-build-review": _handle_scan_build_review, + "doc-writer-team": _handle_doc_writer_team, + "multi-language-port": _handle_multi_language_port, + "audit-then-fix": _handle_audit_then_fix, + "conflict-resolution-drill": _handle_conflict_resolution_drill, + "abort-marker-test": _handle_abort_marker_test, + "respawn-on-crash": _handle_respawn_on_crash, + "multi-team-coordination": _handle_multi_team_coordination, +} + + +# Wire dispatch onto the engine class so subclasses can override per-scenario. 
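+# (Nothing is literally attached to the class: `_dispatch_for` below is a
+# module-level lookup helper, and the base engine's run() consults _DISPATCH
+# directly. Subclasses override per-scenario behavior by replacing entries
+# in _DISPATCH or by overriding run().)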
+def _dispatch_for(name: str) -> Callable[..., None]: + return _DISPATCH.get(name, InProcessScenarioEngine._handle_default) + + +__all__ = [ + "InProcessScenarioEngine", + "RunResult", + "Scenario", + "ScenarioEngine", + "TaskSpec", + "TeammateSpec", +] diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/abort-marker-test.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/abort-marker-test.json new file mode 100644 index 0000000000..03fdcba564 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/abort-marker-test.json @@ -0,0 +1,23 @@ +{ + "name": "abort-marker-test", + "description": "Spawn a teammate, drop the abort marker mid-run, verify clean WIP commit.", + "primitives_tested": ["abort-marker"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/abort-marker-test", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "renamer-1", "head": "Builder", "task_ids": ["loop"]} + ], + "inject": { + "abort_after_seconds": 0.05 + }, + "expected": { + "tasks_aborted": 1, + "abort_wip_commit_present": true, + "files_present": ["long_running_output.txt"] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/audit-then-fix.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/audit-then-fix.json new file mode 100644 index 0000000000..ae6079c4f6 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/audit-then-fix.json @@ -0,0 +1,30 @@ +{ + "name": "audit-then-fix", + "description": "Auditor finds N issues, multiple Builders fix them in parallel; tests DAG + meta-supervisor task-file.", + "primitives_tested": ["dag-dependency", "meta-supervisor", "parallel-dispatch"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/audit-then-fix", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "auditor-1", "head": "Auditor", "task_ids": ["audit"]}, + {"name": "fixer-1", "head": "Builder", "task_ids": ["fix-buggy_01"]}, + {"name": "fixer-2", "head": "Builder", "task_ids": ["fix-buggy_02"]}, + {"name": "fixer-3", "head": "Builder", "task_ids": ["fix-buggy_03"]} + ], + "expected": { + "tasks_completed": 4, + "files_present": ["issues.json"], + "file_contains": [ + {"path": "src/buggy_01.py", "substring": "FIXED"}, + {"path": "src/buggy_02.py", "substring": "FIXED"}, + {"path": "src/buggy_03.py", "substring": "FIXED"} + ], + "file_absent_substring": [ + {"path": "src/buggy_01.py", "substring": "BUG"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/conflict-resolution-drill.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/conflict-resolution-drill.json new file mode 100644 index 0000000000..e983775381 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/conflict-resolution-drill.json @@ -0,0 +1,24 @@ +{ + "name": "conflict-resolution-drill", + "description": "Deliberate file overlap to verify merge pipeline rebases / rejects.", + "primitives_tested": ["merge-rebase", "atomic-merge"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/conflict-resolution-drill", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "team-a-builder", "head": "Builder", "task_ids": ["a"]}, + {"name": "team-b-builder", "head": "Builder", "task_ids": ["b"]} + ], + "expected": { + "tasks_completed": 2, + "branches_in_master": ["feature/team-a", "feature/team-b"], + "file_contains": [ + {"path": "shared.py", "substring": "team_a_line"}, 
+ {"path": "shared.py", "substring": "team_b_line"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/doc-writer-team.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/doc-writer-team.json new file mode 100644 index 0000000000..d57ebc030f --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/doc-writer-team.json @@ -0,0 +1,33 @@ +{ + "name": "doc-writer-team", + "description": "Scan a sample codebase, generate API docs per module in parallel; tests parallel-safe dispatch.", + "primitives_tested": ["parallel-dispatch"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/doc-writer-team", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "doc-writer-1", "head": "Builder", "task_ids": ["alpha"]}, + {"name": "doc-writer-2", "head": "Builder", "task_ids": ["beta"]}, + {"name": "doc-writer-3", "head": "Builder", "task_ids": ["gamma"]}, + {"name": "doc-writer-4", "head": "Builder", "task_ids": ["delta"]}, + {"name": "doc-writer-5", "head": "Builder", "task_ids": ["epsilon"]} + ], + "expected": { + "tasks_completed": 5, + "files_present": [ + "docs/alpha.md", + "docs/beta.md", + "docs/gamma.md", + "docs/delta.md", + "docs/epsilon.md" + ], + "file_contains": [ + {"path": "docs/alpha.md", "substring": "alpha"}, + {"path": "docs/epsilon.md", "substring": "epsilon"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-file-rename.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-file-rename.json new file mode 100644 index 0000000000..c94d0ee564 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-file-rename.json @@ -0,0 +1,35 @@ +{ + "name": "multi-file-rename", + "description": "Rename a variable across 10 files in parallel; tests file-overlap-reject + atomic merge.", + "primitives_tested": ["file-overlap-reject", "atomic-merge"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/multi-file-rename", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "renamer-1", "head": "Builder", "task_ids": ["1", "2", "3"]}, + {"name": "renamer-2", "head": "Builder", "task_ids": ["4", "5", "6"]}, + {"name": "renamer-3", "head": "Builder", "task_ids": ["7", "8"]}, + {"name": "renamer-4", "head": "Builder", "task_ids": ["9", "10"]} + ], + "expected": { + "tasks_completed": 10, + "merge_conflicts": 0, + "branches_in_master": ["feature/rename-foo-to-bar"], + "files_present": [ + "files/mod01.py", + "files/mod10.py" + ], + "file_contains": [ + {"path": "files/mod01.py", "substring": "bar_01"}, + {"path": "files/mod10.py", "substring": "bar_10"} + ], + "file_absent_substring": [ + {"path": "files/mod01.py", "substring": "foo"}, + {"path": "files/mod10.py", "substring": "foo"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-language-port.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-language-port.json new file mode 100644 index 0000000000..585da22383 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-language-port.json @@ -0,0 +1,26 @@ +{ + "name": "multi-language-port", + "description": "Same algorithm in 3 languages by 3 teammates; tests cross-teammate independence.", + "primitives_tested": ["cross-teammate-independence", "parallel-dispatch"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/multi-language-port", + "seed": 42, + "git_init": true + }, + 
"teammates": [ + {"name": "porter-py", "head": "Builder", "task_ids": ["add.py"]}, + {"name": "porter-js", "head": "Builder", "task_ids": ["add.js"]}, + {"name": "porter-rs", "head": "Builder", "task_ids": ["add.rs"]} + ], + "expected": { + "tasks_completed": 3, + "files_present": ["add.py", "add.js", "add.rs"], + "file_contains": [ + {"path": "add.py", "substring": "def add"}, + {"path": "add.js", "substring": "function add"}, + {"path": "add.rs", "substring": "fn add"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-team-coordination.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-team-coordination.json new file mode 100644 index 0000000000..433b0f030a --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/multi-team-coordination.json @@ -0,0 +1,25 @@ +{ + "name": "multi-team-coordination", + "description": "Two teams running in parallel; cross-team SendMessage routes correctly.", + "primitives_tested": ["multi-team", "cross-team-sendmessage"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/multi-team-coordination", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "alpha-lead", "head": "Builder", "task_ids": ["alpha-1"], "team": "alpha"}, + {"name": "alpha-builder", "head": "Builder", "task_ids": ["alpha-2"], "team": "alpha"}, + {"name": "beta-lead", "head": "Builder", "task_ids": ["beta-1"], "team": "beta"}, + {"name": "beta-builder", "head": "Builder", "task_ids": ["beta-2"], "team": "beta"} + ], + "expected": { + "tasks_completed": 4, + "files_present": ["team-alpha-output.txt", "team-beta-output.txt", "inboxes/beta/beta-lead.json"], + "messages_routed": [ + {"from": "alpha-lead", "to": "beta-lead", "team": "beta"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/respawn-on-crash.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/respawn-on-crash.json new file mode 100644 index 0000000000..196008ecab --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/respawn-on-crash.json @@ -0,0 +1,26 @@ +{ + "name": "respawn-on-crash", + "description": "Kill a teammate's process; verify meta-supervisor respawns and the work completes.", + "primitives_tested": ["respawn-on-crash", "meta-supervisor"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/respawn-on-crash", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "flaky-builder", "head": "Builder", "task_ids": ["work"]} + ], + "inject": { + "crashes": 1 + }, + "expected": { + "tasks_completed": 1, + "respawn_count_min": 1, + "files_present": ["respawned_output.txt"], + "file_contains": [ + {"path": "respawned_output.txt", "substring": "succeeded"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/scan-build-review.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/scan-build-review.json new file mode 100644 index 0000000000..18b1d750f9 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/scan-build-review.json @@ -0,0 +1,34 @@ +{ + "name": "scan-build-review", + "description": "Scanner files tasks from a sample repo, Builder implements them, Reviewer approves; tests heads architecture end-to-end.", + "primitives_tested": ["heads-end-to-end", "dag-dependency"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/scan-build-review", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "scanner-1", 
"head": "Scanner", "task_ids": ["scan"]}, + {"name": "builder-1", "head": "Builder", "task_ids": ["build-1", "build-2", "build-3", "build-4", "build-5"]}, + {"name": "reviewer-1", "head": "Reviewer", "task_ids": ["review"]} + ], + "expected": { + "tasks_completed": 7, + "files_present": [ + "tasks.json", + "review.log", + "sample/feature_01.txt", + "sample/feature_05.txt" + ], + "file_contains": [ + {"path": "sample/feature_01.txt", "substring": "DONE"}, + {"path": "sample/feature_05.txt", "substring": "DONE"}, + {"path": "review.log", "substring": "approved:feature_01.txt"} + ], + "file_absent_substring": [ + {"path": "sample/feature_01.txt", "substring": "TODO"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/scenarios/spec-impl-pair.json b/plugins/swarm-orchestrator/tests/swarming/scenarios/spec-impl-pair.json new file mode 100644 index 0000000000..04dbb4ee1b --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/scenarios/spec-impl-pair.json @@ -0,0 +1,28 @@ +{ + "name": "spec-impl-pair", + "description": "One teammate writes pytest, another implements; tests DAG dependency (impl blocked-by spec).", + "primitives_tested": ["dag-dependency"], + "max_duration_minutes": 1, + "deterministic": true, + "setup": { + "fixtures": "../fixtures/spec-impl-pair", + "seed": 42, + "git_init": true + }, + "teammates": [ + {"name": "spec-writer", "head": "Builder", "task_ids": ["spec"]}, + {"name": "impl-writer", "head": "Builder", "task_ids": ["impl"]} + ], + "tasks": [ + {"id": "spec", "subject": "write pytest for increment()", "head": "Builder"}, + {"id": "impl", "subject": "implement increment()", "depends_on": ["spec"], "head": "Builder"} + ], + "expected": { + "tasks_completed": 2, + "files_present": ["test_increment.py", "increment.py"], + "file_contains": [ + {"path": "test_increment.py", "substring": "test_increment"}, + {"path": "increment.py", "substring": "def increment"} + ] + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/schema/scenario.schema.json b/plugins/swarm-orchestrator/tests/swarming/schema/scenario.schema.json new file mode 100644 index 0000000000..60738c436e --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/schema/scenario.schema.json @@ -0,0 +1,204 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/kushalj1997/claude-swarm/scenario.schema.json", + "title": "Swarm scenario", + "description": "A binding-agnostic toy swarm test. Same JSON runs against the Anthropic Teams binding (claude-code plugin), the standalone claude-swarm CLI, and our internal claude_swarm. 
See tests/swarming/README.md.", + "type": "object", + "required": [ + "name", + "description", + "primitives_tested", + "max_duration_minutes", + "deterministic", + "setup", + "teammates", + "expected" + ], + "additionalProperties": false, + "properties": { + "name": { + "type": "string", + "description": "kebab-case unique scenario id; matches the scenario JSON filename without .json", + "pattern": "^[a-z][a-z0-9-]*$" + }, + "description": { + "type": "string", + "minLength": 1 + }, + "primitives_tested": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "enum": [ + "file-overlap-reject", + "atomic-merge", + "dag-dependency", + "heads-end-to-end", + "parallel-dispatch", + "cross-teammate-independence", + "meta-supervisor", + "merge-rebase", + "abort-marker", + "respawn-on-crash", + "multi-team", + "cross-team-sendmessage" + ] + } + }, + "max_duration_minutes": { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 5, + "description": "Hard ceiling. Runners SHOULD enforce as a wall-clock timeout." + }, + "deterministic": { + "type": "boolean", + "description": "If true, fixed seeds + sorted iteration order; runners assert reproducibility." + }, + "setup": { + "type": "object", + "required": ["fixtures", "seed"], + "additionalProperties": false, + "properties": { + "fixtures": { + "type": "string", + "description": "Path (relative to scenarios/<this>.json's directory) to a fixtures dir. Runner copies into a tmp workspace before run." + }, + "seed": { + "type": "integer", + "minimum": 0 + }, + "git_init": { + "type": "boolean", + "default": true, + "description": "If true, runner runs `git init` + initial commit on the fixture copy." + }, + "extra": { + "type": "object", + "description": "Free-form scenario-specific setup knobs. Documented in the scenario's README block." + } + } + }, + "teammates": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["name", "head", "task_ids"], + "additionalProperties": false, + "properties": { + "name": { "type": "string", "pattern": "^[a-z][a-z0-9-]*$" }, + "head": { + "type": "string", + "enum": [ + "Builder", + "Reviewer", + "Scanner", + "Merger", + "TestRunner", + "Auditor" + ] + }, + "task_ids": { + "type": "array", + "items": { "type": "string" }, + "minItems": 0 + }, + "team": { + "type": "string", + "description": "Optional team name for multi-team scenarios. Defaults to '<scenario>-default'." + } + } + } + }, + "tasks": { + "type": "array", + "description": "Optional explicit task list. 
If absent, fixtures imply the tasks (one per file under fixtures/, etc.).", + "items": { + "type": "object", + "required": ["id", "subject"], + "additionalProperties": false, + "properties": { + "id": { "type": "string" }, + "subject": { "type": "string" }, + "depends_on": { + "type": "array", + "items": { "type": "string" } + }, + "head": { "type": "string" }, + "payload": { "type": "object" } + } + } + }, + "inject": { + "type": "object", + "description": "Optional fault-injection knobs (abort marker drops, simulated crashes, conflict introductions).", + "additionalProperties": true, + "properties": { + "abort_after_seconds": { "type": "number", "minimum": 0 }, + "kill_teammate": { "type": "string" }, + "introduce_conflict_after_seconds": { "type": "number", "minimum": 0 } + } + }, + "expected": { + "type": "object", + "description": "Post-run state assertions; runner enforces all that apply.", + "additionalProperties": false, + "properties": { + "tasks_completed": { "type": "integer", "minimum": 0 }, + "tasks_failed": { "type": "integer", "minimum": 0 }, + "tasks_aborted": { "type": "integer", "minimum": 0 }, + "merge_conflicts": { "type": "integer", "minimum": 0 }, + "branches_in_master": { + "type": "array", + "items": { "type": "string" } + }, + "files_present": { + "type": "array", + "items": { "type": "string" } + }, + "files_absent": { + "type": "array", + "items": { "type": "string" } + }, + "file_contains": { + "type": "array", + "items": { + "type": "object", + "required": ["path", "substring"], + "properties": { + "path": { "type": "string" }, + "substring": { "type": "string" } + } + } + }, + "file_absent_substring": { + "type": "array", + "items": { + "type": "object", + "required": ["path", "substring"], + "properties": { + "path": { "type": "string" }, + "substring": { "type": "string" } + } + } + }, + "messages_routed": { + "type": "array", + "items": { + "type": "object", + "required": ["from", "to"], + "properties": { + "from": { "type": "string" }, + "to": { "type": "string" }, + "team": { "type": "string" } + } + } + }, + "abort_wip_commit_present": { "type": "boolean" }, + "respawn_count_min": { "type": "integer", "minimum": 0 } + } + } + } +} diff --git a/plugins/swarm-orchestrator/tests/swarming/test_scenarios.py b/plugins/swarm-orchestrator/tests/swarming/test_scenarios.py new file mode 100644 index 0000000000..96d5207388 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/swarming/test_scenarios.py @@ -0,0 +1,26 @@ +"""pytest wrapper — every scenario JSON is a parametrized test. + +Designed to drop into the plugin's existing test job; failures bisect +cleanly to the offending commit. 
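+
+Run it straight from the plugin root with, e.g.::
+
+    pytest tests/swarming/test_scenarios.py -q
+
+Each JSON under scenarios/ becomes one parametrized test id (the file stem).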
+""" +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +THIS_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(THIS_DIR)) + +from runner.harness import run_scenario # noqa: E402 +from runner.stub import InProcessScenarioEngine # noqa: E402 + + +SCENARIOS = sorted((THIS_DIR / "scenarios").glob("*.json")) + + +@pytest.mark.parametrize("scenario_path", SCENARIOS, ids=[p.stem for p in SCENARIOS]) +def test_scenario_passes(scenario_path: Path) -> None: + rep = run_scenario(scenario_path, engine=InProcessScenarioEngine()) + assert rep.ok, "FAILED: " + " | ".join(rep.failed) diff --git a/plugins/swarm-orchestrator/tests/test_hooks.py b/plugins/swarm-orchestrator/tests/test_hooks.py new file mode 100644 index 0000000000..1cdda8d662 --- /dev/null +++ b/plugins/swarm-orchestrator/tests/test_hooks.py @@ -0,0 +1,269 @@ +""" +Tests for the swarm-orchestrator plugin hooks. + +These tests are stdlib-only so they run anywhere Python 3.11+ is available. +Each test invokes the hook script as a subprocess with a synthetic Claude Code +hook payload on stdin and asserts behavior on stdout / state files. +""" + +from __future__ import annotations + +import json +import os +import pathlib +import subprocess +import sys +import tempfile +import unittest + +PLUGIN_ROOT = pathlib.Path(__file__).resolve().parent.parent +HOOKS = PLUGIN_ROOT / "hooks" + + +def _run_hook(script: pathlib.Path, payload: dict, env_overrides: dict | None = None) -> subprocess.CompletedProcess: + env = os.environ.copy() + if env_overrides: + env.update(env_overrides) + return subprocess.run( + [sys.executable, str(script)], + input=json.dumps(payload), + capture_output=True, + text=True, + env=env, + timeout=20, + ) + + +class TestOnTaskCompleteHook(unittest.TestCase): + """on_task_complete.py: cascades on TaskUpdate(status=completed/merged).""" + + def setUp(self) -> None: + self.tmp_home = tempfile.mkdtemp() + self.fake_home = pathlib.Path(self.tmp_home) + self.teams_root = self.fake_home / ".claude" / "teams" + self.teams_root.mkdir(parents=True, exist_ok=True) + + def _write_dag(self, team: str, dag: dict) -> None: + team_dir = self.teams_root / team + team_dir.mkdir(parents=True, exist_ok=True) + (team_dir / "swarm-dag.json").write_text(json.dumps(dag)) + + def test_no_op_for_non_taskupdate(self) -> None: + result = _run_hook( + HOOKS / "on_task_complete.py", + {"tool_name": "Edit", "tool_input": {}}, + env_overrides={"HOME": str(self.fake_home)}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_no_op_for_non_terminal_status(self) -> None: + result = _run_hook( + HOOKS / "on_task_complete.py", + { + "tool_name": "TaskUpdate", + "tool_input": {"task_id": "t1", "team": "demo", "status": "in_progress"}, + }, + env_overrides={"HOME": str(self.fake_home)}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_cascade_unblocks_dependent_task(self) -> None: + self._write_dag( + "demo", + { + "tasks": { + "t1": {"status": "completed", "blockedBy": []}, + "t2": {"status": "blocked", "blockedBy": ["t1"]}, + "t3": {"status": "blocked", "blockedBy": ["t2"]}, + } + }, + ) + + result = _run_hook( + HOOKS / "on_task_complete.py", + { + "tool_name": "TaskUpdate", + "tool_input": {"task_id": "t1", "team": "demo", "status": "completed"}, + }, + env_overrides={"HOME": str(self.fake_home)}, + ) + self.assertEqual(result.returncode, 0) + self.assertIn("newly unblocked: t2", result.stdout) + # t3 
should NOT be unblocked yet — t2 is still blocked. + self.assertNotIn("t3", result.stdout) + + # Cascade event was logged. + events_path = self.teams_root / "demo" / "cascade-events.jsonl" + self.assertTrue(events_path.exists()) + lines = events_path.read_text().strip().splitlines() + self.assertEqual(len(lines), 1) + event = json.loads(lines[0]) + self.assertEqual(event["task_id"], "t1") + self.assertEqual(event["new_status"], "completed") + self.assertEqual(event["newly_unblocked"], ["t2"]) + + def test_cascade_no_unblock_when_other_blocker_still_open(self) -> None: + self._write_dag( + "demo", + { + "tasks": { + "t1": {"status": "completed", "blockedBy": []}, + "t2": {"status": "in_progress", "blockedBy": []}, + "t3": {"status": "blocked", "blockedBy": ["t1", "t2"]}, + } + }, + ) + + result = _run_hook( + HOOKS / "on_task_complete.py", + { + "tool_name": "TaskUpdate", + "tool_input": {"task_id": "t1", "team": "demo", "status": "completed"}, + }, + env_overrides={"HOME": str(self.fake_home)}, + ) + self.assertEqual(result.returncode, 0) + self.assertNotIn("t3", result.stdout) + + def test_handles_missing_dag_gracefully(self) -> None: + result = _run_hook( + HOOKS / "on_task_complete.py", + { + "tool_name": "TaskUpdate", + "tool_input": {"task_id": "t1", "team": "no-such-team", "status": "completed"}, + }, + env_overrides={"HOME": str(self.fake_home)}, + ) + # Hook is non-blocking — it logs and exits 0 even when the DAG is missing. + self.assertEqual(result.returncode, 0) + + +class TestReviewerCheckpointHook(unittest.TestCase): + """reviewer_checkpoint.py: emits a checkpoint prompt every Nth turn.""" + + def test_no_op_for_non_builder(self) -> None: + result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "scanner", "turn": 12, "cwd": "/tmp/whatever"}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_no_op_below_floor(self) -> None: + result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 3, "cwd": "/tmp/whatever"}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_fires_at_floor(self) -> None: + result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 6, "cwd": "/tmp/whatever"}, + ) + self.assertEqual(result.returncode, 0) + self.assertIn("reviewer-checkpoint", result.stdout) + + def test_fires_every_n_after_floor(self) -> None: + # turn=9 (floor=6, every_n=3): (9-6) % 3 == 0 → fires. + result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 9, "cwd": "/tmp/whatever"}, + ) + self.assertEqual(result.returncode, 0) + self.assertIn("reviewer-checkpoint", result.stdout) + + def test_no_op_off_cycle(self) -> None: + # turn=8 (floor=6, every_n=3): (8-6) % 3 == 2 → no-op. 
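+        # General rule these tests pin down: fire iff turn >= floor and
+        # (turn - floor) % every_n == 0.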
+ result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 8, "cwd": "/tmp/whatever"}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_respects_disabled_config(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + cwd = pathlib.Path(tmp) + (cwd / ".claude").mkdir() + (cwd / ".claude" / "swarm-orchestrator.json").write_text( + json.dumps({"reviewer_checkpoint": {"enabled": False}}) + ) + result = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 12, "cwd": str(cwd)}, + ) + self.assertEqual(result.returncode, 0) + self.assertEqual(result.stdout.strip(), "") + + def test_respects_custom_every_n(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + cwd = pathlib.Path(tmp) + (cwd / ".claude").mkdir() + (cwd / ".claude" / "swarm-orchestrator.json").write_text( + json.dumps({"reviewer_checkpoint": {"every_n_turns": 2, "floor": 4}}) + ) + # turn=4 → fires (at floor) + r1 = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 4, "cwd": str(cwd)}, + ) + self.assertIn("reviewer-checkpoint", r1.stdout) + # turn=5 → off-cycle + r2 = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 5, "cwd": str(cwd)}, + ) + self.assertEqual(r2.stdout.strip(), "") + # turn=6 → fires (every_n=2) + r3 = _run_hook( + HOOKS / "reviewer_checkpoint.py", + {"agent_type": "builder", "turn": 6, "cwd": str(cwd)}, + ) + self.assertIn("reviewer-checkpoint", r3.stdout) + + +class TestPluginManifest(unittest.TestCase): + """plugin.json should be valid + present.""" + + def test_manifest_exists_and_parses(self) -> None: + manifest = PLUGIN_ROOT / ".claude-plugin" / "plugin.json" + self.assertTrue(manifest.exists(), f"missing manifest: {manifest}") + data = json.loads(manifest.read_text()) + self.assertEqual(data["name"], "swarm-orchestrator") + self.assertIn("version", data) + self.assertIn("description", data) + self.assertIn("author", data) + + def test_all_commands_have_frontmatter(self) -> None: + commands_dir = PLUGIN_ROOT / "commands" + for cmd in commands_dir.glob("*.md"): + text = cmd.read_text() + self.assertTrue( + text.startswith("---\n"), + f"{cmd.name}: missing YAML frontmatter", + ) + # frontmatter must contain a description. 
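+            # split("---", 2)[1] is the text between the opening and
+            # closing frontmatter fences.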
+ head = text.split("---", 2)[1] + self.assertIn("description:", head, f"{cmd.name}: missing description") + + def test_all_agents_have_frontmatter(self) -> None: + agents_dir = PLUGIN_ROOT / "agents" + for agent in agents_dir.glob("*.md"): + text = agent.read_text() + self.assertTrue( + text.startswith("---\n"), + f"{agent.name}: missing YAML frontmatter", + ) + head = text.split("---", 2)[1] + self.assertIn("name:", head, f"{agent.name}: missing name") + self.assertIn("description:", head, f"{agent.name}: missing description") + self.assertIn("tools:", head, f"{agent.name}: missing tools list") + self.assertIn("model:", head, f"{agent.name}: missing model") + + +if __name__ == "__main__": + unittest.main() From f855f82e965132f97a29b4c6fc64ed16a2e47d83 Mon Sep 17 00:00:00 2001 From: Kushal Jaligama <kjaligusa@gmail.com> Date: Sun, 10 May 2026 15:51:52 -0700 Subject: [PATCH 3/3] feat(plugins/swarm-orchestrator): demo entrypoint + minimal TUI dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/try-swarm.sh — canonical 'try it' entrypoint: - Three modes: --stub (free smoke test, ~20s), default real-LLM dispatch (~15-25s via Haiku with one-word prompts; explicit consent prompt), and --keepalive (supervisor runs as a detached daemon that survives CLI exit + 'claude --resume') - Preflight: checks the 'claude' CLI is installed + probes auth with a tiny Haiku ping. Failure prints two clear next-step paths (interactive login via Pro/Max/Team plan, or ANTHROPIC_API_KEY env var) and exits cleanly without dispatching anything - Force-reinstalls claude-swarm on every run so stale venv state from a previous demo can't silently use older code missing the latest flags - Parallel dispatch (3 tasks concurrent) so multiple in-progress rows render simultaneously — the 'live' demo feel - DAG ordering: scanner → builder → {reviewer, test-runner} → merger, so reviewer reviews the build's output (not running in parallel from t=0); test-runner runs in parallel with reviewer after builder - Cleanup trap covers EXIT, SIGINT (Ctrl+C), SIGTERM — pkill -P on the supervisor's children, 2s grace, then SIGKILL escalation. No orphans. - At exit, points to supervisor.log, global-mind.jsonl (every dispatch + cost increment as JSONL — the swarm's collective transcript), and the kanban cascade-events.jsonl scripts/swarm_dashboard.py — minimal TUI: - Modeled on the native Anthropic Teams agent-list interface - One header line: progress, runtime, tokens, aggregate cost - One row per role-typed head: status dot (●/○/✗), name, current task, status word, per-head tokens, per-head spend - Reads from claude-swarm list (plain text + optional --json), so it works against any installed version of the library The 'global-mind' framing: the JSONL transcript is the swarm's collective state-of-the-world, replayable for audit and observable in real time. 
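
For a feel of the transcript format, a single global-mind event might look
like this (field names are illustrative, not a frozen schema):

    {"ts": "...", "event": "dispatch", "head": "builder", "task": "build-1",
     "cost_delta_usd": 0.0042}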
--- .../scripts/swarm_dashboard.py | 225 +++++++++++ .../swarm-orchestrator/scripts/try-swarm.sh | 363 ++++++++++++++++++ 2 files changed, 588 insertions(+) create mode 100644 plugins/swarm-orchestrator/scripts/swarm_dashboard.py create mode 100755 plugins/swarm-orchestrator/scripts/try-swarm.sh diff --git a/plugins/swarm-orchestrator/scripts/swarm_dashboard.py b/plugins/swarm-orchestrator/scripts/swarm_dashboard.py new file mode 100644 index 0000000000..dee53d3e6c --- /dev/null +++ b/plugins/swarm-orchestrator/scripts/swarm_dashboard.py @@ -0,0 +1,225 @@ +"""Minimal claude-swarm dashboard — modeled on the native claude CLI's +agent-team list view. + +Renders a single concise list of heads with status dot, name, runtime, +token usage, and current state. No verbose panels; designed to fit the +Anthropic design language. + +Usage: + python3 swarm_dashboard.py --home <path-to-.claude-swarm-dir> [--refresh-hz 4] + +Exits cleanly with Ctrl-C or when the supervisor reports all tasks done. +""" +from __future__ import annotations + +import argparse +import json +import signal +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +try: + from rich.console import Console + from rich.live import Live + from rich.text import Text +except ImportError: + sys.stderr.write("rich is required. Run: pip install rich\n") + sys.exit(2) + + +STATUS_DOT = { + "in_progress": ("●", "cyan"), + "running": ("●", "cyan"), + "done": ("○", "green"), + "completed": ("○", "green"), + "pending": ("○", "dim"), + "blocked": ("○", "magenta"), + "failed": ("✗", "red"), + "idle": ("○", "dim"), +} + + +def _run_cli(args: list[str], cwd: Path | None = None) -> str: + try: + result = subprocess.run( + args, capture_output=True, text=True, timeout=2.0, cwd=cwd + ) + return result.stdout if result.returncode == 0 else "" + except (subprocess.SubprocessError, FileNotFoundError): + return "" + + +def _read_status(home: Path) -> dict[str, Any]: + out = _run_cli(["claude-swarm", "status", "--home", str(home)]) + try: + return json.loads(out) if out else {} + except json.JSONDecodeError: + return {} + + +def _read_tasks(home: Path) -> list[dict[str, Any]]: + # Try --json first; fall back to plain text if the CLI doesn't support it + # (e.g. older claude-swarm installs). Empty stdout means --json failed and + # we MUST fall through to plain text instead of silently returning []. 
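+    # Plain-text row shape assumed by the fallback parser below
+    # (split(None, 3): id, status, head, then the title with its spaces):
+    #   <id> <status> <head> <title...>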
+    out = _run_cli(["claude-swarm", "list", "--home", str(home), "--json"])
+    if out:
+        try:
+            return json.loads(out)
+        except json.JSONDecodeError:
+            pass
+    # Fallback: parse the columnar `list` output
+    tasks: list[dict[str, Any]] = []
+    plain = _run_cli(["claude-swarm", "list", "--home", str(home)])
+    for line in plain.splitlines():
+        parts = line.split(None, 3)
+        if len(parts) >= 4:
+            tasks.append({
+                "id": parts[0],
+                "status": parts[1],
+                "head": parts[2],
+                "title": parts[3],
+            })
+    return tasks
+
+
+def _format_duration(seconds: float) -> str:
+    if seconds < 60:
+        return f"{seconds:.0f}s"
+    minutes = int(seconds // 60)
+    rem = int(seconds % 60)
+    if minutes < 60:
+        return f"{minutes}m {rem:02d}s"
+    hours = minutes // 60
+    return f"{hours}h {minutes % 60:02d}m {rem:02d}s"
+
+
+def _format_tokens(n: int) -> str:
+    if n < 1000:
+        return f"{n}"
+    if n < 1_000_000:
+        return f"{n / 1000:.1f}k"
+    return f"{n / 1_000_000:.2f}M"
+
+
+def _render(
+    home: Path,
+    started_at: float,
+    status: dict[str, Any],
+    tasks: list[dict[str, Any]],
+) -> Text:
+    runtime = time.monotonic() - started_at
+    kanban = status.get("kanban", {}) or {}
+    total = sum(kanban.get(k, 0) for k in ("pending", "in_progress", "done", "failed"))
+    done = kanban.get("done", 0)
+    cost = status.get("cost_so_far_usd", 0.0)
+
+    # Tokens estimated from cost (Sonnet $3/Mtok in / $15/Mtok out blend ≈ $9/Mtok)
+    tokens_est = int(cost / 9e-6) if cost > 0 else 0
+
+    # Per-head spend dicts from the engine status payload
+    spend_by_head: dict[str, float] = status.get("spend_by_head", {}) or {}
+    tokens_by_head: dict[str, int] = status.get("tokens_by_head", {}) or {}
+
+    # Top bar — minimal, Anthropic-style
+    out = Text()
+    out.append(" swarm ", style="bold cyan")
+    out.append(f"· {done}/{total} done ", style="dim")
+    out.append(f"· {_format_duration(runtime)} ", style="dim")
+    out.append(f"· ↓ {_format_tokens(tokens_est)} tokens ", style="dim")
+    out.append(f"· ${cost:.4f}", style="dim")
+    out.append("\n\n")
+
+    # Group tasks by head — show one row per head with their active task
+    by_head: dict[str, dict[str, Any]] = {}
+    for t in tasks:
+        head = t.get("head") or "unassigned"
+        cur = by_head.get(head)
+        # Prefer in-progress, then blocked, then pending, then done
+        rank = {"in_progress": 0, "running": 0, "blocked": 1, "pending": 2,
+                "failed": 3, "done": 4, "completed": 4}.get(t.get("status", ""), 5)
+        if cur is None or rank < cur["_rank"]:
+            by_head[head] = {**t, "_rank": rank}
+
+    heads = status.get("heads", []) or sorted(by_head.keys())
+    for head in heads:
+        task = by_head.get(head)
+        if task:
+            raw_status = task.get("status", "idle")
+            title = task.get("title", "")
+        else:
+            raw_status = "idle"
+            title = "(no work assigned)"
+        dot, dot_color = STATUS_DOT.get(raw_status, ("○", "dim"))
+        active = raw_status in {"in_progress", "running"}
+
+        # Per-head token + cost columns (fall back to estimate from cost if unavailable)
+        head_tokens = tokens_by_head.get(head, 0)
+        head_cost = spend_by_head.get(head, 0.0)
+        if head_tokens == 0 and head_cost > 0:
+            head_tokens = int(head_cost / 9e-6)
+        tok_str = _format_tokens(head_tokens) if head_tokens else "—"
+        cost_str = f"${head_cost:.4f}" if head_cost > 0 else "—"
+
+        out.append(f" {dot} ", style=dot_color)
+        out.append(f"{head:<12}", style="bold" if active else None)
+        out.append(f" {title[:42]:<42}", style="" if active else "dim")
+        out.append(f" {raw_status:<12}", style=dot_color)
+        out.append(f" ↓ {tok_str:>6}", style="" if active else "dim")
+        out.append(f" {cost_str:>8}", style="" if active else "dim")
+        out.append("\n")
+
+    out.append("\n")
+    # No key handling is implemented, so only advertise Ctrl-C
+    out.append(" Ctrl-C to exit\n", style="dim")
+    return out
+
+
+def main(argv: list[str] | None = None) -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--home", type=Path, required=True)
+    p.add_argument("--refresh-hz", type=float, default=4.0)
+    p.add_argument("--exit-when-done", action="store_true")
+    p.add_argument("--max-runtime-s", type=float, default=300.0)
+    args = p.parse_args(argv)
+
+    home = args.home.resolve()
+    if not home.exists():
+        sys.stderr.write(f"swarm home not found: {home}\n")
+        return 2
+
+    console = Console()
+    # Clamp so a zero/negative --refresh-hz can't divide by zero below
+    hz = max(args.refresh_hz, 0.1)
+    refresh = 1.0 / hz
+    started = time.monotonic()
+    stop = {"flag": False}
+
+    def _sigint(_signum, _frame):
+        stop["flag"] = True
+
+    signal.signal(signal.SIGINT, _sigint)
+    signal.signal(signal.SIGTERM, _sigint)
+
+    is_tty = sys.stdout.isatty()
+    with Live(console=console, refresh_per_second=hz, screen=is_tty) as live:
+        while not stop["flag"]:
+            elapsed = time.monotonic() - started
+            if elapsed > args.max_runtime_s:
+                break
+            status = _read_status(home)
+            tasks = _read_tasks(home)
+            live.update(_render(home, started, status, tasks))
+            kanban = status.get("kanban", {}) or {}
+            if (args.exit_when_done
+                    and kanban.get("pending", 0) == 0
+                    and kanban.get("in_progress", 0) == 0
+                    and kanban.get("done", 0) + kanban.get("failed", 0) > 0):
+                time.sleep(0.5)
+                break
+            time.sleep(refresh)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/plugins/swarm-orchestrator/scripts/try-swarm.sh b/plugins/swarm-orchestrator/scripts/try-swarm.sh
new file mode 100755
index 0000000000..bbe1faf8ae
--- /dev/null
+++ b/plugins/swarm-orchestrator/scripts/try-swarm.sh
@@ -0,0 +1,363 @@
+#!/usr/bin/env bash
+#
+# Canonical end-to-end demo for claude-swarm.
+#
+# Creates a venv inside the repo (./.swarm-venv/), installs claude-swarm,
+# bootstraps a real working swarm with a small DAG of tasks, then launches
+# the live TUI dashboard so you can watch the supervisor work through them.
+# Terminates cleanly when all tasks complete (or when you Ctrl-C).
+#
+# Usage:
+#   bash scripts/try-swarm.sh             # real agents; auth probe + consent (~$1)
+#   bash scripts/try-swarm.sh --stub      # stub conductor, $0, smoke-test mode
+#   bash scripts/try-swarm.sh --keepalive # detached daemon, survives CLI exit
+#
+# At the end the script points to a "global-mind" JSONL transcript — every
+# task claim, dispatch, completion, and cost increment, in order, replayable.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+VENV_DIR="${REPO_ROOT}/.swarm-venv"
+DEMO_HOME=""
+CONDUCTOR="claude"
+ESTIMATED_USD="1.00"
+KEEPALIVE=false
+
+for arg in "$@"; do
+    case "$arg" in
+        --stub) CONDUCTOR="stub"; ESTIMATED_USD="0.00" ;;
+        --keepalive) KEEPALIVE=true ;;
+        --help|-h)
+            sed -n '2,16p' "$0"
+            exit 0
+            ;;
+    esac
+done
+
+cleanup() {
+    if [[ "${KEEPALIVE_CLEANUP_SKIP:-false}" == "true" ]]; then
+        # Keepalive mode: daemon owns the home, do NOT delete it.
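+        # Teardown happens later via 'claude-swarm daemon-stop' (commands
+        # printed in the exit summary below); the stable home is left for
+        # the operator to remove by hand.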
+        return
+    fi
+    if [[ -n "${DEMO_HOME}" && -d "${DEMO_HOME}" ]]; then
+        echo
+        echo "→ Cleaning up demo state at ${DEMO_HOME}"
+        rm -rf "${DEMO_HOME}"
+    fi
+}
+trap cleanup EXIT
+
+cat <<EOF
+================================================================
+  claude-swarm — autonomous, DAG-aware multi-agent orchestration
+================================================================
+
+  Agents that will spawn (role-typed heads):
+    • scanner/Scanner         — read-only; finds work, files tasks
+    • builder/Builder         — full toolkit; the default worker
+    • test-runner/Test-Runner — read + pytest; gates merges
+    • reviewer/Reviewer       — read-only; periodic checkpoints
+    • merger/Merger           — Bash + git only; runs merge pipeline
+
+  DAG of 5 linked tasks (Scanner → Builder → {Reviewer, Test-Runner} → Merger):
+    1. Scan codebase + file follow-up tasks    (Scanner)
+    2. Refactor utils.py for type hints        (Builder, blocked-by 1)
+    3. Review the build                        (Reviewer, blocked-by 2)
+    4. Write tests for refactored utils.py     (Test-Runner, blocked-by 2)
+    5. Merge clean branches                    (Merger, blocked-by 3,4)
+
+  Conductor:       ${CONDUCTOR}
+  Mode:            $([[ "$KEEPALIVE" == "true" ]] && echo "KEEPALIVE — supervisor runs as a detached daemon; survives CLI exit" || echo "shell-background — supervisor tied to this script's lifetime")
+  Parallel:        3 tasks dispatched concurrently — 3 heads transition to
+                   in_progress within ~1 second of supervisor start
+  Estimated cost:  \$${ESTIMATED_USD} (Anthropic API)
+  Wall-clock time: $([[ "$CONDUCTOR" == "claude" ]] && echo "<30 seconds (5 one-word Haiku tasks, max 3 in parallel)" || echo "~20 seconds (stub with 5s demo-delay, 3 in parallel)")
+  Self-healing:    abort-marker contract, dead-teammate respawn from last commit,
+                   bounded inbox queue, stuck-task watchdog (re-dispatches > 30 min).
+  Global mind:     every dispatch + cost increment appended to a JSONL transcript;
+                   path printed at exit.
+
+EOF
+
+if [[ "$CONDUCTOR" == "claude" ]]; then
+    # ────────────────────────────────────────────────────────────────
+    # Preflight 1: ensure 'claude' CLI is installed
+    # ────────────────────────────────────────────────────────────────
+    if ! command -v claude >/dev/null 2>&1; then
+        cat <<'EOF'
+
+  ✗ The 'claude' CLI is not on PATH.
+
+    This demo dispatches work via 'claude --print' (a real Claude Code
+    subprocess that runs the dispatched task). You'll need it installed.
+
+    Install: https://docs.claude.com/claude-code
+
+    Or re-run with --stub for the no-LLM smoke test.
+
+EOF
+        exit 1
+    fi
+
+    # ────────────────────────────────────────────────────────────────
+    # Preflight 2: ensure 'claude --print' has working credentials
+    # ────────────────────────────────────────────────────────────────
+    # 'claude' resolves auth in this priority order:
+    #   1. ANTHROPIC_API_KEY env var
+    #   2. apiKeyHelper from --settings (or ~/.claude/settings.json)
+    #   3. macOS Keychain ("Claude Code-credentials" service) — set by
+    #      running `claude` interactively and completing the OAuth login
+    #      (Pro/Max/Team plans store tokens here)
+    #
+    # We probe with a tiny Haiku call. Failure → guide the user, don't
+    # silently start a 5-task demo that produces empty results.
+    echo "→ Verifying claude CLI authentication (one tiny Haiku ping)..."
+    AUTH_TEST=$(perl -e 'alarm 30; exec @ARGV' claude --print --model claude-haiku-4-5 <<< "respond with the single word OK" 2>&1 | head -1)
+    if [[ "$AUTH_TEST" != *"OK"* ]] && [[ "$AUTH_TEST" != *"ok"* ]]; then
+        cat <<'EOF'
+
+  ✗ 'claude --print' did not return a valid response. Probable cause:
+    missing or invalid credentials.
+
+    Output from probe:
+EOF
+        echo "      ${AUTH_TEST}"
+        cat <<EOF
+
+    Pick one of the following auth paths and re-run this script:
+
+    ┌─────────────────────────────────────────────────────────────┐
+    │ Option A — interactive login (Claude Pro / Max / Team plan) │
+    └─────────────────────────────────────────────────────────────┘
+      $ claude    # opens browser, completes OAuth
+      $ /exit     # tokens persist in macOS Keychain
+                  #   (service: "Claude Code-credentials")
+
+    ┌──────────────────────────────────────────────────────────────┐
+    │ Option B — API key (developer / CI / no Max plan)            │
+    └──────────────────────────────────────────────────────────────┘
+      # Get a key from https://console.anthropic.com/settings/keys
+      export ANTHROPIC_API_KEY="sk-ant-..."
+      # Then re-run this script.
+
+    If you want to skip auth entirely, use --stub for the no-LLM mode:
+      $ bash scripts/try-swarm.sh --stub
+
+EOF
+        exit 1
+    fi
+    echo "→ Auth probe OK — proceeding with real claude-swarm agents."
+    echo
+
+    # ────────────────────────────────────────────────────────────────
+    # Preflight 3: explicit consent (the operator must opt in)
+    # ────────────────────────────────────────────────────────────────
+    cat <<'EOF'
+  ⚠ This run uses real claude-swarm agents (via 'claude --print' for each task).
+    The supervisor will spawn 5 subprocesses (one per role-typed head)
+    against your authenticated Claude account. Re-run with --stub for
+    a free smoke test that makes no real Claude calls.
+
+EOF
+    read -r -p "Proceed with spawning real claude-swarm agents? [yes/no] " CONFIRM
+    case "${CONFIRM}" in
+        yes|YES|y|Y) echo "→ Confirmed — proceeding." ;;
+        *) echo "→ Not confirmed — aborting (no dispatch happened)."; exit 0 ;;
+    esac
+    echo
+fi
+
+echo "→ Setting up venv at ${VENV_DIR}"
+if [[ ! -d "${VENV_DIR}" ]]; then
+    python3 -m venv "${VENV_DIR}"
+fi
+# shellcheck disable=SC1091
+source "${VENV_DIR}/bin/activate"
+
+echo "→ Installing claude-swarm + dashboard deps (quiet, always pulls fresh main)"
+pip install --quiet --upgrade pip
+# --force-reinstall on claude-swarm itself ensures we don't reuse a stale
+# install from a previous demo run (the pip URL stays the same across PR
+# commits, so pip would otherwise treat the already-installed version as
+# satisfying the requirement and silently keep older code missing the
+# latest flags). We install WITH deps so click + sqlite + etc. land too.
+pip install --quiet --upgrade --force-reinstall git+https://github.com/kushalj1997/claude-swarm.git@main
+pip install --quiet rich
+
+echo "→ Bootstrapping demo swarm"
+if [[ "$KEEPALIVE" == "true" ]]; then
+    # Stable home so daemon state survives the script's cleanup trap
+    DEMO_HOME="${HOME}/.claude/swarm-demo"
+    mkdir -p "$DEMO_HOME"
+    KEEPALIVE_CLEANUP_SKIP=true
+else
+    DEMO_HOME="$(mktemp -d "${TMPDIR:-/tmp}/claude-swarm-demo-XXXXXX")"
+    KEEPALIVE_CLEANUP_SKIP=false
+fi
+cd "${DEMO_HOME}"
+git init --quiet
+git config user.email "demo@example.com"
+git config user.name "claude-swarm demo"
+mkdir -p src
+cat > src/utils.py <<'PYEOF'
+def add(a, b):
+    return a + b
+
+def needs_typing(value, threshold):
+    if value > threshold:
+        return value * 2
+    return value
+PYEOF
+git add src/utils.py
+git commit --quiet -m "demo: seed source file"
+claude-swarm init --home .claude-swarm
+
+echo "→ Submitting demo tasks (DAG: scanner → builder → reviewer → test-runner → merger)"
+# Each prompt asks for a one-word answer so real-LLM tasks finish in ~3-5s
+# via claude --print --model claude-haiku-4-5. Total demo time is ~15-25s
+# end-to-end. Reviewer runs in parallel with test-runner after builder
+# completes (both review the build + run tests, then merger gates on both).
+T1=$(claude-swarm submit \
+    --title "Scanner ping" \
+    --prompt "Respond with only the single word: SCANNED" \
+    --head scanner | awk '{print $1}')
+T2=$(claude-swarm submit \
+    --title "Builder ping" \
+    --prompt "Respond with only the single word: BUILT" \
+    --head builder \
+    --blocked-by "${T1}" | awk '{print $1}')
+T3=$(claude-swarm submit \
+    --title "Reviewer ping" \
+    --prompt "Respond with only the single word: REVIEWED" \
+    --head reviewer \
+    --blocked-by "${T2}" | awk '{print $1}')
+T4=$(claude-swarm submit \
+    --title "Test-runner ping" \
+    --prompt "Respond with only the single word: TESTED" \
+    --head test-runner \
+    --blocked-by "${T2}" | awk '{print $1}')
+T5=$(claude-swarm submit \
+    --title "Merger ping" \
+    --prompt "Respond with only the single word: MERGED" \
+    --head merger \
+    --blocked-by "${T3}" --blocked-by "${T4}" | awk '{print $1}')
+
+echo "  T1=${T1} T2=${T2} T3=${T3} T4=${T4} T5=${T5}"
+echo
+echo "spawning agents: scanner/Scanner, builder/Builder, test-runner/Test-Runner, reviewer/Reviewer, merger/Merger"
+echo
+
+GLOBAL_MIND_LOG="${DEMO_HOME}/global-mind.jsonl"
+
+echo "→ Starting supervisor loop ($([[ "$KEEPALIVE" == "true" ]] && echo "DETACHED daemon" || echo "shell-background"), conductor=${CONDUCTOR})"
+# Stub conductor finishes in <1ms per dispatch; inject a 5-second delay so the
+# dashboard has time to render each head's status transition visibly (4 DAG
+# levels × 5s ≈ the ~20 seconds advertised above). The real LLM conductor
+# doesn't need this — its one-word Haiku calls take a few seconds each.
+DEMO_DELAY_S=$([[ "$CONDUCTOR" == "stub" ]] && echo "5" || echo "0")
+
+if [[ "$KEEPALIVE" == "true" ]]; then
+    # Daemon mode: detached supervisor, survives this script's exit.
+    claude-swarm run \
+        --home .claude-swarm \
+        --conductor "${CONDUCTOR}" \
+        --demo-delay-s "${DEMO_DELAY_S}" \
+        --global-mind-log "${GLOBAL_MIND_LOG}" \
+        --max-parallel 3 \
+        --daemon \
+        >"${DEMO_HOME}/supervisor.log" 2>&1
+    SUPERVISOR_PID=""
+else
+    # Shell-background: supervisor dies when this script exits. Parallel
+    # dispatch (3 at a time) so the dashboard renders multiple in-progress
+    # heads simultaneously — the "live" demo feel. No --max-iterations
+    # cap — the supervisor exits on its own when the kanban drains.
+    claude-swarm run \
+        --home .claude-swarm \
+        --conductor "${CONDUCTOR}" \
+        --demo-delay-s "${DEMO_DELAY_S}" \
+        --global-mind-log "${GLOBAL_MIND_LOG}" \
+        --max-parallel 3 \
+        >"${DEMO_HOME}/supervisor.log" 2>&1 &
+    SUPERVISOR_PID=$!
+fi
+
+cleanup_pid() {
+    if [[ -n "${SUPERVISOR_PID:-}" ]] && kill -0 "${SUPERVISOR_PID}" 2>/dev/null; then
+        # Kill any 'claude --print' subprocesses the supervisor forked
+        # (they're children of the supervisor; pkill -P gets them).
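+        # Note: pkill -P matches direct children only; anything forked
+        # deeper is expected to exit when its parent receives SIGTERM.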
+        pkill -TERM -P "${SUPERVISOR_PID}" 2>/dev/null || true
+        # Then the supervisor itself
+        kill -TERM "${SUPERVISOR_PID}" 2>/dev/null || true
+        # Give it 2 seconds to exit cleanly, then SIGKILL if needed
+        for _ in 1 2 3 4; do
+            kill -0 "${SUPERVISOR_PID}" 2>/dev/null || break
+            sleep 0.5
+        done
+        if kill -0 "${SUPERVISOR_PID}" 2>/dev/null; then
+            pkill -KILL -P "${SUPERVISOR_PID}" 2>/dev/null || true
+            kill -KILL "${SUPERVISOR_PID}" 2>/dev/null || true
+        fi
+    fi
+    cleanup
+}
+# Trap SIGINT (Ctrl+C) + SIGTERM + normal EXIT so cleanup runs on every path
+trap cleanup_pid EXIT INT TERM
+
+# Give the supervisor a moment to start writing state
+sleep 0.5
+
+echo "→ Launching dashboard (Ctrl-C to exit; auto-exits when all tasks complete)"
+echo
+"${VENV_DIR}/bin/python3" "${REPO_ROOT}/scripts/swarm_dashboard.py" \
+    --home "${DEMO_HOME}/.claude-swarm" \
+    --exit-when-done \
+    --max-runtime-s 1200
+
+# Wait briefly for the supervisor to flush
+wait "${SUPERVISOR_PID}" 2>/dev/null || true
+
+echo
+echo "================================================================"
+echo "  Run complete. The swarm's global-mind transcript:"
+echo "================================================================"
+echo
+echo "  Supervisor log:"
+echo "    ${DEMO_HOME}/supervisor.log"
+echo
+echo "  Global-mind events (JSONL — every dispatch, status, cost increment):"
+echo "    ${GLOBAL_MIND_LOG}"
+echo
+echo "  Kanban status timeline + cascade events:"
+echo "    ${DEMO_HOME}/.claude-swarm/state/cascade-events.jsonl"
+echo
+if [[ -f "${GLOBAL_MIND_LOG}" ]]; then
+    echo "  Sample events from the global mind:"
+    head -3 "${GLOBAL_MIND_LOG}" | sed 's/^/    /'
+    echo "    ..."
+fi
+echo
+echo "  Replay the swarm's collective state with:"
+echo "    cat ${GLOBAL_MIND_LOG} | jq ."
+echo
+if [[ "$KEEPALIVE" == "true" ]]; then
+    echo "================================================================"
+    echo "  KEEPALIVE DAEMON IS STILL RUNNING"
+    echo "================================================================"
+    echo
+    echo "  The supervisor is detached from this script. You can:"
+    echo "    - close this terminal"
+    echo "    - exit Claude Code"
+    echo "    - 'claude --resume' later"
+    echo "  ...and the daemon keeps polling. Submit more tasks any time:"
+    echo
+    echo "    claude-swarm submit --home ${DEMO_HOME}/.claude-swarm \\"
+    echo "      --title 'my-task' --prompt 'do something' --head builder"
+    echo
+    echo "  Status:"
+    echo "    claude-swarm daemon-status --home ${DEMO_HOME}/.claude-swarm"
+    echo
+    echo "  Stop the daemon:"
+    echo "    claude-swarm daemon-stop --home ${DEMO_HOME}/.claude-swarm"
+    echo
+fi
+echo "Done. The venv at ${VENV_DIR} persists for re-runs; remove with:"
+echo "  rm -rf ${VENV_DIR}"
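
Usage sketch: in keepalive mode the dashboard can be re-attached to the
detached daemon after this script exits. This assumes the stable home path
the script sets above and uses only flags swarm_dashboard.py defines:

    $ source .swarm-venv/bin/activate
    $ python3 plugins/swarm-orchestrator/scripts/swarm_dashboard.py \
        --home ~/.claude/swarm-demo/.claude-swarm --refresh-hz 4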