From 3dbdc299933860a4db762cf3bebbd37f840757db Mon Sep 17 00:00:00 2001 From: Runchao Han Date: Sat, 18 Apr 2026 20:16:18 +0800 Subject: [PATCH] Make skills package host-agnostic and portable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reposition Reaper as a host-agnostic skills package that works across 45+ AI coding agents (Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, Claude Code, …) instead of a Claude-Code-only plugin. - Standardize "Path Resolution Protocol" preamble in every skill that uses path placeholders, listing per-host install paths and the sibling-skill dependency - Replace shell-fragile `` syntax with `{{SKILL_DIR}}`-style template tokens that survive bash redirection - Adopt the `/` typographic convention in user-facing docs; preserve bare names in YAML frontmatter, JSON identifiers, and path-substitution lists where tooling/paths require them - Soften Claude-default framing in CLAUDE.md, README.md, ROADMAP.md, evals.json, and the reaper orchestrator - Add CI workflow with three jobs: required offline structure tests, non-blocking live-API integration tests, and a strict npx-skills install-and-discover check - Strengthen structure tests: assert frontmatter `name` matches `[a-z][a-z0-9-]*` and equals the directory name, every `{{*SKILL_DIR}}` placeholder is defined locally, every skill using placeholders declares a Path Resolution Protocol section, and no skill invokes Python via `python skills/...` relative paths Co-Authored-By: Claude Opus 4.7 --- .github/workflows/ci.yml | 89 ++++++ CLAUDE.md | 36 ++- README.md | 133 +++++---- dev/ROADMAP.md | 267 +++++++++--------- evals/evals.json | 26 +- skills/analyze-paper/SKILL.md | 27 +- skills/brainstorm/SKILL.md | 31 +- skills/clarify-goal/SKILL.md | 10 +- skills/critique/SKILL.md | 37 ++- skills/formalize-problem/SKILL.md | 31 +- skills/investigate/SKILL.md | 41 ++- skills/reaper/SKILL.md | 92 +++--- 
.../reaper/references/codex-consultation.md | 11 +- skills/reaper/references/search-tools.md | 25 +- skills/review-literature/SKILL.md | 49 ++-- skills/search-arxiv/SKILL.md | 27 +- skills/search-iacr/SKILL.md | 29 +- skills/synthesize/SKILL.md | 4 +- tests/test_skills_structure.py | 168 ++++++++++- 19 files changed, 788 insertions(+), 345 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..7dd4d2c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,89 @@ +name: ci + +on: + push: + branches: [main] + pull_request: + +jobs: + # Offline structure tests — required. Validates skill manifests, frontmatter, + # path placeholders, and inter-skill references. Must pass for any merge. + structure-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install Python dependencies + run: pip install pytest + - name: Run structure tests + run: pytest tests/test_skills_structure.py -v + + # Network-dependent integration tests against live arXiv / IACR ePrint APIs. + # Non-blocking: external services can rate-limit, return 5xx, or change + # their HTML — none of which means the package is broken. We still run + # them so regressions surface in PR logs, but they do not gate merges. 
+ integration-tests: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install Python dependencies + run: pip install pytest arxiv requests beautifulsoup4 + - name: Run integration tests (search scripts against live APIs) + run: pytest tests/test_search_arxiv.py tests/test_search_iacr.py -v + + npx-skills-discovery: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: "20" + - name: Install skills CLI from local path into universal target + env: + REAPER_REPO: ${{ github.workspace }} + run: | + mkdir -p /tmp/skills-test + cd /tmp/skills-test + npx -y skills@latest add "$REAPER_REPO" --agent universal --copy -y + - name: Verify every expected skill is present and complete + run: | + cd /tmp/skills-test + target=.agents/skills + if [ ! -d "$target" ]; then + echo "::error::Skills install target $target does not exist" + exit 1 + fi + ls -la "$target" + expected="reaper clarify-goal analyze-paper review-literature formalize-problem brainstorm investigate critique synthesize search-arxiv search-iacr" + missing=0 + for skill in $expected; do + if [ ! -f "$target/$skill/SKILL.md" ]; then + echo "::error::Missing skill: $skill (no SKILL.md at $target/$skill/)" + missing=$((missing + 1)) + fi + done + # Co-located Python scripts must travel with their skill dirs + for script in search-arxiv/search_arxiv.py search-iacr/search_iacr.py; do + if [ ! -f "$target/$script" ]; then + echo "::error::Missing required asset: $target/$script" + missing=$((missing + 1)) + fi + done + # Reference files used by other skills must travel with the reaper skill dir + for ref in model.md impossibility-results.md definitional-standards.md methodology.md paper-analysis.md venue-tiers.md search-tools.md codex-consultation.md; do + if [ ! 
-f "$target/reaper/references/$ref" ]; then + echo "::error::Missing reaper reference file: $target/reaper/references/$ref" + missing=$((missing + 1)) + fi + done + if [ "$missing" -gt 0 ]; then + echo "::error::$missing required asset(s) missing after npx skills install" + exit 1 + fi + echo "All expected skills, scripts, and reference files installed successfully." diff --git a/CLAUDE.md b/CLAUDE.md index 6e6cff6..6d4e3d4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,18 +1,19 @@ # Reaper -AI-native scientific research pipeline distributed as a Claude Code plugin. Takes a research goal — optionally with a research paper — and autonomously runs a multi-step research loop. Ships with reference files for cryptography and distributed systems, but the skills themselves are domain-agnostic — swap the reference files to adapt to other research domains. +AI-native scientific research pipeline distributed as a host-agnostic skills package. Each pipeline stage is a `SKILL.md` folder that runs on any AI coding agent supporting the [skills convention](https://github.com/vercel-labs/skills) — Cursor, OpenAI Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, Claude Code, and 40+ others. Takes a research goal — optionally with a research paper — and autonomously runs a multi-step research loop. Ships with reference files for cryptography and distributed systems, but the skills themselves are domain-agnostic — swap the reference files to adapt to other research domains. 
## Project structure -- `skills/` — 11 composable Claude skills (each has a `SKILL.md` defining its behavior) - - `reaper/` — Main orchestrator that chains all other skills - - `clarify-goal/` — Interactive goal clarification (asks user targeted questions before pipeline runs) - - `analyze-paper/`, `review-literature/`, `formalize-problem/`, `brainstorm/`, `investigate/`, `critique/`, `synthesize/` — Pipeline stages - - `search-arxiv/`, `search-iacr/` — Academic search via Python scripts +- `skills/` — 11 composable skills (each has a `SKILL.md` defining its behavior; the `/<skill-name>` form is the canonical display convention used in all user-facing docs) + - `/reaper` — Main orchestrator that chains all other skills + - `/clarify-goal` — Interactive goal clarification (asks user targeted questions before pipeline runs) + - `/analyze-paper`, `/review-literature`, `/formalize-problem`, `/brainstorm`, `/investigate`, `/critique`, `/synthesize` — Pipeline stages + - `/search-arxiv`, `/search-iacr` — Academic search via Python scripts - `tests/` — Python tests for skill structure and search scripts - `evals/` — Test cases with quality criteria (`evals.json`) - `dev/` — Development docs including `ROADMAP.md` (full methodology and design) -- `.claude-plugin/plugin.json` — Plugin metadata +- `.claude-plugin/` — Claude-Code-specific plugin manifest (`plugin.json`, `marketplace.json`); other hosts ignore this directory +- `.github/workflows/` — CI (pytest + strict `npx skills` discovery check that asserts every expected skill, script, and reference file is present after installation) ## Commands @@ -26,15 +27,26 @@ pip install arxiv requests beautifulsoup4 ## Key conventions -- Skills are the unit of composition. Each skill directory contains a `SKILL.md` with frontmatter. -- The orchestrator (`/reaper`) runs the full pipeline: clarify → analyze → literature → formalize → brainstorm → investigate ↔ critique → synthesize. 
After delivery, users can iterate via `/reaper:critique "feedback"`. +- Skills are the unit of composition. Each skill directory contains a `SKILL.md` with YAML frontmatter — `name` (lowercase + hyphens, matches the directory name) and `description` are required by the [`vercel-labs/skills`](https://github.com/vercel-labs/skills) parser; everything else is optional. +- The orchestrator skill (`/reaper`) runs the full pipeline: clarify → analyze → literature → formalize → brainstorm → investigate ↔ critique → synthesize. After delivery, users can iterate by re-invoking the `/critique` skill with feedback. - Runtime state goes in `reaper-workspace/` (gitignored). Never commit workspace artifacts. - The six methodology principles (separation of concerns, fixed evaluation signal, structured results log, keep-or-discard loop, never stop, clarity and simplicity) govern how skills behave. - Domain-specific content (impossibility results, trust model checklists, venue tiers, definitional standards) lives in `skills/reaper/references/`, not inline in skills. Skills reference these files but remain domain-agnostic — the reference files can be swapped for a different research domain. - Python scripts live alongside the skill that uses them (e.g., `skills/search-arxiv/search_arxiv.py`). -- No JavaScript/TypeScript in this project — it's Claude skills + Python only. -- When adding, removing, or renaming a skill, update `.claude-plugin/marketplace.json` to keep the `skills` array in sync. Also keep `version` in both `plugin.json` and `marketplace.json` consistent with the current release. -- The license is Apache-2.0. If `plugin.json` references a license field, it must say `"Apache-2.0"`. +- No JavaScript/TypeScript in this project — it's `SKILL.md` files + Python only. +- The license is Apache-2.0. Any plugin manifest that references a license field must say `"Apache-2.0"`. 
- When cutting a release tag, the tag message should summarize changes since the last tag (use `git log <last-tag>..HEAD`). - Always use squash merge for PRs. - Before finishing a task, check if important docs (README.md, CLAUDE.md, dev/ROADMAP.md) need to be updated to reflect your changes. + +## Distribution + +Primary distribution: [`vercel-labs/skills`](https://github.com/vercel-labs/skills) — `npx skills add SebastianElvis/reaper` shallow-clones the repo and copies all skill directories into the host agent's conventional skills folder. Targets 45+ agents including Cursor, OpenAI Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, OpenCode, Warp, Goose, Replit, and Claude Code. + +- Pin syntax: `npx skills add SebastianElvis/reaper#v0.3.8`. Tagged releases are the pin contract. +- The installer copies the entire skill directory (including Python scripts and `references/`); only `metadata.json`, `.git`, `__pycache__`, `__pypackages__` are excluded. +- All `SKILL.md` files must use host-agnostic phrasing ("invoke the `<skill-name>` skill") for inter-skill calls. Sub-skill `Usage` blocks may show host-specific invocation forms (e.g. `/reaper:<skill-name>` for Claude Code) as examples, clearly labeled as such. + +Secondary distribution: Claude Code plugin via `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`. When adding, removing, or renaming a skill, keep the `skills` array in `marketplace.json` in sync. Keep `version` in both `plugin.json` and `marketplace.json` consistent with the current release tag — note that `marketplace.json.version` is ignored by `npx skills` (which uses git tags), so it serves only the Claude Code plugin path. + +Claude-Code-specific frontmatter keys (`user-invocable`, `argument-hint`, hooks, `context: fork`) are preserved in `SKILL.md` files but no-op on other hosts. The `--codex` flag depends on a host with MCP support; non-MCP hosts silently fall back to self-review. 
diff --git a/README.md b/README.md index c017549..d1e828d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Reaper -**Reaper (REAd PapER)** — an AI-native scientific research pipeline. A [Claude Code plugin](https://code.claude.com/docs/en/discover-plugins) that takes a research goal — optionally with a research paper — and autonomously conducts rigorous, multi-step academic research. +**Reaper (REAd PapER)** — an AI-native scientific research pipeline. A composable set of [AI agent skills](https://github.com/vercel-labs/skills) that takes a research goal — optionally with a research paper — and autonomously conducts rigorous, multi-step academic research. Runs on any agent that supports the `SKILL.md` convention (Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, Claude Code, and 40+ more). -[![Claude Code Plugin](https://img.shields.io/badge/Claude_Code-Plugin-blue)](https://code.claude.com/docs/en/discover-plugins) +[![Skills](https://img.shields.io/badge/skills-SKILL.md-brightgreen)](https://github.com/vercel-labs/skills) ## What Reaper Does @@ -16,6 +16,8 @@ Give Reaper a research question — with or without a PDF. It reads the paper (i /reaper "determine if the security proof in Section 4 holds under asynchrony" path/to/paper.pdf ``` +How you invoke a skill depends on the host agent. The `/` form above is the canonical display convention used throughout these docs — it works directly on slash-command hosts (e.g. Claude Code), and on auto-discovery hosts (Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, …) you simply ask the agent to "run the `/reaper` skill on …" by its bare name. + **Key capabilities:** - **Autonomous multi-stage pipeline** — goal clarification, paper analysis, literature review, hypothesis formalization, parallel investigation, critique, and synthesis all chain automatically @@ -24,52 +26,55 @@ Give Reaper a research question — with or without a PDF. 
It reads the paper (i - **Domain-agnostic design** — ships with cryptography and distributed systems references, but swap the reference files to adapt to any research domain - **Multi-model AI consultation** — optionally consult Codex, Gemini, DeepSeek, or local models for a second opinion at every pipeline stage - **Composable skills** — each pipeline stage is an independent skill you can run standalone +- **Host-agnostic** — distributed as plain `SKILL.md` folders that work across 45+ AI coding agents ## How It Works Reaper executes a multi-stage pipeline where investigation runs in parallel batches and critique provides feedback from multiple sources: ``` - ┌── /analyze-paper (if paper) ──┐ -/clarify-goal ──────> │ ├─> /formalize-problem - └── /review-literature ─────────┘ │ - │ (parallel) v - │ ┌──────────> /brainstorm - └── calls │ │ - /analyze-paper │ │ - per downloaded │ ┌─ /investigate ─────────────────┐ - paper │ │ plan batch │ - │ │ ├──> agent H1 ─┐ │ - │ │ ├──> agent H2 ─┼──> merge │ - │ │ └──> agent H3 ─┘ │ │ - │ │ next batch or done │ - │ └────────────────────────────────┘ - │ │ - │ ┌─ /critique ────────────────────┐ - │ │ --self --codex "feedback" │ - │ └────┬───────────────────┬───────┘ - │ │ │ - │ deepen/explore rewrite/done ──> /synthesize ──> report.md - └─────────┘ + ┌── /analyze-paper (if paper) ──┐ +/clarify-goal ─────> │ ├─> /formalize-problem + └── /review-literature ─────────┘ │ + │ (parallel) v + │ ┌──────────> /brainstorm + └── calls │ │ + /analyze-paper │ │ + per downloaded │ ┌─ /investigate ─────────────────┐ + paper │ │ plan batch │ + │ │ ├──> agent H1 ─┐ │ + │ │ ├──> agent H2 ─┼──> merge │ + │ │ └──> agent H3 ─┘ │ │ + │ │ next batch or done │ + │ └────────────────────────────────┘ + │ │ + │ ┌─ /critique ────────────────────┐ + │ │ --self --codex "feedback" │ + │ └────┬───────────────────┬───────┘ + │ │ │ + │ deepen/explore rewrite/done ──> /synthesize ──> report.md + └─────────┘ ``` ## Skills -Each skill can be used independently or 
composed by the orchestrator: +Each skill can be used independently or composed by the orchestrator. Invoke by skill `name` using your host's native skill-loading mechanism. | Skill | What it does | |-------|-------------| | `/reaper` | Full pipeline: clarify → analyze → literature → formalize → brainstorm → investigate ↔ critique → synthesize | -| `/reaper:clarify-goal` | Ask targeted clarifying questions to sharpen a vague research goal | -| `/reaper:analyze-paper` | Extract structured information from a research paper | -| `/reaper:review-literature` | Search and summarize related academic work | -| `/reaper:formalize-problem` | Produce precise, testable hypotheses from a research question | -| `/reaper:brainstorm` | Generate, prioritize, and refine research ideas based on current state | -| `/reaper:investigate` | Run investigation cycles with keep-or-discard discipline | -| `/reaper:critique` | Provide critique via human feedback, Codex consultation, or self-review (can trigger more investigation) | -| `/reaper:synthesize` | Generate a structured research report from investigation results | -| `/reaper:search-arxiv` | Search arXiv papers, download PDFs, and trace citation graphs | -| `/reaper:search-iacr` | Search IACR ePrint archive for cryptography papers | +| `/clarify-goal` | Ask targeted clarifying questions to sharpen a vague research goal | +| `/analyze-paper` | Extract structured information from a research paper | +| `/review-literature` | Search and summarize related academic work | +| `/formalize-problem` | Produce precise, testable hypotheses from a research question | +| `/brainstorm` | Generate, prioritize, and refine research ideas based on current state | +| `/investigate` | Run investigation cycles with keep-or-discard discipline | +| `/critique` | Provide critique via human feedback, Codex consultation, or self-review (can trigger more investigation) | +| `/synthesize` | Generate a structured research report from investigation results | +| 
`/search-arxiv` | Search arXiv papers, download PDFs, and trace citation graphs | +| `/search-iacr` | Search IACR ePrint archive for cryptography papers | + +> The `/` form is the canonical display convention used throughout these docs. Slash-command hosts (Claude Code) invoke them directly that way (with sub-skills as `/reaper:clarify-goal` etc.). Auto-discovery hosts (Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, …) invoke them by the bare skill name — drop the leading `/` when asking the agent to run a skill. ## Installation @@ -81,18 +86,37 @@ The search skills require Python packages: pip install arxiv requests beautifulsoup4 ``` -### From the marketplace +> **Note**: `npx skills` only copies `SKILL.md` files, Python scripts, and reference files into your agent's skills folder. It does **not** install Python dependencies, register MCP servers, or create the `reaper-workspace/` directory. Install Python deps yourself with the command above; register the Codex MCP server separately if you want `--codex` (see [Optional: Multi-model AI consultation](#optional-multi-model-ai-consultation) below); the workspace directory is created automatically the first time the pipeline runs. + +### Install via `npx skills` (recommended — works on 45+ agents) + +Reaper is distributed as standard `SKILL.md` folders. The cross-agent installer [`vercel-labs/skills`](https://github.com/vercel-labs/skills) shallow-clones this repository and copies all 11 skill directories — including Python scripts and reference files — into your agent's conventional skills folder. 
+ +```bash +# Latest from the default branch +npx skills add SebastianElvis/reaper + +# Pinned to a specific release (recommended for reproducibility) +npx skills add SebastianElvis/reaper#v0.3.8 + +# Install into a specific agent (defaults to all detected) +npx skills add SebastianElvis/reaper --agent cursor +``` + +Supported targets include Cursor, OpenAI Codex CLI, Cline, Continue, Gemini CLI, GitHub Copilot, Windsurf, OpenCode, Warp, Goose, Replit, Claude Code, and a `universal` target at `.agents/skills/`. See `npx skills list-agents` for the full list. + +> **Reminder**: `npx skills add` copies files only. Python deps and MCP server registration are separate steps — see Prerequisites above and [Optional: Multi-model AI consultation](#optional-multi-model-ai-consultation) below. -Add the marketplace and install the plugin: +### Install on Claude Code (as a plugin) + +Claude Code can also consume Reaper via its native plugin marketplace mechanism, which bundles the same skills with slash-command routing: ``` /plugin marketplace add SebastianElvis/reaper /plugin install reaper@SebastianElvis-reaper ``` -### Manual installation via Git - -Clone the repository and add it as a local marketplace: +Or clone and add as a local marketplace: ```bash git clone https://github.com/SebastianElvis/reaper.git @@ -100,20 +124,18 @@ git clone https://github.com/SebastianElvis/reaper.git /plugin install reaper@reaper ``` -Or manually copy skills into your Claude configuration: +See the [Claude Code plugin docs](https://code.claude.com/docs/en/discover-plugins) for more details. -```bash -# Clone the repository -git clone https://github.com/SebastianElvis/reaper.git +### Invocation across hosts -# Global installation (available in all projects) -cp -r reaper/skills/* ~/.claude/skills/ +- **Slash-command hosts** (Claude Code): `/reaper "<goal>"`, `/reaper:analyze-paper <paper.pdf>`, etc. The `/<plugin>:<skill>` routing is built into the host. 
+- **Auto-discovery hosts** (Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, …): the agent loads `SKILL.md` files from its skills folder and invokes them by name when the task matches the skill's `description`. Ask the agent to run the skill, e.g. *"use the reaper skill to research X"*. +- **Manual invocation**: any host can be pointed at a specific `SKILL.md` if its native discovery doesn't pick it up. -# Or project-level installation (available in current project only) -cp -r reaper/skills/* ./.claude/skills/ -``` +A few skill features are host-specific: -See the [Claude Code plugin docs](https://code.claude.com/docs/en/discover-plugins) for more details on installing plugins. +- The `--codex` flag enables external-model consultation via MCP. It currently requires a host with MCP support (Claude Code, OpenCode, etc.) and silently falls back to self-review elsewhere. +- Frontmatter keys `user-invocable`, `argument-hint`, hooks, and `context: fork` are Claude-Code-specific. They are preserved in the SKILL.md files but no-op on other hosts. 
### Optional: Multi-model AI consultation @@ -123,18 +145,18 @@ Pass `--codex` to enable pipeline-wide AI consultation — every skill gains a c | Model | Setup | Strength | |-------|-------|----------| -| OpenAI Codex/o3 | `claude mcp add codex-cli -- npx -y codex-mcp-server` | Adversarial review, stress-testing arguments | +| OpenAI Codex/o3 | Register `codex-mcp-server` in your host's MCP config | Adversarial review, stress-testing arguments | | Google Gemini | *(coming soon)* | Long-context review across full paper corpora | | DeepSeek R1 | *(coming soon)* | Proof checking, formal reasoning | | Local models | *(coming soon — via ollama)* | Offline/private use, cost control | -To start with Codex (the currently supported backend), register the [codex-mcp-server](https://github.com/tuannvm/codex-mcp-server): +Example registration on Claude Code: ```bash claude mcp add codex-cli -- npx -y codex-mcp-server ``` -If no model backends are configured, AI consultation is silently skipped and the pipeline continues with self-review only. +Other MCP-capable hosts use their own equivalent registration. If no model backends are configured, AI consultation is silently skipped and the pipeline continues with self-review only. ## Workspace @@ -162,6 +184,8 @@ reaper-workspace/ └── report.md # Final synthesized output ``` +The workspace contract is host-agnostic — any agent that can read and write files in the working directory produces the same workspace structure. + ## Methodology Reaper's research loop follows six principles: @@ -181,8 +205,8 @@ See [`dev/ROADMAP.md`](dev/ROADMAP.md) for the full roadmap. 
- **Horizon 1 (The Pipeline)**: Core skills, orchestrator, and eval framework — *complete; LaTeX report output planned* - **Horizon 2 (The Library)**: arXiv/ePrint search via Python scripts + citation graph — *complete* -- **Horizon 3 (The Committee)**: Multi-model critique via `/reaper:critique --codex` — *Codex complete, Gemini/DeepSeek/local planned* -- **Horizon 3.5 (The Polyglot)**: Platform portability — run Reaper on Codex CLI, Gemini CLI, [OpenClaw](https://openclaw.ai/) — *planned* +- **Horizon 3 (The Committee)**: Multi-model critique via the `/critique` skill's `--codex` mode — *Codex complete, Gemini/DeepSeek/local planned* +- **Horizon 3.5 (The Polyglot)**: Cross-agent distribution via `npx skills` and host-agnostic skill prose — *complete; per-host orchestration polish ongoing* - **Horizon 4 (The Academy)**: Broader topic search (Scholar/DBLP), author-centric and venue-centric search — *planned* - **Horizon 5 (The Apprentice)**: Evidence quality taxonomy, evidence-aware critique — *planned* - **Horizon 6 (The Examiner)**: Proactive reformulation trigger, claim provenance, formal verification — *planned* @@ -197,6 +221,7 @@ Reaper's methodology draws from the following sources: - **[Simon Peyton Jones, "How to Write a Great Research Paper"](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/How-to-write-a-great-research-paper.pdf)** — Writing as a primary mechanism for doing research, not just reporting it. Shapes the report structure: one clear "ping," explicit refutable contributions, examples before generality. - **[S. Keshav, "How to Read a Paper" (ACM SIGCOMM CCR, 2007)](http://ccr.sigcomm.org/online/files/p83-keshavA.pdf)** — The three-pass method for reading papers: first pass for the big picture, second pass to grasp content without details, third pass to virtually re-derive the work and challenge every assumption. Structures how the literature review skill reads downloaded papers at increasing depth. 
- **[Mathew Stiller-Reeve, "How to Write a Thorough Peer Review" (Nature, 2018)](https://www.nature.com/articles/d41586-018-06991-0)** — Three-reading review method (aims → scientific substance → presentation) and the mirror technique. Structures the per-paper notes in the literature review skill: mirror the paper's claims, classify issues as major/minor/fatal, evaluate whether conclusions answer the introduction's questions. +- **[`vercel-labs/skills`](https://github.com/vercel-labs/skills)** — The cross-agent skills convention and CLI installer that makes Reaper portable across 45+ AI coding agents. ## License diff --git a/dev/ROADMAP.md b/dev/ROADMAP.md index e264933..57e1511 100644 --- a/dev/ROADMAP.md +++ b/dev/ROADMAP.md @@ -42,11 +42,11 @@ The AI and the human have non-overlapping territories: Before investigation begins, the problem must be precisely defined with concrete evaluation criteria. This happens in two stages: -**Stage 0: Clarify the goal.** Before any analysis begins, `clarify-goal` quick-scans the paper and asks the user 3-5 targeted clarifying questions about scope, assumptions, and success criteria. This produces `notes/clarified-goal.md` which grounds all downstream skills. If the goal is already precise, this step proceeds without questions. +**Stage 0: Clarify the goal.** Before any analysis begins, `/clarify-goal` quick-scans the paper and asks the user 3-5 targeted clarifying questions about scope, assumptions, and success criteria. This produces `notes/clarified-goal.md` which grounds all downstream skills. If the goal is already precise, this step proceeds without questions. -**Stage 1: Establish baseline.** Before formalizing the problem, `analyze-paper` and `review-literature` establish what is already known — the paper's claims, existing approaches, and the state of the art. This grounds the investigation in reality rather than starting from a vacuum. 
+**Stage 1: Establish baseline.** Before formalizing the problem, `/analyze-paper` and `/review-literature` establish what is already known — the paper's claims, existing approaches, and the state of the art. This grounds the investigation in reality rather than starting from a vacuum. -**Stage 2: Formalize the problem.** `formalize-problem` then produces a problem statement in `notes/problem-statement.md` containing: +**Stage 2: Formalize the problem.** `/formalize-problem` then produces a problem statement in `notes/problem-statement.md` containing: - **Trust assumptions**: Every dimension pinned down unambiguously — communication, timing, PKI/setup, corruption (timing, power, bound), computation, composition, cryptographic hardness, protocol-specific assumptions. A hypothesis without fully specified trust assumptions is rejected. - **Security properties**: What must hold, stated as formal predicates, game-based definitions, simulation-based definitions, or precise references to existing definitions. Informal descriptions like "safety" or "liveness" without formal definitions are not acceptable. @@ -96,7 +96,7 @@ Run all N cycles without asking "should I continue?" The only valid early stop i 3. Re-read `notes/results.md`. Can two "discard" results be combined into something useful? 4. Search for related work you haven't found yet. 5. Try a radically different approach to the same hypothesis. -6. If all execution-level tactics are exhausted, log the cycle as inconclusive and continue. The orchestrator will call `brainstorm` after the batch to generate new ideas (applying Hamming inversion, Qian's patterns, gap-finding) based on the pattern of failures. +6. If all execution-level tactics are exhausted, log the cycle as inconclusive and continue. The orchestrator will call `/brainstorm` after the batch to generate new ideas (applying Hamming inversion, Qian's patterns, gap-finding) based on the pattern of failures. 
Uncertainty about whether the human wants you to continue is *never* a reason to stop. The human will interrupt when they want you to stop. @@ -125,18 +125,18 @@ When evaluating whether a cycle produced progress, weight clarity and elegance a reaper/ ├── skills/ │ ├── reaper/SKILL.md # Orchestrator — composes the pipeline -│ ├── clarify-goal/SKILL.md # /reaper:clarify-goal -│ ├── analyze-paper/SKILL.md # /reaper:analyze-paper -│ ├── review-literature/SKILL.md # /reaper:review-literature -│ ├── formalize-problem/SKILL.md # /reaper:formalize-problem -│ ├── brainstorm/SKILL.md # /reaper:brainstorm (recurring ideation) -│ ├── investigate/SKILL.md # /reaper:investigate (proof/analysis cycles) -│ ├── critique/SKILL.md # /reaper:critique (human/Codex/self review) -│ ├── synthesize/SKILL.md # /reaper:synthesize (report generation) -│ ├── search-arxiv/ # /reaper:search-arxiv +│ ├── clarify-goal/SKILL.md # Stage 0: clarify the goal +│ ├── analyze-paper/SKILL.md # Stage 1a: paper analysis +│ ├── review-literature/SKILL.md # Stage 1b: literature review +│ ├── formalize-problem/SKILL.md # Stage 2: formalize the problem +│ ├── brainstorm/SKILL.md # Stage 2.5: recurring ideation +│ ├── investigate/SKILL.md # Stage 3: investigate (proof/analysis cycles) +│ ├── critique/SKILL.md # Stage 3 sub-step: human / external-model / self review +│ ├── synthesize/SKILL.md # Stage 4: synthesize (report generation) +│ ├── search-arxiv/ # Crypto/CS topic search via arXiv │ │ ├── SKILL.md │ │ └── search_arxiv.py # arXiv API + Semantic Scholar citations -│ └── search-iacr/ # /reaper:search-iacr +│ └── search-iacr/ # Crypto-specific IACR ePrint search │ ├── SKILL.md │ └── search_iacr.py # IACR ePrint scraper ├── tests/ # Python tests @@ -145,13 +145,18 @@ reaper/ │ └── test-papers/ # Papers for testing ├── evals/ │ └── evals.json # Test cases for skill evaluation -├── .claude-plugin/plugin.json # Plugin metadata +├── .claude-plugin/ # Claude-Code-specific plugin manifest (other hosts ignore) +│ ├── 
plugin.json +│ └── marketplace.json +├── .github/workflows/ # CI: pytest + npx-skills discovery check └── README.md ``` +Skills are distributed primarily via [`vercel-labs/skills`](https://github.com/vercel-labs/skills) (`npx skills add SebastianElvis/reaper`) and run on any host that supports the `SKILL.md` convention. + ### User's Workspace (generated at runtime) -When a user invokes `/reaper`, this structure is created in their working directory: +When a user invokes the `/reaper` skill (by name on auto-discovery hosts, as `/reaper` on slash-command hosts), this structure is created in their working directory: ``` reaper-workspace/ @@ -167,10 +172,7 @@ reaper-workspace/ │ └── NNN-/ # One directory per hypothesis ├── feedbacks/ # Append-only — one file per event, never modified ├── logs/ # Append-only — one file per event, never modified -└── report/ # Final synthesized output (LaTeX project) - ├── main.tex # Paper source — compilable with latexmk - ├── references.bib # BibTeX entries from literature review - └── Makefile # `make` → PDF via latexmk +└── report.md # Final synthesized output (Markdown report) — written by `/synthesize`. A future horizon may add a compilable LaTeX project (see ROADMAP horizons for that direction). ``` --- @@ -197,30 +199,30 @@ H6 The Examiner: + reformulation + (s **Goal:** Build the full research pipeline as composable sub-skills that each map 1:1 to a methodology stage. Each skill is independently useful, has a clear file contract, and can be composed by the orchestrator with subagent parallelism. Literature search uses WebSearch initially (MCP servers come in H2). 
-**What success looks like:** `/reaper paper.pdf "check if the security proof in Section 4 holds under asynchrony"` produces a workspace with: +**What success looks like:** invoking the `/reaper` skill with `"check if the security proof in Section 4 holds under asynchrony" paper.pdf` produces a workspace with: - `notes/problem-statement.md` containing a precise problem statement (trust assumptions, security properties, performance goals) - `notes/ideas.md` containing the research ideas/hypotheses and their resolution status - `notes/results.md` showing cycle-by-cycle progression with keep/discard decisions - `notes/current-understanding.md` with the accumulated findings -- `report/` containing a compilable LaTeX project that a researcher could submit to a venue +- `report.md` containing a synthesized Markdown report that a researcher could review (a compilable LaTeX project is a future horizon, not H1) -And each skill works standalone: `/reaper:analyze-paper paper.pdf` for just a structured summary, `/reaper:formalize-problem` for just a problem statement, etc. +And each skill works standalone: invoke `/analyze-paper paper.pdf` for just a structured summary, `/formalize-problem` for just a problem statement, etc. 
#### Skills and File Contracts | Skill | Methodology Stage | Reads | Writes | |-------|------------------|-------|--------| -| `/reaper:clarify-goal` | Stage 0: Clarify | Input paper, goal prompt | `notes/clarified-goal.md` (refined goal, scope, assumptions, Q&A) | -| `/reaper:analyze-paper` | Stage 1a: Baseline (paper) | Input paper | `notes/paper-summary.md` | -| `/reaper:review-literature` | Stage 1b: Baseline (literature) | `notes/clarified-goal.md`, `notes/paper-summary.md` | `notes/literature.md` | -| `/reaper:formalize-problem` | Stage 2: Formalize | `notes/clarified-goal.md`, `notes/paper-summary.md`, `notes/literature.md`, goal prompt | `notes/problem-statement.md` (trust assumptions, security properties, performance goals), `notes/ideas.md` (initial ideas) | -| `/reaper:brainstorm` | Stage 2.5: Recurring ideation | `notes/problem-statement.md`, `notes/ideas.md`, `notes/current-understanding.md`, `notes/results.md`, `notes/literature.md`, `notes/paper-summary.md` | Updates `notes/ideas.md` (adds new, edits existing inline) | -| `/reaper:investigate` | Stage 3: Investigate (one cycle) | `notes/problem-statement.md`, `notes/ideas.md`, `notes/current-understanding.md` | `investigations/NNN-/` (reuses on revisit), updates `notes/results.md` inline, edits `current-understanding.md` on keep | -| `/reaper:critique` | Stage 3 sub-step: review | `investigations/`, `notes/current-understanding.md`, `notes/ideas.md` | `feedbacks/`, `logs/`, may add hypotheses to `notes/ideas.md` | -| `/reaper:synthesize` | Stage 4: Synthesize | All `notes/`, `investigations/`, `notes/results.md` | `report/` (LaTeX project: `main.tex`, `references.bib`, `Makefile`) | +| `/clarify-goal` | Stage 0: Clarify | Input paper, goal prompt | `notes/clarified-goal.md` (refined goal, scope, assumptions, Q&A) | +| `/analyze-paper` | Stage 1a: Baseline (paper) | Input paper | `notes/paper-summary.md` | +| `/review-literature` | Stage 1b: Baseline (literature) | `notes/clarified-goal.md`, 
`notes/paper-summary.md` | `notes/literature.md` | +| `/formalize-problem` | Stage 2: Formalize | `notes/clarified-goal.md`, `notes/paper-summary.md`, `notes/literature.md`, goal prompt | `notes/problem-statement.md` (trust assumptions, security properties, performance goals), `notes/ideas.md` (initial ideas) | +| `/brainstorm` | Stage 2.5: Recurring ideation | `notes/problem-statement.md`, `notes/ideas.md`, `notes/current-understanding.md`, `notes/results.md`, `notes/literature.md`, `notes/paper-summary.md` | Updates `notes/ideas.md` (adds new, edits existing inline) | +| `/investigate` | Stage 3: Investigate (one cycle) | `notes/problem-statement.md`, `notes/ideas.md`, `notes/current-understanding.md` | `investigations/NNN-/` (reuses on revisit), updates `notes/results.md` inline, edits `current-understanding.md` on keep | +| `/critique` | Stage 3 sub-step: review | `investigations/`, `notes/current-understanding.md`, `notes/ideas.md` | `feedbacks/`, `logs/`, may add hypotheses to `notes/ideas.md` | +| `/synthesize` | Stage 4: Synthesize | All `notes/`, `investigations/`, `notes/results.md` | `report.md` (synthesized Markdown report; a future horizon may add a compilable LaTeX project) | | `/reaper` | Orchestrator | Paper + goal prompt | Full workspace | -**`synthesize` report structure** (following Peyton Jones): +**`/synthesize` report structure** (following Peyton Jones): - **One "ping"**: The report must have one clear, central finding stated upfront. If the research yielded multiple findings, the report must still identify the single most important one. - **Explicit, refutable contributions**: A bulleted list of specific claims, each concrete enough that a reader could disagree. Not "we analyze protocol X" but "we show that claim Y fails because Z." - **Examples before generality**: Introduce findings with a concrete example (a specific execution trace, a specific adversary strategy) before presenting the general argument. 
@@ -228,35 +230,35 @@ And each skill works standalone: `/reaper:analyze-paper paper.pdf` for just a st #### Subagent Parallelism -- **Orchestrator**: Run `analyze-paper` and `review-literature` as parallel subagents (Stage 1a and 1b are independent) -- **`review-literature`**: Spawn parallel subagents to search different sources simultaneously, then merge results -- **`investigate`**: When multiple independent hypotheses exist, spawn parallel subagents to explore them concurrently +- **Orchestrator**: Run `/analyze-paper` and `/review-literature` as parallel subagents (Stage 1a and 1b are independent) +- **`/review-literature`**: Spawn parallel subagents to search different sources simultaneously, then merge results +- **`/investigate`**: When multiple independent hypotheses exist, spawn parallel subagents to explore them concurrently #### Tasks - [x] Write `references/methodology.md` (proof verification, security analysis, protocol extension, comparison, counterexample patterns) - [x] Write `references/paper-analysis.md` (extraction guide for crypto/distributed systems/blockchain papers) - [x] Define the workspace file contract between skills (the table above, formalized) -- [x] Build `/reaper:analyze-paper`; test independently -- [x] Build `/reaper:review-literature` (WebSearch only for now); test independently -- [x] Build `/reaper:formalize-problem`; test that it produces trust assumptions + security properties + performance goals -- [x] Build `/reaper:investigate` with full loop discipline: +- [x] Build `/analyze-paper` skill; test independently +- [x] Build `/review-literature` skill (WebSearch only for now); test independently +- [x] Build `/formalize-problem` skill; test that it produces trust assumptions + security properties + performance goals +- [x] Build `/investigate` skill with full loop discipline: - `notes/results.md` structured log with keep/discard per cycle (Principle 3) - `current-understanding.md` that only advances on keep (Principle 4) - 
Never-stop and when-stuck guidance (Principle 5) - Simplicity criterion for evaluating cycles (Principle 6) -- [x] Build `/reaper:synthesize`; test independently +- [x] Build `/synthesize` skill; test independently - [x] Build the `/reaper` orchestrator that composes them with subagent parallelism - [x] Create eval framework (`evals/evals.json`) with test cases and quality criteria - [x] Create test paper specifications (`dev/test-papers/README.md`) - [x] Tune skill descriptions for reliable triggering (added action verbs, specific outputs, broader trigger phrases) -- [ ] Upgrade `synthesize` to produce a compilable LaTeX project instead of markdown: +- [ ] Upgrade `/synthesize` to produce a compilable LaTeX project instead of markdown: - [ ] Change output from `report.md` to `report/` directory (`main.tex`, `references.bib`, `Makefile`) - [ ] Use `article` class by default; support venue-specific document classes (`llncs`, `acmart`, `IEEEtran`) selectable via clarified goal or user argument - [ ] Map existing paper structure to LaTeX: `\begin{definition}`, `\begin{lemma}`, `\begin{theorem}`, `\begin{proof}` via `amsthm`; `\begin{conjecture}` for unproven claims - [ ] Generate `references.bib` from `notes/literature.md` entries with `\cite{}` references in the body - [ ] Include a `Makefile` that runs `latexmk -pdf main.tex` so `make` produces a PDF - - [ ] Update the `synthesize` SKILL.md template: replace the markdown template with LaTeX equivalents + - [ ] Update the `/synthesize` SKILL.md template: replace the markdown template with LaTeX equivalents - [ ] Ensure the orchestrator and other skills that reference `report.md` (e.g., critique reading the report) are updated to read `report/main.tex` - [ ] Test: does `make` in `report/` produce a valid PDF without manual fixes? 
- [ ] Test full pipeline end-to-end with 3 real papers: @@ -269,9 +271,9 @@ And each skill works standalone: `/reaper:analyze-paper paper.pdf` for just a st **Methodology stage:** Enriches Stage 1b (establish baseline from literature) with real academic paper servers. -**Goal:** Upgrade `review-literature` from generic web search to structured academic search — arXiv, IACR ePrint, citation graph traversal — using lightweight Python scripts (no MCP dependency). Also enable `investigate` to pull in new references mid-loop when a cycle reveals a gap in context. +**Goal:** Upgrade `/review-literature` from generic web search to structured academic search — arXiv, IACR ePrint, citation graph traversal — using lightweight Python scripts (no MCP dependency). Also enable `/investigate` to pull in new references mid-loop when a cycle reveals a gap in context. -**What success looks like:** `/reaper:review-literature "post-quantum threshold signatures"` automatically searches arXiv and IACR ePrint, traces forward/backward citations via Semantic Scholar, and produces a structured literature survey with precise references. +**What success looks like:** invoking the `/review-literature` skill with `"post-quantum threshold signatures"` automatically searches arXiv and IACR ePrint, traces forward/backward citations via Semantic Scholar, and produces a structured literature survey with precise references. 
#### Search Tools @@ -282,11 +284,11 @@ And each skill works standalone: `/reaper:analyze-paper paper.pdf` for just a st #### Tasks -- [x] Build `search-arxiv` skill with Python script (arXiv API + Semantic Scholar citations) -- [x] Build `search-iacr` skill with Python script (IACR ePrint scraper) +- [x] Build `/search-arxiv` skill with Python script (arXiv API + Semantic Scholar citations) +- [x] Build `/search-iacr` skill with Python script (IACR ePrint scraper) - [x] Write `references/search-tools.md` — catalog of search tools with usage patterns and decision tree -- [x] Update `review-literature` skill: structured search as primary, WebSearch as fallback, citation graph, recent papers -- [x] Update `investigate` skill: mid-cycle literature search via search scripts +- [x] Update `/review-literature` skill: structured search as primary, WebSearch as fallback, citation graph, recent papers +- [x] Update `/investigate` skill: mid-cycle literature search via search scripts - [x] Handle graceful degradation when search scripts are unavailable - [x] Document Python prerequisites in README - [ ] Test: given a seed paper, can Reaper find and summarize the 10 most relevant related works? @@ -297,12 +299,12 @@ And each skill works standalone: `/reaper:analyze-paper paper.pdf` for just a st **Goal:** After investigation cycles, get external critique via human feedback, multi-model consultation, or self-review — finding flaws, suggesting alternative approaches, sanity-checking conclusions. This compensates for the lack of an objective numeric metric by adding independent perspectives, analogous to peer review in human research. Different models catch different classes of errors: formal reasoning models (DeepSeek R1) excel at proof checking, long-context models (Gemini) can review entire paper corpora, and adversarial reasoners (o3) stress-test arguments. 
-**What success looks like:** Reaper sends its proof that "protocol X is insecure under asynchrony" to multiple models as devil's advocates. DeepSeek R1 catches a gap in the formal reduction. Gemini, having ingested the full paper corpus, points out a related construction the literature review missed. The critique triggers additional investigation cycles, the flaws are addressed, and the final report includes the corrections. Human users can also inject feedback at any point via `/reaper:critique "your analysis misses the abort case"`. When no API keys are configured, the pipeline degrades gracefully to self-review only. +**What success looks like:** Reaper sends its proof that "protocol X is insecure under asynchrony" to multiple models as devil's advocates. DeepSeek R1 catches a gap in the formal reduction. Gemini, having ingested the full paper corpus, points out a related construction the literature review missed. The critique triggers additional investigation cycles, the flaws are addressed, and the final report includes the corrections. Human users can also inject feedback at any point by invoking the `/critique` skill with `"your analysis misses the abort case"`. When no API keys are configured, the pipeline degrades gracefully to self-review only. 
#### Architecture ``` -investigate ──> /reaper:critique +investigate ──> critique │ ┌───────────┼───────────────────────┐ ▼ ▼ ▼ @@ -334,9 +336,9 @@ investigate ──> /reaper:critique | Skill | Methodology Stage | Reads | Writes | |-------|------------------|-------|--------| -| `/reaper:critique` | Stage 3 sub-step: adversarial review | `investigations/`, `notes/current-understanding.md` | `feedbacks/*.md`, `logs/*.md`, may add hypotheses | +| `/critique` | Stage 3 sub-step: adversarial review | `investigations/`, `notes/current-understanding.md` | `feedbacks/*.md`, `logs/*.md`, may add hypotheses | -The original `cross-verify` concept was implemented as the more general `/reaper:critique` skill, which supports three modes: human feedback, Codex MCP consultation (devil's advocate / inspiration), and self-review. +The original `cross-verify` concept was implemented as the more general `/critique` skill, which supports three modes: human feedback, external-model MCP consultation (devil's advocate / inspiration), and self-review. #### Model Backends @@ -375,15 +377,15 @@ Different models have different strengths. 
The critique skill should route consu #### Tasks -- [x] Build `/reaper:critique` skill (replaces planned `cross-verify`): +- [x] Build `/critique` skill (replaces planned `cross-verify`): - Human feedback mode: user provides critique text - - Codex consultation mode: alternates devil's advocate / inspiration via MCP + - External-model consultation mode: alternates devil's advocate / inspiration via MCP - Self-review mode: self-critique of current findings - Can trigger additional investigation cycles - [x] Define feedback prompt templates (devil's advocate, alternative approach) - [x] Integrate critique into the orchestrator (investigate ↔ critique loop) - [x] Apply keep-or-discard to external feedback -- [x] Document codex-mcp-server setup in README +- [x] Document MCP-server setup in README (Codex MCP server as the first integrated backend) - [ ] Generalize critique skill's Codex-specific protocol to a model-agnostic consultation protocol (model name as parameter, not hardcoded) - [ ] Add Gemini backend (MCP or API wrapper) for long-context review - [ ] Add DeepSeek R1 backend (API wrapper) for formal reasoning / proof checking @@ -395,69 +397,66 @@ Different models have different strengths. The critique skill should route consu - [ ] Test: does multi-model feedback catch errors that single-model analysis misses? - [ ] Test: does routing (proof→DeepSeek, adversarial→o3, breadth→Gemini) outperform uniform model selection? -### Horizon 3.5: The Polyglot (Platform Portability) +### Horizon 3.5: The Polyglot (Cross-Agent Distribution) **Methodology stage:** All stages — makes the entire pipeline portable across AI agent platforms. -**Current state:** Reaper is a Claude Code plugin. Skills are defined as SKILL.md files with Claude Code frontmatter, invoked via `/reaper:*` slash commands, and depend on Claude Code's tool set (Agent, Bash, Read, Write, WebSearch, Glob, Grep). 
The workspace file contract (`reaper-workspace/`) and the methodology are platform-agnostic, but the skill format is not. - -**Goal:** Abstract Reaper's skill definitions so the pipeline can run on other AI coding agent platforms — starting with those closest in architecture (Codex CLI, Gemini CLI), then expanding to IDE-based and autonomous agents. The workspace file contract is already portable (just files); the work is in adapting skill invocation, tool usage, and orchestration to each platform's conventions. - -**What success looks like:** A researcher using Codex CLI can run the Reaper pipeline with comparable quality to Claude Code. The same research goal, same seed paper, same workspace output format — different execution engine. Skills are authored once and transpiled/adapted per platform. - -#### Target Platforms - -| Platform | Plugin Format | Tool Equivalents | Feasibility | Priority | -|----------|--------------|-----------------|-------------|----------| -| **Claude Code** | `.claude-plugin/` + SKILL.md | Agent, Bash, Read, Write, WebSearch | Current | — | -| **OpenAI Codex CLI** | Agent instructions + sandbox | `shell`, file I/O, `web_search` | High | 1st | -| **Gemini CLI** | GEMINI.md + extensions | Bash, file tools, search | High | 2nd | -| **OpenClaw** | Skills via ClawHub | Shell, file I/O, browser, persistent memory | High | 3rd | -| **Cline / Cursor** | Rules files + tool access | IDE file tools, terminal, browser | Low | Later | - -**Design notes:** -- The **workspace file contract** is the portability anchor. All platforms can read/write files. As long as skills produce the same workspace structure, the methodology works regardless of execution engine. -- **Tool mapping** is the main challenge. Each platform has different names and capabilities for the same operations (e.g., Claude's `Agent` subagent spawning vs. Codex's sandbox forking). A mapping table per platform defines equivalents. -- **Orchestration** is the hardest part. 
Claude Code's Agent tool enables parallel subagent spawning (e.g., investigate multiple hypotheses concurrently). Platforms without native parallelism fall back to sequential execution — slower but functionally identical. -- **Skill transpilation** vs. **native authoring**: Start with manual adaptation (write platform-specific skill files for Codex CLI), then generalize if patterns emerge. Premature abstraction (a universal skill DSL) is worse than two well-adapted implementations. - -#### Architecture - -``` -skills/ # Canonical skill definitions (Claude Code format) -├── reaper/SKILL.md -├── analyze-paper/SKILL.md -├── ... -│ -adapters/ # Platform-specific adaptations -├── codex-cli/ -│ ├── README.md # Setup instructions for Codex CLI -│ ├── agent-instructions.md # Codex CLI agent config (equivalent to orchestrator SKILL.md) -│ └── tool-mapping.md # Claude tools → Codex equivalents -├── gemini-cli/ -│ ├── README.md -│ ├── GEMINI.md # Gemini CLI config -│ └── tool-mapping.md -├── openclaw/ -│ ├── README.md # Setup instructions for OpenClaw -│ ├── skill-config.md # OpenClaw skill definition (ClawHub format) -│ └── tool-mapping.md # Claude tools → OpenClaw equivalents -└── ... -``` +**Current state:** Reaper ships as standard `SKILL.md` folders compatible with the [`vercel-labs/skills`](https://github.com/vercel-labs/skills) convention. A single `npx skills add SebastianElvis/reaper` shallow-clones the repo and copies all 11 skill directories into the host agent's conventional skills folder, supporting 45+ targets including Cursor, OpenAI Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, OpenCode, Warp, Goose, Replit, and Claude Code. The orchestrator and inter-skill triggers use host-agnostic phrasing ("invoke the `<skill-name>` skill") so that auto-discovery agents and slash-command agents both route correctly. + +**Goal:** Make Reaper a first-class skills package — authored once, runnable on any host that consumes `SKILL.md` files. 
Achieved by converging on the `SKILL.md` convention rather than building per-host adapters. + +**What success looks like:** A researcher on any supported host (Cursor, Codex CLI, Gemini CLI, Cline, Continue, Copilot, Windsurf, OpenCode, Warp, Goose, Claude Code, …) can install Reaper with one command and run the same pipeline against the same workspace contract — the only thing that varies between hosts is the surface invocation form. No per-host translation layer required. + +#### Distribution Mechanism + +| Component | Mechanism | Status | +|-----------|-----------|--------| +| **Skill format** | `SKILL.md` with `name` + `description` frontmatter (vercel-labs/skills spec) | ✓ | +| **Cross-agent installer** | `npx skills add SebastianElvis/reaper` | ✓ | +| **Pin syntax** | `npx skills add SebastianElvis/reaper#v0.3.8` (git tags) | ✓ | +| **Inter-skill calls** | Host-agnostic prose ("invoke the `` skill") | ✓ | +| **Python script bundling** | Whole-directory copy includes `search_arxiv.py`, `search_iacr.py`, `references/` | ✓ | +| **Frontmatter compatibility** | Claude-only keys (`user-invocable`, `argument-hint`, hooks) preserved as opaque YAML, no-op on other hosts | ✓ | +| **CI validation** | Frontmatter regex check + strict `npx skills add` discovery test (verifies every expected skill, Python script, and reference file is present after install; fails the build if any asset is missing) | ✓ | +| **Claude Code plugin path** | `.claude-plugin/marketplace.json` for slash-command routing | ✓ | + +#### Host Compatibility + +| Host | Skills folder | Discovery model | Notes | +|------|--------------|-----------------|-------| +| Claude Code | `.claude/skills/` | Slash commands (`/reaper:`) + Skill tool | Native plugin path also available via `.claude-plugin/marketplace.json` | +| Cursor | `.agents/skills/` (universal) | Auto-route by `description` match | | +| OpenAI Codex CLI | `.agents/skills/` (universal) | Auto-route by `description` match | | +| Cline | 
`.agents/skills/` (universal) | Auto-route by `description` match | | +| Continue | `.continue/skills/` | Auto-route by `description` match | | +| Gemini CLI | `.agents/skills/` (universal) | Auto-route by `description` match | | +| GitHub Copilot | `.agents/skills/` (universal) | Auto-route by `description` match | | +| Windsurf | `.windsurf/skills/` | Auto-route by `description` match | | +| OpenCode | `.agents/skills/` (universal) | Auto-route by `description` match | MCP support enables `--codex` mode | +| Warp | `.agents/skills/` (universal) | Auto-route by `description` match | | +| Goose | `.goose/skills/` | Auto-route by `description` match | | +| Replit | `.agents/skills/` (universal) | Auto-route by `description` match | | +| Universal target | `.agents/skills/` | Manual invocation | Fallback for hosts without auto-discovery | + +#### Known Caveats + +- The `--codex` flag depends on a host with MCP support and a registered Codex MCP server. Non-MCP hosts silently fall back to self-review. +- Auto-discovery quality varies by host. Reliable routing depends on the skill's `description` matching the user's request — Reaper's descriptions are tuned for action-verb match (e.g. "Run the full Reaper research pipeline…") to improve auto-routing. +- Python dependencies (`arxiv`, `requests`, `beautifulsoup4`) are not installed by `npx skills` — users must `pip install` separately. The skill prose tells the agent to do this if missing. +- Sub-skill `Usage` blocks now lead with the bare-name form and show `/reaper:` as a slash-command-host example only. 
#### Tasks -- [ ] Define platform-neutral skill interface spec (inputs, outputs, file contracts, required tool capabilities) -- [ ] Create tool mapping table: Claude Code tools → equivalents for each target platform -- [ ] Build Codex CLI adapter: translate orchestrator + key skills (analyze-paper, investigate, critique, synthesize) to Codex CLI format -- [ ] Build Gemini CLI adapter: translate orchestrator + key skills to Gemini CLI format -- [ ] Build OpenClaw adapter: translate skills to ClawHub skill format, leverage OpenClaw's persistent memory for workspace state and browser automation for paper fetching -- [ ] Document per-platform setup instructions (API keys, tool installation, workspace conventions) -- [ ] Handle platform capability gaps: graceful degradation when a platform lacks subagent parallelism or WebSearch -- [ ] Test: same research goal on Claude Code vs. Codex CLI — compare workspace output quality -- [ ] Test: same research goal on Claude Code vs. Gemini CLI — compare workspace output quality -- [ ] Test: same research goal on Claude Code vs. 
OpenClaw — compare workspace output quality -- [ ] Evaluate whether a shared skill DSL is worth building (only after ≥2 adapters exist to extract patterns from) +- [x] Adopt `vercel-labs/skills` `SKILL.md` convention as the canonical distribution format +- [x] Audit all skill frontmatter for `npx skills` parser compliance (`name` regex, `name` matches directory) +- [x] Rewrite orchestrator + critique inter-skill triggers to host-agnostic phrasing +- [x] Update sub-skill `Usage` blocks to lead with skill-name invocation, mark `/reaper:` as slash-command-host example +- [x] Document multi-host install in README (`npx skills add` as primary, Claude Code plugin as secondary) +- [x] Add CI: frontmatter validation + `npx skills add` discovery check +- [ ] Per-host smoke test: same research goal on Cursor, Codex CLI, Gemini CLI — compare workspace output quality and routing reliability +- [ ] Document `description` tuning patterns for reliable auto-routing across hosts +- [ ] Investigate listing in the skills.sh registry (used by `npx skills find`) for discoverability +- [ ] If specific hosts can't auto-discover sibling skills the orchestrator triggers, document host-specific install instructions or build a thin Python driver as a fallback orchestration path ### Horizon 4: The Academy @@ -471,7 +470,7 @@ adapters/ # Platform-specific adaptations 2. **Author-centric search** — Given a name from a paper's references or a related work, find their DBLP profile or Google Scholar page, retrieve their publication list, identify their recent focus areas, and find co-authors who work on similar problems. This is how human researchers navigate literature: "Elaine Shi has been working on this — what else has her group done?" 3. **Venue-centric search** — Given a conference (CCS, CRYPTO, PODC, S&P, EUROCRYPT), find recent proceedings, accepted papers, program committee members, and keynote topics. 
This surfaces work that topic search misses because the terminology differs across communities (e.g., "Byzantine agreement" vs. "atomic broadcast" vs. "state machine replication"). -**What success looks like:** Given a paper on BFT consensus, `review-literature` automatically: +**What success looks like:** Given a paper on BFT consensus, `/review-literature` automatically: - Searches arXiv + ePrint (existing) for direct topic matches - Searches Google Scholar and DBLP for broader CS coverage - Identifies key authors from initial results, retrieves their recent publications @@ -495,7 +494,7 @@ Following H2's pattern: lightweight Python scripts, JSON output, graceful degrad #### Search Strategies -The `review-literature` skill currently does topic-centric search only. With the new tools, it should orchestrate three search strategies in parallel: +The `/review-literature` skill currently does topic-centric search only. With the new tools, it should orchestrate three search strategies in parallel: | Strategy | When to use | Tools | |----------|------------|-------| @@ -503,15 +502,15 @@ The `review-literature` skill currently does topic-centric search only. With the | **Author search** | When initial results identify recurring authors (≥2 papers by the same group) | DBLP author lookup, Google Scholar profile | | **Venue search** | When the paper targets a specific venue or subfield (identified from paper metadata or clarified goal) | DBLP venue proceedings, conference website | -The `investigate` skill's mid-cycle search should also gain access to author and venue search — "this proof technique was introduced by [author], what else have they published on this?" is a common mid-investigation need. +The `/investigate` skill's mid-cycle search should also gain access to author and venue search — "this proof technique was introduced by [author], what else have they published on this?" is a common mid-investigation need. 
#### Tasks - [ ] Build `search-dblp` skill with Python script (DBLP REST API: topic search, author publications, venue proceedings) - [ ] Build `search-scholar` skill with Python script (Google Scholar: topic search, author profiles, citation traversal) - [ ] Build `search-venue` skill with Python script (conference proceedings scraper, starting with IACR + ACM DL) -- [ ] Update `review-literature` skill: add author-centric and venue-centric search strategies alongside existing topic search -- [ ] Update `investigate` skill: mid-cycle author/venue search when a cycle reveals a key researcher or venue +- [ ] Update `/review-literature` skill: add author-centric and venue-centric search strategies alongside existing topic search +- [ ] Update `/investigate` skill: mid-cycle author/venue search when a cycle reveals a key researcher or venue - [ ] Update `references/search-tools.md`: add new tools to the catalog and decision tree - [ ] Handle graceful degradation: Google Scholar blocking, conference site structure changes - [ ] Test: given a seed paper, does the expanded search surface find relevant work that arXiv + ePrint missed? @@ -521,7 +520,7 @@ The `investigate` skill's mid-cycle search should also gain access to author and **Methodology stage:** Strengthens the evaluation signal across all stages by making the system self-aware of its evidence quality. -**Goal:** The `investigate` skill already tracks confidence (High/Medium/Low) and outcome (confirmed/refuted/inconclusive), but these are vibes — there's no framework distinguishing a formal proof from a plausible argument from a heuristic suspicion. The Apprentice adds an evidence taxonomy so Reaper is honest about *what kind* of evidence backs each claim, and enforces that weak evidence gets elevated or discarded. 
+**Goal:** The `/investigate` skill already tracks confidence (High/Medium/Low) and outcome (confirmed/refuted/inconclusive), but these are vibes — there's no framework distinguishing a formal proof from a plausible argument from a heuristic suspicion. The Apprentice adds an evidence taxonomy so Reaper is honest about *what kind* of evidence backs each claim, and enforces that weak evidence gets elevated or discarded. **Current state:** The investigate skill has confidence levels with a "default one level lower than instinct" heuristic and outcome tags (confirmed/refuted/partially-confirmed/inconclusive/new-hypothesis/reformulate). The critique skill classifies feedback into scope/deepen/explore/rewrite. Neither reasons about evidence *strength*. @@ -540,11 +539,11 @@ Every claim in `notes/results.md` and `current-understanding.md` must be tagged | **Plausible argument** | Informal but coherent reasoning | "The simulator likely cannot handle abort because it has no rewinding opportunity" | | **Heuristic suspicion** | Pattern match or intuition, not yet substantiated | "The proof structure resembles [X] which had a known flaw" | -The `investigate` skill's keep-or-discard decision should account for evidence level: a "keep" at the "heuristic suspicion" level must be followed by a cycle that attempts to elevate it. The orchestrator's adaptation signals (currently ">50% discard → brainstorm") should also consider evidence distribution — a round where most keeps are heuristic-level warrants deepening, not advancing. +The `/investigate` skill's keep-or-discard decision should account for evidence level: a "keep" at the "heuristic suspicion" level must be followed by a cycle that attempts to elevate it. The orchestrator's adaptation signals (currently ">50% discard → brainstorm") should also consider evidence distribution — a round where most keeps are heuristic-level warrants deepening, not advancing. 
#### Evidence-Aware Critique -The `critique` skill's self-review mode currently identifies "weak claims" and "untested assumptions" but has no systematic way to evaluate them. With the evidence taxonomy: +The `/critique` skill's self-review mode currently identifies "weak claims" and "untested assumptions" but has no systematic way to evaluate them. With the evidence taxonomy: - **Self-review** checks each claim's evidence level against its stated confidence. High confidence + heuristic suspicion = flag. - **Codex consultation** receives evidence levels as context, enabling more targeted devil's advocate ("this claim rests on a plausible argument — can you construct a counterexample?"). @@ -552,11 +551,11 @@ The `critique` skill's self-review mode currently identifies "weak claims" and " #### Tasks -- [ ] Define and document the evidence taxonomy (the table above, integrated into `investigate` and `critique` skills) +- [ ] Define and document the evidence taxonomy (the table above, integrated into `/investigate` and `/critique` skills) - [ ] Update `notes/results.md` format to include evidence level column alongside existing confidence and outcome -- [ ] Update `investigate` skill: tag every claim with evidence level, require elevation plan for heuristic-level keeps -- [ ] Update `critique` skill: self-review checks evidence level vs. confidence, Codex consultation includes evidence context -- [ ] Update `synthesize` skill: distinguish proven claims from conjectures in the report (leverages LaTeX theorem/conjecture environments from H1) +- [ ] Update `/investigate` skill: tag every claim with evidence level, require elevation plan for heuristic-level keeps +- [ ] Update `/critique` skill: self-review checks evidence level vs. 
confidence, Codex consultation includes evidence context +- [ ] Update `/synthesize` skill: distinguish proven claims from conjectures in the report (leverages LaTeX theorem/conjecture environments from H1) - [ ] Update orchestrator adaptation signals: factor in evidence distribution, not just keep/discard ratio - [ ] Test: does evidence tagging change keep/discard decisions compared to current behavior? @@ -566,15 +565,15 @@ The `critique` skill's self-review mode currently identifies "weak claims" and " **Goal:** Address two structural gaps: (1) the pipeline can reformulate reactively (the investigate skill already emits `outcome: reformulate` which triggers re-formalization), but has no *proactive* trigger when a pattern of failure suggests the problem statement itself is wrong; (2) claims in the final report don't link back to the investigation cycles that support them, making audit difficult. -**Current state:** The investigate skill has `outcome: reformulate` which hands control to the orchestrator to re-run `formalize-problem`. The orchestrator checks for this after each batch. But this only fires when a single cycle explicitly concludes "reformulate" — it doesn't detect the pattern of 5 consecutive inconclusive results that suggests the formalization is flawed. Separately, `synthesize` reads investigation directories selectively but doesn't generate provenance links in the report. +**Current state:** The investigate skill has `outcome: reformulate` which hands control to the orchestrator to re-run `/formalize-problem`. The orchestrator checks for this after each batch. But this only fires when a single cycle explicitly concludes "reformulate" — it doesn't detect the pattern of 5 consecutive inconclusive results that suggests the formalization is flawed. Separately, `/synthesize` reads investigation directories selectively but doesn't generate provenance links in the report. 
-**What success looks like:** After 5 consecutive inconclusive/discard cycles where no individual cycle triggered reformulation, the orchestrator proactively escalates — passing the accumulated failure evidence to `formalize-problem` for re-examination. The final report includes investigation references for every claim, so a reader can trace any finding back to its supporting reasoning. +**What success looks like:** After 5 consecutive inconclusive/discard cycles where no individual cycle triggered reformulation, the orchestrator proactively escalates — passing the accumulated failure evidence to `/formalize-problem` for re-examination. The final report includes investigation references for every claim, so a reader can trace any finding back to its supporting reasoning. #### Proactive Reformulation Trigger The existing reactive mechanism (`outcome: reformulate`) handles cases where a cycle discovers a specific flaw in the formalization. The proactive trigger handles the subtler case: persistent failure without a clear cause. -**Trigger condition:** After N consecutive discard/inconclusive results (default N=5), or when the `critique` skill flags a systematic pattern of failure, the orchestrator invokes `formalize-problem` again with: +**Trigger condition:** After N consecutive discard/inconclusive results (default N=5), or when the `/critique` skill flags a systematic pattern of failure, the orchestrator invokes `/formalize-problem` again with: - The original inputs (paper, goal, literature) - The accumulated evidence of what doesn't work (from `notes/results.md`) - An explicit directive: "The current formalization may be flawed. Re-examine the trust assumptions, security property definitions, and hypothesis framing in light of these failed attempts." @@ -583,13 +582,13 @@ The reformulated `problem-statement.md` replaces the old one (old version archiv #### Claim Provenance -Every claim in `report.md` should reference the investigation cycle(s) that support it. 
The `synthesize` skill already reads investigations selectively; provenance links are a natural extension: +Every claim in `report.md` should reference the investigation cycle(s) that support it. The `/synthesize` skill already reads investigations selectively; provenance links are a natural extension: - Each claim references the investigation directory and notes/results.md cycle that produced it - Evidence level (from H5) is included so readers know the strength of support - Claims supported by multiple cycles reference all of them -This doesn't require a rigid format — the `synthesize` skill should produce natural prose with inline references, not a mechanical template. +This doesn't require a rigid format — the `/synthesize` skill should produce natural prose with inline references, not a mechanical template. #### Formal Verification (Stretch) @@ -607,8 +606,8 @@ The investigate skill already has access to Bash for running external tools. Int #### Tasks - [ ] Add proactive reformulation trigger to orchestrator: count consecutive discards/inconclusives, escalate at N=5 -- [ ] Update `formalize-problem` skill: accept "reformulation mode" with prior failure evidence alongside existing initial mode -- [ ] Update `synthesize` skill: generate investigation references for each claim in the report (uses LaTeX `\label`/`\ref` cross-references and BibTeX `\cite{}` from H1) +- [ ] Update `/formalize-problem` skill: accept "reformulation mode" with prior failure evidence alongside existing initial mode +- [ ] Update `/synthesize` skill: generate investigation references for each claim in the report (uses LaTeX `\label`/`\ref` cross-references and BibTeX `\cite{}` from H1) - [ ] Add `references/computation.md` — decision tree for when mechanical checking is worth attempting (Z3 for bounded search, Tamarin for protocol models) - [ ] (Stretch) Build Z3 integration for bounded counterexample search — lowest barrier, highest payoff - [ ] (Stretch) Build Tamarin integration for 
protocol security claims @@ -626,9 +625,9 @@ Reaper's methodology draws from four sources: **[Richard Hamming, "You and Your Research"](https://d37ugbyn3rpeym.cloudfront.net/stripe-press/TAODSAE_zine_press.pdf)** (Stripe Press edition of *The Art of Doing Science and Engineering*) — The importance filter and problem-inversion technique. Hamming's central question — "Why are you not working on the important problems in your field?" — shapes the importance filter in Principle 2: prioritize hypotheses by consequence, not convenience. His technique of inverting blockages into insights (if you can't prove it, try to disprove it) is built into the "when stuck" protocol in Principle 5. Hamming also taught that effort compounds — steady, disciplined investigation cycles accumulate understanding the way compound interest accumulates capital. -**[Zhiyun Qian, "How to Look for Ideas in Computer Science Research"](https://medium.com/digital-diplomacy/how-to-look-for-ideas-in-computer-science-research-7a3fa6f4696f)** — Systematic idea generation patterns. Qian's six patterns (fill-in-the-blank, expansion, build-a-hammer, start-small-then-generalize, reproduce-prior-work, external-sources) are incorporated into Principles 2 and 5, and into the `formalize-problem` skill's approach to generating ideas. The "fill in the blank" pattern — mapping dimensions of existing research and finding unexplored combinations — is particularly powerful for theoretical research where the design space of threat models, protocol families, and security properties can be systematically enumerated. +**[Zhiyun Qian, "How to Look for Ideas in Computer Science Research"](https://medium.com/digital-diplomacy/how-to-look-for-ideas-in-computer-science-research-7a3fa6f4696f)** — Systematic idea generation patterns. 
Qian's six patterns (fill-in-the-blank, expansion, build-a-hammer, start-small-then-generalize, reproduce-prior-work, external-sources) are incorporated into Principles 2 and 5, and into the `/formalize-problem` skill's approach to generating ideas. The "fill in the blank" pattern — mapping dimensions of existing research and finding unexplored combinations — is particularly powerful for theoretical research where the design space of threat models, protocol families, and security properties can be systematically enumerated. -**[Simon Peyton Jones, "How to Write a Great Research Paper"](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/How-to-write-a-great-research-paper.pdf)** — Writing as research methodology. Peyton Jones's core insight — that writing is a primary mechanism for doing research, not just for reporting it — is woven into Principle 6 (Clarity and Simplicity). His structural advice (one clear "ping," explicit refutable contributions, examples before generality, narrative flow over chronological recounting) shapes the `synthesize` skill's report format. Most importantly, the idea that you should write *before* you fully understand forces Reaper to crystallize its understanding in `current-understanding.md` at every cycle, not just at the end. +**[Simon Peyton Jones, "How to Write a Great Research Paper"](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/How-to-write-a-great-research-paper.pdf)** — Writing as research methodology. Peyton Jones's core insight — that writing is a primary mechanism for doing research, not just for reporting it — is woven into Principle 6 (Clarity and Simplicity). His structural advice (one clear "ping," explicit refutable contributions, examples before generality, narrative flow over chronological recounting) shapes the `/synthesize` skill's report format. 
Most importantly, the idea that you should write *before* you fully understand forces Reaper to crystallize its understanding in `current-understanding.md` at every cycle, not just at the end. ### Why a Skill, Not Python @@ -647,7 +646,7 @@ The AI *is* the research agent. No wrapper needed. ### Why a Pipeline of Skills -A monolithic "do research" skill is hard to test, hard to improve, and hard to reuse partially. By decomposing into `analyze-paper`, `review-literature`, `formalize-problem`, `investigate`, `cross-verify`, and `synthesize`, each skill: +A monolithic "do research" skill is hard to test, hard to improve, and hard to reuse partially. By decomposing into `/analyze-paper`, `/review-literature`, `/formalize-problem`, `/investigate`, `cross-verify`, and `/synthesize`, each skill: - Can be tested and iterated independently - Can be used standalone (e.g., just analyze a paper without running the full pipeline) - Has a clear input/output contract via workspace files diff --git a/evals/evals.json b/evals/evals.json index b32599d..49b2b97 100644 --- a/evals/evals.json +++ b/evals/evals.json @@ -82,50 +82,50 @@ ], "skill_unit_tests": [ { - "skill": "reaper:analyze-paper", + "skill": "analyze-paper", "test": "Given any crypto/distributed systems paper, paper-summary.md must have all sections filled (Metadata, Problem Statement, System Model with all 5 sub-dimensions, Construction Overview, Key Results with verbatim theorems, Proof Technique, Complexity Claims, Strengths, Weaknesses, Key Definitions, Red Flags). Sections are proportional to what the paper warrants — omit what doesn't apply." 
}, { - "skill": "reaper:review-literature", + "skill": "review-literature", "test": "Given a research goal (and optionally paper-summary.md), literature.md must have a landscape summary, at least 10 related works table entries with non-empty Relevance column, key prior results, and at least 2 specific gaps" }, { - "skill": "reaper:formalize-problem", + "skill": "formalize-problem", "test": "Given paper-summary.md and literature.md, problem-statement.md must have complete trust assumptions (all 5 dimensions), at least 2 ideas each with statement/success/failure conditions, and priorities justified by consequence" }, { - "skill": "reaper:formalize-problem (paper-less)", + "skill": "formalize-problem (paper-less)", "test": "Given clarified-goal.md and literature.md WITHOUT paper-summary.md, problem-statement.md must have complete trust assumptions (all 5 dimensions), at least 2 ideas each with statement/success/failure conditions, and priorities justified by consequence. literature.md serves as the primary technical context." }, { - "skill": "reaper:investigate", + "skill": "investigate", "test": "After N cycles: notes/results.md has one row per hypothesis (updated inline on revisit, not duplicated), investigation directories 001 through NNN exist with analysis.md files, current-understanding.md changed only on keep cycles, every cycle outcome is statable in one sentence" }, { - "skill": "reaper:synthesize", + "skill": "synthesize", "test": "report.md has a one-sentence central finding, bulleted refutable contributions, each finding starts with a concrete example, no chronological narration, open questions are specific" }, { - "skill": "reaper:search-arxiv", + "skill": "search-arxiv", "test": "search command returns valid JSON array with fields: arxiv_id, title, authors, year, abstract, pdf_url. citations command returns references and citations arrays. 
Tested via: python -m pytest tests/test_search_arxiv.py" }, { - "skill": "reaper:search-iacr", + "skill": "search-iacr", "test": "search command returns valid JSON array with fields: eprint_id, title, pdf_url, url. url command returns correct ePrint URLs. Tested via: python -m pytest tests/test_search_iacr.py" }, { - "skill": "reaper:review-literature (H2)", + "skill": "review-literature (H2)", "test": "literature.md includes papers found via arXiv and ePrint search scripts (not just WebSearch). Citation Graph section present with forward/backward citations. Graceful degradation: pipeline completes with WebSearch fallback when scripts unavailable." }, { - "skill": "reaper:investigate (H2)", + "skill": "investigate (H2)", "test": "When stuck at step 4, mid-cycle literature search uses search scripts. New papers integrated into literature.md inline (added to appropriate existing sections). literature-search action-type logged in notes/results.md." } ], "evaluation_protocol": { - "setup": "1. Place a test paper PDF in dev/test-papers/ with the filename matching the test case\n2. Open Claude Code in the repo root", - "run_full_pipeline": "/reaper \"<goal>\" dev/test-papers/<name>.pdf", - "run_individual_skill": "/reaper:<skill> <args>", + "setup": "1. Place a test paper PDF in dev/test-papers/ with the filename matching the test case\n2. Open any skills-supporting AI agent (Claude Code, Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, OpenCode, Warp, Goose, Replit, etc.) in the repo root with the reaper skills installed (`npx skills add SebastianElvis/reaper`)", + "run_full_pipeline": "Invoke the `reaper` skill with arguments: `\"<goal>\" dev/test-papers/<name>.pdf`. On slash-command hosts: `/reaper \"<goal>\" dev/test-papers/<name>.pdf`. On auto-discovery hosts: ask the agent to run reaper with the goal and paper path.", + "run_individual_skill": "Invoke the named sub-skill (e.g. `analyze-paper`, `review-literature`) by name with its arguments. On slash-command hosts: `/reaper:<skill> <args>`.
On auto-discovery hosts: ask the agent to run the skill by name.", "evaluate": "After the pipeline completes, check each expected_output and quality_criteria. Score pass/fail per criterion.", "iterate": "For each failure, identify whether the issue is in the skill prompt, the orchestrator composition, or the model's capability. Fix skill prompts first." } diff --git a/skills/analyze-paper/SKILL.md b/skills/analyze-paper/SKILL.md index 3ea8d00..a12616a 100644 --- a/skills/analyze-paper/SKILL.md +++ b/skills/analyze-paper/SKILL.md @@ -11,25 +11,40 @@ Extract structured information from an academic paper, producing a comprehensive ## Usage +Invoke this skill by name with the paper path (and optional flags). On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:analyze-paper <paper-path>`). + ``` # Analyze the primary paper under study -/reaper:analyze-paper path/to/paper.pdf +analyze-paper path/to/paper.pdf # Analyze a literature paper with research goal as context -/reaper:analyze-paper reaper-workspace/papers/2024-1234.pdf --goal "post-quantum threshold signatures" --output reaper-workspace/papers/2024-1234-notes.md +analyze-paper reaper-workspace/papers/2024-1234.pdf --goal "post-quantum threshold signatures" --output reaper-workspace/papers/2024-1234-notes.md ``` **Argument parsing:** The first non-flag argument is the paper path. Optional flags: - `--output <path>`: Write output to the given path instead of the default `reaper-workspace/notes/paper-summary.md`. - `--goal "<goal>"`: The research goal as additional context. When provided, the output includes a **Relevance** section assessing how the paper relates to this goal, and reading depth is calibrated by relevance (see Step 1). +## Path Resolution Protocol + +This skill references files in sibling skills.
**`{{REAPER_SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of the `/reaper` skill before reading, or the read will fail.** Common install locations: + +- `~/.claude/skills/reaper/` (Claude Code) +- `~/.cursor/skills/reaper/` (Cursor) +- `~/.agents/skills/reaper/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/reaper/` (Continue) +- `~/.windsurf/skills/reaper/` (Windsurf) +- `<repo>/skills/reaper/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`). Single-skill installs will fail to resolve sibling references. + ## Instructions ### 1. Read the Paper -Read the paper at the provided path using the Read tool (works for PDFs and text files). +Read the paper at the provided path using your host's file-read primitive (works for PDFs and text files on hosts that support PDF reading; otherwise extract text first). -Follow the three-pass strategy from `references/paper-analysis.md`: +Follow the three-pass strategy from `{{REAPER_SKILL_DIR}}/references/paper-analysis.md`: - **Pass 1 (skeleton)**: Abstract, introduction, conclusion, theorem statements. Identify the main claims. - **Pass 2 (construction)**: Protocol details, proof sketches, figures. Understand the key technical idea. @@ -61,7 +76,7 @@ Write the extracted information to `reaper-workspace/notes/paper-summary.md` (or What problem does this paper solve? Why does it matter? ## System Model -[Extract all model dimensions relevant to the paper's domain. Consult references/model.md for the domain-appropriate dimensions to extract. Every applicable dimension must have a concrete answer.] +[Extract all model dimensions relevant to the paper's domain. Consult `{{REAPER_SKILL_DIR}}/references/model.md` for the domain-appropriate dimensions to extract.
Every applicable dimension must have a concrete answer.] ## Construction Overview High-level protocol description. Key technical idea. Building blocks used. @@ -90,7 +105,7 @@ Overall proof approach. Key lemmas. Reduction chain. Where the corruption thresh Non-standard notation. Formal definitions referenced by the proofs. ## Red Flags -Any concerns identified during reading (see references/paper-analysis.md for common red flags). +Any concerns identified during reading (see `{{REAPER_SKILL_DIR}}/references/paper-analysis.md` for common red flags). ## Relevance [Present ONLY when --goal is provided. Tag one or more: *problem definition*, *formalization*, *solution technique*, *negative result*, *literature/context*, *writing model*. One sentence per tag explaining how this paper relates to the research goal.] diff --git a/skills/brainstorm/SKILL.md b/skills/brainstorm/SKILL.md index dd8e930..99c66d3 100644 --- a/skills/brainstorm/SKILL.md +++ b/skills/brainstorm/SKILL.md @@ -11,19 +11,21 @@ The recurring ideation step. Reads the current research state and proposes new o ## Usage +Invoke this skill by name; pass an optional context hint as a quoted string. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:brainstorm "<context hint>"`). + ``` # Generate ideas based on current state -/reaper:brainstorm +brainstorm # With a hint about what direction to explore -/reaper:brainstorm "explore liveness under partial synchrony" +brainstorm "explore liveness under partial synchrony" ``` ## Relationship to Other Skills -- **`formalize-problem`** handles *initial* formalization: pinning down trust assumptions, the core question, definitional hygiene, and the first set of hypotheses. It runs once. -- **`brainstorm`** handles *recurring* ideation: generating additional ideas as the investigation progresses. It runs many times. -- **`investigate`** handles *execution*: deep-diving into specific ideas. It does not generate new hypotheses.
+- **`/formalize-problem`** handles *initial* formalization: pinning down trust assumptions, the core question, definitional hygiene, and the first set of hypotheses. It runs once. +- **`/brainstorm`** handles *recurring* ideation: generating additional ideas as the investigation progresses. It runs many times. +- **`/investigate`** handles *execution*: deep-diving into specific ideas. It does not generate new hypotheses. ## Inputs @@ -40,6 +42,19 @@ The recurring ideation step. Reads the current research state and proposes new o - `reaper-workspace/papers/` — downloaded PDFs and per-paper notes - The optional context hint from the argument +## Path Resolution Protocol + +This skill references files in sibling skills. **`{{REAPER_SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of the `/reaper` skill before reading, or the read will fail.** Common install locations: + +- `~/.claude/skills/reaper/` (Claude Code) +- `~/.cursor/skills/reaper/` (Cursor) +- `~/.agents/skills/reaper/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/reaper/` (Continue) +- `~/.windsurf/skills/reaper/` (Windsurf) +- `<repo>/skills/reaper/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`). Single-skill installs will fail to resolve sibling references. + ## Process ### 1. Assess Current State @@ -56,7 +71,7 @@ Apply these techniques systematically. Not all will produce ideas every time — #### Gap-Finding (Qian: Fill in the Blank) -Map the dimensions of existing work and find unexplored combinations. Consult `references/model.md` for the domain-appropriate gap-finding matrix dimensions. Which cells in this matrix are empty? Those are candidate hypotheses. +Map the dimensions of existing work and find unexplored combinations.
Consult `{{REAPER_SKILL_DIR}}/references/model.md` for the domain-appropriate gap-finding matrix dimensions. Which cells in this matrix are empty? Those are candidate hypotheses. #### Problem Inversion (Hamming) @@ -85,7 +100,7 @@ If a hypothesis has trended toward refutation over 3+ cycles (counterexample att ### 3. Screen Against Known Impossibilities -For each candidate idea, check whether it contradicts a known impossibility or lower bound. Consult `references/impossibility-results.md` for the domain-relevant impossibility results and lower bounds. +For each candidate idea, check whether it contradicts a known impossibility or lower bound. Consult `{{REAPER_SKILL_DIR}}/references/impossibility-results.md` for the domain-relevant impossibility results and lower bounds. If a candidate contradicts a known impossibility: 1. Flag it explicitly with a warning @@ -94,7 +109,7 @@ If a candidate contradicts a known impossibility: ### 4. Prioritize (Hamming: Importance Filter) -Not all ideas are equally worth investigating. Rank by consequence — ask: "If we resolved this idea, who would care and why?" Consult `references/model.md` for domain-specific examples of how to rank by importance. +Not all ideas are equally worth investigating. Rank by consequence — ask: "If we resolved this idea, who would care and why?" Consult `{{REAPER_SKILL_DIR}}/references/model.md` for domain-specific examples of how to rank by importance. ### 5. Write Output diff --git a/skills/clarify-goal/SKILL.md b/skills/clarify-goal/SKILL.md index 2959158..d51e4bb 100644 --- a/skills/clarify-goal/SKILL.md +++ b/skills/clarify-goal/SKILL.md @@ -11,12 +11,14 @@ Ask the user targeted questions to sharpen a vague research goal into something ## Usage +Invoke this skill by name with the research goal (and optional paper path). On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:clarify-goal "<goal>"`).
+ ``` # Without a paper — goal-driven research -/reaper:clarify-goal "explore the feasibility of post-quantum threshold signatures" +clarify-goal "explore the feasibility of post-quantum threshold signatures" # With a paper -/reaper:clarify-goal "is this protocol secure?" path/to/paper.pdf +clarify-goal "is this protocol secure?" path/to/paper.pdf ``` **Argument parsing:** The research goal (quoted string) is required. If a path to an existing file is also provided, treat it as the paper. @@ -88,12 +90,12 @@ Tell the user the refined goal and confirm before the pipeline proceeds. - Questions are specific to the paper and goal, not generic - Each question offers concrete options or defaults -- The refined goal in `clarified-goal.md` is precise enough for `formalize-problem` to act on without further clarification +- The refined goal in `clarified-goal.md` is precise enough for `/formalize-problem` to act on without further clarification - If the paper PDF is unreadable, report the error — do not proceed with fabricated context ## Important Notes - This skill is interactive — it requires user input. Do not skip the question-asking step. -- Keep the quick scan fast. Don't spend time on deep analysis — that's what `analyze-paper` is for. +- Keep the quick scan fast. Don't spend time on deep analysis — that's what `/analyze-paper` is for. - If the research goal is already precise and unambiguous (e.g., "Does Theorem 4.2 hold when the adversary controls >1/3 of stake?"), you may skip questions and go straight to writing `clarified-goal.md`. Tell the user the goal is clear and proceed. - When no paper is provided, focus questions on scoping the research domain, defining key terms, and establishing what kind of output the user expects (survey, feasibility analysis, novel construction, etc.). 
diff --git a/skills/critique/SKILL.md b/skills/critique/SKILL.md index 4e37c2b..ba29c70 100644 --- a/skills/critique/SKILL.md +++ b/skills/critique/SKILL.md @@ -11,17 +11,32 @@ Provide external perspective on investigation results. Three modes: human feedba ## Usage +Invoke this skill by name; pass either feedback as a quoted string, `--codex`, or `--self`. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:critique "<feedback>"`). + ``` # Human feedback — iterate on existing results -/reaper:critique "dig deeper into the liveness proof gap under partial synchrony" +critique "dig deeper into the liveness proof gap under partial synchrony" -# Codex consultation — get AI devil's advocate or inspiration -/reaper:critique --codex +# External-model consultation — get AI devil's advocate or inspiration (requires MCP host) +critique --codex # Self-review — agent reviews its own findings for gaps -/reaper:critique --self +critique --self ``` +## Path Resolution Protocol + +This skill references files in sibling skills. **`{{REAPER_SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of the `/reaper` skill before reading, or the read will fail.** Common install locations: + +- `~/.claude/skills/reaper/` (Claude Code) +- `~/.cursor/skills/reaper/` (Cursor) +- `~/.agents/skills/reaper/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/reaper/` (Continue) +- `~/.windsurf/skills/reaper/` (Windsurf) +- `<repo>/skills/reaper/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`). Single-skill installs will fail to resolve sibling references. + ## Inputs **Always read** before starting: @@ -69,21 +84,21 @@ Categories: ### 3. Execute -**scope**: Return control to the orchestrator — this requires re-running `formalize-problem` before investigation.
Do not run cycles yourself; instead, write the feedback file and indicate that re-formalization is needed. +**scope**: Return control to the orchestrator — this requires re-running `/formalize-problem` before investigation. Do not run cycles yourself; instead, write the feedback file and indicate that re-formalization is needed. -**deepen**: Run `/reaper:brainstorm "context from the user's feedback"` to generate targeted hypotheses based on the feedback, then run `/reaper:investigate 5`. +**deepen**: Invoke the `/brainstorm` skill with `"context from the user's feedback"` to generate targeted hypotheses based on the feedback, then invoke the `/investigate` skill with argument `5`. -**explore**: If the area may need additional literature, use the search scripts first and update `literature.md`. Then run `/reaper:brainstorm "context from the user's feedback"` to generate hypotheses for the new area, followed by `/reaper:investigate 5`. +**explore**: If the area may need additional literature, use the search scripts first and update `literature.md`. Then invoke the `/brainstorm` skill with `"context from the user's feedback"` to generate hypotheses for the new area, followed by the `/investigate` skill with argument `5`. -**rewrite**: No investigation cycles needed. Return control to the orchestrator to re-run `synthesize` only. +**rewrite**: No investigation cycles needed. Return control to the orchestrator to re-run `/synthesize` only. -For **deepen** and **explore**, after completing the cycles, the orchestrator should re-run `synthesize` to produce an updated report. +For **deepen** and **explore**, after completing the cycles, the orchestrator should re-run `/synthesize` to produce an updated report. ## Mode: Codex Consultation (`--codex`) Consult an external AI (OpenAI Codex via MCP) for an independent second opinion on the current investigation state. 
This establishes an automated feedback loop where Codex plays **devil's advocate** or provides **alternative inspiration**. -See `references/codex-consultation.md` for MCP setup, fallback behavior, session continuity, and context compression rules. The critique skill's Codex mode is the most thorough consultation — other skills have lighter-weight checkpoint consultations. +See `{{REAPER_SKILL_DIR}}/references/codex-consultation.md` (placeholder defined in the Path Resolution Protocol section above) for MCP setup, fallback behavior, session continuity, and context compression rules. The critique skill's Codex mode is the most thorough consultation — other skills have lighter-weight checkpoint consultations. ### Determining the Role @@ -130,7 +145,7 @@ The agent reviews its own investigation results for gaps, inconsistencies, or mi - **Missing angles**: Obvious questions raised by the current findings that haven't been investigated. - **Inconsistencies**: Claims in `current-understanding.md` that conflict with each other or with `notes/results.md`. 3. For each actionable finding, add a hypothesis to `ideas.md` with the next available H-number, tagged `[Self-N]` in the Source field (where N is one more than the count of existing self-review rounds). -4. If actionable hypotheses were added, run `/reaper:investigate 3` to address them. The self-review findings are recorded as part of the cycle logs in `reaper-workspace/logs/` — no separate self-review file is needed. +4. If actionable hypotheses were added, invoke the `/investigate` skill with argument `3` to address them. The self-review findings are recorded as part of the cycle logs in `reaper-workspace/logs/` — no separate self-review file is needed. 
## Quality Criteria diff --git a/skills/formalize-problem/SKILL.md b/skills/formalize-problem/SKILL.md index 1656741..7b53a61 100644 --- a/skills/formalize-problem/SKILL.md +++ b/skills/formalize-problem/SKILL.md @@ -11,10 +11,25 @@ Transform a research goal into precise, testable hypotheses with explicit succes ## Usage +Invoke this skill by name with the research goal as a quoted string. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:formalize-problem "<research goal>"`). + ``` -/reaper:formalize-problem "determine if the security proof in Section 4 holds under asynchrony" +formalize-problem "determine if the security proof in Section 4 holds under asynchrony" ``` +## Path Resolution Protocol + +This skill references files in sibling skills. **`{{REAPER_SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of the `/reaper` skill before reading, or the read will fail.** Common install locations: + +- `~/.claude/skills/reaper/` (Claude Code) +- `~/.cursor/skills/reaper/` (Cursor) +- `~/.agents/skills/reaper/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/reaper/` (Continue) +- `~/.windsurf/skills/reaper/` (Windsurf) +- `<repo-root>/skills/reaper/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`). Single-skill installs will fail to resolve sibling references. + ## Instructions ### 1. Read Inputs @@ -33,21 +48,21 @@ What exactly needs to be resolved? Be specific. "Is this protocol secure?" is to ### 3. Pin Down Model Assumptions (must be unambiguous before hypotheses) -Pin down every dimension of the system/trust model. If any dimension is left vague, the investigation will produce ambiguous results. Consult `references/model.md` for the domain-appropriate checklist of dimensions that must be specified.
Every field must have a concrete answer, not "TBD". +Pin down every dimension of the system/trust model. If any dimension is left vague, the investigation will produce ambiguous results. Consult `{{REAPER_SKILL_DIR}}/references/model.md` for the domain-appropriate checklist of dimensions that must be specified. Every field must have a concrete answer, not "TBD". Every hypothesis must reference these model assumptions by specifying which parameters it holds under. A hypothesis that states a claim without pinning every relevant dimension is rejected. ### 4. Apply Importance Filter (Hamming) -Not all questions are equally worth investigating. Prioritize by consequence — ask: "If we resolved this question, who would care and why?" Consult `references/model.md` for domain-specific examples of how to rank by importance. +Not all questions are equally worth investigating. Prioritize by consequence — ask: "If we resolved this question, who would care and why?" Consult `{{REAPER_SKILL_DIR}}/references/model.md` for domain-specific examples of how to rank by importance. ### 5. Apply Gap-Finding (Qian) -Map the dimensions of existing work and find unexplored combinations. Consult `references/model.md` for the domain-appropriate gap-finding matrix dimensions. Which cells in this matrix are empty? Those are candidate hypotheses. +Map the dimensions of existing work and find unexplored combinations. Consult `{{REAPER_SKILL_DIR}}/references/model.md` for the domain-appropriate gap-finding matrix dimensions. Which cells in this matrix are empty? Those are candidate hypotheses. ### 6. Screen Against Known Impossibilities -For each hypothesis, check whether it contradicts a known impossibility or lower bound. Consult `references/impossibility-results.md` for the domain-relevant impossibility results and lower bounds. +For each hypothesis, check whether it contradicts a known impossibility or lower bound. 
Consult `{{REAPER_SKILL_DIR}}/references/impossibility-results.md` for the domain-relevant impossibility results and lower bounds. If a hypothesis asks to prove something that an impossibility result rules out: 1. **Flag it explicitly** in the hypothesis with a warning @@ -56,7 +71,7 @@ If a hypothesis asks to prove something that an impossibility result rules out: ### 7. Enforce Definitional Hygiene -Each core property listed in the output must be stated precisely. Consult `references/definitional-standards.md` for the domain-appropriate acceptable definition forms. Informal or ambiguous terms without formal definitions are NOT acceptable — different papers define the same terms differently, so pin it down. If the paper under analysis uses informal definitions, formalize them explicitly and note that you are doing so. +Each core property listed in the output must be stated precisely. Consult `{{REAPER_SKILL_DIR}}/references/definitional-standards.md` for the domain-appropriate acceptable definition forms. Informal or ambiguous terms without formal definitions are NOT acceptable — different papers define the same terms differently, so pin it down. If the paper under analysis uses informal definitions, formalize them explicitly and note that you are doing so. ### 8. Write Output @@ -69,7 +84,7 @@ Write to `reaper-workspace/notes/problem-statement.md`: [Restate the goal precisely] ## Model Assumptions -[One field per dimension from references/model.md. Every dimension must have a concrete answer.] +[One field per dimension from `{{REAPER_SKILL_DIR}}/references/model.md`. Every dimension must have a concrete answer.] ## Security Properties Under Investigation [What must hold? List each property with its formal definition or reference.] 
@@ -103,7 +118,7 @@ Write ideas to a separate file `reaper-workspace/notes/ideas.md`: ### Relationship to Brainstorm -`formalize-problem` handles *initial* formalization: pinning down trust assumptions, the core question, definitional hygiene, and the first set of hypotheses. It runs once at the start of the pipeline. The `brainstorm` skill handles *recurring* ideation — generating additional ideas as the investigation progresses, applying Hamming/Qian heuristics to current state, and proposing follow-up hypotheses based on patterns in results. `brainstorm` updates the `ideas.md` that this skill creates (adding new ideas and editing existing ones inline). +`/formalize-problem` handles *initial* formalization: pinning down trust assumptions, the core question, definitional hygiene, and the first set of hypotheses. It runs once at the start of the pipeline. The `/brainstorm` skill handles *recurring* ideation — generating additional ideas as the investigation progresses, applying Hamming/Qian heuristics to current state, and proposing follow-up hypotheses based on patterns in results. `/brainstorm` updates the `ideas.md` that this skill creates (adding new ideas and editing existing ones inline). ### Quality Criteria diff --git a/skills/investigate/SKILL.md b/skills/investigate/SKILL.md index 490f8c2..f031808 100644 --- a/skills/investigate/SKILL.md +++ b/skills/investigate/SKILL.md @@ -11,12 +11,14 @@ The core research loop. Run N investigation cycles, each testing a hypothesis an ## Usage +Invoke this skill by name with an optional cycle count. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:investigate 10`). + ``` # Run 10 investigation cycles -/reaper:investigate 10 +investigate 10 # Default: 5 cycles -/reaper:investigate +investigate ``` Default: 5 cycles if no argument given. @@ -33,7 +35,20 @@ Default: 5 cycles if no argument given. 
- `reaper-workspace/notes/paper-summary.md` — the source paper (if provided) - `reaper-workspace/notes/literature.md` — known prior work (organized by same-goal and same-approach) - `reaper-workspace/papers/` — downloaded PDFs and per-paper notes (`-notes.md`) from the literature review -- `references/methodology.md` — proof/analysis patterns +- `{{REAPER_SKILL_DIR}}/references/methodology.md` — proof/analysis patterns + +## Path Resolution Protocol + +This skill references files in sibling skills. **`{{REAPER_SKILL_DIR}}`** above and below is a template placeholder — **you MUST substitute it with the absolute install path of the `/reaper` skill before reading, or the read will fail.** Common install locations: + +- `~/.claude/skills/reaper/` (Claude Code) +- `~/.cursor/skills/reaper/` (Cursor) +- `~/.agents/skills/reaper/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/reaper/` (Continue) +- `~/.windsurf/skills/reaper/` (Windsurf) +- `<repo-root>/skills/reaper/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`). Single-skill installs will fail to resolve sibling references. ## The Batch Loop @@ -56,11 +71,11 @@ while cycles_remaining > 0 and not converged: 4. Select the largest set of **independent** hypotheses that can run concurrently. Cap the batch size at `cycles_remaining`. 5. Allocate cycle numbers: pre-assign a contiguous range per subagent (e.g., batch 1 gets 001-003, batch 2 gets 004-006). Each subagent gets one or more consecutive numbers from its range. 6. If all remaining hypotheses form a dependency chain, the batch size is 1 — this is the **sequential fallback** (see below). -7.
If all hypotheses are resolved, **stop and return control** to the orchestrator. The orchestrator will call `/brainstorm` to generate new ideas if needed. ### Dispatch Batch -Spawn one subagent per hypothesis in the batch using the Agent tool. **Launch all subagents in a single message** for true parallelism. Each subagent receives: +Spawn one subagent per hypothesis in the batch using your host's parallel-subagent mechanism (e.g. Claude Code's `Agent` tool, or the equivalent task/spawn primitive on other hosts; if the host has no parallel primitive, fall back to sequential execution). **Launch all subagents in a single message** for true parallelism. Each subagent receives: - Its assigned cycle number(s) - The hypothesis to investigate @@ -84,7 +99,7 @@ For a new hypothesis, create `reaper-workspace/investigations/NNN-/` where Do the actual research. This is the core intellectual work. Depending on the hypothesis: -- **Proof verification**: Check each step of an existing proof. Look for gaps, implicit assumptions, boundary cases. Consult `references/methodology.md` for patterns. +- **Proof verification**: Check each step of an existing proof. Look for gaps, implicit assumptions, boundary cases. Consult `{{REAPER_SKILL_DIR}}/references/methodology.md` for patterns. - **Proof attempt**: Try to prove the claim. Start with the simplest approach. If it works, check if a simpler proof exists. All proofs must follow the formal proof structure below. - **Counterexample search**: Try to disprove the claim. Start small (2 parties, 1 round). Construct a specific adversary strategy and execution trace. - **Security analysis**: Enumerate threat models, check if reductions go through, verify simulator constructions. Security proofs must follow the formal proof structure below. 
@@ -126,7 +141,7 @@ For theoretical research (proving properties, security guarantees, or performanc **Proof technique:** ``` -Consult `references/methodology.md` for the proof techniques catalog, reduction quality gate, and performance sanity checks. State the chosen proof technique in the proof header. If it doesn't work after a genuine attempt, log which technique failed and why, then try an alternative in the next cycle. +Consult `{{REAPER_SKILL_DIR}}/references/methodology.md` for the proof techniques catalog, reduction quality gate, and performance sanity checks. State the chosen proof technique in the proof header. If it doesn't work after a genuine attempt, log which technique failed and why, then try an alternative in the next cycle. Requirements for formal proofs: @@ -169,7 +184,7 @@ When a proof issue is found, do not just say "found a gap." Classify it: ##### Composition Awareness -When a core property is confirmed, note the composition implications. Consult `references/definitional-standards.md` for domain-specific composition considerations (e.g., rewinding, shared setup, standalone vs compositional security). +When a core property is confirmed, note the composition implications. Consult `{{REAPER_SKILL_DIR}}/references/definitional-standards.md` for domain-specific composition considerations (e.g., rewinding, shared setup, standalone vs compositional security). Log composition limitations in the investigation's `analysis.md` even if the original hypothesis didn't ask about composition — this is critical context for the final report. @@ -240,7 +255,7 @@ After all subagents in a batch complete: Discard patterns: - [1 sentence summarizing why cycles were discarded, if any pattern emerges] ``` - These summaries serve two purposes: (a) `brainstorm` can read summaries instead of full investigation directories, and (b) `synthesize` can read summaries instead of loading all `analysis.md` files. 
+ These summaries serve two purposes: (a) `/brainstorm` can read summaries instead of full investigation directories, and (b) `/synthesize` can read summaries instead of loading all `analysis.md` files. 5. **Re-read updated state** and plan the next batch. ### Sequential Fallback @@ -257,11 +272,11 @@ Run all N cycles. The only valid early stop is **genuine convergence**: all hypo ## When Stuck -If a cycle is going nowhere, follow the escalation protocol in `references/methodology.md` (section "When Stuck: 8-Step Escalation"). The steps progress from re-reading existing materials, through searching for new literature (see `references/search-tools.md` for search commands, which use `search_arxiv.py` and `search_iacr.py`), to trying radically different approaches. +If a cycle is going nowhere, follow the escalation protocol in `{{REAPER_SKILL_DIR}}/references/methodology.md` (section "When Stuck: 8-Step Escalation"). The steps progress from re-reading existing materials, through searching for new literature (see `{{REAPER_SKILL_DIR}}/references/search-tools.md` for search commands, which use `search_arxiv.py` and `search_iacr.py`), to trying radically different approaches. When searching for new literature mid-investigation, download relevant papers to `reaper-workspace/papers/`, write per-paper notes (`-notes.md`), and **integrate findings into `reaper-workspace/notes/literature.md` inline** — add new entries to the appropriate existing sections rather than appending a separate "Mid-Investigation Additions" section. Log the search as a cycle with action-type `literature-search` in `notes/results.md`. -If all escalation tactics are exhausted and the hypothesis remains stuck, log the cycle as `inconclusive` and continue to the next hypothesis. The orchestrator will call `brainstorm` after the batch to generate new ideas based on the pattern of failures. 
+If all escalation tactics are exhausted and the hypothesis remains stuck, log the cycle as `inconclusive` and continue to the next hypothesis. The orchestrator will call `/brainstorm` after the batch to generate new ideas based on the pattern of failures. ## Negative Result Protocol @@ -270,7 +285,7 @@ If after 3 cycles a hypothesis trends toward refutation (counterexample attempts 1. **Pivot explicitly** to proving the negative. Construct the strongest possible negative result — a concrete attack, execution trace, or reduction to a known impossibility. A clean impossibility result is more valuable than a vague "we couldn't prove it." 2. **Identify the minimal fix.** What is the weakest additional assumption that would make the positive result hold? (e.g., "Safety holds if we additionally assume synchronous message delivery in the view-change sub-protocol.") 3. **A clean negative result is a KEEP, not a failure.** It resolves the hypothesis (by refutation) and advances understanding. Log it with outcome `refuted` and status `keep`. -4. **Signal for new ideas.** Note in the cycle's result description that the hypothesis was refuted and what was learned. The orchestrator will call `brainstorm` to generate follow-up hypotheses (e.g., proving the impossibility formally, exploring the minimal fix). +4. **Signal for new ideas.** Note in the cycle's result description that the hypothesis was refuted and what was learned. The orchestrator will call `/brainstorm` to generate follow-up hypotheses (e.g., proving the impossibility formally, exploring the minimal fix). Do not spend 10 cycles attempting minor variations of the same failed proof strategy. Three failures at the same point is a signal to change direction. @@ -286,7 +301,7 @@ If an investigation cycle reveals that the problem formulation itself is wrong ( 3. Write `REFORMULATE` as the first line of the cycle description in `notes/results.md`, followed by a one-sentence summary. 4. 
**Stop the current investigation batch** and return control to the orchestrator. Do not continue investigating hypotheses based on a known-incorrect formulation. -The orchestrator will re-run `/reaper:formalize-problem` incorporating the re-formalization signal before resuming investigation. +The orchestrator will re-invoke the `/formalize-problem` skill incorporating the re-formalization signal before resuming investigation. If any cycle in a batch returns outcome `reformulate`, stop dispatching further batches and return control to the orchestrator with a re-formulation signal. diff --git a/skills/reaper/SKILL.md b/skills/reaper/SKILL.md index 1a978bd..86d953f 100644 --- a/skills/reaper/SKILL.md +++ b/skills/reaper/SKILL.md @@ -11,21 +11,37 @@ You are the Reaper orchestrator. You take a research goal — optionally with a ## Usage +Invoke the `/reaper` skill with a research goal (quoted string), optionally followed by a paper path and/or `--codex`. Examples below use a generic name-based form; the slash-command form `/reaper "..."` works on hosts with slash-command routing (e.g. Claude Code). + ``` # Without a paper — pure goal-driven research -/reaper "explore the feasibility of post-quantum threshold signatures" +reaper "explore the feasibility of post-quantum threshold signatures" # With a paper -/reaper "determine if the security proof in Section 4 holds under asynchrony" path/to/paper.pdf +reaper "determine if the security proof in Section 4 holds under asynchrony" path/to/paper.pdf -# With Codex consultation for automated AI feedback between investigation cycles -/reaper "determine if the security proof in Section 4 holds under asynchrony" path/to/paper.pdf --codex +# With external-model consultation for automated AI feedback between investigation cycles +reaper "determine if the security proof in Section 4 holds under asynchrony" path/to/paper.pdf --codex ``` -**Argument parsing:** The research goal (quoted string) is required. 
If a path to an existing file (PDF or text) is also provided, treat it as the paper. Pass `--codex` to enable Codex consultation across the entire pipeline — every skill gains an optional step where it consults Codex for a second opinion at a natural checkpoint. See `references/codex-consultation.md` for the full protocol. Requires [codex-mcp-server](https://github.com/tuannvm/codex-mcp-server). +**Argument parsing:** The research goal (quoted string) is required. If a path to an existing file (PDF or text) is also provided, treat it as the paper. Pass `--codex` to enable external-model consultation across the entire pipeline — every skill gains an optional step where it consults an external model for a second opinion at a natural checkpoint. See `references/codex-consultation.md` for the full protocol. Requires a host with MCP support and a registered Codex MCP server (e.g. [codex-mcp-server](https://github.com/tuannvm/codex-mcp-server)); on hosts without MCP, the flag is silently a no-op. When `--codex` is set, propagate this context to all skill invocations. Each skill will consult Codex only at its designated checkpoint (defined in `references/codex-consultation.md`), using compressed context and a shared session ID for continuity across the pipeline. +## Peer Skills + +This orchestrator chains 8 sub-skills that must be installed alongside it: `/clarify-goal`, `/analyze-paper`, `/review-literature`, `/formalize-problem`, `/brainstorm`, `/investigate`, `/critique`, `/synthesize`. Two more (`/search-arxiv`, `/search-iacr`) are called transitively by `/review-literature` and `/investigate`. The `/<skill-name>` form is the canonical display convention used in these docs; substitute the host-native invocation form (slash command, auto-discovery, manual `SKILL.md` pointer) when actually running them. + +If any of these are missing from your agent's skills folder, ask the user to reinstall the full Reaper package (`npx skills add SebastianElvis/reaper`).
+ +## Invocation Convention + +References below to running a sub-skill use the host-agnostic phrase "invoke the `/<name>` skill" — invoke each sub-skill by its `name` using your host's native skill-loading mechanism. The loaded `SKILL.md` provides the full instructions for that stage. Concrete invocation form varies by host: + +- **Slash-command hosts** (e.g. Claude Code): `/reaper:<name>` (e.g. `/reaper:clarify-goal`) +- **Auto-discovery hosts** (e.g. Cursor, Codex CLI, Cline, Continue, Gemini CLI, Copilot, Windsurf, OpenCode): the agent loads peer `SKILL.md` files from the skills folder and routes by `name` + `description` match. +- **Manual invocation hosts**: explicitly point the agent at the installed skill's `SKILL.md` (typical paths: `~/.claude/skills/<name>/SKILL.md`, `~/.cursor/skills/<name>/SKILL.md`, `~/.agents/skills/<name>/SKILL.md`, `~/.continue/skills/<name>/SKILL.md`, `~/.windsurf/skills/<name>/SKILL.md`, or `<repo-root>/skills/<name>/SKILL.md` during development — substitute `<name>` with the sub-skill directory name like `clarify-goal`). + ## Design Principles 1. **Separation of concerns**: Each skill has one job. Skills communicate through workspace files, not shared memory. @@ -62,7 +78,7 @@ reaper-workspace/ **File naming conventions:** Investigation dirs: `NNN-/` (zero-padded). Cycle logs: `cycle-NNN-.md`. Feedback: `round-N.md`, `codex-consultation-N.md`. Paper notes: `-notes.md`. -**Lazy-load protocol:** Early-pipeline skills (`formalize-problem`, `analyze-paper`) read source files eagerly. Loop skills (`brainstorm`, `investigate`, `critique`, `synthesize`) use `current-understanding.md` as primary source and lazy-load `paper-summary.md` and `literature.md` only when stuck or when a specific hypothesis requires it. +**Lazy-load protocol:** Early-pipeline skills (`/formalize-problem`, `/analyze-paper`) read source files eagerly.
Loop skills (`/brainstorm`, `/investigate`, `/critique`, `/synthesize`) use `current-understanding.md` as primary source and lazy-load `paper-summary.md` and `literature.md` only when stuck or when a specific hypothesis requires it. Initialize `reaper-workspace/notes/results.md` with: ```markdown @@ -82,10 +98,10 @@ Initialize `reaper-workspace/notes/current-understanding.md` with: ### Step 2: Clarify the Research Goal **If a paper was provided:** -Run **`/reaper:clarify-goal "" `** — does a quick scan of the paper, asks the user 3-5 targeted clarifying questions about scope, assumptions, and success criteria, then writes `notes/clarified-goal.md`. +Invoke the **`/clarify-goal`** skill with arguments `"<research goal>" <paper path>` — it does a quick scan of the paper, asks the user 3-5 targeted clarifying questions about scope, assumptions, and success criteria, then writes `notes/clarified-goal.md`. **If no paper was provided:** -Run **`/reaper:clarify-goal ""`** — asks the user 3-5 targeted clarifying questions based on the goal alone (no paper scan), then writes `notes/clarified-goal.md`. +Invoke the **`/clarify-goal`** skill with argument `"<research goal>"` — it asks the user 3-5 targeted clarifying questions based on the goal alone (no paper scan), then writes `notes/clarified-goal.md`. If the goal is already precise and unambiguous, this step writes the file without asking questions. @@ -93,22 +109,22 @@ All downstream skills should read `clarified-goal.md` for the refined goal and c ### Step 3: Establish Baseline (parallel) -**If a paper was provided**, run these two skills as **parallel subagents** using the Agent tool — they write to non-overlapping files: +**If a paper was provided**, run these two skills as **parallel subagents** using your host's parallel-spawn primitive (e.g. Claude Code's `Agent` tool, or the equivalent on your host; if the host has no parallel primitive, run them sequentially) — they write to non-overlapping files: -1.
**`/reaper:analyze-paper `** — produces `notes/paper-summary.md` -2. **`/reaper:review-literature ""`** — produces `notes/literature.md` +1. Invoke **`/analyze-paper`** with `<paper path>` — produces `notes/paper-summary.md` +2. Invoke **`/review-literature`** with `"<refined goal>"` — produces `notes/literature.md` Use the refined goal from `clarified-goal.md` for the literature review argument. Both must complete before proceeding. **If no paper was provided**, run only: -1. **`/reaper:review-literature ""`** — produces `notes/literature.md` +1. Invoke **`/review-literature`** with `"<refined goal>"` — produces `notes/literature.md` -Skip `analyze-paper` entirely — there is no paper to analyze. The pipeline proceeds without `notes/paper-summary.md`. +Skip `/analyze-paper` entirely — there is no paper to analyze. The pipeline proceeds without `notes/paper-summary.md`. ### Step 4: Formalize the Problem -Run **`/reaper:formalize-problem ""`** — reads the baseline outputs (`clarified-goal.md`, `literature.md`, and `paper-summary.md` if available) and produces `notes/problem-statement.md` (trust assumptions, security properties, performance goals) and `notes/ideas.md` (prioritized ideas). +Invoke **`/formalize-problem`** with `"<refined goal>"` — it reads the baseline outputs (`clarified-goal.md`, `literature.md`, and `paper-summary.md` if available) and produces `notes/problem-statement.md` (trust assumptions, security properties, performance goals) and `notes/ideas.md` (prioritized ideas). ### Step 5: Brainstorm → Investigate → Critique Loop @@ -126,12 +142,12 @@ The loop adapts to problem complexity.
Assess complexity after formalization, be **Default (no `--codex`):** ``` -/reaper:brainstorm → /reaper:investigate N → /reaper:critique --self → /reaper:brainstorm → /reaper:investigate N +brainstorm → investigate N → critique --self → brainstorm → investigate N ``` **With `--codex`:** ``` -/reaper:brainstorm → /reaper:investigate N → /reaper:critique --codex → /reaper:brainstorm → /reaper:investigate N → /reaper:critique --codex +brainstorm → investigate N → critique --codex → brainstorm → investigate N → critique --codex ``` Where N is determined by the complexity assessment above. @@ -145,23 +161,23 @@ Where N is determined by the complexity assessment above. #### Re-Formalization Handling -If `investigate` returns with a re-formalization signal (any cycle logged with outcome `reformulate`): +If `/investigate` returns with a re-formalization signal (any cycle logged with outcome `reformulate`): 1. **Preserve state**: Do NOT clear `notes/results.md`, `notes/current-understanding.md`, or existing investigation directories. 2. **Archive old formulation**: Rename `notes/problem-statement.md` to `notes/problem-statement-v.md` before re-running. -3. **Re-run `formalize-problem`**: It reads the reformulation signal from the triggering cycle's `analysis.md`. +3. **Re-run `/formalize-problem`**: It reads the reformulation signal from the triggering cycle's `analysis.md`. 4. **Reset ideas selectively**: In `ideas.md`, mark hypotheses that depended on the old formulation as `[superseded by v]`. Add new hypotheses below. 5. **Restart loop**: Fresh cycle budget for the new formulation. #### Loop Mechanics -The `brainstorm` step reads the current state and updates `ideas.md` (adding new ideas, editing existing ones inline) (tagged `[Brainstorm-N]`). The `critique` step may also add hypotheses (tagged `[Codex-N]` or `[Self-N]`). The next `investigate` batch picks up all unresolved ideas automatically. 
+The `/brainstorm` step reads the current state and updates `ideas.md` (adding new ideas tagged `[Brainstorm-N]` and editing existing ones inline). The `/critique` step may also add hypotheses (tagged `[Codex-N]` or `[Self-N]`). The next `/investigate` batch picks up all unresolved ideas automatically. This loop runs autonomously — do not interrupt or ask if it should continue. ### Step 6: Synthesize -Run **`/reaper:synthesize`** — reads all workspace files and produces `report.md`. +Invoke the **`/synthesize`** skill — it reads all workspace files and produces `report.md`. ### Step 7: Present Results @@ -175,36 +191,36 @@ After synthesis completes: After presenting results, let the user know they can iterate: -> If you'd like to refine, deepen, or challenge any aspect of this research, use `/reaper:critique "your feedback here"`. +> If you'd like to refine, deepen, or challenge any aspect of this research, invoke the `/critique` skill with your feedback as a quoted string. (Slash-command hosts: `/reaper:critique "your feedback here"`.) -Do **not** block waiting for a response — the pipeline is complete. The user can invoke `/reaper:critique` with quoted feedback at any time to start a feedback round. The critique skill classifies the feedback, may run targeted investigation cycles, and then you should re-run `/reaper:synthesize` to produce an updated report. +Do **not** block waiting for a response — the pipeline is complete. The user can invoke `/critique` with quoted feedback at any time to start a feedback round. The critique skill classifies the feedback, may run targeted investigation cycles, and then you should re-invoke `/synthesize` to produce an updated report.
## Skill Dependency Graph ``` -clarify-goal ──► analyze-paper ──┐ - (if paper provided) ├──► formalize-problem ──► brainstorm ◄──► investigate ◄──► critique - review-literature ┘ │ - (calls analyze-paper synthesize ◄───────┘ - for each paper) +/clarify-goal ──► /analyze-paper ──┐ + (if paper provided) ├──► /formalize-problem ──► /brainstorm ◄──► /investigate ◄──► /critique + /review-literature ┘ │ + (calls /analyze-paper /synthesize ◄──────────┘ + for each paper) ``` | Skill | Requires | Produces | |-------|----------|----------| -| clarify-goal | (paper path, optional) | `notes/clarified-goal.md` | -| analyze-paper | (paper path) — **skipped at top level if no paper**; also called by `review-literature` for each downloaded paper | `notes/paper-summary.md` or `papers/-notes.md` (when `--output` is specified) | -| review-literature | (research goal); calls `analyze-paper --goal` per downloaded paper | `notes/literature.md`, `papers/*` | -| formalize-problem | `clarified-goal.md`, `literature.md`, `paper-summary.md` (optional) | `problem-statement.md`, `ideas.md` | -| brainstorm | `problem-statement.md`, `ideas.md`, `current-understanding.md`, `results.md` | Updates `ideas.md` | -| investigate | `problem-statement.md`, `ideas.md`, `current-understanding.md`, `results.md` | Updates `results.md`, `current-understanding.md`, `ideas.md`; creates `investigations/*`, `logs/*` | -| critique | `current-understanding.md`, `results.md`, `problem-statement.md`, `ideas.md` | `feedbacks/*`; may update `ideas.md` | -| synthesize | `current-understanding.md`, `results.md`, `problem-statement.md`, `ideas.md` | `report.md` | -| search-arxiv | (query) | stdout | -| search-iacr | (query) | stdout | +| `/clarify-goal` | (paper path, optional) | `notes/clarified-goal.md` | +| `/analyze-paper` | (paper path) — **skipped at top level if no paper**; also called by `/review-literature` for each downloaded paper | `notes/paper-summary.md` or `papers/-notes.md` (when `--output` is specified) | 
+| `/review-literature` | (research goal); calls `/analyze-paper --goal` per downloaded paper | `notes/literature.md`, `papers/*` | +| `/formalize-problem` | `clarified-goal.md`, `literature.md`, `paper-summary.md` (optional) | `problem-statement.md`, `ideas.md` | +| `/brainstorm` | `problem-statement.md`, `ideas.md`, `current-understanding.md`, `results.md` | Updates `ideas.md` | +| `/investigate` | `problem-statement.md`, `ideas.md`, `current-understanding.md`, `results.md` | Updates `results.md`, `current-understanding.md`, `ideas.md`; creates `investigations/*`, `logs/*` | +| `/critique` | `current-understanding.md`, `results.md`, `problem-statement.md`, `ideas.md` | `feedbacks/*`; may update `ideas.md` | +| `/synthesize` | `current-understanding.md`, `results.md`, `problem-statement.md`, `ideas.md` | `report.md` | +| `/search-arxiv` | (query) | stdout | +| `/search-iacr` | (query) | stdout | ## Important Notes -- Each skill is invoked via the Skill tool (e.g., `skill: "reaper:analyze-paper", args: "paper.pdf"`) +- Sub-skills are invoked using the host agent's native skill mechanism — by `name` plus arguments. The exact API differs per host (e.g. Claude Code's `Skill` tool with `skill: "reaper:analyze-paper", args: "paper.pdf"`; Cursor/Codex/Cline auto-route based on the loaded `SKILL.md`). Refer to the host's skill documentation for the exact form. 
- Skills communicate exclusively through workspace files — no in-memory state passing - If a skill fails, read its output file to diagnose, then retry - The workspace is the source of truth — if context is compressed, re-read workspace files diff --git a/skills/reaper/references/codex-consultation.md b/skills/reaper/references/codex-consultation.md index 0b6de08..978aaf8 100644 --- a/skills/reaper/references/codex-consultation.md +++ b/skills/reaper/references/codex-consultation.md @@ -4,10 +4,13 @@ Shared protocol for consulting an external AI model (OpenAI Codex via MCP) from ## Setup -**Requires**: The `codex-cli` MCP server registered with Claude Code (see [codex-mcp-server](https://github.com/tuannvm/codex-mcp-server)): -``` -claude mcp add codex-cli -- npx -y codex-mcp-server -``` +**Requires**: A host with MCP support (Claude Code, Cursor, Cline, Continue, Windsurf, OpenCode, Goose, etc.) and the `codex-cli` MCP server registered with that host (see [codex-mcp-server](https://github.com/tuannvm/codex-mcp-server)). MCP registration syntax varies by host — consult your host's MCP documentation. Example forms: + +- **Claude Code**: `claude mcp add codex-cli -- npx -y codex-mcp-server` +- **Cursor / Continue / Cline / Windsurf**: edit the host's MCP config (typically `~/.<host>/mcp.json` or the host's settings UI) to add an entry with `command: "npx"`, `args: ["-y", "codex-mcp-server"]` +- **Other MCP hosts**: register a new MCP server pointing at `npx -y codex-mcp-server` + +Hosts without MCP support cannot use Codex consultation — every `--codex` checkpoint silently no-ops on those hosts (see Fallback below). ## Fallback diff --git a/skills/reaper/references/search-tools.md b/skills/reaper/references/search-tools.md index c094ca9..8b5b9e5 100644 --- a/skills/reaper/references/search-tools.md +++ b/skills/reaper/references/search-tools.md @@ -2,11 +2,24 @@ Reaper uses Python scripts to search academic paper archives.
This document catalogs the available tools, when to use each, and common workflow patterns. +## Path Resolution Protocol + +The scripts referenced below live in sibling skills (`search-arxiv/` and `search-iacr/`). The placeholders **`{{SEARCH_ARXIV_SKILL_DIR}}`** and **`{{SEARCH_IACR_SKILL_DIR}}`** below are template tokens — **you MUST substitute each with the absolute install path of the corresponding sibling skill before invoking, or the exec will fail.** Common install locations (substitute the trailing skill name as needed): + +- `~/.claude/skills/<skill-name>/` (Claude Code) +- `~/.cursor/skills/<skill-name>/` (Cursor) +- `~/.agents/skills/<skill-name>/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/<skill-name>/` (Continue) +- `~/.windsurf/skills/<skill-name>/` (Windsurf) +- `<repo>/skills/<skill-name>/` (during repo development) + +**Sibling-skill dependency**: This reference assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`) so that `reaper/`, `search-arxiv/`, and `search-iacr/` are co-located in your agent's skills folder. + ## Tools ### search_arxiv.py -**Location**: `skills/search-arxiv/search_arxiv.py` +**Location**: `{{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py` **Dependencies**: `pip install arxiv requests` | Command | Purpose | Key Parameters | @@ -27,7 +40,7 @@ Reaper uses Python scripts to search academic paper archives. This document cata ### search_iacr.py -**Location**: `skills/search-iacr/search_iacr.py` +**Location**: `{{SEARCH_IACR_SKILL_DIR}}/search_iacr.py` **Dependencies**: `pip install requests beautifulsoup4` | Command | Purpose | Key Parameters | @@ -87,12 +100,12 @@ Need very recent papers? 1.
Run a focused query on the specific question that arose: ```bash - python skills/search-iacr/search_iacr.py search "exact technical question" --max-results 5 - python skills/search-arxiv/search_arxiv.py search "exact technical question" --max-results 5 + python {{SEARCH_IACR_SKILL_DIR}}/search_iacr.py search "exact technical question" --max-results 5 + python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py search "exact technical question" --max-results 5 ``` 2. If a highly relevant paper is found, download and read it: ```bash - python skills/search-arxiv/search_arxiv.py download --output-dir reaper-workspace/papers/ + python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py download --output-dir reaper-workspace/papers/ ``` 3. Integrate findings into `literature.md` inline (add to appropriate existing sections) @@ -101,7 +114,7 @@ Need very recent papers? 1. Start with a known paper's arXiv ID 2. Get references (backward) and citations (forward): ```bash - python skills/search-arxiv/search_arxiv.py citations 2305.12345 --max-results 20 + python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py citations 2305.12345 --max-results 20 ``` 3. For each highly relevant citation, recursively chase (1-2 hops max) diff --git a/skills/review-literature/SKILL.md b/skills/review-literature/SKILL.md index ce81871..fcdcac9 100644 --- a/skills/review-literature/SKILL.md +++ b/skills/review-literature/SKILL.md @@ -11,10 +11,25 @@ Search for related academic work, download and deeply read the most important pa ## Usage +Invoke this skill by name with the research topic as a quoted string. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:review-literature ""`). + ``` -/reaper:review-literature "post-quantum threshold signatures" +review-literature "post-quantum threshold signatures" ``` +## Path Resolution Protocol + +This skill references files and scripts in sibling skills. 
The placeholders **`{{REAPER_SKILL_DIR}}`**, **`{{SEARCH_ARXIV_SKILL_DIR}}`**, and **`{{SEARCH_IACR_SKILL_DIR}}`** below are template tokens — **you MUST substitute each with the absolute install path of the corresponding sibling skill before reading or invoking, or the read/exec will fail.** Common install locations (substitute the trailing skill name as needed — `reaper`, `search-arxiv`, `search-iacr`): + +- `~/.claude/skills/<skill-name>/` (Claude Code) +- `~/.cursor/skills/<skill-name>/` (Cursor) +- `~/.agents/skills/<skill-name>/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/<skill-name>/` (Continue) +- `~/.windsurf/skills/<skill-name>/` (Windsurf) +- `<repo>/skills/<skill-name>/` (during repo development) + +**Sibling-skill dependency**: This skill assumes the full `/reaper` package was installed together (`npx skills add SebastianElvis/reaper`) so that `reaper/`, `search-arxiv/`, and `search-iacr/` are co-located in your agent's skills folder. Single-skill installs will fail to resolve sibling references. + ## Instructions ### 1. Gather Context @@ -29,18 +44,18 @@ Combine with the research goal to formulate search queries. ### 2. Search — Structured Sources (Primary) -Use the search scripts via Bash to query arXiv and IACR ePrint. Generate multiple diverse queries per source. +Use the search scripts via Bash to query arXiv and IACR ePrint. Generate multiple diverse queries per source. (The placeholders `{{SEARCH_ARXIV_SKILL_DIR}}` and `{{SEARCH_IACR_SKILL_DIR}}` below are defined in the Path Resolution Protocol section above — substitute the absolute install paths before invoking. Alternatively, invoke the `/search-arxiv` and `/search-iacr` skills by name through your host's skill mechanism.)
**arXiv** (broad CS/math — use for distributed systems, complexity, general crypto): ```bash -python skills/search-arxiv/search_arxiv.py search "" --max-results 10 --categories cs.CR,cs.DC +python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py search "" --max-results 10 --categories cs.CR,cs.DC ``` **IACR ePrint** (cryptography-specific — use for all crypto topics): ```bash -python skills/search-iacr/search_iacr.py search "" --max-results 10 +python {{SEARCH_IACR_SKILL_DIR}}/search_iacr.py search "" --max-results 10 ``` **Query types** (generate at least one query per type, per source): @@ -52,7 +67,7 @@ python skills/search-iacr/search_iacr.py search "" --max-results 10 - **Attacks/impossibilities**: known negative results (e.g., "FLP impossibility", "DLS lower bound") - **Surveys**: SoK papers, systematization of knowledge (e.g., "SoK blockchain consensus") -**Spawn parallel subagents** (using the Agent tool) for concurrent search: +**Spawn parallel subagents** (using your host's parallel-spawn primitive — e.g. Claude Code's `Agent` tool — or run sequentially if unavailable) for concurrent search: - **Subagent 1**: arXiv searches (multiple queries with different categories) - **Subagent 2**: IACR ePrint searches (multiple queries) - **Subagent 3**: WebSearch fallback (see step 3) @@ -75,7 +90,7 @@ This runs as a parallel subagent alongside the structured searches. For the **seed paper** (from `paper-summary.md`) and the **top 3 most relevant results**, trace citations: ```bash -python skills/search-arxiv/search_arxiv.py citations --max-results 20 +python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py citations --max-results 20 ``` This returns both: @@ -91,7 +106,7 @@ Deduplicate results across all search sources. 
For fast-moving areas, check for very recent publications: ```bash -python skills/search-iacr/search_iacr.py recent --max-results 10 +python {{SEARCH_IACR_SKILL_DIR}}/search_iacr.py recent --max-results 10 ``` Scan titles/abstracts for relevance to the research goal. Include any relevant recent papers that the main search may have missed. @@ -105,7 +120,7 @@ For each result found, assess relevance to the research goal. Classify each pape #### Venue and Author Weighting -Weight results heavily toward top venues. A peer-reviewed top-conference paper is far more trustworthy than an unreviewed preprint. Consult `references/venue-tiers.md` for the domain-appropriate venue tier table and author weighting criteria. +Weight results heavily toward top venues. A peer-reviewed top-conference paper is far more trustworthy than an unreviewed preprint. Consult `{{REAPER_SKILL_DIR}}/references/venue-tiers.md` (placeholder defined in the Path Resolution Protocol section above) for the domain-appropriate venue tier table and author weighting criteria. When two papers make competing claims, prefer the one from the higher-tier venue by authors with more domain-specific expertise. When a preprint contradicts a published top-venue result, flag it but do not treat the preprint as authoritative without independent verification. @@ -124,27 +139,29 @@ For all **high-relevance** papers (and medium-relevance papers that seem particu ```bash # arXiv papers -python skills/search-arxiv/search_arxiv.py download --output-dir reaper-workspace/papers +python {{SEARCH_ARXIV_SKILL_DIR}}/search_arxiv.py download --output-dir reaper-workspace/papers # IACR ePrint papers -python skills/search-iacr/search_iacr.py download --output-dir reaper-workspace/papers +python {{SEARCH_IACR_SKILL_DIR}}/search_iacr.py download --output-dir reaper-workspace/papers ``` -After downloading, **delegate paper reading to `analyze-paper`**. 
For each downloaded paper, invoke: +After downloading, **delegate paper reading to `/analyze-paper`**. For each downloaded paper, invoke the `/analyze-paper` skill with: ``` -/reaper:analyze-paper reaper-workspace/papers/.pdf --goal "" --output reaper-workspace/papers/-notes.md +reaper-workspace/papers/.pdf --goal "" --output reaper-workspace/papers/-notes.md ``` -**Spawn parallel subagents** (using the Agent tool) to analyze multiple papers concurrently — each paper is independent. +(On Claude Code: `/reaper:analyze-paper `. On other agents: invoke by skill name with the same arguments.) + +**Spawn parallel subagents** (using your host's parallel-spawn primitive — e.g. Claude Code's `Agent` tool — or run sequentially if unavailable) to analyze multiple papers concurrently — each paper is independent. -The `analyze-paper` skill handles the multi-pass reading (calibrating depth by relevance to the goal) and writes per-paper notes to `reaper-workspace/papers/-notes.md`. Passing `--goal` ensures the output includes a relevance assessment. +The `/analyze-paper` skill handles the multi-pass reading (calibrating depth by relevance to the goal) and writes per-paper notes to `reaper-workspace/papers/-notes.md`. Passing `--goal` ensures the output includes a relevance assessment. These notes serve as a durable reference for the investigate step. They are evolving files — update inline if revisited during mid-investigation search. ### 8. Cross-Reference Verification -Using the per-paper notes produced by `analyze-paper` in the previous step, check whether the paper under analysis correctly cites and uses each high-relevance work: +Using the per-paper notes produced by `/analyze-paper` in the previous step, check whether the paper under analysis correctly cites and uses each high-relevance work: - **Accuracy**: Does the paper under analysis state the prior result accurately? Compare the claim in the paper against the actual theorem statement in the cited work. 
- **Model compatibility**: Are the assumptions of the cited result compatible with the current paper's model? A result proven under synchrony cannot be invoked in an asynchronous protocol without justification. @@ -215,7 +232,7 @@ If PDF download fails for a paper, note it in the table (leave Local Path as "un - Results include papers from both arXiv and IACR ePrint (when the topic is crypto-related) - Papers are split into same-goal and same-approach categories — both categories should have entries - High-relevance papers are downloaded and analyzed via `analyze-paper --goal`, with per-paper notes in `reaper-workspace/papers/` -- Per-paper notes (produced by `analyze-paper`) contain structured analysis with key results, strengths/weaknesses, and relevance assessment — not just abstract-level summaries +- Per-paper notes (produced by `/analyze-paper`) contain structured analysis with key results, strengths/weaknesses, and relevance assessment — not just abstract-level summaries - Citation graph section shows forward and backward citations for key papers - Landscape summary gives a reader unfamiliar with the area a useful mental map - Each related work has a specific relevance statement (not just "related to our topic") diff --git a/skills/search-arxiv/SKILL.md b/skills/search-arxiv/SKILL.md index df051e6..fbcc42f 100644 --- a/skills/search-arxiv/SKILL.md +++ b/skills/search-arxiv/SKILL.md @@ -11,18 +11,33 @@ Search arXiv for academic papers using the `arxiv` Python package, with citation ## Usage +Invoke this skill by name with the query as a quoted string. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:search-arxiv ""`). + ``` -/search-arxiv "post-quantum threshold signatures" --max-results 15 --categories cs.CR +search-arxiv "post-quantum threshold signatures" --max-results 15 --categories cs.CR ``` +## Path Resolution Protocol + +This skill wraps `search_arxiv.py`, which lives in the **same directory as this `SKILL.md`**. 
**`{{SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of this skill before invoking, or the exec will fail.** Common install locations: + +- `~/.claude/skills/search-arxiv/` (Claude Code) +- `~/.cursor/skills/search-arxiv/` (Cursor) +- `~/.agents/skills/search-arxiv/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/search-arxiv/` (Continue) +- `~/.windsurf/skills/search-arxiv/` (Windsurf) +- `/skills/search-arxiv/` (during repo development) + +This skill has no sibling-skill dependencies — it ships its own Python script. + ## Commands -This skill wraps `skills/search-arxiv/search_arxiv.py`. Run commands via Bash. +Run commands via Bash. ### Search ```bash -python skills/search-arxiv/search_arxiv.py search "BFT consensus communication complexity" --max-results 10 --categories cs.CR,cs.DC +python {{SKILL_DIR}}/search_arxiv.py search "BFT consensus communication complexity" --max-results 10 --categories cs.CR,cs.DC ``` Returns JSON array of papers: `arxiv_id`, `title`, `authors`, `year`, `abstract`, `categories`, `pdf_url`, `published`. @@ -30,7 +45,7 @@ Returns JSON array of papers: `arxiv_id`, `title`, `authors`, `year`, `abstract` ### Download ```bash -python skills/search-arxiv/search_arxiv.py download 2305.12345 --output-dir reaper-workspace/papers/ +python {{SKILL_DIR}}/search_arxiv.py download 2305.12345 --output-dir reaper-workspace/papers/ ``` Downloads the paper PDF. Returns JSON with `path` and `title`. @@ -38,7 +53,7 @@ Downloads the paper PDF. Returns JSON with `path` and `title`. ### Citations ```bash -python skills/search-arxiv/search_arxiv.py citations 2305.12345 --max-results 20 +python {{SKILL_DIR}}/search_arxiv.py citations 2305.12345 --max-results 20 ``` Returns JSON with `references` (backward citations — what this paper builds on) and `citations` (forward citations — who cites this paper). 
Each entry has `title`, `authors`, `year`, `arxiv_id`, `url`. @@ -46,7 +61,7 @@ Returns JSON with `references` (backward citations — what this paper builds on ## Role - **Standalone**: Invoked directly by the user to search for papers. -- **Building block**: Called by `review-literature` and `investigate` via the underlying Python script. +- **Building block**: Called by `/review-literature` and `/investigate` via the underlying Python script. ## Instructions diff --git a/skills/search-iacr/SKILL.md b/skills/search-iacr/SKILL.md index 51f64cc..ad3fddd 100644 --- a/skills/search-iacr/SKILL.md +++ b/skills/search-iacr/SKILL.md @@ -11,18 +11,33 @@ Search the IACR Cryptology ePrint Archive for cryptography and security papers. ## Usage +Invoke this skill by name with the query as a quoted string. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:search-iacr ""`). + ``` -/search-iacr "threshold signatures" --max-results 15 +search-iacr "threshold signatures" --max-results 15 ``` +## Path Resolution Protocol + +This skill wraps `search_iacr.py`, which lives in the **same directory as this `SKILL.md`**. **`{{SKILL_DIR}}`** below is a template placeholder — **you MUST substitute it with the absolute install path of this skill before invoking, or the exec will fail.** Common install locations: + +- `~/.claude/skills/search-iacr/` (Claude Code) +- `~/.cursor/skills/search-iacr/` (Cursor) +- `~/.agents/skills/search-iacr/` (Codex CLI, Cline, Gemini CLI, Copilot, OpenCode, Warp, Goose, Replit — universal target) +- `~/.continue/skills/search-iacr/` (Continue) +- `~/.windsurf/skills/search-iacr/` (Windsurf) +- `/skills/search-iacr/` (during repo development) + +This skill has no sibling-skill dependencies — it ships its own Python script. + ## Commands -This skill wraps `skills/search-iacr/search_iacr.py`. Run commands via Bash. +Run commands via Bash. 
### Search ```bash -python skills/search-iacr/search_iacr.py search "post-quantum threshold signatures" --max-results 10 +python {{SKILL_DIR}}/search_iacr.py search "post-quantum threshold signatures" --max-results 10 ``` Returns JSON array of papers: `eprint_id`, `title`, `authors`, `year`, `abstract` (for top 5), `pdf_url`, `url`. Top 5 results are enriched with metadata from the paper page. @@ -30,7 +45,7 @@ Returns JSON array of papers: `eprint_id`, `title`, `authors`, `year`, `abstract ### Recent Papers ```bash -python skills/search-iacr/search_iacr.py recent --max-results 10 +python {{SKILL_DIR}}/search_iacr.py recent --max-results 10 ``` Returns the most recently published ePrint papers. @@ -38,7 +53,7 @@ Returns the most recently published ePrint papers. ### Download ```bash -python skills/search-iacr/search_iacr.py download 2024/1234 --output-dir reaper-workspace/papers/ +python {{SKILL_DIR}}/search_iacr.py download 2024/1234 --output-dir reaper-workspace/papers/ ``` Downloads the paper PDF. Returns JSON with `path` and `eprint_id`. @@ -46,7 +61,7 @@ Downloads the paper PDF. Returns JSON with `path` and `eprint_id`. ### Get URL ```bash -python skills/search-iacr/search_iacr.py url 2024/1234 +python {{SKILL_DIR}}/search_iacr.py url 2024/1234 ``` Returns JSON with `url` and `pdf_url` for the paper. @@ -54,7 +69,7 @@ Returns JSON with `url` and `pdf_url` for the paper. ## Role - **Standalone**: Invoked directly by the user to search for papers. -- **Building block**: Called by `review-literature` and `investigate` via the underlying Python script. +- **Building block**: Called by `/review-literature` and `/investigate` via the underlying Python script. ## Instructions diff --git a/skills/synthesize/SKILL.md b/skills/synthesize/SKILL.md index 6f8b2c2..0c78596 100644 --- a/skills/synthesize/SKILL.md +++ b/skills/synthesize/SKILL.md @@ -11,8 +11,10 @@ Generate a formal research paper from all investigation results. 
The output shou ## Usage +Invoke this skill by name; no arguments required. On slash-command hosts, prefix with `/reaper:` (e.g. `/reaper:synthesize`). + ``` -/reaper:synthesize +synthesize ``` ## Instructions diff --git a/tests/test_skills_structure.py b/tests/test_skills_structure.py index db1ac07..3d2da30 100644 --- a/tests/test_skills_structure.py +++ b/tests/test_skills_structure.py @@ -53,19 +53,54 @@ def test_all_scripts_exist(): def test_skill_frontmatter_format(): - """Every SKILL.md must have name, description, user-invocable in frontmatter.""" + """Every SKILL.md must satisfy the vercel-labs/skills parser: + - name matches [a-z][a-z0-9-]* and equals the directory name + - description is non-empty + Also asserts the Claude-Code-specific user-invocable key is present. + """ + name_pattern = re.compile(r"^[a-z][a-z0-9-]*$") for name, path in EXPECTED_SKILLS.items(): content = Path(path).read_text() assert content.startswith("---"), f"{path}: missing frontmatter" - # Extract frontmatter parts = content.split("---", 2) assert len(parts) >= 3, f"{path}: malformed frontmatter" fm = parts[1] - assert "name:" in fm, f"{path}: missing 'name' in frontmatter" - assert "description:" in fm, f"{path}: missing 'description' in frontmatter" + + name_match = re.search(r"^name:\s*(\S+)", fm, re.MULTILINE) + assert name_match, f"{path}: missing 'name' in frontmatter" + parsed_name = name_match.group(1).strip("\"'") + assert name_pattern.match(parsed_name), ( + f"{path}: name '{parsed_name}' must match [a-z][a-z0-9-]* " + f"(npx skills requirement)" + ) + assert parsed_name == name, ( + f"{path}: frontmatter name '{parsed_name}' must equal directory " + f"name '{name}' (npx skills resolves skills by directory)" + ) + + desc_match = re.search(r"^description:\s*(.+)", fm, re.MULTILINE) + assert desc_match, f"{path}: missing 'description' in frontmatter" + desc = desc_match.group(1).strip().strip("\"'") + assert desc, f"{path}: empty description" + assert "user-invocable:" in 
fm, f"{path}: missing 'user-invocable' in frontmatter" +def test_marketplace_json_lists_all_skills(): + """marketplace.json's skills array must list every directory under skills/.""" + import json + marketplace = json.loads(Path(".claude-plugin/marketplace.json").read_text()) + listed = set() + for plugin in marketplace.get("plugins", []): + for skill_path in plugin.get("skills", []): + listed.add(Path(skill_path).name) + expected = set(EXPECTED_SKILLS.keys()) + missing = expected - listed + extra = listed - expected + assert not missing, f"marketplace.json missing skills: {missing}" + assert not extra, f"marketplace.json lists unknown skills: {extra}" + + def test_review_literature_references_search_scripts(): """H2: review-literature should reference the search scripts.""" content = Path("skills/review-literature/SKILL.md").read_text() @@ -132,3 +167,128 @@ def test_readme_lists_search_skills(): content = Path("README.md").read_text() assert "search-arxiv" in content assert "search-iacr" in content + + +# --------------------------------------------------------------------------- +# Path-portability regression tests (host-agnostic skills package) +# --------------------------------------------------------------------------- + +# Skill files that ship inter-skill or intra-skill path references. Each is +# checked for both: (a) no relative `python skills/...` invocations have crept +# back in, and (b) every {{*_SKILL_DIR}} placeholder used in the file is also +# defined somewhere in the file. 
+PATH_AWARE_SKILLS = [ + "skills/reaper/SKILL.md", + "skills/reaper/references/search-tools.md", + "skills/reaper/references/codex-consultation.md", + "skills/clarify-goal/SKILL.md", + "skills/analyze-paper/SKILL.md", + "skills/review-literature/SKILL.md", + "skills/formalize-problem/SKILL.md", + "skills/brainstorm/SKILL.md", + "skills/investigate/SKILL.md", + "skills/critique/SKILL.md", + "skills/synthesize/SKILL.md", + "skills/search-arxiv/SKILL.md", + "skills/search-iacr/SKILL.md", +] + + +def test_no_relative_python_skills_invocations(): + """Regression: skills must never invoke `python skills//...` directly. + + Such relative paths only resolve if the user happens to be running the + agent from the repo root. After `npx skills add`, the scripts live under + a per-host install dir (e.g. ~/.agents/skills/, ~/.cursor/skills/), so + skills must use the {{SEARCH_*_SKILL_DIR}} placeholders that the agent + substitutes at execution time. + """ + pattern = re.compile(r"python\s+skills/") + offenders = [] + for path in PATH_AWARE_SKILLS: + p = Path(path) + if not p.exists(): + continue + content = p.read_text() + for m in pattern.finditer(content): + # Find the offending line + line_no = content[: m.start()].count("\n") + 1 + offenders.append(f"{path}:{line_no}") + assert not offenders, ( + "Found relative `python skills/...` invocations — these break under " + "npx skills install (scripts live in per-host install dirs, not " + "under skills/). Use the {{SEARCH_ARXIV_SKILL_DIR}} / " + "{{SEARCH_IACR_SKILL_DIR}} placeholders instead. Offenders: " + + ", ".join(offenders) + ) + + +def test_skill_dir_placeholders_are_defined(): + """Every {{*SKILL_DIR}} placeholder used in a skill must also be defined + in that same file (so the agent has the substitution rules co-located + with the references that need substituting). 
A 'definition' is any + mention of the placeholder inside a Path-Resolution-style block — we + detect this by requiring the placeholder appears in a paragraph that + also contains the word 'install' or 'substitute' or 'resolve' (the + standardized preamble vocabulary). + + Matches both the multi-skill form ({{REAPER_SKILL_DIR}}, + {{SEARCH_ARXIV_SKILL_DIR}}, {{SEARCH_IACR_SKILL_DIR}}) and the + own-directory form ({{SKILL_DIR}}) used by leaf skills like + search-arxiv and search-iacr. + """ + placeholder_pattern = re.compile(r"\{\{([A-Z_]*SKILL_DIR)\}\}") + # Words that appear in a definition paragraph (per the standardized + # Path Resolution Protocol preamble). + definition_keywords = ("install", "substitute", "resolve", "denote", "absolute") + + failures = [] + for path in PATH_AWARE_SKILLS: + p = Path(path) + if not p.exists(): + continue + content = p.read_text() + used = set(placeholder_pattern.findall(content)) + if not used: + continue + # Split into paragraphs and find which paragraphs define a placeholder + paragraphs = re.split(r"\n\s*\n", content) + defined = set() + for para in paragraphs: + if not any(kw in para.lower() for kw in definition_keywords): + continue + for ph in placeholder_pattern.findall(para): + defined.add(ph) + missing = used - defined + if missing: + failures.append(f"{path}: uses {sorted(missing)} but never defines them") + assert not failures, ( + "Some skills reference {{*_SKILL_DIR}} placeholders without a local " + "definition paragraph. Each skill that uses a placeholder must " + "include a Path Resolution Protocol section that lists install " + "locations and tells the agent to substitute it. Failures:\n " + + "\n ".join(failures) + ) + + +def test_path_resolution_protocol_section_present(): + """Skills that use {{*SKILL_DIR}} placeholders must declare a + 'Path Resolution Protocol' section so the convention is visually + obvious to readers and downstream auditors. Matches both the + multi-skill placeholders (e.g. 
{{REAPER_SKILL_DIR}}) and the leaf + own-directory form ({{SKILL_DIR}}).""" + placeholder_pattern = re.compile(r"\{\{[A-Z_]*SKILL_DIR\}\}") + failures = [] + for path in PATH_AWARE_SKILLS: + p = Path(path) + if not p.exists(): + continue + content = p.read_text() + if not placeholder_pattern.search(content): + continue + if "Path Resolution Protocol" not in content: + failures.append(path) + assert not failures, ( + "Skills that use path placeholders must declare a 'Path Resolution " + "Protocol' section. Missing in: " + ", ".join(failures) + )