From f29d3ad50a5842e3d1144f6aee53e7e21ae9f2a5 Mon Sep 17 00:00:00 2001
From: Emile Riberdy <emileriberdy@gmail.com>
Date: Wed, 3 Jun 2026 21:21:04 -0400
Subject: [PATCH] docs(predict-rlm): split rlm agent skills

---
 .../skills/predict-rlm-contributor/SKILL.md   |   48 +
 .../references/contributor-rules.md           |   26 +
 .../references/gepa-internals.md              |   20 +
 .../references/repo-map.md                    |   49 +
 .agents/skills/rlm-gepa/SKILL.md              |   97 ++
 .../skills/rlm-gepa/references/agent-spec.md  |   61 +
 .../rlm-gepa/references/data-and-scoring.md   |   56 +
 .../rlm-gepa/references/project-layout.md     |  150 +++
 .agents/skills/rlm/SKILL.md                   | 1061 ++---------------
 .../skills/rlm/references/predict-rlm-api.md  |  138 +++
 .../skills/rlm/references/project-layout.md   |  187 +++
 .../rlm/references/sandbox-and-research.md    |   51 +
 README.md                                     |   13 +-
 src/rlm_gepa/README.md                        |   37 +-
 tests/test_rlm_skill_docs.py                  |   13 +-
 15 files changed, 1024 insertions(+), 983 deletions(-)
 create mode 100644 .agents/skills/predict-rlm-contributor/SKILL.md
 create mode 100644 .agents/skills/predict-rlm-contributor/references/contributor-rules.md
 create mode 100644 .agents/skills/predict-rlm-contributor/references/gepa-internals.md
 create mode 100644 .agents/skills/predict-rlm-contributor/references/repo-map.md
 create mode 100644 .agents/skills/rlm-gepa/SKILL.md
 create mode 100644 .agents/skills/rlm-gepa/references/agent-spec.md
 create mode 100644 .agents/skills/rlm-gepa/references/data-and-scoring.md
 create mode 100644 .agents/skills/rlm-gepa/references/project-layout.md
 create mode 100644 .agents/skills/rlm/references/predict-rlm-api.md
 create mode 100644 .agents/skills/rlm/references/project-layout.md
 create mode 100644 .agents/skills/rlm/references/sandbox-and-research.md

diff --git a/.agents/skills/predict-rlm-contributor/SKILL.md b/.agents/skills/predict-rlm-contributor/SKILL.md
new file mode 100644
index 00000000..4721d1b7
--- /dev/null
+++ b/.agents/skills/predict-rlm-contributor/SKILL.md
@@ -0,0 +1,48 @@
+---
+name: predict-rlm-contributor
+description: >
+  Contribute to the predict-rlm repository itself: modify core PredictRLM runtime
+  code, RLM-GEPA internals, built-in skills, examples, docs, tests, packaging, or
+  repo-scoped agent skill guidance. Use when the user asks to change this repo or
+  investigate a bug in predict-rlm/RLM-GEPA. Do not use for building a new
+  downstream RLM package; use rlm for that, or rlm-gepa for downstream
+  optimization wiring.
+---
+
+# Contribute To predict-rlm
+
+Use this skill for repository work. Do not run the new-RLM scoping interview
+unless the user is explicitly asking to build a downstream RLM package.
+
+## Reference Map
+
+Read only what the task needs:
+
+- `references/repo-map.md`: major modules, examples, and verification commands.
+- `references/contributor-rules.md`: repo-specific coding, docs, and PR rules.
+- `references/gepa-internals.md`: RLM-GEPA contribution boundaries and proposer
+  behavior rules.
+
+## Workflow
+
+1. Inspect the requested change and relevant repo paths before editing.
+2. Preserve the distinction between downstream usage and repo contribution.
+3. Keep changes scoped to the module, docs, examples, or skill guidance in the
+   request.
+4. Validate at system boundaries. Prefer host-side tools for native libraries,
+   auth, network APIs, filesystem-heavy work, and anything that cannot run
+   cleanly in Pyodide.
+5. Run targeted tests or checks. Docs-only and skill-only changes need markdown
+   sanity plus `git diff --check`; code changes need focused tests, with broader
+   tests when touching shared runtime, sandbox execution, optimizer behavior, or
+   examples.
+
+## Issue And PR Rules
+
+Creating GitHub PRs/issues or pushing public branches is external publishing.
+Do it only when explicitly requested.
+
+When an investigation identifies a bug likely attributable to the
+`predict-rlm` package, ask the user, as soon as attribution is clear, whether
+they want it reported as a GitHub issue. Do not open the issue without explicit
+approval.
diff --git a/.agents/skills/predict-rlm-contributor/references/contributor-rules.md b/.agents/skills/predict-rlm-contributor/references/contributor-rules.md
new file mode 100644
index 00000000..bb98e7f7
--- /dev/null
+++ b/.agents/skills/predict-rlm-contributor/references/contributor-rules.md
@@ -0,0 +1,26 @@
+# Contributor Rules
+
+- PredictRLM is for callable, repeatable, deep-context workflows, not open-ended
+  interactive chat flows.
+- Keep large inputs as `File` references or metadata. Use focused `predict()`
+  calls and keep LLM-facing Pydantic schemas lean with `Field(description=...)`.
+- Validate at system boundaries. Let library validation raise when required
+  schema fields are missing; do not add silent fallbacks.
+- Keep generic runtime behavior domain-neutral. Domain or benchmark specifics
+  belong in examples, `AgentSpec`, seed/domain skills, runtime-grounding
+  examples, or evaluator feedback.
+- Persist experimental behavior in config, CLI options, or artifacts rather than
+  hidden env-only switches.
+- Use Conventional Commits. The allowed scopes are `rlm-gepa`, `predict-rlm`,
+  and `examples/[example-name]`.
+- PR descriptions must start with **Rationale**, followed by **Summary** and
+  **Test Plan**.
+
+## Skill Guidance Changes
+
+Keep each repo skill focused on one job. Use short trigger descriptions with
+clear boundaries. Put detailed API and workflow material in one-level
+`references/` files linked from `SKILL.md`.
+
+Do not put downstream RLM-building guidance and repository-contributor guidance
+in the same `SKILL.md`.
diff --git a/.agents/skills/predict-rlm-contributor/references/gepa-internals.md b/.agents/skills/predict-rlm-contributor/references/gepa-internals.md
new file mode 100644
index 00000000..f90c2ff3
--- /dev/null
+++ b/.agents/skills/predict-rlm-contributor/references/gepa-internals.md
@@ -0,0 +1,20 @@
+# RLM-GEPA Internals
+
+Use these rules when changing `src/rlm_gepa/` or its tests, examples, or docs.
+
+- Treat `AgentSpec`, evaluator feedback, and seed instructions as the
+  optimization direction. Keep runtime and budget knobs separate.
+- Derive signature and tool context from the constructed RLM with
+  `agent_spec_from_rlm(...)` where possible.
+- Avoid duplicating broad prose or exposing internal IDs unnecessarily.
+- Keep generic proposer behavior domain-neutral. Domain or benchmark specifics
+  belong in `AgentSpec`, seed/domain skills, runtime-grounding examples, or
+  evaluator feedback.
+- Patch-merge/crossover should be evidence-backed behavioral grafting from train
+  disagreement traces, not broad synthesis, prompt concatenation, or source text
+  import.
+- GEPA project wiring should live in downstream `gepa/` packages. Generic
+  optimizer orchestration belongs in `src/rlm_gepa/`.
+
+For verification, run targeted RLM-GEPA tests when touching optimizer schemas,
+runtime adapters, proposer behavior, reporting, or SpreadBench GEPA wiring.
diff --git a/.agents/skills/predict-rlm-contributor/references/repo-map.md b/.agents/skills/predict-rlm-contributor/references/repo-map.md
new file mode 100644
index 00000000..e69ba1c9
--- /dev/null
+++ b/.agents/skills/predict-rlm-contributor/references/repo-map.md
@@ -0,0 +1,49 @@
+# Repo Map
+
+`predict-rlm` extends DSPy's RLM with a built-in `predict()` tool. It has a
+two-level execution model:
+
+1. The outer LLM writes and executes Python in a sandbox.
+2. The sub-LM handles perception and extraction through `predict()` calls.
+
+## Key Modules
+
+- `src/predict_rlm/predict_rlm.py`: `PredictRLM`, `predict()` tool creation,
+  action/extract signatures, LM contexts, and file I/O orchestration.
+- `src/predict_rlm/backends/jspi/backend.py`: default Deno/Pyodide backend.
+- `src/predict_rlm/backends/sbx/backend.py`: Docker Sandboxes backend.
+- `src/predict_rlm/backends/supervisor/`: shared sandbox runner process
+  supervision.
+- `src/predict_rlm/rlm_skills.py`: `Skill` dataclass and `merge_skills()`.
+- `src/predict_rlm/_shared.py`: action/extract signature construction and tool
+  doc formatting.
+- `src/predict_rlm/skills/`: built-in `pdf`, `spreadsheet`, and `docx` skills.
+- `src/rlm_gepa/`: RLM-GEPA optimizer integration.
+- `.agents/skills/`: repo-scoped agent skills for downstream users and
+  contributors.
+
+## Example Structure
+
+Examples generally follow:
+
+```text
+schema.py -> signature.py -> tools.py -> skills.py -> service.py -> run.py
+```
+
+Keep generated or example RLM packages grouped under `agent/`, with optional
+`tools/`, `bench/`, and `gepa/` packages only when needed.
+
+## Common Commands
+
+```bash
+uv sync
+uv sync --extra examples
+make test-unit
+make test-integration
+uv run pytest tests/test_predict_rlm.py::TestPredictTool::test_name -v
+uv run ruff check src/ tests/
+git diff --check
+```
+
+Use targeted checks for narrow changes. Run broader suites when touching shared
+interfaces, sandbox execution, optimizer behavior, or examples.
diff --git a/.agents/skills/rlm-gepa/SKILL.md b/.agents/skills/rlm-gepa/SKILL.md
new file mode 100644
index 00000000..6ba7e8c8
--- /dev/null
+++ b/.agents/skills/rlm-gepa/SKILL.md
@@ -0,0 +1,97 @@
+---
+name: rlm-gepa
+description: >
+  Design, scaffold, and use RLM-GEPA optimization wiring for PredictRLM projects,
+  including AgentSpec scoping, train/validation data, scoring feedback, seed
+  candidates, GEPA project files, and optimize/eval CLI setup. Use when the user
+  asks for GEPA, prompt or skill optimization, candidate selection from RLM
+  traces, AgentSpec, RLMGepaProject, optimization metrics, or train/validation
+  split design. Do not use for modifying the predict-rlm repository internals;
+  use predict-rlm-contributor for that.
+---
+
+# RLM-GEPA Optimization
+
+RLM-GEPA optimizes reusable PredictRLM text components, usually skill
+instructions, from execution traces. A project defines the agent to run, the
+train/validation examples to evaluate, the scoring feedback, and an `AgentSpec`
+that tells the proposer what reusable behavior is in scope.
+
+Use this skill when optimization is in scope. If the user only wants a callable
+RLM with no GEPA wiring, use `rlm`. If the user is changing the `predict-rlm`
+repo implementation, use `predict-rlm-contributor`.
+
+## Reference Map
+
+Read only what the task needs:
+
+- `references/agent-spec.md`: `AgentSpec` scoping, `agent_spec_from_rlm(...)`,
+  component focus, and anti-duplication rules.
+- `references/data-and-scoring.md`: dataset audit, split hygiene, scoring
+  feedback, and overfitting boundaries.
+- `references/project-layout.md`: generated `gepa/` package shape, CLI wiring,
+  and verification commands.
+
+## Workflow
+
+### 1. Confirm The Optimization Target
+
+Identify the PredictRLM workflow that GEPA should improve. If the RLM does not
+exist yet, first scope the RLM enough to define its real DSPy signature, skills,
+tools, inputs, and outputs. Do not ask the user to hand-write
+`target_signature` or `tool_signatures`; derive them from the constructed RLM.
+
+### 2. Scope The GEPA Brief
+
+Interview only for context GEPA cannot infer:
+
+- product or optimization goal;
+- input distribution, scale, and representative examples;
+- output schema and important failure modes;
+- train/validation data source;
+- labels, references, or scoring rule;
+- partial-credit feedback and anti-overfitting boundary;
+- tools, sandbox facts, file conventions, and runtime constraints.
+
+If the user cannot answer everything, proceed with explicit assumptions and mark
+fields that must be revisited before spending model calls.
+
+### 3. Audit Data And Scoring
+
+Read `references/data-and-scoring.md` before writing split or scoring code.
+Inspect examples enough to identify task types, input sizes, labels/reference
+shape, duplicates, leakage risks, missing labels, and failure buckets.
+
+Use train examples to propose and gate edits. Use validation examples for
+candidate selection and regression checks. Create a held-out test set only when
+the user asks for a benchmark/eval harness and the dataset size supports it.
+
+### 4. Design Components
+
+The most common component is `skill_instructions`, but multi-component projects
+can optimize several text blocks. `seed_candidate()` must return exactly the
+keys listed in `components`.
+
+Keep runtime and budget knobs out of the `AgentSpec`. Use `AgentSpec`, evaluator
+feedback, and seed instructions to steer optimization direction. Use CLI/config
+for `max_metric_calls`, minibatch size, concurrency, model choices, and runtime
+limits.
+
+### 5. Scaffold Project Wiring
+
+Create project-local `gepa/` files only when the user asks for optimization.
+The generated package owns task loading, metrics, seed candidate text, defaults,
+and CLI glue. The shared `rlm_gepa` package owns generic orchestration.
+
+Use `references/project-layout.md` for files and imports. Add the GEPA package
+extra and `rlm-gepa` console script in `pyproject.toml` when scaffolding a full
+project.
+
+### 6. Verify Before Running Optimization
+
+Add fast checks that load train/validation data, construct the project, verify
+the seed candidate keys, and build the target RLM without running a costly
+optimization.
+
+Run `uv run rlm-gepa optimize --check` when the project CLI exists. For docs-only
+or scaffolding changes, also run markdown sanity checks and `git diff --check`.
diff --git a/.agents/skills/rlm-gepa/references/agent-spec.md b/.agents/skills/rlm-gepa/references/agent-spec.md
new file mode 100644
index 00000000..83cf6ba6
--- /dev/null
+++ b/.agents/skills/rlm-gepa/references/agent-spec.md
@@ -0,0 +1,61 @@
+# AgentSpec
+
+Prefer `agent_spec_from_rlm(...)` for new projects. The RLM stays the source of
+truth for the DSPy signature, output schema, skills, and tools.
+
+```python
+from rlm_gepa import agent_spec_from_rlm
+
+agent_spec = agent_spec_from_rlm(
+    build_rlm(SEED_SKILL_INSTRUCTIONS),
+    use_cases=[
+        "contract review with clause-level citations",
+        "invoice analysis with total reconciliation",
+    ],
+    runtime_grounding_examples={
+        "skills": ["document-analysis skill instructions are optimized"],
+        "sandbox facts": ["Pyodide filesystem paths and package limits"],
+    },
+    scoring_description=(
+        "Score combines answer correctness and citation support. Feedback names "
+        "missing findings, unsupported citations, and extraction errors."
+    ),
+)
+```
+
+Do not duplicate facts `agent_spec_from_rlm(...)` can derive. Add only context
+GEPA cannot infer:
+
+- transfer use cases beyond the benchmark;
+- runtime-grounding examples the proposer must preserve;
+- scoring signal and evaluator feedback shape;
+- anti-overfitting boundaries;
+- short product or optimization framing, only when it adds useful context.
+
+Omit `agent_type` by default. Set it only when a concise product or optimization
+anchor adds information not already present in the signature, tools, or output
+schema.
+
+## Components
+
+`components` names mutable text fields. `seed_candidate()` must return exactly
+those keys.
+
+```python
+class MyProject(RLMGepaProject):
+    components = ("skill_instructions",)
+
+    def seed_candidate(self) -> dict[str, str]:
+        return {"skill_instructions": SEED_SKILL_INSTRUCTIONS}
+```
+
+Override `component_focus(component_name)` when each component needs a different
+proposer brief. Keep component names stable so runs and candidate artifacts are
+comparable.
+
+## Proposer Boundaries
+
+Patch-merge/crossover should be evidence-backed behavioral grafting from train
+disagreement traces. Avoid broad synthesis, prompt concatenation, source text
+imports, or benchmark-specific hacks. Domain specifics belong in `AgentSpec`,
+seed/domain skills, runtime-grounding examples, or evaluator feedback.
diff --git a/.agents/skills/rlm-gepa/references/data-and-scoring.md b/.agents/skills/rlm-gepa/references/data-and-scoring.md
new file mode 100644
index 00000000..1ca22ee8
--- /dev/null
+++ b/.agents/skills/rlm-gepa/references/data-and-scoring.md
@@ -0,0 +1,56 @@
+# Data And Scoring
+
+Investigate the dataset before writing split or scoring code. Do not treat it
+as an opaque list of rows.
+
+Inspect enough examples to identify:
+
+- task types and input sizes;
+- label or reference-output shape;
+- duplicate or near-duplicate examples;
+- missing labels or ambiguous references;
+- source grouping keys such as document, user, customer, or task family;
+- failure buckets the scorer should expose.
+
+## Split Semantics
+
+Use split names consistently:
+
+- **Train**: examples the optimizer/proposer may use to generate and gate edits.
+- **Validation**: examples used for candidate selection and regression checks.
+- **Test / held-out eval**: optional final reporting set.
+
+Prefer deterministic splits. Put random seed, split ratio/counts, grouping key,
+and sampling limits in `bench/config.py` or `gepa/config.py`. Split by group when
+leakage is plausible. Never let near-identical examples from the same source
+land in both train and validation without calling it out.
+
+If the dataset is tiny, prefer explicit hand-authored train/validation files
+over random splitting.
+
+## Scoring Feedback
+
+Each `evaluate_example()` should return a scalar score plus feedback that helps
+the proposer make a targeted behavioral change.
+
+Good feedback names concrete misses:
+
+- missing fields;
+- unsupported citations;
+- extraction or parsing errors;
+- wrong calculations;
+- formatting or file-output failures;
+- tool-use mistakes visible in traces.
+
+Avoid feedback that only says "wrong" or restates the score. GEPA quality is
+bounded by the evidence the metric returns.
+
+## Overfitting Boundaries
+
+State what counts as a transferable improvement versus a benchmark-specific
+hack. Examples:
+
+- preserve citation grounding instead of memorizing answer strings;
+- improve table handling generally instead of keying on fixture names;
+- preserve sandbox path conventions and tool APIs;
+- prefer behavior that transfers across document lengths and layouts.
diff --git a/.agents/skills/rlm-gepa/references/project-layout.md b/.agents/skills/rlm-gepa/references/project-layout.md
new file mode 100644
index 00000000..239f070e
--- /dev/null
+++ b/.agents/skills/rlm-gepa/references/project-layout.md
@@ -0,0 +1,150 @@
+# RLM-GEPA Project Layout
+
+Create project-local optimization wiring only when the user asks for GEPA or
+prompt/skill optimization.
+
+```text
+my_rlm/
+├── agent/           # PredictRLM signature, schema, service, skills/tools
+├── bench/           # optional eval loaders/scoring/fixtures
+└── gepa/
+    ├── __init__.py
+    ├── config.py
+    ├── project.py
+    ├── cli.py
+    └── __main__.py
+```
+
+The generated `gepa/` package owns train/validation loading, metric feedback,
+seed candidate text, defaults, and CLI glue. The shared `rlm_gepa` package
+provides optimizer runtime and CLI helpers.
+
+## pyproject.toml
+
+Add GEPA dependencies and a project-local CLI when optimization is in scope.
+
+```toml
+dependencies = [
+    "predict-rlm[gepa,gepa-viz]>=0.7.0-alpha5,<0.8",
+]
+
+[project.scripts]
+rlm-gepa = "my_rlm.gepa:main"
+
+[tool.predict-rlm.generated]
+predict_rlm_version = "0.7.0-alpha5"
+skill_version = "3.0"
+layout = "agent-tools-bench-gepa"
+features = ["agent", "bench", "rlm-gepa"]
+```
+
+## Project Skeleton
+
+```python
+from dataclasses import dataclass
+from typing import Any
+
+from predict_rlm import PredictRLM, Skill
+from predict_rlm.trace import RunTrace
+from rlm_gepa import (
+    EvaluationContext,
+    RLMGepaExampleResult,
+    RLMGepaProject,
+    agent_spec_from_rlm,
+)
+
+from ..agent.signature import AnalyzeDocuments
+
+
+SEED_SKILL_INSTRUCTIONS = "Initial domain instructions for the RLM."
+
+
+@dataclass
+class EvalExample:
+    example_id: str
+    rlm_kwargs: dict[str, Any]
+    reference: Any
+
+
+def build_rlm(skill_instructions: str, *, lm=None, sub_lm=None, max_iterations=30):
+    return PredictRLM(
+        AnalyzeDocuments,
+        lm=lm,
+        sub_lm=sub_lm,
+        max_iterations=max_iterations,
+        skills=[Skill(name="document-analysis", instructions=skill_instructions)],
+    )
+
+
+class MyProject(RLMGepaProject):
+    project_name = "my-project"
+    components = ("skill_instructions",)
+    agent_spec = agent_spec_from_rlm(build_rlm(SEED_SKILL_INSTRUCTIONS), ...)
+
+    def seed_candidate(self) -> dict[str, str]:
+        return {"skill_instructions": SEED_SKILL_INSTRUCTIONS}
+
+    def load_trainset(self):
+        return [...]
+
+    def load_valset(self):
+        return [...]
+
+    async def evaluate_example(
+        self,
+        candidate: dict[str, str],
+        example: EvalExample,
+        context: EvaluationContext,
+    ) -> RLMGepaExampleResult:
+        rlm = build_rlm(
+            candidate["skill_instructions"],
+            lm=context.lm,
+            sub_lm=context.sub_lm,
+            max_iterations=context.max_iterations,
+        )
+        result = await rlm.acall(**example.rlm_kwargs)
+        score, feedback = score_result(result, example.reference)
+
+        trace: RunTrace | None = getattr(result, "trace", None)
+        traces = [trace] if trace is not None else []
+
+        return RLMGepaExampleResult(
+            score=score,
+            feedback=feedback,
+            traces=traces,
+            rlm_inputs={"example_id": example.example_id, **example.rlm_kwargs},
+            example_id=example.example_id,
+        )
+```
+
+## CLI
+
+The generated `my_rlm.gepa:main` should call `run_project_cli(...)`.
+
+```python
+from rlm_gepa.cli import run_project_cli
+
+from .config import default_config
+from .project import build_project
+
+
+def main() -> int:
+    return run_project_cli(build_project, default_config())
+```
+
+Use `optimize --check` before a real run:
+
+```bash
+uv run rlm-gepa optimize --check
+```
+
+If `bench/` exists, expose seed, validation, and held-out evaluation through the
+same CLI only when the user asks for eval commands.
+
+For eval and optimization CLIs, route task execution through
+`rlm_gepa.runtime.adapter.RLMGepaAdapter` rather than bespoke `asyncio.gather`
+loops. Project-local `bench/` code owns dataset selection, candidate loading,
+task setup, and `eval.json` summary shaping; the shared adapter owns concurrency,
+per-task timeouts, progress display, verbose RLM logs, `task_traces/*.jsonl`,
+and `cost_log.jsonl`. Write `eval.json` in the run directory so
+`rlm-gepa stats <run_dir>` works for held-out evals as well as optimization runs.
diff --git a/.agents/skills/rlm/SKILL.md b/.agents/skills/rlm/SKILL.md
index 201bf294..e8fdd389 100644
--- a/.agents/skills/rlm/SKILL.md
+++ b/.agents/skills/rlm/SKILL.md
@@ -1,1005 +1,144 @@
 ---
 name: rlm
 description: >
-  Plan and build an RLM (Recursive Language Model) with predict-rlm, or contribute
-  to predict-rlm/RLM-GEPA itself. Interactively defines inputs, outputs, skills,
-  and architecture from a goal, then implements the code. Use when the user wants
-  to create a new RLM, explore whether one is feasible, or modify RLM/RLM-GEPA
-  guidance and implementation.
-compatibility: Requires Python 3.11+, Deno, and the predict-rlm package (built on DSPy).
-metadata:
-  author: Emile Riberdy
-  version: "2.1"
+  Plan, design, and build callable PredictRLM/RLM packages with typed inputs,
+  structured outputs, skills, host-side tools, and smoke tests. Use when the user
+  wants to create a new RLM, assess whether a workflow is a good RLM fit, or add
+  normal PredictRLM usage code. Do not use for contributing to the predict-rlm
+  repository itself or for RLM-GEPA optimization wiring; use
+  predict-rlm-contributor or rlm-gepa for those tasks.
 ---
 
-# Build an RLM
+# Build An RLM
 
 An RLM is a callable, pre-configured agent. It autonomously explores context,
-writes and executes code in a sandboxed REPL, calls tools, inspects results, and
-iterates until the task is done. Unlike a chat agent, an RLM is a function — you
-define its inputs, outputs, and tools, then call it from your code. It returns
-structured data, not chat messages.
+writes and executes code in a sandboxed Python REPL, calls tools, inspects
+results, and iterates until the task is done. Unlike a chat agent, an RLM is a
+function: define its inputs, outputs, and tools, then call it from code. It
+returns structured data, not chat messages.
 
-This skill has two phases:
+Use this skill for new PredictRLM packages and application code. If the user is
+modifying the `predict-rlm` repo, switch to `predict-rlm-contributor`. If the
+user asks for GEPA, optimization, train/validation candidate selection, or
+`AgentSpec` wiring, switch to `rlm-gepa`.
 
-1. **Plan** — interactively define the RLM with the user, research feasibility,
-   produce a plan
-2. **Build** — implement the plan as code files
+## Reference Map
 
-**First action**: Check skill freshness if due, then enter plan mode using the
-EnterPlanMode tool, unless the user is contributing to predict-rlm/RLM-GEPA
-itself. For contribution work, use Contributor mode first.
+Read only what the task needs:
 
----
-
-# Skill freshness check
-
-When this skill is loaded in an environment with shell, filesystem, and network
-access, run a lightweight update check at most once per day. Keep the last-check
-marker under
-`${HERMES_HOME:-$HOME/.hermes}/skills/.rlm-skill-update-check.json`. Compare the
-installed `SKILL.md` against
-`https://raw.githubusercontent.com/Trampoline-AI/predict-rlm/main/.agents/skills/rlm/SKILL.md`
-using a content hash, ETag, or commit SHA.
-
-If a newer skill is available, tell the user the `/rlm` skill has updates and
-ask whether they want to update or reinstall it. Do not update automatically.
-Suggested commands are `hermes skills update` for Hermes-managed installs, or
-`npx skills add Trampoline-AI/predict-rlm` for direct Skills CLI installs.
-
-Skip the check silently when tools, network, or a writable marker path are not
-available.
-
----
-
-# Contributor mode for predict-rlm / RLM-GEPA
-
-Use this mode when the user is modifying this repository's code, docs, examples,
-or installable skill guidance rather than asking you to build a new RLM package.
-Do not force the new-RLM scoping interview. First inspect the repo context, the
-requested change, and the relevant implementation/docs paths.
-
-Contributor rules:
-
-- PredictRLM is for callable, repeatable, deep-context workflows, not open-ended
-  interactive chat flows.
-- Keep large inputs as `File` references or metadata. Use focused `predict()`
-  calls, and keep LLM-facing Pydantic schemas lean with
-  `Field(description=...)`.
-- Validate at system boundaries. Prefer host-side tools for native libraries,
-  auth, network APIs, filesystem-heavy work, and anything that cannot run
-  cleanly in Pyodide.
-- For RLM-GEPA, treat `AgentSpec`, evaluator feedback, and seed instructions as
-  the optimization direction. Keep runtime and budget knobs separate.
-- Derive `AgentSpec` signature/tool context from the constructed RLM with
-  `agent_spec_from_rlm(...)` where possible. Avoid duplicating broad prose or
-  exposing internal IDs unnecessarily.
-- Keep generic proposer behavior domain-neutral. Domain or benchmark specifics
-  belong in `AgentSpec`, seed/domain skills, runtime grounding examples, or
-  evaluator feedback.
-- Patch-merge/crossover should be evidence-backed behavioral grafting from train
-  disagreement traces, not broad synthesis, prompt concatenation, or source text
-  import.
-- Persist experimental optimizer behavior in config, CLI options, or artifacts
-  rather than hidden env-only switches.
-- Creating GitHub PRs/issues or pushing public branches is external publishing;
-  do it only when explicitly requested.
-- When an investigation identifies a bug or problem likely attributable to the
-  `predict-rlm` package, ask the user whether they want it reported as a GitHub
-  issue as soon as that attribution is clear. Do not open the issue without that
-  explicit approval.
-- For verification, docs-only changes need markdown sanity or
-  `git diff
-  --check`. Code changes need targeted tests, plus broader tests
-  when touching shared interfaces, sandbox execution, optimizer behavior, or
-  examples.
-
----
-
-# Phase 1: Plan
-
-Work through these steps interactively. Do not skip steps or rush to the plan.
-Each step should involve asking the user questions and confirming alignment
-before moving on.
-
-## Step 1: Goal Definition
-
-Understand what the user wants to build.
-
-Ask:
-
-- What is the desired outcome? What does success look like?
-- What is the input material? (documents, code, data, APIs, etc.)
-- What does the output look like? (structured report, modified files,
-  spreadsheet, etc.)
-
-Then **validate RLM fit**. An RLM is the right tool when:
-
-- The input is large and needs selective exploration (documents, datasets,
-  codebases)
-- The task is multi-step with tool use (extract -> transform -> validate)
-- Actions modify state (redaction, form filling, generation)
-- Parallel sub-LM calls are needed across many items
-- File-to-file transformations (PDFs -> spreadsheets, documents -> reports)
-
-If the task is better served by a single LLM call or a simple script, tell the
-user and suggest an alternative. Otherwise, proceed.
-
-## Step 2: Input Design
-
-Work with the user to define every input to the RLM.
-
-For each input, determine:
-
-- **Name** and **type**: `File`, `list[File]`, `str`, or a Pydantic model
-- **Description**: what it contains and how the RLM uses it
-- **Source**: user-provided file, API response, config, generated data
-
-Key principles:
-
-- Large content (PDFs, images, datasets) must be `File` references — the RLM
-  accesses content on-demand through skills, keeping its context small
-- Metadata (file paths, page counts, config flags) can be strings or Pydantic
-  models
-- Use `list[File]` for variable-count file inputs
-
-Confirm the input design with the user before proceeding.
-
-## Step 3: Output Design
-
-Work with the user to define the structured output.
-
-For each output field, determine:
-
-- **Name**, **type**, and **description**
-- Whether it's a Pydantic model (structured data), `File` (generated file), or
-  primitive
-
-Push for specificity — vague outputs lead to poor RLM performance. Sketch the
-Pydantic models with `Field(description=...)` annotations. Include nested models
-where appropriate.
-
-Ask the user:
-
-- What fields matter most? What would they check first?
-- Are there any computed/derived fields (scores, summaries, counts)?
-- Do they need output files (Excel, PDF, images)?
-
-Confirm the output design with the user before proceeding.
-
-## Step 4: Research
-
-This step is **autonomous**. Tell the user you are researching, then do it.
-
-Use web search and the Explore subagent to:
+- `references/project-layout.md`: generated package layout, smoke tests, and
+  service wiring patterns.
+- `references/predict-rlm-api.md`: `PredictRLM`, `File`, `Skill`, built-in
+  skills, tools, `predict()`, and CodexLM usage.
+- `references/sandbox-and-research.md`: feasibility research, Pyodide package
+  compatibility, network allowlists, and host-side tool decisions.
 
-1. **Find Python packages** for the domain (e.g., `networkx` for graphs,
-   `tree-sitter` for code parsing, `beautifulsoup4` for HTML).
+## Workflow
 
-2. **Check Pyodide compatibility**. The sandbox runs Pyodide (Python in WASM).
-   Only **pure-Python wheels** or packages with **Emscripten builds** work.
-   Search pypi.org for each package and check:
-   - Does it have a `py3-none-any` wheel? (pure Python — works)
-   - Does it have C extensions without Emscripten builds? (won't work in
-     sandbox)
-   - Is it in the Pyodide built-in package list? (check
-     <https://pyodide.org/en/stable/usage/packages-in-pyodide.html>)
+### 1. Define The Goal
 
-3. **Identify network needs**. Does the task require calling external APIs? If
-   so, note the domains for `allowed_domains`.
+Ask what success looks like, what input material the RLM receives, and what
+structured output or generated files it should return.
 
-4. **Identify host-side tool needs**. If any functionality cannot run in WASM
-   (native binaries, C extensions, heavy computation), it must be a **host-side
-   tool** — a Python function running on the host that the RLM calls like any
-   other tool.
+Validate RLM fit. An RLM is appropriate when the task needs selective
+exploration of large inputs, multi-step tool use, stateful file transformations,
+parallel sub-LM calls, or repeated callable workflows. If a simple script or
+single LLM call is the better tool, say so and suggest that path.
 
-5. **Check for existing skills**. The built-in skills are:
-   - `pdf` — pymupdf for PDF rendering, text extraction, manipulation
-   - `spreadsheet` — openpyxl, pandas, formulas for Excel work
-   - `docx` — python-docx for reading, writing, and modifying Word documents
+### 2. Design Inputs
 
-Report findings to the user with a clear feasibility assessment. Flag any
-blockers.
+Define every input:
 
-## Step 5: Skill Design
+- name and type: `File`, `list[File]`, `str`, or a Pydantic model;
+- description: what it contains and how the RLM uses it;
+- source: user-provided file, API response, config, or generated data.
 
-Based on research, design the skill configuration.
+Use `File` references for large content such as PDFs, images, workbooks,
+documents, datasets, audio, or video. Keep raw bulk content out of the LLM
+schema.
 
-### Built-in skills
+### 3. Design Outputs
 
-List which built-in skills to use and why.
-
-### Custom skills (if needed)
-
-For each custom skill, define:
-
-- **name**: short identifier
-- **instructions**: prose guidance injected into the RLM's system prompt —
-  teaches the RLM patterns and best practices. Be detailed; this is the primary
-  way to control RLM behavior.
-- **packages**: PyPI packages installed in the sandbox via micropip (must be
-  Pyodide-compatible)
-- **modules**: Python files mounted into the sandbox as importable modules
-- **tools**: host-side callable functions exposed to the RLM
-
-### Host-side tool design
-
-For each host-side tool:
-
-- Function name and signature with type hints
-- Docstring (the RLM sees this to understand how to call it)
-- What it does and why it must be host-side
-
-Confirm the skill design with the user before proceeding.
-
-## Step 6: Strategy and Architecture
-
-### Signature strategy
-
-Write the step-by-step strategy that goes in the signature's docstring. This is
-the RLM's playbook:
-
-1. What to do first (survey/understand the input)
-2. How to gather information (render pages, use predict() for extraction, call
-   tools)
-3. How to process and synthesize
-4. What to produce and where to save output files
-
-### Single vs chained RLMs
-
-Evaluate whether this needs one RLM or multiple chained RLMs.
-
-**Use a single RLM when**:
-
-- The task is one coherent workflow
-- All steps need the same context/state
-- The iteration count stays reasonable (under 40)
-
-**Use chained RLMs when**:
-
-- There are distinct phases with different skill needs
-- One phase produces artifacts consumed by another
-- The combined task would exceed reasonable iteration counts
-- Different phases benefit from different sub-LM models
-
-If chaining, define each stage:
-
-- Stage name, signature (inputs/outputs), skills, strategy
-- The DAG: which stage feeds into which, with typed connections
-
-### Configuration
-
-- `max_iterations` estimate per RLM
-- `allowed_domains` if network access is needed
-- `sub_lm` recommendations (capability level needed)
-
-### Delivery scope
-
-Confirm which artifacts the user wants. Do not assume evals or optimization are
-required for every RLM.
-
-- **Agent only**: the callable RLM package, domain skills/tools, and fast smoke
-  tests. This is the default unless the user asks for benchmarking or GEPA.
-- **Agent + evals**: add a `bench/` package with dataset loading, scoring, and
-  evaluation CLI/helpers. Use this when the user has labeled examples, fixtures,
-  or a deterministic metric.
-- **Agent + optimization**: add project-local RLM-GEPA wiring only when the user
-  wants prompt/skill optimization. GEPA needs train/validation examples and a
-  metric, but it does not require a separate held-out eval CLI unless requested.
-- **Full project**: agent, tools, benchmark/eval harness, and optimization
-  wiring, like the SpreadBench example.
-
-### RLM-GEPA scoping interview
-
-When the user asks for optimization wiring but has not supplied enough context
-to write a concrete `AgentSpec`, run a short interview before writing the plan.
-Do not invent the AgentSpec from a vague task description. Gather enough to
-define:
-
-- the product or optimization goal GEPA should improve for;
-- the input distribution, scale, and examples that represent real work;
-- the output schema and the failure modes users care about most;
-- the train/validation data source and whether labels or reference outputs
-  exist;
-- the scoring rule, partial-credit feedback, and anti-overfitting boundaries;
-- the tools, sandbox constraints, file conventions, and runtime facts the
-  proposer must preserve.
-
-The interview should scope the RLM that owns the real DSPy signature and tools;
-do not ask the user to restate `target_signature`, `tool_signatures`, or a broad
-agent description as separate prose artifacts. Generate the signature/tool
-fields from the constructed RLM with `agent_spec_from_rlm(...)`; omit
-`agent_type` unless the user volunteers a product or optimization anchor that
-adds useful context.
-
-If the user cannot answer everything, proceed with explicit assumptions and mark
-the generated `AgentSpec` fields that should be revisited before spending model
-calls.
-
-### Dataset and split hygiene
-
-When the user asks for evals or optimization, investigate the dataset before
-writing split or scoring code. Inspect enough examples to identify task types,
-input sizes, label/reference-output shape, duplicate or near-duplicate examples,
-leakage risks, missing labels, and failure buckets the scorer should expose.
-Capture those findings in the plan instead of treating the dataset as an opaque
-list of rows.
-
-Use split semantics consistently:
-
-- **Train**: examples the optimizer/proposer may use to generate and gate edits.
-- **Validation**: examples used for candidate selection and regression checks
-  during optimization.
-- **Test / held-out eval**: optional final reporting set. Do not create or spend
-  on it unless the user asks for a benchmark/eval harness or has enough labeled
-  data to justify it.
-
-Prefer deterministic splits. Put the random seed, split ratio/counts, grouping
-key (if examples share source documents/users/tasks), and any sampling limits in
-`bench/config.py` or `gepa/config.py`. Split by group when leakage is plausible;
-never let near-identical cases from the same source land in both train and
-validation without calling it out. If the dataset is tiny, prefer explicit
-hand-authored train/validation files over random splitting.
-
-For GEPA, the project-local `gepa/` code owns train/validation loading and the
-seed candidate text. The seed candidate means the initial mutable component,
-such as baseline skill instructions; it is separate from the random seed used
-for splits, sampling, or optimizer reproducibility.
-
-For benchmarks with official splits, preserve the benchmark's public semantics.
-Use the official train split for optimization data, carve GEPA validation from
-that train split when a candidate-selection set is needed, and reserve official
-dev/test/challenge splits for reporting only when the user asks for held-out
-benchmark evaluation. Do not let optimizer feedback leak from held-out splits
-into seed instructions or candidate selection.
-
-### Benchmark integration boundaries
-
-Keep benchmark evaluators and oracle-style answer checkers harness-side. The RLM
-may see environment-safe tools, docs, state APIs, or session controls, but it
-should not see evaluator feedback or hidden scoring APIs while solving an
-example. After the attempt, the harness can call the evaluator and pass score
-and feedback to GEPA as the learning signal.
-
-When benchmark packages conflict with predict-rlm, DSPy, Pyodide, or the main
-project environment, prefer an isolated host-side runner/tool behind a typed
-JSON boundary. Do not force incompatible benchmark dependencies into the RLM
-sandbox or main package environment.
-
-For eval and optimization CLIs, route task execution through the shared
-`rlm_gepa.runtime.adapter.RLMGepaAdapter` semantics rather than bespoke
-`asyncio.gather` loops. Project-local `bench/` code may own dataset selection,
-candidate loading, and `eval.json` summary shaping, but should reuse the adapter
-for concurrency, per-task timeouts, progress bars, verbose RLM log handling,
-`task_traces/*.jsonl`, and `cost_log.jsonl`. If the eval command is async, call
-`await adapter.aevaluate(...)`; if it is synchronous, call
-`adapter.evaluate(...)`. Write `eval.json` in the run directory so
-`rlm-gepa stats <run_dir>` works for held-out evals as well as optimize runs.
-
-## Feasibility Checklist
-
-Before producing the final plan, verify:
-
-- [ ] All proposed packages are Pyodide-compatible (or have host-side fallbacks)
-- [ ] Network access needs are identified with specific domains
-- [ ] Host-side tools are defined for anything that can't run in WASM
-- [ ] Iteration count is reasonable (under 50 per RLM)
-- [ ] Input sizes are manageable (or chunking strategy is defined)
-- [ ] Output schemas are specific enough for reliable extraction
-- [ ] The task is achievable — no unsupported capabilities assumed
-
-## Plan Output
-
-Write the plan to the Claude Code plan file with these sections:
-
-1. **Overview** — one paragraph: what, why, and expected workflow
-2. **Delivery scope** — agent-only, evals, optimization, or full project
-3. **File manifest** — every file to create with a one-line description
-4. **Input schemas** — complete Pydantic model code for `agent/schema.py`
-5. **Output schemas** — complete Pydantic model code for `agent/schema.py`
-6. **Signature** — complete `agent/signature.py` code with strategy docstring
-7. **Skills configuration** — built-in imports + custom `Skill(...)`
-   definitions + tool signatures
-8. **Service architecture** — single RLM wiring or chained DAG:
-
-   ```
-   Stage1(documents) --[ExtractedData]--> Stage2(extracted) --[Report]--> Stage3(report)
-   ```
-
-9. **Optional eval/optimization design** — only if requested. Include dataset
-   audit findings, split policy, scoring feedback shape, and reproducibility
-   seed. For optimization, also include the `AgentSpec` interview summary,
-   train/validation source, seed candidate source, and the exact `gepa/` files
-   to create.
-10. **Feasibility notes** — constraints, risks, alternatives
-11. **Estimated complexity** — iteration count, sub-LM calls, cost range,
-    runtime
-12. **Smoke tests** — test files to create and commands to run. Every generated
-    RLM must include at least one fast no-network smoke test that imports the
-    generated package and constructs the service without making LLM calls.
-
-After writing the plan, use ExitPlanMode to get user approval. Once approved,
-proceed to Phase 2.
-
----
-
-# Phase 2: Build
-
-Implement the approved plan. Create all files following the patterns below.
-
-## File structure
-
-Default to a grouped package. Keep the root package thin and put the callable
-RLM under `agent/`. Add `tools/`, `bench/`, and `gepa/` only when the selected
-delivery scope needs them.
-
-```
-my_rlm/
-├── pyproject.toml        # Dependencies + generated-with metadata
-├── __init__.py           # Public exports from agent/
-├── agent/
-│   ├── __init__.py       # Public agent exports
-│   ├── schema.py         # Pydantic models for inputs AND outputs
-│   ├── signature.py      # DSPy Signature + strategy docstring
-│   ├── service.py        # DSPy Module wiring signature + PredictRLM + skills
-│   └── skills.py         # Optional custom skill definitions
-├── tools/                # Optional host-side tools and helpers
-│   └── __init__.py
-├── bench/                # Optional dataset/eval/scoring code
-│   ├── __init__.py
-│   ├── config.py         # Optional eval defaults
-│   └── cli.py            # Optional eval CLI helpers
-├── gepa/                 # Optional project-local RLM-GEPA wiring
-│   ├── __init__.py       # Exports main for `my_rlm.gepa:main`
-│   ├── config.py         # OptimizeConfig defaults + AgentSpec
-│   ├── project.py        # RLMGepaProject implementation
-│   ├── cli.py            # Thin run_project_cli wiring
-│   └── __main__.py       # Optional `python -m my_rlm.gepa`
-└── tests/
-    └── test_smoke.py     # Fast import/construction smoke tests
-```
-
-**Always create**: `pyproject.toml`, package `__init__.py`, `agent/schema.py`,
-`agent/signature.py`, `agent/service.py`, `agent/__init__.py`, and
-`tests/test_smoke.py`.
-
-**Create when needed**:
-
-- `agent/skills.py` when the RLM needs domain-specific instructions beyond
-  built-in skills.
-- `tools/` when host-side functions or helper modules are needed.
-- `bench/` when the user wants evals, datasets, scoring, or eval config.
-- `gepa/` and a console script when the user wants RLM-GEPA optimization.
-
-Do not add compatibility shims for old flat module names in newly generated
-projects. The grouped imports are the source of truth.
-
-## pyproject.toml — Dependencies and generated-with metadata
-
-Every generated RLM project should record which predict-rlm version and skill
-layout it targets. Use the current package version unless the user explicitly
-pins another one. For this repository version, that is `0.4.1`.
-
-Agent-only project:
-
-```toml
-[project]
-name = "my-rlm"
-version = "0.1.0"
-requires-python = ">=3.11"
-dependencies = [
-    "predict-rlm>=0.4.1,<0.5",
-]
-
-[tool.predict-rlm.generated]
-predict_rlm_version = "0.4.1"
-skill_version = "2.0"
-layout = "agent-tools-bench-gepa"
-features = ["agent"]
-```
-
-If the project uses built-in example skills, add only the required extras or
-packages. If it uses GEPA, add the GEPA extras and project-local `rlm-gepa`
-script as the main UX. Do not include optimization dependencies for an
-agent-only or eval-only project.
-
-```toml
-dependencies = [
-    "predict-rlm[examples,gepa,gepa-viz]>=0.4.1,<0.5",
-]
-
-[project.scripts]
-rlm-gepa = "my_rlm.gepa:main"
-
-[tool.predict-rlm.generated]
-predict_rlm_version = "0.4.1"
-skill_version = "2.0"
-layout = "agent-tools-bench-gepa"
-features = ["agent", "tools", "bench", "rlm-gepa"]
-```
-
-For examples inside the predict-rlm monorepo, an editable path source is fine,
-but keep the generated metadata table so readers know which API/layout version
-the project was generated against.
-
-## agent/schema.py — Pydantic models
-
-Define models for structured inputs and outputs. Use `Field(description=...)` so
-the RLM knows what each field means.
-
-```python
-from pydantic import BaseModel, Field
-
-
-class KeyDate(BaseModel):
-    """A key date extracted from a document."""
-
-    name: str = Field(description="e.g. 'Submission Deadline', 'Effective Date'")
-    date: str = Field(description="ISO format date (YYYY-MM-DD)")
-    time: str | None = Field(
-        None, description="24-hour format (HH:MM), e.g. '14:00', '09:30'"
-    )
-    timezone: str | None = Field(
-        None, description="Timezone code, e.g. 'EST', 'EDT', 'PST', 'UTC'"
-    )
-
-
-class DocumentAnalysis(BaseModel):
-    """Structured analysis of a document set."""
-
-    report: str = Field(
-        description="Full analysis as a well-formatted markdown report"
-    )
-    key_dates: list[KeyDate] = Field(
-        default_factory=list, description="Important dates found in the documents"
-    )
-```
-
-## agent/signature.py — Inputs, outputs, and strategy
-
-The docstring becomes the RLM's system instructions — tell the RLM how to
-approach the task step by step:
-
-```python
-import dspy
-
-from predict_rlm import File
-
-from .schema import DocumentAnalysis
-
-
-class AnalyzeDocuments(dspy.Signature):
-    """Analyze documents and produce a structured report.
-
-    1. **Read the report criteria** (appended below) to understand what
-       information to extract and in what format.
-
-    2. **Survey the documents** to understand what you're working with:
-       file names, page counts, document types.
-
-    3. **Gather information** systematically by rendering pages as images
-       and using predict() to extract content.
-
-    4. **Produce the report** following the format specified in the criteria.
-       Use tables for structured data, prose for analysis and context.
-    """
-
-    documents: list[File] = dspy.InputField(
-        desc="PDF documents to analyze"
-    )
-    analysis: DocumentAnalysis = dspy.OutputField(
-        desc="Structured analysis with markdown report, key dates, and key entities"
-    )
-```
-
-## agent/service.py — Wiring it together
-
-Wrap signature + skills + PredictRLM into a reusable DSPy Module:
-
-```python
-import dspy
-
-from predict_rlm import File, PredictRLM
-from predict_rlm.skills import pdf as pdf_skill
-
-from .schema import DocumentAnalysis
-from .signature import AnalyzeDocuments
-
-
-class DocumentAnalyzer(dspy.Module):
-    def __init__(
-        self,
-        sub_lm: dspy.LM | str | None = None,
-        max_iterations: int = 30,
-        verbose: bool = False,
-        debug: bool = False,
-    ):
-        self.sub_lm = sub_lm
-        self.max_iterations = max_iterations
-        self.verbose = verbose
-        self.debug = debug
-
-    async def aforward(
-        self, documents: list[File], criteria: str
-    ) -> DocumentAnalysis:
-        signature = AnalyzeDocuments.with_instructions(
-            AnalyzeDocuments.instructions + "\n\n# Task\n\n" + criteria.strip()
-        )
-        predictor = PredictRLM(
-            signature,
-            sub_lm=self.sub_lm,
-            skills=[pdf_skill],
-            max_iterations=self.max_iterations,
-            verbose=self.verbose,
-            debug=self.debug,
-        )
-        result = await predictor.acall(documents=documents)
-        return result.analysis
-```
-
-When using multiple skills or host-side tools:
-
-```python
-from predict_rlm.skills import pdf as pdf_skill
-from predict_rlm.skills import spreadsheet as spreadsheet_skill
-
-async def aforward(self, documents: list[File]) -> MyOutput:
-    predictor = PredictRLM(
-        MySignature,
-        sub_lm=self.sub_lm,
-        skills=[pdf_skill, spreadsheet_skill],
-        tools={"fetch_exchange_rate": fetch_exchange_rate},
-        ...
-    )
-```
-
-### Chaining pattern (multiple RLMs)
-
-```python
-async def aforward(self, documents: list[File]):
-    # Stage 1: Extract
-    extractor = PredictRLM(ExtractSignature, sub_lm=self.sub_lm, skills=[pdf_skill])
-    extracted = await extractor.acall(documents=documents)
-
-    # Stage 2: Analyze (uses output from stage 1)
-    analyzer = PredictRLM(AnalyzeSignature, sub_lm=self.sub_lm, skills=[analysis_skill])
-    result = await analyzer.acall(data=extracted.data)
-
-    return result
-```
-
-## agent/skills.py — Custom skills
-
-Create only when the RLM needs domain-specific instructions beyond built-in
-skills.
-
-```python
-from predict_rlm import Skill
-from predict_rlm.skills import pdf as pdf_skill
-
-redaction_skill = Skill(
-    name="redaction",
-    instructions="""How to redact content from PDFs using pymupdf.
-
-## Text redaction
-Search for text, create redaction annotations, then apply:
-    page = doc[page_num]
-    hits = page.search_for("sensitive text")
-    for rect in hits:
-        page.add_redact_annot(rect, fill=(0, 0, 0))
-    page.apply_redactions()
-...""",
-)
-
-__all__ = ["pdf_skill", "redaction_skill"]
-```
-
-## tests/test_smoke.py — Generated smoke tests
-
-Create smoke tests for every generated RLM. The default smoke test must be fast
-and must not require network access, API keys, Deno, Pyodide, or an actual LLM
-call. It should prove the generated package imports, the signature exposes the
-expected fields, and the service can be constructed.
-
-Tailor imports and class names to the generated RLM:
-
-```python
-def test_service_constructs():
-    from my_rlm import DocumentAnalyzer
-
-    service = DocumentAnalyzer(max_iterations=1, verbose=False, debug=False)
-    assert service.max_iterations == 1
-
-
-def test_signature_has_fields():
-    from my_rlm.agent.signature import AnalyzeDocuments
-
-    assert AnalyzeDocuments.input_fields
-    assert AnalyzeDocuments.output_fields
-```
-
-If the generated project includes tiny local fixtures and the user wants an
-end-to-end check, add a separate `@pytest.mark.integration` test that performs a
-minimal `acall`. Gate that test behind explicit credentials or an environment
-flag so the default smoke suite remains deterministic and cheap.
-
-## Optional bench/ package — Evals
-
-Create `bench/` only when the user wants evaluation. Keep it project-local:
-dataset loaders, scoring rules, fixtures, and reports belong here, not in
-`agent/`. Evals can exist without optimization. The eval layer should make the
-dataset audit and split policy explicit: where examples come from, how labels or
-reference outputs are represented, which examples are train/validation/test, and
-which seed/grouping rules make the split reproducible.
-
-Suggested files when needed:
-
-```text
-bench/
-├── __init__.py
-├── config.py      # Eval defaults and EvalConfig
-├── dataset.py      # Load examples/fixtures into typed task objects
-├── evaluation.py   # Project-specific task execution/scoring contract
-├── scoring.py      # Deterministic task-specific scoring
-└── cli.py          # Optional held-out eval subcommand/helpers
-```
-
-For benchmark eval and optimize entrypoints, use the shared RLM-GEPA runtime
-adapter semantics in `src/rlm_gepa/runtime/adapter.py` unless the user
-explicitly asks for a one-off local harness. Put dataset loading, scoring,
-setup, and task cleanup behind the project contract; let the shared adapter own
-concurrency, timeouts, progress/tqdm display, trace capture, and report
-semantics.
-
-## Optional gepa/ package — Optimization
-
-Create optimization wiring only when the user asks for it. The shared `rlm_gepa`
-package provides generic orchestration; the generated project owns its task
-loading, metric, seed candidate, and defaults. Import `agent_spec_from_rlm`,
-`OptimizeConfig`, `RLMGepaExampleResult`, `RLMGepaProject`, and
-`EvaluationContext` from `rlm_gepa` rather than copying optimizer internals into
-the project.
-
-The generated `AgentSpec` should be interview-backed, but it should not
-duplicate facts already present on the RLM. Define a single `build_rlm(...)`
-helper that constructs the PredictRLM with the real DSPy signature, skills, and
-tools. Use `agent_spec_from_rlm(build_rlm(seed_instructions), ...)` so GEPA
-derives `target_signature` and `tool_signatures` from that object. Omit
-`agent_type` by default; set it only for a short product or optimization anchor
-that adds non-duplicative framing beyond the signature, tools, or output schema.
-The interview should supply only the extra GEPA brief: transfer use cases,
-runtime-grounding examples, scoring signal, and anti-overfitting boundary. If
-any of those are weakly specified, add a `TODO` in `config.py` and keep
-`optimize --check` available so the user can catch missing setup before spending
-model calls.
-
-Suggested files when needed:
-
-```text
-gepa/
-├── __init__.py    # Exports `main` for the console script
-├── config.py      # Project-local OptimizeConfig defaults + AgentSpec
-├── project.py     # RLMGepaProject implementation and metric wiring
-├── cli.py         # Thin run_project_cli glue
-└── __main__.py    # Optional `python -m my_rlm.gepa`
-```
-
-Optimization can reuse helpers from `bench/` when evals exist. If a `bench/`
-package is present, expose its seed/validation/held-out evaluation flow as an
-`eval` subcommand on the same GEPA CLI surface. Do not create a held-out eval
-command just because GEPA is present: GEPA needs training and validation
-examples plus feedback; a separate benchmark/eval suite is optional.
-
----
+Define every output field with name, type, and description. Use Pydantic models
+with `Field(description=...)` for structured outputs. Use `File` output fields
+when the RLM must write artifacts back to the host.
 
-# Architecture Reference
+Push vague outputs into concrete schemas before implementation. Ask what fields
+the caller will inspect first, which derived fields matter, and whether generated
+files are required.
 
-Use this reference to ensure plans and implementations are accurate. Do not
-hallucinate parameters or patterns.
+### 4. Research Feasibility
 
-## How an RLM works
+Do autonomous research before writing the plan. Read
+`references/sandbox-and-research.md` when package compatibility, network access,
+or host-side tools are relevant.
 
-The architecture is two-level:
+Report feasibility clearly: package support, sandbox constraints,
+`allowed_domains`, required host tools, likely iteration count, and blockers.
 
-1. **The outer LLM** (the RLM itself) writes and executes Python code in a
-   sandboxed Pyodide/WASM REPL. It plans, orchestrates, and iterates.
-2. **The sub-LM** (via `predict()`) handles perception and extraction —
-   analyzing images, understanding text, and returning typed results. Each
-   `predict()` call gets its own context window.
+### 5. Design Skills And Tools
 
-The outer LLM's context stays small (code + tool results), while context-heavy
-work is offloaded to `predict()` calls.
+Choose built-in skills or custom `Skill(...)` definitions. Use built-in skills
+for common document domains:
 
-## File I/O
+- `pdf`: PDF reading, rendering, manipulation, and redaction.
+- `spreadsheet`: Excel workbook editing, formulas, and verification.
+- `docx`: Word document reading, writing, tables, formatting, and styles.
 
-Use `File` for file-typed fields:
+Create custom skills only when the RLM needs reusable domain instructions,
+sandbox packages, mounted modules, or bundled host-side tools.
 
-- **Input field**: mounts the file from host into the sandbox at
-  `/sandbox/input/{field_name}/`
-- **Output field**: syncs from `/sandbox/output/{field_name}/` back to the host
+Use host-side tools for authenticated APIs, database calls, native binaries,
+heavy filesystem work, or anything that cannot run cleanly in the sandbox.
 
-```python
-from predict_rlm import File
+### 6. Choose Architecture
 
-# Input: File(path="/absolute/path/to/file.pdf")
-# Output: declared as File output field, RLM writes to /sandbox/output/<field>/
-```
+Write the signature docstring as the RLM playbook:
 
-## PredictRLM constructor
+1. how to survey the inputs;
+2. how to gather information with files, skills, tools, and `predict()`;
+3. how to process and verify results;
+4. what to return or save.
 
-```python
-PredictRLM(
-    signature: type[Signature] | str,     # DSPy signature class
-    lm: dspy.LM | str | None = None,      # Main LM (code generation)
-    sub_lm: dspy.LM | str | None = None,  # Sub-LM for predict() calls
-    max_iterations: int = 30,
-    max_llm_calls: int = 50,
-    verbose: bool = False,
-    tools: dict[str, Callable] | list[Callable] | None = None,
-    allowed_domains: list[str] | None = None,
-    skills: list[Skill] | None = None,
-    debug: bool = False,
-    output_dir: str | Path | None = None,
-)
-```
+Use a single RLM when the task is one coherent workflow and can stay within a
+reasonable iteration budget. Use chained RLMs when phases have different inputs,
+skills, outputs, or budgets.
 
-Both `lm` and `sub_lm` accept a model string (e.g. `"openai/gpt-5.4"`) or a
-`dspy.LM` instance. If `lm` is omitted, the current context LM from
-`dspy.context(lm=...)` is used.
+### 7. Confirm Delivery Scope
 
-## CodexLM / ChatGPT subscription backend
+Default to an agent-only package unless the user asks for more:
 
-Starting in `predict-rlm` v0.7.0, `predict-rlm[codex-lm]` includes
-`dspy_codex_lm.CodexLM`, a DSPy LM backed by the Codex/ChatGPT subscription
-backend. Use it when the user wants PredictRLM to run on Codex model slugs
-through ChatGPT/Codex auth instead of normal OpenAI API keys.
+- **Agent only**: callable RLM package, domain skills/tools, and fast smoke
+  tests.
+- **Agent + evals**: add project-local dataset loading and scoring when the user
+  has examples, fixtures, labels, or deterministic metrics.
 
-Install and authenticate:
+For GEPA optimization or candidate selection, stop using this skill and switch
+to the `rlm-gepa` skill.
 
-```bash
-uv add "predict-rlm[codex-lm]"
-uv run codex-lm auth login default
-uv run codex-lm auth status
-uv run codex-lm smoke-test --model gpt-5.5
-```
-
-For prerelease builds, allow prereleases explicitly:
-
-```bash
-uv add --prerelease=allow "predict-rlm[codex-lm]==0.7.0-alpha0"
-```
+### 8. Write The Plan
 
-Use `CodexLM` directly when wiring `lm` or `sub_lm`:
-
-```python
-from dspy_codex_lm import CodexLM
-from predict_rlm import PredictRLM
-
-rlm = PredictRLM(
-    MySignature,
-    lm=CodexLM(model="gpt-5.5"),
-    sub_lm=CodexLM(model="gpt-5.5"),
-)
-```
+Produce a plan with:
 
-To run an existing DSPy script without editing its LM construction, use the CLI
-wrapper. It monkeypatches OpenAI-family `dspy.LM(...)` calls and routes supported
-Codex model slugs to `CodexLM`:
+1. Overview.
+2. Delivery scope.
+3. File manifest.
+4. Input schemas.
+5. Output schemas.
+6. Signature code and strategy docstring.
+7. Skills and host-side tools.
+8. Service architecture or chained RLM DAG.
+9. Feasibility notes.
+10. Estimated complexity: iterations, sub-LM calls, runtime, and cost range.
+11. Smoke tests and commands.
 
-```bash
-uv run codex-lm my_dspy_script.py
-```
+Get approval before building when the environment or calling surface has an
+explicit plan mode. Otherwise, proceed if the user has already asked for
+implementation; if not, ask for approval before building.
 
-Useful CLI commands:
+### 9. Build And Verify
 
-```bash
-uv run codex-lm --help
-uv run codex-lm auth list
-uv run codex-lm usage
-uv run codex-lm rotation on
-```
-
-Important caveats:
-
-- CodexLM uses Codex/ChatGPT subscription auth, not ordinary OpenAI API keys.
-- Routing is strict: OpenAI-family model strings are intercepted only when the
-  slug is a supported Codex model; unsupported OpenAI-family slugs raise an
-  error instead of silently falling back.
-- Common supported slugs include `gpt-5.1-codex`, `gpt-5.1-codex-max`,
-  `gpt-5.1-codex-mini`, `gpt-5.2`, `gpt-5.2-codex`, `gpt-5.3-codex`,
-  `gpt-5.3-codex-spark`, `gpt-5.4`, `gpt-5.4-mini`, and `gpt-5.5`.
-
-## Skill dataclass
-
-```python
-from predict_rlm import Skill
-
-Skill(
-    name="my-skill",                          # Short identifier
-    instructions="How to approach...",         # Prose injected into the RLM prompt
-    packages=["pandas", "openpyxl"],           # PyPI packages installed in the sandbox
-    modules={"helper": "/path/to/helper.py"},  # Python files mounted as importable modules
-    tools={"fetch": fetch_fn},                 # Host-side callable functions exposed to the RLM
-)
-```
-
-Skills can bundle **host-side tools** via their `tools=` field. When skills are
-composed, their tools are merged alongside instructions and packages (tool name
-conflicts raise errors).
-
-## Built-in skills
-
-```python
-from predict_rlm.skills import pdf as pdf_skill          # pymupdf
-from predict_rlm.skills import spreadsheet as spreadsheet_skill  # openpyxl, pandas, formulas
-from predict_rlm.skills import docx as docx_skill        # python-docx
-```
-
-| Skill           | Packages                         | Modules        | What it teaches the RLM                                                    |
-| --------------- | -------------------------------- | -------------- | -------------------------------------------------------------------------- |
-| **pdf**         | `pymupdf`                        | —              | Read, render, modify, and redact PDFs                                      |
-| **spreadsheet** | `openpyxl`, `pandas`, `formulas` | `formula_eval` | Build and modify Excel workbooks with formulas and formatting              |
-| **docx**        | `python-docx`                    | `md2docx`      | Read, write, and modify Word documents with tables, formatting, and styles |
-
-## Tools
-
-Tools are **host-side functions** the RLM can call from the sandbox. Use them
-for operations that cannot run inside the sandbox — host access, authenticated
-APIs, database queries, system resources.
-
-```python
-async def fetch_exchange_rate(currency: str, date: str) -> str:
-    """Fetch the exchange rate for a currency on a given date.
-
-    Args:
-        currency: ISO currency code (e.g. "EUR", "GBP")
-        date: Date in YYYY-MM-DD format
-
-    Returns:
-        JSON string with the exchange rate data
-    """
-    async with httpx.AsyncClient() as client:
-        resp = await client.get(f"https://api.example.com/rates/{currency}/{date}")
-        return resp.text
-```
-
-Tools can be passed directly to PredictRLM via `tools={"name": fn}` or bundled
-inside a Skill via `tools=`.
-
-### When to use a Skill vs tools
-
-| Use a Skill when...                                  | Use `tools=` when...                                                      |
-| ---------------------------------------------------- | ------------------------------------------------------------------------- |
-| The RLM needs a **package** installed in the sandbox | The function must run on the **host** (API calls, DB queries, filesystem) |
-| You need to teach the RLM **how to use** something   | The tool's docstring is self-explanatory                                  |
-| The knowledge is **reusable** across RLMs            | It's a single specific function for one RLM                               |
-
-## predict() tool (inside sandbox)
-
-The RLM can call `predict()` for sub-LM perception/extraction:
-
-```python
-result = await predict(
-    "image: dspy.Image -> items: list[Item]",
-    instructions="Extract all line items from this invoice page",
-    image=page_image,
-)
-```
-
-Each predict() call gets its own context window. Supports `dspy.Image` for
-multimodal.
-
-## Key imports
-
-```python
-from predict_rlm import PredictRLM, Skill, File
-from predict_rlm.skills import pdf, spreadsheet, docx
-```
-
-## WASM sandbox constraints
-
-- Only pure-Python wheels or Pyodide built-in packages work
-- No subprocess, no native binaries, no C extensions (unless Emscripten-built)
-- Network access requires `allowed_domains` whitelist
-- File I/O is within the sandbox filesystem
-- Host-side tools bridge the gap for anything WASM can't do
+Implement the approved plan using the package structure in
+`references/project-layout.md`. Every generated RLM must include fast
+no-network smoke tests that import the package, inspect the signature fields,
+and construct the service without making LLM calls.
diff --git a/.agents/skills/rlm/references/predict-rlm-api.md b/.agents/skills/rlm/references/predict-rlm-api.md
new file mode 100644
index 00000000..3561cdc3
--- /dev/null
+++ b/.agents/skills/rlm/references/predict-rlm-api.md
@@ -0,0 +1,138 @@
+# PredictRLM API Reference
+
+Use this reference to keep generated code aligned with the package API.
+
+## Core Imports
+
+```python
+from predict_rlm import File, PredictRLM, Skill
+from predict_rlm.skills import docx, pdf, spreadsheet
+```
+
+## PredictRLM
+
+```python
+PredictRLM(
+    signature: type[Signature] | str,
+    lm: dspy.LM | str | None = None,
+    sub_lm: dspy.LM | str | None = None,
+    max_iterations: int = 30,
+    max_llm_calls: int = 50,
+    max_output_chars: int = 100_000,
+    verbose: bool = True,
+    tools: dict[str, Callable[..., str]] | list[Callable] | None = None,
+    interpreter: CodeInterpreter | None = None,
+    sandbox_backend: BackendName | str | None = None,
+    sbx_config: SbxConfig | None = None,
+    sbx_pool: SbxPool | None = None,
+    allowed_domains: list[str] | None = None,
+    skills: list[Skill] | None = None,
+    debug: bool = False,
+    output_dir: str | Path | None = None,
+    telemetry_context: TelemetryContext | None = None,
+    submit_confirmation: Callable[[SubmitConfirmationContext], str | None] | None = None,
+    trace_export_path: str | Path | None = None,
+    runtime_hooks: list[RuntimeHook] | None = None,
+    on_runtime_hook_event: Callable[[RuntimeHookEvent], Any] | None = None,
+    model_execution_timeout: bool = False,
+)
+```
+
+Both `lm` and `sub_lm` accept a model string or a `dspy.LM` instance. If `lm` is
+omitted, the current `dspy.context(lm=...)` LM is used.
+
+## File I/O
+
+Use `File` for large inputs and generated artifacts.
+
+- Input fields mount host files under `/sandbox/input/{field_name}/`.
+- Output fields sync files from `/sandbox/output/{field_name}/` back to the host.
+
+## Skills
+
+```python
+Skill(
+    name="my-skill",
+    instructions="How to approach the domain...",
+    packages=["pandas", "openpyxl"],
+    modules={"helper": "/path/to/helper.py"},
+    tools={"fetch": fetch_fn},
+)
+```
+
+Skills bundle reusable instructions, sandbox packages, mounted modules, and
+host-side tools. When skills are composed, instructions concatenate, packages
+deduplicate, and tool-name conflicts raise errors.
+
+Built-in skills:
+
+| Skill | Import | Packages | Modules | Purpose |
+| --- | --- | --- | --- | --- |
+| `pdf` | `from predict_rlm.skills import pdf` | `pymupdf` | - | Read, render, modify, and redact PDFs |
+| `spreadsheet` | `from predict_rlm.skills import spreadsheet` | `openpyxl`, `pandas`, `formulas` | `formula_eval` | Build and modify Excel workbooks |
+| `docx` | `from predict_rlm.skills import docx` | `python-docx` | `md2docx` | Read, write, and modify Word documents |
+
+## Host-Side Tools
+
+Tools are host-side callables the RLM can invoke from the sandbox. Use them for
+operations that need host access, authenticated APIs, databases, native
+libraries, or filesystem-heavy work.
+
+```python
+async def fetch_exchange_rate(currency: str, date: str) -> str:
+    """Fetch the exchange rate for a currency on a given date.
+
+    Args:
+        currency: ISO currency code, e.g. "EUR".
+        date: Date in YYYY-MM-DD format.
+
+    Returns:
+        JSON string with exchange-rate data.
+    """
+    ...
+```
+
+Pass tools directly via `tools={"name": fn}` or bundle reusable tools in a
+`Skill`.
+
+## predict() Inside The Sandbox
+
+The RLM can call `predict()` for sub-LM perception or extraction. Each call gets
+its own context window.
+
+```python
+result = await predict(
+    "image: dspy.Image -> items: list[Item]",
+    instructions="Extract all line items from this invoice page",
+    image=page_image,
+)
+```
+
+Use `dspy.Image` for multimodal image inputs.
+
+## CodexLM
+
+When the user wants PredictRLM to run on Codex/ChatGPT subscription auth instead
+of ordinary OpenAI API keys, use `predict-rlm[codex-lm]` and `CodexLM`.
+
+```bash
+uv add "predict-rlm[codex-lm]"
+uv run codex-lm auth login default
+uv run codex-lm auth status
+uv run codex-lm smoke-test --model gpt-5.5
+```
+
+```python
+from dspy_codex_lm import CodexLM
+from predict_rlm import PredictRLM
+
+rlm = PredictRLM(
+    MySignature,
+    lm=CodexLM(model="gpt-5.5"),
+    sub_lm=CodexLM(model="gpt-5.5"),
+)
+```
+
+`CodexLM` uses Codex/ChatGPT subscription auth, not ordinary OpenAI API keys.
+Routing is strict: unsupported Codex slugs should fail instead of silently
+falling back.
diff --git a/.agents/skills/rlm/references/project-layout.md b/.agents/skills/rlm/references/project-layout.md
new file mode 100644
index 00000000..8ee97805
--- /dev/null
+++ b/.agents/skills/rlm/references/project-layout.md
@@ -0,0 +1,187 @@
+# Project Layout
+
+Default to a grouped package. Keep the root package thin and put the callable
+RLM under `agent/`. Add `tools/` and `bench/` only when the selected delivery
+scope needs them.
+
+```text
+my_rlm/
+├── pyproject.toml
+├── __init__.py
+├── agent/
+│   ├── __init__.py
+│   ├── schema.py
+│   ├── signature.py
+│   ├── service.py
+│   └── skills.py        # optional custom skills
+├── tools/               # optional host-side tools/helpers
+├── bench/               # optional eval dataset/scoring code
+└── tests/
+    └── test_smoke.py
+```
+
+Always create `pyproject.toml`, the root package `__init__.py`,
+`agent/schema.py`, `agent/signature.py`, `agent/service.py`,
+`agent/__init__.py`, and `tests/test_smoke.py`. Do not add compatibility shims
+for old flat module names in newly generated projects.
+
+## pyproject.toml
+
+Record generated-with metadata so readers know the target API/layout version.
+Use the current package version unless the user explicitly pins another one.
+
+```toml
+[project]
+name = "my-rlm"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "predict-rlm>=0.7.0-alpha5,<0.8",
+]
+
+[tool.predict-rlm.generated]
+predict_rlm_version = "0.7.0-alpha5"
+skill_version = "3.0"
+layout = "agent-tools-bench"
+features = ["agent"]
+```
+
+For examples inside the `predict-rlm` monorepo, an editable path source is fine,
+but keep the metadata table.
+
+## Schema Pattern
+
+Define models for structured inputs and outputs. Use `Field(description=...)` so
+the RLM knows what each field means.
+
+```python
+from pydantic import BaseModel, Field
+
+
+class KeyDate(BaseModel):
+    """A key date extracted from a document."""
+
+    name: str = Field(description="e.g. 'Submission Deadline', 'Effective Date'")
+    date: str = Field(description="ISO format date (YYYY-MM-DD)")
+    time: str | None = Field(None, description="24-hour format, e.g. '14:00'")
+    timezone: str | None = Field(None, description="Timezone code, e.g. 'UTC'")
+
+
+class DocumentAnalysis(BaseModel):
+    """Structured analysis of a document set."""
+
+    report: str = Field(description="Full analysis as a markdown report")
+    key_dates: list[KeyDate] = Field(
+        default_factory=list,
+        description="Important dates found in the documents",
+    )
+```
+
+## Signature Pattern
+
+The signature docstring is the RLM's operating strategy.
+
+```python
+import dspy
+
+from predict_rlm import File
+
+from .schema import DocumentAnalysis
+
+
+class AnalyzeDocuments(dspy.Signature):
+    """Analyze documents and produce a structured report.
+
+    1. Read the report criteria.
+    2. Survey the documents: file names, page counts, and document types.
+    3. Gather information by rendering pages and using predict() for extraction.
+    4. Produce the requested report with grounded structured fields.
+    """
+
+    documents: list[File] = dspy.InputField(desc="PDF documents to analyze")
+    analysis: DocumentAnalysis = dspy.OutputField(
+        desc="Structured analysis with report and key dates"
+    )
+```
+
+## Service Pattern
+
+Wrap the signature and skills in a reusable DSPy module.
+
+```python
+import dspy
+
+from predict_rlm import File, PredictRLM
+from predict_rlm.skills import pdf as pdf_skill
+
+from .schema import DocumentAnalysis
+from .signature import AnalyzeDocuments
+
+
+class DocumentAnalyzer(dspy.Module):
+    def __init__(
+        self,
+        sub_lm: dspy.LM | str | None = None,
+        max_iterations: int = 30,
+        verbose: bool = False,
+        debug: bool = False,
+    ):
+        self.sub_lm = sub_lm
+        self.max_iterations = max_iterations
+        self.verbose = verbose
+        self.debug = debug
+
+    async def aforward(
+        self, documents: list[File], criteria: str
+    ) -> DocumentAnalysis:
+        signature = AnalyzeDocuments.with_instructions(
+            AnalyzeDocuments.instructions + "\n\n# Task\n\n" + criteria.strip()
+        )
+        predictor = PredictRLM(
+            signature,
+            sub_lm=self.sub_lm,
+            skills=[pdf_skill],
+            max_iterations=self.max_iterations,
+            verbose=self.verbose,
+            debug=self.debug,
+        )
+        result = await predictor.acall(documents=documents)
+        return result.analysis
+```
+
+## Chaining Pattern
+
+Use chained RLMs only for distinct phases with different skills, budgets, or
+typed artifacts.
+
+```python
+async def aforward(self, documents: list[File]):
+    extractor = PredictRLM(ExtractSignature, sub_lm=self.sub_lm, skills=[pdf_skill])
+    extracted = await extractor.acall(documents=documents)
+
+    analyzer = PredictRLM(AnalyzeSignature, sub_lm=self.sub_lm, skills=[analysis_skill])
+    return await analyzer.acall(data=extracted.data)
+```
+
+## Smoke Tests
+
+Default smoke tests must be fast and must not require network access, API keys,
+Deno, Pyodide, or LLM calls.
+
+```python
+def test_service_constructs():
+    from my_rlm import DocumentAnalyzer
+
+    service = DocumentAnalyzer(max_iterations=1, verbose=False, debug=False)
+    assert service.max_iterations == 1
+
+
+def test_signature_has_fields():
+    from my_rlm.agent.signature import AnalyzeDocuments
+
+    assert AnalyzeDocuments.input_fields
+    assert AnalyzeDocuments.output_fields
+```
+
+If an end-to-end check is useful, add it as a separate integration test gated by
+explicit credentials or an environment flag.
diff --git a/.agents/skills/rlm/references/sandbox-and-research.md b/.agents/skills/rlm/references/sandbox-and-research.md
new file mode 100644
index 00000000..8aa30834
--- /dev/null
+++ b/.agents/skills/rlm/references/sandbox-and-research.md
@@ -0,0 +1,51 @@
+# Sandbox And Research
+
+Research feasibility before implementation whenever the RLM needs third-party
+packages, external network access, heavy computation, native libraries, or
+nontrivial file formats.
+
+## Package Compatibility
+
+The default sandbox runs Python in WASM. Packages work when they are pure Python
+or available as Pyodide/Emscripten builds.
+
+Check each candidate package:
+
+- Does PyPI provide a `py3-none-any` wheel?
+- Is the package in the Pyodide built-in package list?
+- Does it depend on C extensions or native binaries?
+- Is a host-side tool simpler and more reliable?
+
+Do not assume native packages, subprocesses, system binaries, or arbitrary
+filesystem access are available in the sandbox.
+
+## Network Access
+
+If the RLM must call external APIs from the sandbox, identify exact domains and
+set `allowed_domains`. Prefer host-side tools for authenticated APIs so secrets,
+refresh tokens, and provider SDKs stay outside the sandbox.
+
+## Host-Side Tool Decisions
+
+Use a host-side tool when the operation:
+
+- requires authentication or private environment variables;
+- calls a database, SaaS API, or internal service;
+- needs a native library, subprocess, system binary, browser, or GPU;
+- reads/writes outside mounted input/output files;
+- is deterministic and easier to implement outside the RLM loop.
+
+Expose concise signatures and docstrings. The RLM reads each tool's docstring
+to decide when and how to call it.
+
+## Feasibility Report
+
+Before finalizing the build plan, report:
+
+- packages and compatibility status;
+- built-in and custom skills;
+- host-side tools and why they are needed;
+- network allowlist domains;
+- estimated iteration count and sub-LM call volume;
+- input-size or chunking strategy;
+- any unsupported assumptions or blockers.
diff --git a/README.md b/README.md
index 15354fec..04669d3d 100644
--- a/README.md
+++ b/README.md
@@ -117,19 +117,24 @@ codex-lm usage
 
 ### With your coding agent
 
-Install the [predict-rlm skill](.agents/skills/rlm/SKILL.md) in Claude Code,
-Codex, Cursor, or any compatible coding agent:
+Install the repo's agent skills in Claude Code, Codex, Cursor, or any compatible
+coding agent:
 
 ```bash
 npx skills add Trampoline-AI/predict-rlm
 ```
 
-Then ask your agent to build an RLM:
+Then ask your agent to build an RLM with [`rlm`](.agents/skills/rlm/SKILL.md):
 
 ```
-❯ /rlm build an RLM that extracts line items from PDF invoices into a spreadsheet
+❯ $rlm build an RLM that extracts line items from PDF invoices into a spreadsheet
 ```
 
+Use [`rlm-gepa`](.agents/skills/rlm-gepa/SKILL.md) when you want GEPA
+optimization wiring, and
+[`predict-rlm-contributor`](.agents/skills/predict-rlm-contributor/SKILL.md)
+when contributing to this repository itself.
+
 ### Quick Example
 
 ```python
diff --git a/src/rlm_gepa/README.md b/src/rlm_gepa/README.md
index 48eb4f1d..25fbb2b2 100644
--- a/src/rlm_gepa/README.md
+++ b/src/rlm_gepa/README.md
@@ -29,28 +29,31 @@ improvement versus a benchmark-specific hack. Budget knobs such as
 
 ## Start with a coding agent
 
-The repository’s `/rlm` agent skill in `.agents/skills/rlm/SKILL.md` is the
-recommended starting point. It can build a normal PredictRLM package, and when
-you ask for optimization it can also add the RLM-GEPA project wiring.
+The repository ships separate agent skills for separate jobs:
 
-Install the skill in Claude Code, Codex, Cursor, or any compatible coding agent:
+- `.agents/skills/rlm/SKILL.md` builds normal PredictRLM packages.
+- `.agents/skills/rlm-gepa/SKILL.md` adds RLM-GEPA optimization wiring.
+- `.agents/skills/predict-rlm-contributor/SKILL.md` is for contributing to this
+  repository itself.
+
+Install the skills in Claude Code, Codex, Cursor, or any compatible coding agent:
 
 ```bash
 npx skills add Trampoline-AI/predict-rlm
 ```
 
-Then ask the agent to use `/rlm` and be explicit about whether you want just the
-PredictRLM, evals, or RLM-GEPA optimization too:
+Use `$rlm` to design the PredictRLM itself, and use `$rlm-gepa` when evals or
+RLM-GEPA optimization are in scope:
 
 ```text
-/rlm interview me to design a PredictRLM that extracts renewal terms, pricing
+$rlm-gepa interview me to design a PredictRLM that extracts renewal terms, pricing
 changes, and notice windows from vendor contracts. Then build the RLM, evals,
 and RLM-GEPA optimization wiring.
 ```
 
-When the prompt asks for an interview, the `/rlm` skill is expected to scope the
-RLM and GEPA setup before it writes the plan. The RLM itself should remain the
-source of truth for the DSPy signature and tools; GEPA should derive those via
+When the prompt asks for an interview, the `$rlm-gepa` skill scopes the RLM and
+GEPA setup before it writes the plan. The RLM itself should remain the source of
+truth for the DSPy signature and tools; GEPA should derive those via
 `agent_spec_from_rlm(...)`. The interview fills in the extra GEPA brief:
 
 - input shape and scale, for example “PDF/MSA/SOW contracts, 20-200 pages,”
@@ -69,7 +72,7 @@ source of truth for the DSPy signature and tools; GEPA should derive those via
   “sales coaching,” “customer-support QA,” “data-cleaning workflows,” or
   “competitive research.”
 
-When optimization is in scope, the `/rlm` skill should add the project-local
+When optimization is in scope, the `$rlm-gepa` skill should add the project-local
 `gepa/` package:
 
 ```text
@@ -100,7 +103,7 @@ from predict_rlm import PredictRLM, Skill
 from predict_rlm.trace import RunTrace
 from rlm_gepa import EvaluationContext, RLMGepaExampleResult, RLMGepaProject, agent_spec_from_rlm
 
-from .signature import AnalyzeDocuments
+from ..agent.signature import AnalyzeDocuments
 
 
 SEED_SKILL_INSTRUCTIONS = "Initial domain instructions for the RLM."
@@ -214,7 +217,7 @@ component needs a different proposer brief.
 
 RLM-GEPA projects should feel like a product CLI: from the project root, run
 `uv run rlm-gepa ...` for checks, evals, optimization, stats, and plots. When
-the `/rlm` skill scaffolds an optimization project, it should set this up in
+the `$rlm-gepa` skill scaffolds an optimization project, it should set this up in
 `pyproject.toml` for you:
 
 ```toml
@@ -257,6 +260,14 @@ The `eval` subcommand is project-specific because datasets and metrics are
 project-specific. Agent-only or optimization-only projects do not need a
 held-out `eval` command unless the user asks for one.
 
+For eval and optimization CLIs, route task execution through
+`rlm_gepa.runtime.adapter.RLMGepaAdapter` rather than bespoke `asyncio.gather`
+loops. Project-local code can own dataset selection, candidate loading, task
+setup, and `eval.json` summary shaping; the shared adapter owns concurrency,
+per-task timeouts, progress display, verbose RLM logs, `task_traces/*.jsonl`,
+and `cost_log.jsonl`. Write `eval.json` in the run directory so
+`rlm-gepa stats <run_dir>` works for held-out evals as well as optimization runs.
+
 Use `--verbose-rlm` to print human-readable RLM trace blocks during eval:
 reasoning, generated code, output, tool calls, errors, and `SUBMIT` payloads.
 Use `--debug-rlm` for timestamped RLM and sandbox lifecycle diagnostics.
diff --git a/tests/test_rlm_skill_docs.py b/tests/test_rlm_skill_docs.py
index 442546ab..10e22af2 100644
--- a/tests/test_rlm_skill_docs.py
+++ b/tests/test_rlm_skill_docs.py
@@ -9,25 +9,28 @@
 ROOT = Path(__file__).resolve().parents[1]
 
 
+def _installable_skill_docs_text() -> str:
+    skill_docs = sorted((ROOT / ".agents" / "skills").glob("**/*.md"))
+    return "\n".join(path.read_text() for path in skill_docs)
+
+
 def test_public_rlm_skill_version_snippets_match_package_version():
     package_version = tomllib.loads((ROOT / "pyproject.toml").read_text())["project"][
         "version"
     ]
-    skill_text = (ROOT / ".agents/skills/rlm/SKILL.md").read_text()
-
-    if package_version != "0.4.1":
-        return
+    skill_text = _installable_skill_docs_text()
 
     stale_snippets = [
         r"predict-rlm>=0\.3\.0",
         r"predict-rlm\[[^\]]+\]>=0\.4\.0",
     ]
+    assert f'predict_rlm_version = "{package_version}"' in skill_text
     for snippet in stale_snippets:
         assert not re.search(snippet, skill_text), f"stale RLM skill snippet: {snippet}"
 
 
 def test_public_rlm_skill_requires_shared_eval_adapter_semantics():
-    skill_text = (ROOT / ".agents/skills/rlm/SKILL.md").read_text()
+    skill_text = _installable_skill_docs_text()
 
     assert "rlm_gepa.runtime.adapter.RLMGepaAdapter" in skill_text
     assert "eval.json" in skill_text