From f29d3ad50a5842e3d1144f6aee53e7e21ae9f2a5 Mon Sep 17 00:00:00 2001 From: Emile Riberdy Date: Wed, 3 Jun 2026 21:21:04 -0400 Subject: [PATCH] docs(predict-rlm): split rlm agent skills --- .../skills/predict-rlm-contributor/SKILL.md | 48 + .../references/contributor-rules.md | 26 + .../references/gepa-internals.md | 20 + .../references/repo-map.md | 49 + .agents/skills/rlm-gepa/SKILL.md | 97 ++ .../skills/rlm-gepa/references/agent-spec.md | 61 + .../rlm-gepa/references/data-and-scoring.md | 56 + .../rlm-gepa/references/project-layout.md | 150 +++ .agents/skills/rlm/SKILL.md | 1061 ++--------------- .../skills/rlm/references/predict-rlm-api.md | 138 +++ .../skills/rlm/references/project-layout.md | 187 +++ .../rlm/references/sandbox-and-research.md | 51 + README.md | 13 +- src/rlm_gepa/README.md | 37 +- tests/test_rlm_skill_docs.py | 13 +- 15 files changed, 1024 insertions(+), 983 deletions(-) create mode 100644 .agents/skills/predict-rlm-contributor/SKILL.md create mode 100644 .agents/skills/predict-rlm-contributor/references/contributor-rules.md create mode 100644 .agents/skills/predict-rlm-contributor/references/gepa-internals.md create mode 100644 .agents/skills/predict-rlm-contributor/references/repo-map.md create mode 100644 .agents/skills/rlm-gepa/SKILL.md create mode 100644 .agents/skills/rlm-gepa/references/agent-spec.md create mode 100644 .agents/skills/rlm-gepa/references/data-and-scoring.md create mode 100644 .agents/skills/rlm-gepa/references/project-layout.md create mode 100644 .agents/skills/rlm/references/predict-rlm-api.md create mode 100644 .agents/skills/rlm/references/project-layout.md create mode 100644 .agents/skills/rlm/references/sandbox-and-research.md diff --git a/.agents/skills/predict-rlm-contributor/SKILL.md b/.agents/skills/predict-rlm-contributor/SKILL.md new file mode 100644 index 00000000..4721d1b7 --- /dev/null +++ b/.agents/skills/predict-rlm-contributor/SKILL.md @@ -0,0 +1,48 @@ +--- +name: predict-rlm-contributor +description: 
> + Contribute to the predict-rlm repository itself: modify core PredictRLM runtime + code, RLM-GEPA internals, built-in skills, examples, docs, tests, packaging, or + repo-scoped agent skill guidance. Use when the user asks to change this repo or + investigate a bug in predict-rlm/RLM-GEPA. Do not use for building a new + downstream RLM package; use rlm for that, or rlm-gepa for downstream + optimization wiring. +--- + +# Contribute To predict-rlm + +Use this skill for repository work. Do not run the new-RLM scoping interview +unless the user is explicitly asking to build a downstream RLM package. + +## Reference Map + +Read only what the task needs: + +- `references/repo-map.md`: major modules, examples, and verification commands. +- `references/contributor-rules.md`: repo-specific coding, docs, and PR rules. +- `references/gepa-internals.md`: RLM-GEPA contribution boundaries and proposer + behavior rules. + +## Workflow + +1. Inspect the requested change and relevant repo paths before editing. +2. Preserve the distinction between downstream usage and repo contribution. +3. Keep changes scoped to the module, docs, examples, or skill guidance in the + request. +4. Validate at system boundaries. Prefer host-side tools for native libraries, + auth, network APIs, filesystem-heavy work, and anything that cannot run + cleanly in Pyodide. +5. Run targeted tests or checks. Docs-only and skill-only changes need markdown + sanity plus `git diff --check`; code changes need focused tests, with broader + tests when touching shared runtime, sandbox execution, optimizer behavior, or + examples. + +## Issue And PR Rules + +Creating GitHub PRs/issues or pushing public branches is external publishing. +Do it only when explicitly requested. + +When an investigation identifies a bug likely attributable to the +`predict-rlm` package, ask whether the user wants it reported as a GitHub issue +as soon as attribution is clear. Do not open the issue without explicit +approval. 
diff --git a/.agents/skills/predict-rlm-contributor/references/contributor-rules.md b/.agents/skills/predict-rlm-contributor/references/contributor-rules.md new file mode 100644 index 00000000..bb98e7f7 --- /dev/null +++ b/.agents/skills/predict-rlm-contributor/references/contributor-rules.md @@ -0,0 +1,26 @@ +# Contributor Rules + +- PredictRLM is for callable, repeatable, deep-context workflows, not open-ended + interactive chat flows. +- Keep large inputs as `File` references or metadata. Use focused `predict()` + calls and keep LLM-facing Pydantic schemas lean with `Field(description=...)`. +- Validate at system boundaries. Let library validation raise when schema fields + are required; do not add silent fallbacks. +- Keep generic runtime behavior domain-neutral. Domain or benchmark specifics + belong in examples, `AgentSpec`, seed/domain skills, runtime-grounding + examples, or evaluator feedback. +- Persist experimental behavior in config, CLI options, or artifacts rather than + hidden env-only switches. +- Use Conventional Commits. The allowed scopes are `rlm-gepa`, `predict-rlm`, + and `examples/[example-name]`. +- PR descriptions must start with **Rationale**, followed by Summary and Test + Plan. + +## Skill Guidance Changes + +Keep each repo skill focused on one job. Use short trigger descriptions with +clear boundaries. Put detailed API and workflow material in one-level +`references/` files linked from `SKILL.md`. + +Do not put downstream RLM-building guidance and repository-contributor guidance +in the same `SKILL.md`. diff --git a/.agents/skills/predict-rlm-contributor/references/gepa-internals.md b/.agents/skills/predict-rlm-contributor/references/gepa-internals.md new file mode 100644 index 00000000..f90c2ff3 --- /dev/null +++ b/.agents/skills/predict-rlm-contributor/references/gepa-internals.md @@ -0,0 +1,20 @@ +# RLM-GEPA Internals + +Use these rules when changing `src/rlm_gepa/`, tests, examples, or docs. 
+ +- Treat `AgentSpec`, evaluator feedback, and seed instructions as the + optimization direction. Keep runtime and budget knobs separate. +- Derive signature and tool context from the constructed RLM with + `agent_spec_from_rlm(...)` where possible. +- Avoid duplicating broad prose or exposing internal IDs unnecessarily. +- Keep generic proposer behavior domain-neutral. Domain or benchmark specifics + belong in `AgentSpec`, seed/domain skills, runtime-grounding examples, or + evaluator feedback. +- Patch-merge/crossover should be evidence-backed behavioral grafting from train + disagreement traces, not broad synthesis, prompt concatenation, or source text + import. +- GEPA project wiring should live in downstream `gepa/` packages. Generic + optimizer orchestration belongs in `src/rlm_gepa/`. + +For verification, run targeted RLM-GEPA tests when touching optimizer schemas, +runtime adapters, proposer behavior, reporting, or SpreadBench GEPA wiring. diff --git a/.agents/skills/predict-rlm-contributor/references/repo-map.md b/.agents/skills/predict-rlm-contributor/references/repo-map.md new file mode 100644 index 00000000..e69ba1c9 --- /dev/null +++ b/.agents/skills/predict-rlm-contributor/references/repo-map.md @@ -0,0 +1,49 @@ +# Repo Map + +`predict-rlm` extends DSPy's RLM with a built-in `predict()` tool. It has a +two-level execution model: + +1. The outer LLM writes and executes Python in a sandbox. +2. The sub-LM handles perception and extraction through `predict()` calls. + +## Key Modules + +- `src/predict_rlm/predict_rlm.py`: `PredictRLM`, `predict()` tool creation, + action/extract signatures, LM contexts, and file I/O orchestration. +- `src/predict_rlm/backends/jspi/backend.py`: default Deno/Pyodide backend. +- `src/predict_rlm/backends/sbx/backend.py`: Docker Sandboxes backend. +- `src/predict_rlm/backends/supervisor/`: shared sandbox runner process + supervision. +- `src/predict_rlm/rlm_skills.py`: `Skill` dataclass and `merge_skills()`. 
+- `src/predict_rlm/_shared.py`: action/extract signature construction and tool + doc formatting. +- `src/predict_rlm/skills/`: built-in `pdf`, `spreadsheet`, and `docx` skills. +- `src/rlm_gepa/`: RLM-GEPA optimizer integration. +- `.agents/skills/`: repo-scoped agent skills for downstream users and + contributors. + +## Example Structure + +Examples generally follow: + +```text +schema.py -> signature.py -> tools.py -> skills.py -> service.py -> run.py +``` + +Keep generated or example RLM packages grouped under `agent/`, with optional +`tools/`, `bench/`, and `gepa/` packages only when needed. + +## Common Commands + +```bash +uv sync +uv sync --extra examples +make test-unit +make test-integration +uv run pytest tests/test_predict_rlm.py::TestPredictTool::test_name -v +uv run ruff check src/ tests/ +git diff --check +``` + +Use targeted checks for narrow changes. Run broader suites when touching shared +interfaces, sandbox execution, optimizer behavior, or examples. diff --git a/.agents/skills/rlm-gepa/SKILL.md b/.agents/skills/rlm-gepa/SKILL.md new file mode 100644 index 00000000..6ba7e8c8 --- /dev/null +++ b/.agents/skills/rlm-gepa/SKILL.md @@ -0,0 +1,97 @@ +--- +name: rlm-gepa +description: > + Design, scaffold, and use RLM-GEPA optimization wiring for PredictRLM projects, + including AgentSpec scoping, train/validation data, scoring feedback, seed + candidates, GEPA project files, and optimize/eval CLI setup. Use when the user + asks for GEPA, prompt or skill optimization, candidate selection from RLM + traces, AgentSpec, RLMGepaProject, optimization metrics, or train/validation + split design. Do not use for modifying the predict-rlm repository internals; + use predict-rlm-contributor for that. +--- + +# RLM-GEPA Optimization + +RLM-GEPA optimizes reusable PredictRLM text components, usually skill +instructions, from execution traces. 
A project defines the agent to run, the +train/validation examples to evaluate, the scoring feedback, and an `AgentSpec` +that tells the proposer what reusable behavior is in scope. + +Use this skill when optimization is in scope. If the user only wants a callable +RLM with no GEPA wiring, use `rlm`. If the user is changing the `predict-rlm` +repo implementation, use `predict-rlm-contributor`. + +## Reference Map + +Read only what the task needs: + +- `references/agent-spec.md`: `AgentSpec` scoping, `agent_spec_from_rlm(...)`, + component focus, and anti-duplication rules. +- `references/data-and-scoring.md`: dataset audit, split hygiene, scoring + feedback, and overfitting boundaries. +- `references/project-layout.md`: generated `gepa/` package shape, CLI wiring, + and verification commands. + +## Workflow + +### 1. Confirm The Optimization Target + +Identify the PredictRLM workflow that GEPA should improve. If the RLM does not +exist yet, first scope the RLM enough to define its real DSPy signature, skills, +tools, inputs, and outputs. Do not ask the user to hand-write +`target_signature` or `tool_signatures`; derive them from the constructed RLM. + +### 2. Scope The GEPA Brief + +Interview only for context GEPA cannot infer: + +- product or optimization goal; +- input distribution, scale, and representative examples; +- output schema and important failure modes; +- train/validation data source; +- labels, references, or scoring rule; +- partial-credit feedback and anti-overfitting boundary; +- tools, sandbox facts, file conventions, and runtime constraints. + +If the user cannot answer everything, proceed with explicit assumptions and mark +fields that must be revisited before spending model calls. + +### 3. Audit Data And Scoring + +Read `references/data-and-scoring.md` before writing split or scoring code. +Inspect examples enough to identify task types, input sizes, labels/reference +shape, duplicates, leakage risks, missing labels, and failure buckets. 
+ +Use train examples to propose and gate edits. Use validation examples for +candidate selection and regression checks. Create a held-out test set only when +the user asks for a benchmark/eval harness and the dataset size supports it. + +### 4. Design Components + +The most common component is `skill_instructions`, but multi-component projects +can optimize several text blocks. `seed_candidate()` must return exactly the +keys listed in `components`. + +Keep runtime and budget knobs out of the `AgentSpec`. Use `AgentSpec`, evaluator +feedback, and seed instructions to steer optimization direction. Use CLI/config +for `max_metric_calls`, minibatch size, concurrency, model choices, and runtime +limits. + +### 5. Scaffold Project Wiring + +Create project-local `gepa/` files only when the user asks for optimization. +The generated package owns task loading, metrics, seed candidate text, defaults, +and CLI glue. The shared `rlm_gepa` package owns generic orchestration. + +Use `references/project-layout.md` for files and imports. Add the GEPA package +extra and `rlm-gepa` console script in `pyproject.toml` when scaffolding a full +project. + +### 6. Verify Before Running Optimization + +Add fast checks that load train/validation data, construct the project, verify +the seed candidate keys, and build the target RLM without running a costly +optimization. + +Run `uv run rlm-gepa optimize --check` when the project CLI exists. For docs-only +or scaffolding changes, also run markdown sanity checks and `git diff --check`. diff --git a/.agents/skills/rlm-gepa/references/agent-spec.md b/.agents/skills/rlm-gepa/references/agent-spec.md new file mode 100644 index 00000000..83cf6ba6 --- /dev/null +++ b/.agents/skills/rlm-gepa/references/agent-spec.md @@ -0,0 +1,61 @@ +# AgentSpec + +Prefer `agent_spec_from_rlm(...)` for new projects. The RLM stays the source of +truth for the DSPy signature, output schema, skills, and tools. 
+ +```python +from rlm_gepa import agent_spec_from_rlm + +agent_spec = agent_spec_from_rlm( + build_rlm(SEED_SKILL_INSTRUCTIONS), + use_cases=[ + "contract review with clause-level citations", + "invoice analysis with total reconciliation", + ], + runtime_grounding_examples={ + "skills": ["document-analysis skill instructions are optimized"], + "sandbox facts": ["Pyodide filesystem paths and package limits"], + }, + scoring_description=( + "Score combines answer correctness and citation support. Feedback names " + "missing findings, unsupported citations, and extraction errors." + ), +) +``` + +Do not duplicate facts `agent_spec_from_rlm(...)` can derive. Add only context +GEPA cannot infer: + +- transfer use cases beyond the benchmark; +- runtime-grounding examples the proposer must preserve; +- scoring signal and evaluator feedback shape; +- anti-overfitting boundaries; +- short product or optimization framing, only when it adds useful context. + +Omit `agent_type` by default. Set it only when a concise product or optimization +anchor adds information not already present in the signature, tools, or output +schema. + +## Components + +`components` names mutable text fields. `seed_candidate()` must return exactly +those keys. + +```python +class MyProject(RLMGepaProject): + components = ("skill_instructions",) + + def seed_candidate(self) -> dict[str, str]: + return {"skill_instructions": SEED_SKILL_INSTRUCTIONS} +``` + +Override `component_focus(component_name)` when each component needs a different +proposer brief. Keep component names stable so runs and candidate artifacts are +comparable. + +## Proposer Boundaries + +Patch-merge/crossover should be evidence-backed behavioral grafting from train +disagreement traces. Avoid broad synthesis, prompt concatenation, source text +imports, or benchmark-specific hacks. Domain specifics belong in `AgentSpec`, +seed/domain skills, runtime-grounding examples, or evaluator feedback. 
diff --git a/.agents/skills/rlm-gepa/references/data-and-scoring.md b/.agents/skills/rlm-gepa/references/data-and-scoring.md new file mode 100644 index 00000000..1ca22ee8 --- /dev/null +++ b/.agents/skills/rlm-gepa/references/data-and-scoring.md @@ -0,0 +1,56 @@ +# Data And Scoring + +Investigate the dataset before writing split or scoring code. Do not treat it +as an opaque list of rows. + +Inspect enough examples to identify: + +- task types and input sizes; +- label or reference-output shape; +- duplicate or near-duplicate examples; +- missing labels or ambiguous references; +- source grouping keys such as document, user, customer, or task family; +- failure buckets the scorer should expose. + +## Split Semantics + +Use split names consistently: + +- **Train**: examples the optimizer/proposer may use to generate and gate edits. +- **Validation**: examples used for candidate selection and regression checks. +- **Test / held-out eval**: optional final reporting set. + +Prefer deterministic splits. Put random seed, split ratio/counts, grouping key, +and sampling limits in `bench/config.py` or `gepa/config.py`. Split by group when +leakage is plausible. Never let near-identical examples from the same source +land in both train and validation without calling it out. + +If the dataset is tiny, prefer explicit hand-authored train/validation files +over random splitting. + +## Scoring Feedback + +Each `evaluate_example()` should return a scalar score plus feedback that helps +the proposer make a targeted behavioral change. + +Good feedback names concrete misses: + +- missing fields; +- unsupported citations; +- extraction or parsing errors; +- wrong calculations; +- formatting or file-output failures; +- tool-use mistakes visible in traces. + +Avoid feedback that only says "wrong" or restates the score. GEPA quality is +bounded by the evidence the metric returns. 
+ +## Overfitting Boundaries + +State what counts as a transferable improvement versus a benchmark-specific +hack. Examples: + +- preserve citation grounding instead of memorizing answer strings; +- improve table handling generally instead of keying on fixture names; +- preserve sandbox path conventions and tool APIs; +- prefer behavior that transfers across document lengths and layouts. diff --git a/.agents/skills/rlm-gepa/references/project-layout.md b/.agents/skills/rlm-gepa/references/project-layout.md new file mode 100644 index 00000000..239f070e --- /dev/null +++ b/.agents/skills/rlm-gepa/references/project-layout.md @@ -0,0 +1,150 @@ +# RLM-GEPA Project Layout + +Create project-local optimization wiring only when the user asks for GEPA or +prompt/skill optimization. + +```text +my_rlm/ +├── agent/ # PredictRLM signature, schema, service, skills/tools +├── bench/ # optional eval loaders/scoring/fixtures +└── gepa/ + ├── __init__.py + ├── config.py + ├── project.py + ├── cli.py + └── __main__.py +``` + +The generated `gepa/` package owns train/validation loading, metric feedback, +seed candidate text, defaults, and CLI glue. The shared `rlm_gepa` package +provides optimizer runtime and CLI helpers. + +## pyproject.toml + +Add GEPA dependencies and a project-local CLI when optimization is in scope. 
+ +```toml +dependencies = [ + "predict-rlm[gepa,gepa-viz]>=0.7.0-alpha5,<0.8", +] + +[project.scripts] +rlm-gepa = "my_rlm.gepa:main" + +[tool.predict-rlm.generated] +predict_rlm_version = "0.7.0-alpha5" +skill_version = "3.0" +layout = "agent-tools-bench-gepa" +features = ["agent", "bench", "rlm-gepa"] +``` + +## Project Skeleton + +```python +from dataclasses import dataclass +from typing import Any + +from predict_rlm import PredictRLM, Skill +from predict_rlm.trace import RunTrace +from rlm_gepa import ( + EvaluationContext, + RLMGepaExampleResult, + RLMGepaProject, + agent_spec_from_rlm, +) + +from ..agent.signature import AnalyzeDocuments + + +SEED_SKILL_INSTRUCTIONS = "Initial domain instructions for the RLM." + + +@dataclass +class EvalExample: + example_id: str + rlm_kwargs: dict[str, Any] + reference: Any + + +def build_rlm(skill_instructions: str, *, lm=None, sub_lm=None, max_iterations=30): + return PredictRLM( + AnalyzeDocuments, + lm=lm, + sub_lm=sub_lm, + max_iterations=max_iterations, + skills=[Skill(name="document-analysis", instructions=skill_instructions)], + ) + + +class MyProject(RLMGepaProject): + project_name = "my-project" + components = ("skill_instructions",) + agent_spec = agent_spec_from_rlm(build_rlm(SEED_SKILL_INSTRUCTIONS), ...) + + def seed_candidate(self) -> dict[str, str]: + return {"skill_instructions": SEED_SKILL_INSTRUCTIONS} + + def load_trainset(self): + return [...] + + def load_valset(self): + return [...] 
+ + async def evaluate_example( + self, + candidate: dict[str, str], + example: EvalExample, + context: EvaluationContext, + ) -> RLMGepaExampleResult: + rlm = build_rlm( + candidate["skill_instructions"], + lm=context.lm, + sub_lm=context.sub_lm, + max_iterations=context.max_iterations, + ) + result = await rlm.acall(**example.rlm_kwargs) + score, feedback = score_result(result, example.reference) + + trace: RunTrace | None = getattr(result, "trace", None) + traces = [trace] if trace is not None else [] + + return RLMGepaExampleResult( + score=score, + feedback=feedback, + traces=traces, + rlm_inputs={"example_id": example.example_id, **example.rlm_kwargs}, + example_id=example.example_id, + ) +``` + +## CLI + +The generated `my_rlm.gepa:main` should call `run_project_cli(...)`. + +```python +from rlm_gepa.cli import run_project_cli + +from .config import default_config +from .project import build_project + + +def main() -> int: + return run_project_cli(build_project, default_config()) +``` + +Use `optimize --check` before a real run: + +```bash +uv run rlm-gepa optimize --check +``` + +If `bench/` exists, expose seed, validation, and held-out evaluation through the +same CLI only when the user asks for eval commands. + +For eval and optimization CLIs, route task execution through +`rlm_gepa.runtime.adapter.RLMGepaAdapter` rather than bespoke `asyncio.gather` +loops. Project-local `bench/` code owns dataset selection, candidate loading, +task setup, and `eval.json` summary shaping; the shared adapter owns concurrency, +per-task timeouts, progress display, verbose RLM logs, `task_traces/*.jsonl`, +and `cost_log.jsonl`. Write `eval.json` in the run directory so +`rlm-gepa stats <run_dir>` works for held-out evals as well as optimization runs. 
diff --git a/.agents/skills/rlm/SKILL.md b/.agents/skills/rlm/SKILL.md index 201bf294..e8fdd389 100644 --- a/.agents/skills/rlm/SKILL.md +++ b/.agents/skills/rlm/SKILL.md @@ -1,1005 +1,144 @@ --- name: rlm description: > - Plan and build an RLM (Recursive Language Model) with predict-rlm, or contribute - to predict-rlm/RLM-GEPA itself. Interactively defines inputs, outputs, skills, - and architecture from a goal, then implements the code. Use when the user wants - to create a new RLM, explore whether one is feasible, or modify RLM/RLM-GEPA - guidance and implementation. -compatibility: Requires Python 3.11+, Deno, and the predict-rlm package (built on DSPy). -metadata: - author: Emile Riberdy - version: "2.1" + Plan, design, and build callable PredictRLM/RLM packages with typed inputs, + structured outputs, skills, host-side tools, and smoke tests. Use when the user + wants to create a new RLM, assess whether a workflow is a good RLM fit, or add + normal PredictRLM usage code. Do not use for contributing to the predict-rlm + repository itself or for RLM-GEPA optimization wiring; use + predict-rlm-contributor or rlm-gepa for those tasks. --- -# Build an RLM +# Build An RLM An RLM is a callable, pre-configured agent. It autonomously explores context, -writes and executes code in a sandboxed REPL, calls tools, inspects results, and -iterates until the task is done. Unlike a chat agent, an RLM is a function — you -define its inputs, outputs, and tools, then call it from your code. It returns -structured data, not chat messages. +writes and executes code in a sandboxed Python REPL, calls tools, inspects +results, and iterates until the task is done. Unlike a chat agent, an RLM is a +function: define its inputs, outputs, and tools, then call it from code. It +returns structured data, not chat messages. -This skill has two phases: +Use this skill for new PredictRLM packages and application code. 
If the user is +modifying the `predict-rlm` repo, switch to `predict-rlm-contributor`. If the +user asks for GEPA, optimization, train/validation candidate selection, or +`AgentSpec` wiring, switch to `rlm-gepa`. -1. **Plan** — interactively define the RLM with the user, research feasibility, - produce a plan -2. **Build** — implement the plan as code files +## Reference Map -**First action**: Check skill freshness if due, then enter plan mode using the -EnterPlanMode tool, unless the user is contributing to predict-rlm/RLM-GEPA -itself. For contribution work, use Contributor mode first. +Read only what the task needs: ---- - -# Skill freshness check - -When this skill is loaded in an environment with shell, filesystem, and network -access, run a lightweight update check at most once per day. Keep the last-check -marker under -`${HERMES_HOME:-$HOME/.hermes}/skills/.rlm-skill-update-check.json`. Compare the -installed `SKILL.md` against -`https://raw.githubusercontent.com/Trampoline-AI/predict-rlm/main/.agents/skills/rlm/SKILL.md` -using a content hash, ETag, or commit SHA. - -If a newer skill is available, tell the user the `/rlm` skill has updates and -ask whether they want to update or reinstall it. Do not update automatically. -Suggested commands are `hermes skills update` for Hermes-managed installs, or -`npx skills add Trampoline-AI/predict-rlm` for direct Skills CLI installs. - -Skip the check silently when tools, network, or a writable marker path are not -available. - ---- - -# Contributor mode for predict-rlm / RLM-GEPA - -Use this mode when the user is modifying this repository's code, docs, examples, -or installable skill guidance rather than asking you to build a new RLM package. -Do not force the new-RLM scoping interview. First inspect the repo context, the -requested change, and the relevant implementation/docs paths. - -Contributor rules: - -- PredictRLM is for callable, repeatable, deep-context workflows, not open-ended - interactive chat flows. 
-- Keep large inputs as `File` references or metadata. Use focused `predict()` - calls, and keep LLM-facing Pydantic schemas lean with - `Field(description=...)`. -- Validate at system boundaries. Prefer host-side tools for native libraries, - auth, network APIs, filesystem-heavy work, and anything that cannot run - cleanly in Pyodide. -- For RLM-GEPA, treat `AgentSpec`, evaluator feedback, and seed instructions as - the optimization direction. Keep runtime and budget knobs separate. -- Derive `AgentSpec` signature/tool context from the constructed RLM with - `agent_spec_from_rlm(...)` where possible. Avoid duplicating broad prose or - exposing internal IDs unnecessarily. -- Keep generic proposer behavior domain-neutral. Domain or benchmark specifics - belong in `AgentSpec`, seed/domain skills, runtime grounding examples, or - evaluator feedback. -- Patch-merge/crossover should be evidence-backed behavioral grafting from train - disagreement traces, not broad synthesis, prompt concatenation, or source text - import. -- Persist experimental optimizer behavior in config, CLI options, or artifacts - rather than hidden env-only switches. -- Creating GitHub PRs/issues or pushing public branches is external publishing; - do it only when explicitly requested. -- When an investigation identifies a bug or problem likely attributable to the - `predict-rlm` package, ask the user whether they want it reported as a GitHub - issue as soon as that attribution is clear. Do not open the issue without that - explicit approval. -- For verification, docs-only changes need markdown sanity or - `git diff - --check`. Code changes need targeted tests, plus broader tests - when touching shared interfaces, sandbox execution, optimizer behavior, or - examples. - ---- - -# Phase 1: Plan - -Work through these steps interactively. Do not skip steps or rush to the plan. -Each step should involve asking the user questions and confirming alignment -before moving on. 
- -## Step 1: Goal Definition - -Understand what the user wants to build. - -Ask: - -- What is the desired outcome? What does success look like? -- What is the input material? (documents, code, data, APIs, etc.) -- What does the output look like? (structured report, modified files, - spreadsheet, etc.) - -Then **validate RLM fit**. An RLM is the right tool when: - -- The input is large and needs selective exploration (documents, datasets, - codebases) -- The task is multi-step with tool use (extract -> transform -> validate) -- Actions modify state (redaction, form filling, generation) -- Parallel sub-LM calls are needed across many items -- File-to-file transformations (PDFs -> spreadsheets, documents -> reports) - -If the task is better served by a single LLM call or a simple script, tell the -user and suggest an alternative. Otherwise, proceed. - -## Step 2: Input Design - -Work with the user to define every input to the RLM. - -For each input, determine: - -- **Name** and **type**: `File`, `list[File]`, `str`, or a Pydantic model -- **Description**: what it contains and how the RLM uses it -- **Source**: user-provided file, API response, config, generated data - -Key principles: - -- Large content (PDFs, images, datasets) must be `File` references — the RLM - accesses content on-demand through skills, keeping its context small -- Metadata (file paths, page counts, config flags) can be strings or Pydantic - models -- Use `list[File]` for variable-count file inputs - -Confirm the input design with the user before proceeding. - -## Step 3: Output Design - -Work with the user to define the structured output. - -For each output field, determine: - -- **Name**, **type**, and **description** -- Whether it's a Pydantic model (structured data), `File` (generated file), or - primitive - -Push for specificity — vague outputs lead to poor RLM performance. Sketch the -Pydantic models with `Field(description=...)` annotations. Include nested models -where appropriate. 
- -Ask the user: - -- What fields matter most? What would they check first? -- Are there any computed/derived fields (scores, summaries, counts)? -- Do they need output files (Excel, PDF, images)? - -Confirm the output design with the user before proceeding. - -## Step 4: Research - -This step is **autonomous**. Tell the user you are researching, then do it. - -Use web search and the Explore subagent to: +- `references/project-layout.md`: generated package layout, smoke tests, and + service wiring patterns. +- `references/predict-rlm-api.md`: `PredictRLM`, `File`, `Skill`, built-in + skills, tools, `predict()`, and CodexLM usage. +- `references/sandbox-and-research.md`: feasibility research, Pyodide package + compatibility, network allowlists, and host-side tool decisions. -1. **Find Python packages** for the domain (e.g., `networkx` for graphs, - `tree-sitter` for code parsing, `beautifulsoup4` for HTML). +## Workflow -2. **Check Pyodide compatibility**. The sandbox runs Pyodide (Python in WASM). - Only **pure-Python wheels** or packages with **Emscripten builds** work. - Search pypi.org for each package and check: - - Does it have a `py3-none-any` wheel? (pure Python — works) - - Does it have C extensions without Emscripten builds? (won't work in - sandbox) - - Is it in the Pyodide built-in package list? (check - ) +### 1. Define The Goal -3. **Identify network needs**. Does the task require calling external APIs? If - so, note the domains for `allowed_domains`. +Ask what success looks like, what input material the RLM receives, and what +structured output or generated files it should return. -4. **Identify host-side tool needs**. If any functionality cannot run in WASM - (native binaries, C extensions, heavy computation), it must be a **host-side - tool** — a Python function running on the host that the RLM calls like any - other tool. +Validate RLM fit. 
An RLM is appropriate when the task needs selective +exploration of large inputs, multi-step tool use, stateful file transformations, +parallel sub-LM calls, or repeated callable workflows. If a simple script or +single LLM call is the better tool, say so and suggest that path. -5. **Check for existing skills**. The built-in skills are: - - `pdf` — pymupdf for PDF rendering, text extraction, manipulation - - `spreadsheet` — openpyxl, pandas, formulas for Excel work - - `docx` — python-docx for reading, writing, and modifying Word documents +### 2. Design Inputs -Report findings to the user with a clear feasibility assessment. Flag any -blockers. +Define every input: -## Step 5: Skill Design +- name and type: `File`, `list[File]`, `str`, or a Pydantic model; +- description: what it contains and how the RLM uses it; +- source: user-provided file, API response, config, or generated data. -Based on research, design the skill configuration. +Use `File` references for large content such as PDFs, images, workbooks, +documents, datasets, audio, or video. Keep raw bulk content out of the LLM +schema. -### Built-in skills +### 3. Design Outputs -List which built-in skills to use and why. - -### Custom skills (if needed) - -For each custom skill, define: - -- **name**: short identifier -- **instructions**: prose guidance injected into the RLM's system prompt — - teaches the RLM patterns and best practices. Be detailed; this is the primary - way to control RLM behavior. -- **packages**: PyPI packages installed in the sandbox via micropip (must be - Pyodide-compatible) -- **modules**: Python files mounted into the sandbox as importable modules -- **tools**: host-side callable functions exposed to the RLM - -### Host-side tool design - -For each host-side tool: - -- Function name and signature with type hints -- Docstring (the RLM sees this to understand how to call it) -- What it does and why it must be host-side - -Confirm the skill design with the user before proceeding. 
- -## Step 6: Strategy and Architecture - -### Signature strategy - -Write the step-by-step strategy that goes in the signature's docstring. This is -the RLM's playbook: - -1. What to do first (survey/understand the input) -2. How to gather information (render pages, use predict() for extraction, call - tools) -3. How to process and synthesize -4. What to produce and where to save output files - -### Single vs chained RLMs - -Evaluate whether this needs one RLM or multiple chained RLMs. - -**Use a single RLM when**: - -- The task is one coherent workflow -- All steps need the same context/state -- The iteration count stays reasonable (under 40) - -**Use chained RLMs when**: - -- There are distinct phases with different skill needs -- One phase produces artifacts consumed by another -- The combined task would exceed reasonable iteration counts -- Different phases benefit from different sub-LM models - -If chaining, define each stage: - -- Stage name, signature (inputs/outputs), skills, strategy -- The DAG: which stage feeds into which, with typed connections - -### Configuration - -- `max_iterations` estimate per RLM -- `allowed_domains` if network access is needed -- `sub_lm` recommendations (capability level needed) - -### Delivery scope - -Confirm which artifacts the user wants. Do not assume evals or optimization are -required for every RLM. - -- **Agent only**: the callable RLM package, domain skills/tools, and fast smoke - tests. This is the default unless the user asks for benchmarking or GEPA. -- **Agent + evals**: add a `bench/` package with dataset loading, scoring, and - evaluation CLI/helpers. Use this when the user has labeled examples, fixtures, - or a deterministic metric. -- **Agent + optimization**: add project-local RLM-GEPA wiring only when the user - wants prompt/skill optimization. GEPA needs train/validation examples and a - metric, but it does not require a separate held-out eval CLI unless requested. 
-- **Full project**: agent, tools, benchmark/eval harness, and optimization - wiring, like the SpreadBench example. - -### RLM-GEPA scoping interview - -When the user asks for optimization wiring but has not supplied enough context -to write a concrete `AgentSpec`, run a short interview before writing the plan. -Do not invent the AgentSpec from a vague task description. Gather enough to -define: - -- the product or optimization goal GEPA should improve for; -- the input distribution, scale, and examples that represent real work; -- the output schema and the failure modes users care about most; -- the train/validation data source and whether labels or reference outputs - exist; -- the scoring rule, partial-credit feedback, and anti-overfitting boundaries; -- the tools, sandbox constraints, file conventions, and runtime facts the - proposer must preserve. - -The interview should scope the RLM that owns the real DSPy signature and tools; -do not ask the user to restate `target_signature`, `tool_signatures`, or a broad -agent description as separate prose artifacts. Generate the signature/tool -fields from the constructed RLM with `agent_spec_from_rlm(...)`; omit -`agent_type` unless the user volunteers a product or optimization anchor that -adds useful context. - -If the user cannot answer everything, proceed with explicit assumptions and mark -the generated `AgentSpec` fields that should be revisited before spending model -calls. - -### Dataset and split hygiene - -When the user asks for evals or optimization, investigate the dataset before -writing split or scoring code. Inspect enough examples to identify task types, -input sizes, label/reference-output shape, duplicate or near-duplicate examples, -leakage risks, missing labels, and failure buckets the scorer should expose. -Capture those findings in the plan instead of treating the dataset as an opaque -list of rows. 
- -Use split semantics consistently: - -- **Train**: examples the optimizer/proposer may use to generate and gate edits. -- **Validation**: examples used for candidate selection and regression checks - during optimization. -- **Test / held-out eval**: optional final reporting set. Do not create or spend - on it unless the user asks for a benchmark/eval harness or has enough labeled - data to justify it. - -Prefer deterministic splits. Put the random seed, split ratio/counts, grouping -key (if examples share source documents/users/tasks), and any sampling limits in -`bench/config.py` or `gepa/config.py`. Split by group when leakage is plausible; -never let near-identical cases from the same source land in both train and -validation without calling it out. If the dataset is tiny, prefer explicit -hand-authored train/validation files over random splitting. - -For GEPA, the project-local `gepa/` code owns train/validation loading and the -seed candidate text. The seed candidate means the initial mutable component, -such as baseline skill instructions; it is separate from the random seed used -for splits, sampling, or optimizer reproducibility. - -For benchmarks with official splits, preserve the benchmark's public semantics. -Use the official train split for optimization data, carve GEPA validation from -that train split when a candidate-selection set is needed, and reserve official -dev/test/challenge splits for reporting only when the user asks for held-out -benchmark evaluation. Do not let optimizer feedback leak from held-out splits -into seed instructions or candidate selection. - -### Benchmark integration boundaries - -Keep benchmark evaluators and oracle-style answer checkers harness-side. The RLM -may see environment-safe tools, docs, state APIs, or session controls, but it -should not see evaluator feedback or hidden scoring APIs while solving an -example. 
After the attempt, the harness can call the evaluator and pass score -and feedback to GEPA as the learning signal. - -When benchmark packages conflict with predict-rlm, DSPy, Pyodide, or the main -project environment, prefer an isolated host-side runner/tool behind a typed -JSON boundary. Do not force incompatible benchmark dependencies into the RLM -sandbox or main package environment. - -For eval and optimization CLIs, route task execution through the shared -`rlm_gepa.runtime.adapter.RLMGepaAdapter` semantics rather than bespoke -`asyncio.gather` loops. Project-local `bench/` code may own dataset selection, -candidate loading, and `eval.json` summary shaping, but should reuse the adapter -for concurrency, per-task timeouts, progress bars, verbose RLM log handling, -`task_traces/*.jsonl`, and `cost_log.jsonl`. If the eval command is async, call -`await adapter.aevaluate(...)`; if it is synchronous, call -`adapter.evaluate(...)`. Write `eval.json` in the run directory so -`rlm-gepa stats ` works for held-out evals as well as optimize runs. - -## Feasibility Checklist - -Before producing the final plan, verify: - -- [ ] All proposed packages are Pyodide-compatible (or have host-side fallbacks) -- [ ] Network access needs are identified with specific domains -- [ ] Host-side tools are defined for anything that can't run in WASM -- [ ] Iteration count is reasonable (under 50 per RLM) -- [ ] Input sizes are manageable (or chunking strategy is defined) -- [ ] Output schemas are specific enough for reliable extraction -- [ ] The task is achievable — no unsupported capabilities assumed - -## Plan Output - -Write the plan to the Claude Code plan file with these sections: - -1. **Overview** — one paragraph: what, why, and expected workflow -2. **Delivery scope** — agent-only, evals, optimization, or full project -3. **File manifest** — every file to create with a one-line description -4. **Input schemas** — complete Pydantic model code for `agent/schema.py` -5. 
**Output schemas** — complete Pydantic model code for `agent/schema.py` -6. **Signature** — complete `agent/signature.py` code with strategy docstring -7. **Skills configuration** — built-in imports + custom `Skill(...)` - definitions + tool signatures -8. **Service architecture** — single RLM wiring or chained DAG: - - ``` - Stage1(documents) --[ExtractedData]--> Stage2(extracted) --[Report]--> Stage3(report) - ``` - -9. **Optional eval/optimization design** — only if requested. Include dataset - audit findings, split policy, scoring feedback shape, and reproducibility - seed. For optimization, also include the `AgentSpec` interview summary, - train/validation source, seed candidate source, and the exact `gepa/` files - to create. -10. **Feasibility notes** — constraints, risks, alternatives -11. **Estimated complexity** — iteration count, sub-LM calls, cost range, - runtime -12. **Smoke tests** — test files to create and commands to run. Every generated - RLM must include at least one fast no-network smoke test that imports the - generated package and constructs the service without making LLM calls. - -After writing the plan, use ExitPlanMode to get user approval. Once approved, -proceed to Phase 2. - ---- - -# Phase 2: Build - -Implement the approved plan. Create all files following the patterns below. - -## File structure - -Default to a grouped package. Keep the root package thin and put the callable -RLM under `agent/`. Add `tools/`, `bench/`, and `gepa/` only when the selected -delivery scope needs them. 
- -``` -my_rlm/ -├── pyproject.toml # Dependencies + generated-with metadata -├── __init__.py # Public exports from agent/ -├── agent/ -│ ├── __init__.py # Public agent exports -│ ├── schema.py # Pydantic models for inputs AND outputs -│ ├── signature.py # DSPy Signature + strategy docstring -│ ├── service.py # DSPy Module wiring signature + PredictRLM + skills -│ └── skills.py # Optional custom skill definitions -├── tools/ # Optional host-side tools and helpers -│ └── __init__.py -├── bench/ # Optional dataset/eval/scoring code -│ ├── __init__.py -│ ├── config.py # Optional eval defaults -│ └── cli.py # Optional eval CLI helpers -├── gepa/ # Optional project-local RLM-GEPA wiring -│ ├── __init__.py # Exports main for `my_rlm.gepa:main` -│ ├── config.py # OptimizeConfig defaults + AgentSpec -│ ├── project.py # RLMGepaProject implementation -│ ├── cli.py # Thin run_project_cli wiring -│ └── __main__.py # Optional `python -m my_rlm.gepa` -└── tests/ - └── test_smoke.py # Fast import/construction smoke tests -``` - -**Always create**: `pyproject.toml`, package `__init__.py`, `agent/schema.py`, -`agent/signature.py`, `agent/service.py`, `agent/__init__.py`, and -`tests/test_smoke.py`. - -**Create when needed**: - -- `agent/skills.py` when the RLM needs domain-specific instructions beyond - built-in skills. -- `tools/` when host-side functions or helper modules are needed. -- `bench/` when the user wants evals, datasets, scoring, or eval config. -- `gepa/` and a console script when the user wants RLM-GEPA optimization. - -Do not add compatibility shims for old flat module names in newly generated -projects. The grouped imports are the source of truth. - -## pyproject.toml — Dependencies and generated-with metadata - -Every generated RLM project should record which predict-rlm version and skill -layout it targets. Use the current package version unless the user explicitly -pins another one. For this repository version, that is `0.4.1`. 
- -Agent-only project: - -```toml -[project] -name = "my-rlm" -version = "0.1.0" -requires-python = ">=3.11" -dependencies = [ - "predict-rlm>=0.4.1,<0.5", -] - -[tool.predict-rlm.generated] -predict_rlm_version = "0.4.1" -skill_version = "2.0" -layout = "agent-tools-bench-gepa" -features = ["agent"] -``` - -If the project uses built-in example skills, add only the required extras or -packages. If it uses GEPA, add the GEPA extras and project-local `rlm-gepa` -script as the main UX. Do not include optimization dependencies for an -agent-only or eval-only project. - -```toml -dependencies = [ - "predict-rlm[examples,gepa,gepa-viz]>=0.4.1,<0.5", -] - -[project.scripts] -rlm-gepa = "my_rlm.gepa:main" - -[tool.predict-rlm.generated] -predict_rlm_version = "0.4.1" -skill_version = "2.0" -layout = "agent-tools-bench-gepa" -features = ["agent", "tools", "bench", "rlm-gepa"] -``` - -For examples inside the predict-rlm monorepo, an editable path source is fine, -but keep the generated metadata table so readers know which API/layout version -the project was generated against. - -## agent/schema.py — Pydantic models - -Define models for structured inputs and outputs. Use `Field(description=...)` so -the RLM knows what each field means. - -```python -from pydantic import BaseModel, Field - - -class KeyDate(BaseModel): - """A key date extracted from a document.""" - - name: str = Field(description="e.g. 'Submission Deadline', 'Effective Date'") - date: str = Field(description="ISO format date (YYYY-MM-DD)") - time: str | None = Field( - None, description="24-hour format (HH:MM), e.g. '14:00', '09:30'" - ) - timezone: str | None = Field( - None, description="Timezone code, e.g. 
'EST', 'EDT', 'PST', 'UTC'" - ) - - -class DocumentAnalysis(BaseModel): - """Structured analysis of a document set.""" - - report: str = Field( - description="Full analysis as a well-formatted markdown report" - ) - key_dates: list[KeyDate] = Field( - default_factory=list, description="Important dates found in the documents" - ) -``` - -## agent/signature.py — Inputs, outputs, and strategy - -The docstring becomes the RLM's system instructions — tell the RLM how to -approach the task step by step: - -```python -import dspy - -from predict_rlm import File - -from .schema import DocumentAnalysis - - -class AnalyzeDocuments(dspy.Signature): - """Analyze documents and produce a structured report. - - 1. **Read the report criteria** (appended below) to understand what - information to extract and in what format. - - 2. **Survey the documents** to understand what you're working with: - file names, page counts, document types. - - 3. **Gather information** systematically by rendering pages as images - and using predict() to extract content. - - 4. **Produce the report** following the format specified in the criteria. - Use tables for structured data, prose for analysis and context. 
- """ - - documents: list[File] = dspy.InputField( - desc="PDF documents to analyze" - ) - analysis: DocumentAnalysis = dspy.OutputField( - desc="Structured analysis with markdown report, key dates, and key entities" - ) -``` - -## agent/service.py — Wiring it together - -Wrap signature + skills + PredictRLM into a reusable DSPy Module: - -```python -import dspy - -from predict_rlm import File, PredictRLM -from predict_rlm.skills import pdf as pdf_skill - -from .schema import DocumentAnalysis -from .signature import AnalyzeDocuments - - -class DocumentAnalyzer(dspy.Module): - def __init__( - self, - sub_lm: dspy.LM | str | None = None, - max_iterations: int = 30, - verbose: bool = False, - debug: bool = False, - ): - self.sub_lm = sub_lm - self.max_iterations = max_iterations - self.verbose = verbose - self.debug = debug - - async def aforward( - self, documents: list[File], criteria: str - ) -> DocumentAnalysis: - signature = AnalyzeDocuments.with_instructions( - AnalyzeDocuments.instructions + "\n\n# Task\n\n" + criteria.strip() - ) - predictor = PredictRLM( - signature, - sub_lm=self.sub_lm, - skills=[pdf_skill], - max_iterations=self.max_iterations, - verbose=self.verbose, - debug=self.debug, - ) - result = await predictor.acall(documents=documents) - return result.analysis -``` - -When using multiple skills or host-side tools: - -```python -from predict_rlm.skills import pdf as pdf_skill -from predict_rlm.skills import spreadsheet as spreadsheet_skill - -async def aforward(self, documents: list[File]) -> MyOutput: - predictor = PredictRLM( - MySignature, - sub_lm=self.sub_lm, - skills=[pdf_skill, spreadsheet_skill], - tools={"fetch_exchange_rate": fetch_exchange_rate}, - ... 
- ) -``` - -### Chaining pattern (multiple RLMs) - -```python -async def aforward(self, documents: list[File]): - # Stage 1: Extract - extractor = PredictRLM(ExtractSignature, sub_lm=self.sub_lm, skills=[pdf_skill]) - extracted = await extractor.acall(documents=documents) - - # Stage 2: Analyze (uses output from stage 1) - analyzer = PredictRLM(AnalyzeSignature, sub_lm=self.sub_lm, skills=[analysis_skill]) - result = await analyzer.acall(data=extracted.data) - - return result -``` - -## agent/skills.py — Custom skills - -Create only when the RLM needs domain-specific instructions beyond built-in -skills. - -```python -from predict_rlm import Skill -from predict_rlm.skills import pdf as pdf_skill - -redaction_skill = Skill( - name="redaction", - instructions="""How to redact content from PDFs using pymupdf. - -## Text redaction -Search for text, create redaction annotations, then apply: - page = doc[page_num] - hits = page.search_for("sensitive text") - for rect in hits: - page.add_redact_annot(rect, fill=(0, 0, 0)) - page.apply_redactions() -...""", -) - -__all__ = ["pdf_skill", "redaction_skill"] -``` - -## tests/test_smoke.py — Generated smoke tests - -Create smoke tests for every generated RLM. The default smoke test must be fast -and must not require network access, API keys, Deno, Pyodide, or an actual LLM -call. It should prove the generated package imports, the signature exposes the -expected fields, and the service can be constructed. 
- -Tailor imports and class names to the generated RLM: - -```python -def test_service_constructs(): - from my_rlm import DocumentAnalyzer - - service = DocumentAnalyzer(max_iterations=1, verbose=False, debug=False) - assert service.max_iterations == 1 - - -def test_signature_has_fields(): - from my_rlm.agent.signature import AnalyzeDocuments - - assert AnalyzeDocuments.input_fields - assert AnalyzeDocuments.output_fields -``` - -If the generated project includes tiny local fixtures and the user wants an -end-to-end check, add a separate `@pytest.mark.integration` test that performs a -minimal `acall`. Gate that test behind explicit credentials or an environment -flag so the default smoke suite remains deterministic and cheap. - -## Optional bench/ package — Evals - -Create `bench/` only when the user wants evaluation. Keep it project-local: -dataset loaders, scoring rules, fixtures, and reports belong here, not in -`agent/`. Evals can exist without optimization. The eval layer should make the -dataset audit and split policy explicit: where examples come from, how labels or -reference outputs are represented, which examples are train/validation/test, and -which seed/grouping rules make the split reproducible. - -Suggested files when needed: - -```text -bench/ -├── __init__.py -├── config.py # Eval defaults and EvalConfig -├── dataset.py # Load examples/fixtures into typed task objects -├── evaluation.py # Project-specific task execution/scoring contract -├── scoring.py # Deterministic task-specific scoring -└── cli.py # Optional held-out eval subcommand/helpers -``` - -For benchmark eval and optimize entrypoints, use the shared RLM-GEPA runtime -adapter semantics in `src/rlm_gepa/runtime/adapter.py` unless the user -explicitly asks for a one-off local harness. Put dataset loading, scoring, -setup, and task cleanup behind the project contract; let the shared adapter own -concurrency, timeouts, progress/tqdm display, trace capture, and report -semantics. 
- -## Optional gepa/ package — Optimization - -Create optimization wiring only when the user asks for it. The shared `rlm_gepa` -package provides generic orchestration; the generated project owns its task -loading, metric, seed candidate, and defaults. Import `agent_spec_from_rlm`, -`OptimizeConfig`, `RLMGepaExampleResult`, `RLMGepaProject`, and -`EvaluationContext` from `rlm_gepa` rather than copying optimizer internals into -the project. - -The generated `AgentSpec` should be interview-backed, but it should not -duplicate facts already present on the RLM. Define a single `build_rlm(...)` -helper that constructs the PredictRLM with the real DSPy signature, skills, and -tools. Use `agent_spec_from_rlm(build_rlm(seed_instructions), ...)` so GEPA -derives `target_signature` and `tool_signatures` from that object. Omit -`agent_type` by default; set it only for a short product or optimization anchor -that adds non-duplicative framing beyond the signature, tools, or output schema. -The interview should supply only the extra GEPA brief: transfer use cases, -runtime-grounding examples, scoring signal, and anti-overfitting boundary. If -any of those are weakly specified, add a `TODO` in `config.py` and keep -`optimize --check` available so the user can catch missing setup before spending -model calls. - -Suggested files when needed: - -```text -gepa/ -├── __init__.py # Exports `main` for the console script -├── config.py # Project-local OptimizeConfig defaults + AgentSpec -├── project.py # RLMGepaProject implementation and metric wiring -├── cli.py # Thin run_project_cli glue -└── __main__.py # Optional `python -m my_rlm.gepa` -``` - -Optimization can reuse helpers from `bench/` when evals exist. If a `bench/` -package is present, expose its seed/validation/held-out evaluation flow as an -`eval` subcommand on the same GEPA CLI surface. 
Do not create a held-out eval -command just because GEPA is present: GEPA needs training and validation -examples plus feedback; a separate benchmark/eval suite is optional. - ---- +Define every output field with name, type, and description. Use Pydantic models +with `Field(description=...)` for structured outputs. Use `File` output fields +when the RLM must write artifacts back to the host. -# Architecture Reference +Push vague outputs into concrete schemas before implementation. Ask what fields +the caller will inspect first, which derived fields matter, and whether generated +files are required. -Use this reference to ensure plans and implementations are accurate. Do not -hallucinate parameters or patterns. +### 4. Research Feasibility -## How an RLM works +Do autonomous research before writing the plan. Read +`references/sandbox-and-research.md` when package compatibility, network access, +or host-side tools are relevant. -The architecture is two-level: +Report feasibility clearly: package support, sandbox constraints, +`allowed_domains`, required host tools, likely iteration count, and blockers. -1. **The outer LLM** (the RLM itself) writes and executes Python code in a - sandboxed Pyodide/WASM REPL. It plans, orchestrates, and iterates. -2. **The sub-LM** (via `predict()`) handles perception and extraction — - analyzing images, understanding text, and returning typed results. Each - `predict()` call gets its own context window. +### 5. Design Skills And Tools -The outer LLM's context stays small (code + tool results), while context-heavy -work is offloaded to `predict()` calls. +Choose built-in skills or custom `Skill(...)` definitions. Use built-in skills +for common document domains: -## File I/O +- `pdf`: PDF reading, rendering, manipulation, and redaction. +- `spreadsheet`: Excel workbook editing, formulas, and verification. +- `docx`: Word document reading, writing, tables, formatting, and styles. 
-Use `File` for file-typed fields: +Create custom skills only when the RLM needs reusable domain instructions, +sandbox packages, mounted modules, or bundled host-side tools. -- **Input field**: mounts the file from host into the sandbox at - `/sandbox/input/{field_name}/` -- **Output field**: syncs from `/sandbox/output/{field_name}/` back to the host +Use host-side tools for authenticated APIs, database calls, native binaries, +heavy filesystem work, or anything that cannot run cleanly in the sandbox. -```python -from predict_rlm import File +### 6. Choose Architecture -# Input: File(path="/absolute/path/to/file.pdf") -# Output: declared as File output field, RLM writes to /sandbox/output// -``` +Write the signature docstring as the RLM playbook: -## PredictRLM constructor +1. how to survey the inputs; +2. how to gather information with files, skills, tools, and `predict()`; +3. how to process and verify results; +4. what to return or save. -```python -PredictRLM( - signature: type[Signature] | str, # DSPy signature class - lm: dspy.LM | str | None = None, # Main LM (code generation) - sub_lm: dspy.LM | str | None = None, # Sub-LM for predict() calls - max_iterations: int = 30, - max_llm_calls: int = 50, - verbose: bool = False, - tools: dict[str, Callable] | list[Callable] | None = None, - allowed_domains: list[str] | None = None, - skills: list[Skill] | None = None, - debug: bool = False, - output_dir: str | Path | None = None, -) -``` +Use a single RLM when the task is one coherent workflow and can stay within a +reasonable iteration budget. Use chained RLMs when phases have different inputs, +skills, outputs, or budgets. -Both `lm` and `sub_lm` accept a model string (e.g. `"openai/gpt-5.4"`) or a -`dspy.LM` instance. If `lm` is omitted, the current context LM from -`dspy.context(lm=...)` is used. +### 7. 
Confirm Delivery Scope -## CodexLM / ChatGPT subscription backend +Default to an agent-only package unless the user asks for more: -Starting in `predict-rlm` v0.7.0, `predict-rlm[codex-lm]` includes -`dspy_codex_lm.CodexLM`, a DSPy LM backed by the Codex/ChatGPT subscription -backend. Use it when the user wants PredictRLM to run on Codex model slugs -through ChatGPT/Codex auth instead of normal OpenAI API keys. +- **Agent only**: callable RLM package, domain skills/tools, and fast smoke + tests. +- **Agent + evals**: add project-local dataset loading and scoring when the user + has examples, fixtures, labels, or deterministic metrics. -Install and authenticate: +For GEPA optimization or candidate selection, stop using this skill and switch +to `rlm-gepa`. -```bash -uv add "predict-rlm[codex-lm]" -uv run codex-lm auth login default -uv run codex-lm auth status -uv run codex-lm smoke-test --model gpt-5.5 -``` - -For prerelease builds, allow prereleases explicitly: - -```bash -uv add --prerelease=allow "predict-rlm[codex-lm]==0.7.0-alpha0" -``` +### 8. Write The Plan -Use `CodexLM` directly when wiring `lm` or `sub_lm`: - -```python -from dspy_codex_lm import CodexLM -from predict_rlm import PredictRLM - -rlm = PredictRLM( - MySignature, - lm=CodexLM(model="gpt-5.5"), - sub_lm=CodexLM(model="gpt-5.5"), -) -``` +Produce a plan with: -To run an existing DSPy script without editing its LM construction, use the CLI -wrapper. It monkeypatches OpenAI-family `dspy.LM(...)` calls and routes supported -Codex model slugs to `CodexLM`: +1. Overview. +2. Delivery scope. +3. File manifest. +4. Input schemas. +5. Output schemas. +6. Signature code and strategy docstring. +7. Skills and host-side tools. +8. Service architecture or chained RLM DAG. +9. Feasibility notes. +10. Estimated complexity: iterations, sub-LM calls, runtime, and cost range. +11. Smoke tests and commands. 
-```bash -uv run codex-lm my_dspy_script.py -``` +Get approval before building when the environment or calling surface has an +explicit plan mode. Otherwise proceed when the user has already asked for +implementation. -Useful CLI commands: +### 9. Build And Verify -```bash -uv run codex-lm --help -uv run codex-lm auth list -uv run codex-lm usage -uv run codex-lm rotation on -``` - -Important caveats: - -- CodexLM uses Codex/ChatGPT subscription auth, not ordinary OpenAI API keys. -- Routing is strict: OpenAI-family model strings are intercepted only when the - slug is a supported Codex model; unsupported OpenAI-family slugs raise an - error instead of silently falling back. -- Common supported slugs include `gpt-5.1-codex`, `gpt-5.1-codex-max`, - `gpt-5.1-codex-mini`, `gpt-5.2`, `gpt-5.2-codex`, `gpt-5.3-codex`, - `gpt-5.3-codex-spark`, `gpt-5.4`, `gpt-5.4-mini`, and `gpt-5.5`. - -## Skill dataclass - -```python -from predict_rlm import Skill - -Skill( - name="my-skill", # Short identifier - instructions="How to approach...", # Prose injected into the RLM prompt - packages=["pandas", "openpyxl"], # PyPI packages installed in the sandbox - modules={"helper": "/path/to/helper.py"}, # Python files mounted as importable modules - tools={"fetch": fetch_fn}, # Host-side callable functions exposed to the RLM -) -``` - -Skills can bundle **host-side tools** via their `tools=` field. When skills are -composed, their tools are merged alongside instructions and packages (tool name -conflicts raise errors). 
- -## Built-in skills - -```python -from predict_rlm.skills import pdf as pdf_skill # pymupdf -from predict_rlm.skills import spreadsheet as spreadsheet_skill # openpyxl, pandas, formulas -from predict_rlm.skills import docx as docx_skill # python-docx -``` - -| Skill | Packages | Modules | What it teaches the RLM | -| --------------- | -------------------------------- | -------------- | -------------------------------------------------------------------------- | -| **pdf** | `pymupdf` | — | Read, render, modify, and redact PDFs | -| **spreadsheet** | `openpyxl`, `pandas`, `formulas` | `formula_eval` | Build and modify Excel workbooks with formulas and formatting | -| **docx** | `python-docx` | `md2docx` | Read, write, and modify Word documents with tables, formatting, and styles | - -## Tools - -Tools are **host-side functions** the RLM can call from the sandbox. Use them -for operations that cannot run inside the sandbox — host access, authenticated -APIs, database queries, system resources. - -```python -async def fetch_exchange_rate(currency: str, date: str) -> str: - """Fetch the exchange rate for a currency on a given date. - - Args: - currency: ISO currency code (e.g. "EUR", "GBP") - date: Date in YYYY-MM-DD format - - Returns: - JSON string with the exchange rate data - """ - async with httpx.AsyncClient() as client: - resp = await client.get(f"https://api.example.com/rates/{currency}/{date}") - return resp.text -``` - -Tools can be passed directly to PredictRLM via `tools={"name": fn}` or bundled -inside a Skill via `tools=`. - -### When to use a Skill vs tools - -| Use a Skill when... | Use `tools=` when... 
| -| ---------------------------------------------------- | ------------------------------------------------------------------------- | -| The RLM needs a **package** installed in the sandbox | The function must run on the **host** (API calls, DB queries, filesystem) | -| You need to teach the RLM **how to use** something | The tool's docstring is self-explanatory | -| The knowledge is **reusable** across RLMs | It's a single specific function for one RLM | - -## predict() tool (inside sandbox) - -The RLM can call `predict()` for sub-LM perception/extraction: - -```python -result = await predict( - "image: dspy.Image -> items: list[Item]", - instructions="Extract all line items from this invoice page", - image=page_image, -) -``` - -Each predict() call gets its own context window. Supports `dspy.Image` for -multimodal. - -## Key imports - -```python -from predict_rlm import PredictRLM, Skill, File -from predict_rlm.skills import pdf, spreadsheet, docx -``` - -## WASM sandbox constraints - -- Only pure-Python wheels or Pyodide built-in packages work -- No subprocess, no native binaries, no C extensions (unless Emscripten-built) -- Network access requires `allowed_domains` whitelist -- File I/O is within the sandbox filesystem -- Host-side tools bridge the gap for anything WASM can't do +Implement the approved plan using the package structure in +`references/project-layout.md`. Every generated RLM must include fast +no-network smoke tests that import the package, inspect the signature fields, +and construct the service without making LLM calls. diff --git a/.agents/skills/rlm/references/predict-rlm-api.md b/.agents/skills/rlm/references/predict-rlm-api.md new file mode 100644 index 00000000..3561cdc3 --- /dev/null +++ b/.agents/skills/rlm/references/predict-rlm-api.md @@ -0,0 +1,138 @@ +# PredictRLM API Reference + +Use this reference to keep generated code aligned with the package API. 
+ +## Core Imports + +```python +from predict_rlm import File, PredictRLM, Skill +from predict_rlm.skills import docx, pdf, spreadsheet +``` + +## PredictRLM + +```python +PredictRLM( + signature: type[Signature] | str, + lm: dspy.LM | str | None = None, + sub_lm: dspy.LM | str | None = None, + max_iterations: int = 30, + max_llm_calls: int = 50, + max_output_chars: int = 100_000, + verbose: bool = True, + tools: dict[str, Callable[..., str]] | list[Callable] | None = None, + interpreter: CodeInterpreter | None = None, + sandbox_backend: BackendName | str | None = None, + sbx_config: SbxConfig | None = None, + sbx_pool: SbxPool | None = None, + allowed_domains: list[str] | None = None, + skills: list[Skill] | None = None, + debug: bool = False, + output_dir: str | Path | None = None, + telemetry_context: TelemetryContext | None = None, + submit_confirmation: Callable[[SubmitConfirmationContext], str | None] | None = None, + trace_export_path: str | Path | None = None, + runtime_hooks: list[RuntimeHook] | None = None, + on_runtime_hook_event: Callable[[RuntimeHookEvent], Any] | None = None, + model_execution_timeout: bool = False, +) +``` + +Both `lm` and `sub_lm` accept a model string or a `dspy.LM` instance. If `lm` is +omitted, the current `dspy.context(lm=...)` LM is used. + +## File I/O + +Use `File` for large inputs and generated artifacts. + +- Input fields mount host files under `/sandbox/input/{field_name}/`. +- Output fields sync files from `/sandbox/output/{field_name}/` back to the host. + +## Skills + +```python +Skill( + name="my-skill", + instructions="How to approach the domain...", + packages=["pandas", "openpyxl"], + modules={"helper": "/path/to/helper.py"}, + tools={"fetch": fetch_fn}, +) +``` + +Skills bundle reusable instructions, sandbox packages, mounted modules, and +host-side tools. When skills are composed, instructions concatenate, packages +deduplicate, and tool-name conflicts raise errors. 
+ +Built-in skills: + +| Skill | Import | Packages | Modules | Purpose | +| --- | --- | --- | --- | --- | +| `pdf` | `from predict_rlm.skills import pdf` | `pymupdf` | - | Read, render, modify, and redact PDFs | +| `spreadsheet` | `from predict_rlm.skills import spreadsheet` | `openpyxl`, `pandas`, `formulas` | `formula_eval` | Build and modify Excel workbooks | +| `docx` | `from predict_rlm.skills import docx` | `python-docx` | `md2docx` | Read, write, and modify Word documents | + +## Host-Side Tools + +Tools are host-side callables the RLM can invoke from the sandbox. Use them for +operations that need host access, authenticated APIs, databases, native +libraries, or filesystem-heavy work. + +```python +async def fetch_exchange_rate(currency: str, date: str) -> str: + """Fetch the exchange rate for a currency on a given date. + + Args: + currency: ISO currency code, e.g. "EUR". + date: Date in YYYY-MM-DD format. + + Returns: + JSON string with exchange-rate data. + """ + ... +``` + +Pass tools directly via `tools={"name": fn}` or bundle reusable tools in a +`Skill`. + +## predict() Inside The Sandbox + +The RLM can call `predict()` for sub-LM perception or extraction. Each call gets +its own context window. + +```python +result = await predict( + "image: dspy.Image -> items: list[Item]", + instructions="Extract all line items from this invoice page", + image=page_image, +) +``` + +Use `dspy.Image` for multimodal image inputs. + +## CodexLM + +When the user wants PredictRLM to run on Codex/ChatGPT subscription auth instead +of ordinary OpenAI API keys, use `predict-rlm[codex-lm]` and `CodexLM`. 
+ +```bash +uv add "predict-rlm[codex-lm]" +uv run codex-lm auth login default +uv run codex-lm auth status +uv run codex-lm smoke-test --model gpt-5.5 +``` + +```python +from dspy_codex_lm import CodexLM +from predict_rlm import PredictRLM + +rlm = PredictRLM( + MySignature, + lm=CodexLM(model="gpt-5.5"), + sub_lm=CodexLM(model="gpt-5.5"), +) +``` + +`CodexLM` uses Codex/ChatGPT subscription auth, not ordinary OpenAI API keys. +Routing is strict: unsupported Codex slugs should fail instead of silently +falling back. diff --git a/.agents/skills/rlm/references/project-layout.md b/.agents/skills/rlm/references/project-layout.md new file mode 100644 index 00000000..8ee97805 --- /dev/null +++ b/.agents/skills/rlm/references/project-layout.md @@ -0,0 +1,187 @@ +# Project Layout + +Default to a grouped package. Keep the root package thin and put the callable +RLM under `agent/`. Add `tools/` and `bench/` only when the selected delivery +scope needs them. + +```text +my_rlm/ +├── pyproject.toml +├── __init__.py +├── agent/ +│ ├── __init__.py +│ ├── schema.py +│ ├── signature.py +│ ├── service.py +│ └── skills.py # optional custom skills +├── tools/ # optional host-side tools/helpers +├── bench/ # optional eval dataset/scoring code +└── tests/ + └── test_smoke.py +``` + +Always create `pyproject.toml`, package `__init__.py`, `agent/schema.py`, +`agent/signature.py`, `agent/service.py`, `agent/__init__.py`, and +`tests/test_smoke.py`. Do not add compatibility shims for old flat module names +in newly generated projects. + +## pyproject.toml + +Record generated-with metadata so readers know the target API/layout version. +Use the current package version unless the user explicitly pins another one. 
+ +```toml +[project] +name = "my-rlm" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "predict-rlm>=0.7.0-alpha5,<0.8", +] + +[tool.predict-rlm.generated] +predict_rlm_version = "0.7.0-alpha5" +skill_version = "3.0" +layout = "agent-tools-bench" +features = ["agent"] +``` + +For examples inside the `predict-rlm` monorepo, an editable path source is fine, +but keep the metadata table. + +## Schema Pattern + +Define models for structured inputs and outputs. Use `Field(description=...)` so +the RLM knows what each field means. + +```python +from pydantic import BaseModel, Field + + +class KeyDate(BaseModel): + """A key date extracted from a document.""" + + name: str = Field(description="e.g. 'Submission Deadline', 'Effective Date'") + date: str = Field(description="ISO format date (YYYY-MM-DD)") + time: str | None = Field(None, description="24-hour format, e.g. '14:00'") + timezone: str | None = Field(None, description="Timezone code, e.g. 'UTC'") + + +class DocumentAnalysis(BaseModel): + """Structured analysis of a document set.""" + + report: str = Field(description="Full analysis as a markdown report") + key_dates: list[KeyDate] = Field( + default_factory=list, + description="Important dates found in the documents", + ) +``` + +## Signature Pattern + +The signature docstring is the RLM's operating strategy. + +```python +import dspy + +from predict_rlm import File + +from .schema import DocumentAnalysis + + +class AnalyzeDocuments(dspy.Signature): + """Analyze documents and produce a structured report. + + 1. Read the report criteria. + 2. Survey the documents: file names, page counts, and document types. + 3. Gather information by rendering pages and using predict() for extraction. + 4. Produce the requested report with grounded structured fields. 
+ """ + + documents: list[File] = dspy.InputField(desc="PDF documents to analyze") + analysis: DocumentAnalysis = dspy.OutputField( + desc="Structured analysis with report and key dates" + ) +``` + +## Service Pattern + +Wrap the signature and skills in a reusable DSPy module. + +```python +import dspy + +from predict_rlm import File, PredictRLM +from predict_rlm.skills import pdf as pdf_skill + +from .schema import DocumentAnalysis +from .signature import AnalyzeDocuments + + +class DocumentAnalyzer(dspy.Module): + def __init__( + self, + sub_lm: dspy.LM | str | None = None, + max_iterations: int = 30, + verbose: bool = False, + debug: bool = False, + ): + self.sub_lm = sub_lm + self.max_iterations = max_iterations + self.verbose = verbose + self.debug = debug + + async def aforward( + self, documents: list[File], criteria: str + ) -> DocumentAnalysis: + signature = AnalyzeDocuments.with_instructions( + AnalyzeDocuments.instructions + "\n\n# Task\n\n" + criteria.strip() + ) + predictor = PredictRLM( + signature, + sub_lm=self.sub_lm, + skills=[pdf_skill], + max_iterations=self.max_iterations, + verbose=self.verbose, + debug=self.debug, + ) + result = await predictor.acall(documents=documents) + return result.analysis +``` + +## Chaining Pattern + +Use chained RLMs only for distinct phases with different skills, budgets, or +typed artifacts. + +```python +async def aforward(self, documents: list[File]): + extractor = PredictRLM(ExtractSignature, sub_lm=self.sub_lm, skills=[pdf_skill]) + extracted = await extractor.acall(documents=documents) + + analyzer = PredictRLM(AnalyzeSignature, sub_lm=self.sub_lm, skills=[analysis_skill]) + return await analyzer.acall(data=extracted.data) +``` + +## Smoke Tests + +Default smoke tests must be fast and must not require network access, API keys, +Deno, Pyodide, or LLM calls. 
+ +```python +def test_service_constructs(): + from my_rlm import DocumentAnalyzer + + service = DocumentAnalyzer(max_iterations=1, verbose=False, debug=False) + assert service.max_iterations == 1 + + +def test_signature_has_fields(): + from my_rlm.agent.signature import AnalyzeDocuments + + assert AnalyzeDocuments.input_fields + assert AnalyzeDocuments.output_fields +``` + +If an end-to-end check is useful, add it as a separate integration test gated by +explicit credentials or an environment flag. diff --git a/.agents/skills/rlm/references/sandbox-and-research.md b/.agents/skills/rlm/references/sandbox-and-research.md new file mode 100644 index 00000000..8aa30834 --- /dev/null +++ b/.agents/skills/rlm/references/sandbox-and-research.md @@ -0,0 +1,51 @@ +# Sandbox And Research + +Research feasibility before implementation whenever the RLM needs third-party +packages, external network access, heavy computation, native libraries, or +nontrivial file formats. + +## Package Compatibility + +The default sandbox runs Python in WASM. Packages work when they are pure Python +or available as Pyodide/Emscripten builds. + +Check each candidate package: + +- Does PyPI provide a `py3-none-any` wheel? +- Is the package in the Pyodide built-in package list? +- Does it depend on C extensions or native binaries? +- Is a host-side tool simpler and more reliable? + +Do not assume native packages, subprocesses, system binaries, or arbitrary +filesystem access are available in the sandbox. + +## Network Access + +If the RLM must call external APIs from the sandbox, identify exact domains and +set `allowed_domains`. Prefer host-side tools for authenticated APIs so secrets, +refresh tokens, and provider SDKs stay outside the sandbox. 
+ +## Host-Side Tool Decisions + +Use a host-side tool when the operation: + +- requires authentication or private environment variables; +- calls a database, SaaS API, or internal service; +- needs a native library, subprocess, system binary, browser, or GPU; +- reads/writes outside mounted input/output files; +- is deterministic and easier to implement outside the RLM loop. + +Expose concise signatures and docstrings. The RLM sees the docstring to decide +when and how to call the tool. + +## Feasibility Report + +Before finalizing the build plan, report: + +- packages and compatibility status; +- built-in and custom skills; +- host-side tools and why they are needed; +- network allowlist domains; +- estimated iteration count and sub-LM call volume; +- input-size or chunking strategy; +- any unsupported assumptions or blockers. diff --git a/README.md b/README.md index 15354fec..04669d3d 100644 --- a/README.md +++ b/README.md @@ -117,19 +117,24 @@ codex-lm usage ### With your coding agent -Install the [predict-rlm skill](.agents/skills/rlm/SKILL.md) in Claude Code, -Codex, Cursor, or any compatible coding agent: +Install the repo's agent skills in Claude Code, Codex, Cursor, or any compatible +coding agent: ```bash npx skills add Trampoline-AI/predict-rlm ``` -Then ask your agent to build an RLM: +Then ask your agent to build an RLM with [`rlm`](.agents/skills/rlm/SKILL.md): ``` -❯ /rlm build an RLM that extracts line items from PDF invoices into a spreadsheet +❯ $rlm build an RLM that extracts line items from PDF invoices into a spreadsheet ``` +Use [`rlm-gepa`](.agents/skills/rlm-gepa/SKILL.md) when you want GEPA +optimization wiring, and +[`predict-rlm-contributor`](.agents/skills/predict-rlm-contributor/SKILL.md) +when contributing to this repository itself. 
+ ### Quick Example ```python diff --git a/src/rlm_gepa/README.md b/src/rlm_gepa/README.md index 48eb4f1d..25fbb2b2 100644 --- a/src/rlm_gepa/README.md +++ b/src/rlm_gepa/README.md @@ -29,28 +29,31 @@ improvement versus a benchmark-specific hack. Budget knobs such as ## Start with a coding agent -The repository’s `/rlm` agent skill in `.agents/skills/rlm/SKILL.md` is the -recommended starting point. It can build a normal PredictRLM package, and when -you ask for optimization it can also add the RLM-GEPA project wiring. +The repository ships separate agent skills for separate jobs: -Install the skill in Claude Code, Codex, Cursor, or any compatible coding agent: +- `.agents/skills/rlm/SKILL.md` builds normal PredictRLM packages. +- `.agents/skills/rlm-gepa/SKILL.md` adds RLM-GEPA optimization wiring. +- `.agents/skills/predict-rlm-contributor/SKILL.md` is for contributing to this + repository itself. + +Install the skills in Claude Code, Codex, Cursor, or any compatible coding agent: ```bash npx skills add Trampoline-AI/predict-rlm ``` -Then ask the agent to use `/rlm` and be explicit about whether you want just the -PredictRLM, evals, or RLM-GEPA optimization too: +Use `$rlm` to design the PredictRLM itself, and use `$rlm-gepa` when evals or +RLM-GEPA optimization are in scope: ```text -/rlm interview me to design a PredictRLM that extracts renewal terms, pricing +$rlm-gepa interview me to design a PredictRLM that extracts renewal terms, pricing changes, and notice windows from vendor contracts. Then build the RLM, evals, and RLM-GEPA optimization wiring. ``` -When the prompt asks for an interview, the `/rlm` skill is expected to scope the -RLM and GEPA setup before it writes the plan. The RLM itself should remain the -source of truth for the DSPy signature and tools; GEPA should derive those via +When the prompt asks for an interview, the `$rlm-gepa` skill scopes the RLM and +GEPA setup before it writes the plan. 
The RLM itself should remain the source of +truth for the DSPy signature and tools; GEPA should derive those via `agent_spec_from_rlm(...)`. The interview fills in the extra GEPA brief: - input shape and scale, for example “PDF/MSA/SOW contracts, 20-200 pages,” @@ -69,7 +72,7 @@ source of truth for the DSPy signature and tools; GEPA should derive those via “sales coaching,” “customer-support QA,” “data-cleaning workflows,” or “competitive research.” -When optimization is in scope, the `/rlm` skill should add the project-local +When optimization is in scope, the `$rlm-gepa` skill should add the project-local `gepa/` package: ```text @@ -100,7 +103,7 @@ from predict_rlm import PredictRLM, Skill from predict_rlm.trace import RunTrace from rlm_gepa import EvaluationContext, RLMGepaExampleResult, RLMGepaProject, agent_spec_from_rlm -from .signature import AnalyzeDocuments +from ..agent.signature import AnalyzeDocuments SEED_SKILL_INSTRUCTIONS = "Initial domain instructions for the RLM." @@ -214,7 +217,7 @@ component needs a different proposer brief. RLM-GEPA projects should feel like a product CLI: from the project root, run `uv run rlm-gepa ...` for checks, evals, optimization, stats, and plots. When -the `/rlm` skill scaffolds an optimization project, it should set this up in +the `$rlm-gepa` skill scaffolds an optimization project, it should set this up in `pyproject.toml` for you: ```toml @@ -257,6 +260,14 @@ The `eval` subcommand is project-specific because datasets and metrics are project-specific. Agent-only or optimization-only projects do not need a held-out `eval` command unless the user asks for one. +For eval and optimization CLIs, route task execution through +`rlm_gepa.runtime.adapter.RLMGepaAdapter` rather than bespoke `asyncio.gather` +loops. 
Project-local code can own dataset selection, candidate loading, task +setup, and `eval.json` summary shaping; the shared adapter owns concurrency, +per-task timeouts, progress display, verbose RLM logs, `task_traces/*.jsonl`, +and `cost_log.jsonl`. Write `eval.json` in the run directory so +`rlm-gepa stats RUN_DIR` works for held-out evals as well as optimization runs. + Use `--verbose-rlm` to print human-readable RLM trace blocks during eval: reasoning, generated code, output, tool calls, errors, and `SUBMIT` payloads. Use `--debug-rlm` for timestamped RLM and sandbox lifecycle diagnostics. diff --git a/tests/test_rlm_skill_docs.py b/tests/test_rlm_skill_docs.py index 442546ab..10e22af2 100644 --- a/tests/test_rlm_skill_docs.py +++ b/tests/test_rlm_skill_docs.py @@ -9,25 +9,28 @@ ROOT = Path(__file__).resolve().parents[1] +def _installable_skill_docs_text() -> str: + skill_docs = sorted((ROOT / ".agents" / "skills").glob("**/*.md")) + return "\n".join(path.read_text() for path in skill_docs) + + def test_public_rlm_skill_version_snippets_match_package_version(): package_version = tomllib.loads((ROOT / "pyproject.toml").read_text())["project"][ "version" ] - skill_text = (ROOT / ".agents/skills/rlm/SKILL.md").read_text() - - if package_version != "0.4.1": - return + skill_text = _installable_skill_docs_text() stale_snippets = [ r"predict-rlm>=0\.3\.0", r"predict-rlm\[[^\]]+\]>=0\.4\.0", ] + assert f'predict_rlm_version = "{package_version}"' in skill_text for snippet in stale_snippets: assert not re.search(snippet, skill_text), f"stale RLM skill snippet: {snippet}" def test_public_rlm_skill_requires_shared_eval_adapter_semantics(): - skill_text = (ROOT / ".agents/skills/rlm/SKILL.md").read_text() + skill_text = _installable_skill_docs_text() assert "rlm_gepa.runtime.adapter.RLMGepaAdapter" in skill_text assert "eval.json" in skill_text