diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 86b3cdc..02b2599 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,22 @@ jobs: - name: Run structure tests run: pytest tests/test_skills_structure.py -v + # L1 structural eval tests — required. Drive the code-based graders in + # evals/graders/ against committed fixtures. Cheap, deterministic, no + # network or LLM calls. The matching L2 LLM-judge runs are invoked + # locally / on demand (see evals/README.md), not in CI. + evals-fast: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install Python dependencies + run: pip install pytest pyyaml + - name: Run L1 eval tests + run: pytest tests/test_skill_outputs.py -v + # Network-dependent integration tests against live arXiv / IACR ePrint APIs. # Non-blocking: external services can rate-limit, return 5xx, or change # their HTML — none of which means the package is broken. We still run diff --git a/.gitignore b/.gitignore index 4c816ad..b97731b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ __pycache__/ *.pyc .context/ reaper-workspace/ +evals/runs/ +evals/reports/ diff --git a/CLAUDE.md b/CLAUDE.md index 34382bc..63a4cc7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,8 +9,8 @@ AI-native scientific research pipeline distributed as a host-agnostic skills pac - `/clarify-goal` — Interactive goal clarification (asks user targeted questions before pipeline runs) - `/analyze-paper`, `/review-literature`, `/formalize-problem`, `/brainstorm`, `/investigate`, `/critique`, `/synthesize` — Pipeline stages - `/search-paper` — Academic search + citation graph + venue resolution. Bundles five Python drivers (`arxiv.py`, `iacr.py`, `semantic_scholar.py`, `dblp.py`, `openalex.py`); the `SKILL.md` itself orchestrates the layered venue lookup. 
-- `tests/` — Python tests for skill structure and search scripts -- `evals/` — Test cases with quality criteria (`evals.json`) +- `tests/` — Python tests for skill structure, search scripts, and L1 eval graders +- `evals/` — Layered evaluation system. L1 code-based graders (`graders/`), L2 Claude-CLI LLM judges (`judge/`), per-skill rubrics (`rubrics/`), and fixtures with reference + planted-negative variants. Orchestrator: `python3 -m evals.run_evals`. See `evals/README.md`. - `dev/` — Development docs including `ROADMAP.md` (full methodology and design) - `.claude-plugin/` — Claude-Code-specific plugin manifest (`plugin.json`, `marketplace.json`); other hosts ignore this directory - `.github/workflows/` — CI (pytest + strict `npx skills` discovery check that asserts every expected skill, script, and reference file is present after installation) @@ -18,11 +18,15 @@ AI-native scientific research pipeline distributed as a host-agnostic skills pac ## Commands ```bash -# Run tests +# Run tests (includes L1 structural eval graders) pytest tests/ -# Python dependencies for search skills -pip install arxiv requests beautifulsoup4 +# Run the layered evals +python3 -m evals.run_evals --layer structural # L1 only — no LLM, what CI runs +python3 -m evals.run_evals --layer all --skill analyze-paper # L1 + L2 (uses local `claude` CLI) + +# Python dependencies for search skills + evals +pip install arxiv requests beautifulsoup4 pyyaml ``` ## Key conventions @@ -43,6 +47,7 @@ pip install arxiv requests beautifulsoup4 - When cutting a release tag, the tag message should summarize changes since the last tag (use `git log ..HEAD`). - Always use squash merge for PRs. - Before finishing a task, check if important docs (README.md, CLAUDE.md, dev/ROADMAP.md) need to be updated to reflect your changes. 
+- Eval discipline: skill changes that affect a graded artifact (sections, output shape, quality criteria) must keep the corresponding rule in `evals/run_evals.py::SKILL_STRUCTURAL_RULES` and the rubric under `evals/rubrics/<skill>.yaml` in sync. Add fixtures (one reference + at least one planted negative per layer) before claiming coverage for a new skill. Calibrate new judge dimensions against `evals/golden/` before relying on them. Eval design and authoring follow Anthropic's [*Demystifying Evals for AI Agents*](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) — code-based vs model-based vs human grader split, per-dimension scoring with an "unknown" escape hatch, isolated trials, two-sided cases (both planted negatives and references), and `pass^k` for consistency. Read it before adding a new layer or rubric. ## Distribution diff --git a/README.md b/README.md index 157bb10..35c8ef4 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,26 @@ reaper-workspace/ The workspace contract is host-agnostic — any agent that can read and write files in the working directory produces the same workspace structure. +## Evaluation + +Skills ship with a layered evaluation system following Anthropic's [*Demystifying Evals for AI Agents*](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents) methodology. The judge is the local `claude` CLI — no API key, just your existing subscription. 
+ +| Layer | Grader | Cadence | Scope | +|---|---|---|---| +| L1 Structural | Code (`evals/graders/`) | Every PR (CI) | Required sections, lengths, broken refs, keep-or-discard cycle invariant | +| L2 Skill rubric | `claude -p` with structured-output JSON schema (`evals/judge/`) | Locally / nightly | Per-skill quality dimensions: groundedness, specificity, completeness | +| L3 End-to-end | Both | Pre-release | Full `/reaper` pipeline against canonical cases | + +```bash +# L1 only (no LLM) — same thing CI runs +python3 -m evals.run_evals --layer structural + +# L1 + L2 (uses your local claude CLI) +python3 -m evals.run_evals --layer all --skill analyze-paper +``` + +Each fixture pairs a gold-standard reference with planted negatives — one targeting L1 (drops a required section) and one targeting L2 (fabricated theorem statements, generic content) — so a permissive grader fails CI as visibly as a missed regression. See [`evals/README.md`](evals/README.md) for the full design and how to add a fixture. + ## Methodology Reaper's research loop follows six principles: @@ -202,7 +222,7 @@ See [`dev/ROADMAP.md`](dev/ROADMAP.md) for the full methodology and development See [`dev/ROADMAP.md`](dev/ROADMAP.md) for the full roadmap. 
-- **Horizon 1 (The Pipeline)**: Core skills, orchestrator, and eval framework — *complete; LaTeX report output planned* +- **Horizon 1 (The Pipeline)**: Core skills, orchestrator, and layered eval system (L1 structural graders + L2 Claude-CLI judges with rubrics, calibrated against planted negatives) — *complete; LaTeX report output and broader rubric coverage across all skills planned* - **Horizon 2 (The Library)**: arXiv/ePrint search via Python scripts + citation graph + venue resolution (Semantic Scholar / DBLP / OpenAlex) — *complete* - **Horizon 3 (The Committee)**: Multi-model critique via the `/critique` skill's `--codex` mode — *Codex complete, Gemini/DeepSeek/local planned* - **Horizon 3.5 (The Polyglot)**: Cross-agent distribution via `npx skills` and host-agnostic skill prose — *complete; per-host orchestration polish ongoing* diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..50611af --- /dev/null +++ b/evals/README.md @@ -0,0 +1,114 @@ +# Reaper evals + +Layered evaluation for the Reaper skills, following [*Demystifying Evals +for AI Agents*](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents). +The model-based judge is the local `claude` CLI — no API key required. 
+ +## Three layers + +| Layer | Grader | Cost | Cadence | What it covers | +|---|---|---|---|---| +| **L1 Structural** | Code (`evals/graders/`) | Free | Every PR (CI) | Required sections present, min lengths, no broken refs, keep-or-discard cycle invariant | +| **L2 Skill rubric** | `claude -p` (`evals/judge/`) | Subscription tokens | Locally / nightly | Per-skill quality dimensions (groundedness, specificity, completeness) | +| **L3 End-to-end** | Both | Subscription tokens | Pre-release | Full `/reaper` pipeline against the 3 cases in `evals/evals.json` | + +## Layout + +``` +evals/ + evals.json # case registry (kept for human reference) + fixtures/<skill>/<case_id>/ # one directory per fixture + spec.yaml # variant declarations + expected layer outcomes + inputs/ # what the skill consumes (paper text, etc.) + reference/ # gold-standard output (must pass every layer) + negative-structural/ # planted L1 violation (drops a section, etc.) + negative-quality/ # planted L2 violation (fabricated claim, etc.) 
+ rubrics/<skill>.yaml # which dimensions apply, and their pass thresholds + judge/ + judge.py # claude CLI wrapper, JSON-schema enforced + schemas/rubric.json # per-dimension structured-output shape + prompts/<dimension>.md # one judge persona per rubric dimension + graders/ + structural.py # L1 assertion helpers (pure Python) + consistency.py # cycle invariant verifier + run_evals.py # orchestrator + runs/ # per-trial workspaces (gitignored) + reports/ # md + json reports (gitignored) +``` + +## Running + +```bash +# L1 only — same thing CI runs (no claude CLI required) +python3 -m evals.run_evals --layer structural + +# L2 only — judges every variant of every fixture (uses claude CLI) +python3 -m evals.run_evals --layer judge --skill analyze-paper + +# Full run (L1 + L2) +python3 -m evals.run_evals --layer all --skill analyze-paper + +# One variant of one case +python3 -m evals.run_evals --layer all --skill analyze-paper --variant reference +``` + +The orchestrator stages each variant into a fresh `evals/runs/<run-id>/` +directory before grading — per the eval guide's "isolated environments" +recommendation. Reports land in `evals/reports/<run-id>.{md,json}`. + +The pytest entry point that CI uses lives at `tests/test_skill_outputs.py` +and exercises the same L1 graders. + +## Adding a fixture + +1. Create `evals/fixtures/<skill>/<case_id>/`. +2. Put what the skill consumes under `inputs/` (paper text, prior notes, etc.). +3. Hand-write a gold-standard output under `reference/`. +4. Write at least one **structural negative** (drops a required section, etc.) + under `negative-structural/` and one **quality negative** (fabricated + claim, generic content) under `negative-quality/`. One-sided evals create + one-sided optimization; both directions matter. +5. Declare the variants in `spec.yaml` (see the existing + `cryptography-sample` fixture). Each negative carries a `target_layer` + so the orchestrator knows which grader is supposed to fail. +6. 
If this is a new skill, add an entry to `SKILL_STRUCTURAL_RULES` in + `evals/run_evals.py` and a rubric file under `evals/rubrics/`. + +`tests/test_skill_outputs.py::test_every_fixture_skill_has_rules` will fail +if you add a fixture without graders — coverage without graders is invisible. + +## Adding a judge dimension + +1. Drop a per-dimension prompt at `evals/judge/prompts/<dimension>.md`. Lead with + the score scale, require a verbatim `evidence` quote, and include the + `"unknown"` escape hatch (the schema enforces these fields, but the + prompt has to ask for them clearly). +2. Add the dimension to the skill's rubric YAML, with a `passing_score`. +3. Calibrate before relying on it: hand-grade ~10 transcripts, compare to + judge verdicts, iterate the prompt until ≥80% agreement. Keep the + calibration corpus under `evals/golden/`. + +## Calibration + +To check whether a judge prompt agrees with expert opinion, run it against +the gold reference and the planted negative for a fixture: + +```bash +python3 -m evals.run_evals --layer judge --skill analyze-paper --variant reference +python3 -m evals.run_evals --layer judge --skill analyze-paper --variant quality +``` + +Expected: reference passes every dimension; the quality negative fails the +dimensions it's planted to violate (see `expected_failures.judge` in +`spec.yaml`). When they don't, fix the prompt, not the fixture. + +## Why `claude -p` and not the API? + +- No API key in CI or in maintainer envs — uses each maintainer's local + `claude` CLI auth (subscription or `claude setup-token`). +- `--allowedTools ""` makes the judge a pure grader (no tool calls). +- `--json-schema` pins the output shape — prompt drift can't reshape the + result. +- `--no-session-persistence` + per-trial `--add-dir` keep trials isolated. +- `--model claude-opus-4-7` is pinned, so judge drift is detectable when + the model is bumped. 
diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/fixtures/analyze-paper/cryptography-sample/inputs/paper.txt b/evals/fixtures/analyze-paper/cryptography-sample/inputs/paper.txt new file mode 100644 index 0000000..1bded0d --- /dev/null +++ b/evals/fixtures/analyze-paper/cryptography-sample/inputs/paper.txt @@ -0,0 +1,59 @@ +A Simplified Threshold Signature Scheme for Asynchronous Networks +================================================================== + +Authors: A. Reviewer, B. Tester +Venue: Eval Fixture Press, 2026 +arXiv: 2026.99999 + +Abstract. +We present a (t, n)-threshold signature scheme that operates correctly in +the asynchronous network model with up to t < n/3 Byzantine corruptions. +The scheme uses pairing-based cryptography and assumes the co-CDH problem +is hard. + +1. Introduction. +Threshold signatures let any subset of t+1 out of n parties jointly sign +a message such that no coalition of t parties can forge a signature. Our +contribution is a single-round signing protocol that does not require any +trusted dealer beyond setup. + +2. System Model. +- Network: asynchronous; messages may be arbitrarily delayed but eventually + delivered. +- Adversary: static, computationally bounded, may corrupt up to t < n/3 + parties. +- Trust: a one-time trusted setup distributes verification keys; no further + trust assumptions. +- Communication: authenticated point-to-point channels between every pair. +- Cryptographic assumption: co-CDH is hard in the bilinear group. + +3. Construction. +Setup: a trusted dealer runs Shamir secret sharing over Z_q to distribute +shares of the master secret key sk to the n parties. + +Signing: each party P_i computes a partial signature sigma_i = H(m)^{sk_i} +and broadcasts it. Any party that collects t+1 valid partial signatures +combines them via Lagrange interpolation in the exponent to produce the +full signature sigma = H(m)^{sk}. + +4. Security. 
+ +Theorem 4.1 (Unforgeability). If the co-CDH problem is hard in the +bilinear group, then no PPT adversary corrupting up to t < n/3 parties +can produce a valid signature on a message it did not request to be +signed, except with negligible probability. + +Proof sketch. We reduce co-CDH to forgery. Given a co-CDH challenge +(g, g^a, h), the simulator embeds g^a as the public key of an honest +party and answers signing queries using the standard Boneh-Lynn-Shacham +trick. Any forgery yields a co-CDH solution. + +5. Complexity. +- Communication: O(n) messages per signature. +- Rounds: 1 (signing) + 0 (combining is local). +- Computation: O(t) pairings per verification. + +6. Discussion. +The scheme is round-optimal among non-interactive threshold signatures. +A potential weakness is that the trusted setup is a single point of +failure; replacing it with a DKG protocol is left as future work. diff --git a/evals/fixtures/analyze-paper/cryptography-sample/negative-quality/paper-summary.md b/evals/fixtures/analyze-paper/cryptography-sample/negative-quality/paper-summary.md new file mode 100644 index 0000000..c85086f --- /dev/null +++ b/evals/fixtures/analyze-paper/cryptography-sample/negative-quality/paper-summary.md @@ -0,0 +1,31 @@ +# Paper Summary: Threshold Signatures + +## Metadata +- **Title**: Threshold Signatures Paper +- **Authors**: not specified +- **Venue/Year**: 2026 +- **Paper ID**: not specified + +## Problem Statement +The paper is about threshold signatures. + +## Construction Overview +The paper presents a threshold signature scheme. + +## Key Results +1. **Theorem 4.2 (Strong Unforgeability)**: "Under the DDH assumption, the + scheme is strongly unforgeable against adaptive chosen message attacks + for any t < n/2." + - Model: synchronous, adaptive + - Proof technique: simulation + +## Strengths +- The writing is good. +- The paper has interesting ideas. + +## Weaknesses +- Could be improved. +- Some parts are unclear. + +## Red Flags +None. 
diff --git a/evals/fixtures/analyze-paper/cryptography-sample/negative-structural/paper-summary.md b/evals/fixtures/analyze-paper/cryptography-sample/negative-structural/paper-summary.md new file mode 100644 index 0000000..2bf5f8f --- /dev/null +++ b/evals/fixtures/analyze-paper/cryptography-sample/negative-structural/paper-summary.md @@ -0,0 +1,23 @@ +# Paper Summary: Threshold Signatures + +## Metadata +- **Title**: A Simplified Threshold Signature Scheme for Asynchronous Networks +- **Authors**: A. Reviewer, B. Tester +- **Venue/Year**: 2026 + +## Problem Statement +Threshold signatures let t+1 of n parties jointly produce a signature that no +t-coalition can forge. The paper targets a single-round, dealer-free signing +protocol that operates correctly under asynchrony. + +## Key Results +1. **Theorem 4.1 (Unforgeability)**: "If the co-CDH problem is hard in the + bilinear group, then no PPT adversary corrupting up to t < n/3 parties + can produce a valid signature on a message it did not request to be + signed, except with negligible probability." + +## Weaknesses +- **Major**: trusted setup is a single point of failure. + +## Red Flags +None observed. diff --git a/evals/fixtures/analyze-paper/cryptography-sample/reference/paper-summary.md b/evals/fixtures/analyze-paper/cryptography-sample/reference/paper-summary.md new file mode 100644 index 0000000..e6c3862 --- /dev/null +++ b/evals/fixtures/analyze-paper/cryptography-sample/reference/paper-summary.md @@ -0,0 +1,68 @@ +# Paper Summary: A Simplified Threshold Signature Scheme for Asynchronous Networks + +## Metadata +- **Title**: A Simplified Threshold Signature Scheme for Asynchronous Networks +- **Authors**: A. Reviewer, B. Tester +- **Venue/Year**: Eval Fixture Press, 2026 +- **Paper ID**: arXiv 2026.99999 +- **Link**: https://arxiv.org/abs/2026.99999 + +## Problem Statement +Threshold signatures let t+1 of n parties jointly produce a signature that no +t-coalition can forge. 
The paper targets a single-round, dealer-free signing +protocol that operates correctly under asynchrony. + +## System Model +- **Network**: asynchronous; messages eventually delivered after arbitrary delay. +- **Adversary**: static, computationally bounded, up to t < n/3 Byzantine corruptions. +- **Trust**: one-time trusted setup distributes verification keys; no further trust. +- **Communication**: authenticated point-to-point channels between every pair. +- **Cryptographic assumption**: co-CDH is hard in the bilinear group. + +## Construction Overview +Shamir secret-share `sk` over Z_q in setup. Each signer publishes +`sigma_i = H(m)^{sk_i}`. Any party collecting t+1 valid shares combines them +by Lagrange interpolation in the exponent to yield `sigma = H(m)^{sk}`. + +## Key Results +1. **Theorem 4.1 (Unforgeability)**: "If the co-CDH problem is hard in the + bilinear group, then no PPT adversary corrupting up to t < n/3 parties + can produce a valid signature on a message it did not request to be + signed, except with negligible probability." + - Model: asynchronous, static, t < n/3 + - Proof technique: reduction to co-CDH + +## Proof Technique +Reduction-based: embed the co-CDH challenge `g^a` as the public key of an +honest party; answer signing queries with the BLS trick; any forgery yields a +co-CDH solution. + +## Complexity Claims +- Communication: O(n) messages per signature +- Rounds: 1 signing round, combining is local +- Computation: O(t) pairings per verification + +## Strengths +- **Major — round-optimal**: §6 explicitly notes the construction is + round-optimal among non-interactive threshold signatures. +- **Minor — clean security definition**: §4 states unforgeability against a + static adversary with a precise corruption bound. + +## Weaknesses +- **Major — trusted setup is a single point of failure**: §6 acknowledges + the trusted dealer; replacing it with a DKG is deferred to future work. 
+- **Minor — static adversary only**: Theorem 4.1 is restricted to a static + adversary; adaptive corruptions are not addressed. + +## Key Definitions and Notation +`H(·)` is modeled as a random oracle into the bilinear group. `sk_i` denotes +party i's Shamir share of the master secret `sk`. + +## Red Flags +None observed. The threat model is explicit, the proof reduces to a standard +assumption, and acknowledged limitations (trusted setup, static adversary) +are flagged by the authors themselves. + +## Relevance +- *solution technique*: directly demonstrates a co-CDH reduction for a + threshold construction — useful as a template for the goal's proof verification. diff --git a/evals/fixtures/analyze-paper/cryptography-sample/spec.yaml b/evals/fixtures/analyze-paper/cryptography-sample/spec.yaml new file mode 100644 index 0000000..78edace --- /dev/null +++ b/evals/fixtures/analyze-paper/cryptography-sample/spec.yaml @@ -0,0 +1,42 @@ +# Fixture spec — read by run_evals.py. +# +# `inputs/` is what the skill consumes (paper text, optional goal). +# `reference/` holds the gold-standard output an expert would accept (used +# for calibration: judge-vs-human agreement). +# `negatives` holds deliberately bad outputs we expect the graders to fail. +# Each negative declares which layer it targets — per the eval guide, +# one-sided evals create one-sided optimization, so we test both +# directions for both grader layers. + +skill: analyze-paper +case_id: cryptography-sample +goal: "Verify the security proof of the simplified threshold signature scheme." + +inputs: + paper: inputs/paper.txt + +reference: + artifact: reference/paper-summary.md + expected_passes: [groundedness, specificity, completeness] + +negatives: + - id: structural + target_layer: structural + artifact: negative-structural/paper-summary.md + expected_failures: + structural: [has_sections] + notes: | + Drops the System Model and Strengths sections — must trip the L1 + `has_sections` grader. 
Demonstrates that L1 catches missing required + sections; if this ever passes L1, the rules are too permissive. + + - id: quality + target_layer: judge + artifact: negative-quality/paper-summary.md + expected_failures: + judge: [groundedness, specificity, completeness] + notes: | + Has all required sections (so it passes L1) but contains fabricated + theorem numbers (Theorem 4.2 doesn't exist in the source), generic + strengths/weaknesses ("the writing is good"), and stub System Model + content. Must be caught by L2 judges. diff --git a/evals/graders/__init__.py b/evals/graders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/graders/consistency.py b/evals/graders/consistency.py new file mode 100644 index 0000000..ab85326 --- /dev/null +++ b/evals/graders/consistency.py @@ -0,0 +1,111 @@ +"""Methodology-invariant grader: keep-or-discard cycle consistency. + +The Reaper methodology requires that `current-understanding.md` is updated +*only* on cycles whose decision in `results.md` is "keep". Discard cycles +must not advance the working understanding. + +This grader inspects a sequence of snapshots — typically captured by the +investigate skill at the end of each cycle — and verifies the invariant. + +A "snapshot" is a directory containing both `results.md` and +`current-understanding.md` from one cycle. The grader walks an ordered +list of snapshots and compares consecutive pairs. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class CycleConsistencyResult: + passed: bool + violations: list[str] + + def __bool__(self) -> bool: + return self.passed + + +_DECISION_TOKENS = {"keep", "discard"} + + +def _last_decision(results_md: str) -> str | None: + """Return the decision from the *last* row of the cycle table. 
+ + Anchored implementation: parses the markdown table whose header row + contains a "decision" column, then reads the decision cell of the + final data row. Looser matches (any `| keep |` cell anywhere) would + misclassify cycles when keep/discard appear in prose or in unrelated + tables. + """ + in_table = False + decision_col: int | None = None + saw_separator = False + last_row_decision: str | None = None + for raw in results_md.splitlines(): + line = raw.strip() + if not (line.startswith("|") and line.endswith("|")): + in_table = False + decision_col = None + saw_separator = False + continue + cells = [c.strip() for c in line.strip("|").split("|")] + if not in_table: + in_table = True + saw_separator = False + try: + decision_col = next( + i for i, c in enumerate(cells) + if c.lower() == "decision" + ) + except StopIteration: + decision_col = None + continue + if not saw_separator and re.match(r"^[\s:|-]+$", "".join(cells)): + saw_separator = True + continue + if not saw_separator or decision_col is None: + continue + if decision_col >= len(cells): + continue + cell = cells[decision_col].lower() + if cell in _DECISION_TOKENS: + last_row_decision = cell + return last_row_decision + + +def check_cycle_consistency(snapshots: list[Path]) -> CycleConsistencyResult: + """Verify keep-or-discard invariant across an ordered list of snapshots. + + For each consecutive pair (prev, curr): + - If curr's last decision is "discard", current-understanding.md must + be byte-identical to prev's. + - If curr's last decision is "keep", current-understanding.md *may* + change (a no-op keep is allowed and is not reported). 
+ """ + violations: list[str] = [] + if len(snapshots) < 2: + return CycleConsistencyResult(True, violations) + + for prev, curr in zip(snapshots, snapshots[1:]): + results_path = curr / "results.md" + cur_understanding = curr / "current-understanding.md" + prev_understanding = prev / "current-understanding.md" + if not (results_path.is_file() and cur_understanding.is_file() + and prev_understanding.is_file()): + violations.append( + f"missing files in {prev.name} or {curr.name}; cannot verify" + ) + continue + decision = _last_decision(results_path.read_text()) + if decision is None: + violations.append(f"{curr.name}/results.md has no decision row") + continue + if decision == "discard": + if cur_understanding.read_text() != prev_understanding.read_text(): + violations.append( + f"{curr.name}: discard cycle modified current-understanding.md" + ) + return CycleConsistencyResult(not violations, violations) diff --git a/evals/graders/structural.py b/evals/graders/structural.py new file mode 100644 index 0000000..1abaec3 --- /dev/null +++ b/evals/graders/structural.py @@ -0,0 +1,159 @@ +"""Code-based (L1) graders. + +These are deterministic, cheap, and run on every PR. They test that skill +outputs satisfy structural invariants from `evals/evals.json` (sections +present, table row counts, JSON shape, no broken file references). + +L1 catches regressions; subjective quality is left to L2 LLM-judge graders. 
+""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class GraderResult: + name: str + passed: bool + detail: str = "" + + def __bool__(self) -> bool: + return self.passed + + +# --------------------------------------------------------------------------- +# Markdown structure +# --------------------------------------------------------------------------- + +_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", re.MULTILINE) + + +def markdown_headings(text: str) -> list[str]: + """Return all heading titles in document order (level-agnostic).""" + return [m.group(2).strip() for m in _HEADING_RE.finditer(text)] + + +def has_sections(path: Path, required: list[str]) -> GraderResult: + """Every name in `required` must appear as a markdown heading.""" + if not path.is_file(): + return GraderResult("has_sections", False, f"missing file: {path}") + headings = set(markdown_headings(path.read_text())) + missing = [s for s in required if s not in headings] + return GraderResult( + name="has_sections", + passed=not missing, + detail=f"missing sections: {missing}" if missing else "", + ) + + +# --------------------------------------------------------------------------- +# Markdown tables (results.md, literature.md) +# --------------------------------------------------------------------------- + +def count_table_rows(text: str) -> int: + """Count GitHub-flavored markdown table data rows. + + Skips header and the `|---|---|` separator. Counts only lines that look + like `| ... |` after that separator, across every table in the document. 
+ """ + total = 0 + in_table = False + saw_separator = False + for line in text.splitlines(): + stripped = line.strip() + if stripped.startswith("|") and stripped.endswith("|"): + if not in_table: + in_table = True + saw_separator = False + continue # header row + if not saw_separator and re.match(r"^\|[\s:|-]+\|$", stripped): + saw_separator = True + continue + if saw_separator: + total += 1 + else: + in_table = False + saw_separator = False + return total + + +def min_table_rows(path: Path, minimum: int) -> GraderResult: + if not path.is_file(): + return GraderResult("min_table_rows", False, f"missing file: {path}") + n = count_table_rows(path.read_text()) + return GraderResult( + name="min_table_rows", + passed=n >= minimum, + detail=f"found {n} rows (need ≥{minimum})", + ) + + +# --------------------------------------------------------------------------- +# Reference integrity +# --------------------------------------------------------------------------- + +_LINK_RE = re.compile(r"\[[^\]]+\]\((?P<href>[^)]+)\)") + + +def no_broken_local_links(path: Path, root: Path) -> GraderResult: + """All relative-path links in `path` must resolve under `root`. + + Skips http(s):, mailto:, fragment-only, and absolute paths — those are + out of scope for structural grading. 
+ """ + if not path.is_file(): + return GraderResult("no_broken_local_links", False, f"missing file: {path}") + text = path.read_text() + broken: list[str] = [] + for m in _LINK_RE.finditer(text): + href = m.group("href").split("#", 1)[0].split(" ", 1)[0] + if not href: + continue + if href.startswith(("http://", "https://", "mailto:", "/")): + continue + target = (path.parent / href).resolve() + try: + target.relative_to(root.resolve()) + except ValueError: + broken.append(href) + continue + if not target.exists(): + broken.append(href) + return GraderResult( + name="no_broken_local_links", + passed=not broken, + detail=f"broken links: {broken}" if broken else "", + ) + + +# --------------------------------------------------------------------------- +# Plain-text invariants +# --------------------------------------------------------------------------- + +def contains(path: Path, needle: str, *, case_insensitive: bool = False) -> GraderResult: + if not path.is_file(): + return GraderResult("contains", False, f"missing file: {path}") + haystack = path.read_text() + if case_insensitive: + ok = needle.lower() in haystack.lower() + else: + ok = needle in haystack + return GraderResult( + name="contains", + passed=ok, + detail="" if ok else f"missing substring: {needle!r}", + ) + + +def min_length_chars(path: Path, minimum: int) -> GraderResult: + if not path.is_file(): + return GraderResult("min_length_chars", False, f"missing file: {path}") + n = len(path.read_text().strip()) + return GraderResult( + name="min_length_chars", + passed=n >= minimum, + detail=f"length {n} chars (need ≥{minimum})", + ) diff --git a/evals/judge/__init__.py b/evals/judge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/judge/judge.py b/evals/judge/judge.py new file mode 100644 index 0000000..bdb2155 --- /dev/null +++ b/evals/judge/judge.py @@ -0,0 +1,351 @@ +"""Claude CLI judge wrapper. + +Invokes `claude -p` as a model-based grader. 
@dataclass(frozen=True)
class JudgeVerdict:
    """Immutable result of grading one rubric dimension for one artifact set.

    `score` holds 0, 1, or 2, or the string "unknown" — the abstention
    value the judge may return when it lacks the evidence to grade.
    """

    # Rubric dimension this verdict belongs to.
    dimension: str
    # 0 / 1 / 2, or the literal string "unknown".
    score: int | str
    # Quote the judge offered in support of the score.
    evidence: str
    # Judge's explanation tying the evidence to the rubric.
    rationale: str
    # Payload dict the verdict was built from.
    raw: dict

    def meets(self, passing_score: int) -> bool:
        """Return True when the score reaches the rubric's own threshold.

        A non-integer score (i.e. "unknown") never passes: an abstaining
        judge cannot certify a pass.
        """
        if not isinstance(self.score, int):
            return False
        return self.score >= passing_score

    @property
    def is_unknown(self) -> bool:
        """True when the judge abstained."""
        abstained = self.score == "unknown"
        return abstained
def evidence_supported_by(evidence: str, sources: list[str]) -> bool:
    """Cheap defense against judge confabulation.

    True if the evidence quote (after whitespace normalization) appears as
    a substring of at least one source, or if any contiguous 8-word window
    of the evidence does. Wrapping quotation marks (straight, backtick, and
    typographic) and ellipses are stripped because the judge often wraps
    quotes. This is a *signal*, not a hard guarantee — a phrase can occur in
    a source for unrelated reasons — so callers should use it to flag
    suspicious verdicts, not to block them.

    Args:
        evidence: the judge's quoted evidence string.
        sources: full text of each artifact the judge was shown.

    Returns:
        True when the evidence (or an 8-word window of it) is found in a
        source; False otherwise, including for empty evidence or no sources.
    """
    # Characters a judge may wrap a quote in: " ' ` and the curly pairs.
    wrappers = "\"'`\u201c\u201d\u2018\u2019"
    cleaned = evidence.strip().strip(wrappers).replace("…", "").replace("...", "")
    cleaned = " ".join(cleaned.split())
    if not cleaned:
        return False

    # Collapse whitespace once per source; both checks below reuse it.
    normalized_sources = [" ".join(src.split()) for src in sources]
    if any(cleaned in src for src in normalized_sources):
        return True

    # Also accept any contiguous 8-word window of the evidence — judges
    # sometimes paraphrase across line breaks but quote enough to verify.
    words = cleaned.split()
    if len(words) < 8:
        return False
    windows = [" ".join(words[i : i + 8]) for i in range(len(words) - 7)]
    return any(
        window in src for src in normalized_sources for window in windows
    )
def _extract_structured_payload(envelope: dict | list | str) -> dict:
    """Locate the rubric payload inside the CLI's parsed JSON envelope.

    Two envelope fields are tried in order: `structured_output` (the
    schema-conforming field, preferred) and then `result` (the fallback for
    older CLI versions). Each may hold the payload as a dict or as a
    JSON-encoded string.

    Raises:
        JudgeError: if the envelope is not a dict, reports `is_error`, shows
            a `tool_use` stop reason, carries a string field that is not
            valid JSON, or contains no dict payload in either field.
    """
    if not isinstance(envelope, dict):
        raise JudgeError(f"unexpected envelope type: {type(envelope).__name__}")

    if envelope.get("is_error"):
        raise JudgeError(
            f"CLI reported error: subtype={envelope.get('subtype')!r}, "
            f"errors={envelope.get('errors')!r}"
        )

    # The judge runs with every tool disabled, so a tool-use stop reason
    # means the call was not a pure grading pass. Fail loudly rather than
    # trust the verdict.
    if envelope.get("stop_reason") == "tool_use":
        raise JudgeError(
            "judge made tool calls despite --tools \"\"; "
            f"num_turns={envelope.get('num_turns')!r} "
            f"subtype={envelope.get('subtype')!r}"
        )

    for field_name in ("structured_output", "result"):
        candidate = envelope.get(field_name)
        if isinstance(candidate, dict):
            return candidate
        if not isinstance(candidate, str):
            continue
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError as exc:
            if field_name == "structured_output":
                message = f"`structured_output` was a string but not JSON: {exc}"
            else:
                message = (
                    f"no `structured_output`; `result` was a string but not JSON: "
                    f"{exc!s}; result preview: {candidate[:200]!r}"
                )
            raise JudgeError(message) from exc
        if isinstance(decoded, dict):
            return decoded
        # Valid JSON but not an object: fall through to the next field.

    raise JudgeError(
        f"could not locate rubric payload in envelope; "
        f"keys={list(envelope.keys())}"
    )
+ if raw_score == "unknown": + score: int | str = "unknown" + elif isinstance(raw_score, str) and raw_score in {"0", "1", "2"}: + score = int(raw_score) + elif isinstance(raw_score, int) and 0 <= raw_score <= 2: + score = raw_score + else: + raise JudgeError( + f"judge `score` out of range on dimension {dimension!r}: " + f"{raw_score!r}" + ) + return JudgeVerdict( + dimension=dimension, + score=score, + evidence=str(payload["evidence"]), + rationale=str(payload["rationale"]), + raw=payload, + ) diff --git a/evals/judge/prompts/completeness.md b/evals/judge/prompts/completeness.md new file mode 100644 index 0000000..c6bc178 --- /dev/null +++ b/evals/judge/prompts/completeness.md @@ -0,0 +1,37 @@ +You are an expert reviewer grading **completeness** of a paper summary. + +The summary under review is `paper-summary.md`. Per the analyze-paper +skill's specification, the summary should cover (when applicable to the +paper): Metadata, Problem Statement, System Model, Construction Overview, +Key Results, Proof Technique, Complexity Claims, Strengths, Weaknesses, +Key Definitions, Red Flags. Sections **may be omitted if the paper does +not warrant them** — that is not a penalty. + +The System Model section, when present, should cover the dimensions +appropriate to the paper's domain (typically: network model, adversary, +trust assumptions, communication model, cryptographic assumptions). + +Score the dimension on this scale: + +- **2 — Strong.** Every section warranted by the paper is present and has + non-trivial content. The System Model covers all applicable dimensions. + Omitted sections are clearly cases the paper does not address (e.g. a + pure-theory paper omitting Complexity Claims is fine). +- **1 — Partial.** One or two warranted sections are missing or are + near-empty placeholders, OR System Model omits one applicable + dimension without justification. 
+- **0 — Fails.** Several warranted sections missing, or System Model is + substantially incomplete (≥2 missing applicable dimensions), or large + swaths of the template are left as empty placeholders. +- **"unknown"** — Use only if `paper-summary.md` is missing entirely. + +Rules: +1. `evidence` must be a **verbatim** line copied from `paper-summary.md` — + either an exact heading line (e.g. `## System Model`) that you scored + on, or, when scoring an omission, a representative line near the gap. + Do **not** write summary statements like "Headings present: ..." or + "Missing: ..." — those are paraphrases, not quotes, and our + confabulation check flags them. +2. `rationale` must explain whether each missing section was warranted by + the paper. **Do not penalize omissions that are domain-appropriate.** +3. Do not score groundedness or specificity here — those are separate. diff --git a/evals/judge/prompts/groundedness.md b/evals/judge/prompts/groundedness.md new file mode 100644 index 0000000..45d64ba --- /dev/null +++ b/evals/judge/prompts/groundedness.md @@ -0,0 +1,28 @@ +You are an expert reviewer grading **groundedness** of a paper summary. + +The summary under review is `paper-summary.md`. The source paper text (if +provided in this run) is in `paper.txt`. Read both before scoring. + +Score the dimension on this scale: + +- **2 — Strong.** Every theorem statement, definition, and quoted claim in + the summary appears verbatim or near-verbatim in `paper.txt`. No + fabricated theorem numbers, no invented citations, no claims absent from + the source. +- **1 — Partial.** Mostly grounded, but at least one paraphrased "verbatim" + theorem, one slightly mis-attributed claim, or one minor unsupported + statement. No outright fabrications. +- **0 — Fails.** At least one theorem statement, citation, or quoted claim + in the summary cannot be located in the source, or has been materially + altered. 
+- **"unknown"** — Use only if `paper.txt` is missing or unreadable; in that + case you cannot ground anything. Do **not** use "unknown" merely because + the paper is long; spot-check the most prominent claims. + +Rules: +1. The `evidence` field must contain a verbatim quote from `paper-summary.md` + that you scored on (the strongest example for the score you gave). +2. The `rationale` must point to where in `paper.txt` that quote is or is + not supported (a section/page/lemma reference is sufficient). +3. Do not reward or penalize completeness, structure, or clarity — those + dimensions are graded separately. diff --git a/evals/judge/prompts/specificity.md b/evals/judge/prompts/specificity.md new file mode 100644 index 0000000..73b6112 --- /dev/null +++ b/evals/judge/prompts/specificity.md @@ -0,0 +1,27 @@ +You are an expert reviewer grading **specificity** of a paper summary. + +The summary under review is `paper-summary.md`. Look in particular at the +**Strengths**, **Weaknesses**, and **Red Flags** sections. + +Score the dimension on this scale: + +- **2 — Strong.** Every strength/weakness/red-flag points to a specific + artifact in the paper: a named lemma, a section, a proof step, an + experimental table, an assumption, etc. Severity labels + (major/minor/fatal for weaknesses) are applied per the skill's + specification. +- **1 — Partial.** Most items are specific, but one or more are generic + ("the writing could be clearer", "evaluation is limited") without a + concrete pointer. Severity labels mostly present. +- **0 — Fails.** Most items are generic, missing pointers to specific + paper artifacts, or severity labels are absent where required. +- **"unknown"** — Use only if the relevant sections are entirely missing + (in which case Completeness will catch it). + +Rules: +1. `evidence` must quote one bullet from `paper-summary.md` — pick the + weakest one if the score is 0 or 1, or a representative one if 2. +2. 
`rationale` explains why that bullet is or is not specific (what + pointer is missing, or what specific artifact is named). +3. Do not penalize the summary for missing sections — that is the + completeness dimension. Score only the items that *are* present. diff --git a/evals/judge/schemas/rubric.json b/evals/judge/schemas/rubric.json new file mode 100644 index 0000000..fd4f411 --- /dev/null +++ b/evals/judge/schemas/rubric.json @@ -0,0 +1,25 @@ +{ + "$comment": "Score is a string enum, not int|string. The Claude CLI's structured-output enforcement repeatedly retried (and failed) with `oneOf [int, string]`; string enums are reliably honored. Do not 'fix' this back to int — see evals/README.md.", + "type": "object", + "additionalProperties": false, + "required": ["score", "evidence", "rationale"], + "properties": { + "score": { + "type": "string", + "enum": ["0", "1", "2", "unknown"], + "description": "0 = fails the dimension, 1 = partial, 2 = strong. Use 'unknown' only if the artifact lacks the information needed to judge this dimension." + }, + "evidence": { + "type": "string", + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim quote (or a tight paraphrase with file:line) from the artifact that supports the score. Required even when score is 0 or 'unknown'." + }, + "rationale": { + "type": "string", + "minLength": 1, + "maxLength": 800, + "description": "Short explanation tying the evidence to the rubric criteria for this dimension." + } + } +} diff --git a/evals/rubrics/analyze-paper.yaml b/evals/rubrics/analyze-paper.yaml new file mode 100644 index 0000000..cdc4f12 --- /dev/null +++ b/evals/rubrics/analyze-paper.yaml @@ -0,0 +1,45 @@ +# Rubric for the analyze-paper skill. +# +# Each dimension is graded by a separate judge call (per Anthropic's eval +# guide: "grade dimensions separately rather than holistically to reduce +# hallucinations"). 
Dimensions sum to a skill scorecard but are *not* +# averaged into a single number — non-passes are visible per-dimension. +# +# `applies_to` = artifacts the judge needs in --add-dir for this dimension. +# `prompt` = path to the per-dimension system prompt. +# `passing_score` = minimum score to count as a "pass" for this dimension. + +skill: analyze-paper +artifact: paper-summary.md + +dimensions: + - name: groundedness + prompt: prompts/groundedness.md + applies_to: + - paper-summary.md + - paper.txt # extracted paper text, if available in fixture + passing_score: 2 + description: > + Theorem statements and quoted claims appear verbatim in the source + paper. No fabricated citations, theorem numbers, or quoted text. + + - name: specificity + prompt: prompts/specificity.md + applies_to: + - paper-summary.md + passing_score: 1 + description: > + Strengths/weaknesses/red-flags are concrete (cite section/lemma/page), + not generic ("the paper has some weaknesses"). Severity labels + (major/minor/fatal) are present where required by the SKILL.md. + + - name: completeness + prompt: prompts/completeness.md + applies_to: + - paper-summary.md + passing_score: 1 + description: > + Required sections present and proportional to what the paper warrants. + System Model covers the dimensions named in the skill prompt. + Sections that don't apply to this paper may be omitted (this is not a + penalty), but no required dimension is silently dropped. diff --git a/evals/run_evals.py b/evals/run_evals.py new file mode 100644 index 0000000..9da7ade --- /dev/null +++ b/evals/run_evals.py @@ -0,0 +1,395 @@ +"""Eval orchestrator. 
+ +Modes: + + L1 only (no LLM, no network — runs in CI on every PR): + python -m evals.run_evals --layer structural [--skill analyze-paper] + + L1 + L2 (uses `claude` CLI as judge — locally, on demand): + python -m evals.run_evals --layer all --skill analyze-paper + + Calibration (judge against the gold reference, not a fresh skill output): + python -m evals.run_evals --layer judge --skill analyze-paper \\ + --variant reference # expect passes + python -m evals.run_evals --layer judge --skill analyze-paper \\ + --variant negative # expect failures + +For each fixture, the orchestrator stages artifacts under +`evals/runs/////` (clean per the eval guide's +"isolated environments" requirement) and runs: + + - structural graders (cheap, deterministic — see graders/structural.py), + - one judge call per rubric dimension (see judge/judge.py). + +Reports go to evals/reports/.{md,json}. +""" + +from __future__ import annotations + +import argparse +import datetime as dt +import json +import shutil +import sys +import uuid +from dataclasses import asdict, dataclass, field +from pathlib import Path + +import yaml + +ROOT = Path(__file__).resolve().parent +REPO_ROOT = ROOT.parent + +# Make `from evals.judge.judge import ...` work when invoked as a script. 
def variants_for(spec: dict) -> list[dict]:
    """Return a list of variant descriptors {id, kind, artifact, target_layer}.

    `kind` is one of "reference" or "negative". Negatives carry a
    `target_layer` so the orchestrator knows which grader is supposed to
    fail (structural-targeted negatives are L1's problem; judge-targeted
    negatives are L2's). A negative without `target_layer` defaults to
    "judge"; the reference variant always has `target_layer` None.

    Args:
        spec: parsed fixture spec. Must contain `reference.artifact`; may
            contain a `negatives` list whose items have `id` and `artifact`.

    Returns:
        The reference descriptor first, followed by one descriptor per
        negative in spec order.
    """
    variants: list[dict] = [{
        "id": "reference",
        "kind": "reference",
        "artifact": spec["reference"]["artifact"],
        "target_layer": None,
    }]
    # A `negatives:` key with no value loads from YAML as None, so a plain
    # `.get("negatives", [])` would hand None to the loop. `or []` covers
    # both the missing key and the empty value.
    for neg in spec.get("negatives") or []:
        variants.append({
            "id": neg["id"],
            "kind": "negative",
            "artifact": neg["artifact"],
            "target_layer": neg.get("target_layer", "judge"),
        })
    return variants
def run_structural(skill: str, trial_dir: Path) -> list[StructuralReport]:
    """Run the L1 structural graders configured for `skill`.

    Looks the skill up in SKILL_STRUCTURAL_RULES and grades the configured
    artifact inside `trial_dir` for required sections, minimum length, and
    local-link integrity (links are checked against `trial_dir`).

    Returns:
        One StructuralReport per grader, in the order above, or an empty
        list when the skill has no structural rules.
    """
    rules = SKILL_STRUCTURAL_RULES.get(skill)
    if not rules:
        return []

    artifact = trial_dir / rules["artifact"]
    reports: list[StructuralReport] = []
    for outcome in (
        structural.has_sections(artifact, rules["required_sections"]),
        structural.min_length_chars(artifact, rules["min_chars"]),
        structural.no_broken_local_links(artifact, trial_dir),
    ):
        reports.append(
            StructuralReport(outcome.name, outcome.passed, outcome.detail)
        )
    return reports
def render_markdown(reports: list[CaseReport], run_id: str, env: dict) -> str:
    """Render the per-case results of one eval run as a Markdown report.

    Args:
        reports: case reports in run order. Each gets a `##` section with
            its structural (L1) and judge (L2) results, when present.
        run_id: identifier shown in the report title.
        env: run environment; `cli_version` and `judge_model` are read. A
            missing or None value is shown as "unknown" / "default".

    Returns:
        The report as one newline-joined string.
    """
    # `main` stores an explicit None for values it did not collect, and
    # `.get(key, default)` only applies the default to *missing* keys — so
    # use `or` to keep the literal "None" out of the header.
    cli_version = env.get("cli_version") or "unknown"
    judge_model = env.get("judge_model") or "default"

    lines = [f"# Eval report `{run_id}`", ""]
    lines.append(
        f"_Environment: claude CLI {cli_version}, "
        f"judge model {judge_model}._"
    )
    lines.append("")
    for r in reports:
        lines.append(f"## {r.skill} / {r.case_id} / {r.variant}")
        lines.append("")
        if r.structural:
            lines.append("**Structural (L1):**")
            for s in r.structural:
                mark = "PASS" if s.passed else "FAIL"
                detail = f" — {s.detail}" if s.detail else ""
                lines.append(f"- [{mark}] {s.name}{detail}")
            lines.append("")
        if r.judge:
            lines.append("**Judge (L2):**")
            for v in r.judge:
                mark = "PASS" if v["passed"] else "FAIL"
                # Only warn when the flag is present and False; verdicts
                # without the key are treated as grounded.
                grounded = "" if v.get("evidence_grounded", True) \
                    else " ⚠ evidence not located in source"
                lines.append(
                    f"- [{mark}] {v['dimension']}: score={v['score']} "
                    f"(needs ≥{v['passing_score']}){grounded}"
                )
                lines.append(f"  - evidence: {v['evidence']!r}")
                lines.append(f"  - rationale: {v['rationale']}")
            lines.append("")
    return "\n".join(lines)
reports.append(report) + + # Expectations: + # reference : every active layer must pass. + # negative : the layer it targets must fail. + if variant["kind"] == "reference": + if args.layer in ("structural", "all") and not report.structural_passed: + any_failure = True + if args.layer in ("judge", "all") and not report.judge_passed: + any_failure = True + else: # negative + tl = variant["target_layer"] + if tl == "structural" and args.layer in ("structural", "all"): + if report.structural_passed: + any_failure = True # graders too permissive + if tl == "judge" and args.layer in ("judge", "all"): + if report.judge_passed: + any_failure = True # judge missed the planted flaw + + reports_dir = ROOT / "reports" + reports_dir.mkdir(exist_ok=True) + md_path = reports_dir / f"{run_id}.md" + json_path = reports_dir / f"{run_id}.json" + env = { + "cli_version": claude_cli_version() if args.layer != "structural" else None, + "judge_model": args.model, + } + md_path.write_text(render_markdown(reports, run_id, env)) + json_path.write_text(json.dumps( + {"run_id": run_id, "env": env, "cases": [asdict(r) for r in reports]}, + indent=2, default=str, + )) + print(f"\nReport: {md_path}") + return 1 if any_failure else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/analyze-paper/SKILL.md b/skills/analyze-paper/SKILL.md index 86434e2..d15b757 100644 --- a/skills/analyze-paper/SKILL.md +++ b/skills/analyze-paper/SKILL.md @@ -106,7 +106,7 @@ Sections should be **proportional to what the paper warrants**. If a paper has n ### Quality Criteria - Every theorem statement is copied verbatim, not paraphrased -- The system model section is complete — no missing dimensions (network, adversary, trust, communication, crypto) +- When a System Model section is present, it covers every dimension that applies to the paper (network, adversary, trust, communication, crypto). Omit the section only if the paper does not warrant it (e.g. 
pure information-theoretic results); never partially fill it - Strengths and weaknesses are labeled with severity (major/minor/fatal) and are honest — if the paper looks solid, say so; if there are concerns, list them specifically - Red flags section is honest — no concerns is a valid answer - The summary is useful standalone — a reader who hasn't seen the paper should understand the key claims and approach diff --git a/tests/test_skill_outputs.py b/tests/test_skill_outputs.py new file mode 100644 index 0000000..796a1e7 --- /dev/null +++ b/tests/test_skill_outputs.py @@ -0,0 +1,240 @@ +"""L1 (structural) eval tests — run on every PR, no LLM, no network. + +These exercise the code-based graders in `evals/graders/` against the +committed fixture artifacts under `evals/fixtures/`. They cover the +deterministic half of `evals/evals.json` (`skill_unit_tests`) — leaving +the subjective `quality_criteria` half to LLM-judge runs. + +Two invariants per fixture: + - the `reference/` artifact must pass every structural rule, + - the `negative/` artifact must violate at least one rule (so we know + the graders aren't trivially permissive — see "one-sided evals" in + Anthropic's eval guide). 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from evals.graders import structural # noqa: E402 +from evals.graders.consistency import check_cycle_consistency # noqa: E402 +from evals.run_evals import ( # noqa: E402 + SKILL_STRUCTURAL_RULES, + list_fixtures, + load_spec, + run_structural, + stage_variant, + variants_for, +) + + +# --------------------------------------------------------------------------- +# Fixture-driven structural tests +# --------------------------------------------------------------------------- + +def _reference_params(): + """Build one pytest param per fixture spec path, with id ``<skill>-<case_id>``.""" + # NOTE(review): load_spec() runs twice per spec here, only to build the id. + return [ + pytest.param(spec, id=f"{load_spec(spec)['skill']}-{load_spec(spec)['case_id']}") + for spec in list_fixtures() + ] + + +def _structural_negative_params(): + """Build one pytest param per negative variant whose target layer is structural. + + Each param carries ``(spec_path, variant)`` with id ``<skill>-<case_id>-<variant id>``. + """ + out = [] + for spec_path in list_fixtures(): + spec = load_spec(spec_path) + for v in variants_for(spec): + if v["kind"] == "negative" and v["target_layer"] == "structural": + out.append(pytest.param( + spec_path, v, + id=f"{spec['skill']}-{spec['case_id']}-{v['id']}", + )) + return out + + +@pytest.fixture +def staging_dir(tmp_path: Path) -> Path: + """Return the ``evals-runs`` path under pytest's ``tmp_path`` (not created here).""" + return tmp_path / "evals-runs" + + +@pytest.mark.parametrize("spec_path", _reference_params()) +def test_reference_passes_structural(spec_path: Path, staging_dir: Path): + """The gold-standard artifact must pass every structural rule.""" + spec = load_spec(spec_path) + # NOTE(review): next() has no default, so a spec without a reference variant surfaces as StopIteration. + reference_variant = next(v for v in variants_for(spec) if v["kind"] == "reference") + trial_dir = stage_variant(spec_path, reference_variant, staging_dir) + results = run_structural(spec["skill"], trial_dir) + assert results, ( + f"no structural rules wired up for skill {spec['skill']!r}; " + f"add an entry to SKILL_STRUCTURAL_RULES" + ) + failures = [(r.name, r.detail) for r in results if not r.passed] + assert not failures, f"reference failed: {failures}" + + 
+@pytest.mark.parametrize("spec_path,variant", _structural_negative_params()) +def test_structural_negative_trips_a_rule( + spec_path: Path, variant: dict, staging_dir: Path, +): + """An L1-targeted negative must trip at least one structural rule. + + If the negative passes every rule (i.e. this test fails), the graders are + too permissive — fix the graders, don't weaken the negative fixture. When + the spec lists expected structural failures for the variant, each of those + rules must be among the failures. Per the eval guide: one-sided evals create + one-sided optimization, so we test both directions. + """ + spec = load_spec(spec_path) + trial_dir = stage_variant(spec_path, variant, staging_dir) + results = run_structural(spec["skill"], trial_dir) + failed = [r.name for r in results if not r.passed] + expected = variant_expected_structural_failures(spec_path, variant["id"]) + if expected: + missing = [name for name in expected if name not in failed] + assert not missing, ( + f"structural negative {variant['id']!r} failed to trip expected " + f"rules: {missing} (got failures: {failed})" + ) + else: + assert failed, ( + f"structural negative {variant['id']!r} passed every rule — " + f"graders for {spec['skill']!r} are too permissive" + ) + + +def variant_expected_structural_failures(spec_path: Path, variant_id: str) -> list[str]: + """Return the rule names under ``expected_failures.structural`` for *variant_id*. + + Looks the variant up in the spec's ``negatives`` list; returns ``[]`` when the + variant is not found or lists no expected structural failures. + """ + spec = load_spec(spec_path) + for neg in spec.get("negatives", []): + if neg["id"] == variant_id: + return neg.get("expected_failures", {}).get("structural", []) + return [] + + +# --------------------------------------------------------------------------- +# Direct grader unit tests +# --------------------------------------------------------------------------- + +def test_has_sections_detects_missing(tmp_path: Path): + """has_sections passes when every heading is present and names the missing one otherwise.""" + f = tmp_path / "x.md" + f.write_text("# A\n\n## B\n") + assert structural.has_sections(f, ["A", "B"]).passed + res = structural.has_sections(f, ["A", "B", "C"]) + assert not res.passed + assert "C" in res.detail + + +def test_count_table_rows_basic(): + """A single table with two body rows is expected to count as 2.""" + md = ( + "| col1 | col2 |\n" + "|------|------|\n" + "| a | b |\n" + "| c | d |\n" + ) + assert 
structural.count_table_rows(md) == 2 + + +def test_count_table_rows_skips_separator_and_handles_breaks(): + """Rows are summed across two tables separated by plain text.""" + md = ( + "| h |\n|---|\n| 1 |\n\nplain text\n\n" + "| h2 |\n|----|\n| x |\n| y |\n" + ) + # 1 row in first table, 2 rows in second + assert structural.count_table_rows(md) == 3 + + +def test_no_broken_local_links_skips_external(tmp_path: Path): + """An https link and a link to an existing local file must both be accepted.""" + target = tmp_path / "target.md" + target.write_text("ok") + src = tmp_path / "src.md" + src.write_text( + "[external](https://example.com)\n" + "[ok](target.md)\n" + ) + assert structural.no_broken_local_links(src, tmp_path).passed + + +def test_no_broken_local_links_flags_missing(tmp_path: Path): + """A link to a missing local file must fail and be named in ``detail``.""" + src = tmp_path / "src.md" + src.write_text("[oops](does-not-exist.md)\n") + res = structural.no_broken_local_links(src, tmp_path) + assert not res.passed + assert "does-not-exist.md" in res.detail + + +# --------------------------------------------------------------------------- +# Cycle consistency (keep-or-discard invariant) +# --------------------------------------------------------------------------- + +def _make_snapshot(d: Path, decision: str, understanding: str): + """Create snapshot directory *d* with a one-row ``results.md`` decision table + and a ``current-understanding.md`` holding *understanding*.""" + d.mkdir(parents=True, exist_ok=True) + (d / "results.md").write_text( + f"| cycle | hypothesis | decision |\n" + f"|-------|------------|----------|\n" + f"| 001 | h1 | {decision} |\n" + ) + (d / "current-understanding.md").write_text(understanding) + + +def test_cycle_consistency_passes_on_keep(tmp_path: Path): + """A "keep" cycle may change ``current-understanding.md`` (v1 to v2).""" + a, b = tmp_path / "001", tmp_path / "002" + _make_snapshot(a, "keep", "v1") + _make_snapshot(b, "keep", "v2") + assert check_cycle_consistency([a, b]).passed + + +def test_cycle_consistency_passes_on_discard_with_no_change(tmp_path: Path): + """A "discard" cycle that leaves ``current-understanding.md`` unchanged passes.""" + a, b = tmp_path / "001", tmp_path / "002" + _make_snapshot(a, "keep", "v1") + _make_snapshot(b, "discard", "v1") + assert check_cycle_consistency([a, b]).passed + + +def test_cycle_consistency_fails_on_discard_that_modifies(tmp_path: Path): + """A "discard" cycle that changes ``current-understanding.md`` must fail, + with a violation that mentions snapshot "002".""" + a, b = tmp_path / "001", tmp_path / "002" + _make_snapshot(a, 
"keep", "v1") + _make_snapshot(b, "discard", "v2-but-shouldnt-have-changed") + res = check_cycle_consistency([a, b]) + assert not res.passed + assert any("002" in v for v in res.violations) + + +def test_cycle_consistency_ignores_unrelated_keep_in_prose(tmp_path: Path): + """Anchored regex: a "keep" word in prose or an unrelated table must + not be read as the cycle decision. The actual decision row says + "discard" and `current-understanding.md` did not change → must pass. + """ + a, b = tmp_path / "001", tmp_path / "002" + _make_snapshot(a, "keep", "v1") + b.mkdir() + (b / "results.md").write_text( + "Some prose: we should keep the experiment running.\n\n" + "| unrelated |\n|---|\n| keep alive |\n\n" + "| cycle | hypothesis | decision |\n" + "|-------|------------|----------|\n" + "| 002 | h2 | discard |\n" + ) + (b / "current-understanding.md").write_text("v1") + assert check_cycle_consistency([a, b]).passed + + +# --------------------------------------------------------------------------- +# Sanity: every skill referenced by a fixture has structural rules +# --------------------------------------------------------------------------- + +def test_every_fixture_skill_has_rules(): + """Every skill targeted by a fixture must have an entry in SKILL_STRUCTURAL_RULES. + + A fixture added without graders checks nothing and passes silently — that's + silent capability drift (per the guide: maintainers should detect saturation + and add coverage; the inverse — coverage without graders — is just + invisible).""" + for spec_path in list_fixtures(): + spec = load_spec(spec_path) + skill = spec["skill"] + assert skill in SKILL_STRUCTURAL_RULES, ( + f"fixture {spec_path} targets skill {skill!r} but " + f"SKILL_STRUCTURAL_RULES has no entry for it" + )