diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index e18269f..37dd33b 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -14,12 +14,12 @@ "repo": "Autonoma-AI/test-planner-plugin", "ref": "production" }, - "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation" + "description": "Generates comprehensive E2E test cases through a validated multi-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands." }, { "name": "autonoma-test-planner-development", "source": "./", - "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation" + "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated multi-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands." 
} ] } diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index bade427..e43d1e1 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,11 +1,8 @@ { "name": "autonoma-test-planner", "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step", - "version": "1.1.0", + "version": "1.13.1", "author": { "name": "Autonoma" - }, - "commands": [ - "./commands" - ] + } } diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 73754fe..f2c1c4d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,5 +14,5 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.11" - - run: pip install pytest pyyaml + - run: pip install pytest pyyaml Faker - run: pytest tests/ -v diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 4c313f9..2ef9a1c 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "1.4.0" + ".": "1.14.0" } diff --git a/CLAUDE.md b/CLAUDE.md index 3822134..7bb5b8b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,56 +1,80 @@ # Autonoma Test Planner Plugin -Claude Code plugin that generates E2E test suites through a 4-step deterministic pipeline. +Claude Code plugin that generates E2E test suites through a deterministic multi-step pipeline. 
## Project Structure -``` -.claude-plugin/ # Plugin manifest (plugin.json, marketplace.json) -commands/generate-tests.md # Entry point — dispatches the 4-step pipeline -skills/generate-tests/SKILL.md # Orchestrator skill -agents/ # Isolated subagents (one per step) - kb-generator.md # Step 1: Knowledge base → autonoma/AUTONOMA.md + features.json - scenario-generator.md # Step 2: Scenarios → autonoma/scenarios.md - test-case-generator.md # Step 3: Tests → autonoma/qa-tests/INDEX.md + test files - env-factory-generator.md # Step 4: Environment factory endpoint +```text +.claude-plugin/ # Plugin manifest +commands/generate-tests.md # Full pipeline command +commands/generate-adhoc-tests.md +skills/generate-tests/SKILL.md +skills/generate-adhoc-tests/SKILL.md +agents/ + kb-generator.md # Step 1: Knowledge base + entity-audit-generator.md # Step 2: Entity creation audit + scenario-generator.md # Step 3: Scenarios + env-factory-generator.md # Step 4: Environment Factory implementation + scenario-validator.md # Step 5: Scenario lifecycle validation + test-case-generator.md # Step 6: E2E tests + focused-test-case-generator.md hooks/ - hooks.json # PostToolUse hook config (triggers on Write) - validate-pipeline-output.sh # Bash dispatcher → routes to Python validators - validators/ # Python scripts that validate YAML frontmatter + hooks.json + pipeline-kickoff.sh + pretool-heartbeat.sh + transcript-streamer.py + validate-pipeline-output.sh + preflight_scenario_recipes.py + validators/ + evals/ +tests/ ``` -## How the Pipeline Works +## Pipeline -Each step spawns an isolated subagent. After each Write, the PostToolUse hook in `hooks/hooks.json` runs `validate-pipeline-output.sh`, which pattern-matches the file path and runs the appropriate Python validator. Validators exit 0 (OK) or 2 (block with error message). +1. Knowledge Base +2. Entity Creation Audit +3. Scenarios +4. Implement Environment Factory +5. Validate Scenario Lifecycle +6. 
Generate E2E Tests -Steps 1-3 require user confirmation before advancing. Step 4 is the final step (no gate). +The full pipeline is interactive. After steps 1-5, Claude presents the step summary and waits for user confirmation before continuing. Lifecycle reporting is handled by plugin hooks, not by ad hoc agent curl calls. ## Validation -Validators are in `hooks/validators/`. They parse YAML frontmatter and check required fields, types, and cross-file consistency. All validators print "OK" on success or an error message on failure. +Validators are in `hooks/validators/`. | Validator | File matched | Key checks | |-----------|-------------|------------| -| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | app_name, app_description (≥20 chars), core_flows with at least one `core: true` | -| `validate_features.py` | `*/autonoma/features.json` | features array length matches total_features, valid types, at least one core feature | -| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario_count ≥ 3, standard/empty/large scenarios present, entity_types | -| `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals match folder sums, criticality sums, cross-checks against features.json | -| `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | title, description, criticality (critical/high/mid/low), scenario, flow | +| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | frontmatter and core-flow structure | +| `validate_features.py` | `*/autonoma/features.json` | feature inventory schema | +| `validate_entity_audit.py` | `*/autonoma/entity-audit.md` | model creation classification and owner links | +| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario count, metadata, required sections | +| `validate_endpoint_implemented.py` | `*/autonoma/.endpoint-implemented` | handler path and factory integrity | +| `validate_creation_file_immutable.py` | `*/autonoma/.endpoint-implemented` | accepted audit creation files were not rewritten unsafely | +| 
`validate_factory_fidelity.py` | `*/autonoma/.endpoint-implemented` | semantic per-model factory fidelity | +| `validate_scenario_validation.py` | `*/autonoma/.scenario-validation.json` | Step 5 terminal-state contract | +| `validate_scenario_recipes.py` | `*/autonoma/scenario-recipes.json` | recipe schema | +| `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals and folder sums | +| `validate_directory_structure.py` | `*/autonoma/qa-tests/INDEX.md` | test directory structure | +| `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | test frontmatter | + +Scenario recipes also run live endpoint preflight through `hooks/preflight_scenario_recipes.py`. + +Test file writes are blocked until `autonoma/.endpoint-validated` exists. ## Development ```bash -# Run plugin locally without installing claude --plugin-dir ./ - -# Validate plugin structure claude plugin validate ./ +pytest ``` -## Dependencies - -- Python 3 + PyYAML (auto-installed by the hook if missing) - -## Known Issues +## Notes -- `commands/generate-tests.md` has unresolved merge conflicts between the AskUserQuestion approach and the end-turn approach for user confirmation between steps. Resolve before merging to main. +- Step 4 implements the Environment Factory and may edit target backend code. +- Step 4 writes `autonoma/.endpoint-implemented` only after discover smoke and factory-integrity checks pass. +- Step 5 validates signed `discover` / `up` / `down` for every scenario and may fix handler bugs or reconcile `scenarios.md`. +- Step 6 is gated on `autonoma/.endpoint-validated`. 
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index b6726f6..84b49ff 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -4,8 +4,8 @@ This guide explains how to test changes from a branch without publishing to the ## Prerequisites -- [Claude Code](https://claude.ai/code) installed -- Your branch pushed to GitHub +- [Claude Code](https://claude.ai/code) +- your branch pushed to GitHub ## Install from a branch @@ -31,15 +31,22 @@ Push new commits to your branch, then reinstall: ## Environment variables -The plugin requires three environment variables to be set in the project where you run it: +The plugin itself requires these values in the target project session: | Variable | Description | | --- | --- | -| `AUTONOMA_API_KEY` | Your Autonoma API key (get it from the dashboard under Settings > API Keys) | -| `AUTONOMA_PROJECT_ID` | The application ID from the Autonoma dashboard | -| `AUTONOMA_API_URL` | API base URL - use `http://localhost:4000` for local dev | +| `AUTONOMA_API_KEY` | Autonoma API key | +| `AUTONOMA_PROJECT_ID` | Application ID from the Autonoma dashboard | +| `AUTONOMA_API_URL` | API base URL, for example `http://localhost:4000` in local dev | -Add them to the `.env` file or export them in your shell before running Claude Code in the target project. +You do **not** need to pre-set `AUTONOMA_SDK_ENDPOINT`, `AUTONOMA_SHARED_SECRET`, or `AUTONOMA_SIGNING_SECRET`. +Step 4 (Implement Environment Factory) creates or discovers those values in the target repo by editing `.env` and `.env.example`. + +Use `AUTONOMA_AUTO_ADVANCE=true` as the canonical launch mode while testing. If you are still using +the older confirmation flag, `AUTONOMA_REQUIRE_CONFIRMATION=false` is treated as the same +auto-advance behavior. + +After the generated PR is merged, the user still needs to deploy those env changes. 
## References diff --git a/README.md b/README.md index d264d6d..28fb2b5 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,13 @@ # Autonoma Test Planner -A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 4-step pipeline. +A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 6-step pipeline. -Each step runs in an isolated subagent with deterministic validation — shell scripts check the output format before the pipeline advances. No hallucinated validations, no cascading errors. +Each step runs in an isolated subagent with deterministic validation. The pipeline audits how application entities are created, implements an Autonoma Environment Factory against the target app, validates scenario lifecycles through the live endpoint, and only then generates E2E tests. ## Install -**Step 1:** Add the marketplace: - -``` +```text /plugin marketplace add Autonoma-AI/test-planner-plugin -``` - -**Step 2:** Install the plugin: - -``` /plugin install autonoma-test-planner@autonoma ``` @@ -22,112 +15,188 @@ Each step runs in an isolated subagent with deterministic validation — shell s Inside any project with Claude Code: -``` +```text /autonoma-test-planner:generate-tests ``` -The plugin walks you through 4 steps, asking for confirmation at each checkpoint before proceeding. +The full pipeline is interactive. After steps 1-5, Claude presents the step summary and waits for your confirmation before continuing. -## How it works +Lifecycle reporting is hook-driven: + +- `hooks/pipeline-kickoff.sh` creates the setup record and writes `autonoma/.docs-url` plus `autonoma/.generation-id`. +- `hooks/validate-pipeline-output.sh` validates artifacts, emits step events, uploads artifacts, and enforces the test-generation gate. +- `hooks/pretool-heartbeat.sh` keeps dashboard activity reporting alive while tools are running. 
+ +## Pipeline ### Step 1: Knowledge Base -Analyzes your frontend codebase and produces `autonoma/AUTONOMA.md` — a user-perspective map of every page, flow, and feature. The file includes YAML frontmatter with a core flows table that determines how test coverage is distributed. +Analyzes the app and produces `autonoma/AUTONOMA.md`, `autonoma/skills/*.md`, and `autonoma/features.json`. -**You review**: the core flows table. If a flow is marked `core: true`, it gets 50-60% of test coverage. +**You review**: the core flows table. -### Step 2: Scenarios +### Step 2: Entity Creation Audit -Reads the knowledge base and your backend data model to design three test data environments: `standard` (realistic variety), `empty` (empty states), and `large` (pagination/performance). Outputs `autonoma/scenarios.md` with frontmatter summarizing each scenario. +Audits every database model and records how each model comes into existence in `autonoma/entity-audit.md`. -**You review**: entity names, counts, and relationships. These become hard assertions in your tests. +Models marked `independently_created: true` become Environment Factory factories that call the app's real creation functions. Dependent-only models use the SDK's raw SQL fallback and are torn down through their owner model. -### Step 3: E2E Tests +**You review**: factory-backed models, dependent-only models, and any dual-creation models. -Generates markdown test files organized by feature in `autonoma/qa-tests/`. Each test has frontmatter (title, description, criticality, scenario, flow) and uses only natural-language steps: click, scroll, type, assert. +### Step 3: Scenarios -An `INDEX.md` tracks total test count, folder breakdown, and coverage correlation with your codebase size. +Reads the knowledge base and `autonoma/entity-audit.md`, then produces `autonoma/scenarios.md`. -**You review**: test distribution and coverage correlation. Test count should roughly match 3-5x your route/feature count. 
+Scenarios include `standard`, `empty`, and `large`, track variable fields that must vary across runs, and use nested create trees rooted at the scope entity. -### Step 4: Environment Factory +**You review**: entity names, counts, relationships, variable fields, and via-owner versus standalone creation choices. -Implements an endpoint in your backend that creates and tears down isolated test data for each scenario. Handles `discover`, `up`, and `down` actions with HMAC-SHA256 request signing and JWT-signed refs for safe teardown. +### Step 4: Implement Environment Factory -**You review**: implementation plan before any code is written. The endpoint never modifies existing data. +Installs and configures the Autonoma SDK endpoint, then registers a factory for every `independently_created: true` model from `entity-audit.md`. -## Validation +This step runs a signed `discover` smoke test and factory-integrity checks, then writes `autonoma/.endpoint-implemented`. It does **not** run full `up` / `down`; lifecycle validation happens in Step 5. -Every output file has YAML frontmatter validated by shell scripts (not prompts). If validation fails, Claude sees the error and must fix it before proceeding. +**You review**: handler path, installed packages, factories registered, and required secrets. -| File | What's validated | -|------|-----------------| -| `AUTONOMA.md` | core_flows table, app description, feature/skill counts | -| `scenarios.md` | scenario count, required scenarios (standard/empty/large), entity types | -| `INDEX.md` | test totals match folder sums, criticality counts sum correctly, test count within expected range | -| Each test file | title, description, criticality (critical/high/mid/low), scenario, flow | +### Step 5: Validate Scenario Lifecycle -## Environment Variables (Step 4) +Runs signed `discover` / `up` / `down` against every scenario. The validator may fix handler bugs or reconcile `autonoma/scenarios.md` with real endpoint behavior. 
-Step 4 requires two secrets for the Environment Factory endpoint: +On success, it writes `autonoma/scenario-recipes.json`, `autonoma/.scenario-validation.json`, and `autonoma/.endpoint-validated`. The `.endpoint-validated` sentinel gates Step 6; test files cannot be written before it exists. -```bash -# Generate secrets -openssl rand -hex 32 # AUTONOMA_SIGNING_SECRET -openssl rand -hex 32 # AUTONOMA_JWT_SECRET +**You review**: scenarios passed, scenario edits, preflight result, and recipe upload status. + +### Step 6: Generate E2E Tests + +Generates markdown test files in `autonoma/qa-tests/` plus `autonoma/qa-tests/INDEX.md`. + +**You review**: test distribution and coverage correlation. + +## Key Outputs + +- `autonoma/AUTONOMA.md` +- `autonoma/skills/*.md` +- `autonoma/features.json` +- `autonoma/entity-audit.md` +- `autonoma/scenarios.md` +- `autonoma/.factory-plan.md` +- `autonoma/.endpoint-implemented` +- `autonoma/scenario-recipes.json` +- `autonoma/.scenario-validation.json` +- `autonoma/.endpoint-validated` +- `autonoma/qa-tests/INDEX.md` + +## Ad Hoc Test Generation + +The same plugin includes a `generate-adhoc-tests` command that generates tests focused on a specific topic without regenerating your full test suite. + +### Usage + +Pass your focus description directly after the command: + +``` +/autonoma-test-planner:generate-adhoc-tests description ``` -Add to your `.env`: +Or invoke without arguments and the command will suggest focus areas based on your codebase: + ``` -AUTONOMA_SIGNING_SECRET= -AUTONOMA_JWT_SECRET= +/autonoma-test-planner:generate-adhoc-tests ``` -## Requirements +### How it works + +**Subsequent runs** (active scenarios and recipes already exist in Autonoma): fetches existing scenario, skill, and test context from Autonoma, then runs only focused test generation for the requested topic. + +Tests are written to `autonoma/qa-tests/{focus-slug}/` so they sit alongside your existing test suite without overwriting it. 
-- Claude Code -- Python 3 (ships with macOS/Linux) -- PyYAML (auto-installed if missing) +### Running multiple focus areas + +Each focus area run writes to its own subfolder and tracks its own generation ID file. Multiple topics can run in parallel: + +``` +autonoma/qa-tests/ +├── canvas-interactions/ ← autonoma/.generation-id-canvas-interactions +└── signatures-and-documents/ ← autonoma/.generation-id-signatures-and-documents +``` + +## Environment Variables + +Provide these before running the plugin: + +```bash +AUTONOMA_DOCS_URL= +AUTONOMA_API_KEY= +AUTONOMA_PROJECT_ID= +AUTONOMA_API_URL= +``` + +`AUTONOMA_DOCS_URL` is required so subagents can fetch the latest Autonoma instructions. `AUTONOMA_API_KEY`, `AUTONOMA_PROJECT_ID`, and `AUTONOMA_API_URL` are required for dashboard setup records, lifecycle events, artifact uploads, and recipe uploads. + +The Environment Factory step generates or discovers these target-app values and updates `.env` and `.env.example` when applicable: + +```bash +AUTONOMA_SHARED_SECRET= +AUTONOMA_SIGNING_SECRET= +``` + +`AUTONOMA_SDK_ENDPOINT` is needed by scenario validation and recipe preflight once the endpoint exists. Generated environment changes still need to be deployed with the target app. + +## Validation + +Every pipeline output is validated by shell-dispatched Python validators. 
+ +| File | Validator | Validation | +| --- | --- | --- | +| `AUTONOMA.md` | `validate_kb.py` | frontmatter and core-flow structure | +| `features.json` | `validate_features.py` | feature inventory schema | +| `entity-audit.md` | `validate_entity_audit.py` | model creation classification, factory counts, and owner links | +| `scenarios.md` | `validate_scenarios.py` | scenario schema and required sections | +| `.endpoint-implemented` | `validate_endpoint_implemented.py`, `validate_creation_file_immutable.py`, `validate_factory_fidelity.py` | handler path, factory integrity, immutable audit snapshot, and semantic factory fidelity | +| `.scenario-validation.json` | `validate_scenario_validation.py` | Step 5 terminal-state contract | +| `scenario-recipes.json` | `validate_scenario_recipes.py` | recipe schema plus live endpoint preflight | +| `INDEX.md` | `validate_test_index.py`, `validate_directory_structure.py` | test totals, folder breakdown, and directory structure | +| test files | `validate_test_file.py` | required frontmatter | + +Test files are blocked until `autonoma/.endpoint-validated` exists. 
## Local Development ```bash -# Test locally without installing claude --plugin-dir ./ - -# Validate plugin structure claude plugin validate ./ +pytest ``` ## Project Structure -``` +```text autonoma-test-planner/ ├── .claude-plugin/ -│ ├── plugin.json # Plugin manifest -│ └── marketplace.json # Marketplace catalog -├── skills/generate-tests/SKILL.md # /generate-tests orchestrator +├── commands/generate-tests.md +├── commands/generate-adhoc-tests.md +├── skills/generate-tests/SKILL.md +├── skills/generate-adhoc-tests/SKILL.md ├── agents/ -│ ├── kb-generator.md # Step 1 subagent -│ ├── scenario-generator.md # Step 2 subagent -│ ├── test-case-generator.md # Step 3 subagent -│ └── env-factory-generator.md # Step 4 subagent +│ ├── kb-generator.md +│ ├── entity-audit-generator.md +│ ├── scenario-generator.md +│ ├── env-factory-generator.md +│ ├── test-case-generator.md +│ ├── focused-test-case-generator.md +│ └── scenario-validator.md ├── hooks/ -│ ├── hooks.json # PostToolUse hook config -│ ├── validate-pipeline-output.sh # Validation dispatcher +│ ├── pipeline-kickoff.sh +│ ├── pretool-heartbeat.sh +│ ├── transcript-streamer.py +│ ├── validate-pipeline-output.sh +│ ├── preflight_scenario_recipes.py │ └── validators/ -│ ├── validate_kb.py -│ ├── validate_scenarios.py -│ ├── validate_test_index.py -│ └── validate_test_file.py -├── LICENSE -└── README.md +│ └── evals/ +└── tests/ ``` -## Documentation - -Full prompt documentation: [docs.agent.autonoma.app/llms.txt](https://docs.agent.autonoma.app/llms.txt) - ## License MIT diff --git a/agents/entity-audit-generator.md b/agents/entity-audit-generator.md new file mode 100644 index 0000000..96b30e0 --- /dev/null +++ b/agents/entity-audit-generator.md @@ -0,0 +1,241 @@ +--- +description: > + Audits every database model to describe every way it comes into existence. + For each model the agent answers two orthogonal questions: (a) does a + standalone creation path exist? 
(b) which other models' creation flows + produce it as a side effect? Independently-created models get factories; + the rest fall back to raw SQL INSERT and are torn down via their owner(s). +tools: + - Read + - Glob + - Grep + - Write + - Edit + - Bash + - Agent + - WebFetch +maxTurns: 60 +--- + +# Entity Creation Audit + +You audit the codebase to discover **every way each database model is created**. For every model +you answer two orthogonal questions and record the answers so the Environment Factory can plan +factories, scenario trees, and teardown correctly. + +Your input is the knowledge base (`autonoma/AUTONOMA.md` and `autonoma/skills/`). Your output +is `autonoma/entity-audit.md`. + +## The two orthogonal questions + +For every model, answer **both** independently: + +1. **`independently_created`** — *Does the codebase have an exported function / method / + controller that creates this model on its own?* Boolean. +2. **`created_by`** — *When I trace every other model's creation function, does any of them + produce this model as a side effect?* List of `{owner, via, why}` entries; empty if none. + +These are **not** mutually exclusive. A single model can be both. For example, a `<Model>` model +may have its own `<Model>Service.create()` (answer 1 = true) *and* be minted inline inside a +parent's `<Owner>Service.createRoot()` transaction as a required default row (answer 2 +non-empty). Both facts are true simultaneously and both matter downstream — the scenario +generator decides per-scenario whether a given `<Model>` is introduced via its standalone +factory or comes along with its owner. + +**Do not collapse the two.** Do not omit `created_by` just because `independently_created` is +true. Do not omit `independently_created` just because the model appears in someone else's +`created_by`. 
+ +**When in doubt, prefer `independently_created: true` and include `created_by` anyway.** +Overclassifying a root as a dependent is worse than the inverse — a spurious factory is noisy, +a missing factory leaves a real root untested. + +## The four states a model can be in + +| `independently_created` | `created_by` | Interpretation | +|---|---|---| +| `true` | `[]` | Pure root — only standalone creation exists. | +| `true` | non-empty | Dual — has a standalone path AND is produced by at least one owner. | +| `false` | non-empty | Pure dependent — only reachable via an owner's creation flow. | +| `false` | `[]` | **Invalid.** Unreachable model — either you missed the owner, or the model is never created. Fix the audit before writing it. | + +## Instructions + +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. + + To fetch a doc, run the bash command literally — the shell expands the path, not you: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. Fetch the latest instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt" + ``` + + These are the source of truth. Follow them for audit methodology and output format. + +3. Read the knowledge base from `autonoma/AUTONOMA.md` and all skill files in `autonoma/skills/`. + Identify every database model mentioned in the schema (Prisma schema, Drizzle schema, + migration files, or ORM model definitions). + +4. 
**Pass A — find every standalone creation path.** For each model, search for a dedicated + create function: + - Service files: `*.service.ts`, `*.service.js`, `*Service.*`, `*_service.*` + - Repository files: `*.repository.ts`, `*.repository.js`, `*Repository.*`, `*_repository.*` + - Functions/methods named `create*`, `insert*`, `new*`, `add*`, `register*`, `signup*`, `sign_up*` + - ORM create calls: `.create(`, `.insert(`, `.save(`, `.build(` + - Controller or route handler files that contain inline creation logic + - Framework hooks (Better-Auth `databaseHooks.user.create`, NextAuth callbacks, Devise + callbacks, etc.) — these count as standalone creation paths. + + If a standalone path exists → `independently_created: true` and record `creation_file`, + `creation_function`, and observed `side_effects`. If the only creation is inline in a route + handler or framework-hook closure, still mark `true` and add `needs_extraction: true` — the + env-factory agent will extract into a named export before wiring the factory. + +5. **Pass B — for every standalone creation path, find the sibling rows it mints.** Open each + creation function you found in Pass A and enumerate every write it performs: + - Every `db..create(...)` / `.insert(...)` / `.save(...)` / `.create` call + - Every `.create(...)` / repository call it delegates to + - Every transactional block (`db.$transaction`, `session.begin`, `Repo.transaction`, etc.) + that bundles multiple inserts together + + For each sibling insert, append an entry to **that sibling model's** `created_by` list: + + ```yaml + created_by: + - owner: + via: Service.createRoot> + why: "" + ``` + + The `why` is prose, written for humans. Scenarios and the env-factory teardown logic quote + it verbatim. Make it specific — "Every new `` needs a default `` created inline + in the same transaction so downstream features have something to read from the start" is + useful; "creates a ``" is not. + + One pass per standalone path. 
When you're done, every sibling that was written inline will + have a `created_by` pointer back to the owner, and every model either has its own standalone + path (`independently_created: true`) or is reachable through at least one owner (non-empty + `created_by`). + +6. **Validate invariants before writing.** A model with `independently_created: false` and + empty `created_by` is a bug — either you missed a creation path, or the model is orphaned + in the schema. Do not ship an audit with orphans. + +7. Side effects are informational — they describe what an independently-created model's + function does. They help humans understand why a factory matters but do not affect + classification. + +## Output Format + +Write `autonoma/entity-audit.md` with YAML frontmatter and markdown body. + +### Frontmatter + +```yaml +--- +model_count: 4 +factory_count: 3 # number of models with independently_created: true +models: + - name: + independently_created: true + creation_file: src//.ts + creation_function: .databaseHooks.user.create + side_effects: + - hashes password + - creates default + rows + created_by: [] + + - name: + independently_created: true + creation_file: src//.service.ts + creation_function: Service.create + side_effects: + - mints a default in the same transaction + - mints an row + created_by: [] + + - name: + independently_created: true + creation_file: src//.service.ts + creation_function: Service.create + side_effects: [] + created_by: + - owner: + via: Service.create + why: "Every new needs a default , created inline in the same transaction so downstream features have something to read from the start." + + - name: + independently_created: false + created_by: + - owner: + via: Service.create + why: "Minted inside the transaction so dependent UI has a row wired up from the start." +--- +``` + +Schema rules: + +- `name` — required (string). +- `independently_created` — required (boolean). 
+- `creation_file` / `creation_function` / `side_effects` — required **iff** + `independently_created: true`. +- `needs_extraction` — optional boolean; true when the standalone path is inline in a route + handler or framework-hook closure and the env-factory agent will need to extract it. +- `created_by` — required (list, may be empty). Each entry requires `owner` (string — must + match another model's `name`), `via` (string — the function name), and `why` (non-empty + prose string). +- Any model with `independently_created: false` MUST have a non-empty `created_by`. + +### Markdown Body + +After the frontmatter, write: + +#### Roots (models with `independently_created: true`) + +For each, include: +- The model name as a heading +- `creation_file` + `creation_function` +- A brief description of what the function does, including observed side effects +- Any sibling models it mints inline (these are the models with `owner: ` in their + `created_by`). Link back to them so the reader can follow the tree. + +#### Dependents (models with `independently_created: false`) + +A table listing each dependent model, its owner(s) (from `created_by`), and the `why` for each. +This is the map the scenario generator uses: pure dependents are always created through their +owner, not as standalone tree nodes. + +#### Dual-creation models + +A call-out section listing every model with `independently_created: true` AND non-empty +`created_by`. For each, one sentence on when the standalone path is the right choice and when +the via-owner path is. This helps scenarios decide which to use per narrative. + +## Important + +- Be thorough — every inline `db..create(...)` inside someone else's creation function + must produce a `created_by` entry on that sibling, even if that sibling also has its own + service. +- Read the ACTUAL code to locate creation functions and sibling inserts — don't guess from file + names alone. 
+- If a model has multiple standalone creation paths (e.g., signup + admin-create), pick the + canonical one (usually the public API or most-called path) for `creation_function` and note + alternatives in the body. +- Framework-level hooks (Better-Auth, NextAuth, Devise) count as standalone paths — record them + with `needs_extraction: true` so the env-factory agent lifts the hook body into a named + export before wiring the factory. +- ORM-level hooks (Prisma middleware, Sequelize hooks, ActiveRecord callbacks) DO NOT run on + raw SQL. A pure-dependent (`independently_created: false`) model relying on them is a + correctness bug; call it out in the body. +- **Use subagents aggressively.** Pass A (find standalone paths) and Pass B (find sibling + inserts) are both embarrassingly parallel. diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md index 85d6ba7..cd0fb54 100644 --- a/agents/env-factory-generator.md +++ b/agents/env-factory-generator.md @@ -1,8 +1,9 @@ --- description: > - Implements the Autonoma Environment Factory endpoint in the project's backend. - Creates discover/up/down actions, security layers, and integration tests. - Tests the implementation within the session before completing. + Installs the Autonoma SDK and configures the handler by registering factories for + every model with dedicated creation code (from entity-audit.md). Writes + autonoma/.endpoint-implemented on completion. End-to-end validation happens in the + next step (scenario-validator). tools: - Read - Glob @@ -15,44 +16,397 @@ tools: maxTurns: 60 --- -# Environment Factory Generator +# Environment Factory: SDK Setup -You implement the Autonoma Environment Factory endpoint in the project's backend. -Your input is `autonoma/scenarios.md`. Your output is working endpoint code with tests. +You install the Autonoma SDK and configure the handler with factories. +Your inputs are `autonoma/scenarios.md` and `autonoma/entity-audit.md`. 
Your output is an +endpoint that responds to `discover` — end-to-end validation (`up`/`down`) happens in the +next pipeline step. + +## CRITICAL: Database Safety + +You may be connected to a production database. Follow these rules absolutely: + +- **ALL writes go through the SDK endpoint only.** The SDK has production guards, HMAC auth, and signed refs tokens. +- **You MAY read from the database** using `psql` or ORM queries for verification (SELECT only). +- **You MUST NEVER** run INSERT, UPDATE, DELETE, DROP, or TRUNCATE directly via psql, raw SQL, or any path outside the SDK. +- **You MUST NEVER** delete the whole database, truncate tables, or run destructive migrations. +- The SDK's `down` action only deletes records that `up` created, verified by a cryptographically signed token. + +## The #1 rule — read before writing a single factory + +**`db.<model>.create()` (or any equivalent ORM/SQL write) inside a factory body for a model +whose audit says `independently_created: true` is NEVER acceptable.** There is no condition +under which this is the right output. If calling the audited function feels hard (inline in +a route, buried in a framework hook, needs DI, triggers Temporal), the answer is never +"just use the ORM." The answer is one of: extract, wire DI, use the app's test-mode +toggle, or stop and ask the user. + +If you catch yourself typing `prisma.x.create`, `db.x.create`, `tx.insert`, `Repo.insert`, +`<Model>::create`, `Model.objects.create`, `entityManager.persist`, etc. inside a factory +body for an audited model — delete it. Go back to the per-model decision tree below. + +The entire value of factories is that tests run through the user's real creation path. An +inline ORM call bypasses password hashing, slug generation, audit logs, Stripe sync, +framework hooks that provision sibling rows, state-machine transitions, and every piece of +business logic the user will add next month.
It produces data that looks right in a +`SELECT *` but is silently wrong in ways the tests can't catch. ## Instructions -1. First, fetch the latest implementation instructions: +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. + + To fetch a doc, run the bash command literally — the shell expands the path, not you: - Use WebFetch to read BOTH of these: - - `https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt` - - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt` + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/<doc-path>" + ``` - Follow those instructions for how to implement the endpoint. + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. -2. Read `autonoma/scenarios.md` — parse the frontmatter and full scenario data. +2. Fetch the latest implementation instructions: -3. Explore the backend codebase to understand: - - Framework (Next.js, Express, Elixir/Phoenix, etc.) - - Database layer (Prisma, Drizzle, raw SQL, Ecto, etc.) - - Authentication mechanism (session cookies, JWT, etc.) + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement-scenarios.txt" + curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt" + ``` + + These are the source of truth. Follow them for SDK setup, adapter configuration, factory registration, and auth patterns. + +3. Read `autonoma/entity-audit.md` — parse the frontmatter. For every model with + `independently_created: true`, you MUST register a factory that calls the identified + `creation_function` in `creation_file`.
Models with `independently_created: false` get no + factory — the SDK will fall back to raw SQL INSERT automatically. + +4. Read `autonoma/scenarios.md` — parse the frontmatter and full scenario data. Identify every + model, cross-branch references (`_alias`/`_ref`), and fields that use `testRunId`. + +5. Explore the backend codebase to understand: + - Framework (Next.js, Express, Hono, etc.) + - ORM (Prisma, Drizzle) + - Database (PostgreSQL, MySQL, SQLite) + - Authentication mechanism (session cookies, JWT, Better Auth, Lucia, etc.) - Existing route/endpoint patterns + - **Auth-adjacent framework hooks** — Better Auth `databaseHooks`, NextAuth callbacks, + Lucia adapters, Clerk webhooks. These frequently contain the real creation logic for + User/Session/Account and also write to sibling tables (Organization, Member, Billing). + The audit will flag these with `needs_extraction: true`. + - **App composition root** — where the app wires services, clients, and repositories + (DI container, service registry, module init). You'll reuse this wiring when a + creation function needs dependencies beyond `ctx.executor`. + +## Factory registration philosophy + +Register a factory for **every model with `independently_created: true`** — no exceptions. + +This is true even if the creation function looks trivial. A factory wired up to `ProjectService.create()` +that today just calls `prisma.project.create()` will automatically benefit from any business logic +the user adds later (audit log, Stripe sync, cache write). Raw SQL, by contrast, can never run +that logic — it's always a compatibility risk. + +Models with `independently_created: false` fall back to the SDK's raw SQL path. That's safe because +the audit explicitly determined there's no creation logic to preserve. + +## Dependents, cascades, and teardown + +For every root (`independently_created: true`) decide how its dependents will be torn down +before writing the factory. 
The `created_by` list in the audit tells you which models come +into existence as a byproduct of this root's creation flow — those rows must also be deleted +when the SDK tears down the root. + +Walk this decision tree in order. The first match wins; if none match, STOP and report. + +1. **Schema cascade** — check the ORM schema. If the FK chain from every dependent back to + the root is `onDelete: Cascade` (Prisma) / `ON DELETE CASCADE` (raw SQL) / analogous in + your ORM, you're done. The SDK deletes the root row and the DB cleans up the rest. No + `teardown` field needed on the factory. +2. **Existing delete function** — if the codebase has a delete method that already tears + down the same subtree (e.g. a `Service.delete` that removes the root AND + every dependent it minted), register `teardown` on the factory to call that function. + Same principle as the `create` side: stay on the user's code path. +3. **Return dependents' IDs the production function ALREADY returns** — if the production + `create` function returns the dependent IDs in its result (e.g. returns + `{ root, child, grandchild }`), forward those IDs in your factory's return so they land + in refs, then register a `teardown` that deletes them in reverse FK order. +4. **None of the above — STOP.** Do NOT modify the production service to return more IDs + than it already does just to make teardown work. Doing so changes the real code path to + serve test needs, which is exactly the inversion we avoid. Report the gap to the user + and let them choose: add a cascade, add a delete function, or accept orphans until + `TRUNCATE` between test runs. + +The `created_by[].why` field is a useful hint for this: if it says "minted inline in the +same transaction", option 1 (schema cascade) is usually set up correctly; if it says "seeded +with the owner so onboarding has something to advance through", check whether the dependent +is behind a soft-delete flag the root's delete function already handles. 
+ +Pure dependents (`independently_created: false`) never have their own `teardown` — they are +torn down via their owner's factory (one of the first three options above). + +## Compatibility with legacy audits + +Older audits carried only the `independently_created` field, with no `created_by` list. The validators read both schemas and +treat a legacy `independently_created: true` entry as `independently_created: true` with an empty `created_by`. +If the audit you're reading only has `independently_created`, you can still register factories, +but you'll lose the `created_by` teardown guidance above — prefer regenerating the audit +with the current prompt when possible. + +## Research pass — MANDATORY before writing any factory + +Post-mortems of past runs show a consistent failure mode: the agent makes **one bad +decision and applies it 50 times**. The research pass prevents this by forcing you to +open every relevant file and document a per-model decision *before* touching the handler. + +Write a table to `autonoma/.factory-plan.md` with one row per `independently_created: true` +model in the audit. Fill EVERY cell — do not leave any as TODO. The orchestrator and +the user will review this table before you write a single factory. + +``` +| Model | Audit function | File opened? | Import path | DI dependencies observed | Decision (Branch 1/2/3) | Notes | +|-------|----------------|--------------|-------------|--------------------------|-------------------------|-------| +``` + +Column rules: + +- **File opened?** — "yes, lines X-Y" or "no, why". If you write "no", you MUST NOT + proceed. You cannot decide Branch 1 vs Branch 2 without reading the file. +- **Import path** — the exact `import ... from "..."` statement you will add to the + handler. If the symbol is inline in a hook/route (Branch 1), this column holds the + *new* export path you will create during extraction, not the current inline location. +- **DI dependencies observed** — every constructor arg or closed-over variable the + function uses.
`ctx.executor` for a DB-only service is the trivial case; any logger, + event bus, Temporal client, analytics client, etc. must be listed. This is where + past agents gave up silently — we want the give-up moment to be visible. +- **Decision** — Branch 1 (extract inline → export → call), Branch 2 (import existing + export → call), or Branch 3 (the audit is wrong — the model should be `independently_created: false` and get no factory; argue why in Notes). "Inline ORM" is NOT a valid + decision. + +### Cross-codebase DI discovery + +Before filling the table, run these greps against the backend to find real +instantiation patterns. The agent debrief identified this as the single actionable +guidance past runs were missing: + +```bash +# Find how each service is actually constructed in production code. +grep -rnE "new ${ServiceName}\(" apps/ --include='*.ts' --include='*.tsx' | head -20 +# Find exported singletons and module-level instances. +grep -rnE "^(export )?(const|let) [a-zA-Z]+ = new " apps/ --include='*.ts' | head -40 +# Find composition root candidates. +grep -rnlE "(container|registry|services/index|app\.module)" apps/ | head +``` + +Use the results to fill the "DI dependencies observed" column honestly. If a service +needs `logger, eventBus, temporal, analytics` and you can't find where the app wires +them, STOP and ask the user — do NOT fall back to raw ORM. + +### External-side-effects policy reminder + +When the creation function triggers Temporal / GitHub / analytics / BetterAuth hooks, +you are NOT allowed to skip the function. You must either: +1. Call the real function and let the test-mode toggle handle it (grep for + `process.env.NODE_ENV === "test"`, `AUTONOMA_TEST_MODE`, `DISABLE_*`, or similar). +2. Call the real function and let external calls fail gracefully — most SDKs throw, + which is fine if the DB writes complete first. +3. Wrap the external call with a try/catch **inside the real function**, not inside + the factory. + +Never replicate DB writes the function performs.
If the real function writes to +sibling tables (Organization, Member, BillingCustomer from BetterAuth's `user.create` +hook; a default Folder from `createProject`), those writes come for free only when +you call the real function. Inlining `db.user.create()` silently drops them. + +--- + +## Per-model decision tree (run this BEFORE writing any factory) + +For every model with `independently_created: true` in `autonoma/entity-audit.md`, walk this tree +in order. Do NOT skip. Each branch has exactly one legitimate output — there is no "give up +and use `db..create()`" escape hatch. + +### Branch 1 — `needs_extraction: true` + +Meaning: the creation logic exists inline in a route handler, a framework hook (Better Auth +`databaseHooks`, NextAuth callbacks, Express middleware closures), or an anonymous closure. +There is no named export to import. + +**Mandatory action — extract before wiring:** + +1. Open `creation_file`. Find the inline block named by `creation_function`. +2. Move the body into a new **named, exported function** in the nearest sensible module + (a new `*.service.ts`, `*.repository.ts`, a sibling `create-.ts`, or an existing + service file if one exists nearby). The function must: + - Take a plain input object (no `req`/`res`/`ctx` — those are HTTP concerns). + - Return the created record (at minimum `{ id }`). + - Preserve every side effect the inline block had — including writes to sibling tables + that framework hooks produce (e.g. Better Auth's `user.create` hook provisioning an + Organization, Member, BillingCustomer; NextAuth's callback writing Account rows). +3. Replace the inline block with a call to the new function. The real HTTP caller's + behavior MUST stay identical. Run the project's typecheck/test command before moving on. + **Leave a short comment** (1–2 lines) above the new exported function explaining why it + was extracted — e.g. 
`// Extracted from the Better Auth databaseHooks.user.create closure + so the Autonoma Environment Factory can reuse the same creation path (Org + Member + + billing provisioning) as production. See autonoma/entity-audit.md.` This is a courtesy + to the developers who will encounter the new function — they should be able to tell at a + glance that it was lifted out for factory reuse, not invented for it. +4. **Update `autonoma/entity-audit.md` in-place** — change `creation_file` to the new file, + `creation_function` to the new exported name, add `extracted_to: `, + and keep `needs_extraction: true` so the fidelity rubric's framework-hook + carve-out can score the factory against the extracted helper. + Downstream steps read the audit; they must see the fixed state. +5. Now — and only now — import the new function and wire the factory. + +If extraction is genuinely impossible (the inline block depends on `req`/`res` in a way that +can't be untangled, or it's generated code you can't edit), **STOP and ask the user**. Do +NOT fall back to raw ORM. That is the bug we are trying to prevent. + +**Concrete example — Better Auth `databaseHooks`:** + +The audit marks `User` with `needs_extraction: true`, `creation_file: src/auth.ts`, +`creation_function: buildAuth (databaseHooks.user.create)`. Reading `src/auth.ts`, the real +creation logic lives inside a closure passed to `betterAuth({ databaseHooks: { user: { create: async (user) => {...} } } })`, which calls `db.user.create`, then `ensureOrgMembership`, then provisions a `BillingCustomer`, then enqueues a welcome email. + +Wrong: import `db` and call `db.user.create(...)` in the factory — silently skips the +Organization/Member/BillingCustomer rows and every downstream test that reads them breaks. 
+ +Right: extract the closure body into `export async function createUserWithOnboarding(input)` +in `src/auth/create-user.ts`, call it from the Better Auth hook (so production still works), +update the audit, then `import { createUserWithOnboarding }` in the factory. + +### Branch 2 — `independently_created: true`, no `needs_extraction` + +Meaning: a named exported function or class method already exists. Import it and call it. +Do not copy its body. Do not call the ORM directly "because it's simpler." The whole point +is to stay on the user's code path. + +Go to the DI playbook below to figure out how to invoke it. + +### Branch 3 — `independently_created: false` + +Do not register a factory at all. The SDK's raw SQL fallback handles it. Writing a factory +here just so you can call `db..create()` is the anti-pattern in disguise — let the +SDK do it. + +## DI / constructor-injection playbook + +Factories receive `(data, ctx)` where `ctx.executor` is the DB client/transaction. That's +enough for simple service classes but many creation functions need more. Walk this list in +order — the first match wins: + +1. **Top-level exported function** — `import { createX } from "..."; return createX(data);`. + Simplest case. Most services should end up here after Branch 1 extraction. +2. **Static method on a class** — `return XService.create(data, ctx.executor);`. Pass + `ctx.executor` as the DB/transaction argument so writes stay in the SDK's transaction. +3. **Instance method, needs only a DB client** — + `const svc = new XService(ctx.executor); return svc.create(data);`. Mirrors how the app + instantiates it at call time. +4. **Instance method, needs more dependencies (logger, event bus, config, clients)** — + find the app's composition root (DI container, service registry, `container.ts`, + `app.module.ts`, `services/index.ts`) and reuse it. 
Two viable patterns: + - **Import the already-constructed singleton** the app exports for production use: + `import { userService } from "@/services"; return userService.create(data);`. + - **Rebuild the service the same way the composition root does**, substituting + `ctx.executor` for the DB dependency and importing real singletons for everything + else (logger, event bus). Do not invent mocks. Example: + + ```ts + import { logger, eventBus, temporalClient } from "@/lib/singletons"; + + UserProfile: defineFactory({ + create: async (data, ctx) => { + const svc = new UserProfileService({ + db: ctx.executor, + logger, + eventBus, + temporal: temporalClient, + }); + return svc.create(data); + }, + }), + ``` +5. **Framework-scoped dependencies (NestJS provider, Fastify plugin, Rails concern)** — + bootstrap the smallest containing module and resolve the service from it. If that turns + into a 50-line boilerplate, that's a signal the composition root should expose a helper + the factory can call; add the helper to the app and use it. Still never `db.create()`. +6. **Impossible** — if you genuinely can't wire the dependencies without rewriting the + service, STOP and ask the user. Do NOT fall back to raw ORM. + +Never mock, stub, or fake a dependency. The factory must exercise real code. + +## External side effects policy + +Audited creation functions often perform side effects beyond the DB row: enqueueing a +Temporal workflow, hitting the GitHub/Stripe/Slack API, sending an email, publishing to a +message bus, writing a semantic embedding, firing an analytics event, calling an LLM. + +**Your goal is correct DB state, not production-grade external delivery.** The factory MUST +preserve every DB write the real function performs (including writes to sibling tables +done by ORM hooks, framework hooks, triggers). It is NOT responsible for making every +network call succeed. Order of preference: + +1. 
**Call the real function with real side effects.** If Temporal/GitHub/Stripe clients are + already wired for the test environment (sandbox keys, a local Temporal dev server, + mocked SDKs in test config), just call through. Cleanest option when infra is available. +2. **Use the app's existing test-mode toggle.** Most apps have one: an env var + (`NODE_ENV=test`, `DISABLE_WORKFLOWS=1`, `ANALYTICS_DISABLED=1`), a feature flag, a + null-object client injected in tests. Find it, set it on the handler's environment, and + call the real function. +3. **Wrap external-only calls and let them no-op on failure.** If no toggle exists and the + call would fail in the test environment, the acceptable pattern is to try/catch the + outbound call inside the real function's wrapper — not inside a rewritten factory body. + Prefer exposing a toggle in the app over adding try/catch at the factory layer. Only use + this for calls whose failure does not affect DB state under test. If a test later + asserts on a row the side effect would have created, make it succeed (option 1 or 2). +4. **Reimplement the DB writes inline.** NEVER. If you find yourself typing + `db..create` inside a factory to replicate what a hook or workflow would + have done, STOP. That means the function wasn't truly "called" — you re-wrote it. Go + back to option 1 or 2, or ask the user. + +**What you are NOT allowed to skip:** + +- Password hashing, slug generation, ID derivation, normalisation — pure CPU work inside + the creation function; calling the function gets them for free. +- DB writes performed by ORM hooks / framework hooks / triggers on the model being created. + Better Auth's `databaseHooks.user.create` writes to Organization, Member, BillingCustomer + — if you call `db.user.create()` instead of the real signup function, those rows go + missing and every test that reads them breaks silently. +- Writes to sibling tables done by the creation function itself (e.g. 
`createProject` + writing a default Folder row). If you don't call the function, those rows go missing too. ## CRITICAL: Before Writing Any Code **Ask the user for confirmation** before implementing. Present your plan: -> "I'm about to implement the Autonoma Environment Factory endpoint. Here's what I'll do: +> "I'm about to set up the Autonoma SDK. Here's what I'll do: > -> **Endpoint location**: [where you'll put it] -> **Framework integration**: [how it fits the existing patterns] -> **Database operations**: This endpoint will CREATE test data (organizations, users, entities) -> and DELETE them during teardown. It will NOT modify or delete any existing data. -> **Security**: HMAC-SHA256 request signing + JWT-signed refs for safe teardown +> **SDK packages**: [list packages to install] +> **Endpoint location**: [where the handler file will go] +> **Scope field**: [e.g., organizationId] +> +> **Models needing extraction (`needs_extraction: true`)**: +> - [Model]: inline in `[file]#[block]` → will extract to `[new file]#[new function]` +> - ... +> +> **Factories to register** (from entity-audit.md): +> - [Model]: calls `[file]#[function]` (DI: [top-level import / `new Service(ctx.executor)` / composition-root singleton]; side effects: [list, or "none — future-proofs against added logic"]) +> - ... +> +> **External side effects strategy**: [test-mode toggle name / sandbox credentials / try-catch wrapper] +> +> **Raw SQL fallback** (no creation code in audit): [list] +> +> **Auth callback**: [how sessions/tokens will be created] +> +> **Database operations**: The SDK creates test data by calling the factories you register +> (or raw SQL for models without creation code). It deletes only what it created during +> teardown (verified by a signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on +> existing data. 
> > **Environment variables needed**: -> - `AUTONOMA_SIGNING_SECRET` — shared secret for HMAC request verification -> - `AUTONOMA_JWT_SECRET` — secret for signing/verifying refs tokens +> - `AUTONOMA_SHARED_SECRET` — shared with Autonoma for HMAC request verification +> - `AUTONOMA_SIGNING_SECRET` — private, for signing refs tokens > > To generate these secrets, run: > ```bash @@ -61,123 +415,296 @@ Your input is `autonoma/scenarios.md`. Your output is working endpoint code with > Run this command TWICE — once for each secret. Use DIFFERENT values for each. > Set them in your `.env` file (or equivalent): > ``` -> AUTONOMA_SIGNING_SECRET= -> AUTONOMA_JWT_SECRET= +> AUTONOMA_SHARED_SECRET= +> AUTONOMA_SIGNING_SECRET= > ``` > > Shall I proceed?" **Do NOT proceed until the user confirms.** -## Implementation Requirements +## Implementation -### Always Implement on the Backend +### 1. Install SDK packages -Find the project's backend and implement the endpoint there. Look for: -- API route directories (e.g., `app/api/`, `pages/api/`, `src/routes/`, `lib/`) -- Existing endpoint patterns to match -- If it's a monorepo, find the backend package/app +Pick the correct packages for the project's stack: -If you can't find the backend, ask the user where it is. +| Your ORM | Package | +|----------|---------| +| Prisma | `@autonoma-ai/sdk-prisma` | +| Drizzle | `@autonoma-ai/sdk-drizzle` | -### Environment Variables +| Your Framework | Package | +|----------------|---------| +| Next.js App Router, Hono, Bun, Deno | `@autonoma-ai/server-web` | +| Express, Fastify | `@autonoma-ai/server-express` | +| Node.js http | `@autonoma-ai/server-node` | -Always use these exact names: -- `AUTONOMA_SIGNING_SECRET` — for HMAC-SHA256 request verification -- `AUTONOMA_JWT_SECRET` — for JWT signing of refs tokens +Always install `@autonoma-ai/sdk` as the core package. -### Security Layers (All Required) +### 2. Do the extractions FIRST -1. 
**Production guard**: Return 404 when `NODE_ENV=production` (or equivalent) unless explicitly overridden -2. **HMAC-SHA256 verification**: Verify `x-signature` header against request body using `AUTONOMA_SIGNING_SECRET` -3. **Signed refs (JWT)**: Sign refs in `up` response, verify in `down` request using `AUTONOMA_JWT_SECRET` +Before writing the handler, walk every `needs_extraction: true` model in the audit and do +the extraction per Branch 1 of the decision tree. After each extraction, update +`autonoma/entity-audit.md` in-place. This must happen before Step 3 — the handler imports +these new exports by name. -### Creation and Teardown Order +### 3. Create the endpoint handler + +Write a single handler file that: +1. Imports and configures the ORM adapter with the scope field +2. Registers factories for EVERY model with `independently_created: true` in entity-audit.md +3. Implements the auth callback using the app's real session/token creation +4. Passes both secrets from environment variables -- **Up**: Create parent entities before children (org → users → projects → tests → runs) -- **Down**: Delete in REVERSE order (runs → tests → projects → users → org) -- Do NOT rely on ORM cascade behavior — explicit deletion is safer -- Use `testRunId` in all unique fields to prevent parallel test collisions +Match existing codebase patterns — import style, file organization, error handling. + +### 4. Register factories (one per model with creation code) + +For every entry in entity-audit.md with `independently_created: true`: + +- Import the function from `creation_file` (post-extraction if Branch 1 applied) +- Wrap it in `defineFactory({ create, teardown? 
})` from `@autonoma-ai/sdk` +- In `create`: call the imported function with the resolved data and return at least `{ id }` (the primary key) +- Optionally define `teardown` for custom cleanup (SQL DELETE is the default) + +#### The one thing you MUST NOT do + +Do not re-implement the creation logic inline using the ORM, even if calling the real function +is inconvenient (constructor arguments, DI containers, weird signatures). The entire point of +the factory is to stay on the user's code path so that when they add business logic later — +password hashing, audit logs, Stripe sync, state-machine transitions — the test data gets it +for free. Inline ORM calls bypass all of that silently and are the #1 bug source in generated +factories. -### Endpoint Actions +**A raw ORM/DB write MUST NEVER appear in a factory body for an `independently_created: true` +model.** There are no exceptions. Exact patterns vary by language/ORM — a non-exhaustive list: -| Action | Purpose | -|------------|-------------------------------| -| `discover` | Return available scenarios | -| `up` | Create scenario data, return auth + refs | -| `down` | Verify refs token, delete data | +- TypeScript/JavaScript: `prisma.<model>.create(`, `db.<model>.create(`, `tx.insert(`, `drizzle.insert(`, `knex('<table>').insert(`, `sequelize.models.<Model>.create(`, `typeorm.getRepository(...).save(`, `mongoose.Model.create(`, `await <Model>.create(`, `<Model>.upsert(` +- Python: `session.add(`, `session.execute(insert(...))`, `Model.objects.create(`, `Model(...).save(`, `db.session.add(`, `conn.execute("INSERT ...")` +- Ruby/Rails: `<Model>.create(`, `<Model>.create!(`, `<Model>.new(...).save`, `<Model>.insert(`, `ActiveRecord::Base.connection.execute("INSERT ...")` +- PHP/Laravel: `<Model>::create(`, `new <Model>(...)->save()`, `DB::table('...')->insert(`, `$repository->persist(` +- Java/Spring: `entityManager.persist(`, `<repository>.save(`, `jdbcTemplate.update("INSERT ...")` +- Go: `db.Create(`, `gorm.DB.Create(`, `sq.Insert(`, raw `db.Exec("INSERT ...")` / `db.ExecContext(...)` +- Elixir/Ecto:
`Repo.insert(`, `Repo.insert!(`, `Repo.insert_all(` +- Rust: `diesel::insert_into(`, `sqlx::query!("INSERT ...")`, `sea_orm::ActiveModel ... .insert(` +- Raw SQL anywhere: an `INSERT INTO ` string literal passed to a query/exec/prepare API -## CRITICAL: Test Within the Session +If you wrote one of these inside a factory body for a model whose audit says +`independently_created: true`, you took the trap. Delete it. Go back to the per-model decision +tree and the DI playbook. -After implementing the endpoint, you MUST test it to verify it works: +**WRONG — re-implementing creation logic inline (this is the trap):** -1. **Check if the dev server is running** or start it -2. **Generate temporary secrets** for testing: - ```bash - export AUTONOMA_SIGNING_SECRET=$(openssl rand -hex 32) - export AUTONOMA_JWT_SECRET=$(openssl rand -hex 32) - ``` +```ts +// entity-audit.md said: creation_function = OnboardingManager.getState +OnboardingState: defineFactory({ + create: async (data) => { + // Bypasses OnboardingManager entirely. If the user adds logic later, tests silently diverge. + return db.onboardingState.create({ data: { applicationId: data.applicationId, step: "welcome" } }); + }, +}), +``` -3. **Test the discover action**: - ```bash - BODY='{"action":"discover"}' - SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //') - curl -s -X POST http://localhost:PORT/api/autonoma \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY" | python3 -m json.tool - ``` +**RIGHT — call the audit's identified function, even if you have to instantiate a class:** -4. 
**Test the up action** (for each scenario): - ```bash - BODY='{"action":"up","environment":"standard","testRunId":"test-001"}' - SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //') - UP=$(curl -s -X POST http://localhost:PORT/api/autonoma \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY") - echo "$UP" | python3 -m json.tool - ``` +```ts +import { OnboardingManager } from "@/lib/onboarding-manager"; -5. **Test the down action** using refs from up: - ```bash - REFS=$(echo "$UP" | python3 -c "import sys,json; print(json.dumps(json.load(sys.stdin)['refs']))") - TOKEN=$(echo "$UP" | python3 -c "import sys,json; print(json.load(sys.stdin)['refsToken'])") - BODY=$(python3 -c "import json; print(json.dumps({'action':'down','testRunId':'test-001','refs':json.loads('$REFS'),'refsToken':'$TOKEN'}))") - SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //') - curl -s -X POST http://localhost:PORT/api/autonoma \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY" | python3 -m json.tool - ``` +OnboardingState: defineFactory({ + create: async (data, ctx) => { + // Uses the real code path. Any business logic added later flows through automatically. + const manager = new OnboardingManager(ctx.executor); + return manager.getState(data.applicationId); + }, +}), +``` + +### 4b. Populate `tableNameMap` sparsely (do not mirror the factory registry) + +The SDK auto-derives model names from SQL tables by splitting on `_` and PascalCasing +each part. **No pluralization is performed.** `organization` → `Organization`; +`organizations` → `Organizations`; `api_key` → `ApiKey`; `api_keys` → `ApiKeys`. + +Do NOT write a `tableNameMap` / `table_name_map` that mirrors your factory registry +1:1. That doubles the maintenance surface and is a silent-breakage foot-gun — adding a +new model forces two edits and forgetting one silently misroutes creates. 
+ +**Algorithm to follow before writing the map:** -6. **Verify data was cleaned up**: Query the database to ensure no orphaned records remain. +1. List every factory key you intend to register. +2. For each key, compute `autoName = snakeToPascal(dbTable)` — split on `_`, PascalCase + each part, concatenate. No pluralization step. +3. If `autoName === factoryKey`: **do not add** the entry. +4. If `autoName !== factoryKey`: add the entry. +5. If after step 4 the map is empty, **omit the `tableNameMap` field entirely**. -If any test fails, fix the implementation and re-test. +**Worked example (plural DB tables, singular factory keys):** + +```ts +// DB tables: organizations, users, api_keys +// Factory keys: Organization, User, ApiKey +// Every auto-derived name disagrees → every factory needs one entry: +tableNameMap: { + Organization: 'organizations', + User: 'users', + ApiKey: 'api_keys', +}, +factories: { Organization: ..., User: ..., ApiKey: ... }, +``` + +**Worked example (singular DB tables):** + +```ts +// DB tables: organization, user, api_key +// Factory keys: Organization, User, ApiKey +// Every auto-derived name matches → omit tableNameMap entirely. +factories: { Organization: ..., User: ..., ApiKey: ... }, +``` + +**Red flag.** If `tableNameMap` ends up with exactly one entry per factory and every +entry is a plural↔singular rename, you have two options: + +- (a) Keep the map (verbose but explicit). +- (b) Change factory keys to match the plural auto-derived names (`Organizations`, + `Users`, `ApiKeys`) and drop the map entirely. + +Prefer (b) unless scenario files already use the singular convention. A `tableNameMap` +that is a 1:1 copy of the factory registry means you're doing work the SDK already +does. + +### 5. Register the route + +Add the endpoint to the app's routing. + +### 6. Set up environment variables + +Add `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` to `.env`. If `.env.example` exists, add placeholders. 
+ +## Smoke test + +Before writing the sentinel, run a single `discover` call to confirm the endpoint is wired +up and HMAC works. Do NOT run `up` or `down` here — that is the scenario-validator's job. + +```bash +export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)} +export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)} + +BODY='{"action":"discover"}' +SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') +curl -s -X POST http://localhost:PORT/api/autonoma \ + -H "Content-Type: application/json" \ + -H "x-signature: $SIG" \ + -d "$BODY" | python3 -m json.tool +``` + +Expected: JSON with `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`. + +If this fails, fix the handler (likely the adapter config or route mount) before writing +the sentinel. + +## CRITICAL: Factory-integrity check (before writing the sentinel) + +Prove every factory calls the audit's identified `creation_function`. This is deterministic +static analysis, not a vibe check. Run it yourself and HALT if it fails — the next step +(scenario-validator) runs the exact same check and will kick the work back. + +### Step A — collect the audit targets + +Parse `autonoma/entity-audit.md` and build a list of `(model, creation_file, creation_function)` +for every model with `independently_created: true`. Also flag any entry that still has +`needs_extraction: true` — that's a bug (you were supposed to extract first and clear the +flag). HALT and go do the extraction. + +### Step B — grep the handler for the anti-pattern + +```bash +grep -nE '(prisma|db|tx)\.[a-zA-Z_]+\.(create|createMany|insert|upsert)\(' +``` + +Every match inside a `defineFactory({ create })` body is a RED FLAG. The only legitimate +matches are: +- Inside a model's `teardown` body (custom cleanup is allowed). +- Outside any `defineFactory` (auth callback, scope helpers, etc.). 
+- Inside a factory for a model the audit marked `independently_created: false` (no service exists; + raw ORM is the documented fallback — though the SDK does this automatically, so you usually + shouldn't even write such a factory). + +Anything else is the trap. Do NOT ship it. + +### Step C — per-model structural check + +For each `(model, creation_file, creation_function)` from Step A, verify ALL of: + +1. An `import` (or `require`) line pulls `creation_function` — or the class/object that owns + it — into the handler file, from a path that resolves to `creation_file`. +2. The factory body for `model` invokes that identified symbol (e.g. `manager.getState(...)`, + `createUser(...)`, `ProjectService.create(...)`, `service.create(...)`). +3. The factory body does NOT contain a raw ORM write for `model` (`db.<model>.create(...)`, + `prisma.<model>.create(...)`, `tx.insert(Table)`, etc.). + +If any model fails any of the three, STOP. Fix the factory per the per-model decision tree +and the DI playbook, then re-run this check from Step A. + +### Step D — commit only when clean + +Only write `autonoma/.endpoint-implemented` after: +- Every `needs_extraction: true` flag in the audit has been resolved. +- Step B returns zero anti-pattern matches inside factory bodies. +- Step C passes for every audited model. +- The discover smoke test returns 200 with the expected schema shape. + +If you extracted any route-handler or framework-hook logic into a new exported function +(per Branch 1), the audit must have been updated in-place; re-read it after the edit before +running Step A. + +## CRITICAL: Write the implementation sentinel + +After the discover smoke test passes AND the factory-integrity check passes, use the +`Write` tool to create `autonoma/.endpoint-implemented` with a short plain-text summary: + +``` +Endpoint implemented. 
+- handler: +- packages: +- factories registered: +- extractions performed: +- scope field: +- auth callback: +``` + +Do NOT use `touch` — the hook fires only on `Write`/`Edit`. + +The next step (scenario-validator) will exercise up/down for every scenario and write +`autonoma/.endpoint-validated`. E2E test generation is blocked until that happens. ## What to Explain to the User -After implementation, explain: +After implementation and validation, explain: + +1. **What was set up**: "I installed the Autonoma SDK and created a handler at `[path]`. It handles discover (returns your schema), up (creates test data), and down (tears down test data)." + +2. **Extractions performed**: For each `needs_extraction: true` model, show the inline block → new exported function mapping, and confirm the original caller now invokes the new function. -1. **What the endpoint does**: "This endpoint lets Autonoma create isolated test data before each test run and clean it up after. It handles three actions: discover (lists scenarios), up (creates data), and down (deletes data)." +3. **Factories registered**: List each factory — which function it wraps, which DI pattern was used, and what side effects the audit observed (or "none — factory is registered to future-proof"). -2. **Why it's secure**: "Three security layers protect your data: - - Production guard: The endpoint returns 404 in production - - Request signing: Every request is verified with HMAC-SHA256 using your signing secret - - Signed refs: Teardown can only delete data that was actually created by the endpoint, verified by JWT" +4. **External side effects strategy**: which toggle/sandbox/wrapper was used. -3. **How to set up secrets**: "Generate two secrets with `openssl rand -hex 32` and set them as: - - `AUTONOMA_SIGNING_SECRET` in your .env file - - `AUTONOMA_JWT_SECRET` in your .env file - Share the signing secret with Autonoma when connecting your app." +5. 
**How to set up secrets**: "Generate two secrets with `openssl rand -hex 32` and set them as: + - `AUTONOMA_SHARED_SECRET` — share this with Autonoma + - `AUTONOMA_SIGNING_SECRET` — keep this private" -4. **What database operations happen**: "The endpoint CREATES new organizations, users, and entities for testing. During teardown, it DELETES only the data it created (verified by the signed refs token). It never modifies or deletes existing data." +6. **Safety**: "The SDK can only INSERT records via the factories you registered (which call the user's real creation functions) or raw SQL for models without creation code. Teardown only deletes records that were created (verified by a cryptographically signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on existing data." ## Important -- Always prefer implementing in the project's existing backend — don't create a standalone server -- Match existing code patterns and conventions in the project +- Always implement in the project's existing backend — don't create a standalone server +- Match existing code patterns and conventions - Use the same ORM/database layer the project already uses -- Handle circular foreign keys with transaction-wrapped deletion -- Always use `testRunId` to make unique fields (emails, org names) to prevent parallel test collisions -- Test the FULL lifecycle (discover → up → down) within the session +- Register factories for EVERY model with `independently_created: true` in the audit — no exceptions, even for thin wrappers +- Resolve every `needs_extraction: true` by extracting FIRST, then wiring the factory +- Never reimplement the user's creation logic in a factory — always call their function +- `db.<model>.create()` in a factory for an `independently_created: true` model is NEVER acceptable +- ALL database writes go through the SDK endpoint — never write directly +- Use `testRunId` to make unique fields (emails, org names) to prevent parallel test collisions +- Validate the FULL lifecycle 
(discover → up → verify → down → verify) before completing diff --git a/agents/focused-test-case-generator.md b/agents/focused-test-case-generator.md new file mode 100644 index 0000000..4cda3a5 --- /dev/null +++ b/agents/focused-test-case-generator.md @@ -0,0 +1,207 @@ +--- +description: > + Generates E2E test cases focused on a specific user-defined topic or feature area as markdown files from the knowledge base and scenarios. + Creates an INDEX.md with test distribution metadata and individual test files + with YAML frontmatter for deterministic validation. +tools: + - Read + - Glob + - Grep + - Write + - Bash + - Agent + - WebFetch +maxTurns: 80 +--- + +# Focused E2E Test Case Generator + +You generate E2E test cases scoped to a specific topic or feature area as markdown files. Your inputs are: +- `FOCUS_PROMPT` — the user-defined focus topic. **Every test you write must be relevant to this topic. Do not generate tests outside the requested scope.** +- `FOCUS_SLUG` — the output folder name +- `autonoma/AUTONOMA.md` (knowledge base with core flows in frontmatter) — if it exists +- `autonoma/skills/` (skill files for navigation) — if they exist +- `autonoma/scenarios.md` (test data scenarios with frontmatter) — if it exists +- `EXISTING_TESTS` — a list of existing test titles (to avoid duplication) — if provided + +Your output is a directory `autonoma/qa-tests/{FOCUS_SLUG}/` containing: +1. `INDEX.md` — index with test distribution metadata +2. Subdirectories organized by sub-feature within the focus area, each containing test files + +## Instructions + +1. First, fetch the latest test generation instructions: + + Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt` + and follow those instructions for how to generate tests — except scope all tests to the `FOCUS_PROMPT`. + +2. 
Read all available input files: + - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count (if it exists) + - All files in `autonoma/skills/` (if they exist) + - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields** (if it exists) + - If neither `autonoma/AUTONOMA.md` nor `autonoma/scenarios.md` exists, scan the codebase for routes and features relevant to the focus area + +3. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter + declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has + a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`. + When writing test steps that involve a variable field value — typing it, asserting it, or + navigating to it — you MUST use the `{{token}}` placeholder, never the hardcoded literal from + the scenario body. At runtime the agent resolves these tokens to their actual values. + + Example: if `variable_fields` includes `{{deadline_1}}` for `Tasks.deadline`: + - good: "assert the task deadline shows `{{deadline_1}}`" + - bad: "assert the task deadline shows 2025-06-15" + +4. Review the `EXISTING_TESTS` list provided (if any). Do not generate tests + whose title or purpose substantially duplicates an existing test. + +5. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test. + The scenarios exist only to provide preconditions and known data for app behavior tests. + Do NOT generate tests whose purpose is to verify: + - that the scenario contains the documented entity counts + - that every scenario row, seed, or example value exists + - that the Environment Factory created data correctly + - that `standard`, `empty`, or `large` themselves are "correct" as artifacts + + Only reference scenario data when it is necessary to exercise a real user-facing flow within + the focus area. + +6. 
Count the routes/features/pages in the codebase relevant to the focus area to establish the + coverage correlation. Focus strictly on what belongs to `FOCUS_PROMPT` — do not pad with + unrelated tests. + +7. Generate test files organized in subdirectories by sub-feature within the focus area. + +8. Write `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` FIRST (before individual test files). + +9. Write individual test files into subdirectories. + +## CRITICAL: INDEX.md Format + +The file `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` MUST start with YAML frontmatter in this exact format: + +```yaml +--- +total_tests: 18 +total_folders: 3 +folders: + - name: "sign-document" + description: "Signing a document from start to finish" + test_count: 8 + critical: 3 + high: 3 + mid: 1 + low: 1 + - name: "signature-edge-cases" + description: "Edge cases in the signing flow" + test_count: 6 + critical: 1 + high: 2 + mid: 2 + low: 1 + - name: "document-management" + description: "Document upload, deletion, and access control" + test_count: 4 + critical: 0 + high: 2 + mid: 1 + low: 1 +coverage_correlation: + routes_or_features: 6 + expected_test_range_min: 18 + expected_test_range_max: 30 +--- +``` + +### INDEX Frontmatter Rules + +- **total_tests**: Sum of all tests across all folders. Must be a positive integer. +- **total_folders**: Number of subdirectories. Must match the length of `folders` list. +- **folders**: One entry per subdirectory. Each has: + - `name`: Folder name (kebab-case, matches the actual subdirectory name) + - `description`: What this folder covers within the focus area + - `test_count`: Number of test files in this folder + - `critical`, `high`, `mid`, `low`: Count of tests at each criticality level. **Must sum to test_count.** +- **coverage_correlation**: Explains why the test count makes sense for the focus area. 
+ - `routes_or_features`: Number of distinct routes/features relevant to the focus + - `expected_test_range_min`: Lower bound of expected tests (routes_or_features * 3) + - `expected_test_range_max`: Upper bound of expected tests (routes_or_features * 5, or higher for core-heavy focus areas) + - **total_tests must fall within [expected_test_range_min, expected_test_range_max]** + +### After the INDEX frontmatter + +The body of INDEX.md should contain: +- A human-readable summary of what the focused test suite covers +- A table listing every folder with its test count and description +- A table listing every test file with its title, criticality, scenario, and flow + +## CRITICAL: Individual Test File Format + +Each test file in `autonoma/qa-tests/{FOCUS_SLUG}/{folder-name}/` MUST start with YAML frontmatter: + +```yaml +--- +title: "Sign a document with valid credentials" +description: "Verify a user can complete the signing flow for a standard document" +criticality: critical +scenario: standard +flow: "Document Signing" +--- +``` + +### Test File Frontmatter Rules + +- **title**: Short, descriptive test name (string, non-empty) +- **description**: One sentence explaining what the test verifies (string, non-empty) +- **criticality**: Exactly one of: `critical`, `high`, `mid`, `low` +- **scenario**: Which scenario this test uses — `standard`, `empty`, or `large`. If `scenarios.md` + does not exist, use `standard` as the default. +- **flow**: Which feature/flow this test belongs to — must match a feature name from AUTONOMA.md + frontmatter if that file exists, otherwise use a descriptive name for the focus sub-feature. 
+ +### After the test frontmatter + +The body follows the standard Autonoma test format from the fetched instructions: +- **Setup**: Scenario reference and any preconditions +- **Steps**: Numbered list using only: click, scroll, type, assert +- **Expected Result**: What should be true when the test passes + +## Test Distribution Guidelines + +- Focus budget entirely on the `FOCUS_PROMPT` domain — every test must belong to the focus topic +- Within the focus area, apply the same criticality distribution: + - Core sub-flows of the focus (from AUTONOMA.md where `core: true`, scoped to the topic): mostly `critical` and `high` + - Supporting sub-flows: mostly `high` and `mid` + - Settings/admin within the focus: mostly `mid` and `low` +- Never write conditional steps — each test follows one deterministic path +- Assertions must specify exact text, element, or visual state +- Reference scenario data by exact values from scenarios.md, EXCEPT for variable fields — use `{{token}}` placeholders for those +- Do not spend test budget "auditing" scenario contents. Scenario data is setup, not the product behavior under test. +- Do not write meta-tests such as "verify the seeded counts match scenarios.md" or "verify the Environment Factory created the right fixtures" +- If a seeded value is not needed for a user-facing flow within the focus area, do not assert it just because it exists in scenarios.md +- Do not duplicate any test from `EXISTING_TESTS` + +## Validation + +Hook scripts will automatically validate your output when you write files. If validation fails, +you'll receive an error message. Fix the issue and rewrite the file. 
+ +**INDEX.md validation checks:** +- Frontmatter contains total_tests, total_folders, folders, coverage_correlation +- Folder criticality counts sum to test_count per folder +- Sum of all folder test_counts equals total_tests +- total_tests falls within expected_test_range + +**Individual test file validation checks:** +- Frontmatter contains title, description, criticality, scenario, flow +- criticality is one of: critical, high, mid, low +- All string fields are non-empty + +## Important + +- Write INDEX.md FIRST, then individual test files +- The folder names in INDEX.md must match actual subdirectory names +- Use subagents to parallelize test generation across folders +- Each test must be self-contained — no dependencies on other tests +- Do not write code (no Playwright, no Cypress) — tests are markdown with natural language steps +- Stay within the focus scope — quality and relevance over quantity diff --git a/agents/kb-generator.md b/agents/kb-generator.md index f26e998..cd83f42 100644 --- a/agents/kb-generator.md +++ b/agents/kb-generator.md @@ -21,22 +21,38 @@ You generate a structured knowledge base for a codebase. Your output MUST be wri ## Instructions -1. First, fetch the latest knowledge base generation instructions: +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt` - and follow those instructions for how to analyze the codebase. + To fetch a doc, run the bash command literally — the shell expands the path, not you: -2. Create the output directory if it doesn't exist: + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. 
Do not retry with a different host. There is no fallback. + +2. Fetch the latest knowledge base generation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt" + ``` + + Read the output and follow those instructions for how to analyze the codebase. + +3. Create the output directory if it doesn't exist: ```bash mkdir -p autonoma/skills ``` -3. Follow the fetched instructions to analyze the codebase — discover the application, +4. Follow the fetched instructions to analyze the codebase — discover the application, map pages and flows, identify core workflows. -4. Write the output to `autonoma/AUTONOMA.md`. +5. Write the output to `autonoma/AUTONOMA.md`. -5. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered. +6. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered. ## CRITICAL: Output Format diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md index cfb7aa3..57cc418 100644 --- a/agents/scenario-generator.md +++ b/agents/scenario-generator.md @@ -21,20 +21,128 @@ and `autonoma/skills/`. Your output MUST be written to `autonoma/scenarios.md` w ## Instructions -1. First, fetch the latest scenario generation instructions: - - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt` - and follow those instructions for how to design scenarios. - -2. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types. - -3. Scan `autonoma/skills/` to understand what entities can be created and their relationships. - -4. Explore the backend codebase to map the data model (database schema, API routes, types). - -5. Design three scenarios: `standard`, `empty`, `large`. - -6. Write the output to `autonoma/scenarios.md`. +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. 
The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. + + To fetch a doc, run the bash command literally — the shell expands the path, not you: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. Fetch the latest scenario generation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-scenarios.txt" + ``` + + Read the output and follow those instructions for how to design scenarios. + +3. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types. + +4. Read `autonoma/entity-audit.md` — this is the authoritative schema map from Step 2. + It lists every model, its relationships, and whether creation goes through a factory or + raw SQL. Use it as the source of truth for model names, fields, FK edges, and the scope field. + +5. Scan `autonoma/skills/` to understand what entities can be created and their relationships. + +6. Explore the backend codebase only to fill gaps the audit does not cover (e.g. enum values, + string length limits, constraint details). + +7. **Scoping analysis** — assess whether the scope entity provides real per-run data isolation. + Ask: does the scope entity parent most other models via required FKs? Can a new scope entity + be created per test run (i.e. it has creatable fields beyond auto-generated IDs)? Do most + models eventually chain back to the scope entity? + + If yes to all: the app has natural multi-tenant isolation — each test run creates its own + scope entity and all child data is automatically partitioned. + + If the scope entity is a singleton, shared across users, or does not meaningfully partition + data across concurrent runs: the app **lacks natural per-run isolation**. 
In this case you + MUST slug all identifying fields with `{{testRunId}}` (see step 9) so parallel or sequential + test runs never collide on lookup, search, or assertion values. + +8. Design three scenarios: `standard`, `empty`, `large`. + +9. **Variable fields.** Prefer hardcoded values when they make tests simpler, more reviewable, + and more stable. If a field needs run-level uniqueness but can still be expressed as a + concrete literal, prefer a planner-chosen hardcoded value with a discriminator suffix over + introducing a variable placeholder. + Example: prefer `Acme Project qa-17` encoded as a concrete value over turning the field + into `{{project_name}}` unless later tests truly need the placeholder. + + **Exception — apps without natural per-run isolation:** if your scoping analysis determined + the app lacks natural multi-tenant isolation, **reverse the default**. Slug ALL identifying + fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline + `{{testRunId}}` so every value a test might search, type, or assert on screen is unique to + that test run. Pattern: `Concrete Value {{testRunId}}` (e.g. `Acme Corp {{testRunId}}`). + Each slugged field becomes a `variable_field` entry with `generator: derived from testRunId`. + + Use variable fields sparingly. 
Only mark a value as variable when at least one of these is true: + - the field must be globally unique or is highly collision-prone across runs + - the backend or SDK generates the value at runtime + - the value is inherently time-based, unstable, or nondeterministic + - hardcoding it would make later tests misleading or brittle + - **the app lacks natural per-run isolation** and the field is used in lookups, searches, or assertions + + Fields that are time-sensitive (dates, deadlines, timestamps) or have any uniqueness/format + constraint enforced by the database or application **must** be variable — hardcoding them + will cause test failures when the hardcoded value expires or collides. + + Do not mark a field as variable just because it is user-facing text, could be unique in + theory, or you want to avoid choosing a concrete literal. + + Every variable field must have: + - a double-curly token such as `{{project_title}}` + - the entity field it belongs to, such as `Project.title` + - the scenario names that use it + - a reason explaining why it truly must vary + - a plain-language test reference such as `({{project_title}} variable)` + + `generator` is optional. Use a short free-form strategy note such as `derived from testRunId`, + `planner literal plus discriminator`, `backend-generated`, `UUID suffix`, or `timestamp-based`. + Do not default to `faker`. Prefer deterministic derivation from stable inputs, and use `faker` + only as a last resort. + +10. **Nested tree constraint.** Design scenario entity tables so they can be expressed as a + nested tree rooted at the scope entity. Step 4 (env-factory) and Step 5 (scenario-validator) + will convert scenarios into nested `create` payloads — flat cross-model structures connected + only by `_ref` break when JSON key order is not preserved. Children must nest under their + parent using the relation field names from the audit. Use `_ref` only for cross-branch + references that cannot be expressed through nesting. 
+ +11. **Standalone vs via-owner choice.** For every model that appears in a scenario, consult + the audit and pick one of two paths: + + - If the model has `independently_created: true` and the scenario narrative wants it + in isolation (e.g. the user creates a child directly, independent of any root), add + it as a top-level tree node. The SDK will call its factory directly. + - If the model appears in some owner's `created_by` list and the scenario narrative + already includes that owner (e.g. the scenario already has the root, and a default + child / onboarding row / deployment row comes along for free), **do NOT add the + model as a separate node**. It is created as a side effect of the owner's factory. + Quote the `why` from the audit in the scenario prose so the reader knows where it + came from. + + **Dual models** (`independently_created: true` AND listed in someone's `created_by`) + get to pick per-scenario: + + - Narrative where the root is being created for the first time → the child comes in + via the owner (via-owner path). + - Narrative where the root already exists and the user is creating a standalone child + → the child is a top-level node (standalone-factory path); its owner is also in + the tree, as its FK parent. + + Never double-create a dependent. If the audit says an owner mints a dependent row + inline, and your scenario has that owner, the dependent must not appear as a separate + tree node — the factory already creates it, and adding it twice will either fail + uniqueness checks or produce confusing test state. + +12. Write the output to `autonoma/scenarios.md`. 
## CRITICAL: Output Format @@ -62,6 +170,19 @@ entity_types: - name: "Test" - name: "Run" - name: "Folder" +variable_fields: + - token: "{{project_title}}" + entity: "Project.title" + scenarios: + - standard + - large + generator: "planner literal plus discriminator" + reason: "title must be unique per test run" + test_reference: "({{project_title}} variable)" +planning_sections: + - schema_summary + - relationship_map + - variable_data_strategy --- ``` @@ -75,10 +196,28 @@ entity_types: - `total_entities`: Total count of entities created in this scenario - **entity_types**: List of ALL entity types discovered in the data model. Each has: - `name`: Entity type name (e.g., "User", "Project", "Run") +- **variable_fields**: List of generated or per-run values that tests must not treat as + hardcoded literals. May be `[]` if no variable fields are needed. Each entry has: + - `token`: double-curly placeholder such as `{{project_title}}` + - `entity`: entity field path such as `Project.title` + - `scenarios`: list of scenario names that use this variable + - `reason`: why this field must be generated + - `test_reference`: how tests should refer to the value in natural language + - optional `generator`: free-form generation hint such as `derived from testRunId` +- **planning_sections**: A list describing which planning artifacts are present. It must include: + - `schema_summary` + - `relationship_map` + - `variable_data_strategy` + - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation + and you need to explain why fields were aggressively slugged with `{{testRunId}}` ### After the frontmatter The rest of the file follows the standard scenarios.md format from the fetched instructions: +- Include a `## Schema Summary` section listing the key models and required fields driving the scenarios. +- Include a `## Relationship Map` section describing parent/child and FK relationships. 
+- Include a `## Variable Data Strategy` section explaining which values are generated and how tests reference them. +- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation. - Scenario: `standard` (credentials, entity tables with concrete data, aggregate counts) - Scenario: `empty` (credentials, all entity types listed as None) - Scenario: `large` (credentials, high-volume data described in aggregate) @@ -90,17 +229,24 @@ you'll receive an error message. Fix the issue and rewrite the file. The validation checks: - File starts with `---` (YAML frontmatter) -- Frontmatter contains scenario_count, scenarios, entity_types +- Frontmatter contains scenario_count, scenarios, entity_types, variable_fields, planning_sections - scenarios list length matches scenario_count - Required scenarios (standard, empty, large) are present - Each scenario has name, description, entity_types, total_entities - entity_types is a non-empty list with name fields +- variable_fields entries use double-curly tokens and known scenario names +- planning_sections includes schema_summary, relationship_map, and variable_data_strategy ## Important -- **The scenario data is a contract.** Tests will assert against these exact values. +- **The scenario data is a contract.** Fixed values are hard assertions; variable fields are explicit placeholders. +- Prefer concrete literals unless the field truly must vary across runs. +- Use variables sparingly. A smaller, justified variable list is better than marking every identity field dynamic. +- Do not default to `faker`. Prefer deterministic strategies — planner-chosen literals with stable discriminators, derivation from `testRunId`, or backend-generated values. 
- Every value must be concrete — not "some applications" but "3 applications: Marketing Website, Android App, iOS App" - Every relationship must be explicit — which entities belong to which - Every enum value must be covered in `standard` - Use subagents to parallelize data model discovery -- If you can't find the database schema, ask the user where the backend is +- Only use `{{testRunId}}` as a template token in scenario BODIES (field values). Custom tokens like `{{user_email_alice}}` are only valid in `variable_fields` declarations — when the SDK resolves payloads at runtime it only knows built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`). If a field needs uniqueness inside the scenario body, inline testRunId: e.g. `alice-{{testRunId}}@test.local`. +- Design scenarios so each entity table can be serialised as a nested tree rooted at the scope entity. Flat cross-model `_ref`-only structures break when JSON key order is not preserved. +- If the audit does not describe a model you need, ask the user rather than guessing. diff --git a/agents/scenario-validator.md b/agents/scenario-validator.md new file mode 100644 index 0000000..f5ec61c --- /dev/null +++ b/agents/scenario-validator.md @@ -0,0 +1,259 @@ +--- +description: > + Validates the Environment Factory endpoint end-to-end by running discover/up/down + against every scenario, iteratively fixing handler bugs and reconciling scenarios.md + with the real behavior. Writes autonoma/.endpoint-validated on success. Hard gate + before E2E test generation. +tools: + - Read + - Glob + - Grep + - Write + - Edit + - Bash + - Agent + - WebFetch +maxTurns: 120 +--- + +# Scenario Validator: iterative fix loop + reality reconciliation + +The Environment Factory endpoint exists (step 4 wrote `autonoma/.endpoint-implemented`). +Your job is to prove it actually works and keep iterating until it does. 
The E2E test +generator (step 6) is gated on your sentinel — if you do not write +`autonoma/.endpoint-validated`, no tests get generated. + +## Database Safety (absolute) + +- ALL writes go through the SDK endpoint only. Never INSERT/UPDATE/DELETE/DROP/TRUNCATE via psql or raw SQL. +- You MAY run SELECT via psql / ORM read queries to verify data. +- The SDK's `down` action deletes only what `up` created (signed refs token). + +## Inputs + +- `autonoma/entity-audit.md` — every model and whether it needs a factory +- `autonoma/scenarios.md` — scenario definitions (may contain mistakes you will correct) +- The handler file created in step 4 +- A running dev server (start one if it is not up — ask the user for the port) +- `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` (for HMAC signing + preflight) + +## Outputs + +- `autonoma/scenario-recipes.json` — validated nested `create` trees per scenario +- `autonoma/.scenario-validation.json` — terminal artifact the orchestrator reads +- `autonoma/.endpoint-validated` — sentinel that gates Step 6 (test generation) + +## The loop + +Repeat until all three actions succeed for every scenario OR you exhaust 5 iterations +(if you hit 5, STOP and report — do not fake success): + +1. Fetch the protocol docs (first iteration only): + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/protocol.txt" + curl -sSfL "$(cat autonoma/.docs-url)/llms/scenarios.txt" + ``` + + If curl fails, STOP and report — do not fabricate a URL. + +2. Export working secrets (same values the handler reads): + + ```bash + export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)} + export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)} + ``` + +3. Run `discover` via curl with proper HMAC. + - The response MUST contain `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`. + - **Coverage check**: every model in `entity-audit.md` MUST appear in `schema.models`. 
If one is missing, fix the handler's model filter / adapter config and restart the loop. + - **Factory coverage check**: open the handler file(s), extract the registered factory names. Every model with `independently_created: true` in the audit MUST be registered. + - **Factory-body integrity check (deterministic, MANDATORY)**: this is the check the env-factory agent is supposed to run before writing its sentinel. Re-run it here; do not trust the upstream. Steps: + 1. Grep the handler file(s) for raw DB/ORM writes. The pattern set must cover every + language and ORM the SDK supports — any of these appearing inside a factory body for a + model with `independently_created: true` is a FAIL: + ```bash + # TypeScript/JavaScript — Prisma, Drizzle, Knex, Sequelize, TypeORM, Mongoose + grep -nE '(prisma|db|tx|trx)\.[a-zA-Z_]+\.(create|createMany|upsert)\(|\b(drizzle|db|tx)\.insert\(|\bknex\([^)]*\)\.insert\(|\.models\.[A-Za-z_]+\.create\(|getRepository\([^)]*\)\.save\(|\bMongoose.*\.create\(' + + # Python — SQLAlchemy, Django ORM + grep -nE '\bsession\.(add|execute|bulk_insert_mappings)\(|\.objects\.create\(|\.save\(\)' + + # Ruby/Rails — ActiveRecord + grep -nE '\b[A-Z][A-Za-z0-9]*\.(create|create!|insert|insert_all)\(|\.new\([^)]*\)\.save' + + # PHP/Laravel — Eloquent, raw DB + grep -nE '\b[A-Z][A-Za-z0-9]*::create\(|->save\(\)|\bDB::table\([^)]*\)->insert\(' + + # Java/Spring — JPA, JDBC + grep -nE '\bentityManager\.persist\(|\b[a-zA-Z]+Repository\.save\(|\bjdbcTemplate\.update\(' + + # Go — GORM, database/sql, squirrel + grep -nE '\.Create\(|\bdb\.Exec(Context)?\(|\bsq\.Insert\(' + + # Elixir/Ecto + grep -nE '\bRepo\.(insert|insert!|insert_all)\(' + + # Rust — Diesel, SQLx, SeaORM + grep -nE '\bdiesel::insert_into\(|\bsqlx::query!?\("INSERT|ActiveModel[^{]*\.insert\(' + + # Raw SQL INSERT in any language + grep -niE '"[^"]*INSERT\s+INTO\b|'"'"'[^'"'"']*INSERT\s+INTO\b' + ``` + Use the pattern set appropriate for the project's stack (determined from the handler file + 
and `entity-audit.md`); include the raw-SQL pattern unconditionally. Any match that + falls inside a factory body for an `independently_created: true` model is a FAIL. + 2. For each `(model, creation_file, creation_function)` from `entity-audit.md`, verify the handler contains both an `import` resolving to `creation_file` AND an invocation of `creation_function` inside that model's factory body. + 3. If any model fails either check, this is a **handler bug** (path 3a of step 4). Fix by importing and calling the audited function. If the audit pointed at an inline route handler (no exported function), extract it into a named exported function in a nearby module, replace the route body with a call to the new function, update `entity-audit.md` in-place with the new `creation_file`/`creation_function`, then restart this step. + 4. The validator MUST NOT write `.endpoint-validated` while any factory body contains a raw ORM create for its own model. + +4. For each scenario in `scenarios.md`: + 1. Build the `{action:"up", create:..., testRunId:"-"}` body from the scenario. + 2. HMAC-sign and POST. + 3. If non-200 or error body, pick one of three paths: + a. **Handler bug** (missing factory, bad FK handling, wrong adapter config) → fix the handler and restart. + b. **Scenario bug** (field does not exist on the model, FK target wrong, scope field missing) → edit `scenarios.md` to match reality and restart. Log the change. + c. **Unfeasible scenario** (requires data the app cannot produce) → REMOVE the scenario from `scenarios.md` with justification. Restart. + 4. If 200: parse `auth`, `refs`, `refsToken`. + - **Auth check**: `auth` MUST be non-null and contain at least one of `{ cookies, headers, token, user }`. If empty, the auth callback is not wired — fix it and restart. + - **Refs check**: every top-level model in the `create` tree MUST appear in `refs`. + 5. Verify DB state with a read-only `SELECT` for at least one refs id. + 6. POST `{action:"down", refsToken}`.
Expect `{ok:true}`. + 7. Verify the refs rows are gone. + +5. After every scenario passes cleanly, emit the scenario recipes. + + Write `autonoma/scenario-recipes.json` with this shape (recipes mirror the `create` + trees you just validated — one entry per scenario): + + ```json + { + "version": 1, + "source": { + "scenariosPath": "autonoma/scenarios.md" + }, + "validationMode": "endpoint-lifecycle", + "recipes": [ + { + "name": "standard", + "description": "Realistic dataset for core flows", + "create": { + "Organization": [{ + "_alias": "org1", + "name": "Acme Corp" + }] + }, + "variables": { + "testRunId": { + "strategy": "derived", + "source": "testRunId", + "format": "{testRunId}" + } + }, + "validation": { + "status": "validated", + "method": "endpoint-up-down", + "phase": "ok", + "up_ms": 12, + "down_ms": 8 + } + } + ] + } + ``` + + Rules: + - top-level keys MUST be exactly `version`, `source`, `validationMode`, `recipes` + - `version` must be integer `1` + - `validationMode` must be `sdk-check` or `endpoint-lifecycle` (use `endpoint-lifecycle` + when you drove up/down via HTTP in the loop above) + - `recipes` MUST include `standard`, `empty`, and `large` + - every recipe MUST contain `name`, `description`, `create`, and `validation` + - every `validation` object MUST contain `status: "validated"`, `phase: "ok"`, and a + valid `method` (one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`) + - **Nested tree**: `create` MUST use a nested tree rooted at the scope entity. Do NOT + use flat top-level model keys connected only by `_ref`. Nest children under their + parent using relation field names. Use `_ref` only for cross-branch references that + cannot be expressed through nesting. + - **Variables**: if `create` contains `{{token}}` placeholders, include a `variables` + object. Every `{{token}}` in `create` must match a key in `variables`; every key + in `variables` must be used in `create`. Fully concrete recipes do not need `variables`. 
+ Allowed strategies: `literal`, `derived`, `faker`. Any collision-prone unique value + must be derived from `testRunId`. + - Do NOT write the legacy shape — no top-level `generatedAt`, no top-level `scenarios`, + no per-recipe `validated`, no per-recipe `timing`. + +6. Run preflight on the emitted recipes: + + ```bash + python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" \ + autonoma/scenario-recipes.json + ``` + + This resolves tokenized payloads and re-runs signed up/down against the live endpoint. + Requires `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` in the environment. + + If preflight exits non-zero, fix the failing recipe (or the corresponding scenario) and + re-run. Do NOT proceed to step 7 until preflight passes. + +7. Write the terminal artifact `autonoma/.scenario-validation.json` with this shape: + + ```json + { + "status": "ok", + "preflightPassed": true, + "smokeTestPassed": true, + "validatedScenarios": ["standard", "empty", "large"], + "failedScenarios": [], + "blockingIssues": [], + "recipePath": "autonoma/scenario-recipes.json", + "validationMode": "endpoint-lifecycle", + "endpointUrl": "http://localhost:3000/api/autonoma" + } + ``` + + On failure keep the same shape with `status: "failed"`, `preflightPassed: false` when + preflight did not pass, populated `failedScenarios`, and concrete `blockingIssues`. + +8. Write the sentinel `autonoma/.endpoint-validated`. + + Use the `Write` tool (NOT `touch` — the hook fires only on `Write`/`Edit`) with a short + plain-text report: + + ``` + Validated N scenarios across M models. 
+ - discover: all audited models present, all independently_created factories registered + - up: all N scenarios created successfully, auth returned {cookies|headers|token} + - down: all N scenarios cleaned up, no orphans + - recipes: autonoma/scenario-recipes.json emitted, preflight passed + - scenarios.md edits: + ``` + +## Iteration discipline + +- One handler fix per iteration, then re-run everything. Do not chain fixes blind. +- If the same scenario fails twice in a row with the same error, the scenario itself is probably wrong — prefer editing `scenarios.md` over contorting the handler. +- If you have edited `scenarios.md`, re-read it from disk after every edit. + +## When you hit the 5-iteration cap + +STOP and write a clear failure report. Do NOT write `.endpoint-validated`. Include: + +- the last failing curl body + response +- which scenario(s) failed +- which handler file + line range is most likely at fault + +The orchestrator will surface this to the user, who can intervene manually. + +## scenarios.md reconciliation rules + +When you edit `scenarios.md`, preserve the frontmatter shape (the validator hook checks +it). Allowed: + +- Drop a scenario entirely (decrement `scenario_count`, update the `scenarios` summary). +- Remove/rename fields on a model to match what `discover` reports. +- Adjust FK aliases so they reference models that actually exist. +- Flatten cross-branch references that the handler cannot resolve. + +Disallowed: silently changing a scenario's intent (e.g. renaming "admin with one project" +to "user with one project" without reflecting that in the description). diff --git a/agents/test-case-generator.md b/agents/test-case-generator.md index f4b8ec5..ee951f0 100644 --- a/agents/test-case-generator.md +++ b/agents/test-case-generator.md @@ -27,27 +27,67 @@ Your output is a directory `autonoma/qa-tests/` containing: ## Instructions -1. First, fetch the latest test generation instructions: +1. 
All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt` - and follow those instructions for how to generate tests. + To fetch a doc, run the bash command literally — the shell expands the path, not you: -2. Read all input files: + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. Fetch the latest test generation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-e2e-tests.txt" + ``` + + Read the output and follow those instructions for how to generate tests. + +3. Read all input files: - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count - All files in `autonoma/skills/` - - `autonoma/scenarios.md` — parse the frontmatter to get scenarios and entity_types - -3. Count the routes/features/pages in the codebase to establish the coverage correlation. + - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields** + +4. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter + declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has + a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`. + When writing test steps that involve a variable field value — typing it, asserting it, or + navigating to it — you MUST use the `{{token}}` placeholder, never the hardcoded literal from + the scenario body. At runtime the agent resolves these tokens to their actual values. 
+ + Example: if `variable_fields` includes `{{deadline_1}}` for `Tasks.deadline`: + - good: "assert the task deadline shows `{{deadline_1}}`" + - bad: "assert the task deadline shows 2025-06-15" + +5. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test. + The scenarios exist only to provide preconditions and known data for app behavior tests. + Do NOT generate tests whose purpose is to verify: + - that the scenario contains the documented entity counts + - that every scenario row, seed, or example value exists + - that the Environment Factory created data correctly + - that `standard`, `empty`, or `large` themselves are "correct" as artifacts + + Only reference scenario data when it is necessary to exercise a real user-facing flow. + Example: + - good: "open the project `{{project_title}}` and verify editing works" + - bad: "verify the scenario created 12 projects and 3 users" + +6. Count the routes/features/pages in the codebase to establish the coverage correlation. The total test count should roughly correlate: - Rule of thumb: 3-5 tests per route/feature for supporting flows - Rule of thumb: 8-15 tests per core flow - This is approximate — use judgment, but the INDEX must declare the correlation -4. Generate test files organized in subdirectories by feature/flow. +7. Generate test files organized in subdirectories by feature/flow. -5. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files). +8. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files). -6. Write individual test files into subdirectories. +9. Write individual test files into subdirectories. 
## CRITICAL: INDEX.md Format @@ -144,7 +184,10 @@ The body follows the standard Autonoma test format from the fetched instructions - **Administrative/settings**: 15-20% of tests, mostly `mid` and `low` - Never write conditional steps — each test follows one deterministic path - Assertions must specify exact text, element, or visual state -- Reference scenario data by exact values from scenarios.md +- Reference scenario data by exact values from scenarios.md, EXCEPT for variable fields — use `{{token}}` placeholders for those +- Do not spend test budget "auditing" scenario contents. Scenario data is setup, not the product behavior under test. +- Do not write meta-tests such as "verify the seeded counts match scenarios.md" or "verify the Environment Factory created the right fixtures" +- If a seeded value is not needed for a user-facing flow, do not assert it just because it exists in scenarios.md ## Validation diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md new file mode 100644 index 0000000..da61432 --- /dev/null +++ b/commands/generate-adhoc-tests.md @@ -0,0 +1,506 @@ +--- +name: generate-adhoc-tests +description: > + Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline. + Each step runs in an isolated subagent and must pass deterministic validation before the next + step begins. When scenarios already exist in Autonoma, fetches context from the API and runs only + Step 3 scoped to the topic. On a first run, executes the full 4-step pipeline with Step 3 focused. + Use when you want targeted test coverage for a specific feature or domain. +--- + +# Autonoma Focused E2E Test Generation Pipeline + +You are orchestrating a focused test generation pipeline. Each step runs as an isolated subagent. +**Every step MUST complete successfully and pass validation before the next step begins.** +Do NOT skip steps. Do NOT proceed if validation fails. 
+ +## User Confirmation Between Steps + +By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for +confirmation using the `AskUserQuestion` tool. This creates an interactive +UI prompt that makes it clear the user needs to respond before the pipeline continues. + +After calling `AskUserQuestion`, wait for the user's response. +Only proceed to the next step after they confirm. + +**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`, +skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting +the summary. The summaries are still displayed — only the confirmation prompt is skipped. + +## Before Starting + +Resolve the focus prompt from the user's input (the text after the command name): + +```bash +FOCUS_PROMPT="" +FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') +echo "Focus: $FOCUS_PROMPT" +echo "Slug: $FOCUS_SLUG" +``` + +If no focus description was provided, list top-level route/feature directories in the codebase, +call `AskUserQuestion` with 3–4 suggested focus areas plus an "Other" option, wait for the user's +response, then derive `FOCUS_SLUG` from their answer. + +Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference): +```bash +AUTONOMA_ROOT="$(pwd)" +echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root +mkdir -p autonoma/skills autonoma/qa-tests +``` + +The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back: +```bash +PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '') +``` + +Read the environment variables. 
These are required for reporting progress back to Autonoma: +- `AUTONOMA_API_KEY` — your Autonoma API key +- `AUTONOMA_PROJECT_ID` — your Autonoma project ID +- `AUTONOMA_API_URL` — Autonoma API base URL +- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps + +Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`. + +Create the generation record so the dashboard can track progress in real time: +```bash +RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}") +HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) +BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') +echo "Setup API response (HTTP $HTTP_STATUS): $BODY" +GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') +mkdir -p autonoma +echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}" +echo "Generation ID: $GENERATION_ID" +``` + +If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation. 
+ +## Checking Existing Setup + +Check whether scenarios with active recipes already exist in Autonoma for this application: +```bash +SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/scenarios" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}") +HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c " +import json, sys +data = json.loads(sys.stdin.read()) +active = [s for s in data.get('scenarios', []) if s.get('hasActiveRecipe')] +print('yes' if active else 'no') +" 2>/dev/null || echo "no") +echo "$SCENARIOS_RESPONSE" > /tmp/autonoma-scenarios-response.json +echo "Has active scenarios: $HAS_SCENARIOS" +``` + +**If `HAS_SCENARIOS=yes`** — scenarios and tests already exist. Fetch context from the API and run only Step 3: + +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') + +curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" > /tmp/autonoma-test-suite.json + +SCENARIOS_CONTEXT=$(python3 -c " +import json +with open('/tmp/autonoma-scenarios-response.json') as f: + data = json.load(f) +lines = ['## Available Scenarios', ''] +for s in data.get('scenarios', []): + status = 'active' if s.get('hasActiveRecipe') else 'no recipe' + lines.append(f\"- **{s['name']}** ({status})\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") + +TESTS_CONTEXT=$(python3 -c " +import json +with open('/tmp/autonoma-test-suite.json') as f: + data = json.load(f) +tests = data.get('tests', []) +lines = [f'## Existing Tests ({len(tests)} total)', ''] +for t in tests: + lines.append(f\"- {t['name']} (slug: {t['slug']})\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") + +SKILLS_CONTEXT=$(python3 -c " +import json +with open('/tmp/autonoma-test-suite.json') as f: + data = json.load(f) +skills = data.get('skills', []) +lines = 
[f'## Available Skills ({len(skills)} total)', ''] +for s in skills: + lines.append(f\"- {s['name']}: {s['description']}\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") +``` + +Skip to **Step 3: Generate Focused E2E Test Cases** and pass the fetched context inline in the subagent task — do not run Steps 1, 2, or 4. + +**If `HAS_SCENARIOS=no`** — this is a first run. Continue with the full pipeline below (Steps 1 through 4). + +--- + +## Step 1: Generate Knowledge Base + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true +``` + +Spawn the `kb-generator` subagent with the following task: + +> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` +> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with +> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. +> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. +> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. 
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. + +**After the subagent completes:** +1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty +2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically +3. Read the file and present the frontmatter to the user — specifically the core_flows table + +Report step complete and upload skills: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ') +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true + +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true + +[ -n "$GENERATION_ID" ] && python3 -c " +import os, json, sys +root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' 
+skills = [] +d = os.path.join(root, 'autonoma/skills') +if os.path.isdir(d): + for f in os.listdir(d): + if f.endswith('.md'): + with open(os.path.join(d, f)) as fh: + skills.append({'name': f, 'content': fh.read()}) +print(json.dumps({'skills': skills})) +" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- || true +``` + +4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." + - options: ["Yes, proceed to Step 2", "I want to suggest changes"] + Wait for the user's response before proceeding. + **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2. + +## Step 2: Generate Scenarios + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true +``` + +Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`. 
+This step requires these environment variables: +- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint +- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint + +If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access. +Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint. +State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2. + +Fetch and validate the artifact: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +mkdir -p "$AUTONOMA_ROOT/autonoma" +BODY='{"action":"discover"}' +SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') +RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ + -H "Content-Type: application/json" \ + -H "x-signature: $SIG" \ + -d "$BODY") +HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) +DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') +if [ "$HTTP_STATUS" != "200" ]; then + echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY" + exit 1 +fi +printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json" +python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" +``` + +If the fetch fails or validation fails, stop the pipeline at Step 2. +Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command. + +Spawn the `scenario-generator` subagent with the following task: + +> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover +> artifact from `autonoma/discover.json`. +> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. 
+> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types, +> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a +> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before +> introducing a variable placeholder. Use variable fields only for truly dynamic values such as +> backend-generated or time-based fields. `generator` is optional and must not default to `faker`. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. + +**After the subagent completes:** +1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty +2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`) +3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically +4. Read the file and present the summary to the user — scenario names, entity counts, entity types, + discover schema counts, and the minimal variable field tokens that remain dynamic + +Report step complete: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. 
Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true +``` + +5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests." + - options: ["Yes, proceed to Step 3", "I want to suggest changes"] + Wait for the user's response before proceeding. + **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3. + +## Step 3: Generate Focused E2E Test Cases + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":2,"name":"Focused E2E Tests"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Generating focused E2E test cases..."}}' || true +``` + +Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual +values for FOCUS_PROMPT, FOCUS_SLUG, and — when coming from the API-fetch path — the context +variables SCENARIOS_CONTEXT,
TESTS_CONTEXT, and SKILLS_CONTEXT before spawning): + +> **FOCUS_PROMPT**: +> **FOCUS_SLUG**: +> +> *(API-fetch path only — omit this block when running the full pipeline)* +> Context fetched from the Autonoma API (use this instead of reading local files): +> +> +> +> +> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, +> and scenarios from `autonoma/scenarios.md` (if they exist and no inline context was provided above). +> Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT. +> Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`. +> You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing +> total_tests, total_folders, folder breakdown, and coverage_correlation. +> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. +> Treat scenario data as fixture input only. Do not generate tests whose purpose is to verify +> scenario counts, seeded inventories, or Environment Factory correctness. Only reference +> scenario data when it is needed to test a real user-facing app behavior within the focus area. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. + +**After the subagent completes:** +1. Verify `autonoma/qa-tests/${FOCUS_SLUG}/INDEX.md` exists and is non-empty +2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter +3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation + +Report step complete and upload test cases: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! 
-name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused test cases. Uploading to dashboard...\"}}" || true + +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":2,"name":"Focused E2E Tests"}}' || true + +[ -n "$GENERATION_ID" ] && python3 -c " +import os, json +proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' +qa_dir = os.path.join(proj_root, 'autonoma/qa-tests/${FOCUS_SLUG}') +test_cases = [] +for root, dirs, files in os.walk(qa_dir): + for f in files: + if f.endswith('.md') and f != 'INDEX.md': + path = os.path.join(root, f) + folder = os.path.relpath(root, qa_dir) + with open(path) as fh: + content = fh.read() + entry = {'name': f, 'content': content} + if folder != '.': + entry['folder'] = '${FOCUS_SLUG}/' + folder + test_cases.append(entry) +print(json.dumps({'testCases': test_cases})) +" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- || true +``` + +4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly." + - options: ["Yes, proceed to Step 4", "I want to suggest changes", "Done — skip Step 4 (scenarios already exist)"] + Wait for the user's response before proceeding. 
+ **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4 (or stop here if coming from the API-fetch path). + +If coming from the **API-fetch path** (scenarios already existed), stop here after uploading. Step 4 is not needed. + +## Step 4: Environment Factory + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true +``` + +This step requires these environment variables: +- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint +- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint + +If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for +preflight validation. State plainly that both environment variables are mandatory. + +Spawn the `env-factory-generator` subagent with the following task: + +> Read `autonoma/discover.json` and `autonoma/scenarios.md`. +> Implement or complete the Autonoma Environment Factory in the project's backend so it can +> support the planned scenarios with the current SDK contract, then validate the planned scenarios +> against that implementation. 
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt +> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first. +> Preserve the existing discover integration if it already works, and finish `up` / `down` +> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`. +> Smoke-test the discover -> up -> down lifecycle in-session after implementing. +> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`. +> The recipe file must match the current setup API schema: +> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use +> `name`, `description`, `create`, and `validation` with `status: "validated"`, +> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`. +> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`. +> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines +> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`. +> Persisted `create` must remain tokenized — never store resolved concrete values. +> After writing the recipe file, run the preflight helper to validate all recipes against the +> live SDK endpoint before uploading: +> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json` +> The preflight must pass for all three scenarios before Step 4 is considered complete. + +**After the subagent completes:** +1. Verify the backend implementation or integration changes were made +2. Verify `autonoma/scenario-recipes.json` exists and is non-empty +3. 
Run the preflight helper if the subagent did not already do so: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" +``` +If preflight fails, do NOT proceed to upload. Report the failure to the user and stop. +4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results +5. Report which environment variables the backend now requires +6. Report any backend issues that still need manual attention + +Report step complete: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true +if [ -n "$GENERATION_ID" ]; then + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then + echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete." 
+ exit 1 + fi + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete." + exit 1 + fi +fi +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true +``` + +## Completion + +After all steps complete, summarize: +- **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`) +- **Step 1**: Knowledge base location and core flow count *(full pipeline only)* +- **Step 2**: Scenario count and entity types covered *(full pipeline only)* +- **Step 3**: Total focused test count, folder breakdown, coverage correlation +- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results *(full pipeline only)* diff --git a/commands/generate-tests.md b/commands/generate-tests.md index c750253..4ccc236 100644 --- a/commands/generate-tests.md +++ 
b/commands/generate-tests.md @@ -9,296 +9,183 @@ description: > # Autonoma E2E Test Generation Pipeline -You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent. +You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent. **Every step MUST complete successfully and pass validation before the next step begins.** Do NOT skip steps. Do NOT proceed if validation fails. ## CRITICAL: User Confirmation Between Steps -After each step (1, 2, and 3), you MUST present the summary and then ask the user for -confirmation using the `AskUserQuestion` tool. This creates an interactive -UI prompt that makes it clear the user needs to respond before the pipeline continues. +After steps 1, 2, 3, 4, and 5 you MUST present the summary and ask the user for confirmation +using `AskUserQuestion`. After calling it, wait for the response. Only proceed after they confirm. -After calling `AskUserQuestion`, wait for the user's response. -Only proceed to the next step after they confirm. +## How lifecycle reporting works -## Before Starting - -Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference): -```bash -AUTONOMA_ROOT="$(pwd)" -echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root -mkdir -p autonoma/skills autonoma/qa-tests -``` +You do NOT issue `curl` commands to report step start/complete/uploads. Plugin hooks do that: -Read the environment variables. These are required for reporting progress back to Autonoma: -- `AUTONOMA_API_KEY` — your Autonoma API key -- `AUTONOMA_PROJECT_ID` — your Autonoma project ID -- `AUTONOMA_API_URL` — Autonoma API base URL +- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`. +- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. 
It validates output, + emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate + (test files cannot be written until `autonoma/.endpoint-validated` exists). -Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`. +## Before Starting -Create the generation record so the dashboard can track progress in real time: ```bash -RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}") -HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) -BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') -echo "Setup API response (HTTP $HTTP_STATUS): $BODY" -GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') -mkdir -p autonoma -echo "$GENERATION_ID" > autonoma/.generation-id -echo "Generation ID: $GENERATION_ID" +mkdir -p autonoma/skills autonoma/qa-tests ``` -If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation. +The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`. 
## Step 1: Generate Knowledge Base -Report step start: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true -``` - -Spawn the `kb-generator` subagent with the following task: - -> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` -> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with -> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. -> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. -> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. - -**After the subagent completes:** -1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty -2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically -3. 
Read the file and present the frontmatter to the user — specifically the core_flows table - -Report step complete and upload skills: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ') -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true - -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true - -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json, sys -root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -skills = [] -d = os.path.join(root, 'autonoma/skills') -if os.path.isdir(d): - for f in os.listdir(d): - if f.endswith('.md'): - with open(os.path.join(d, f)) as fh: - skills.append({'name': f, 'content': fh.read()}) -print(json.dumps({'skills': skills})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` - -4. Call `AskUserQuestion` with: - - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." - - options: ["Yes, proceed to Step 2", "I want to suggest changes"] -5. Wait for the user's response before proceeding. 
- -## Step 2: Generate Scenarios - -Report step start: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true -``` +Spawn `kb-generator`: -Spawn the `scenario-generator` subagent with the following task: +> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML +> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill +> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals). +> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`. -> Read the knowledge base from `autonoma/AUTONOMA.md` and `autonoma/skills/`. -> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. -> The file MUST have YAML frontmatter with scenario_count, scenarios summary, and entity_types. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. +After completion: verify files exist, present core_flows table, `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single character body). -**After the subagent completes:** -1. Verify `autonoma/scenarios.md` exists and is non-empty -2. 
The PostToolUse hook will have validated the frontmatter format automatically -3. Read the file and present the frontmatter summary to the user — scenario names, entity counts, entity types +## Step 2: Entity Creation Audit -Report step complete: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Scenarios generated. 3 test data environments defined (standard, empty, large)."}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true -``` +Spawn `entity-audit-generator`: -4. Call `AskUserQuestion` with: - - question: "Do these scenarios look correct? The standard scenario data becomes hard assertions in your tests." - - options: ["Yes, proceed to Step 3", "I want to suggest changes"] -5. Wait for the user's response before proceeding. +> Read the knowledge base. Audit how each database model is created. For every model, find the +> dedicated creation function in a service/repository/helper. Classify as `independently_created: true` +> (factory) or `false` (raw SQL fallback). Record side_effects (informational). Output +> `autonoma/entity-audit.md` with frontmatter listing each model. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`. -## Step 3: Generate E2E Test Cases +After completion: present the audit, `AskUserQuestion`, `Write` `autonoma/.step-2-ack`. 
-Report step start: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Generating E2E test cases from knowledge base and scenarios..."}}' || true -``` - -Spawn the `test-case-generator` subagent with the following task: +## Step 3: Generate Scenarios -> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, -> and scenarios from `autonoma/scenarios.md`. -> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`. -> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests, -> total_folders, folder breakdown, and coverage_correlation. -> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. +Spawn `scenario-generator`: -**After the subagent completes:** -1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty -2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter -3. 
Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation - -Report step complete and upload test cases: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} test cases. Uploading to dashboard...\"}}" || true - -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' || true - -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -qa_dir = os.path.join(proj_root, 'autonoma/qa-tests') -test_cases = [] -for root, dirs, files in os.walk(qa_dir): - for f in files: - if f.endswith('.md') and f != 'INDEX.md': - path = os.path.join(root, f) - folder = os.path.relpath(root, qa_dir) - with open(path) as fh: - content = fh.read() - entry = {'name': f, 'content': content} - if folder != '.': - entry['folder'] = folder - test_cases.append(entry) -print(json.dumps({'testCases': test_cases})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` +> Read the knowledge base and `autonoma/entity-audit.md`. 
Generate test data scenarios. Write +> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types, +> variable_fields, planning_sections). Mark values as variable only when they must vary across +> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural +> per-run isolation). Design entity tables so they serialise as nested trees rooted at the +> scope entity. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`. -4. Call `AskUserQuestion` with: - - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app." - - options: ["Yes, proceed to Step 4", "I want to suggest changes"] -5. Wait for the user's response before proceeding. +After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`. ## Step 4: Implement Environment Factory -Report step start: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Implementing Environment Factory endpoint in your backend..."}}' || true -``` - -Spawn the `env-factory-generator` subagent with the following task: - -> Read the scenarios from `autonoma/scenarios.md` and implement the Autonoma Environment Factory -> endpoint in the project's backend. 
The endpoint handles discover/up/down actions. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt -> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first. -> After implementing, run integration tests to verify the endpoint works. -> Use AUTONOMA_SIGNING_SECRET and AUTONOMA_JWT_SECRET as environment variable names. - -**After the subagent completes:** -1. Verify the endpoint was created and tests pass -2. Present the results to the user — what was implemented, where, test results -3. Report any issues that need manual attention - -Report step complete: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"log","data":{"message":"Environment Factory implemented and verified."}}' || true -[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true -``` +Spawn `env-factory-generator`: + +> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure +> the handler. Register a factory for every model with `independently_created: true` (call the audit's +> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback +> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity +> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. 
Do NOT run `up`/`down` +> — that is step 5. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"` +> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`. +> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names. + +After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary, +`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`. + +## Step 5: Validate Scenario Lifecycle + +Spawn `scenario-validator`: + +> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4. +> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5 +> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails +> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. On +> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at +> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe +> per scenario), run `preflight_scenario_recipes.py` against it, and write +> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write` +> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and +> report — do NOT write the sentinel. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`. +> Verify: every audited model appears in `discover.schema.models`, every `independently_created` +> model has a registered factory, `auth` is non-empty, DB state is correct before and after +> `down`, and preflight exits 0. + +After completion: +1. 
If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON + AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`: + enforce and upload the recipes to the dashboard, then ack. + + ```bash + AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}" + VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json" + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + + # Enforce terminal artifact contract + python3 - "$VALIDATION_ARTIFACT" <<'PY' + import json, sys + payload = json.load(open(sys.argv[1])) + if payload.get("status") != "ok": + raise SystemExit("status must be ok before Step 5 can upload recipes") + if payload.get("preflightPassed") is not True: + raise SystemExit("preflightPassed must be true before Step 5 can upload recipes") + PY + + [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty"; exit 1; } + python3 -c "import json; json.load(open('$RECIPE_PATH'))" \ + || { echo "scenario-recipes.json is not valid JSON"; exit 1; } + + # Re-run preflight at the orchestrator level for belt-and-suspenders safety. + python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \ + || { echo "Preflight failed at orchestrator gate"; exit 1; } + + # Upload to dashboard + GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id") + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \ + "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${AUTONOMA_API_TOKEN}" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "Recipe upload failed (HTTP $UPLOAD_STATUS). 
Step 5 cannot complete." >&2 + exit 1 + fi + ``` + + Then present validation summary (scenarios passed, any edits made to `scenarios.md`, + recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`. + +2. If any of those artifacts are missing/invalid: the agent failed — surface the failure + report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook + will also block test file writes. + +## Step 6: Generate E2E Test Cases + +Spawn `test-case-generator`: + +> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (the latter has +> been reconciled with reality in step 5 — use it as the source of truth). Parse the +> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any +> variable value (typed, asserted, or navigated to), never the hardcoded literal. +> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests +> that "audit" seeded counts or fixture existence. +> Generate test cases in `autonoma/qa-tests/`. Write `autonoma/qa-tests/INDEX.md` with +> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test +> file needs frontmatter (title, description, criticality, scenario, flow). +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`. + +After completion: +1. Verify `autonoma/qa-tests/INDEX.md` exists +2. Present INDEX summary +3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed` + for the final step, marking the setup complete. 
## Completion -After all steps complete, summarize: -- **Step 1**: Knowledge base location and core flow count -- **Step 2**: Scenario count and entity types covered -- **Step 3**: Total test count, folder breakdown, coverage correlation -- **Step 4**: Endpoint location, test results, env var setup instructions +Summarize each step: +- **Step 1**: KB location, core flows +- **Step 2**: entity audit — factories vs raw SQL +- **Step 3**: scenarios generated +- **Step 4**: endpoint implemented (handler path, packages, factories registered) +- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any) +- **Step 6**: test count, folder breakdown diff --git a/hooks/hooks.json b/hooks/hooks.json index d694b5d..310a20c 100644 --- a/hooks/hooks.json +++ b/hooks/hooks.json @@ -1,8 +1,28 @@ { "hooks": { + "UserPromptSubmit": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pipeline-kickoff.sh" + } + ] + } + ], + "PreToolUse": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pretool-heartbeat.sh" + } + ] + } + ], "PostToolUse": [ { - "matcher": "Write", + "matcher": "Write|Edit", "hooks": [ { "type": "command", diff --git a/hooks/pipeline-kickoff.sh b/hooks/pipeline-kickoff.sh new file mode 100755 index 0000000..29425b7 --- /dev/null +++ b/hooks/pipeline-kickoff.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# UserPromptSubmit hook. Fires on every user prompt, early-exits unless: +# 1. The prompt invokes the generate-tests skill/command, AND +# 2. The pipeline has not already been kicked off (no autonoma/.generation-id). 
+# +# When both conditions hold, this script owns pipeline startup so the agent +# never has to remember to do it: +# - verifies required env vars (hard-fails if AUTONOMA_DOCS_URL is unset) +# - creates autonoma/ output dirs +# - writes autonoma/.docs-url +# - POSTs /v1/setup/setups to create the generation record +# - writes autonoma/.generation-id +# - emits step.started for step 0 +# +# Exit 0 always (best-effort reporting must never block test generation). + +set -u + +INPUT=$(cat) + +PROMPT=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('prompt',''))" 2>/dev/null || echo '') + +# Match either the slash command or a direct mention of the skill name +case "$PROMPT" in + */generate-tests*|*generate-tests*) ;; + *) exit 0 ;; +esac + +# Idempotency: if we've already kicked off this project's pipeline, nothing to do. +if [ -s autonoma/.generation-id ]; then + exit 0 +fi + +# Hard-require AUTONOMA_DOCS_URL — the plugin refuses to guess a docs URL. +if [ -z "${AUTONOMA_DOCS_URL:-}" ]; then + echo "[autonoma pipeline-kickoff] ERROR: AUTONOMA_DOCS_URL is not set." >&2 + echo "[autonoma pipeline-kickoff] Re-launch Claude using the onboarding command from the Autonoma dashboard (it exports AUTONOMA_DOCS_URL), or export it manually before running /generate-tests." >&2 + exit 0 +fi + +mkdir -p autonoma/skills autonoma/qa-tests +echo "$AUTONOMA_DOCS_URL" > autonoma/.docs-url + +# Nothing below this line should ever fail hard — we must not block the agent. +if [ -z "${AUTONOMA_API_URL:-}" ] || [ -z "${AUTONOMA_API_KEY:-}" ] || [ -z "${AUTONOMA_PROJECT_ID:-}" ]; then + echo "[autonoma pipeline-kickoff] WARN: AUTONOMA_API_URL/AUTONOMA_API_KEY/AUTONOMA_PROJECT_ID not all set. Skipping dashboard reporting." >&2 + exit 0 +fi + +# Derive a human-readable app name from the project dir (best-effort). 
+APP_NAME=$(basename "$(pwd)") + +RESPONSE=$(curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}" 2>/dev/null || echo '{}') + +GENERATION_ID=$(echo "$RESPONSE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') + +if [ -z "$GENERATION_ID" ]; then + echo "[autonoma pipeline-kickoff] WARN: setup creation returned no id. Dashboard will not reflect this run." >&2 + exit 0 +fi + +echo "$GENERATION_ID" > autonoma/.generation-id +echo "[autonoma pipeline-kickoff] Pipeline kickoff complete. generation_id=${GENERATION_ID}" >&2 + +curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' >/dev/null 2>&1 || true + +touch autonoma/.step-0-started + +# --------------------------------------------------------------------------- +# Launch the transcript streamer as a detached background daemon. It tails +# the session JSONL and forwards assistant text/thinking/tool-use/tool-result +# events to /v1/setup/setups/{id}/events so the dashboard can render a live +# activity log. Best-effort, never blocks. +# --------------------------------------------------------------------------- +TRANSCRIPT_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '') + +if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + STREAMER_PID_FILE="autonoma/.streamer.pid" + STREAMER_LOG="autonoma/.streamer.log" + STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py" + + # If a prior streamer is still alive (e.g. 
from a previous session in this
+  # project dir), replace it — the transcript path has changed.
+  if [ -s "$STREAMER_PID_FILE" ]; then
+    existing_pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null || echo '')
+    if [ -n "$existing_pid" ] && kill -0 "$existing_pid" 2>/dev/null; then
+      kill "$existing_pid" 2>/dev/null || true
+    fi
+  fi
+
+  if [ -f "$STREAMER_SCRIPT" ]; then
+    # Detach fully: stdin from /dev/null, output appended to the log, run in
+    # the background, then record the child's pid so later hooks can check
+    # liveness (kill -0) or replace the daemon.
+    nohup python3 "$STREAMER_SCRIPT" \
+      "$TRANSCRIPT_PATH" \
+      "$GENERATION_ID" \
+      "$AUTONOMA_API_URL" \
+      "$AUTONOMA_API_KEY" \
+      >> "$STREAMER_LOG" 2>&1 </dev/null &
+    STREAMER_PID=$!
+    echo "$STREAMER_PID" > "$STREAMER_PID_FILE"
+    disown "$STREAMER_PID" 2>/dev/null || true
+    echo "[autonoma pipeline-kickoff] Transcript streamer started. pid=${STREAMER_PID} transcript=${TRANSCRIPT_PATH}" >&2
+  fi
+fi
+
+exit 0
diff --git a/hooks/preflight_scenario_recipes.py b/hooks/preflight_scenario_recipes.py
new file mode 100644
index 0000000..b2416c2
--- /dev/null
+++ b/hooks/preflight_scenario_recipes.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""Preflight resolver and endpoint lifecycle checker for scenario recipes.
+
+Reads autonoma/scenario-recipes.json, resolves tokenized recipes into transient
+concrete payloads, then sends signed up/down requests to AUTONOMA_SDK_ENDPOINT
+for each recipe. Exits non-zero on any failure. Never rewrites the recipe file.
+"""
+import hashlib
+import hmac
+import json
+import os
+import re
+import sys
+import time
+import urllib.request
+
+# ---------------------------------------------------------------------------
+# Variable resolution
+# ---------------------------------------------------------------------------
+
+ALLOWED_STRATEGIES = {'literal', 'derived', 'faker'}
+ALLOWED_FAKER_GENERATORS = {
+    'person.firstName',
+    'person.lastName',
+    'internet.email',
+    'company.name',
+    'lorem.words',
+}
+
+# Seeded Faker generators — deterministic: same (testRunId + ":" + tokenName) → same value.
+# Uses the `Faker` library (pip install Faker) for realistic data generation.
+
+def _seed_int(seed_str: str) -> int:
+    """Derive a deterministic integer seed from the SHA-256 of seed_str."""
+    return int(hashlib.sha256(seed_str.encode()).hexdigest(), 16)
+
+
+def _get_faker(seed_str: str):
+    """Return a seeded Faker instance."""
+    from faker import Faker
+    fake = Faker()
+    fake.seed_instance(_seed_int(seed_str))
+    return fake
+
+
+# Map generator ids to Faker method calls.
+_FAKER_METHOD_MAP = {
+    'person.firstName': lambda f: f.first_name(),
+    'person.lastName': lambda f: f.last_name(),
+    'internet.email': lambda f: f.email(),
+    'company.name': lambda f: f.company(),
+    'lorem.words': lambda f: ' '.join(f.words(3)),
+}
+
+
+def _faker_generate(generator: str, seed_str: str) -> str:
+    """Generate a value for a supported generator id using a Faker seeded from seed_str."""
+    method = _FAKER_METHOD_MAP.get(generator)
+    if method is None:
+        raise ValueError(f'Unsupported faker generator: {generator}')
+    fake = _get_faker(seed_str)
+    return method(fake)
+
+
+def resolve_variable(var_def: dict, test_run_id: str, token_name: str) -> object:
+    """Resolve a single variable definition to a concrete value.
+
+    Raises ValueError for an unsupported strategy or a malformed definition,
+    so the caller can report it as a recipe compilation failure.
+    """
+    strategy = var_def.get('strategy')
+    if strategy not in ALLOWED_STRATEGIES:
+        raise ValueError(f'Unsupported variable strategy: {strategy}')
+
+    if strategy == 'literal':
+        # A missing value is a malformed recipe; raise ValueError (not KeyError)
+        # so it is reported like every other definition error.
+        if 'value' not in var_def:
+            raise ValueError('literal.value is required')
+        return var_def['value']
+
+    if strategy == 'derived':
+        source = var_def.get('source')
+        if source != 'testRunId':
+            raise ValueError(f'derived.source must be "testRunId", got: {source}')
+        fmt = var_def.get('format')
+        if not fmt or not isinstance(fmt, str):
+            raise ValueError('derived.format must be a non-empty string')
+        return fmt.replace('{testRunId}', test_run_id)
+
+    if strategy == 'faker':
+        generator = var_def.get('generator')
+        if not generator or not isinstance(generator, str):
+            raise ValueError('faker.generator must be a non-empty string')
+        if generator not in ALLOWED_FAKER_GENERATORS:
+            raise ValueError(f'Unsupported faker generator: {generator}')
+        seed_str = f'{test_run_id}:{token_name}'
+        return _faker_generate(generator, seed_str)
+
+    raise ValueError(f'Unsupported variable strategy: {strategy}')
+
+
+def
_find_tokens(obj) -> set:
+    """Find all {{token}} placeholders in a JSON-like structure."""
+    tokens = set()
+    if isinstance(obj, str):
+        tokens.update(re.findall(r'\{\{(\w+)\}\}', obj))
+    elif isinstance(obj, list):
+        for item in obj:
+            tokens.update(_find_tokens(item))
+    elif isinstance(obj, dict):
+        for v in obj.values():
+            tokens.update(_find_tokens(v))
+    return tokens
+
+
+def _resolve_value(val, resolved_vars: dict):
+    """Deep-resolve a single value, replacing {{token}} patterns."""
+    if isinstance(val, str):
+        # Check for full-string replacement (entire string is one token)
+        m = re.fullmatch(r'\{\{(\w+)\}\}', val)
+        if m:
+            token = m.group(1)
+            if token not in resolved_vars:
+                raise ValueError(f'Unresolved token: {{{{{token}}}}}')
+            return resolved_vars[token]
+        # Embedded replacement
+        def _replace(match):
+            token = match.group(1)
+            if token not in resolved_vars:
+                raise ValueError(f'Unresolved token: {{{{{token}}}}}')
+            return str(resolved_vars[token])
+        result = re.sub(r'\{\{(\w+)\}\}', _replace, val)
+        return result
+    if isinstance(val, list):
+        return [_resolve_value(item, resolved_vars) for item in val]
+    if isinstance(val, dict):
+        return {k: _resolve_value(v, resolved_vars) for k, v in val.items()}
+    return val
+
+
+def resolve_recipe(recipe: dict, test_run_id: str) -> dict:
+    """Resolve a tokenized recipe create payload into a concrete payload.
+
+    Returns the resolved create dict. Raises on any resolution failure.
+    """
+    create = recipe.get('create', {})
+    variables = recipe.get('variables', {})
+
+    # Validate: every token in create has a variable definition
+    tokens_in_create = _find_tokens(create)
+    var_keys = set(variables.keys())
+
+    missing = tokens_in_create - var_keys
+    if missing:
+        raise ValueError(f'Tokens without variable definitions: {missing}')
+
+    unused = var_keys - tokens_in_create
+    if unused:
+        raise ValueError(f'Unused variable definitions: {unused}')
+
+    # Resolve all variables
+    resolved = {}
+    for name, var_def in variables.items():
+        resolved[name] = resolve_variable(var_def, test_run_id, name)
+
+    # Deep-resolve the create payload
+    resolved_create = _resolve_value(create, resolved)
+
+    # Final check: no unresolved tokens remain
+    remaining = _find_tokens(resolved_create)
+    if remaining:
+        raise ValueError(f'Unresolved tokens after resolution: {remaining}')
+
+    return resolved_create
+
+
+# ---------------------------------------------------------------------------
+# Signed HTTP helpers
+# ---------------------------------------------------------------------------
+
+def _sign(body_bytes: bytes, secret: str) -> str:
+    """Return the hex HMAC-SHA256 of body_bytes keyed with secret."""
+    return hmac.new(secret.encode(), body_bytes, hashlib.sha256).hexdigest()
+
+
+def _post(url: str, payload: dict, secret: str) -> tuple:
+    """POST JSON to url with HMAC signature. Returns (status, response_dict, elapsed_ms).
+
+    Never raises for transport problems: a connection failure is returned as
+    status 0 with an 'error' entry, and a body that is not a JSON object is
+    replaced by an 'error' dict, so the caller records a FAIL for the recipe
+    instead of crashing the whole preflight run.
+    """
+    body = json.dumps(payload).encode()
+    sig = _sign(body, secret)
+    req = urllib.request.Request(
+        url,
+        data=body,
+        headers={
+            'Content-Type': 'application/json',
+            'x-signature': sig,
+        },
+        method='POST',
+    )
+    start = time.time()
+    try:
+        with urllib.request.urlopen(req) as resp:
+            status = resp.status
+            raw = resp.read()
+    except urllib.error.HTTPError as e:
+        elapsed = int((time.time() - start) * 1000)
+        try:
+            data = json.loads(e.read())
+        except Exception:
+            data = {'error': str(e)}
+        return e.code, data, elapsed
+    except (urllib.error.URLError, OSError) as e:
+        # Endpoint unreachable (refused, DNS failure, reset): status 0 is
+        # outside the 2xx range, so callers treat it as a failure.
+        elapsed = int((time.time() - start) * 1000)
+        return 0, {'error': str(e)}, elapsed
+
+    elapsed = int((time.time() - start) * 1000)
+    try:
+        data = json.loads(raw)
+    except ValueError:
+        data = {'error': 'response body is not valid JSON'}
+    if not isinstance(data, dict):
+        data = {'error': 'response body is not a JSON object'}
+    return status, data, elapsed
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def generate_test_run_id(scenario_name: str) -> str:
+    """Build a testRunId from the scenario name, the current ms timestamp and a short hash."""
+    ms = int(time.time() * 1000)
+    suffix = hashlib.sha256(f'{scenario_name}{ms}'.encode()).hexdigest()[:6]
+    return f'autonoma-preflight-{scenario_name}-{ms}-{suffix}'
+
+
+def preflight(recipe_path: str, endpoint: str, secret: str) -> bool:
+    """Run preflight for all recipes. Returns True on success.
+
+    For each recipe: resolve its tokens, POST a signed `up`, check the
+    response carries auth/refs/refsToken, then POST a signed `down`. A failure
+    in any phase marks the run as failed and moves on to the next recipe.
+    """
+    with open(recipe_path) as f:
+        data = json.load(f)
+
+    recipes = data.get('recipes', [])
+    all_ok = True
+    results = []
+
+    for recipe in recipes:
+        name = recipe.get('name', '')
+        test_run_id = generate_test_run_id(name)
+
+        # Step 1: Resolve
+        print(f'\n--- Preflight: {name} ---')
+        print(f' testRunId: {test_run_id}')
+        try:
+            resolved_create = resolve_recipe(recipe, test_run_id)
+        except (ValueError, KeyError, TypeError, AttributeError) as e:
+            # Any malformed-recipe error is a compilation failure for this
+            # recipe, not a reason to abort the remaining recipes.
+            print(f' FAIL (recipe compilation): {e}')
+            all_ok = False
+            results.append({'name': name, 'status': 'fail', 'phase': 'compilation', 'error': str(e)})
+            continue
+
+        # Step 2: Signed up
+        up_payload = {
+            'action': 'up',
+            'create': resolved_create,
+            'testRunId': test_run_id,
+        }
+        status, resp, up_ms = _post(endpoint, up_payload, secret)
+        print(f' up: HTTP {status} ({up_ms}ms)')
+        if status < 200 or status >= 300:
+            print(f' FAIL (endpoint up): HTTP {status} — {json.dumps(resp)}')
+            all_ok = False
+            results.append({'name': name, 'status': 'fail', 'phase': 'up', 'http': status})
+            continue
+
+        # Validate up response: report the first required field that is absent.
+        missing_field = next(
+            (field for field in ('auth', 'refs', 'refsToken') if field not in resp),
+            None,
+        )
+        if missing_field is not None:
+            print(f' FAIL (endpoint up): missing field "{missing_field}" in response')
+            all_ok = False
+            results.append({'name': name, 'status': 'fail', 'phase': 'up', 'error': f'missing {missing_field}'})
+            continue
+
+        # Step 3: Signed down
+        down_payload = {
+            'action': 'down',
+            'refs': resp['refs'],
+            'refsToken': resp['refsToken'],
+            'testRunId': test_run_id,
+        }
+        d_status, d_resp, down_ms = _post(endpoint, down_payload, secret)
+        print(f' down: HTTP {d_status} ({down_ms}ms)')
+        if d_status < 200 or d_status >= 300:
+            print(f' FAIL (endpoint down): HTTP {d_status} — {json.dumps(d_resp)}')
+            all_ok = False
+            results.append({'name': name, 'status': 'fail', 'phase': 'down', 'http': d_status})
+            continue
+
+        print(f' OK (up: {up_ms}ms, down: {down_ms}ms)')
+        results.append({'name': name, 'status': 'ok', 'up_ms': up_ms, 'down_ms': down_ms})
+
+    print(f'\n--- Summary ---')
+    for r in results:
+        status_str = 'OK' if r['status'] == 'ok' else f"FAIL ({r.get('phase', '?')})"
+        print(f" {r['name']}: {status_str}")
+
+    return all_ok
+
+
+def main():
+    """CLI entry point: preflight the recipe file given as argv[1]; exit 0 only if all recipes pass."""
+    if len(sys.argv) < 2:
+        print(f'Usage: {sys.argv[0]} <scenario-recipes.json>')
+        sys.exit(1)
+
+    recipe_path = sys.argv[1]
+
+    # Ensure Faker is available
+    try:
+        import faker  # noqa: F401
+    except ImportError:
+        import subprocess
+        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'Faker', '-q'],
+                              stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+    endpoint = os.environ.get('AUTONOMA_SDK_ENDPOINT')
+    secret = os.environ.get('AUTONOMA_SHARED_SECRET')
+
+    if not endpoint:
+        print('ERROR: AUTONOMA_SDK_ENDPOINT is not set')
+        sys.exit(1)
+    if not secret:
+        print('ERROR: AUTONOMA_SHARED_SECRET is not set')
+        sys.exit(1)
+
+    ok = preflight(recipe_path, endpoint, secret)
+    sys.exit(0 if ok else 1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hooks/pretool-heartbeat.sh b/hooks/pretool-heartbeat.sh
new file mode 100755
index 0000000..7dd4bf2
--- /dev/null
+++ b/hooks/pretool-heartbeat.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Emits a lightweight "activity" event for every tool call so the dashboard
+# can show Claude is still alive. Best-effort — failures never block the
+# pipeline. Only fires when a generation is active (autonoma/.generation-id
+# exists) and the Autonoma API is reachable.
+
+set -u
+
+INPUT=$(cat)
+
+# Guard: only fire during an active generation.
+GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
+[ -z "$GENERATION_ID" ] && exit 0
+[ -z "${AUTONOMA_API_URL:-}" ] && exit 0
+[ -z "${AUTONOMA_API_KEY:-}" ] && exit 0
+
+# ---------------------------------------------------------------------------
+# Streamer liveness check + auto-revive. If the transcript streamer daemon
+# has died (crash, OS restart, etc.) re-launch it so the dashboard keeps
+# receiving events.
kill -0 is nearly free when the process is alive.
+# Skipped when the plugin's streamer.py is missing (e.g. older plugin cache).
+# ---------------------------------------------------------------------------
+STREAMER_PID_FILE="autonoma/.streamer.pid"
+STREAMER_LOG="autonoma/.streamer.log"
+STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py"
+
+# Succeeds only when the pid file is non-empty and that pid is still running.
+streamer_alive() {
+  [ -s "$STREAMER_PID_FILE" ] || return 1
+  local pid
+  pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null)
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+if ! streamer_alive && [ -f "$STREAMER_SCRIPT" ]; then
+  TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '')
+  if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then
+    # Detach fully: stdin from /dev/null, output appended to the log, run in
+    # the background, then record the new pid so the next liveness check
+    # sees this daemon instead of re-launching on every tool call.
+    nohup python3 "$STREAMER_SCRIPT" \
+      "$TRANSCRIPT_PATH" \
+      "$GENERATION_ID" \
+      "$AUTONOMA_API_URL" \
+      "$AUTONOMA_API_KEY" \
+      >> "$STREAMER_LOG" 2>&1 </dev/null &
+    NEW_PID=$!
+    echo "$NEW_PID" > "$STREAMER_PID_FILE"
+    disown "$NEW_PID" 2>/dev/null || true
+    echo "[$(date +%H:%M:%S)] streamer revived by pretool-heartbeat pid=$NEW_PID transcript=$TRANSCRIPT_PATH" >> "$STREAMER_LOG"
+  fi
+fi
+
+# Build the payload: tool name + a short preview of the most informative arg.
+# Heavy args (full file contents from Write/Edit) are never forwarded.
+PAYLOAD=$(printf '%s' "$INPUT" | python3 -c "
+import json, sys
+try:
+    data = json.load(sys.stdin)
+except Exception:
+    sys.exit(0)
+tool = data.get('tool_name') or ''
+if not tool:
+    sys.exit(0)
+inp = data.get('tool_input') or {}
+# Pick the first informative string field; never forward large blobs.
+preview = '' +for key in ('command', 'description', 'file_path', 'pattern', 'path', 'query', 'prompt', 'url'): + v = inp.get(key) + if isinstance(v, str) and v.strip(): + preview = v.replace('\n', ' ').strip()[:200] + break +print(json.dumps({'type': 'activity', 'data': {'tool': tool, 'preview': preview}})) +" 2>/dev/null) + +[ -z "$PAYLOAD" ] && exit 0 + +# Short timeout — the hook runs before every tool call, never block the session. +curl --max-time 2 -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" >/dev/null 2>&1 || true + +exit 0 diff --git a/hooks/transcript-streamer.py b/hooks/transcript-streamer.py new file mode 100755 index 0000000..be496ca --- /dev/null +++ b/hooks/transcript-streamer.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""Streams Claude Code session transcript events to the Autonoma dashboard. + +Spawned as a detached background process by pipeline-kickoff.sh when a +/generate-tests run starts. Tails the session JSONL as Claude appends to it, +extracts assistant text + thinking + tool calls + tool results, and POSTs +each as a `transcript` event to /v1/setup/setups/{id}/events so the dashboard +can render a live activity log. + +Self-terminates after IDLE_SECONDS of no new transcript data. Safe to kill +at any time — the daemon is stateless and holds no locks. 
+
+Usage:
+    python3 transcript-streamer.py <transcript_path> <generation_id> <api_url> <api_key>
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+POLL_INTERVAL = 0.75
+IDLE_SECONDS = 1800  # 30 min with no new lines → daemon exits
+MAX_TEXT_CHARS = 4000
+MAX_PREVIEW_CHARS = 500
+HTTP_TIMEOUT = 2.0
+
+
+def main() -> None:
+    """Tail the transcript JSONL and forward each new complete line as an event.
+
+    Expects exactly four CLI arguments (transcript path, generation id, API
+    URL, API key). Exits with status 2 on a wrong argument count and 0 when
+    any argument is empty. Returns once IDLE_SECONDS pass without a new
+    complete line.
+    """
+    if len(sys.argv) != 5:
+        sys.exit(2)
+    transcript_path, generation_id, api_url, api_key = sys.argv[1:5]
+    if not all([transcript_path, generation_id, api_url, api_key]):
+        sys.exit(0)
+
+    path = Path(transcript_path)
+    # Start at end of file. Anything written before this daemon launched was
+    # already visible in the terminal before the dashboard existed — don't
+    # replay it.
+    last_size = path.stat().st_size if path.exists() else 0
+    idle = 0.0
+    log(f"streamer up transcript={transcript_path} generation_id={generation_id} api_url={api_url} start_offset={last_size}")
+
+    while idle < IDLE_SECONDS:
+        if not path.exists():
+            time.sleep(POLL_INTERVAL)
+            idle += POLL_INTERVAL
+            continue
+
+        size = path.stat().st_size
+        if size < last_size:
+            # File was rotated/truncated — reset.
+            last_size = 0
+        if size == last_size:
+            time.sleep(POLL_INTERVAL)
+            idle += POLL_INTERVAL
+            continue
+
+        # Read raw bytes so last_size stays an exact byte offset comparable
+        # with st_size.
+        with path.open("rb") as fh:
+            fh.seek(last_size)
+            chunk = fh.read()
+
+        # Consume only up to the last newline. A trailing fragment is a line
+        # the writer has not finished yet; leaving it unread means it is
+        # parsed on a later pass instead of being dropped as invalid JSON.
+        complete_len = chunk.rfind(b"\n") + 1
+        if complete_len == 0:
+            time.sleep(POLL_INTERVAL)
+            idle += POLL_INTERVAL
+            continue
+
+        idle = 0.0
+        last_size += complete_len
+        for raw_line in chunk[:complete_len].split(b"\n"):
+            line = raw_line.decode("utf-8", errors="replace").strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if not isinstance(entry, dict):
+                # extract_event() calls .get() on the entry; skip other JSON values.
+                continue
+            payload = extract_event(entry)
+            if payload is not None:
+                forward(payload, generation_id, api_url, api_key)
+
+
+def extract_event(entry: dict) -> dict | None:
+    """Turn a transcript line into a dashboard event, or None to skip."""
+    etype = entry.get("type")
+    is_sidechain = bool(entry.get("isSidechain", False))
+    uuid = entry.get("uuid")
+
+    if etype == "assistant":
+        msg = entry.get("message") or {}
+        content = msg.get("content") or []
+        texts: list[str] = []
+        tool_uses: list[dict] = []
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            btype = block.get("type")
+            if btype == "text":
+                t = (block.get("text") or "").strip()
+                if t:
+                    texts.append(t)
+            elif btype == "thinking":
+                t = (block.get("thinking") or "").strip()
+                if t:
+                    texts.append(f"[thinking] {t}")
+            elif btype == "tool_use":
+                tool_uses.append({
+                    "name": block.get("name") or "unknown",
+                    "input_preview": _preview(block.get("input") or {}),
+                })
+        if not texts and not tool_uses:
+            return None
+        data: dict = {"role": "assistant", "is_sidechain": is_sidechain}
+        if uuid:
+            data["uuid"] = uuid
+        if texts:
+            data["text"] = "\n".join(texts)[:MAX_TEXT_CHARS]
+        if tool_uses:
+            data["tool_uses"] = tool_uses
+        return {"type": "transcript", "data": data}
+
+    if etype == "user":
+        msg = entry.get("message") or {}
+        content = msg.get("content")
+        # Tool results arrive as user messages whose content is a list of
+        # tool_result blocks. Raw text user messages (the original prompt)
+        # are skipped — they're already visible to the dashboard.
+ if not isinstance(content, list): + return None + results: list[dict] = [] + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") != "tool_result": + continue + body = _flatten_tool_result(block.get("content")) + entry_out: dict = {"is_error": bool(block.get("is_error"))} + if body: + entry_out["preview"] = body[:MAX_PREVIEW_CHARS] + results.append(entry_out) + if not results: + return None + data = {"role": "tool_result", "is_sidechain": is_sidechain, "results": results} + if uuid: + data["uuid"] = uuid + return {"type": "transcript", "data": data} + + return None + + +def _flatten_tool_result(raw) -> str: + if isinstance(raw, str): + return raw + if isinstance(raw, list): + parts: list[str] = [] + for c in raw: + if isinstance(c, dict) and c.get("type") == "text": + parts.append(c.get("text", "")) + elif isinstance(c, str): + parts.append(c) + return "\n".join(parts) + return "" + + +def _preview(obj) -> str: + try: + s = json.dumps(obj, default=str, ensure_ascii=False) + except Exception: + s = str(obj) + return s[:MAX_PREVIEW_CHARS] + + +def forward(payload: dict, generation_id: str, api_url: str, api_key: str) -> None: + url = f"{api_url.rstrip('/')}/v1/setup/setups/{generation_id}/events" + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + method="POST", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + ) + try: + with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp: + resp.read() + log(f"POST {resp.status} {payload.get('type')} {_summarize(payload)}") + except urllib.error.HTTPError as e: + body = "" + try: + body = e.read().decode("utf-8", errors="replace")[:300] + except Exception: + pass + log(f"POST {e.code} {payload.get('type')} body={body}") + except (urllib.error.URLError, TimeoutError, ConnectionError) as e: + log(f"POST network-error {payload.get('type')} err={e!r}") + except Exception as e: + log(f"POST 
unknown-error {payload.get('type')} err={e!r}") + + +def _summarize(payload: dict) -> str: + data = payload.get("data") or {} + role = data.get("role") + if role == "assistant": + snippet = (data.get("text") or "").replace("\n", " ")[:80] + tools = ",".join(t.get("name", "?") for t in data.get("tool_uses") or []) + return f"role=assistant text={snippet!r} tools=[{tools}]" + if role == "tool_result": + return f"role=tool_result n_results={len(data.get('results') or [])}" + return "" + + +def log(msg: str) -> None: + # Emit to stderr which is redirected to autonoma/.streamer.log by the kickoff hook. + try: + print(f"[{time.strftime('%H:%M:%S')}] {msg}", file=sys.stderr, flush=True) + except Exception: + pass + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + pass + except Exception: + # Daemon must never propagate — swallow and exit clean so nothing + # surfaces in the user's terminal. + pass diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh index 5fda0fe..071d7d7 100755 --- a/hooks/validate-pipeline-output.sh +++ b/hooks/validate-pipeline-output.sh @@ -1,41 +1,264 @@ #!/bin/bash -# Validates pipeline output files after Write tool use. +# Validates pipeline output files after Write tool use and emits lifecycle +# events + artifact uploads to the Autonoma dashboard on successful artifact +# production. All backend reporting lives here so the agent can never forget. 
+# # Exit 0 = allow (file is valid or not a pipeline file) # Exit 2 = block and send error message to Claude +set -u + INPUT=$(cat) -# Extract the file path from the tool input FILE_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tool_input',{}).get('file_path',''))" 2>/dev/null) if [ -z "$FILE_PATH" ]; then exit 0 fi -# Resolve the validators directory relative to this script +# ---------------------------------------------------------------------------- +# Lifecycle emission helpers +# ---------------------------------------------------------------------------- +_reporting_ready() { + local generation_id + generation_id=$(cat autonoma/.generation-id 2>/dev/null || echo '') + [ -n "$generation_id" ] && [ -n "${AUTONOMA_API_URL:-}" ] && [ -n "${AUTONOMA_API_KEY:-}" ] +} + +# emit_step_event [] — idempotent via marker. +emit_step_event() { + local step="$1" + local action="$2" + local name="${3:-}" + local marker="autonoma/.step-${step}-${action}" + + [ -f "$marker" ] && return 0 + mkdir -p autonoma 2>/dev/null || true + touch "$marker" + + _reporting_ready || return 0 + local generation_id + generation_id=$(cat autonoma/.generation-id) + + local payload + if [ -n "$name" ]; then + payload=$(printf '{"type":"step.%s","data":{"step":%s,"name":"%s"}}' "$action" "$step" "$name") + else + payload=$(printf '{"type":"step.%s","data":{"step":%s}}' "$action" "$step") + fi + + curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$payload" >/dev/null 2>&1 || true +} + +# upload_skills — bundle autonoma/skills/*.md and POST to /artifacts. Idempotent. 
upload_skills() {
  # Marker recording that the upload has already been attempted.
  local marker="autonoma/.skills-uploaded"
  [ -f "$marker" ] && return 0
  # Unlike emit_step_event, the readiness and directory checks run BEFORE the
  # marker is written, so an unconfigured run leaves no marker behind.
  _reporting_ready || return 0
  [ -d autonoma/skills ] || return 0

  local generation_id
  generation_id=$(cat autonoma/.generation-id)

  # The inline Python builds {"skills": [{"name", "content"}, ...]} from the
  # *.md files directly inside autonoma/skills (os.listdir — not recursive),
  # in sorted filename order, and prints it as JSON. The JSON is streamed to
  # curl over stdin (`-d @-`).
  python3 -c "
import os, json
skills = []
d = 'autonoma/skills'
if os.path.isdir(d):
    for f in sorted(os.listdir(d)):
        if f.endswith('.md'):
            with open(os.path.join(d, f)) as fh:
                skills.append({'name': f, 'content': fh.read()})
print(json.dumps({'skills': skills}))
" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/artifacts" \
    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
    -H "Content-Type: application/json" \
    -d @- >/dev/null 2>&1 || true

  # Written whether or not the POST succeeded (the pipeline result is
  # discarded by `|| true`), so a failed upload is not retried on later writes.
  touch "$marker"
}
+# - autonoma/.endpoint-implemented — env-factory agent writes this after the +# discover smoke test + factory-integrity check pass; signals step 3 complete. +# - autonoma/.endpoint-validated — scenario-validator writes this after the full +# up/down lifecycle passes for every scenario; signals step 4 complete AND +# unlocks the gate that allows qa-tests/*.md to be written. +# - autonoma/.step--ack — orchestrator writes this AFTER the user has +# confirmed via AskUserQuestion; this is the *only* path that emits +# step.started for step N. The UI can therefore show "waiting for +# confirmation" in the gap between step.completed (N-1) and step.started N. +# ---------------------------------------------------------------------------- +STEP_NAMES=("Knowledge Base" "Entity Audit" "Scenarios" "Implement" "Validate" "E2E Tests") + +case "$FILE_PATH" in + */autonoma/.endpoint-implemented) + # Hook-level factory-integrity gate. The env-factory agent's self-policed + # check has proven insufficient — see the post-mortem in the plugin repo. + # This validator parses autonoma/entity-audit.md, opens the handler named + # in the sentinel body, and blocks the write when any factory for a + # independently_created: true model contains an inline ORM write. + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + # Gate 1 — cheap syntactic checks (grep, mount, audit-flip cap). + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_endpoint_implemented.py" "$FILE_PATH" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 2 — creation_file immutability (catches the audit-rewrite attack + # without needing an LLM call). Cheap, fast, deterministic. + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_creation_file_immutable.py" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 3 — semantic per-model fidelity via claude -p fan-out. Reads the + # rubric from the docs URL at runtime (updatable without plugin changes). 
+ # Blocks on hard failures; transient errors + missing config are + # warning-only so a broken docs endpoint does not freeze the pipeline. + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_factory_fidelity.py" "$FILE_PATH" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 3 prints progress to stderr even on success; surface it so the + # user sees the validator actually ran. + printf '%s\n' "$OUTPUT" >&2 + emit_step_event 3 completed "Implement" + exit 0 + ;; + */autonoma/.endpoint-validated) + emit_step_event 4 completed "Validate" + exit 0 + ;; + */autonoma/.pipeline-complete) + emit_step_event 5 completed "E2E Tests" + exit 0 + ;; + */autonoma/.step-*-ack) + ack_num=$(basename "$FILE_PATH" | sed -E 's/^\.step-([0-9]+)-ack$/\1/') + if [[ "$ack_num" =~ ^[0-9]+$ ]] && [ "$ack_num" -ge 0 ] && [ "$ack_num" -lt ${#STEP_NAMES[@]} ]; then + emit_step_event "$ack_num" started "${STEP_NAMES[$ack_num]}" + fi + # Snapshot entity-audit.md the moment the user confirms the audit is + # accepted (step-2-ack = "Scenarios starting", which fires AFTER the user + # approves the Entity Audit). This snapshot is diffed against the current + # audit at .endpoint-implemented time to detect the env-factory agent + # gaming the factory-integrity check by mass-flipping independently_created + # true -> false. See the post-mortem in the plugin repo. + if [ "$ack_num" = "2" ] && [ -f "autonoma/entity-audit.md" ] && [ ! -f "autonoma/.entity-audit-step2.md" ]; then + cp autonoma/entity-audit.md autonoma/.entity-audit-step2.md 2>/dev/null || true + fi + exit 0 + ;; +esac + +# ---------------------------------------------------------------------------- +# Validation gate: test files (INDEX.md or any qa-tests/*.md) cannot be written +# until the scenario-validator writes autonoma/.endpoint-validated. This +# prevents step 6 from generating tests against an unproven endpoint. 
+# ---------------------------------------------------------------------------- +case "$FILE_PATH" in + */autonoma/qa-tests/INDEX.md|*/autonoma/qa-tests/*.md) + if [ ! -f "autonoma/.endpoint-validated" ]; then + echo "VALIDATION GATE: Cannot write $FILE_PATH — autonoma/.endpoint-validated is missing. Complete Step 5 (scenario-validator) first. The validator must run discover/up/down against every scenario and write the sentinel before test generation is allowed." >&2 + exit 2 + fi + ;; +esac + +# ---------------------------------------------------------------------------- +# Validation routing +# ---------------------------------------------------------------------------- SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" VALIDATORS_DIR="$SCRIPT_DIR/validators" -# Ensure PyYAML is available (required for frontmatter parsing) python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null -# Only validate pipeline output files +STEP_COMPLETED="" +STEP_COMPLETED_NAME="" +STEP_STARTED="" +STEP_STARTED_NAME="" +POST_UPLOAD="" + case "$FILE_PATH" in */autonoma/AUTONOMA.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_kb.py" VALIDATOR_NAME="validate-kb" + STEP_COMPLETED=0 + STEP_COMPLETED_NAME="Knowledge Base" + STEP_STARTED=1 + STEP_STARTED_NAME="Entity Audit" + POST_UPLOAD="skills" ;; */autonoma/features.json) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_features.py" VALIDATOR_NAME="validate-features" ;; + */autonoma/entity-audit.md) + VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_entity_audit.py" + VALIDATOR_NAME="validate-entity-audit" + STEP_COMPLETED=1 + STEP_COMPLETED_NAME="Entity Audit" + STEP_STARTED=2 + STEP_STARTED_NAME="Scenarios" + ;; */autonoma/scenarios.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenarios.py" VALIDATOR_NAME="validate-scenarios" + STEP_COMPLETED=2 + STEP_COMPLETED_NAME="Scenarios" + STEP_STARTED=3 + STEP_STARTED_NAME="Implement" + ;; + */autonoma/scenario-recipes.json) + VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_recipes.py" + 
VALIDATOR_NAME="validate-scenario-recipes" + ;; + */autonoma/.scenario-validation.json) + VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py" + VALIDATOR_NAME="validate-scenario-validation" ;; */autonoma/qa-tests/INDEX.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py" VALIDATOR_NAME="validate-test-index" + STEP_COMPLETED=5 + STEP_COMPLETED_NAME="E2E Tests" + POST_UPLOAD="test_cases" ;; */autonoma/qa-tests/*/[!I]*.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py" @@ -46,25 +269,21 @@ case "$FILE_PATH" in ;; esac -# Check file exists if [ ! -f "$FILE_PATH" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: File does not exist: $FILE_PATH" >&2 exit 2 fi -# Check file is non-empty if [ ! -s "$FILE_PATH" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: File is empty: $FILE_PATH" >&2 exit 2 fi -# Check validator script exists if [ ! -f "$VALIDATOR_SCRIPT" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: Validator script not found: $VALIDATOR_SCRIPT" >&2 exit 2 fi -# Run the validator RESULT=$(python3 "$VALIDATOR_SCRIPT" "$FILE_PATH" 2>&1) EXIT_CODE=$? @@ -73,7 +292,6 @@ if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then exit 2 fi -# For INDEX.md, also validate directory structure if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py" DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1) @@ -84,4 +302,17 @@ if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then fi fi +# Validation passed — emit lifecycle events and upload artifacts. +# Note: step.started for the NEXT step is NOT emitted here. It fires only when +# the orchestrator writes autonoma/.step--ack after the user confirms via +# AskUserQuestion. That gap gives the UI its "waiting for confirmation" banner. 
def load_audit(path: Path) -> dict[str, dict]:
    """Return ``{model_name: normalized_entry}`` parsed from an entity audit.

    The audit is a Markdown file whose YAML frontmatter (between a leading
    ``---`` and the next line starting with ``---``) carries a ``models`` list.
    Each list item that is a mapping with a ``name`` (or ``model``) key is
    passed through ``_normalize`` and keyed by that name.

    Returns an empty dict — never raises — when the file is missing,
    unreadable, not UTF-8, has no or unterminated frontmatter, holds invalid
    YAML, or its frontmatter / ``models`` value has an unexpected shape.
    """
    if not path.exists():
        return {}
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # A directory, a permission problem or undecodable bytes all count as
        # "malformed" under this function's contract.
        return {}
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # safe_load returns None for empty frontmatter and a scalar or list for
    # non-mapping YAML; calling .get() on either would raise AttributeError.
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        # Missing, null, or a scalar/mapping where a list was expected.
        return {}
    out: dict[str, dict] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        out[str(name)] = _normalize(entry)
    return out
def is_independently_created(entry: dict[str, Any]) -> bool:
    """Report whether a model entry has its own standalone creation path.

    The v2 key ``independently_created`` wins whenever it is present, even if
    its value is falsy. Only when that key is absent does the legacy v1 key
    ``has_creation_code`` decide.
    """
    try:
        flag = entry["independently_created"]
    except KeyError:
        # v1 entry: fall back to the legacy flag (a missing key gives None).
        flag = entry.get("has_creation_code")
    return bool(flag)
+ +## Fixture schema + +```json +{ + "model": "", + "expected_verdict": "pass" | "fail", + "expected_fail_criteria": [1, 2, 3, 4], + "step2_audit_entry": "", + "current_audit_entry": "", + "handler_path": "", + "factory_block": "", + "helper_section": "File: \\nFunction: \\n\\n```\\n\\n```", + "original_creation_file": "", + "original_creation_snippet": "" +} +``` + +Keep fixtures generic — placeholder names (`UserService`, `src/users/...`) only, +no references to real Autonoma-internal codebases. The rubric itself is generic; +evals that leak specific names would mask rubric bias. + +## When to add a fixture + +- New failure mode observed in the wild → add a `bad_*.json` that captures it + with the smallest reproduction, and confirm the current rubric catches it. +- Rubric edit → run the full suite against the new rubric. A fixture flipping + verdict is a signal that the criteria are ambiguous; tighten the wording. diff --git a/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json new file mode 100644 index 0000000..de57863 --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json @@ -0,0 +1,12 @@ +{ + "model": "Session", + "expected_verdict": "fail", + "expected_fail_criteria": [3], + "step2_audit_entry": "- name: Session\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.createSession\n side_effects:\n - Signs session token\n - Records session in audit log\n", + "current_audit_entry": "- name: Session\n has_creation_code: true\n creation_file: src/routes/autonoma/autonoma-factories.ts\n creation_function: createSession\n side_effects:\n - Signs session token\n - Records session in audit log\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "Session: defineFactory({\n async create(data, ctx) {\n return createSession(ctx.executor, data);\n },\n}),", + "helper_section": "File: 
src/routes/autonoma/autonoma-factories.ts\nFunction: createSession\n\n```\n// Thin wrapper around buildAuth.createSession — preserves signing + audit.\nexport async function createSession(db, data) {\n const auth = buildAuth(db);\n return auth.createSession(data);\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = (db) => betterAuth({\n database: prismaAdapter(db),\n createSession: async (data) => {\n const token = signSessionToken(data);\n const session = await db.session.create({ data: { ...data, token } });\n await auditLog.record('session.created', { sessionId: session.id });\n return session;\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/bad_missing_owner.json b/hooks/validators/evals/fixtures/bad_missing_owner.json new file mode 100644 index 0000000..69e20aa --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_missing_owner.json @@ -0,0 +1,7 @@ +{ + "kind": "audit_validator", + "note": "Dependent whose created_by owner doesn't exist in the audit. The audit VALIDATOR (not the fidelity validator) must reject. 
This fixture is asserted via subprocess against validate_entity_audit.py.", + "audit_frontmatter": "model_count: 2\nfactory_count: 1\nmodels:\n - name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects: []\n created_by: []\n - name: Branch\n independently_created: false\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Minted inline — but Application is not in this audit.\"\n", + "expected_exit": 1, + "expected_stderr_substring": "owner='Application' does not match any model" +} diff --git a/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json new file mode 100644 index 0000000..9eb5f41 --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 2], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return ctx.executor.user.create({ data });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input, deps) {\n const hashed = await 
bcrypt.hash(input.password, 10);\n const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n await ensureOrgMembership(user, deps);\n await analytics.capture('user_signed_up', { userId: user.id });\n return user;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json new file mode 100644 index 0000000..82c3daf --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 2, 4], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.databaseHooks.user.create\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n - Fires signup webhook\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.databaseHooks.user.create\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n - Fires signup webhook\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return createUser(ctx.executor, data);\n },\n}),", + "helper_section": "File: src/routes/autonoma/autonoma-factories.ts\nFunction: createUser\n\n```\n// better-auth's internal adapter does the same thing — no business logic\n// beyond the raw insert.\nexport async function createUser(db, data) {\n return db.user.create({ data, select: { id: true } });\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = (db) => 
betterAuth({\n database: prismaAdapter(db),\n databaseHooks: {\n user: {\n create: async (user) => {\n const created = await db.user.create({ data: user });\n await ensureOrgMembership(created, { db });\n await ensureBillingProvisioning(created, { db });\n await analytics.capture('user_signed_up', { userId: created.id });\n await fireSignupWebhook(created);\n return created;\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/dependent_skipped.json b/hooks/validators/evals/fixtures/dependent_skipped.json new file mode 100644 index 0000000..1e131cd --- /dev/null +++ b/hooks/validators/evals/fixtures/dependent_skipped.json @@ -0,0 +1,7 @@ +{ + "kind": "audit_filter", + "note": "Pure dependent (independently_created:false) must be silently skipped by the fidelity validator — no factory, no claude -p call. This fixture is evaluated by checking validate_factory_fidelity's model list, not by calling the LLM.", + "model": "BranchDeployment", + "expected_verdict": "skip", + "step2_audit_entry": "- name: BranchDeployment\n independently_created: false\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Minted inside the Application transaction so the default branch has a deployment row wired up from the start.\"\n" +} diff --git a/hooks/validators/evals/fixtures/dual_judged_on_standalone.json b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json new file mode 100644 index 0000000..569a5cd --- /dev/null +++ b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json @@ -0,0 +1,13 @@ +{ + "note": "Dual model (independently_created:true AND in someone's created_by). 
Must be judged ONLY on its standalone factory; the via-owner relationship must not affect the verdict.", + "model": "Branch", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: Branch\n independently_created: true\n creation_file: src/branches/branch.service.ts\n creation_function: BranchService.create\n side_effects:\n - Writes a default BranchSettings row\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n", + "current_audit_entry": "- name: Branch\n independently_created: true\n creation_file: src/branches/branch.service.ts\n creation_function: BranchService.create\n side_effects:\n - Writes a default BranchSettings row\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "Branch: defineFactory({\n async create(data, ctx) {\n return BranchService.create(data, { executor: ctx.executor });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/branches/branch.service.ts", + "original_creation_snippet": "export const BranchService = {\n async create(input, deps) {\n const branch = await deps.executor.branch.create({ data: input });\n await deps.executor.branchSettings.create({ data: { branchId: branch.id, theme: 'default' } });\n return branch;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json new file mode 100644 index 0000000..73934fb --- /dev/null +++ b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + 
"expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return createUserWithOnboarding(db, data, { signupHooks, platformEvents });\n },\n}),", + "helper_section": "File: src/auth/create-user-with-onboarding.ts\nFunction: createUserWithOnboarding\n\n```\nexport async function createUserWithOnboarding(db, data, { signupHooks, platformEvents }) {\n const user = await db.user.create({ data: { name: data.name, email: data.email } });\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n return user;\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = () => betterAuth({\n databaseHooks: {\n user: {\n create: {\n after: async (user) => {\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n },\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json new file mode 100644 index 0000000..f3eccef --- /dev/null +++ 
b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 4], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return db.user.create({ data: { name: data.name, email: data.email } });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = () => betterAuth({\n databaseHooks: {\n user: {\n create: {\n after: async (user) => {\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n },\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json new file mode 100644 index 0000000..b2c2078 --- /dev/null +++ b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: 
src/auth/create-user.ts\n creation_function: createUser\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/create-user.ts\n creation_function: createUser\n extracted_to: src/auth/create-user.ts\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return createUser(data, { db: ctx.executor, analytics, billing });\n },\n}),", + "helper_section": "File: src/auth/create-user.ts\nFunction: createUser\n\n```\n// Extracted from the databaseHooks.user.create closure for Environment\n// Factory reuse (preserves Org + Member + billing provisioning).\nexport async function createUser(input, deps) {\n const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n await ensureOrgMembership(user, deps);\n await ensureBillingProvisioning(user, deps);\n await deps.analytics.capture('user_signed_up', { userId: user.id });\n return user;\n}\n```", + "original_creation_file": "src/auth/create-user.ts", + "original_creation_snippet": "export async function createUser(input, deps) {\n const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n await ensureOrgMembership(user, deps);\n await ensureBillingProvisioning(user, deps);\n await deps.analytics.capture('user_signed_up', { userId: user.id });\n return user;\n}" +} diff --git a/hooks/validators/evals/fixtures/good_uses_service.json b/hooks/validators/evals/fixtures/good_uses_service.json new file mode 100644 index 0000000..86684bd --- /dev/null +++ 
b/hooks/validators/evals/fixtures/good_uses_service.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return UserService.create(data, { executor: ctx.executor });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input, deps) {\n const hashed = await bcrypt.hash(input.password, 10);\n const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n await ensureOrgMembership(user, deps);\n await analytics.capture('user_signed_up', { userId: user.id });\n return user;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/helper_unresolvable_errors.json b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json new file mode 100644 index 0000000..552741a --- /dev/null +++ b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "error", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password\n - 
Provisions Org + Member\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password\n - Provisions Org + Member\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return createUserWithMystery(data);\n },\n}),", + "helper_section": "(The factory calls identifiers that were not resolvable as named imports: createUserWithMystery. Treat this as missing-context, not as evidence of a raw-write factory.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input) {\n return db.user.create({ data: input });\n },\n};" +} diff --git a/hooks/validators/evals/run_evals.py b/hooks/validators/evals/run_evals.py new file mode 100755 index 0000000..fc695ed --- /dev/null +++ b/hooks/validators/evals/run_evals.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +"""Evals for the semantic factory-fidelity validator + the entity-audit +validator's schema invariants. + +Each fixture is a self-contained JSON blob. The kind of fixture is chosen by +`expected_verdict` (or by the `kind` field for non-LLM fixtures): + +- `expected_verdict: "pass" | "fail" | "error"` — LLM fixture. Feeds the + prompt to `claude -p`, parses the JSON verdict, and asserts verdict + + failing criteria match. `error` is used when a fixture deliberately + withholds context (e.g. helper unresolvable) and the LLM should decline + to fail-judge rather than falsely fail. +- `expected_verdict: "skip"` — filter fixture. Asserts that the fidelity + validator's model selector would NOT include this model (i.e. the audit + entry is pure dependent / legacy false). No LLM call, no cost. +- `kind: "audit_validator"` — audit-validator fixture. 
def render_prompt(fixture: dict, rubric: str, tpl: str) -> str:
    """Fill the ``{{...}}`` placeholders of *tpl* for one LLM fixture.

    ``{{RUBRIC}}`` is substituted first, then every fixture-backed
    placeholder in a fixed order. Each step is a plain ``str.replace`` on the
    result of the previous step, so text inserted earlier is still visible to
    later substitutions.

    Args:
        fixture: Parsed fixture. ``model``, ``step2_audit_entry``,
            ``current_audit_entry`` and ``factory_block`` are mandatory
            (``KeyError`` when absent); the other keys fall back to defaults.
        rubric: Text substituted for ``{{RUBRIC}}``.
        tpl: Prompt template containing the placeholders.

    Returns:
        The rendered prompt string.
    """
    mandatory = object()  # sentinel: this slot has no default value
    slots = (
        ("{{MODEL}}", "model", mandatory),
        ("{{STEP2_AUDIT_ENTRY}}", "step2_audit_entry", mandatory),
        ("{{CURRENT_AUDIT_ENTRY}}", "current_audit_entry", mandatory),
        ("{{HANDLER_PATH}}", "handler_path", "(fixture)"),
        ("{{FACTORY_BLOCK}}", "factory_block", mandatory),
        ("{{HELPER_SECTION}}", "helper_section", "(The factory does not call an external helper.)"),
        ("{{ORIGINAL_CREATION_FILE}}", "original_creation_file", "(unknown)"),
        ("{{ORIGINAL_CREATION_SNIPPET}}", "original_creation_snippet", ""),
    )

    rendered = tpl.replace("{{RUBRIC}}", rubric)
    for placeholder, key, default in slots:
        if default is mandatory:
            value = fixture[key]
        else:
            value = fixture.get(key, default)
        rendered = rendered.replace(placeholder, value)
    return rendered
def run_audit_validator_fixture(fixture: dict) -> tuple[bool, str]:
    """Run validate_entity_audit.py on an audit file built from the fixture.

    The fixture's ``audit_frontmatter`` is wrapped in ``---`` delimiters,
    written to a temporary ``entity-audit.md`` and handed to the validator in
    a subprocess (30 s timeout).

    Args:
        fixture: Must contain ``audit_frontmatter``. ``expected_exit``
            defaults to 1 and ``expected_stderr_substring`` to ``""``.

    Returns:
        ``(True, "ok")`` when the exit code matches and the expected
        substring (if any) occurs in stdout+stderr combined; otherwise
        ``(False, reason)``.
    """
    frontmatter = fixture["audit_frontmatter"]
    want_exit = int(fixture.get("expected_exit", 1))
    want_text = fixture.get("expected_stderr_substring", "")
    validator = VALIDATORS / "validate_entity_audit.py"

    with tempfile.TemporaryDirectory() as workdir:
        audit_file = Path(workdir) / "entity-audit.md"
        audit_file.write_text("---\n" + frontmatter + "---\nBody\n")
        result = subprocess.run(
            [sys.executable, str(validator), str(audit_file)],
            capture_output=True, text=True, timeout=30,
        )

    if result.returncode != want_exit:
        return False, (
            f"exit mismatch: expected={want_exit} observed={result.returncode} "
            f"stdout={result.stdout!r} stderr={result.stderr!r}"
        )

    # The substring is searched in both streams, not only stderr.
    output = (result.stdout or "") + (result.stderr or "")
    if want_text and want_text not in output:
        return False, f"expected stderr substring {want_text!r} not in output:\n{output}"
    return True, "ok"
if f.stem == args.only] + if not fixtures: + print(f"no fixture named {args.only}", file=sys.stderr) + return 1 + + # Only fetch rubric if we have any LLM fixtures left in the run list + needs_llm = any( + load_fixture(fp).get("expected_verdict") in ("pass", "fail", "error") + for fp in fixtures + ) + rubric = tpl = None + try: + if needs_llm: + pair = v.fetch_rubric() + if not pair: + print("could not fetch rubric — set AUTONOMA_DOCS_URL", file=sys.stderr) + return 1 + rubric, tpl = pair + finally: + if restore: + url_file.write_text(prior or "") + elif docs: + try: + url_file.unlink() + except OSError: + pass + + fails: list[str] = [] + for fp in fixtures: + fixture = load_fixture(fp) + kind = fixture.get("kind") + expected = fixture.get("expected_verdict") + + if kind == "audit_validator": + ok, detail = run_audit_validator_fixture(fixture) + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: audit_validator") + if not ok: + print(f" reason: {detail}") + fails.append(fp.stem) + continue + + if expected == "skip": + ok, detail = run_skip_fixture(fixture) + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: expected=skip observed={'skip' if ok else 'NOT-skipped'}") + if not ok: + print(f" reason: {detail}") + fails.append(fp.stem) + continue + + # LLM fixture + if args.write_prompt: + print(f"── {fp.stem} ──") + print(render_prompt(fixture, rubric, tpl)) + print() + continue + verdict = v.run_claude(render_prompt(fixture, rubric, tpl)) + observed = verdict.get("verdict", "error") + matched = observed == expected + detail_ok = True + if expected == "fail" and observed == "fail": + expected_fails = set(fixture.get("expected_fail_criteria") or []) + if expected_fails: + observed_fails = {c.get("id") for c in (verdict.get("criteria") or []) if c.get("status") == "fail"} + missing = expected_fails - observed_fails + if missing: + detail_ok = False + ok = matched and detail_ok + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: expected={expected} 
observed={observed}") + if not ok: + print(f" reason: expected criteria={fixture.get('expected_fail_criteria')} observed={[c for c in (verdict.get('criteria') or [])]}") + print(f" fix_hint: {verdict.get('fix_hint','')}") + fails.append(fp.stem) + + if fails: + print(f"\n{len(fails)} eval failure(s): {', '.join(fails)}", file=sys.stderr) + return 1 + print("\nall evals passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hooks/validators/validate_creation_file_immutable.py b/hooks/validators/validate_creation_file_immutable.py new file mode 100755 index 0000000..3bfdf87 --- /dev/null +++ b/hooks/validators/validate_creation_file_immutable.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Validator: `creation_file` must be immutable after Step 2. + +Rationale — Run 4 post-mortem. The env-factory agent evaded the factory +integrity hook by (a) extracting stubs into a new file under the handler's +directory and (b) rewriting `creation_file` in the audit to point at the stub, +so every downstream check validated against fabricated ground truth. + +Rule: for every model with `has_creation_code: true` in BOTH the Step 2 +snapshot AND the current audit, the `creation_file` column must not change. +Allowed transitions: + - row removed from current (not a change, model dropped) + - has_creation_code flipped true -> false (covered by the audit-flip cap) + - a new model added in current (snapshot has no row to compare) + +Exit 0 = clean. Exit 2 with actionable message on violation. 
def load_audit(path: Path) -> dict[str, dict]:
    """Return the audit's model entries keyed by model name.

    Parses the YAML frontmatter (the text between the leading ``---`` and the
    next line starting with ``---``) and indexes every mapping found under
    ``models`` by its ``name`` (falling back to ``model``).

    Unusable input never raises from the parsing steps; it yields ``{}``:
    missing file, no frontmatter, unterminated frontmatter, invalid YAML,
    frontmatter that is not a mapping (e.g. empty), or a ``models`` value
    that is not a list.

    Args:
        path: Location of the audit markdown file.

    Returns:
        ``{model_name: entry_dict}``; entries that are not mappings or have
        no name are skipped.
    """
    if not path.exists():
        return {}
    text = path.read_text()
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # safe_load gives None for empty frontmatter and may give a scalar or a
    # list; calling .get() on those used to crash with AttributeError.
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        return {}

    out: dict[str, dict] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        out[str(name)] = entry
    return out
TYPE_PATTERN = re.compile(r"^(?:[A-Za-z][A-Za-z0-9_]*|enum\([^()]+\))(?:\[\])?$")


def _reject(message):
    """Print *message* on stdout and stop the validator with exit status 1."""
    print(message)
    sys.exit(1)


def _require_keys(obj, keys, label):
    """Reject on the first key of *keys* (checked in order) missing from *obj*."""
    for key in keys:
        if key not in obj:
            _reject(f'{label} missing required field: {key}')


filepath = sys.argv[1]

try:
    with open(filepath) as fh:
        payload = json.load(fh)
except Exception as e:
    _reject(f'Invalid JSON: {e}')

if not isinstance(payload, dict):
    _reject('discover.json must contain a JSON object')

schema = payload.get('schema')
if not isinstance(schema, dict):
    _reject('discover.json must contain a "schema" object')

required_schema_fields = ['models', 'edges', 'relations', 'scopeField']
missing = [f for f in required_schema_fields if f not in schema]
if missing:
    _reject(f'schema is missing required fields: {missing}')

# --- models: every model needs a name and a list of well-formed fields ---
models = schema.get('models')
if not isinstance(models, list) or not models:
    _reject('schema.models must be a non-empty list')

for i, model in enumerate(models):
    model_label = f'schema.models[{i}]'
    if not isinstance(model, dict):
        _reject(f'{model_label} must be an object')
    model_name = model.get('name')
    if not isinstance(model_name, str) or not model_name.strip():
        _reject(f'{model_label}.name must be a non-empty string')
    fields = model.get('fields')
    if not isinstance(fields, list):
        _reject(f'{model_label}.fields must be a list')
    for j, field in enumerate(fields):
        field_label = f'{model_label}.fields[{j}]'
        if not isinstance(field, dict):
            _reject(f'{field_label} must be an object')
        _require_keys(field, ['name', 'type', 'isRequired', 'isId', 'hasDefault'], field_label)
        field_type = field.get('type')
        if not isinstance(field_type, str) or not field_type.strip():
            _reject(f'{field_label}.type must be a non-empty string')
        # Accepted: an identifier or enum(...), optionally suffixed with [].
        if TYPE_PATTERN.match(field_type.strip()) is None:
            _reject(f'{field_label}.type must use a supported type format, got: {field_type}')

# --- edges ---
edges = schema.get('edges')
if not isinstance(edges, list):
    _reject('schema.edges must be a list')

for i, edge in enumerate(edges):
    if not isinstance(edge, dict):
        _reject(f'schema.edges[{i}] must be an object')
    _require_keys(edge, ['from', 'to', 'localField', 'foreignField', 'nullable'], f'schema.edges[{i}]')

# --- relations ---
relations = schema.get('relations')
if not isinstance(relations, list):
    _reject('schema.relations must be a list')

for i, relation in enumerate(relations):
    if not isinstance(relation, dict):
        _reject(f'schema.relations[{i}] must be an object')
    _require_keys(
        relation,
        ['parentModel', 'childModel', 'parentField', 'childField'],
        f'schema.relations[{i}]',
    )

# --- scopeField ---
scope_field = schema.get('scopeField')
if not isinstance(scope_field, str) or not scope_field.strip():
    _reject('schema.scopeField must be a non-empty string')

print('OK')
def find_matching_brace(src: str, open_idx: int) -> int:
    """Return the index of the ``}`` that balances the ``{`` at *open_idx*.

    Scans forward from *open_idx*, counting ``{`` up and ``}`` down, and
    stops at the first ``}`` that brings the count back to zero. This is a
    plain character balancer: braces inside string literals or comments are
    counted like any other.

    Returns:
        The index of the closing brace, or ``-1`` if the text ends first.
    """
    nesting = 0
    for pos in range(open_idx, len(src)):
        ch = src[pos]
        if ch == "{":
            nesting += 1
            continue
        if ch != "}":
            continue
        nesting -= 1
        if nesting == 0:
            return pos
    return -1
def parse_audit() -> dict[str, bool]:
    """Return ``{model_name: has_creation_code}`` from autonoma/entity-audit.md.

    The flag is read from ``independently_created`` when an entry carries
    that key (the v2 audit schema) and from ``has_creation_code`` otherwise
    (v1). Reading only the v1 key made every v2 entry look like ``False``,
    which silently exempted all of its factories from the integrity check.

    Every malformed-audit condition goes through ``fail`` (message on stderr,
    exit status 2): missing file, missing or unterminated frontmatter,
    invalid YAML, frontmatter that is not a mapping, or a ``models`` value
    that is not a list.

    Returns:
        Mapping from model name (``name``, falling back to ``model``) to the
        truthiness of its creation flag. Entries that are not mappings or
        have no name are skipped.
    """
    audit_path = Path("autonoma/entity-audit.md")
    if not audit_path.exists():
        fail("Missing autonoma/entity-audit.md — cannot verify factory integrity.")
    text = audit_path.read_text()
    if not text.startswith("---"):
        fail("autonoma/entity-audit.md missing YAML frontmatter.")
    end = text.find("\n---", 3)
    if end < 0:
        fail("autonoma/entity-audit.md frontmatter not terminated.")
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError as e:
        fail(f"autonoma/entity-audit.md frontmatter not valid YAML: {e}")
    # safe_load returns None for empty frontmatter (or a scalar/list); without
    # this guard the .get() below raised AttributeError instead of exiting 2.
    if not isinstance(fm, dict):
        fail("autonoma/entity-audit.md frontmatter must be a YAML mapping.")
    models = fm.get("models") or []
    if not isinstance(models, list):
        fail("autonoma/entity-audit.md frontmatter `models` must be a list.")

    out: dict[str, bool] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        # Prefer the v2 key when present; fall back to the legacy v1 key.
        if "independently_created" in entry:
            flag = entry["independently_created"]
        else:
            flag = entry.get("has_creation_code")
        out[str(name)] = bool(flag)
    return out
def check_audit_flip() -> list[str]:
    """Compare the Step 2 snapshot to the current audit; return error lines.

    A model counts as "flipped" when it is flagged as having creation code in
    autonoma/.entity-audit-step2.md but is not flagged (or is absent) in
    autonoma/entity-audit.md. More than ``AUDIT_FLIP_CAP`` flips produces the
    error message; otherwise the result is empty.

    The flag is taken from ``independently_created`` when an entry has that
    key (v2 schema) and from ``has_creation_code`` otherwise (v1). Reading
    only the v1 key meant v2 audits could never trip the cap, and a v1
    snapshot compared against a v2 audit reported every model as flipped.

    Returns:
        ``[]`` when either file is missing or the cap is not exceeded;
        otherwise the lines of the error message. A file whose frontmatter
        cannot be read as a mapping contributes no flagged models.
    """
    snapshot = Path("autonoma/.entity-audit-step2.md")
    current = Path("autonoma/entity-audit.md")
    if not snapshot.exists() or not current.exists():
        return []

    def _true_set(path: Path) -> set[str]:
        """Names of the models in *path* whose creation flag is truthy."""
        text = path.read_text()
        if not text.startswith("---"):
            return set()
        end = text.find("\n---", 3)
        if end < 0:
            return set()
        try:
            fm = yaml.safe_load(text[3:end])
        except yaml.YAMLError:
            return set()
        # Empty frontmatter parses to None; .get() on it used to raise.
        if not isinstance(fm, dict):
            return set()
        models = fm.get("models")
        if not isinstance(models, list):
            return set()
        out: set[str] = set()
        for entry in models:
            if not isinstance(entry, dict):
                continue
            name = entry.get("name") or entry.get("model")
            if "independently_created" in entry:
                flag = entry["independently_created"]
            else:
                flag = entry.get("has_creation_code")
            if name and bool(flag):
                out.add(str(name))
        return out

    before = _true_set(snapshot)
    after = _true_set(current)
    flipped = sorted(before - after)
    if len(flipped) <= AUDIT_FLIP_CAP:
        return []

    lines = [
        f"AUDIT FLIP CAP EXCEEDED — {len(flipped)} models flipped from "
        f"has_creation_code: true to false since Step 2 (cap: {AUDIT_FLIP_CAP}).",
        "",
        "The env-factory agent is editing ground truth to dodge the factory "
        "integrity check. Branch 3 (\"audit is factually wrong\") is for cases "
        "where the audit's creation_function does NOT exist or creates NOTHING "
        "— not for cases where calling it is inconvenient (complex DI, external "
        "side effects, Temporal workflows, bulk orchestrators). Those are "
        "Branch 2 problems: extract helpers, wire constructor deps, or guard "
        "external calls in the service itself.",
        "",
        "Models flipped (showing first 40):",
    ]
    for name in flipped[:40]:
        lines.append(f" - {name}")
    if len(flipped) > 40:
        lines.append(f" ... and {len(flipped) - 40} more")
    lines.append("")
    lines.append(
        "To proceed: (a) restore has_creation_code: true for the models above "
        "and write real factories per the Per-model decision tree, or (b) if "
        "you truly believe a subset should flip, ask the user to raise "
        "AUTONOMA_AUDIT_FLIP_CAP and confirm the diff."
    )
    return lines
We use the + # last two path segments (parent-dir/file-stem) to avoid false positives + # from unrelated packages that happen to share the parent-dir name (e.g. + # `@autonoma/logger` vs the local `autonoma/handler`). + handler_basename = handler_path.stem # e.g. "handler" + handler_parent_dir = handler_dir.name # e.g. "autonoma" + specific_fragment = f"{handler_parent_dir}/{handler_basename}" # "autonoma/handler" + # Also accept any file in the same parent directory (routes on the router + # file next to handler.ts still count as mounting — e.g. autonoma/router.ts + # is imported by app.ts and imports handler.ts). + import_patterns = [ + re.compile(rf"['\"][^'\"]*{re.escape(specific_fragment)}(?:['\"]|\.[a-z]+['\"])"), + re.compile(rf"\bfrom\s+[\w.]*{re.escape(handler_parent_dir)}\.{re.escape(handler_basename)}\b"), # python + ] + found_import = False + root = Path.cwd() + # Only scan source dirs with reasonable extensions. + source_exts = {".ts", ".tsx", ".js", ".mjs", ".cjs", ".py", ".rb", ".go", ".rs", ".java", ".ex", ".exs", ".php"} + skip_dirs = {"node_modules", ".git", "dist", "build", ".next", ".turbo", "target", "vendor", "__pycache__", "autonoma"} + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")] + for fn in filenames: + if not any(fn.endswith(ext) for ext in source_exts): + continue + fp = Path(dirpath) / fn + if fp.resolve() == handler_path.resolve(): + continue + if fp.parent.resolve() == handler_path.parent.resolve(): + # Don't count imports inside the handler's own directory — the + # standalone server.ts imports handler.ts but that isn't + # "reachable from the main app". 
+ continue + try: + text = fp.read_text() + except OSError: + continue + for pat in import_patterns: + if pat.search(text): + found_import = True + break + if found_import: + break + if found_import: + break + + if not found_import: + errors.append( + f"HANDLER NOT MOUNTED — no file outside {handler_dir} imports the " + f"Autonoma handler. The endpoint is unreachable from the main " + f"application's routes." + ) + errors.append("") + errors.append( + "Fix: import the handler (or its router) from the main app's entry " + "file (e.g. apps/api/src/app.ts) and mount it on a route. The " + "Autonoma platform sends HMAC-signed requests to the main API's " + "public URL — a handler that nothing imports is dead code." + ) + errors.append("") + + return errors + + +def main() -> None: + audit = parse_audit() + handler_path = resolve_handler_path() + src = handler_path.read_text() + + violations: list[tuple[str, int, str]] = [] + factories = extract_factory_bodies(src) + + seen_models: set[str] = set() + for model, factory_src in factories: + seen_models.add(model) + if not audit.get(model): + # has_creation_code: false or unknown — ORM fallback is legitimate. + continue + create_body = extract_create_body(factory_src) + if not create_body: + continue + for m in ORM_ANTI_PATTERN.finditer(create_body): + line_no = create_body[: m.start()].count("\n") + 1 + snippet = create_body.splitlines()[line_no - 1].strip() + violations.append((model, line_no, snippet)) + for m in DRIZZLE_INSERT.finditer(create_body): + line_no = create_body[: m.start()].count("\n") + 1 + snippet = create_body.splitlines()[line_no - 1].strip() + violations.append((model, line_no, snippet)) + + # Flag audited models missing a factory entirely. 
+ missing_factories = [ + name for name, has_code in audit.items() if has_code and name not in seen_models + ] + + audit_flip_errors = check_audit_flip() + mount_errors = check_handler_mount(handler_path) + + if not violations and not missing_factories and not audit_flip_errors and not mount_errors: + sys.exit(0) + + lines = [ + "FACTORY INTEGRITY CHECK FAILED — .endpoint-implemented will NOT be written.", + "", + f"Handler inspected: {handler_path}", + "", + ] + if violations: + lines.append( + "The following factories contain inline ORM writes for models the audit " + "marked has_creation_code: true. This is the #1 trap the env-factory " + "agent is warned about. You MUST call the audited creation_function " + "(extracting it first if needs_extraction: true). See the Per-model " + "decision tree and DI playbook in the env-factory prompt." + ) + lines.append("") + for model, line_no, snippet in violations: + lines.append(f" - {model} factory body: line {line_no}: {snippet}") + lines.append("") + if missing_factories: + lines.append( + "The following models are has_creation_code: true in the audit but have " + "no defineFactory registration in the handler:" + ) + for name in missing_factories: + lines.append(f" - {name}") + lines.append("") + if audit_flip_errors: + lines.extend(audit_flip_errors) + if mount_errors: + lines.extend(mount_errors) + if violations or missing_factories: + lines.append( + "To fix: re-run the Per-model decision tree for every failing model. If the " + "creation function is inline in a route/framework hook, extract it into a " + "named exported function, update entity-audit.md in place (clear " + "needs_extraction), then call the new function from the factory." 
filepath = sys.argv[1]
# Path.read_text() closes the file; the bare open().read() left the handle
# to the garbage collector.
content = Path(filepath).read_text()

if not content.startswith('---'):
    print('File must start with YAML frontmatter (---)')
    sys.exit(1)

# The closing delimiter must start a line. Splitting on every '---' cut the
# frontmatter short whenever a value contained '---', and the autofix rewrite
# at the bottom would then write the truncated text back to disk.
fm_end = content.find('\n---', 3)
if fm_end < 0:
    print('Missing closing --- for frontmatter')
    sys.exit(1)
# Same layout the old split() produced: ['', frontmatter, body].
parts = ['', content[3:fm_end], content[fm_end + 4:]]

try:
    fm = yaml.safe_load(parts[1])
except Exception as e:
    print(f'Invalid YAML in frontmatter: {e}')
    sys.exit(1)

if not isinstance(fm, dict):
    print('Frontmatter must be a YAML mapping')
    sys.exit(1)

required = ['model_count', 'factory_count', 'models']
missing = [f for f in required if f not in fm]
if missing:
    print(f'Missing required frontmatter fields: {missing}')
    sys.exit(1)

for count_field in ['model_count', 'factory_count']:
    val = fm.get(count_field)
    # bool is a subclass of int, so `model_count: true` used to pass as 1.
    if isinstance(val, bool) or not isinstance(val, int) or val < 0:
        print(f'{count_field} must be a non-negative integer')
        sys.exit(1)

if fm['model_count'] < 1:
    print('model_count must be at least 1 — no models were audited')
    sys.exit(1)

models = fm.get('models')
if not isinstance(models, list) or len(models) == 0:
    print('models must be a non-empty list')
    sys.exit(1)

if len(models) != fm['model_count']:
    print(f'model_count ({fm["model_count"]}) does not match models array length ({len(models)})')
    sys.exit(1)


def is_indep(model):
    """Return the model's classification: v2 key if present, else the v1 key."""
    if 'independently_created' in model:
        return bool(model['independently_created'])
    return bool(model.get('has_creation_code'))


# First pass: sanity + collect names for cross-reference
names = set()
for i, model in enumerate(models):
    if not isinstance(model, dict):
        print(f'models[{i}] must be a mapping')
        sys.exit(1)
    if 'name' not in model or not isinstance(model['name'], str) or not model['name'].strip():
        print(f'models[{i}].name must be a non-empty string')
        sys.exit(1)
    names.add(model['name'])

# Second pass: schema checks per model
factory_count = 0
for i, model in enumerate(models):
    name = model['name']
    has_v2 = 'independently_created' in model
    has_v1 = 'has_creation_code' in model
    if not has_v2 and not has_v1:
        print(f'models[{i}] ({name}) missing classification (independently_created or has_creation_code)')
        sys.exit(1)
    if has_v2 and not isinstance(model['independently_created'], bool):
        print(f'models[{i}] ({name}).independently_created must be a boolean')
        sys.exit(1)
    if has_v1 and not isinstance(model['has_creation_code'], bool):
        print(f'models[{i}] ({name}).has_creation_code must be a boolean')
        sys.exit(1)

    indep = is_indep(model)

    if indep:
        factory_count += 1
        if 'creation_file' not in model or not isinstance(model.get('creation_file'), str):
            print(f'models[{i}] ({name}) independently_created=true but missing creation_file')
            sys.exit(1)
        if 'creation_function' not in model or not isinstance(model.get('creation_function'), str):
            print(f'models[{i}] ({name}) independently_created=true but missing creation_function')
            sys.exit(1)
        if 'side_effects' in model and not isinstance(model['side_effects'], list):
            print(f'models[{i}] ({name}) side_effects must be a list when present')
            sys.exit(1)

    # created_by invariants (v2 only — v1 has no such field)
    cb = model.get('created_by')
    if cb is None:
        # v1 audits don't have it; v2 requires it (empty allowed for roots)
        if has_v2:
            print(f'models[{i}] ({name}) missing required field: created_by (list, may be empty)')
            sys.exit(1)
        continue

    if not isinstance(cb, list):
        print(f'models[{i}] ({name}).created_by must be a list')
        sys.exit(1)

    if not indep and len(cb) == 0:
        print(
            f'models[{i}] ({name}) is marked independently_created=false but has no '
            'created_by entries. Every dependent must have at least one owner — '
            'either find the creation path, or mark the model independently_created=true.'
        )
        sys.exit(1)

    for j, owner_entry in enumerate(cb):
        if not isinstance(owner_entry, dict):
            print(f'models[{i}] ({name}).created_by[{j}] must be a mapping')
            sys.exit(1)
        for req in ('owner', 'via', 'why'):
            val = owner_entry.get(req)
            if not isinstance(val, str) or not val.strip():
                print(
                    f'models[{i}] ({name}).created_by[{j}].{req} must be a non-empty string'
                )
                sys.exit(1)
        if owner_entry['owner'] not in names:
            print(
                f'models[{i}] ({name}).created_by[{j}].owner={owner_entry["owner"]!r} '
                f'does not match any model in the audit. Check the owner name or add the owner model.'
            )
            sys.exit(1)
        if owner_entry['owner'] == name:
            print(f'models[{i}] ({name}).created_by[{j}].owner cannot be the model itself')
            sys.exit(1)

if factory_count != fm['factory_count']:
    # Autofix the count instead of blocking. Count-drift is bookkeeping, not a
    # structural bug — the previous behaviour made the agent oscillate between
    # stale counts on every edit. Warn loudly but keep the pipeline moving.
    sys.stderr.write(
        f'[validate-entity-audit] autofixing factory_count: was '
        f'{fm["factory_count"]}, now {factory_count}\n'
    )
    # Rewrite the file in place, preserving the body. The frontmatter is
    # re-serialised, so its original comments and formatting are not kept.
    fm['factory_count'] = factory_count
    new_fm = yaml.safe_dump(fm, sort_keys=False).rstrip() + "\n"
    rewritten = '---\n' + new_fm + '---' + parts[2]
    Path(filepath).write_text(rewritten)

print('OK')
def load_audit(path: Path) -> dict[str, dict]:
    """Load the ``models`` list from an audit file's YAML frontmatter.

    Args:
        path: Markdown file that starts with a ``---`` delimited frontmatter.

    Returns:
        A mapping of model name (``name``, falling back to ``model``) to its
        raw frontmatter entry. An empty dict is returned when the file is
        missing, has no complete frontmatter, contains invalid YAML, or the
        frontmatter is not a mapping with a ``models`` list.
    """
    if not path.exists():
        return {}
    text = path.read_text()
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # safe_load returns None for an empty document and may return a list or
    # scalar; calling .get() on those used to raise AttributeError.
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        return {}
    out: dict[str, dict] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if name:
            out[str(name)] = entry
    return out
def find_factory_block(handler_src: str, model: str) -> str:
    """Return the ``<model>: defineFactory({ ... }`` registration for *model*.

    The result runs from the start of the line holding the registration up to
    and including the brace that closes the object passed to ``defineFactory``.
    Braces are counted literally (string and comment contents are not
    skipped). Returns ``""`` when the registration is absent or never closes.
    """
    match = re.search(rf"\b{re.escape(model)}\s*:\s*defineFactory\s*\(\s*\{{", handler_src)
    if match is None:
        return ""
    # The header pattern ends on the opening brace, so this lands on it.
    open_at = handler_src.find("{", match.end() - 1)
    if open_at == -1:
        return ""
    line_start = handler_src.rfind("\n", 0, match.start()) + 1
    nesting = 0
    for pos in range(open_at, len(handler_src)):
        ch = handler_src[pos]
        if ch == "{":
            nesting += 1
        elif ch == "}":
            nesting -= 1
            if nesting == 0:
                return handler_src[line_start : pos + 1]
    return ""
Silently returns [] on any parse error.""" + roots: list[Path] = [cwd] + cur = cwd + for _ in range(4): + cur = cur.parent + roots.append(cur) + seen: set[Path] = set() + out: list[tuple[str, list[str]]] = [] + for root in roots: + for name in ("tsconfig.json", "tsconfig.base.json"): + p = root / name + if p in seen or not p.is_file(): + continue + seen.add(p) + try: + raw = p.read_text() + raw = re.sub(r"//[^\n]*", "", raw) + raw = re.sub(r",\s*([}\]])", r"\1", raw) + data = json.loads(raw) + except Exception: + continue + co = (data.get("compilerOptions") or {}) + base_url = co.get("baseUrl") or "." + base_dir = (p.parent / base_url).resolve() + for prefix, resolutions in (co.get("paths") or {}).items(): + if not isinstance(resolutions, list): + continue + resolved = [str((base_dir / r).resolve()) for r in resolutions if isinstance(r, str)] + out.append((prefix, resolved)) + return out + + +def _resolve_import_path(rel: str, handler_path: Path, alias_map: list[tuple[str, list[str]]]) -> Optional[Path]: + """Resolve an import specifier to a filesystem path. 
def find_helpers(handler_src: str, handler_path: Path, factory_block: str) -> list[tuple[Path, str, str]]:
    """Return every helper the factory block invokes via a named import.

    String and template literals are stripped from *factory_block* first so
    identifiers inside quotes don't produce false calls.

    Args:
        handler_src: Full source of the handler file (scanned for
            ``import { ... } from '...'`` statements).
        handler_path: Path of the handler file, used to resolve relative
            import specifiers.
        factory_block: Source of one factory registration.

    Returns:
        ``(helper_path, local_name, helper_source)`` tuples, ordered by local
        name. ``local_name`` is the identifier as called in the factory (the
        alias when the import uses ``as``). ``helper_source`` is the function
        snippet, or the first 4000 characters of the file when the function
        cannot be located.
    """
    if not factory_block:
        return []
    stripped = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"|`[^`]*`", "''", factory_block)
    candidates = set(re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", stripped)) - _IDENT_BLOCKLIST
    alias_map = _load_tsconfig_paths(Path.cwd())
    # local name -> (module specifier, name the module exports it under)
    imports: dict[str, tuple[str, str]] = {}
    for m in re.finditer(
        r"import\s+(?:type\s+)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]",
        handler_src,
    ):
        spec = m.group(2)
        for item in m.group(1).split(","):
            item = item.strip()
            exported = local = item
            if " as " in item:
                exported, local = (part.strip() for part in item.split(" as ", 1))
            if local:
                imports[local] = (spec, exported or local)
    out: list[tuple[Path, str, str]] = []
    for local in sorted(candidates):
        if local not in imports:
            continue
        spec, exported = imports[local]
        resolved = _resolve_import_path(spec, handler_path, alias_map)
        # No per-path de-duplication: two helpers imported from the same
        # module are distinct functions and both must be reported.
        if not resolved:
            continue
        try:
            text = resolved.read_text()
        except OSError:
            continue
        # The helper file defines the function under its exported name, not
        # under the alias the handler gave it.
        snippet = extract_fn_snippet(text, exported) or text[:4000]
        out.append((resolved, local, snippet))
    return out
def _skip_balanced_parens(src: str, open_paren: int) -> int:
    """Return the index just past the ``)`` matching ``src[open_paren]``, or -1."""
    depth = 0
    for pos in range(open_paren, len(src)):
        ch = src[pos]
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                return pos + 1
    return -1


def extract_fn_snippet(src: str, fn_name: str, max_lines: Optional[int] = None) -> str:
    """Return the source of *fn_name* from JS/TS text, or ``""`` if not found.

    Recognises ``export [async] function name(``, ``export const name =`` and
    ``[async] function name(``, tried in that order. The snippet runs from the
    start of the declaration's line to the brace closing its body.

    Args:
        src: Source text to search.
        fn_name: Function (or exported const) name.
        max_lines: Maximum number of lines to return. Defaults to the
            module-level ``SNIPPET_MAX_LINES``.

    Braces are counted literally. Braces inside a return-type annotation
    (``): Promise<{ ... }> {``) are still mistaken for the body.
    """
    limit = max_lines if max_lines is not None else SNIPPET_MAX_LINES
    name = re.escape(fn_name)
    patterns = [
        rf"export\s+(?:async\s+)?function\s+{name}\s*\(",
        rf"export\s+const\s+{name}\s*=",
        rf"(?:async\s+)?function\s+{name}\s*\(",
    ]
    for pat in patterns:
        m = re.search(pat, src)
        if not m:
            continue
        # Skip the parameter list before looking for the body: destructured
        # parameters such as `fn({ a, b }) {` contain braces of their own.
        search_from = m.end()
        if src[m.end() - 1] == "(":
            paren = m.end() - 1
        else:
            arrow = re.compile(r"\s*(?:async\s*)?\(").match(src, m.end())
            paren = arrow.end() - 1 if arrow else -1
        if paren >= 0:
            after_params = _skip_balanced_parens(src, paren)
            if after_params >= 0:
                search_from = after_params
        brace = src.find("{", search_from)
        if brace < 0:
            continue
        depth = 0
        for pos in range(brace, len(src)):
            ch = src[pos]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    start = src.rfind("\n", 0, m.start()) + 1
                    snippet = src[start : pos + 1]
                    return "\n".join(snippet.splitlines()[:limit])
    return ""
def fill_template(
    tpl: str,
    rubric: str,
    model: str,
    snap_entry: dict,
    cur_entry: Optional[dict],
    handler_path: Path,
    factory_block: str,
    helpers: list[tuple[Path, str, str]],
    unresolved_calls: list[str],
    orig_path: str,
    orig_snippet: str,
) -> str:
    """Render the per-model prompt by substituting ``{{PLACEHOLDER}}`` markers.

    Placeholders are replaced one after another in a fixed order, so text
    inserted by an earlier substitution is visible to the later ones. The
    helper section lists each resolved helper's file, function name and
    source, or a fixed note when the factory calls no resolvable helper.
    """
    unresolved_list = ", ".join(unresolved_calls)
    if helpers:
        helper_section = "\n\n".join(
            f"File: {path}\nFunction: {fn}\n\n```\n{source}\n```"
            for path, fn, source in helpers
        )
        if unresolved_calls:
            helper_section += (
                "\n\n(Additional identifiers called by the factory were not resolvable "
                f"as imports and may or may not be helpers: {unresolved_list})"
            )
    elif unresolved_calls:
        helper_section = (
            "(The factory calls identifiers that were not resolvable as named imports: "
            f"{unresolved_list}. Treat this as missing-context, not as evidence "
            "of a raw-write factory.)"
        )
    else:
        helper_section = "(The factory does not call an external helper.)"

    current_entry_text = (
        yaml_entry(cur_entry) if cur_entry else "(model not present in current audit)"
    )
    # Order matters: it mirrors the sequence of chained replacements.
    substitutions = [
        ("{{RUBRIC}}", rubric),
        ("{{MODEL}}", model),
        ("{{STEP2_AUDIT_ENTRY}}", yaml_entry(snap_entry)),
        ("{{CURRENT_AUDIT_ENTRY}}", current_entry_text),
        ("{{HANDLER_PATH}}", str(handler_path)),
        ("{{FACTORY_BLOCK}}", factory_block or "(factory registration not found)"),
        ("{{HELPER_SECTION}}", helper_section),
        ("{{NEEDS_EXTRACTION}}", "true" if snap_entry.get("needs_extraction") else "false"),
        ("{{EXTRACTED_TO}}", str(snap_entry.get("extracted_to") or "").strip() or "(not set)"),
        ("{{ORIGINAL_CREATION_FILE}}", orig_path or "(unknown)"),
        ("{{ORIGINAL_CREATION_SNIPPET}}", orig_snippet),
    ]
    filled = tpl
    for placeholder, replacement in substitutions:
        filled = filled.replace(placeholder, replacement)
    return filled
+ + Model is configurable via AUTONOMA_FIDELITY_MODEL (defaults to "sonnet", + which is cheap, fast, and reliable for bounded rubric tasks). Set to empty + string to inherit whatever model the CLI picks. + """ + cmd = ["claude", "-p", "--output-format", "json"] + model = os.environ.get("AUTONOMA_FIDELITY_MODEL", "sonnet") + if model: + cmd.extend(["--model", model]) + try: + proc = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=PER_MODEL_TIMEOUT, + ) + except subprocess.TimeoutExpired: + return {"verdict": "error", "error": "timeout"} + except FileNotFoundError: + return {"verdict": "error", "error": "claude CLI not found"} + if proc.returncode != 0: + return {"verdict": "error", "error": f"claude exit {proc.returncode}: {proc.stderr[:400]}"} + out = proc.stdout.strip() + # Outer envelope from `claude -p --output-format json` wraps the assistant + # response in a JSON object with a "result" field containing the text. + try: + envelope = json.loads(out) + except json.JSONDecodeError: + # Assume raw stdout is the JSON we asked for. 
def parse_verdict(text: str) -> dict:
    """Parse a model reply into a verdict dict.

    Accepts bare JSON, JSON wrapped in a Markdown code fence, or a JSON
    object embedded in surrounding prose.

    Returns:
        The parsed JSON object. When no JSON *object* can be recovered, a
        ``{"verdict": "error", "error": ...}`` dict is returned instead, so
        callers can always treat the result as a mapping.
    """
    text = text.strip()
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\n", "", text)
        text = re.sub(r"\n```\s*$", "", text)
    # Try the whole reply first, then the outermost {...} span inside it.
    candidates = [text]
    embedded = re.search(r"\{.*\}", text, re.DOTALL)
    if embedded:
        candidates.append(embedded.group(0))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is a list, string or number is not a verdict;
        # returning it would crash callers that assign verdict["model"].
        if isinstance(parsed, dict):
            return parsed
    return {"verdict": "error", "error": f"could not parse verdict: {text[:300]}"}
snap[model] + cur_entry = cur.get(model) + factory_block = find_factory_block(handler_src, model) + helpers = find_helpers(handler_src, handler_path, factory_block) if factory_block else [] + unresolved = _unresolved_calls(handler_src, factory_block, helpers) if factory_block else [] + orig_path, orig_snippet = load_original_snippet(snap_entry) + prompt = fill_template( + tpl, rubric, model, snap_entry, cur_entry, handler_path, + factory_block, helpers, unresolved, orig_path, orig_snippet, + ) + tasks.append({"model": model, "prompt": prompt}) + + t0 = time.time() + warn(f"running semantic validation for {len(tasks)} models (concurrency={CONCURRENCY}).") + + results: list[dict] = [] + with futures.ThreadPoolExecutor(max_workers=CONCURRENCY) as ex: + for res in ex.map(validate_one, tasks): + results.append(res) + + elapsed = time.time() - t0 + warn(f"semantic validation complete in {elapsed:.1f}s.") + + failures = [r for r in results if r.get("verdict") == "fail"] + errors = [r for r in results if r.get("verdict") == "error"] + passes = [r for r in results if r.get("verdict") == "pass"] + + warn(f"results: {len(passes)} pass, {len(failures)} fail, {len(errors)} error.") + + if errors and not failures: + # Don't block on our own infra errors; log and allow. + warn("no hard failures; transient errors will not block the sentinel.") + for e in errors[:5]: + warn(f" - {e.get('model','?')}: {e.get('error','')[:200]}") + sys.exit(0) + + if not failures: + sys.exit(0) + + lines = [ + f"FACTORY FIDELITY CHECK FAILED — {len(failures)} of {len(results)} models " + "do not faithfully reproduce their Step 2 creation behaviour.", + "", + "This is the semantic check. 
It reads the Step 2 snapshot (ground truth), " + "the current audit, the factory registration, and the original creation " + "function, then applies the rubric at:", + " $(cat autonoma/.docs-url)/llms/test-planner/factory-fidelity-rubric.txt", + "", + "Per-model feedback:", + "", + ] + for r in failures: + model = r.get("model", "?") + lines.append(f"── {model} ──") + for c in r.get("criteria", []) or []: + if c.get("status") == "fail": + lines.append(f" ✗ Criterion {c.get('id')}: {c.get('reason','')}") + fix = r.get("fix_hint", "") + if fix: + lines.append(f" → Fix: {fix}") + lines.append("") + lines.append( + "To fix: for each failing model, either (a) call the original " + "creation_function from the Step 2 audit (the one in the APPLICATION " + "codebase, not the helper the factory wrote), or (b) make the helper a " + "thin wrapper that calls that function. Do NOT leave bare ORM inserts " + "in the helper. If a side effect truly conflicts with the SDK's " + "scenario tree (e.g. sibling rows get created twice), document in a " + "comment which sibling factory owns that row and reference it." 
# A discover type is an identifier or enum(...), optionally suffixed with [].
TYPE_PATTERN = re.compile(r"^(?:[A-Za-z][A-Za-z0-9_]*|enum\([^()]+\))(?:\[\])?$")
# Values that are a {{token}} placeholder or a _ref: reference.
TOKEN_OR_REF_PATTERN = re.compile(r"^(?:\{\{\w+\}\}|_ref:.+)$")


def _parse_type(type_name):
    """Parse a discover-schema type string into a small descriptor dict.

    Returns ``None`` when *type_name* is not a string or does not match
    ``TYPE_PATTERN``. Otherwise returns either
    ``{'kind': 'enum', 'values': [...], 'is_list': bool}`` or
    ``{'kind': 'scalar', 'name': str, 'is_list': bool}``; ``is_list`` is set
    when the type carries a trailing ``[]``. Blank enum members are dropped.
    """
    if not (isinstance(type_name, str) and TYPE_PATTERN.match(type_name)):
        return None

    is_list = type_name.endswith('[]')
    base = type_name[:-2] if is_list else type_name

    if base.startswith('enum(') and base.endswith(')'):
        members = (part.strip() for part in base[5:-1].split(','))
        return {
            'kind': 'enum',
            'values': [member for member in members if member],
            'is_list': is_list,
        }

    return {'kind': 'scalar', 'name': base, 'is_list': is_list}
Exception as exc: + return None, f'source.discoverPath is not valid JSON: {exc}' + + schema = payload.get('schema') + if not isinstance(schema, dict): + return None, 'source.discoverPath must point to a discover file with a "schema" object' + + models = schema.get('models') + if not isinstance(models, list): + return None, 'source.discoverPath schema.models must be a list' + + model_map = {} + for model in models: + if not isinstance(model, dict): + continue + name = model.get('name') + fields = model.get('fields') + if not isinstance(name, str) or not isinstance(fields, list): + continue + field_map = {} + for field in fields: + if not isinstance(field, dict): + continue + field_name = field.get('name') + field_type = field.get('type') + if isinstance(field_name, str) and isinstance(field_type, str): + field_map[field_name] = field + model_map[name] = field_map + + # Collect relation field names used as nesting keys in nested tree create payloads + relation_fields = set() + # Map child FK fields to their parent model for flat-format detection. + # e.g. { ("Users", "organizationId"): "Organizations" } + nestable_fk_edges = {} + relations = schema.get('relations') + if isinstance(relations, list): + for rel in relations: + if isinstance(rel, dict) and isinstance(rel.get('parentField'), str): + relation_fields.add(rel['parentField']) + # A relation where childField is an FK column on the child model means + # the child SHOULD be nested under the parent via the parentField key. 
def _validate_value_against_field(value, field, path):
    """Check one recipe value against its discover-schema field definition.

    Args:
        value: The value found in the recipe's ``create`` payload.
        field: The discover field mapping; its ``type`` entry is parsed with
            ``_parse_type``.
        path: Dotted location of the value, used as the error-message prefix.

    Returns:
        ``None`` when the value is acceptable, otherwise an error string.
        String values matching ``TOKEN_OR_REF_PATTERN`` are always accepted.
        Only list-ness and enum membership are checked; scalar types are not.
    """
    type_label = field.get('type')
    parsed_type = _parse_type(type_label)
    if parsed_type is None:
        return f'{path} has unsupported discover type: {type_label}'

    if isinstance(value, str) and TOKEN_OR_REF_PATTERN.match(value):
        return None

    is_enum = parsed_type['kind'] == 'enum'

    if parsed_type['is_list']:
        if not isinstance(value, list):
            return f'{path} must be a list because discover type is {type_label}'
        if is_enum:
            # Apply the scalar-enum membership rule to each list item;
            # previously enum lists were accepted without looking inside.
            for idx, item in enumerate(value):
                if not isinstance(item, str) or TOKEN_OR_REF_PATTERN.match(item):
                    continue
                if item not in parsed_type['values']:
                    return (
                        f'{path}[{idx}] has invalid enum value "{item}". '
                        f'Expected one of {parsed_type["values"]}'
                    )
        return None

    if isinstance(value, list):
        return f'{path} must not be a list because discover type is {type_label}'

    if is_enum and isinstance(value, str):
        if value not in parsed_type['values']:
            return (
                f'{path} has invalid enum value "{value}". '
                f'Expected one of {parsed_type["values"]}'
            )

    return None
def _fail(message):
    """Print the first validation problem to stdout and exit with status 1."""
    print(message)
    sys.exit(1)


def _not_nonempty_str(value):
    """Return True unless *value* is a string with non-whitespace content."""
    return not isinstance(value, str) or len(value.strip()) == 0


def _not_nonneg_int(value):
    """Return True unless *value* is a non-negative int (bools are rejected)."""
    return isinstance(value, bool) or not isinstance(value, int) or value < 0


def _find_tokens(obj):
    """Collect every ``{{token}}`` name used anywhere in a nested JSON value."""
    tokens = set()
    if isinstance(obj, str):
        tokens.update(re.findall(r'\{\{(\w+)\}\}', obj))
    elif isinstance(obj, list):
        for item in obj:
            tokens.update(_find_tokens(item))
    elif isinstance(obj, dict):
        for v in obj.values():
            tokens.update(_find_tokens(v))
    return tokens


# --- script body: validate the recipes file named on the command line ---
# Prints 'OK' on success; prints the first problem and exits 1 otherwise.
filepath = sys.argv[1]

try:
    # The context manager closes the handle; json.load(open(...)) leaked it.
    with open(filepath) as fh:
        data = json.load(fh)
except Exception as e:
    _fail(f'Invalid JSON: {e}')

if not isinstance(data, dict):
    _fail('Root must be a JSON object')

required = ['version', 'source', 'validationMode', 'recipes']
missing = [f for f in required if f not in data]
if missing:
    _fail(f'Missing required fields: {missing}')

version = data.get('version')
# `True == 1` in Python, so a JSON `true` would otherwise pass this check.
if isinstance(version, bool) or version != 1:
    _fail('version must be exactly 1')

source = data.get('source')
if not isinstance(source, dict):
    _fail('source must be an object')

for field in ['discoverPath', 'scenariosPath']:
    if _not_nonempty_str(source.get(field)):
        _fail(f'source.{field} must be a non-empty string')

discover_info, discover_error = _load_discover_schema(filepath, source)
if discover_error is not None:
    _fail(discover_error)

validation_mode = data.get('validationMode')
valid_modes = {'sdk-check', 'endpoint-lifecycle'}
# The str check keeps an unhashable value (list/object) from raising
# TypeError on the set membership test.
if not isinstance(validation_mode, str) or validation_mode not in valid_modes:
    _fail(f'validationMode must be one of {valid_modes}, got: {validation_mode}')

recipes = data.get('recipes')
if not isinstance(recipes, list) or len(recipes) < 3:
    _fail('recipes must be an array with at least 3 entries')

required_names = {'standard', 'empty', 'large'}
found_names = set()
valid_methods = {'checkScenario', 'checkAllScenarios', 'endpoint-up-down'}
allowed_strategies = {'literal', 'derived', 'faker'}

for i, recipe in enumerate(recipes):
    if not isinstance(recipe, dict):
        _fail(f'recipes[{i}] must be an object')

    for field in ['name', 'description', 'create', 'validation']:
        if field not in recipe:
            _fail(f'recipes[{i}] missing required field: {field}')

    name = recipe.get('name')
    if _not_nonempty_str(name):
        _fail(f'recipes[{i}].name must be a non-empty string')
    found_names.add(name)

    if _not_nonempty_str(recipe.get('description')):
        _fail(f'recipes[{i}].description must be a non-empty string')

    create = recipe.get('create')
    if not isinstance(create, dict) or len(create) == 0:
        _fail(f'recipes[{i}].create must be a non-empty object')
    create_error = _validate_create_against_discover(create, discover_info, i)
    if create_error is not None:
        _fail(create_error)

    validation = recipe.get('validation')
    if not isinstance(validation, dict):
        _fail(f'recipes[{i}].validation must be an object')

    for field in ['status', 'method', 'phase']:
        if field not in validation:
            _fail(f'recipes[{i}].validation missing required field: {field}')

    if validation.get('status') != 'validated':
        _fail(f'recipes[{i}].validation.status must be exactly "validated"')

    if validation.get('phase') != 'ok':
        _fail(f'recipes[{i}].validation.phase must be exactly "ok"')

    method = validation.get('method')
    if not isinstance(method, str) or method not in valid_methods:
        _fail(f'recipes[{i}].validation.method must be one of {valid_methods}, got: {method}')

    for field in ['up_ms', 'down_ms']:
        if field in validation and _not_nonneg_int(validation.get(field)):
            _fail(f'recipes[{i}].validation.{field} must be a non-negative integer')

    # --- variables validation (optional) ---
    variables = recipe.get('variables')
    if variables is None:
        continue
    if not isinstance(variables, dict):
        _fail(f'recipes[{i}].variables must be an object')

    # Tokens used in create and declared variables must match exactly.
    tokens_in_create = _find_tokens(create)
    var_keys = set(variables.keys())

    missing_vars = tokens_in_create - var_keys
    if missing_vars:
        _fail(f'recipes[{i}]: tokens without variable definitions: {sorted(missing_vars)}')

    unused_vars = var_keys - tokens_in_create
    if unused_vars:
        _fail(f'recipes[{i}]: unused variable definitions: {sorted(unused_vars)}')

    for var_name, var_def in variables.items():
        if not isinstance(var_def, dict):
            _fail(f'recipes[{i}].variables.{var_name} must be an object')
        strategy = var_def.get('strategy')
        if not isinstance(strategy, str) or strategy not in allowed_strategies:
            _fail(f'recipes[{i}].variables.{var_name}.strategy must be one of {allowed_strategies}, got: {strategy}')
        if strategy == 'literal':
            if 'value' not in var_def:
                _fail(f'recipes[{i}].variables.{var_name}: literal must have "value"')
            val = var_def['value']
            if not isinstance(val, (str, int, float, bool)) and val is not None:
                _fail(f'recipes[{i}].variables.{var_name}: literal.value must be a scalar')
        elif strategy == 'derived':
            if var_def.get('source') != 'testRunId':
                _fail(f'recipes[{i}].variables.{var_name}: derived.source must be "testRunId"')
            if _not_nonempty_str(var_def.get('format')):
                _fail(f'recipes[{i}].variables.{var_name}: derived.format must be a non-empty string')
        elif strategy == 'faker':
            if _not_nonempty_str(var_def.get('generator')):
                _fail(f'recipes[{i}].variables.{var_name}: faker.generator must be a non-empty string')

missing_names = required_names - found_names
if missing_names:
    _fail(f'Missing required recipes: {missing_names}')

print('OK')
{"sdk-check", "endpoint-lifecycle"}: + fail('validationMode must be "sdk-check" or "endpoint-lifecycle"') + +endpoint_url = payload.get("endpointUrl") +if not isinstance(endpoint_url, str) or not endpoint_url.strip(): + fail("endpointUrl must be a non-empty string") +parsed = urlparse(endpoint_url) +if parsed.scheme not in {"http", "https"} or not parsed.netloc: + fail("endpointUrl must be an absolute http/https URL") + +print("OK") diff --git a/hooks/validators/validate_scenarios.py b/hooks/validators/validate_scenarios.py index eb77f5c..b080522 100644 --- a/hooks/validators/validate_scenarios.py +++ b/hooks/validators/validate_scenarios.py @@ -73,4 +73,60 @@ print(f'entity_types[{i}] must be a mapping with at least a "name" field') sys.exit(1) +# Validate variable_fields (required, may be empty list) +if 'variable_fields' not in fm: + print('Missing required frontmatter field: variable_fields (use [] if none)') + sys.exit(1) + +scenario_name_set = {s['name'] for s in scenarios} +variable_fields = fm.get('variable_fields') +if not isinstance(variable_fields, list): + print('variable_fields must be a list') + sys.exit(1) + +for i, variable in enumerate(variable_fields): + if not isinstance(variable, dict): + print(f'variable_fields[{i}] must be a mapping') + sys.exit(1) + for field in ['token', 'entity', 'scenarios', 'reason', 'test_reference']: + if field not in variable: + print(f'variable_fields[{i}] missing required field: {field}') + sys.exit(1) + + token = variable.get('token') + if not isinstance(token, str) or len(token) < 5 or not token.startswith('{{') or not token.endswith('}}'): + print(f'variable_fields[{i}].token must use double curly braces, e.g. 
{{title}}') + sys.exit(1) + + for field in ['entity', 'reason', 'test_reference']: + value = variable.get(field) + if not isinstance(value, str) or len(value.strip()) == 0: + print(f'variable_fields[{i}].{field} must be a non-empty string') + sys.exit(1) + + vscenarios = variable.get('scenarios') + if not isinstance(vscenarios, list) or len(vscenarios) == 0: + print(f'variable_fields[{i}].scenarios must be a non-empty list') + sys.exit(1) + for name in vscenarios: + if name not in scenario_name_set: + print(f'variable_fields[{i}].scenarios references unknown scenario: {name}') + sys.exit(1) + +# Validate planning_sections (required; must contain every core section named in required_sections below) +if 'planning_sections' not in fm: + print('Missing required frontmatter field: planning_sections') + sys.exit(1) + +planning = fm.get('planning_sections') +if not isinstance(planning, list) or len(planning) == 0: + print('planning_sections must be a non-empty list') + sys.exit(1) + +required_sections = {'schema_summary', 'relationship_map', 'variable_data_strategy'} +missing_sections = required_sections - set(planning) +if missing_sections: + print(f'planning_sections missing required entries: {sorted(missing_sections)}') + sys.exit(1) + print('OK') diff --git a/hooks/validators/validate_sdk_endpoint.py b/hooks/validators/validate_sdk_endpoint.py new file mode 100644 index 0000000..fd7df1e --- /dev/null +++ b/hooks/validators/validate_sdk_endpoint.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Validates autonoma/.sdk-endpoint.""" +import sys +from urllib.parse import urlparse + + +filepath = sys.argv[1] + +try: + with open(filepath) as fh: + value = fh.read().strip() +except Exception as exc: + print(f'Unable to read file: {exc}') + sys.exit(1) + +if not value: + print('.sdk-endpoint must contain a non-empty URL') + sys.exit(1) + +parsed = urlparse(value) +if parsed.scheme not in {'http', 'https'}: + print('.sdk-endpoint must use http or https') + sys.exit(1) + +if not parsed.netloc: + 
print('.sdk-endpoint must include a host') + sys.exit(1) + +print('OK') diff --git a/skills/generate-adhoc-tests/SKILL.md b/skills/generate-adhoc-tests/SKILL.md new file mode 100644 index 0000000..4f6dd14 --- /dev/null +++ b/skills/generate-adhoc-tests/SKILL.md @@ -0,0 +1,508 @@ +--- +name: generate-adhoc-tests +description: > + Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline. + Each step runs in an isolated subagent and must pass deterministic validation before the next + step begins. When scenarios already exist in Autonoma, fetches context from the API and runs only + Step 3 scoped to the topic. On a first run, executes the full 4-step pipeline with Step 3 focused. + Use when you want targeted test coverage for a specific feature or domain. +--- + +# Autonoma Focused E2E Test Generation Pipeline + +You are orchestrating a focused test generation pipeline. Each step runs as an isolated subagent. +**Every step MUST complete successfully and pass validation before the next step begins.** +Do NOT skip steps. Do NOT proceed if validation fails. + +## User Confirmation Between Steps + +By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for +confirmation using the `AskUserQuestion` tool. This creates an interactive +UI prompt that makes it clear the user needs to respond before the pipeline continues. + +After calling `AskUserQuestion`, wait for the user's response. +Only proceed to the next step after they confirm. + +**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`, +skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting +the summary. The summaries are still displayed — only the confirmation prompt is skipped. 
+ +## Before Starting + +Resolve the focus prompt from the user's input (the text after the command name): + +```bash +FOCUS_PROMPT="" +FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') +echo "Focus: $FOCUS_PROMPT" +echo "Slug: $FOCUS_SLUG" +``` + +If no focus description was provided, list top-level route/feature directories in the codebase, +call `AskUserQuestion` with 3–4 suggested focus areas plus an "Other" option, wait for the user's +response, then derive `FOCUS_SLUG` from their answer. + +Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference): +```bash +AUTONOMA_ROOT="$(pwd)" +echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root +mkdir -p autonoma/skills autonoma/qa-tests +``` + +The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back: +```bash +PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '') +``` + +Read the environment variables. These are required for reporting progress back to Autonoma: +- `AUTONOMA_API_KEY` — your Autonoma API key +- `AUTONOMA_PROJECT_ID` — your Autonoma project ID +- `AUTONOMA_API_URL` — Autonoma API base URL +- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps + +Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`. 
+ +Create the generation record so the dashboard can track progress in real time: +```bash +RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}") +HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) +BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') +echo "Setup API response (HTTP $HTTP_STATUS): $BODY" +GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') +mkdir -p autonoma +echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}" +echo "Generation ID: $GENERATION_ID" +``` + +If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation. + +## Checking Existing Setup + +Check whether scenarios with active recipes already exist in Autonoma for this application: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +HAS_SCENARIOS="no" +SCENARIOS_RESPONSE="" +if [ -n "$GENERATION_ID" ]; then + SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}") + HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c " +import json, sys +data = json.loads(sys.stdin.read()) +active = [s for s in data.get('scenarios', []) if s.get('hasActiveRecipe')] +print('yes' if active else 'no') +" 2>/dev/null || echo "no") +fi +echo "Has active scenarios: $HAS_SCENARIOS" +``` + +**If `HAS_SCENARIOS=yes`** — scenarios and tests already exist. 
Fetch context from the API and run only Step 3: + +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') + +EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}") + +SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c " +import json, sys +data = json.loads(sys.stdin.read()) +lines = ['## Available Scenarios', ''] +for s in data.get('scenarios', []): + status = 'active' if s.get('hasActiveRecipe') else 'no recipe' + lines.append(f\"- **{s['name']}** ({status})\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") + +TESTS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c " +import json, sys +data = json.loads(sys.stdin.read()) +tests = data.get('tests', []) +lines = [f'## Existing Tests ({len(tests)} total)', ''] +for t in tests: + lines.append(f\"- {t['name']} (slug: {t['slug']})\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") + +SKILLS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c " +import json, sys +data = json.loads(sys.stdin.read()) +skills = data.get('skills', []) +lines = [f'## Available Skills ({len(skills)} total)', ''] +for s in skills: + lines.append(f\"- {s['name']}: {s['description']}\") +print('\n'.join(lines)) +" 2>/dev/null || echo "") +``` + +Skip to **Step 3: Generate Focused E2E Test Cases** and pass the fetched context inline in the subagent task — do not run Steps 1, 2, or 4. + +**If `HAS_SCENARIOS=no`** — this is a first run. Continue with the full pipeline below (Steps 1 through 4). 
+ +--- + +## Step 1: Generate Knowledge Base + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true +``` + +Spawn the `kb-generator` subagent with the following task: + +> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` +> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with +> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. +> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. +> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. + +**After the subagent completes:** +1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty +2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically +3. 
Read the file and present the frontmatter to the user — specifically the core_flows table + +Report step complete and upload skills: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ') +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true + +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true + +[ -n "$GENERATION_ID" ] && python3 -c " +import os, json, sys +root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' +skills = [] +d = os.path.join(root, 'autonoma/skills') +if os.path.isdir(d): + for f in os.listdir(d): + if f.endswith('.md'): + with open(os.path.join(d, f)) as fh: + skills.append({'name': f, 'content': fh.read()}) +print(json.dumps({'skills': skills})) +" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- || true +``` + +4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." 
+ - options: ["Yes, proceed to Step 2", "I want to suggest changes"] + Wait for the user's response before proceeding. + **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2. + +## Step 2: Generate Scenarios + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true +``` + +Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`. +This step requires these environment variables: +- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint +- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint + +If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access. +Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint. +State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2. 
+ +Fetch and validate the artifact: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +mkdir -p "$AUTONOMA_ROOT/autonoma" +BODY='{"action":"discover"}' +SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') +RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ + -H "Content-Type: application/json" \ + -H "x-signature: $SIG" \ + -d "$BODY") +HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) +DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') +if [ "$HTTP_STATUS" != "200" ]; then + echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY" + exit 1 +fi +printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json" +python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" +``` + +If the fetch fails or validation fails, stop the pipeline at Step 2. +Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command. + +Spawn the `scenario-generator` subagent with the following task: + +> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover +> artifact from `autonoma/discover.json`. +> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. +> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types, +> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a +> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before +> introducing a variable placeholder. Use variable fields only for truly dynamic values such as +> backend-generated or time-based fields. `generator` is optional and must not default to `faker`. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. 
+ +**After the subagent completes:** +1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty +2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`) +3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically +4. Read the file and present the summary to the user — scenario names, entity counts, entity types, + discover schema counts, and the minimal variable field tokens that remain dynamic + +Report step complete: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true +``` + +5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests." + - options: ["Yes, proceed to Step 3", "I want to suggest changes"] + Wait for the user's response before proceeding. + **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3. 
+ +## Step 3: Generate Focused E2E Test Cases + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":2,"name":"Focused E2E Tests"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Generating focused E2E test cases..."}}' || true +``` + +Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual +values for FOCUS_PROMPT, FOCUS_SLUG, and — when coming from the API-fetch path — the context +variables SCENARIOS_CONTEXT, TESTS_CONTEXT, and SKILLS_CONTEXT before spawning): + +> **FOCUS_PROMPT**: {FOCUS_PROMPT} +> **FOCUS_SLUG**: {FOCUS_SLUG} +> +> *(API-fetch path only — omit this block when running the full pipeline)* +> Context fetched from the Autonoma API (use this instead of reading local files): +> {SCENARIOS_CONTEXT} +> {TESTS_CONTEXT} +> {SKILLS_CONTEXT} +> +> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, +> and scenarios from `autonoma/scenarios.md` (if they exist and no inline context was provided above). +> Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT. +> Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`. +> You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing +> total_tests, total_folders, folder breakdown, and coverage_correlation. +> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. +> Treat scenario data as fixture input only. 
Do not generate tests whose purpose is to verify +> scenario counts, seeded inventories, or Environment Factory correctness. Only reference +> scenario data when it is needed to test a real user-facing app behavior within the focus area. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. + +**After the subagent completes:** +1. Verify `autonoma/qa-tests/${FOCUS_SLUG}/INDEX.md` exists and is non-empty +2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter +3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation + +Report step complete and upload test cases: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused test cases. Uploading to dashboard...\"}}" || true + +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":2,"name":"Focused E2E Tests"}}' || true + +[ -n "$GENERATION_ID" ] && python3 -c " +import os, json +proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' 
+qa_dir = os.path.join(proj_root, 'autonoma/qa-tests/${FOCUS_SLUG}') +test_cases = [] +for root, dirs, files in os.walk(qa_dir): + for f in files: + if f.endswith('.md') and f != 'INDEX.md': + path = os.path.join(root, f) + folder = os.path.relpath(root, qa_dir) + with open(path) as fh: + content = fh.read() + entry = {'name': f, 'content': content} + if folder != '.': + entry['folder'] = '${FOCUS_SLUG}/' + folder + test_cases.append(entry) +print(json.dumps({'testCases': test_cases})) +" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- || true +``` + +4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with: + - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly." + - options: ["Yes, proceed to Step 4", "I want to suggest changes", "Done — skip Step 4 (scenarios already exist)"] + Wait for the user's response before proceeding. + **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4 (or stop here if coming from the API-fetch path). + +If coming from the **API-fetch path** (scenarios already existed), stop here after uploading. Step 4 is not needed. 
+ +## Step 4: Environment Factory + +Report step start: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true +``` + +This step requires these environment variables: +- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint +- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint + +If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for +preflight validation. State plainly that both environment variables are mandatory. + +Spawn the `env-factory-generator` subagent with the following task: + +> Read `autonoma/discover.json` and `autonoma/scenarios.md`. +> Implement or complete the Autonoma Environment Factory in the project's backend so it can +> support the planned scenarios with the current SDK contract, then validate the planned scenarios +> against that implementation. +> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt +> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first. 
+> Preserve the existing discover integration if it already works, and finish `up` / `down` +> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`. +> Smoke-test the discover -> up -> down lifecycle in-session after implementing. +> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`. +> The recipe file must match the current setup API schema: +> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use +> `name`, `description`, `create`, and `validation` with `status: "validated"`, +> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`. +> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`. +> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines +> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`. +> Persisted `create` must remain tokenized — never store resolved concrete values. +> After writing the recipe file, run the preflight helper to validate all recipes against the +> live SDK endpoint before uploading: +> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json` +> The preflight must pass for all three scenarios before Step 4 is considered complete. + +**After the subagent completes:** +1. Verify the backend implementation or integration changes were made +2. Verify `autonoma/scenario-recipes.json` exists and is non-empty +3. Run the preflight helper if the subagent did not already do so: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" +``` +If preflight fails, do NOT proceed to upload. Report the failure to the user and stop. +4. 
Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results +5. Report which environment variables the backend now requires +6. Report any backend issues that still need manual attention + +Report step complete: +```bash +AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') +GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '') +echo "GENERATION_ID=${GENERATION_ID:-}" +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true +if [ -n "$GENERATION_ID" ]; then + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then + echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete." + exit 1 + fi + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete." 
+ exit 1 + fi +fi +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true +[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true +``` + +## Completion + +After all steps complete, summarize: +- **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`) +- **Step 1**: Knowledge base location and core flow count *(full pipeline only)* +- **Step 2**: Scenario count and entity types covered *(full pipeline only)* +- **Step 3**: Total focused test count, folder breakdown, coverage correlation +- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results *(full pipeline only)* diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md index 509f3a5..4ccc236 100644 --- a/skills/generate-tests/SKILL.md +++ b/skills/generate-tests/SKILL.md @@ -9,234 +9,183 @@ description: > # Autonoma E2E Test Generation Pipeline -You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent. +You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent. **Every step MUST complete successfully and pass validation before the next step begins.** Do NOT skip steps. Do NOT proceed if validation fails. ## CRITICAL: User Confirmation Between Steps -After each step (1, 2, and 3), you MUST present the summary and then ask the user for -confirmation using the `AskUserQuestion` tool. 
This creates an interactive -UI prompt that makes it clear the user needs to respond before the pipeline continues. +After steps 1, 2, 3, 4, and 5 you MUST present the summary and ask the user for confirmation +using `AskUserQuestion`. After calling it, wait for the response. Only proceed after they confirm. -After calling `AskUserQuestion`, wait for the user's response. -Only proceed to the next step after they confirm. +## How lifecycle reporting works -## Before Starting +You do NOT issue `curl` commands to report step start/complete/uploads. Plugin hooks do that: -Create the output directory: -```bash -mkdir -p autonoma/skills autonoma/qa-tests -``` +- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`. +- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. It validates output, + emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate + (test files cannot be written until `autonoma/.endpoint-validated` exists). -Read the environment variables. 
These are required for reporting progress back to Autonoma: -- `AUTONOMA_API_KEY` — your Autonoma API key -- `AUTONOMA_PROJECT_ID` — your Autonoma project ID -- `AUTONOMA_API_URL` — Autonoma API base URL +## Before Starting -Create the generation record so the dashboard can track progress in real time: ```bash -RESPONSE=$(curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\"}" 2>/dev/null || echo '{}') -GENERATION_ID=$(echo "$RESPONSE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') -mkdir -p autonoma -echo "$GENERATION_ID" > autonoma/.generation-id -echo "Generation ID: $GENERATION_ID" +mkdir -p autonoma/skills autonoma/qa-tests ``` -If `GENERATION_ID` is empty, continue anyway — reporting is best-effort and must never block test generation. +The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`. ## Step 1: Generate Knowledge Base -Report step start: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' 2>/dev/null || true -``` - -Spawn the `kb-generator` subagent with the following task: +Spawn `kb-generator`: -> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` -> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with -> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. -> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. 
-> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. +> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML +> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill +> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals). +> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`. -**After the subagent completes:** -1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty -2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically -3. Read the file and present the frontmatter to the user — specifically the core_flows table +After completion: verify files exist, present core_flows table, `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single character body). 
-Report step complete and upload skills: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' 2>/dev/null || true - -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -skills = [] -d = 'autonoma/skills' -if os.path.isdir(d): - for f in os.listdir(d): - if f.endswith('.md'): - with open(os.path.join(d, f)) as fh: - skills.append({'name': f, 'content': fh.read()}) -print(json.dumps({'skills': skills})) -" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || true -``` +## Step 2: Entity Creation Audit -4. Call `AskUserQuestion` with: - - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." - - options: ["Yes, proceed to Step 2", "I want to suggest changes"] -5. Wait for the user's response before proceeding. +Spawn `entity-audit-generator`: -## Step 2: Generate Scenarios +> Read the knowledge base. Audit how each database model is created. For every model, find the +> dedicated creation function in a service/repository/helper. Classify as `independently_created: true` +> (factory) or `false` (raw SQL fallback). Record side_effects (informational). Output +> `autonoma/entity-audit.md` with frontmatter listing each model. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`. 
-Report step start: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' 2>/dev/null || true -``` +After completion: present the audit, `AskUserQuestion`, `Write` `autonoma/.step-2-ack`. -Spawn the `scenario-generator` subagent with the following task: +## Step 3: Generate Scenarios -> Read the knowledge base from `autonoma/AUTONOMA.md` and `autonoma/skills/`. -> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. -> The file MUST have YAML frontmatter with scenario_count, scenarios summary, and entity_types. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. +Spawn `scenario-generator`: -**After the subagent completes:** -1. Verify `autonoma/scenarios.md` exists and is non-empty -2. The PostToolUse hook will have validated the frontmatter format automatically -3. Read the file and present the frontmatter summary to the user — scenario names, entity counts, entity types +> Read the knowledge base and `autonoma/entity-audit.md`. Generate test data scenarios. Write +> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types, +> variable_fields, planning_sections). Mark values as variable only when they must vary across +> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural +> per-run isolation). Design entity tables so they serialise as nested trees rooted at the +> scope entity. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`. 
-Report step complete: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' 2>/dev/null || true -``` - -4. Call `AskUserQuestion` with: - - question: "Do these scenarios look correct? The standard scenario data becomes hard assertions in your tests." - - options: ["Yes, proceed to Step 3", "I want to suggest changes"] -5. Wait for the user's response before proceeding. - -## Step 3: Generate E2E Test Cases - -Report step start: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' 2>/dev/null || true -``` - -Spawn the `test-case-generator` subagent with the following task: - -> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, -> and scenarios from `autonoma/scenarios.md`. -> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`. -> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests, -> total_folders, folder breakdown, and coverage_correlation. -> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. - -**After the subagent completes:** -1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty -2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter -3. 
Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation - -Report step complete and upload test cases: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' 2>/dev/null || true - -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -test_cases = [] -for root, dirs, files in os.walk('autonoma/qa-tests'): - for f in files: - if f.endswith('.md') and f != 'INDEX.md': - path = os.path.join(root, f) - folder = os.path.relpath(root, 'autonoma/qa-tests') - with open(path) as fh: - content = fh.read() - entry = {'name': f, 'content': content} - if folder != '.': - entry['folder'] = folder - test_cases.append(entry) -print(json.dumps({'testCases': test_cases})) -" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- 2>/dev/null || true -``` - -4. Call `AskUserQuestion` with: - - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app." - - options: ["Yes, proceed to Step 4", "I want to suggest changes"] -5. Wait for the user's response before proceeding. +After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`. 
## Step 4: Implement Environment Factory -Report step start: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' 2>/dev/null || true -``` - -Spawn the `env-factory-generator` subagent with the following task: - -> Read the scenarios from `autonoma/scenarios.md` and implement the Autonoma Environment Factory -> endpoint in the project's backend. The endpoint handles discover/up/down actions. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt -> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first. -> After implementing, run integration tests to verify the endpoint works. -> Use AUTONOMA_SIGNING_SECRET and AUTONOMA_JWT_SECRET as environment variable names. - -**After the subagent completes:** -1. Verify the endpoint was created and tests pass -2. Present the results to the user — what was implemented, where, test results -3. Report any issues that need manual attention - -Report step complete: -```bash -GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') -[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' 2>/dev/null || true -``` +Spawn `env-factory-generator`: + +> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure +> the handler. 
Register a factory for every model with `independently_created: true` (call the audit's +> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback +> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity +> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. Do NOT run `up`/`down` +> — that is step 5. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"` +> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`. +> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names. + +After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary, +`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`. + +## Step 5: Validate Scenario Lifecycle + +Spawn `scenario-validator`: + +> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4. +> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5 +> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails +> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. On +> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at +> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe +> per scenario), run `preflight_scenario_recipes.py` against it, and write +> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write` +> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and +> report — do NOT write the sentinel. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`. 
+> Verify: every audited model appears in `discover.schema.models`, every `independently_created` +> model has a registered factory, `auth` is non-empty, DB state is correct before and after +> `down`, and preflight exits 0. + +After completion: +1. If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON + AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`: + enforce and upload the recipes to the dashboard, then ack. + + ```bash + AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}" + VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json" + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + + # Enforce terminal artifact contract + python3 - "$VALIDATION_ARTIFACT" <<'PY' + import json, sys + payload = json.load(open(sys.argv[1])) + if payload.get("status") != "ok": + raise SystemExit("status must be ok before Step 5 can upload recipes") + if payload.get("preflightPassed") is not True: + raise SystemExit("preflightPassed must be true before Step 5 can upload recipes") + PY + + [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty"; exit 1; } + python3 -c "import json; json.load(open('$RECIPE_PATH'))" \ + || { echo "scenario-recipes.json is not valid JSON"; exit 1; } + + # Re-run preflight at the orchestrator level for belt-and-suspenders safety. 
+ python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \ + || { echo "Preflight failed at orchestrator gate"; exit 1; } + + # Upload to dashboard + GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id") + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \ + "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." >&2 + exit 1 + fi + ``` + + Then present validation summary (scenarios passed, any edits made to `scenarios.md`, + recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`. + +2. If any of those artifacts are missing/invalid: the agent failed — surface the failure + report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook + will also block test file writes. + +## Step 6: Generate E2E Test Cases + +Spawn `test-case-generator`: + +> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (the latter has +> been reconciled with reality in step 5 — use it as the source of truth). Parse the +> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any +> variable value (typed, asserted, or navigated to), never the hardcoded literal. +> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests +> that "audit" seeded counts or fixture existence. +> Generate test cases in `autonoma/qa-tests/`. 
Write `autonoma/qa-tests/INDEX.md` with +> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test +> file needs frontmatter (title, description, criticality, scenario, flow). +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`. + +After completion: +1. Verify `autonoma/qa-tests/INDEX.md` exists +2. Present INDEX summary +3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed` + for the final step, marking the setup complete. ## Completion -After all steps complete, summarize: -- **Step 1**: Knowledge base location and core flow count -- **Step 2**: Scenario count and entity types covered -- **Step 3**: Total test count, folder breakdown, coverage correlation -- **Step 4**: Endpoint location, test results, env var setup instructions +Summarize each step: +- **Step 1**: KB location, core flows +- **Step 2**: entity audit — factories vs raw SQL +- **Step 3**: scenarios generated +- **Step 4**: endpoint implemented (handler path, packages, factories registered) +- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any) +- **Step 6**: test count, folder breakdown diff --git a/tests/test_preflight_scenario_recipes.py b/tests/test_preflight_scenario_recipes.py new file mode 100644 index 0000000..6bb1b44 --- /dev/null +++ b/tests/test_preflight_scenario_recipes.py @@ -0,0 +1,152 @@ +"""Tests for hooks/preflight_scenario_recipes.py resolver logic.""" +import sys +import os + +# Add hooks dir to path so we can import the module +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'hooks')) + +from preflight_scenario_recipes import ( + resolve_variable, + resolve_recipe, + _find_tokens, + _faker_generate, +) +import pytest + + +# --- resolve_variable tests --- + +def test_literal_string(): + v = resolve_variable({'strategy': 'literal', 'value': 'hello'}, 'run1', 'tok') + assert v 
== 'hello' + + +def test_literal_number(): + v = resolve_variable({'strategy': 'literal', 'value': 42}, 'run1', 'tok') + assert v == 42 + + +def test_literal_null(): + v = resolve_variable({'strategy': 'literal', 'value': None}, 'run1', 'tok') + assert v is None + + +def test_derived(): + v = resolve_variable( + {'strategy': 'derived', 'source': 'testRunId', 'format': 'user+{testRunId}@example.com'}, + 'abc-123', 'tok', + ) + assert v == 'user+abc-123@example.com' + + +def test_faker_deterministic(): + """Same testRunId + token name → same value.""" + v1 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run1', 'first') + v2 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run1', 'first') + assert v1 == v2 + assert isinstance(v1, str) and len(v1) > 0 + + +def test_faker_different_run_id(): + """Different testRunId → different value (with high probability).""" + v1 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run-a', 'first') + v2 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run-b', 'first') + # Not guaranteed but extremely likely with different seeds + # We just check both produce valid strings + assert isinstance(v1, str) + assert isinstance(v2, str) + + +def test_faker_email(): + v = resolve_variable({'strategy': 'faker', 'generator': 'internet.email'}, 'run1', 'email') + assert '@' in v + + +def test_faker_company(): + v = resolve_variable({'strategy': 'faker', 'generator': 'company.name'}, 'run1', 'co') + assert isinstance(v, str) and len(v) > 0 + + +def test_faker_lorem(): + v = resolve_variable({'strategy': 'faker', 'generator': 'lorem.words'}, 'run1', 'w') + assert ' ' in v # multiple words + + +def test_unsupported_faker_generator(): + with pytest.raises(ValueError, match='Unsupported faker generator'): + resolve_variable({'strategy': 'faker', 'generator': 'address.city'}, 'run1', 'tok') + + +def test_unsupported_strategy(): + with 
pytest.raises(ValueError, match='Unsupported variable strategy'): + resolve_variable({'strategy': 'random'}, 'run1', 'tok') + + +# --- resolve_recipe tests --- + +def test_resolve_full_recipe(): + recipe = { + 'create': { + 'User': [{'email': '{{owner_email}}', 'name': '{{first_name}}'}], + }, + 'variables': { + 'owner_email': {'strategy': 'derived', 'source': 'testRunId', 'format': 'owner+{testRunId}@example.com'}, + 'first_name': {'strategy': 'faker', 'generator': 'person.firstName'}, + }, + } + result = resolve_recipe(recipe, 'test-run-1') + assert result['User'][0]['email'] == 'owner+test-run-1@example.com' + assert isinstance(result['User'][0]['name'], str) + + +def test_embedded_string_replacement(): + recipe = { + 'create': { + 'Org': [{'name': 'Org-{{suffix}}'}], + }, + 'variables': { + 'suffix': {'strategy': 'literal', 'value': 'acme'}, + }, + } + result = resolve_recipe(recipe, 'run1') + assert result['Org'][0]['name'] == 'Org-acme' + + +def test_missing_variable_fails(): + recipe = { + 'create': {'User': [{'email': '{{missing}}'}]}, + 'variables': {}, + } + with pytest.raises(ValueError, match='Tokens without variable definitions'): + resolve_recipe(recipe, 'run1') + + +def test_unused_variable_fails(): + recipe = { + 'create': {'User': [{'email': 'static@example.com'}]}, + 'variables': { + 'extra': {'strategy': 'literal', 'value': 'unused'}, + }, + } + with pytest.raises(ValueError, match='Unused variable definitions'): + resolve_recipe(recipe, 'run1') + + +def test_concrete_recipe_no_variables(): + """Recipe with no tokens and no variables should resolve fine.""" + recipe = { + 'create': {'Org': [{'name': 'Acme'}]}, + } + result = resolve_recipe(recipe, 'run1') + assert result == {'Org': [{'name': 'Acme'}]} + + +# --- _find_tokens tests --- + +def test_find_tokens_nested(): + obj = {'a': [{'b': '{{x}} and {{y}}'}], 'c': '{{z}}'} + assert _find_tokens(obj) == {'x', 'y', 'z'} + + +def test_find_tokens_no_tokens(): + assert _find_tokens({'a': 'hello'}) == 
set() diff --git a/tests/test_validate_discover.py b/tests/test_validate_discover.py new file mode 100644 index 0000000..0a05909 --- /dev/null +++ b/tests/test_validate_discover.py @@ -0,0 +1,98 @@ +"""Tests for validate_discover.py.""" +from conftest import run_validator + +SCRIPT = 'validate_discover.py' + +VALID = """\ +{ + "schema": { + "models": [ + { + "name": "Organization", + "fields": [ + { + "name": "id", + "type": "String", + "isRequired": true, + "isId": true, + "hasDefault": true + } + ] + } + ], + "edges": [ + { + "from": "User", + "to": "Organization", + "localField": "organizationId", + "foreignField": "id", + "nullable": false + } + ], + "relations": [ + { + "parentModel": "Organization", + "childModel": "User", + "parentField": "users", + "childField": "organizationId" + } + ], + "scopeField": "organizationId" + } +} +""" + + +def test_valid_discover(): + code, out = run_validator(SCRIPT, VALID, filename='discover.json') + assert code == 0 + assert out == 'OK' + + +def test_invalid_json(): + code, out = run_validator(SCRIPT, '{not-json', filename='discover.json') + assert code == 1 + assert 'Invalid JSON' in out + + +def test_missing_schema(): + code, out = run_validator(SCRIPT, '{}', filename='discover.json') + assert code == 1 + assert 'must contain a "schema" object' in out + + +def test_missing_scope_field(): + content = VALID.replace(' "scopeField": "organizationId"\n', '') + content = content.replace(' ],\n }\n}\n', ' ]\n }\n}\n') + code, out = run_validator(SCRIPT, content, filename='discover.json') + assert code == 1 + assert 'schema is missing required fields' in out + + +def test_model_requires_fields(): + content = VALID.replace('"fields": [', '"oops": [') + code, out = run_validator(SCRIPT, content, filename='discover.json') + assert code == 1 + assert 'fields must be a list' in out + + +def test_accepts_enum_and_list_type_formats(): + content = VALID.replace( + '"type": "String"', + '"type": "enum(slack)"', + 1, + ).replace( + 
'"hasDefault": true', + '"hasDefault": true\n },\n {\n "name": "teamSlugs",\n "type": "String[]",\n "isRequired": true,\n "isId": false,\n "hasDefault": true', + 1, + ) + code, out = run_validator(SCRIPT, content, filename='discover.json') + assert code == 0 + assert out == 'OK' + + +def test_rejects_unsupported_type_format(): + content = VALID.replace('"type": "String"', '"type": "enum(slack"', 1) + code, out = run_validator(SCRIPT, content, filename='discover.json') + assert code == 1 + assert 'must use a supported type format' in out diff --git a/tests/test_validate_scenario_recipes.py b/tests/test_validate_scenario_recipes.py new file mode 100644 index 0000000..d34735d --- /dev/null +++ b/tests/test_validate_scenario_recipes.py @@ -0,0 +1,577 @@ +"""Tests for validate_scenario_recipes.py.""" +import json +from conftest import run_validator, run_validator_with_dir + +SCRIPT = 'validate_scenario_recipes.py' + +VALID_DISCOVER = { + 'schema': { + 'models': [ + { + 'name': 'Organization', + 'fields': [ + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'communicationChannel', 'type': 'enum(slack)', 'isRequired': False, 'isId': False, 'hasDefault': False}, + {'name': 'teamSlugs', 'type': 'String[]', 'isRequired': True, 'isId': False, 'hasDefault': True}, + ], + }, + { + 'name': 'User', + 'fields': [ + {'name': 'email', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + ], + 'edges': [], + 'relations': [], + 'scopeField': 'organizationId', + } +} + +VALID_DATA = { + 'version': 1, + 'source': { + 'discoverPath': 'autonoma/discover.json', + 'scenariosPath': 'autonoma/scenarios.md', + }, + 'validationMode': 'sdk-check', + 'recipes': [ + { + 'name': 'standard', + 'description': 'Realistic variety for core flows', + 'create': { + 'Organization': [{'name': 'Standard Org {{testRunId}}'}], + }, 
+ 'validation': { + 'status': 'validated', + 'method': 'checkScenario', + 'phase': 'ok', + 'up_ms': 12, + 'down_ms': 8, + }, + }, + { + 'name': 'empty', + 'description': 'Empty-state scenario', + 'create': { + 'Organization': [{'name': 'Empty Org {{testRunId}}'}], + }, + 'validation': { + 'status': 'validated', + 'method': 'checkScenario', + 'phase': 'ok', + }, + }, + { + 'name': 'large', + 'description': 'High-volume scenario', + 'create': { + 'Organization': [{'name': 'Large Org {{testRunId}}'}], + }, + 'validation': { + 'status': 'validated', + 'method': 'endpoint-up-down', + 'phase': 'ok', + 'up_ms': 120, + 'down_ms': 65, + }, + }, + ], +} + +VALID_DATA_WITH_VARIABLES = { + 'version': 1, + 'source': { + 'discoverPath': 'autonoma/discover.json', + 'scenariosPath': 'autonoma/scenarios.md', + }, + 'validationMode': 'sdk-check', + 'recipes': [ + { + 'name': 'standard', + 'description': 'Realistic variety for core flows', + 'create': { + 'User': [{'email': '{{owner_email}}'}], + }, + 'variables': { + 'owner_email': { + 'strategy': 'derived', + 'source': 'testRunId', + 'format': 'owner+{testRunId}@example.com', + }, + }, + 'validation': { + 'status': 'validated', + 'method': 'checkScenario', + 'phase': 'ok', + }, + }, + { + 'name': 'empty', + 'description': 'Empty-state scenario', + 'create': { + 'Organization': [{'name': 'Empty Org'}], + }, + 'validation': { + 'status': 'validated', + 'method': 'checkScenario', + 'phase': 'ok', + }, + }, + { + 'name': 'large', + 'description': 'High-volume scenario', + 'create': { + 'Organization': [{'name': '{{company}}'}], + }, + 'variables': { + 'company': { + 'strategy': 'faker', + 'generator': 'company.name', + }, + }, + 'validation': { + 'status': 'validated', + 'method': 'endpoint-up-down', + 'phase': 'ok', + }, + }, + ], +} + + +def _json(data): + return json.dumps(data) + + +def _run_recipe_validator(data, discover=None): + if discover is None: + discover = VALID_DISCOVER + files = { + 'autonoma/scenario-recipes.json': 
_json(data), + 'autonoma/discover.json': _json(discover), + } + return run_validator_with_dir(SCRIPT, files, 'autonoma/scenario-recipes.json') + + +def test_valid_scenario_recipes(): + code, out = _run_recipe_validator(VALID_DATA) + assert code == 0 + assert out == 'OK' + + +def test_valid_with_variables(): + code, out = _run_recipe_validator(VALID_DATA_WITH_VARIABLES) + assert code == 0 + assert out == 'OK' + + +def test_valid_concrete_without_variables(): + """Fully concrete recipes (no tokens) should pass without variables.""" + data = { + 'version': 1, + 'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'}, + 'validationMode': 'sdk-check', + 'recipes': [ + {'name': 'standard', 'description': 'Std', 'create': {'Organization': [{'name': 'Acme'}]}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}}, + {'name': 'empty', 'description': 'Empty', 'create': {'Organization': [{'name': 'None'}]}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}}, + {'name': 'large', 'description': 'Large', 'create': {'Organization': [{'name': 'Big'}]}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}}, + ], + } + code, out = _run_recipe_validator(data) + assert code == 0 + assert out == 'OK' + + +def test_invalid_json(): + code, out = run_validator(SCRIPT, '{not json', 'scenario-recipes.json') + assert code == 1 + assert 'Invalid JSON' in out + + +def test_missing_required_fields(): + code, out = _run_recipe_validator({'recipes': []}) + assert code == 1 + assert 'Missing required fields' in out + + +def test_invalid_validation_mode(): + data = {**VALID_DATA, 'validationMode': 'rollback'} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'validationMode must be one of' in out + + +def test_missing_required_recipe_name(): + data = {**VALID_DATA} + data['recipes'] = [ + VALID_DATA['recipes'][0], + VALID_DATA['recipes'][1], + { + 
'name': 'custom', + 'description': 'Extra recipe', + 'create': { + 'Organization': [{'name': 'Custom Org {{testRunId}}'}], + }, + 'validation': { + 'status': 'validated', + 'method': 'checkScenario', + 'phase': 'ok', + }, + }, + ] + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'Missing required recipes' in out + + +def test_recipe_requires_create(): + data = {**VALID_DATA} + data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']] + data['recipes'][0]['create'] = {} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'create must be a non-empty object' in out + + +def test_validation_status_must_be_validated(): + data = {**VALID_DATA} + data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']] + data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation']) + data['recipes'][0]['validation']['status'] = 'draft' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'validation.status must be exactly "validated"' in out + + +def test_validation_phase_must_be_ok(): + data = {**VALID_DATA} + data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']] + data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation']) + data['recipes'][0]['validation']['phase'] = 'up' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'validation.phase must be exactly "ok"' in out + + +def test_validation_method_must_be_known(): + data = {**VALID_DATA} + data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']] + data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation']) + data['recipes'][0]['validation']['method'] = 'custom' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'validation.method must be one of' in out + + +# --- Variables validation tests --- + +def test_token_without_variable_definition(): + """Token in create with no matching variable should fail.""" + import copy + data = 
copy.deepcopy(VALID_DATA_WITH_VARIABLES) + # Add a token but no variable + data['recipes'][0]['create']['User'][0]['name'] = '{{missing_var}}' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'tokens without variable definitions' in out + + +def test_unused_variable_definition(): + """Variable defined but not used in create should fail.""" + import copy + data = copy.deepcopy(VALID_DATA_WITH_VARIABLES) + data['recipes'][0]['variables']['extra_unused'] = { + 'strategy': 'literal', + 'value': 'oops', + } + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'unused variable definitions' in out + + +def test_invalid_variable_strategy(): + """Unknown strategy should fail.""" + import copy + data = copy.deepcopy(VALID_DATA_WITH_VARIABLES) + data['recipes'][0]['variables']['owner_email']['strategy'] = 'random' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'strategy must be one of' in out + + +def test_invalid_derived_shape(): + """Derived variable with wrong source should fail.""" + import copy + data = copy.deepcopy(VALID_DATA_WITH_VARIABLES) + data['recipes'][0]['variables']['owner_email']['source'] = 'userId' + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'derived.source must be "testRunId"' in out + + +def test_invalid_literal_scalar(): + """Literal with non-scalar value should fail.""" + import copy + data = copy.deepcopy(VALID_DATA_WITH_VARIABLES) + data['recipes'][0]['create'] = {'User': [{'email': '{{owner_email}}'}]} + data['recipes'][0]['variables'] = { + 'owner_email': { + 'strategy': 'literal', + 'value': [1, 2, 3], # not scalar + }, + } + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'literal.value must be a scalar' in out + + +def test_rejects_unknown_model_from_discover(): + data = json.loads(_json(VALID_DATA)) + data['recipes'][0]['create'] = {'UnknownModel': [{'name': 'Acme'}]} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'is 
not present in discover schema' in out + + +def test_rejects_unknown_field_from_discover(): + data = json.loads(_json(VALID_DATA)) + data['recipes'][0]['create'] = {'Organization': [{'unknownField': 'Acme'}]} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'unknownField is not present in discover schema' in out + + +def test_rejects_invalid_enum_literal_from_discover(): + data = json.loads(_json(VALID_DATA)) + data['recipes'][0]['create'] = {'Organization': [{'communicationChannel': 'EMAIL'}]} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'invalid enum value "EMAIL"' in out + + +def test_rejects_non_list_value_for_list_field(): + data = json.loads(_json(VALID_DATA)) + data['recipes'][0]['create'] = {'Organization': [{'teamSlugs': 'qa-team'}]} + code, out = _run_recipe_validator(data) + assert code == 1 + assert 'must be a list because discover type is String[]' in out + + +def test_nested_tree_with_relation_fields(): + """Nested tree creates using relation field names from discover should pass.""" + discover = { + 'schema': { + 'models': [ + { + 'name': 'Organization', + 'fields': [ + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + { + 'name': 'User', + 'fields': [ + {'name': 'email', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + ], + 'edges': [ + {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False}, + ], + 'relations': [ + {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'}, + ], + 'scopeField': 'organizationId', + } + } + data = { + 'version': 1, + 'source': {'discoverPath': 'autonoma/discover.json', 
'scenariosPath': 'autonoma/scenarios.md'}, + 'validationMode': 'sdk-check', + 'recipes': [ + { + 'name': 'standard', 'description': 'Nested tree', + 'create': { + 'Organization': [{ + 'name': 'Acme', + 'users': [{'name': 'Alice', 'email': 'alice@test.com'}], + }], + }, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'empty', 'description': 'Empty', + 'create': {'Organization': []}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'large', 'description': 'Large', + 'create': {'Organization': [{'name': 'Big'}]}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + ], + } + code, out = _run_recipe_validator(data, discover=discover) + assert code == 0 + assert out == 'OK' + + +def test_rejects_flat_ref_for_nestable_fk(): + """Flat _ref for a FK that should be expressed via nesting must be rejected. + + The dashboard may reorder JSON keys, breaking insertion-order-dependent _ref + resolution. Child models must be nested under their parent using relation + field names, not placed in separate top-level arrays with _ref. 
+ """ + discover = { + 'schema': { + 'models': [ + { + 'name': 'Organization', + 'tableName': 'organizations', + 'fields': [ + {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + { + 'name': 'User', + 'tableName': 'users', + 'fields': [ + {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + ], + 'edges': [ + {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False}, + ], + 'relations': [ + {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'}, + ], + 'scopeField': 'organizationId', + } + } + data = { + 'version': 1, + 'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'}, + 'validationMode': 'sdk-check', + 'recipes': [ + { + 'name': 'standard', 'description': 'Flat format with _ref', + 'create': { + 'Organization': [{'_alias': 'org1', 'name': 'Acme'}], + 'User': [{'name': 'Alice', 'organizationId': {'_ref': 'org1'}}], + }, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'empty', 'description': 'Empty', + 'create': {'Organization': []}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'large', 'description': 'Large flat', + 'create': { + 'Organization': [{'_alias': 'org2', 'name': 'Big'}], + 'User': [{'name': 'Bob', 'organizationId': {'_ref': 'org2'}}], + }, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + ], + } + code, out = _run_recipe_validator(data, discover=discover) + assert code == 1 + 
assert 'should be nested under Organization' in out + assert 'flat _ref' in out + + +def test_allows_cross_branch_ref_in_nested_tree(): + """Cross-branch _ref (e.g. assigneeId pointing to a user) is allowed. + + When a model is NOT a top-level key (it's nested under its parent), a _ref + to it from a sibling branch is the correct pattern and must not be rejected. + """ + discover = { + 'schema': { + 'models': [ + { + 'name': 'Organization', + 'tableName': 'organizations', + 'fields': [ + {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + { + 'name': 'User', + 'tableName': 'users', + 'fields': [ + {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True}, + {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + { + 'name': 'Task', + 'tableName': 'tasks', + 'fields': [ + {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True}, + {'name': 'title', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'assigneeId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False}, + ], + }, + ], + 'edges': [ + {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False}, + {'from': 'Task', 'to': 'User', 'localField': 'assigneeId', 'foreignField': 'id', 'nullable': False}, + {'from': 'Task', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False}, + ], + 'relations': [ + {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'}, + 
{'parentModel': 'User', 'childModel': 'Task', 'parentField': 'tasks', 'childField': 'assigneeId'}, + {'parentModel': 'Organization', 'childModel': 'Task', 'parentField': 'orgTasks', 'childField': 'organizationId'}, + ], + 'scopeField': 'organizationId', + } + } + data = { + 'version': 1, + 'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'}, + 'validationMode': 'sdk-check', + 'recipes': [ + { + 'name': 'standard', 'description': 'Nested with cross-branch ref', + 'create': { + 'Organization': [{ + 'name': 'Acme', + 'users': [{'_alias': 'alice', 'name': 'Alice'}], + 'orgTasks': [{'title': 'Task 1', 'assigneeId': {'_ref': 'alice'}}], + }], + }, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'empty', 'description': 'Empty', + 'create': {'Organization': []}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + { + 'name': 'large', 'description': 'Large nested', + 'create': {'Organization': [{'name': 'Big'}]}, + 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, + }, + ], + } + code, out = _run_recipe_validator(data, discover=discover) + assert code == 0 + assert out == 'OK' diff --git a/tests/test_validate_scenario_validation.py b/tests/test_validate_scenario_validation.py new file mode 100644 index 0000000..a7f7b07 --- /dev/null +++ b/tests/test_validate_scenario_validation.py @@ -0,0 +1,65 @@ +"""Tests for validate_scenario_validation.py.""" +import json + +from conftest import run_validator + + +SCRIPT = "validate_scenario_validation.py" + + +def valid_payload(**overrides): + payload = { + "status": "ok", + "preflightPassed": True, + "smokeTestPassed": True, + "validatedScenarios": ["standard", "empty", "large"], + "failedScenarios": [], + "blockingIssues": [], + "recipePath": "autonoma/scenario-recipes.json", + "validationMode": "sdk-check", + "endpointUrl": "http://127.0.0.1:3000/api/autonoma", + } + 
payload.update(overrides) + return payload + + +def test_accepts_valid_payload(): + code, out = run_validator(SCRIPT, json.dumps(valid_payload()), filename=".scenario-validation.json") + assert code == 0 + assert out == "OK" + + +def test_accepts_failed_status_payload(): + code, out = run_validator( + SCRIPT, + json.dumps( + valid_payload( + status="failed", + preflightPassed=False, + validatedScenarios=["standard"], + failedScenarios=["empty", "large"], + blockingIssues=["duplicate email"], + ) + ), + filename=".scenario-validation.json", + ) + assert code == 0 + assert out == "OK" + + +def test_rejects_missing_required_field(): + payload = valid_payload() + payload.pop("recipePath") + code, out = run_validator(SCRIPT, json.dumps(payload), filename=".scenario-validation.json") + assert code == 1 + assert "Missing required fields" in out + + +def test_rejects_invalid_endpoint_url(): + code, out = run_validator( + SCRIPT, + json.dumps(valid_payload(endpointUrl="relative/path")), + filename=".scenario-validation.json", + ) + assert code == 1 + assert "absolute http/https URL" in out diff --git a/tests/test_validate_scenarios.py b/tests/test_validate_scenarios.py index 1101459..100de96 100644 --- a/tests/test_validate_scenarios.py +++ b/tests/test_validate_scenarios.py @@ -22,6 +22,11 @@ entity_types: - name: user - name: task +variable_fields: [] +planning_sections: + - schema_summary + - relationship_map + - variable_data_strategy --- # Scenarios diff --git a/tests/test_validate_sdk_endpoint.py b/tests/test_validate_sdk_endpoint.py new file mode 100644 index 0000000..319e0fb --- /dev/null +++ b/tests/test_validate_sdk_endpoint.py @@ -0,0 +1,35 @@ +"""Tests for validate_sdk_endpoint.py.""" +from conftest import run_validator + + +SCRIPT = 'validate_sdk_endpoint.py' + + +def test_accepts_localhost_url(): + code, out = run_validator(SCRIPT, 'http://localhost:3000/api/autonoma\n', filename='.sdk-endpoint') + assert code == 0 + assert out == 'OK' + + +def 
test_accepts_https_url(): + code, out = run_validator(SCRIPT, 'https://example.com/autonoma', filename='.sdk-endpoint') + assert code == 0 + assert out == 'OK' + + +def test_rejects_empty_content(): + code, out = run_validator(SCRIPT, '', filename='.sdk-endpoint') + assert code == 1 + assert 'non-empty URL' in out + + +def test_rejects_relative_path(): + code, out = run_validator(SCRIPT, '/api/autonoma', filename='.sdk-endpoint') + assert code == 1 + assert 'http or https' in out + + +def test_rejects_malformed_url(): + code, out = run_validator(SCRIPT, 'https:///missing-host', filename='.sdk-endpoint') + assert code == 1 + assert 'include a host' in out