diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 2de57c6..e43d1e1 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "autonoma-test-planner", "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step", - "version": "1.2.1", + "version": "1.13.1", "author": { "name": "Autonoma" } diff --git a/agents/entity-audit-generator.md b/agents/entity-audit-generator.md new file mode 100644 index 0000000..96b30e0 --- /dev/null +++ b/agents/entity-audit-generator.md @@ -0,0 +1,241 @@ +--- +description: > + Audits every database model to describe every way it comes into existence. + For each model the agent answers two orthogonal questions: (a) does a + standalone creation path exist? (b) which other models' creation flows + produce it as a side effect? Independently-created models get factories; + the rest fall back to raw SQL INSERT and are torn down via their owner(s). +tools: + - Read + - Glob + - Grep + - Write + - Edit + - Bash + - Agent + - WebFetch +maxTurns: 60 +--- + +# Entity Creation Audit + +You audit the codebase to discover **every way each database model is created**. For every model +you answer two orthogonal questions and record the answers so the Environment Factory can plan +factories, scenario trees, and teardown correctly. + +Your input is the knowledge base (`autonoma/AUTONOMA.md` and `autonoma/skills/`). Your output +is `autonoma/entity-audit.md`. + +## The two orthogonal questions + +For every model, answer **both** independently: + +1. **`independently_created`** — *Does the codebase have an exported function / method / + controller that creates this model on its own?* Boolean. +2. **`created_by`** — *When I trace every other model's creation function, does any of them + produce this model as a side effect?* List of `{owner, via, why}` entries; empty if none. 
+ +These are **not** mutually exclusive. A single model can be both. For example, a `<Model>` model +may have its own `<Model>Service.create()` (answer 1 = true) *and* be minted inline inside a +parent's `<Owner>Service.createRoot()` transaction as a required default row (answer 2 +non-empty). Both facts are true simultaneously and both matter downstream — the scenario +generator decides per-scenario whether a given `<Model>` is introduced via its standalone +factory or comes along with its owner. + +**Do not collapse the two.** Do not omit `created_by` just because `independently_created` is +true. Do not omit `independently_created` just because the model appears in someone else's +`created_by`. + +**When in doubt, prefer `independently_created: true` and include `created_by` anyway.** +Misclassifying a root as a dependent is worse than the inverse — a spurious factory is merely noisy, +whereas a missing factory leaves a real root untested. + +## The four states a model can be in + +| `independently_created` | `created_by` | Interpretation | +|---|---|---| +| `true` | `[]` | Pure root — only standalone creation exists. | +| `true` | non-empty | Dual — has a standalone path AND is produced by at least one owner. | +| `false` | non-empty | Pure dependent — only reachable via an owner's creation flow. | +| `false` | `[]` | **Invalid.** Unreachable model — either you missed the owner, or the model is never created. Fix the audit before writing it. | + +## Instructions + +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. + + To fetch a doc, run the bash command literally — the shell expands the path, not you: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. 
Do not retry with a different host. There is no fallback. + +2. Fetch the latest instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt" + ``` + + These are the source of truth. Follow them for audit methodology and output format. + +3. Read the knowledge base from `autonoma/AUTONOMA.md` and all skill files in `autonoma/skills/`. + Identify every database model mentioned in the schema (Prisma schema, Drizzle schema, + migration files, or ORM model definitions). + +4. **Pass A — find every standalone creation path.** For each model, search for a dedicated + create function: + - Service files: `*.service.ts`, `*.service.js`, `*Service.*`, `*_service.*` + - Repository files: `*.repository.ts`, `*.repository.js`, `*Repository.*`, `*_repository.*` + - Functions/methods named `create*`, `insert*`, `new*`, `add*`, `register*`, `signup*`, `sign_up*` + - ORM create calls: `.create(`, `.insert(`, `.save(`, `.build(` + - Controller or route handler files that contain inline creation logic + - Framework hooks (Better-Auth `databaseHooks.user.create`, NextAuth callbacks, Devise + callbacks, etc.) — these count as standalone creation paths. + + If a standalone path exists → `independently_created: true` and record `creation_file`, + `creation_function`, and observed `side_effects`. If the only creation is inline in a route + handler or framework-hook closure, still mark `true` and add `needs_extraction: true` — the + env-factory agent will extract it into a named export before wiring the factory. + +5. **Pass B — for every standalone creation path, find the sibling rows it mints.** Open each + creation function you found in Pass A and enumerate every write it performs: + - Every `db.<model>.create(...)` / `.insert(...)` / `.save(...)` / `<Model>.create` call + - Every `<OtherService>.create(...)` / repository call it delegates to + - Every transactional block (`db.$transaction`, `session.begin`, `Repo.transaction`, etc.) 
+ that bundles multiple inserts together + + For each sibling insert, append an entry to **that sibling model's** `created_by` list: + + ```yaml + created_by: + - owner: <OwnerModel> + via: <OwnerService.createRoot> + why: "<one or two sentences explaining why the owner creates this row>" + ``` + + The `why` is prose, written for humans. Scenarios and the env-factory teardown logic quote + it verbatim. Make it specific — "Every new `<Owner>` needs a default `<Model>` created inline + in the same transaction so downstream features have something to read from the start" is + useful; "creates a `<Model>`" is not. + + One pass per standalone path. When you're done, every sibling that was written inline will + have a `created_by` pointer back to the owner, and every model either has its own standalone + path (`independently_created: true`) or is reachable through at least one owner (non-empty + `created_by`). + +6. **Validate invariants before writing.** A model with `independently_created: false` and + empty `created_by` is a bug — either you missed a creation path, or the model is orphaned + in the schema. Do not ship an audit with orphans. + +7. Side effects are informational — they describe what an independently-created model's + function does. They help humans understand why a factory matters but do not affect + classification. + +## Output Format + +Write `autonoma/entity-audit.md` with YAML frontmatter and markdown body. 
+ +### Frontmatter + +```yaml +--- +model_count: 4 +factory_count: 3 # number of models with independently_created: true +models: + - name: + independently_created: true + creation_file: src//.ts + creation_function: .databaseHooks.user.create + side_effects: + - hashes password + - creates default + rows + created_by: [] + + - name: + independently_created: true + creation_file: src//.service.ts + creation_function: Service.create + side_effects: + - mints a default in the same transaction + - mints an row + created_by: [] + + - name: + independently_created: true + creation_file: src//.service.ts + creation_function: Service.create + side_effects: [] + created_by: + - owner: + via: Service.create + why: "Every new needs a default , created inline in the same transaction so downstream features have something to read from the start." + + - name: + independently_created: false + created_by: + - owner: + via: Service.create + why: "Minted inside the transaction so dependent UI has a row wired up from the start." +--- +``` + +Schema rules: + +- `name` — required (string). +- `independently_created` — required (boolean). +- `creation_file` / `creation_function` / `side_effects` — required **iff** + `independently_created: true`. +- `needs_extraction` — optional boolean; true when the standalone path is inline in a route + handler or framework-hook closure and the env-factory agent will need to extract it. +- `created_by` — required (list, may be empty). Each entry requires `owner` (string — must + match another model's `name`), `via` (string — the function name), and `why` (non-empty + prose string). +- Any model with `independently_created: false` MUST have a non-empty `created_by`. 
+ +### Markdown Body + +After the frontmatter, write: + +#### Roots (models with `independently_created: true`) + +For each, include: +- The model name as a heading +- `creation_file` + `creation_function` +- A brief description of what the function does, including observed side effects +- Any sibling models it mints inline (these are the models with `owner: ` in their + `created_by`). Link back to them so the reader can follow the tree. + +#### Dependents (models with `independently_created: false`) + +A table listing each dependent model, its owner(s) (from `created_by`), and the `why` for each. +This is the map the scenario generator uses: pure dependents are always created through their +owner, not as standalone tree nodes. + +#### Dual-creation models + +A call-out section listing every model with `independently_created: true` AND non-empty +`created_by`. For each, one sentence on when the standalone path is the right choice and when +the via-owner path is. This helps scenarios decide which to use per narrative. + +## Important + +- Be thorough — every inline `db..create(...)` inside someone else's creation function + must produce a `created_by` entry on that sibling, even if that sibling also has its own + service. +- Read the ACTUAL code to locate creation functions and sibling inserts — don't guess from file + names alone. +- If a model has multiple standalone creation paths (e.g., signup + admin-create), pick the + canonical one (usually the public API or most-called path) for `creation_function` and note + alternatives in the body. +- Framework-level hooks (Better-Auth, NextAuth, Devise) count as standalone paths — record them + with `needs_extraction: true` so the env-factory agent lifts the hook body into a named + export before wiring the factory. +- ORM-level hooks (Prisma middleware, Sequelize hooks, ActiveRecord callbacks) DO NOT run on + raw SQL. 
A pure-dependent (`independently_created: false`) model relying on them is a + correctness bug; call it out in the body. +- **Use subagents aggressively.** Pass A (find standalone paths) and Pass B (find sibling + inserts) are both embarrassingly parallel. diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md new file mode 100644 index 0000000..cd0fb54 --- /dev/null +++ b/agents/env-factory-generator.md @@ -0,0 +1,710 @@ +--- +description: > + Installs the Autonoma SDK and configures the handler by registering factories for + every model with dedicated creation code (from entity-audit.md). Writes + autonoma/.endpoint-implemented on completion. End-to-end validation happens in the + next step (scenario-validator). +tools: + - Read + - Glob + - Grep + - Write + - Edit + - Bash + - Agent + - WebFetch +maxTurns: 60 +--- + +# Environment Factory: SDK Setup + +You install the Autonoma SDK and configure the handler with factories. +Your inputs are `autonoma/scenarios.md` and `autonoma/entity-audit.md`. Your output is an +endpoint that responds to `discover` — end-to-end validation (`up`/`down`) happens in the +next pipeline step. + +## CRITICAL: Database Safety + +You may be connected to a production database. Follow these rules absolutely: + +- **ALL writes go through the SDK endpoint only.** The SDK has production guards, HMAC auth, and signed refs tokens. +- **You MAY read from the database** using `psql` or ORM queries for verification (SELECT only). +- **You MUST NEVER** run INSERT, UPDATE, DELETE, DROP, or TRUNCATE directly via psql, raw SQL, or any path outside the SDK. +- **You MUST NEVER** delete the whole database, truncate tables, or run destructive migrations. +- The SDK's `down` action only deletes records that `up` created, verified by a cryptographically signed token. 
+ +## The #1 rule — read before writing a single factory + +**`db..create()` (or any equivalent ORM/SQL write) inside a factory body for a model +whose audit says `independently_created: true` is NEVER acceptable.** There is no condition +under which this is the right output. If calling the audited function feels hard (inline in +a route, buried in a framework hook, needs DI, triggers Temporal), the answer is never +"just use the ORM." The answer is one of: extract, wire DI, use the app's test-mode +toggle, or stop and ask the user. + +If you catch yourself typing `prisma.x.create`, `db.x.create`, `tx.insert`, `Repo.insert`, +`::create`, `Model.objects.create`, `entityManager.persist`, etc. inside a factory +body for an audited model — delete it. Go back to the per-model decision tree below. + +The entire value of factories is that tests run through the user's real creation path. An +inline ORM call bypasses password hashing, slug generation, audit logs, Stripe sync, +framework hooks that provision sibling rows, state-machine transitions, and every piece of +business logic the user will add next month. It produces data that looks right in a +`SELECT *` but is silently wrong in ways the tests can't catch. + +## Instructions + +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. + + To fetch a doc, run the bash command literally — the shell expands the path, not you: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. 
Fetch the latest implementation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement-scenarios.txt" + curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt" + ``` + + These are the source of truth. Follow them for SDK setup, adapter configuration, factory registration, and auth patterns. + +3. Read `autonoma/entity-audit.md` — parse the frontmatter. For every model with + `independently_created: true`, you MUST register a factory that calls the identified + `creation_function` in `creation_file`. Models with `independently_created: false` get no + factory — the SDK will fall back to raw SQL INSERT automatically. + +4. Read `autonoma/scenarios.md` — parse the frontmatter and full scenario data. Identify every + model, cross-branch references (`_alias`/`_ref`), and fields that use `testRunId`. + +5. Explore the backend codebase to understand: + - Framework (Next.js, Express, Hono, etc.) + - ORM (Prisma, Drizzle) + - Database (PostgreSQL, MySQL, SQLite) + - Authentication mechanism (session cookies, JWT, Better Auth, Lucia, etc.) + - Existing route/endpoint patterns + - **Auth-adjacent framework hooks** — Better Auth `databaseHooks`, NextAuth callbacks, + Lucia adapters, Clerk webhooks. These frequently contain the real creation logic for + User/Session/Account and also write to sibling tables (Organization, Member, Billing). + The audit will flag these with `needs_extraction: true`. + - **App composition root** — where the app wires services, clients, and repositories + (DI container, service registry, module init). You'll reuse this wiring when a + creation function needs dependencies beyond `ctx.executor`. + +## Factory registration philosophy + +Register a factory for **every model with `independently_created: true`** — no exceptions. + +This is true even if the creation function looks trivial. 
A factory wired up to `ProjectService.create()` +that today just calls `prisma.project.create()` will automatically benefit from any business logic +the user adds later (audit log, Stripe sync, cache write). Raw SQL, by contrast, can never run +that logic — it's always a compatibility risk. + +Models with `independently_created: false` fall back to the SDK's raw SQL path. That's safe because +the audit explicitly determined there's no creation logic to preserve. + +## Dependents, cascades, and teardown + +For every root (`independently_created: true`) decide how its dependents will be torn down +before writing the factory. The `created_by` list in the audit tells you which models come +into existence as a byproduct of this root's creation flow — those rows must also be deleted +when the SDK tears down the root. + +Walk this decision tree in order. The first match wins; if none match, STOP and report. + +1. **Schema cascade** — check the ORM schema. If the FK chain from every dependent back to + the root is `onDelete: Cascade` (Prisma) / `ON DELETE CASCADE` (raw SQL) / analogous in + your ORM, you're done. The SDK deletes the root row and the DB cleans up the rest. No + `teardown` field needed on the factory. +2. **Existing delete function** — if the codebase has a delete method that already tears + down the same subtree (e.g. a `Service.delete` that removes the root AND + every dependent it minted), register `teardown` on the factory to call that function. + Same principle as the `create` side: stay on the user's code path. +3. **Return dependents' IDs the production function ALREADY returns** — if the production + `create` function returns the dependent IDs in its result (e.g. returns + `{ root, child, grandchild }`), forward those IDs in your factory's return so they land + in refs, then register a `teardown` that deletes them in reverse FK order. +4. 
**None of the above — STOP.** Do NOT modify the production service to return more IDs + than it already does just to make teardown work. Doing so changes the real code path to + serve test needs, which is exactly the inversion we avoid. Report the gap to the user + and let them choose: add a cascade, add a delete function, or accept orphans until + `TRUNCATE` between test runs. + +The `created_by[].why` field is a useful hint for this: if it says "minted inline in the +same transaction", option 1 (schema cascade) is usually set up correctly; if it says "seeded +with the owner so onboarding has something to advance through", check whether the dependent +is behind a soft-delete flag the root's delete function already handles. + +Pure dependents (`independently_created: false`) never have their own `teardown` — they are +torn down via their owner's factory (whichever of options 1–3 above applies to that owner). + +## Compatibility with legacy audits + +Older audits recorded only a single `independently_created` flag per model, with no +`created_by` list. The validators accept both schemas and treat a legacy entry as having an +empty `created_by`. If the audit you're reading only has `independently_created`, you can +still register factories, but you'll lose the `created_by` teardown guidance above — prefer +regenerating the audit with the current prompt when possible. + +## Research pass — MANDATORY before writing any factory + +Post-mortems of past runs show a consistent failure mode: the agent makes **one bad +decision and applies it 50 times**. The research pass prevents this by forcing you to +open every relevant file and document a per-model decision *before* touching the handler. + +Write a table to `autonoma/.factory-plan.md` with one row per `independently_created: true` +model in the audit. Fill EVERY cell — do not leave any as TODO. The orchestrator and +the user will review this table before you write a single factory. + +``` +| Model | Audit function | File opened? 
| Import path | DI dependencies observed | Decision (Branch 1/2/3) | Notes | +|-------|----------------|--------------|-------------|--------------------------|-------------------------|-------| +``` + +Column rules: + +- **File opened?** — "yes, lines X-Y" or "no, why". If you write "no", you MUST NOT + proceed. You cannot decide Branch 1 vs Branch 2 without reading the file. +- **Import path** — the exact `import ... from "..."` statement you will add to the + handler. If the symbol is inline in a hook/route (Branch 1), this column holds the + *new* export path you will create during extraction, not the current inline location. +- **DI dependencies observed** — every constructor arg or closed-over variable the + function uses. `ctx.executor` for a DB-only service is the trivial case; any logger, + event bus, Temporal client, analytics client, etc. must be listed. This is where + past agents gave up silently — we want the give-up moment to be visible. +- **Decision** — Branch 1 (extract inline → export → call), Branch 2 (import existing + export → call), or Branch 3 (audit is wrong, argue why). "Inline ORM" is NOT a valid + decision. + +### Cross-codebase DI discovery + +Before filling the table, run these greps against the backend to find real +instantiation patterns. The agent debrief identified this as the single actionable +guidance past runs were missing: + +```bash +# Find how each service is actually constructed in production code. +grep -rnE "new ${ServiceName}\(" apps/ --include='*.ts' --include='*.tsx' | head -20 +# Find exported singletons and module-level instances. +grep -rnE "^(export )?(const|let) [a-zA-Z]+ = new " apps/ --include='*.ts' | head -40 +# Find composition root candidates. +grep -rnlE "(container|registry|services/index|app\.module)" apps/ | head +``` + +Use the results to fill the "DI dependencies observed" column honestly. 
If a service +needs `logger, eventBus, temporal, analytics` and you can't find where the app wires +them, STOP and ask the user — do NOT fall back to raw ORM. + +### External-side-effects policy reminder + +When the creation function triggers Temporal / GitHub / analytics / BetterAuth hooks, +you are NOT allowed to skip the function. You must either: +1. Call the real function and let the test-mode toggle handle it (grep for + `process.env.NODE_ENV === "test"`, `AUTONOMA_TEST_MODE`, `DISABLE_*`, or similar). +2. Call the real function and let external calls fail gracefully — most SDKs throw, + which is fine if the DB writes complete first. +3. Wrap the external call with a try/catch **inside the real function**, not inside + the factory. + +Never replicate DB writes the function performs. If the real function writes to +sibling tables (Organization, Member, BillingCustomer from BetterAuth's `user.create` +hook; a default Folder from `createProject`), those writes come for free only when +you call the real function. Inlining `db.user.create()` silently drops them. + +--- + +## Per-model decision tree (run this BEFORE writing any factory) + +For every model with `independently_created: true` in `autonoma/entity-audit.md`, walk this tree +in order. Do NOT skip. Each branch has exactly one legitimate output — there is no "give up +and use `db..create()`" escape hatch. + +### Branch 1 — `needs_extraction: true` + +Meaning: the creation logic exists inline in a route handler, a framework hook (Better Auth +`databaseHooks`, NextAuth callbacks, Express middleware closures), or an anonymous closure. +There is no named export to import. + +**Mandatory action — extract before wiring:** + +1. Open `creation_file`. Find the inline block named by `creation_function`. +2. Move the body into a new **named, exported function** in the nearest sensible module + (a new `*.service.ts`, `*.repository.ts`, a sibling `create-.ts`, or an existing + service file if one exists nearby). 
The function must: + - Take a plain input object (no `req`/`res`/`ctx` — those are HTTP concerns). + - Return the created record (at minimum `{ id }`). + - Preserve every side effect the inline block had — including writes to sibling tables + that framework hooks produce (e.g. Better Auth's `user.create` hook provisioning an + Organization, Member, BillingCustomer; NextAuth's callback writing Account rows). +3. Replace the inline block with a call to the new function. The real HTTP caller's + behavior MUST stay identical. Run the project's typecheck/test command before moving on. + **Leave a short comment** (1–2 lines) above the new exported function explaining why it + was extracted — e.g. `// Extracted from the Better Auth databaseHooks.user.create closure + so the Autonoma Environment Factory can reuse the same creation path (Org + Member + + billing provisioning) as production. See autonoma/entity-audit.md.` This is a courtesy + to the developers who will encounter the new function — they should be able to tell at a + glance that it was lifted out for factory reuse, not invented for it. +4. **Update `autonoma/entity-audit.md` in-place** — change `creation_file` to the new file, + `creation_function` to the new exported name, add `extracted_to: `, + and keep `needs_extraction: true` so the fidelity rubric's framework-hook + carve-out can score the factory against the extracted helper. + Downstream steps read the audit; they must see the fixed state. +5. Now — and only now — import the new function and wire the factory. + +If extraction is genuinely impossible (the inline block depends on `req`/`res` in a way that +can't be untangled, or it's generated code you can't edit), **STOP and ask the user**. Do +NOT fall back to raw ORM. That is the bug we are trying to prevent. 
+ +**Concrete example — Better Auth `databaseHooks`:** + +The audit marks `User` with `needs_extraction: true`, `creation_file: src/auth.ts`, +`creation_function: buildAuth (databaseHooks.user.create)`. Reading `src/auth.ts`, the real +creation logic lives inside a closure passed to `betterAuth({ databaseHooks: { user: { create: async (user) => {...} } } })`, which calls `db.user.create`, then `ensureOrgMembership`, then provisions a `BillingCustomer`, then enqueues a welcome email. + +Wrong: import `db` and call `db.user.create(...)` in the factory — silently skips the +Organization/Member/BillingCustomer rows and every downstream test that reads them breaks. + +Right: extract the closure body into `export async function createUserWithOnboarding(input)` +in `src/auth/create-user.ts`, call it from the Better Auth hook (so production still works), +update the audit, then `import { createUserWithOnboarding }` in the factory. + +### Branch 2 — `independently_created: true`, no `needs_extraction` + +Meaning: a named exported function or class method already exists. Import it and call it. +Do not copy its body. Do not call the ORM directly "because it's simpler." The whole point +is to stay on the user's code path. + +Go to the DI playbook below to figure out how to invoke it. + +### Branch 3 — `independently_created: false` + +Do not register a factory at all. The SDK's raw SQL fallback handles it. Writing a factory +here just so you can call `db..create()` is the anti-pattern in disguise — let the +SDK do it. + +## DI / constructor-injection playbook + +Factories receive `(data, ctx)` where `ctx.executor` is the DB client/transaction. That's +enough for simple service classes but many creation functions need more. Walk this list in +order — the first match wins: + +1. **Top-level exported function** — `import { createX } from "..."; return createX(data);`. + Simplest case. Most services should end up here after Branch 1 extraction. +2. 
**Static method on a class** — `return XService.create(data, ctx.executor);`. Pass + `ctx.executor` as the DB/transaction argument so writes stay in the SDK's transaction. +3. **Instance method, needs only a DB client** — + `const svc = new XService(ctx.executor); return svc.create(data);`. Mirrors how the app + instantiates it at call time. +4. **Instance method, needs more dependencies (logger, event bus, config, clients)** — + find the app's composition root (DI container, service registry, `container.ts`, + `app.module.ts`, `services/index.ts`) and reuse it. Two viable patterns: + - **Import the already-constructed singleton** the app exports for production use: + `import { userService } from "@/services"; return userService.create(data);`. + - **Rebuild the service the same way the composition root does**, substituting + `ctx.executor` for the DB dependency and importing real singletons for everything + else (logger, event bus). Do not invent mocks. Example: + + ```ts + import { logger, eventBus, temporalClient } from "@/lib/singletons"; + + UserProfile: defineFactory({ + create: async (data, ctx) => { + const svc = new UserProfileService({ + db: ctx.executor, + logger, + eventBus, + temporal: temporalClient, + }); + return svc.create(data); + }, + }), + ``` +5. **Framework-scoped dependencies (NestJS provider, Fastify plugin, Rails concern)** — + bootstrap the smallest containing module and resolve the service from it. If that turns + into a 50-line boilerplate, that's a signal the composition root should expose a helper + the factory can call; add the helper to the app and use it. Still never `db.create()`. +6. **Impossible** — if you genuinely can't wire the dependencies without rewriting the + service, STOP and ask the user. Do NOT fall back to raw ORM. + +Never mock, stub, or fake a dependency. The factory must exercise real code. 
+ +## External side effects policy + +Audited creation functions often perform side effects beyond the DB row: enqueueing a +Temporal workflow, hitting the GitHub/Stripe/Slack API, sending an email, publishing to a +message bus, writing a semantic embedding, firing an analytics event, calling an LLM. + +**Your goal is correct DB state, not production-grade external delivery.** The factory MUST +preserve every DB write the real function performs (including writes to sibling tables +done by ORM hooks, framework hooks, triggers). It is NOT responsible for making every +network call succeed. Order of preference: + +1. **Call the real function with real side effects.** If Temporal/GitHub/Stripe clients are + already wired for the test environment (sandbox keys, a local Temporal dev server, + mocked SDKs in test config), just call through. Cleanest option when infra is available. +2. **Use the app's existing test-mode toggle.** Most apps have one: an env var + (`NODE_ENV=test`, `DISABLE_WORKFLOWS=1`, `ANALYTICS_DISABLED=1`), a feature flag, a + null-object client injected in tests. Find it, set it on the handler's environment, and + call the real function. +3. **Wrap external-only calls and let them no-op on failure.** If no toggle exists and the + call would fail in the test environment, the acceptable pattern is to try/catch the + outbound call inside the real function's wrapper — not inside a rewritten factory body. + Prefer exposing a toggle in the app over adding try/catch at the factory layer. Only use + this for calls whose failure does not affect DB state under test. If a test later + asserts on a row the side effect would have created, make it succeed (option 1 or 2). +4. **Reimplement the DB writes inline.** NEVER. If you find yourself typing + `db..create` inside a factory to replicate what a hook or workflow would + have done, STOP. That means the function wasn't truly "called" — you re-wrote it. Go + back to option 1 or 2, or ask the user. 
+ +**What you are NOT allowed to skip:** + +- Password hashing, slug generation, ID derivation, normalisation — pure CPU work inside + the creation function; calling the function gets them for free. +- DB writes performed by ORM hooks / framework hooks / triggers on the model being created. + Better Auth's `databaseHooks.user.create` writes to Organization, Member, BillingCustomer + — if you call `db.user.create()` instead of the real signup function, those rows go + missing and every test that reads them breaks silently. +- Writes to sibling tables done by the creation function itself (e.g. `createProject` + writing a default Folder row). If you don't call the function, those rows go missing too. + +## CRITICAL: Before Writing Any Code + +**Ask the user for confirmation** before implementing. Present your plan: + +> "I'm about to set up the Autonoma SDK. Here's what I'll do: +> +> **SDK packages**: [list packages to install] +> **Endpoint location**: [where the handler file will go] +> **Scope field**: [e.g., organizationId] +> +> **Models needing extraction (`needs_extraction: true`)**: +> - [Model]: inline in `[file]#[block]` → will extract to `[new file]#[new function]` +> - ... +> +> **Factories to register** (from entity-audit.md): +> - [Model]: calls `[file]#[function]` (DI: [top-level import / `new Service(ctx.executor)` / composition-root singleton]; side effects: [list, or "none — future-proofs against added logic"]) +> - ... +> +> **External side effects strategy**: [test-mode toggle name / sandbox credentials / try-catch wrapper] +> +> **Raw SQL fallback** (no creation code in audit): [list] +> +> **Auth callback**: [how sessions/tokens will be created] +> +> **Database operations**: The SDK creates test data by calling the factories you register +> (or raw SQL for models without creation code). It deletes only what it created during +> teardown (verified by a signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on +> existing data. 
+> +> **Environment variables needed**: +> - `AUTONOMA_SHARED_SECRET` — shared with Autonoma for HMAC request verification +> - `AUTONOMA_SIGNING_SECRET` — private, for signing refs tokens +> +> To generate these secrets, run: +> ```bash +> openssl rand -hex 32 +> ``` +> Run this command TWICE — once for each secret. Use DIFFERENT values for each. +> Set them in your `.env` file (or equivalent): +> ``` +> AUTONOMA_SHARED_SECRET= +> AUTONOMA_SIGNING_SECRET= +> ``` +> +> Shall I proceed?" + +**Do NOT proceed until the user confirms.** + +## Implementation + +### 1. Install SDK packages + +Pick the correct packages for the project's stack: + +| Your ORM | Package | +|----------|---------| +| Prisma | `@autonoma-ai/sdk-prisma` | +| Drizzle | `@autonoma-ai/sdk-drizzle` | + +| Your Framework | Package | +|----------------|---------| +| Next.js App Router, Hono, Bun, Deno | `@autonoma-ai/server-web` | +| Express, Fastify | `@autonoma-ai/server-express` | +| Node.js http | `@autonoma-ai/server-node` | + +Always install `@autonoma-ai/sdk` as the core package. + +### 2. Do the extractions FIRST + +Before writing the handler, walk every `needs_extraction: true` model in the audit and do +the extraction per Branch 1 of the decision tree. After each extraction, update +`autonoma/entity-audit.md` in-place. This must happen before Step 3 — the handler imports +these new exports by name. + +### 3. Create the endpoint handler + +Write a single handler file that: +1. Imports and configures the ORM adapter with the scope field +2. Registers factories for EVERY model with `independently_created: true` in entity-audit.md +3. Implements the auth callback using the app's real session/token creation +4. Passes both secrets from environment variables + +Match existing codebase patterns — import style, file organization, error handling. + +### 4. 
Register factories (one per model with creation code) + +For every entry in entity-audit.md with `independently_created: true`: + +- Import the function from `creation_file` (post-extraction if Branch 1 applied) +- Wrap it in `defineFactory({ create, teardown? })` from `@autonoma-ai/sdk` +- In `create`: call the imported function with the resolved data and return at least `{ id }` (the primary key) +- Optionally define `teardown` for custom cleanup (SQL DELETE is the default) + +#### The one thing you MUST NOT do + +Do not re-implement the creation logic inline using the ORM, even if calling the real function +is inconvenient (constructor arguments, DI containers, weird signatures). The entire point of +the factory is to stay on the user's code path so that when they add business logic later — +password hashing, audit logs, Stripe sync, state-machine transitions — the test data gets it +for free. Inline ORM calls bypass all of that silently and are the #1 bug source in generated +factories. + +**A raw ORM/DB write MUST NEVER appear in a factory body for an `independently_created: true` +model.** There are no exceptions. 
Exact patterns vary by language/ORM — a non-exhaustive list: + +- TypeScript/JavaScript: `prisma.<model>.create(`, `db.<model>.create(`, `tx.insert(`, `drizzle.insert(`, `knex('<table>').insert(`, `sequelize.models.<Model>.create(`, `typeorm.getRepository(...).save(`, `mongoose.Model.create(`, `await <Model>.create(`, `<model>.upsert(` +- Python: `session.add(`, `session.execute(insert(...))`, `Model.objects.create(`, `Model(...).save(`, `db.session.add(`, `conn.execute("INSERT ...")` +- Ruby/Rails: `<Model>.create(`, `<Model>.create!(`, `<Model>.new(...).save`, `<Model>.insert(`, `ActiveRecord::Base.connection.execute("INSERT ...")` +- PHP/Laravel: `<Model>::create(`, `new <Model>(...)->save()`, `DB::table('...')->insert(`, `$repository->persist(` +- Java/Spring: `entityManager.persist(`, `<repository>.save(`, `jdbcTemplate.update("INSERT ...")` +- Go: `db.Create(`, `gorm.DB.Create(`, `sq.Insert(`, raw `db.Exec("INSERT ...")` / `db.ExecContext(...)` +- Elixir/Ecto: `Repo.insert(`, `Repo.insert!(`, `Repo.insert_all(` +- Rust: `diesel::insert_into(`, `sqlx::query!("INSERT ...")`, `sea_orm::ActiveModel ... .insert(` +- Raw SQL anywhere: an `INSERT INTO <table>` string literal passed to a query/exec/prepare API + +If you wrote one of these inside a factory body for a model whose audit says +`independently_created: true`, you took the trap. Delete it. Go back to the per-model decision +tree and the DI playbook. + +**WRONG — re-implementing creation logic inline (this is the trap):** + +```ts +// entity-audit.md said: creation_function = OnboardingManager.getState +OnboardingState: defineFactory({ + create: async (data) => { + // Bypasses OnboardingManager entirely. If the user adds logic later, tests silently diverge. 
+ return db.onboardingState.create({ data: { applicationId: data.applicationId, step: "welcome" } }); + }, +}), +``` + +**RIGHT — call the audit's identified function, even if you have to instantiate a class:** + +```ts +import { OnboardingManager } from "@/lib/onboarding-manager"; + +OnboardingState: defineFactory({ + create: async (data, ctx) => { + // Uses the real code path. Any business logic added later flows through automatically. + const manager = new OnboardingManager(ctx.executor); + return manager.getState(data.applicationId); + }, +}), +``` + +### 4b. Populate `tableNameMap` sparsely (do not mirror the factory registry) + +The SDK auto-derives model names from SQL tables by splitting on `_` and PascalCasing +each part. **No pluralization is performed.** `organization` → `Organization`; +`organizations` → `Organizations`; `api_key` → `ApiKey`; `api_keys` → `ApiKeys`. + +Do NOT write a `tableNameMap` / `table_name_map` that mirrors your factory registry +1:1. That doubles the maintenance surface and is a silent-breakage foot-gun — adding a +new model forces two edits and forgetting one silently misroutes creates. + +**Algorithm to follow before writing the map:** + +1. List every factory key you intend to register. +2. For each key, compute `autoName = snakeToPascal(dbTable)` — split on `_`, PascalCase + each part, concatenate. No pluralization step. +3. If `autoName === factoryKey`: **do not add** the entry. +4. If `autoName !== factoryKey`: add the entry. +5. If after step 4 the map is empty, **omit the `tableNameMap` field entirely**. + +**Worked example (plural DB tables, singular factory keys):** + +```ts +// DB tables: organizations, users, api_keys +// Factory keys: Organization, User, ApiKey +// Every auto-derived name disagrees → every factory needs one entry: +tableNameMap: { + Organization: 'organizations', + User: 'users', + ApiKey: 'api_keys', +}, +factories: { Organization: ..., User: ..., ApiKey: ... 
}, +``` + +**Worked example (singular DB tables):** + +```ts +// DB tables: organization, user, api_key +// Factory keys: Organization, User, ApiKey +// Every auto-derived name matches → omit tableNameMap entirely. +factories: { Organization: ..., User: ..., ApiKey: ... }, +``` + +**Red flag.** If `tableNameMap` ends up with exactly one entry per factory and every +entry is a plural↔singular rename, you have two options: + +- (a) Keep the map (verbose but explicit). +- (b) Change factory keys to match the plural auto-derived names (`Organizations`, + `Users`, `ApiKeys`) and drop the map entirely. + +Prefer (b) unless scenario files already use the singular convention. A `tableNameMap` +that is a 1:1 copy of the factory registry means you're doing work the SDK already +does. + +### 5. Register the route + +Add the endpoint to the app's routing. + +### 6. Set up environment variables + +Add `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` to `.env`. If `.env.example` exists, add placeholders. + +## Smoke test + +Before writing the sentinel, run a single `discover` call to confirm the endpoint is wired +up and HMAC works. Do NOT run `up` or `down` here — that is the scenario-validator's job. + +```bash +export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)} +export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)} + +BODY='{"action":"discover"}' +SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') +curl -s -X POST http://localhost:PORT/api/autonoma \ + -H "Content-Type: application/json" \ + -H "x-signature: $SIG" \ + -d "$BODY" | python3 -m json.tool +``` + +Expected: JSON with `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`. + +If this fails, fix the handler (likely the adapter config or route mount) before writing +the sentinel. 
+ +## CRITICAL: Factory-integrity check (before writing the sentinel) + +Prove every factory calls the audit's identified `creation_function`. This is deterministic +static analysis, not a vibe check. Run it yourself and HALT if it fails — the next step +(scenario-validator) runs the exact same check and will kick the work back. + +### Step A — collect the audit targets + +Parse `autonoma/entity-audit.md` and build a list of `(model, creation_file, creation_function)` +for every model with `independently_created: true`. Also flag any entry that still has +`needs_extraction: true` — that's a bug (you were supposed to extract first and clear the +flag). HALT and go do the extraction. + +### Step B — grep the handler for the anti-pattern + +```bash +grep -nE '(prisma|db|tx)\.[a-zA-Z_]+\.(create|createMany|insert|upsert)\(' <handler-file> +``` + +Every match inside a `defineFactory({ create })` body is a RED FLAG. The only legitimate +matches are: +- Inside a model's `teardown` body (custom cleanup is allowed). +- Outside any `defineFactory` (auth callback, scope helpers, etc.). +- Inside a factory for a model the audit marked `independently_created: false` (no service exists; + raw ORM is the documented fallback — though the SDK does this automatically, so you usually + shouldn't even write such a factory). + +Anything else is the trap. Do NOT ship it. + +### Step C — per-model structural check + +For each `(model, creation_file, creation_function)` from Step A, verify ALL of: + +1. An `import` (or `require`) line pulls `creation_function` — or the class/object that owns + it — into the handler file, from a path that resolves to `creation_file`. +2. The factory body for `model` invokes that identified symbol (e.g. `manager.getState(...)`, + `createUser(...)`, `ProjectService.create(...)`, `service.create(...)`). +3. The factory body does NOT contain a raw ORM write for `model` (`db.<model>.create(...)`, + `prisma.<model>.create(...)`, `tx.insert(Table)`, etc.). 
+ +If any model fails any of the three, STOP. Fix the factory per the per-model decision tree +and the DI playbook, then re-run this check from Step A. + +### Step D — commit only when clean + +Only write `autonoma/.endpoint-implemented` after: +- Every `needs_extraction: true` flag in the audit has been resolved. +- Step B returns zero anti-pattern matches inside factory bodies. +- Step C passes for every audited model. +- The discover smoke test returns 200 with the expected schema shape. + +If you extracted any route-handler or framework-hook logic into a new exported function +(per Branch 1), the audit must have been updated in-place; re-read it after the edit before +running Step A. + +## CRITICAL: Write the implementation sentinel + +After the discover smoke test passes AND the factory-integrity check passes, use the +`Write` tool to create `autonoma/.endpoint-implemented` with a short plain-text summary: + +``` +Endpoint implemented. +- handler: +- packages: +- factories registered: +- extractions performed: +- scope field: +- auth callback: +``` + +Do NOT use `touch` — the hook fires only on `Write`/`Edit`. + +The next step (scenario-validator) will exercise up/down for every scenario and write +`autonoma/.endpoint-validated`. E2E test generation is blocked until that happens. + +## What to Explain to the User + +After implementation and validation, explain: + +1. **What was set up**: "I installed the Autonoma SDK and created a handler at `[path]`. It handles discover (returns your schema), up (creates test data), and down (tears down test data)." + +2. **Extractions performed**: For each `needs_extraction: true` model, show the inline block → new exported function mapping, and confirm the original caller now invokes the new function. + +3. **Factories registered**: List each factory — which function it wraps, which DI pattern was used, and what side effects the audit observed (or "none — factory is registered to future-proof"). + +4. 
**External side effects strategy**: which toggle/sandbox/wrapper was used. + +5. **How to set up secrets**: "Generate two secrets with `openssl rand -hex 32` and set them as: + - `AUTONOMA_SHARED_SECRET` — share this with Autonoma + - `AUTONOMA_SIGNING_SECRET` — keep this private" + +6. **Safety**: "The SDK can only INSERT records via the factories you registered (which call the user's real creation functions) or raw SQL for models without creation code. Teardown only deletes records that were created (verified by a cryptographically signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on existing data." + +## Important + +- Always implement in the project's existing backend — don't create a standalone server +- Match existing code patterns and conventions +- Use the same ORM/database layer the project already uses +- Register factories for EVERY model with `independently_created: true` in the audit — no exceptions, even for thin wrappers +- Resolve every `needs_extraction: true` by extracting FIRST, then wiring the factory +- Never reimplement the user's creation logic in a factory — always call their function +- `db.<model>.create()` in a factory for an `independently_created: true` model is NEVER acceptable +- ALL database writes go through the SDK endpoint — never write directly +- Use `testRunId` to make unique fields (emails, org names) to prevent parallel test collisions +- Validate the FULL lifecycle (discover → up → verify → down → verify) before completing diff --git a/agents/kb-generator.md b/agents/kb-generator.md index f26e998..cd83f42 100644 --- a/agents/kb-generator.md +++ b/agents/kb-generator.md @@ -21,22 +21,38 @@ You generate a structured knowledge base for a codebase. Your output MUST be wri ## Instructions -1. First, fetch the latest knowledge base generation instructions: +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. 
The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt` - and follow those instructions for how to analyze the codebase. + To fetch a doc, run the bash command literally — the shell expands the path, not you: -2. Create the output directory if it doesn't exist: + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. Fetch the latest knowledge base generation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt" + ``` + + Read the output and follow those instructions for how to analyze the codebase. + +3. Create the output directory if it doesn't exist: ```bash mkdir -p autonoma/skills ``` -3. Follow the fetched instructions to analyze the codebase — discover the application, +4. Follow the fetched instructions to analyze the codebase — discover the application, map pages and flows, identify core workflows. -4. Write the output to `autonoma/AUTONOMA.md`. +5. Write the output to `autonoma/AUTONOMA.md`. -5. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered. +6. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered. ## CRITICAL: Output Format diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md index 342899a..57cc418 100644 --- a/agents/scenario-generator.md +++ b/agents/scenario-generator.md @@ -1,7 +1,7 @@ --- description: > Generates test data scenarios from a knowledge base. - Reads AUTONOMA.md plus SDK discover output and produces scenarios.md with three named test data environments. 
+ Reads AUTONOMA.md and produces scenarios.md with three named test data environments. Output has YAML frontmatter with scenario summaries for deterministic validation. tools: - Read @@ -16,61 +16,71 @@ maxTurns: 40 # Scenario Generator -You generate test data scenarios from a knowledge base. Your inputs are `autonoma/AUTONOMA.md`, -`autonoma/skills/`, and `autonoma/discover.json`. Your output MUST be written to -`autonoma/scenarios.md` with YAML frontmatter. +You generate test data scenarios from a knowledge base. Your input is `autonoma/AUTONOMA.md` +and `autonoma/skills/`. Your output MUST be written to `autonoma/scenarios.md` with YAML frontmatter. ## Instructions -1. First, fetch the latest scenario generation instructions: +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt` - and follow those instructions for how to design scenarios. + To fetch a doc, run the bash command literally — the shell expands the path, not you: -2. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types. + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` -3. Read `autonoma/discover.json`. Treat the SDK `discover` response as the source of truth for: - - database models - - fields and requiredness - - foreign key edges - - parent/child relations - - scope field + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. - While reading the schema, assess whether the scope entity provides real **per-run data isolation**. - Ask yourself: does the scope entity parent most other models via required foreign keys? 
Can a new - scope entity be created per test run (i.e. it has creatable fields beyond just auto-generated IDs)? - Do most models in the graph eventually chain back to the scope entity? +2. Fetch the latest scenario generation instructions: - If the answer is yes to all of these, the app has natural multi-tenant isolation — each test run - can create its own scope entity and all child data is automatically partitioned. + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-scenarios.txt" + ``` - If the scope entity is a singleton, shared across users, or doesn't meaningfully partition data - across concurrent runs, the app **lacks natural per-run isolation**. In this case you must slug - all identifying fields with `{{testRunId}}` (see step 6 below) so that parallel or sequential - test runs never collide on lookup, search, or assertion values. + Read the output and follow those instructions for how to design scenarios. + +3. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types. + +4. Read `autonoma/entity-audit.md` — this is the authoritative schema map from Step 2. + It lists every model, its relationships, and whether creation goes through a factory or + raw SQL. Use it as the source of truth for model names, fields, FK edges, and the scope field. + +5. Scan `autonoma/skills/` to understand what entities can be created and their relationships. + +6. Explore the backend codebase only to fill gaps the audit does not cover (e.g. enum values, + string length limits, constraint details). - If `autonoma/discover.json` is missing or malformed, stop and tell the user that Step 2 now - requires a valid SDK discover artifact before scenario generation can continue. +7. **Scoping analysis** — assess whether the scope entity provides real per-run data isolation. + Ask: does the scope entity parent most other models via required FKs? Can a new scope entity + be created per test run (i.e. 
it has creatable fields beyond auto-generated IDs)? Do most + models eventually chain back to the scope entity? -4. Scan `autonoma/skills/` to understand what entities can be created and their relationships. + If yes to all: the app has natural multi-tenant isolation — each test run creates its own + scope entity and all child data is automatically partitioned. -5. Use the SDK discover schema plus the knowledge base to design three scenarios: `standard`, `empty`, `large`. + If the scope entity is a singleton, shared across users, or does not meaningfully partition + data across concurrent runs: the app **lacks natural per-run isolation**. In this case you + MUST slug all identifying fields with `{{testRunId}}` (see step 9) so parallel or sequential + test runs never collide on lookup, search, or assertion values. + +8. Design three scenarios: `standard`, `empty`, `large`. -6. Prefer hardcoded values when they make the resulting tests simpler, more reviewable, and more stable. - If a field needs run-level uniqueness but can still be expressed as a concrete literal, prefer a planner-chosen - hardcoded value with a discriminator suffix or prefix over introducing a variable placeholder. - Example: prefer `Acme Project testRunId suffix` encoded as a concrete scenario value over turning the whole field +9. **Variable fields.** Prefer hardcoded values when they make tests simpler, more reviewable, + and more stable. If a field needs run-level uniqueness but can still be expressed as a + concrete literal, prefer a planner-chosen hardcoded value with a discriminator suffix over + introducing a variable placeholder. + Example: prefer `Acme Project qa-17` encoded as a concrete value over turning the field into `{{project_name}}` unless later tests truly need the placeholder. - **Exception — apps without natural per-run isolation:** If your scoping analysis in step 3 - determined the app lacks natural multi-tenant isolation, **reverse the default above**. 
Slug ALL - identifying fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline - `{{testRunId}}` so that every value a test might search for, type into a form, or assert on screen - is unique to that test run. Use the pattern `Concrete Value {{testRunId}}` (e.g. - `Acme Corp {{testRunId}}`, `Main Project {{testRunId}}`). Each slugged field becomes a - `variable_field` entry with `generator: derived from testRunId`. This prevents parallel or - sequential test runs from interfering with each other when there is no scope entity to partition - the data. + **Exception — apps without natural per-run isolation:** if your scoping analysis determined + the app lacks natural multi-tenant isolation, **reverse the default**. Slug ALL identifying + fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline + `{{testRunId}}` so every value a test might search, type, or assert on screen is unique to + that test run. Pattern: `Concrete Value {{testRunId}}` (e.g. `Acme Corp {{testRunId}}`). + Each slugged field becomes a `variable_field` entry with `generator: derived from testRunId`. Use variable fields sparingly. Only mark a value as variable when at least one of these is true: - the field must be globally unique or is highly collision-prone across runs @@ -83,10 +93,8 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom constraint enforced by the database or application **must** be variable — hardcoding them will cause test failures when the hardcoded value expires or collides. - Do not mark a field as variable just because: - - it is user-facing text - - it could be unique in theory - - you want to avoid choosing a concrete literal + Do not mark a field as variable just because it is user-facing text, could be unique in + theory, or you want to avoid choosing a concrete literal. 
Every variable field must have: - a double-curly token such as `{{project_title}}` @@ -95,20 +103,46 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom - a reason explaining why it truly must vary - a plain-language test reference such as `({{project_title}} variable)` - `generator` is optional. If you include it, use a short free-form strategy note such as - `derived from testRunId`, `planner literal plus discriminator`, `backend-generated`, `UUID suffix`, - or `timestamp-based`. + `generator` is optional. Use a short free-form strategy note such as `derived from testRunId`, + `planner literal plus discriminator`, `backend-generated`, `UUID suffix`, or `timestamp-based`. Do not default to `faker`. Prefer deterministic derivation from stable inputs, and use `faker` - only as a last resort when deterministic strategies are not practical. - - Good: - - use a concrete value such as `Acme Workspace qa-17` when the planner can safely choose it and append a discriminator - - only `{{owner_email}}` is variable because login requires uniqueness across runs - - Bad: - - every user name, organization name, and label is variable with `faker.*` by default - -7. Write the output to `autonoma/scenarios.md`. + only as a last resort. + +10. **Nested tree constraint.** Design scenario entity tables so they can be expressed as a + nested tree rooted at the scope entity. Step 4 (env-factory) and Step 5 (scenario-validator) + will convert scenarios into nested `create` payloads — flat cross-model structures connected + only by `_ref` break when JSON key order is not preserved. Children must nest under their + parent using the relation field names from the audit. Use `_ref` only for cross-branch + references that cannot be expressed through nesting. + +11. 
**Standalone vs via-owner choice.** For every model that appears in a scenario, consult + the audit and pick one of two paths: + + - If the model has `independently_created: true` and the scenario narrative wants it + in isolation (e.g. the user creates a child directly, independent of any root), add + it as a top-level tree node. The SDK will call its factory directly. + - If the model appears in some owner's `created_by` list and the scenario narrative + already includes that owner (e.g. the scenario already has the root, and a default + child / onboarding row / deployment row comes along for free), **do NOT add the + model as a separate node**. It is created as a side effect of the owner's factory. + Quote the `why` from the audit in the scenario prose so the reader knows where it + came from. + + **Dual models** (`independently_created: true` AND listed in someone's `created_by`) + get to pick per-scenario: + + - Narrative where the root is being created for the first time → the child comes in + via the owner (via-owner path). + - Narrative where the root already exists and the user is creating a standalone child + → the child is a top-level node (standalone-factory path); its owner is also in + the tree, as its FK parent. + + Never double-create a dependent. If the audit says an owner mints a dependent row + inline, and your scenario has that owner, the dependent must not appear as a separate + tree node — the factory already creates it, and adding it twice will either fail + uniqueness checks or produce confusing test state. + +12. Write the output to `autonoma/scenarios.md`. 
## CRITICAL: Output Format @@ -136,12 +170,6 @@ entity_types: - name: "Test" - name: "Run" - name: "Folder" -discover: - source: sdk - model_count: 12 - edge_count: 18 - relation_count: 16 - scope_field: "organizationId" variable_fields: - token: "{{project_title}}" entity: "Project.title" @@ -152,7 +180,6 @@ variable_fields: reason: "title must be unique per test run" test_reference: "({{project_title}} variable)" planning_sections: - - sdk_discover - schema_summary - relationship_map - variable_data_strategy @@ -169,33 +196,28 @@ planning_sections: - `total_entities`: Total count of entities created in this scenario - **entity_types**: List of ALL entity types discovered in the data model. Each has: - `name`: Entity type name (e.g., "User", "Project", "Run") -- **discover**: Summary of the SDK discover artifact. It must include: - - `source`: exactly `sdk` - - `model_count`, `edge_count`, `relation_count`: counts from `autonoma/discover.json` - - `scope_field`: scope field name from `autonoma/discover.json` -- **variable_fields**: List of generated or per-run values that tests must not treat as hardcoded literals. - Each entry has: +- **variable_fields**: List of generated or per-run values that tests must not treat as + hardcoded literals. May be `[]` if no variable fields are needed. Each entry has: - `token`: double-curly placeholder such as `{{project_title}}` - `entity`: entity field path such as `Project.title` - `scenarios`: list of scenario names that use this variable - `reason`: why this field must be generated - `test_reference`: how tests should refer to the value in natural language - - optional `generator`: free-form generation hint such as `derived from testRunId` or `backend-generated` + - optional `generator`: free-form generation hint such as `derived from testRunId` - **planning_sections**: A list describing which planning artifacts are present. 
It must include: - - `sdk_discover` - `schema_summary` - `relationship_map` - `variable_data_strategy` - - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation and you need to explain why fields were aggressively slugged with `{{testRunId}}` + - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation + and you need to explain why fields were aggressively slugged with `{{testRunId}}` ### After the frontmatter The rest of the file follows the standard scenarios.md format from the fetched instructions: -- Include a `## SDK Discover` section summarizing the schema counts and scope field. -- Include a `## Schema Summary` section listing the key models and required fields that drive the scenarios. -- Include a `## Relationship Map` section describing the important parent/child and FK relationships. -- Include a `## Variable Data Strategy` section explaining which values are generated and how tests should reference them. -- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation — explain why fields were aggressively slugged with `{{testRunId}}` and what isolation boundary the slugging replaces. +- Include a `## Schema Summary` section listing the key models and required fields driving the scenarios. +- Include a `## Relationship Map` section describing parent/child and FK relationships. +- Include a `## Variable Data Strategy` section explaining which values are generated and how tests reference them. +- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation. - Scenario: `standard` (credentials, entity tables with concrete data, aggregate counts) - Scenario: `empty` (credentials, all entity types listed as None) - Scenario: `large` (credentials, high-volume data described in aggregate) @@ -207,28 +229,24 @@ you'll receive an error message. Fix the issue and rewrite the file. 
The validation checks: - File starts with `---` (YAML frontmatter) -- Frontmatter contains scenario_count, scenarios, entity_types, discover, variable_fields -- Frontmatter contains planning_sections metadata +- Frontmatter contains scenario_count, scenarios, entity_types, variable_fields, planning_sections - scenarios list length matches scenario_count - Required scenarios (standard, empty, large) are present - Each scenario has name, description, entity_types, total_entities - entity_types is a non-empty list with name fields -- discover includes sdk source, schema counts, and scope field - variable_fields entries use double-curly tokens and known scenario names -- planning_sections includes sdk_discover, schema_summary, relationship_map, and variable_data_strategy +- planning_sections includes schema_summary, relationship_map, and variable_data_strategy ## Important - **The scenario data is a contract.** Fixed values are hard assertions; variable fields are explicit placeholders. -- Prefer concrete literals for seed data unless the field truly must vary across runs. +- Prefer concrete literals unless the field truly must vary across runs. - Use variables sparingly. A smaller, justified variable list is better than marking every identity field dynamic. -- Do not default to `faker`. Prefer deterministic strategies such as planner-chosen literals with stable discriminator conventions, deriving from `testRunId`, or backend-generated values. -- If a field can safely be a concrete literal for review and testing, keep it concrete. -- Only include `generator` when the generation mechanism is important to communicate. +- Do not default to `faker`. Prefer deterministic strategies — planner-chosen literals with stable discriminators, derivation from `testRunId`, or backend-generated values. 
- Every value must be concrete — not "some applications" but "3 applications: Marketing Website, Android App, iOS App" - Every relationship must be explicit — which entities belong to which - Every enum value must be covered in `standard` -- Use the SDK discover output instead of re-deriving the schema from local code -- If the discover artifact is missing, ask the user to provide a working SDK discover response -- Only use `{{testRunId}}` as a template token — do not invent custom variable tokens like `{{user_email_alice}}`. The SDK template engine only resolves built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`, etc.). Custom tokens cause a runtime error when the dashboard sends the payload directly to the endpoint. If a field needs uniqueness, inline the testRunId directly: e.g. `alice-{{testRunId}}@test.local` -- Design scenario entity tables so they can be expressed as a nested tree rooted at the scope entity. The Step 4 agent will convert scenarios into nested `create` payloads — flat cross-model `_ref` only structures break when JSON key order is not preserved +- Use subagents to parallelize data model discovery +- Only use `{{testRunId}}` as a template token in scenario BODIES (field values). Custom tokens like `{{user_email_alice}}` are only valid in `variable_fields` declarations — when the SDK resolves payloads at runtime it only knows built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`). If a field needs uniqueness inside the scenario body, inline testRunId: e.g. `alice-{{testRunId}}@test.local`. +- Design scenarios so each entity table can be serialised as a nested tree rooted at the scope entity. Flat cross-model `_ref`-only structures break when JSON key order is not preserved. +- If the audit does not describe a model you need, ask the user rather than guessing. 
diff --git a/agents/scenario-validator.md b/agents/scenario-validator.md index b91a8b5..f5ec61c 100644 --- a/agents/scenario-validator.md +++ b/agents/scenario-validator.md @@ -1,7 +1,9 @@ --- description: > - Validates planned scenarios against a live Autonoma SDK endpoint and writes - approved scenario recipes. Assumes SDK integration is already complete. + Validates the Environment Factory endpoint end-to-end by running discover/up/down + against every scenario, iteratively fixing handler bugs and reconciling scenarios.md + with the real behavior. Writes autonoma/.endpoint-validated on success. Hard gate + before E2E test generation. tools: - Read - Glob @@ -11,207 +13,247 @@ tools: - Bash - Agent - WebFetch -maxTurns: 60 +maxTurns: 120 --- -# Scenario Validator - -You validate the planned scenarios against an already-working Autonoma SDK endpoint. -Your inputs are `autonoma/discover.json`, `autonoma/scenarios.md`, and the existing backend behavior. -Your output is `autonoma/scenario-recipes.json`. -You MUST also leave a terminal artifact in `autonoma/.scenario-validation.json`. - -## Goal - -Step 1 already handled SDK installation, endpoint wiring, secrets, branch creation, and any PR work. -This step is validation-only. Your job is to: - -1. read the schema contract from `autonoma/discover.json` -2. read the scenario intent from `autonoma/scenarios.md` -3. smoke-test `discover`, `up`, and `down` against the live endpoint -4. validate `standard`, `empty`, and `large` -5. persist approved recipes to `autonoma/scenario-recipes.json` - -## Strict Prohibitions - -- Do NOT install packages. -- Do NOT edit backend code. -- Do NOT modify SDK source code. -- Do NOT modify database schemas or migrations. -- Do NOT create branches, commits, or PRs. -- Do NOT try to "fix" validation failures by changing the SDK contract. - -If validation fails, report the backend or recipe issue clearly and stop. 
Treat failures as integration or scenario issues, not coding tasks for this step. -On failure, still write `autonoma/.scenario-validation.json` with `status: "failed"` and all blocking issues. - -## Instructions - -1. Fetch the current SDK protocol reference: - - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt` - -2. Read: - - `autonoma/discover.json` - - `autonoma/scenarios.md` - -3. Read `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` from the environment. - - If `AUTONOMA_SDK_ENDPOINT` is missing or the endpoint is unreachable, stop and tell the user to check Step 1 or the local dev server status. - - Do not try to implement or repair the endpoint in this step. - -## Validation Requirements - -### Smoke-test the live endpoint - -At minimum: -1. confirm `discover` works -2. send one signed `up` request with a small inline `create` payload compatible with the schema -3. send the corresponding signed `down` request using the returned `refsToken` -4. verify cleanup succeeds - -### Scenario validation - -After the smoke test works, validate `standard`, `empty`, and `large` against the current backend. - -Prefer: -1. backend-local `checkScenario` / `checkAllScenarios` if already available without code changes -2. signed endpoint `up` / `down` validation otherwise - -Do not change the backend if validation fails. Report the failure and stop. 
- -## Recipe Shape Requirements - -Write `autonoma/scenario-recipes.json` in this exact logical shape: - -```json -{ - "version": 1, - "source": { - "discoverPath": "autonoma/discover.json", - "scenariosPath": "autonoma/scenarios.md" - }, - "validationMode": "sdk-check", - "recipes": [ - { - "name": "standard", - "description": "Realistic dataset for core flows", - "create": { - "Organization": [{ - "_alias": "org1", - "name": "Acme Corp" - }] - }, - "variables": { - "testRunId": { - "strategy": "derived", - "source": "testRunId", - "format": "{testRunId}" - } - }, - "validation": { - "status": "validated", - "method": "checkScenario", - "phase": "ok", - "up_ms": 12, - "down_ms": 8 - } - } - ] -} -``` - -Required rules: -- top-level keys must be `version`, `source`, `validationMode`, and `recipes` -- `version` must be integer `1` -- `source.discoverPath` must be `autonoma/discover.json` -- `source.scenariosPath` must be `autonoma/scenarios.md` -- `validationMode` must be `sdk-check` or `endpoint-lifecycle` -- `recipes` must include `standard`, `empty`, and `large` -- every recipe must contain `name`, `description`, `create`, and `validation` -- every `validation` object must contain: - - `status: "validated"` - - `method`: one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down` - - `phase: "ok"` - - optional `up_ms` / `down_ms` as non-negative integers - -### Nested tree requirement - -Recipe `create` payloads MUST use a nested tree rooted at the scope entity. -Do NOT use flat top-level model keys connected only by `_ref`. - -Children must be nested under their parent using the relation field names from `discover.json`. -Use `_ref` only for cross-branch references that cannot be expressed through nesting. - -### Variables requirement - -If `create` contains `{{token}}` placeholders, include a `variables` object for that recipe. 
- -Allowed strategies: -- `literal` -- `derived` -- `faker` - -Rules: -- every `{{token}}` in `create` must have a matching key in `variables` -- every key in `variables` must be used in `create` -- fully concrete recipes do not need `variables` -- if the backend requires explicit scalar foreign-key values in addition to nested trees, include those scalar assignments using `_ref`-resolved values -- any collision-prone unique value must be derived from `testRunId` - -Do not write the old shape. In particular, do not use: -- top-level `generatedAt` -- top-level `scenarios` -- per-recipe `validated` -- per-recipe `timing` - -## Preflight Endpoint Validation - -After writing `autonoma/scenario-recipes.json`, you MUST run: - -```bash -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json -``` - -This requires: -- `AUTONOMA_SDK_ENDPOINT` -- `AUTONOMA_SHARED_SECRET` - -If preflight fails, do NOT rewrite backend code. Report the failure clearly and stop. - -Before returning, always write `autonoma/.scenario-validation.json` with this shape: - -```json -{ - "status": "ok", - "preflightPassed": true, - "smokeTestPassed": true, - "validatedScenarios": ["standard", "empty", "large"], - "failedScenarios": [], - "blockingIssues": [], - "recipePath": "autonoma/scenario-recipes.json", - "validationMode": "sdk-check", - "endpointUrl": "http://localhost:3000/api/autonoma" -} -``` - -If the step fails, keep the same shape but set: -- `status: "failed"` -- `preflightPassed: false` when preflight did not pass -- `failedScenarios` to the scenarios that failed -- `blockingIssues` to the concrete validation/runtime blockers - -## What to Explain to the User - -When finished, explain: -1. the endpoint that was validated -2. whether the smoke `discover -> up -> down` lifecycle passed -3. whether `standard`, `empty`, and `large` validated successfully -4. what validation method was used -5. 
where `autonoma/scenario-recipes.json` was written -6. where `autonoma/.scenario-validation.json` was written -7. any remaining manual deployment or backend issues that need attention - -## Important - -- Treat `discover.json` as the schema contract and `scenarios.md` as the scenario intent. -- Assume SDK integration is already complete. -- If the endpoint is down, tell the user to restart or redeploy the Step 1 integration instead of attempting code edits here. -- The orchestrator must be able to trust `autonoma/.scenario-validation.json` as the only terminal-state signal for this step. +# Scenario Validator: iterative fix loop + reality reconciliation + +The Environment Factory endpoint exists (step 4 wrote `autonoma/.endpoint-implemented`). +Your job is to prove it actually works and keep iterating until it does. The E2E test +generator (step 6) is gated on your sentinel — if you do not write +`autonoma/.endpoint-validated`, no tests get generated. + +## Database Safety (absolute) + +- ALL writes go through the SDK endpoint only. Never INSERT/UPDATE/DELETE/DROP/TRUNCATE via psql or raw SQL. +- You MAY run SELECT via psql / ORM read queries to verify data. +- The SDK's `down` action deletes only what `up` created (signed refs token). 
+ +## Inputs + +- `autonoma/entity-audit.md` — every model and whether it needs a factory +- `autonoma/scenarios.md` — scenario definitions (may contain mistakes you will correct) +- The handler file created in step 4 +- A running dev server (start one if it is not up — ask the user for the port) +- `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` (for HMAC signing + preflight) + +## Outputs + +- `autonoma/scenario-recipes.json` — validated nested `create` trees per scenario +- `autonoma/.scenario-validation.json` — terminal artifact the orchestrator reads +- `autonoma/.endpoint-validated` — sentinel that gates Step 6 (test generation) + +## The loop + +Repeat until all three actions succeed for every scenario OR you exhaust 5 iterations +(if you hit 5, STOP and report — do not fake success): + +1. Fetch the protocol docs (first iteration only): + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/protocol.txt" + curl -sSfL "$(cat autonoma/.docs-url)/llms/scenarios.txt" + ``` + + If curl fails, STOP and report — do not fabricate a URL. + +2. Export working secrets (same values the handler reads): + + ```bash + export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)} + export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)} + ``` + +3. Run `discover` via curl with proper HMAC. + - The response MUST contain `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`. + - **Coverage check**: every model in `entity-audit.md` MUST appear in `schema.models`. If one is missing, fix the handler's model filter / adapter config and restart the loop. + - **Factory coverage check**: open the handler file(s), extract the registered factory names. Every model with `independently_created: true` in the audit MUST be registered. + - **Factory-body integrity check (deterministic, MANDATORY)**: this is the check the env-factory agent is supposed to run before writing its sentinel. 
Re-run it here; do not trust the upstream. Steps: + 1. Grep the handler file(s) for raw DB/ORM writes. The pattern set must cover every + language and ORM the SDK supports — any of these appearing inside a factory body for a + model with `independently_created: true` is a FAIL: + ```bash + # TypeScript/JavaScript — Prisma, Drizzle, Knex, Sequelize, TypeORM, Mongoose + grep -nE '(prisma|db|tx|trx)\.[a-zA-Z_]+\.(create|createMany|upsert)\(|\b(drizzle|db|tx)\.insert\(|\bknex\([^)]*\)\.insert\(|\.models\.[A-Za-z_]+\.create\(|getRepository\([^)]*\)\.save\(|\bMongoose.*\.create\(' + + # Python — SQLAlchemy, Django ORM + grep -nE '\bsession\.(add|execute|bulk_insert_mappings)\(|\.objects\.create\(|\.save\(\)' + + # Ruby/Rails — ActiveRecord + grep -nE '\b[A-Z][A-Za-z0-9]*\.(create|create!|insert|insert_all)\(|\.new\([^)]*\)\.save' + + # PHP/Laravel — Eloquent, raw DB + grep -nE '\b[A-Z][A-Za-z0-9]*::create\(|->save\(\)|\bDB::table\([^)]*\)->insert\(' + + # Java/Spring — JPA, JDBC + grep -nE '\bentityManager\.persist\(|\b[a-zA-Z]+Repository\.save\(|\bjdbcTemplate\.update\(' + + # Go — GORM, database/sql, squirrel + grep -nE '\.Create\(|\bdb\.Exec(Context)?\(|\bsq\.Insert\(' + + # Elixir/Ecto + grep -nE '\bRepo\.(insert|insert!|insert_all)\(' + + # Rust — Diesel, SQLx, SeaORM + grep -nE '\bdiesel::insert_into\(|\bsqlx::query!?\("INSERT|ActiveModel[^{]*\.insert\(' + + # Raw SQL INSERT in any language + grep -niE '"[^"]*INSERT\s+INTO\b|'"'"'[^'"'"']*INSERT\s+INTO\b' + ``` + Use the pattern set appropriate for the project's stack (determined from the handler file + and `entity-audit.md`); include the raw-SQL pattern unconditionally. Any match that + falls inside a factory body for a `independently_created: true` model is a FAIL. + 2. For each `(model, creation_file, creation_function)` from `entity-audit.md`, verify the handler contains both an `import` resolving to `creation_file` AND an invocation of `creation_function` inside that model's factory body. + 3. 
If any model fails either check, this is a **handler bug** (path a of step 4, sub-step 3, below). Fix by importing and calling the audited function. If the audit pointed at an inline route handler (no exported function), extract it into a named exported function in a nearby module, replace the route body with a call to the new function, update `entity-audit.md` in place with the new `creation_file`/`creation_function`, then restart this step. + 4. The validator MUST NOT write `.endpoint-validated` while any factory body contains a raw ORM create for its own model. + +4. For each scenario in `scenarios.md`: + 1. Build the `{action:"up", create:..., testRunId:"-"}` body from the scenario. + 2. HMAC-sign and POST. + 3. If non-200 or error body, pick one of three paths: + a. **Handler bug** (missing factory, bad FK handling, wrong adapter config) → fix the handler and restart. + b. **Scenario bug** (field does not exist on the model, FK target wrong, scope field missing) → edit `scenarios.md` to match reality and restart. Log the change. + c. **Unfeasible scenario** (requires data the app cannot produce) → REMOVE the scenario from `scenarios.md` with justification. Restart. + 4. If 200: parse `auth`, `refs`, `refsToken`. + - **Auth check**: `auth` MUST be non-null and contain at least one of `{ cookies, headers, token, user }`. If empty, the auth callback is not wired — fix it and restart. + - **Refs check**: every top-level model in the `create` tree MUST appear in `refs`. + 5. Verify DB state with a read-only `SELECT` for at least one refs id. + 6. POST `{action:"down", refsToken}`. Expect `{ok:true}`. + 7. Verify the refs rows are gone. + +5. After every scenario passes cleanly, emit the scenario recipes.
+ + Write `autonoma/scenario-recipes.json` with this shape (recipes mirror the `create` + trees you just validated — one entry per scenario): + + ```json + { + "version": 1, + "source": { + "scenariosPath": "autonoma/scenarios.md" + }, + "validationMode": "endpoint-lifecycle", + "recipes": [ + { + "name": "standard", + "description": "Realistic dataset for core flows", + "create": { + "Organization": [{ + "_alias": "org1", + "name": "Acme Corp" + }] + }, + "variables": { + "testRunId": { + "strategy": "derived", + "source": "testRunId", + "format": "{testRunId}" + } + }, + "validation": { + "status": "validated", + "method": "endpoint-up-down", + "phase": "ok", + "up_ms": 12, + "down_ms": 8 + } + } + ] + } + ``` + + Rules: + - top-level keys MUST be exactly `version`, `source`, `validationMode`, `recipes` + - `version` must be integer `1` + - `validationMode` must be `sdk-check` or `endpoint-lifecycle` (use `endpoint-lifecycle` + when you drove up/down via HTTP in the loop above) + - `recipes` MUST include `standard`, `empty`, and `large` + - every recipe MUST contain `name`, `description`, `create`, and `validation` + - every `validation` object MUST contain `status: "validated"`, `phase: "ok"`, and a + valid `method` (one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`) + - **Nested tree**: `create` MUST use a nested tree rooted at the scope entity. Do NOT + use flat top-level model keys connected only by `_ref`. Nest children under their + parent using relation field names. Use `_ref` only for cross-branch references that + cannot be expressed through nesting. + - **Variables**: if `create` contains `{{token}}` placeholders, include a `variables` + object. Every `{{token}}` in `create` must match a key in `variables`; every key + in `variables` must be used in `create`. Fully concrete recipes do not need `variables`. + Allowed strategies: `literal`, `derived`, `faker`. Any collision-prone unique value + must be derived from `testRunId`. 
+ - Do NOT write the legacy shape — no top-level `generatedAt`, no top-level `scenarios`, + no per-recipe `validated`, no per-recipe `timing`. + +6. Run preflight on the emitted recipes: + + ```bash + python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" \ + autonoma/scenario-recipes.json + ``` + + This resolves tokenized payloads and re-runs signed up/down against the live endpoint. + Requires `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` in the environment. + + If preflight exits non-zero, fix the failing recipe (or the corresponding scenario) and + re-run. Do NOT proceed to step 7 until preflight passes. + +7. Write the terminal artifact `autonoma/.scenario-validation.json` with this shape: + + ```json + { + "status": "ok", + "preflightPassed": true, + "smokeTestPassed": true, + "validatedScenarios": ["standard", "empty", "large"], + "failedScenarios": [], + "blockingIssues": [], + "recipePath": "autonoma/scenario-recipes.json", + "validationMode": "endpoint-lifecycle", + "endpointUrl": "http://localhost:3000/api/autonoma" + } + ``` + + On failure keep the same shape with `status: "failed"`, `preflightPassed: false` when + preflight did not pass, populated `failedScenarios`, and concrete `blockingIssues`. + +8. Only if every scenario validated and preflight passed, write the sentinel `autonoma/.endpoint-validated`. + + Use the `Write` tool (NOT `touch` — the hook fires only on `Write`/`Edit`) with a short + plain-text report: + + ``` + Validated N scenarios across M models. + - discover: all audited models present, all independently_created factories registered + - up: all N scenarios created successfully, auth returned {cookies|headers|token|user} + - down: all N scenarios cleaned up, no orphans + - recipes: autonoma/scenario-recipes.json emitted, preflight passed + - scenarios.md edits: + ``` + +## Iteration discipline + +- One handler fix per iteration, then re-run everything. Do not chain fixes blindly.
+- If the same scenario fails twice in a row with the same error, the scenario itself is probably wrong — prefer editing `scenarios.md` over contorting the handler. +- If you have edited `scenarios.md`, re-read it from disk after every edit. + +## When you hit the 5-iteration cap + +STOP and write a clear failure report. Do NOT write `.endpoint-validated`. Include: + +- the last failing curl body + response +- which scenario(s) failed +- which handler file + line range is most likely at fault + +The orchestrator will surface this to the user, who can intervene manually. + +## scenarios.md reconciliation rules + +When you edit `scenarios.md`, preserve the frontmatter shape (the validator hook checks +it). Allowed: + +- Drop a scenario entirely (decrement `scenario_count`, update the `scenarios` summary). +- Remove/rename fields on a model to match what `discover` reports. +- Adjust FK aliases so they reference models that actually exist. +- Flatten cross-branch references that the handler cannot resolve. + +Disallowed: silently changing a scenario's intent (e.g. renaming "admin with one project" +to "user with one project" without reflecting that in the description). diff --git a/agents/sdk-integrator.md b/agents/sdk-integrator.md deleted file mode 100644 index addd455..0000000 --- a/agents/sdk-integrator.md +++ /dev/null @@ -1,301 +0,0 @@ ---- -description: > - Detects the project stack, installs the Autonoma SDK from package managers, - wires the endpoint, starts a local dev server, verifies discover/up/down, and - opens a PR when possible. -tools: - - Read - - Glob - - Grep - - Write - - Edit - - Bash - - Agent - - WebFetch -maxTurns: 60 ---- - -# SDK Integrator - -You implement the Autonoma SDK integration as the first step of the planner pipeline. 
- -## Goal - -Detect the stack, install the SDK from package managers, add a minimal endpoint following the matching example or SDK README, ensure secrets exist, start a dev server, verify `discover`, `up`, and `down`, and prepare the repo for user review. - -The SDK reference repo path is provided by the orchestrator in `/tmp/autonoma-sdk-ref-dir`. Treat that repo as read-only reference material only. - -## Strict Rules - -- Install the SDK from package managers only. Never vendor, copy, or link SDK source into the user's app. -- **Never create a standalone server or sidecar.** The endpoint lives as a new route inside the project's existing backend. Do NOT create a new `FastAPI()` / `express()` / `Flask(__name__)` / `Gin.Default()` instance, a new `main.py` / `server.py` / `start-*.py` / `main.go` launcher, or open a separate port. If the project already has a backend, integrate into it. -- **SDK language must match backend language.** Detect the backend's language from its manifest file BEFORE picking an SDK. Do not install the Python SDK into a TypeScript/NestJS project (or vice versa). If no matching SDK exists for the backend language, stop per Step 3 — do NOT fall back to a sidecar in a different language. -- **Never scaffold at repo root when a backend directory exists**, including non-standard names like `core-app-backend/`, `apps/api/`, `services/core/`. Locate the backend first (Step 1). -- Do NOT modify the SDK reference repo. -- Do NOT modify database schemas, migrations, or models. -- Keep integration changes minimal and aligned with the project's existing conventions. -- Do NOT commit `.env`. -- Do NOT commit anything under `autonoma/`. -- You MUST leave a machine-readable terminal artifact in `autonoma/.sdk-integration.json` whether the step succeeds or fails. -- Do NOT report success unless both `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` have been written. - -## Required Order - -### 1. 
Locate the backend directory and detect the stack - -**Do this BEFORE picking an SDK.** The SDK must match the backend's language, so the backend must be located first. - -#### 1a. Enumerate candidate backend directories - -Use Glob / `ls`. Do NOT hardcode the name `backend/`. Real projects use many conventions: - -- `backend/`, `server/`, `api/`, `service/`, `services/` -- `*-backend/`, `*-api/`, `*-server/`, `core-*/`, `app-*/` (e.g. `core-app-backend/`) -- Monorepo layouts: `apps/*`, `packages/*`, `services/*` -- Single-repo backends at the workspace root - -#### 1b. Identify the backend by manifest file - -For each candidate, look for one of these manifest files — the file's language determines the SDK you install: - -- `package.json`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock` → TypeScript/JavaScript -- `pyproject.toml`, `requirements.txt`, `Pipfile` → Python -- `mix.exs` → Elixir -- `composer.json` → PHP -- `pom.xml`, `build.gradle` → Java -- `Gemfile`, `*.gemspec` → Ruby -- `Cargo.toml` → Rust -- `go.mod` → Go - -Pick exactly one backend. If multiple plausible candidates exist, STOP and ask the user which one — do not guess, do not implement in more than one. - -#### 1c. Confirm with the user before writing any code - -State your finding: - -> "I found the backend at `` (language: ``, framework: ``, ORM: ``, package manager: ``). I'll integrate the SDK there. Is that the right location?" - -Wait for confirmation before installing packages or writing files. - -#### 1d. Determine the rest of the stack - -From the identified backend directory, determine: -- language -- server framework -- ORM or DB adapter -- package manager - -### 2. Map the stack to the SDK docs matrix - -Use the matching runnable example from the SDK reference repo when available. -Otherwise use the documented SDK package combinations from SDK READMEs. 
- -Supported docs matrix: -- TypeScript: `@autonoma-ai/sdk` plus the matching ORM/server packages -- Python: `autonoma-sdk[...]` -- Elixir: `autonoma_sdk` -- PHP: `autonoma-ai/sdk` -- Java: `com.autonoma.ai:autonoma-sdk` -- Ruby: `autonoma-ai` -- Rust: `autonoma-sdk` -- Go: `github.com/autonoma-ai/sdk-go` - -### 3. Stop immediately if unsupported - -If the detected stack is not supported, stop and output a `mailto:` link to `support@autonoma.app`. - -The mailto body must include: -- detected language -- detected framework -- detected ORM or DB layer -- detected package manager -- repo name or directory name - -### 4. Create a branch - -Create a branch in the user repo: -- preferred base name: `autonoma/feat-autonoma-sdk` -- if it already exists, append `-2`, `-3`, and so on - -### 5. Install SDK packages - -Use the project's package manager. - -Examples: -- TypeScript + Express + Prisma: - - `npm install @autonoma-ai/sdk @autonoma-ai/sdk-prisma @autonoma-ai/server-express` -- TypeScript + Next.js + Drizzle: - - `pnpm add @autonoma-ai/sdk @autonoma-ai/sdk-drizzle @autonoma-ai/server-web` -- Python + FastAPI + SQLAlchemy: - - `pip install "autonoma-sdk[sqlalchemy,fastapi]"` -- Python + Django: - - `pip install "autonoma-sdk[django]"` -- Elixir + Phoenix + Ecto: - - add `{:autonoma_sdk, "~> 0.1"}` - -### 6. Implement the endpoint - -Follow the matching example or README pattern with minimal project-specific glue. - -Requirements: -- match the repo's routing conventions -- preserve existing auth/session patterns if the SDK auth callback needs them -- implement the current SDK contract for `discover`, `up`, and `down` -- do not create a throwaway second app or server if the project already has one - -### 7. Ensure secrets exist - -Check `.env` first if present. 
- -Ensure: -- `AUTONOMA_SHARED_SECRET` -- `AUTONOMA_SIGNING_SECRET` - -If missing: -- generate with `openssl rand -hex 32` -- ensure the two secrets differ -- append or update `.env` -- append or update `.env.example` with placeholder values and short comments - -Suggested comments: - -```env -# Autonoma SDK - shared secret for HMAC request signing -AUTONOMA_SHARED_SECRET=your-shared-secret-here -# Autonoma SDK - private secret for signing refs tokens -AUTONOMA_SIGNING_SECRET=your-signing-secret-here -``` - -### 8. Ensure planner artifacts are not committed - -If `/autonoma/` is not already ignored, add it to `.git/info/exclude`. - -### 9. Detect and run the dev server - -Prefer the repo's existing dev/start script or command. - -Examples to inspect: -- package scripts: `dev`, `start:dev`, `start` -- `Makefile` -- `Procfile` -- Django `manage.py runserver` -- Phoenix `mix phx.server` - -If a suitable server is already running and the expected endpoint responds, reuse it. -Otherwise start one in the background and persist its PID to: - -```bash -/tmp/autonoma-dev-server-pid -``` - -### 10. Verify endpoint behavior - -Run signed checks against the live endpoint: -1. `discover` -2. minimal `up` -3. `down` using returned `refsToken` - -Do not continue if any of these fail. - -### 11. Write the verified endpoint URL - -Write the final endpoint URL to: - -```text -autonoma/.sdk-endpoint -``` - -The file must contain only one absolute URL. - -### 12. 
Write the integration handoff artifact - -Write `autonoma/.sdk-integration.json` with this shape: - -```json -{ - "status": "ok", - "endpointUrl": "http://localhost:3000/api/autonoma", - "endpointPath": "/api/autonoma", - "stack": { - "language": "TypeScript", - "framework": "Express", - "orm": "Prisma", - "packageManager": "pnpm" - }, - "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma"], - "sharedSecretPresent": true, - "signingSecretPresent": true, - "devServer": { - "startedByPlugin": true, - "pid": 12345 - }, - "verification": { - "discover": { "status": "ok", "validatedByPlugin": true }, - "up": { "status": "ok" }, - "down": { "status": "ok" } - }, - "branch": { - "name": "autonoma/feat-autonoma-sdk" - }, - "pr": { - "url": "https://github.com/..." - }, - "blockingIssues": [] -} -``` - -If the step fails after doing any work, still write `autonoma/.sdk-integration.json` with: -- `status: "failed"` -- the best known values for stack, endpoint, server pid, and branch -- failed verification statuses -- every blocking issue listed in `blockingIssues` - -### 13. Commit only integration changes - -Stage only the SDK integration changes, such as: -- route or handler files -- package-manager manifests and lockfiles -- `.env.example` -- any required config files - -Do NOT stage: -- `.env` -- `autonoma/` - -Commit message: - -```text -feat: integrate autonoma sdk -``` - -### 14. Create a PR when possible - -If `gh` is available: -- push the branch -- create a PR - -Include a summary, required env vars, deployment reminder, and: - -```text -Co-authored-by: Autonoma -``` - -If `gh` is unavailable, report the exact manual next steps instead. - -### 15. Final report - -Explain: -1. detected stack -2. installed packages -3. endpoint path and URL -4. where secrets were added -5. dev server PID -6. PR URL or manual push/PR steps -7. 
where `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` were written - -## Verification Notes - -- Use the SDK reference repo in `/tmp/autonoma-sdk-ref-dir` only for examples and package-selection guidance. -- Prefer existing project conventions over generic examples when file placement differs. -- If the project already contains a partial SDK integration, extend it rather than replacing it. -- If lifecycle verification passes but artifact writing fails, the step is still incomplete. diff --git a/agents/test-case-generator.md b/agents/test-case-generator.md index 5eccfaf..ee951f0 100644 --- a/agents/test-case-generator.md +++ b/agents/test-case-generator.md @@ -27,17 +27,33 @@ Your output is a directory `autonoma/qa-tests/` containing: ## Instructions -1. First, fetch the latest test generation instructions: +1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use + WebFetch. Do NOT write any URL yourself. The docs base URL lives only in + `autonoma/.docs-url`, written by the orchestrator before any subagent runs. - Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt` - and follow those instructions for how to generate tests. + To fetch a doc, run the bash command literally — the shell expands the path, not you: -2. Read all input files: + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/" + ``` + + If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code + and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback. + +2. Fetch the latest test generation instructions: + + ```bash + curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-e2e-tests.txt" + ``` + + Read the output and follow those instructions for how to generate tests. + +3. 
Read all input files: - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count - All files in `autonoma/skills/` - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields** -3. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter +4. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`. When writing test steps that involve a variable field value — typing it, asserting it, or @@ -48,7 +64,7 @@ Your output is a directory `autonoma/qa-tests/` containing: - good: "assert the task deadline shows `{{deadline_1}}`" - bad: "assert the task deadline shows 2025-06-15" -4. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test. +5. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test. The scenarios exist only to provide preconditions and known data for app behavior tests. Do NOT generate tests whose purpose is to verify: - that the scenario contains the documented entity counts @@ -61,17 +77,17 @@ Your output is a directory `autonoma/qa-tests/` containing: - good: "open the project `{{project_title}}` and verify editing works" - bad: "verify the scenario created 12 projects and 3 users" -5. Count the routes/features/pages in the codebase to establish the coverage correlation. +6. Count the routes/features/pages in the codebase to establish the coverage correlation. The total test count should roughly correlate: - Rule of thumb: 3-5 tests per route/feature for supporting flows - Rule of thumb: 8-15 tests per core flow - This is approximate — use judgment, but the INDEX must declare the correlation -6. Generate test files organized in subdirectories by feature/flow. +7. 
Generate test files organized in subdirectories by feature/flow. -7. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files). +8. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files). -8. Write individual test files into subdirectories. +9. Write individual test files into subdirectories. ## CRITICAL: INDEX.md Format @@ -196,4 +212,3 @@ you'll receive an error message. Fix the issue and rewrite the file. - Use subagents to parallelize test generation across folders - Each test must be self-contained — no dependencies on other tests - Do not write code (no Playwright, no Cypress) — tests are markdown with natural language steps -- Prefer testing visible user outcomes over seed correctness or fixture inventory diff --git a/commands/generate-tests.md b/commands/generate-tests.md index 7f0bbc2..4ccc236 100644 --- a/commands/generate-tests.md +++ b/commands/generate-tests.md @@ -9,644 +9,183 @@ description: > # Autonoma E2E Test Generation Pipeline -You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent. +You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent. **Every step MUST complete successfully and pass validation before the next step begins.** Do NOT skip steps. Do NOT proceed if validation fails. -## User Confirmation Between Steps +## CRITICAL: User Confirmation Between Steps -By default, after each step (1, 2, 3, and 4), present the summary and automatically proceed to the -next step once validation passes. +After steps 1, 2, 3, 4, and 5 you MUST present the summary and ask the user for confirmation +using `AskUserQuestion`. After calling it, wait for the response. Only proceed after they confirm. -**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after -Steps 1-4. 
+## How lifecycle reporting works -**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`, -that means auto-advance as well. +You do NOT issue `curl` commands to report step start/complete/uploads. Plugin hooks do that: -If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation -using the `AskUserQuestion` tool. - -After calling `AskUserQuestion`, wait for the user's response. -Only proceed to the next step after they confirm. +- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`. +- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. It validates output, + emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate + (test files cannot be written until `autonoma/.endpoint-validated` exists). ## Before Starting -Create the output directory and save the project root: - -```bash -AUTONOMA_ROOT="$(pwd)" -echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root -mkdir -p autonoma autonoma/skills autonoma/qa-tests -cleanup_dev_server() { - DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '') - if [ -n "$DEV_SERVER_PID" ]; then - kill "$DEV_SERVER_PID" 2>/dev/null || true - rm -f /tmp/autonoma-dev-server-pid - echo "Dev server (PID $DEV_SERVER_PID) stopped." 
- fi -} -``` - -The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write: - -```bash -PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '') -``` - -Read the environment variables required for reporting progress back to Autonoma: -- `AUTONOMA_API_KEY` -- `AUTONOMA_PROJECT_ID` -- `AUTONOMA_API_URL` -- `AUTONOMA_AUTO_ADVANCE` — optional, canonical -- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias - -Add shared helpers before running the pipeline: - ```bash -auto_advance_enabled() { - if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then - return 0 - fi - if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then - return 0 - fi - return 1 -} - -refresh_generation_id() { - AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') - GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -} - -build_event_payload() { - python3 - "$1" "$2" "$3" <<'PY' -import json -import sys - -event_type, key, value = sys.argv[1:4] -print(json.dumps({"type": event_type, "data": {key: json.loads(value)}})) -PY -} - -build_step_payload() { - python3 - "$1" "$2" "$3" <<'PY' -import json -import sys - -event_type, step, name = sys.argv[1:4] -print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}})) -PY -} - -post_setup_event_blocking() { - refresh_generation_id - payload="$1" - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - for attempt in 1 2 3; do - if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null; then - return 0 - fi - sleep "$attempt" - done - echo "ERROR: Failed to post blocking setup event after retries: $payload" - return 1 -} - -post_setup_log() { - refresh_generation_id - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - 
payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")") - curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null || true -} - -patch_setup_status_blocking() { - refresh_generation_id - status="$1" - message="$2" - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - payload=$(python3 - "$status" "$message" <<'PY' -import json -import sys - -body = {"status": sys.argv[1]} -if sys.argv[2]: - body["errorMessage"] = sys.argv[2] -print(json.dumps(body)) -PY -) - for attempt in 1 2 3; do - if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null; then - return 0 - fi - sleep "$attempt" - done - echo "ERROR: Failed to patch setup status after retries: $status" - return 1 -} - -report_error_and_exit() { - message="$1" - preserve_dev_server="${2:-false}" - payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")") - post_setup_event_blocking "$payload" || true - echo "ERROR: $message" - if [ "$preserve_dev_server" != "true" ]; then - cleanup_dev_server - fi - exit 1 -} - -report_partial_failure_and_exit() { - message="$1" - post_setup_log "$message" - patch_setup_status_blocking "partial_failure" "$message" || true - echo "ERROR: $message" - cleanup_dev_server - exit 1 -} - -rehydrate_sdk_env() { - AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') - AUTONOMA_SDK_ENDPOINT=$(tr -d '\n' < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" 2>/dev/null || echo '') - AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-) - AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 
2>/dev/null | tail -n 1 | cut -d= -f2-) - export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET - if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then - return 1 - fi - return 0 -} +mkdir -p autonoma/skills autonoma/qa-tests ``` -Prepare the SDK reference repo for Step 1: +The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`. -```bash -SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}" -if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then - echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir -else - SDK_REF_DIR="$(mktemp -d)/autonoma-sdk" - if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then - echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir - else - echo "ERROR: Unable to prepare the SDK reference repo." - cleanup_dev_server - exit 1 - fi -fi -``` - -Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug. 
+## Step 1: Generate Knowledge Base -Create the generation record so the dashboard can track progress in real time: +Spawn `kb-generator`: -```bash -RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}") -HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) -BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') -echo "Setup API response (HTTP $HTTP_STATUS): $BODY" -GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') -echo "$GENERATION_ID" > autonoma/.generation-id -echo "Generation ID: $GENERATION_ID" -``` +> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML +> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill +> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals). +> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`. -If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway. +After completion: verify that `autonoma/AUTONOMA.md` and `autonoma/features.json` exist, present the core_flows table, ask for confirmation with `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single-character body). 
-## Step 1: SDK Integration +## Step 2: Entity Creation Audit -Report step start: +Spawn `entity-audit-generator`: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start." -post_setup_log "Detecting stack and integrating the Autonoma SDK..." -``` - -Spawn the `sdk-integrator` subagent with the following task: - -> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context. -> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with -> a `mailto:support@autonoma.app` link if unsupported. -> Create a branch, install the SDK from package managers only, implement the SDK endpoint following -> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` -> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server, -> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and -> `autonoma/.sdk-integration.json`, commit with -> `feat: integrate autonoma sdk`, and create a PR if `gh` is available. -> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models. - -**After the subagent completes:** -1. Verify `autonoma/.sdk-endpoint` exists and is non-empty -2. Verify `autonoma/.sdk-integration.json` exists and is non-empty -3. Read and export `AUTONOMA_SDK_ENDPOINT` from that file -4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env` -5. Confirm the endpoint is reachable with a signed `discover` request -6. 
Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes -7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available - -Load the endpoint and secrets: - -```bash -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \ - || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \ - || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true - -rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true - -BODY='{"action":"discover"}' -SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') -HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY") -if [ "$HTTP_STATUS" != "200" ]; then - report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true -fi -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \ - || report_error_and_exit "Step 1 discover response did not match the required schema." true -``` +> Read the knowledge base. Audit how each database model is created. For every model, find the +> dedicated creation function in a service/repository/helper. Classify as `independently_created: true` +> (factory) or `false` (raw SQL fallback). Record side_effects (informational). Output +> `autonoma/entity-audit.md` with frontmatter listing each model. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`. 
-Report step complete: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true -``` - -7. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here." - - options: ["Yes, proceed to Step 2", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 2. - -## Step 2: Generate Knowledge Base - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start." -post_setup_log "Analyzing codebase structure and identifying features..." -``` - -Spawn the `kb-generator` subagent with the following task: - -> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` -> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with -> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. -> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. -> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. 
- -**After the subagent completes:** -1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty -2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically -3. Read the file and present the frontmatter to the user — specifically the core_flows table - -Report step complete and upload skills: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ') -post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..." -post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion." -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -skills = [] -d = os.path.join(root, 'autonoma/skills') -if os.path.isdir(d): - for f in os.listdir(d): - if f.endswith('.md'): - with open(os.path.join(d, f)) as fh: - skills.append({'name': f, 'content': fh.read()}) -print(json.dumps({'skills': skills})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` - -4. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." - - options: ["Yes, proceed to Step 3", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 3. 
+After completion: present the audit, `AskUserQuestion`, `Write` `autonoma/.step-2-ack`. ## Step 3: Generate Scenarios -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start." -post_setup_log "Mapping data model and designing test data environments..." -``` - -Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`. -This step assumes Step 1 already produced: -- `AUTONOMA_SDK_ENDPOINT` -- `AUTONOMA_SHARED_SECRET` - -Fetch and validate the artifact: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -mkdir -p "$AUTONOMA_ROOT/autonoma" -rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1." -BODY='{"action":"discover"}' -SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') -RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY") -HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) -DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') -if [ "$HTTP_STATUS" != "200" ]; then - report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY" -fi -printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json" -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \ - || report_error_and_exit "Step 3 discover artifact did not pass validation." 
-``` - -Spawn the `scenario-generator` subagent with the following task: - -> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover -> artifact from `autonoma/discover.json`. -> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. -> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types, -> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a -> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before -> introducing a variable placeholder. Use variable fields only for truly dynamic values such as -> backend-generated or time-based fields. `generator` is optional and must not default to `faker`. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. - -**After the subagent completes:** -1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty -2. Validate `autonoma/discover.json` using the plugin's validator -3. The PostToolUse hook will have validated the frontmatter format automatically -4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic - -Report step complete: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional." -post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion." -``` - -4. 
**If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests." - - options: ["Yes, proceed to Step 4", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 4. - -## Step 4: Generate E2E Test Cases - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start." -post_setup_log "Generating E2E test cases from knowledge base and scenarios..." -``` - -Spawn the `test-case-generator` subagent with the following task: - -> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, -> and scenarios from `autonoma/scenarios.md`. -> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`. -> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests, -> total_folders, folder breakdown, and coverage_correlation. -> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. -> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify -> scenario counts, seeded inventories, or Environment Factory correctness. Only reference -> scenario data when it is needed to test a real user-facing app behavior. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. - -**After the subagent completes:** -1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty -2. Verify at least one non-`INDEX.md` test file exists -3. 
Verify actual test count matches `INDEX.md` -4. Verify folder breakdown matches `INDEX.md` -5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter -6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation - -Enforce the file-count postconditions: - -```bash -INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md" -[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md." -TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') -[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files." -python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md." -import sys -from pathlib import Path -import yaml - -index_path = Path(sys.argv[1]) -actual_count = int(sys.argv[2]) -qa_dir = Path(sys.argv[3]) - -content = index_path.read_text() -parts = content.split('---', 2) -if len(parts) < 3: - raise SystemExit('INDEX.md is missing YAML frontmatter') -frontmatter = yaml.safe_load(parts[1]) - -if frontmatter.get('total_tests') != actual_count: - raise SystemExit( - f'total_tests ({frontmatter.get("total_tests")}) does not match actual test files ({actual_count})' - ) - -actual_folders = {} -for path in qa_dir.rglob('*.md'): - if path.name == 'INDEX.md': - continue - folder = path.parent.relative_to(qa_dir).as_posix() - actual_folders[folder] = actual_folders.get(folder, 0) + 1 - -declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])} -if actual_folders != declared_folders: - raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}') -print('OK') -PY -``` - -Report step complete and upload test cases: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 
2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') -post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..." -post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion." -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -qa_dir = os.path.join(proj_root, 'autonoma/qa-tests') -test_cases = [] -for root, dirs, files in os.walk(qa_dir): - for f in files: - if f.endswith('.md') and f != 'INDEX.md': - path = os.path.join(root, f) - folder = os.path.relpath(root, qa_dir) - with open(path) as fh: - content = fh.read() - entry = {'name': f, 'content': content} - if folder != '.': - entry['folder'] = folder - test_cases.append(entry) -print(json.dumps({'testCases': test_cases})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` - -4. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app." - - options: ["Yes, proceed to Step 5", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 5. 
- -## Step 5: Scenario Validation - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start." -post_setup_log "Validating planned scenarios against the live SDK endpoint..." -``` - -Spawn the `scenario-validator` subagent with the following task: - -> Read `autonoma/discover.json` and `autonoma/scenarios.md`. -> Validate the planned scenarios against the existing live SDK endpoint without editing backend code. -> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`, -> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact -> `autonoma/.scenario-validation.json`, and run: -> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json` -> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs. - -**After the subagent completes:** -1. Rehydrate SDK env from Step 1 artifacts -2. Verify `autonoma/.scenario-validation.json` exists and is non-empty -3. Validate `autonoma/.scenario-validation.json` -4. Require `status == "ok"` and `preflightPassed == true` -5. Verify `autonoma/scenario-recipes.json` exists and is non-empty -6. Run the preflight helper if the subagent did not already do so -7. If preflight fails, stop and report the failure without attempting code changes -8. 
Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues - -Run and enforce preflight: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1." -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \ - || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact." -python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state." -import json -import sys - -payload = json.load(open(sys.argv[1])) -if payload.get("status") != "ok": - raise SystemExit(f'status must be "ok", got {payload.get("status")!r}') -if payload.get("preflightPassed") is not True: - raise SystemExit('preflightPassed must be true before Step 5 can upload recipes') -print('OK') -PY -[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \ - || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact." -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \ - || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5." -``` - -Report step complete and upload scenario recipes: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_log "Uploading validated scenario recipes to setup..." 
-if [ -n "$GENERATION_ID" ]; then - RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" - if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then - report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete." - fi - UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @"$RECIPE_PATH") - UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) - UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') - echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" - if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then - report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." - fi - - VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}") - VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) - VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d') - if [ "$VERIFY_STATUS" != "200" ]; then - report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)." - fi -fi -post_setup_log "Scenario validation completed." -post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion." -cleanup_dev_server -``` +Spawn `scenario-generator`: + +> Read the knowledge base and `autonoma/entity-audit.md`. Generate test data scenarios. Write +> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types, +> variable_fields, planning_sections). 
Mark values as variable only when they must vary across +> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural +> per-run isolation). Design entity tables so they serialise as nested trees rooted at the +> scope entity. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`. + +After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`. + +## Step 4: Implement Environment Factory + +Spawn `env-factory-generator`: + +> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure +> the handler. Register a factory for every model with `independently_created: true` (call the audit's +> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback +> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity +> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. Do NOT run `up`/`down` +> — that is step 5. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"` +> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`. +> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names. + +After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary, +`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`. + +## Step 5: Validate Scenario Lifecycle + +Spawn `scenario-validator`: + +> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4. +> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5 +> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails +> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. 
On +> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at +> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe +> per scenario), run `preflight_scenario_recipes.py` against it, and write +> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write` +> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and +> report — do NOT write the sentinel. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`. +> Verify: every audited model appears in `discover.schema.models`, every `independently_created` +> model has a registered factory, `auth` is non-empty, DB state is correct before and after +> `down`, and preflight exits 0. + +After completion: +1. If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON + AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`: + enforce and upload the recipes to the dashboard, then ack. + + ```bash + AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}" + VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json" + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + + # Enforce terminal artifact contract + python3 - "$VALIDATION_ARTIFACT" <<'PY' + import json, sys + payload = json.load(open(sys.argv[1])) + if payload.get("status") != "ok": + raise SystemExit("status must be ok before Step 5 can upload recipes") + if payload.get("preflightPassed") is not True: + raise SystemExit("preflightPassed must be true before Step 5 can upload recipes") + PY + + [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty"; exit 1; } + python3 -c "import json; json.load(open('$RECIPE_PATH'))" \ + || { echo "scenario-recipes.json is not valid JSON"; exit 1; } + + # Re-run preflight at the orchestrator level for belt-and-suspenders safety. 
+ python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \ + || { echo "Preflight failed at orchestrator gate"; exit 1; } + + # Upload to dashboard + GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id") + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \ + "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." >&2 + exit 1 + fi + ``` + + Then present validation summary (scenarios passed, any edits made to `scenarios.md`, + recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`. + +2. If any of those artifacts are missing/invalid: the agent failed — surface the failure + report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook + will also block test file writes. + +## Step 6: Generate E2E Test Cases + +Spawn `test-case-generator`: + +> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (the latter has +> been reconciled with reality in step 5 — use it as the source of truth). Parse the +> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any +> variable value (typed, asserted, or navigated to), never the hardcoded literal. +> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests +> that "audit" seeded counts or fixture existence. +> Generate test cases in `autonoma/qa-tests/`.
Write `autonoma/qa-tests/INDEX.md` with +> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test +> file needs frontmatter (title, description, criticality, scenario, flow). +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`. + +After completion: +1. Verify `autonoma/qa-tests/INDEX.md` exists +2. Present INDEX summary +3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed` + for the final step, marking the setup complete. ## Completion -After all steps complete, summarize: -- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available -- **Step 2**: knowledge base location and core flow count -- **Step 3**: scenario count and entity types covered -- **Step 4**: total test count, folder breakdown, coverage correlation -- **Step 5**: scenario validation results, smoke-test status, and recipe upload status - -If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID. -For terminal failures after later steps begin, clean up the dev server before returning control to the user. 
+Summarize each step: +- **Step 1**: KB location, core flows +- **Step 2**: entity audit — factories vs raw SQL +- **Step 3**: scenarios generated +- **Step 4**: endpoint implemented (handler path, packages, factories registered) +- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any) +- **Step 6**: test count, folder breakdown diff --git a/hooks/hooks.json b/hooks/hooks.json index d694b5d..310a20c 100644 --- a/hooks/hooks.json +++ b/hooks/hooks.json @@ -1,8 +1,28 @@ { "hooks": { + "UserPromptSubmit": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pipeline-kickoff.sh" + } + ] + } + ], + "PreToolUse": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pretool-heartbeat.sh" + } + ] + } + ], "PostToolUse": [ { - "matcher": "Write", + "matcher": "Write|Edit", "hooks": [ { "type": "command", diff --git a/hooks/pipeline-kickoff.sh b/hooks/pipeline-kickoff.sh new file mode 100755 index 0000000..29425b7 --- /dev/null +++ b/hooks/pipeline-kickoff.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# UserPromptSubmit hook. Fires on every user prompt, early-exits unless: +# 1. The prompt invokes the generate-tests skill/command, AND +# 2. The pipeline has not already been kicked off (no autonoma/.generation-id). +# +# When both conditions hold, this script owns pipeline startup so the agent +# never has to remember to do it: +# - verifies required env vars (hard-fails if AUTONOMA_DOCS_URL is unset) +# - creates autonoma/ output dirs +# - writes autonoma/.docs-url +# - POSTs /v1/setup/setups to create the generation record +# - writes autonoma/.generation-id +# - emits step.started for step 0 +# +# Exit 0 always (best-effort reporting must never block test generation). 
+ +set -u + +INPUT=$(cat) + +PROMPT=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('prompt',''))" 2>/dev/null || echo '') + +# Match either the slash command or a direct mention of the skill name +case "$PROMPT" in + */generate-tests*|*generate-tests*) ;; + *) exit 0 ;; +esac + +# Idempotency: if we've already kicked off this project's pipeline, nothing to do. +if [ -s autonoma/.generation-id ]; then + exit 0 +fi + +# Hard-require AUTONOMA_DOCS_URL — the plugin refuses to guess a docs URL. +if [ -z "${AUTONOMA_DOCS_URL:-}" ]; then + echo "[autonoma pipeline-kickoff] ERROR: AUTONOMA_DOCS_URL is not set." >&2 + echo "[autonoma pipeline-kickoff] Re-launch Claude using the onboarding command from the Autonoma dashboard (it exports AUTONOMA_DOCS_URL), or export it manually before running /generate-tests." >&2 + exit 0 +fi + +mkdir -p autonoma/skills autonoma/qa-tests +echo "$AUTONOMA_DOCS_URL" > autonoma/.docs-url + +# Nothing below this line should ever fail hard — we must not block the agent. +if [ -z "${AUTONOMA_API_URL:-}" ] || [ -z "${AUTONOMA_API_KEY:-}" ] || [ -z "${AUTONOMA_PROJECT_ID:-}" ]; then + echo "[autonoma pipeline-kickoff] WARN: AUTONOMA_API_URL/AUTONOMA_API_KEY/AUTONOMA_PROJECT_ID not all set. Skipping dashboard reporting." >&2 + exit 0 +fi + +# Derive a human-readable app name from the project dir (best-effort). +APP_NAME=$(basename "$(pwd)") + +RESPONSE=$(curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}" 2>/dev/null || echo '{}') + +GENERATION_ID=$(echo "$RESPONSE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') + +if [ -z "$GENERATION_ID" ]; then + echo "[autonoma pipeline-kickoff] WARN: setup creation returned no id. Dashboard will not reflect this run." 
>&2 + exit 0 +fi + +echo "$GENERATION_ID" > autonoma/.generation-id +echo "[autonoma pipeline-kickoff] Pipeline kickoff complete. generation_id=${GENERATION_ID}" >&2 + +curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' >/dev/null 2>&1 || true + +touch autonoma/.step-0-started + +# --------------------------------------------------------------------------- +# Launch the transcript streamer as a detached background daemon. It tails +# the session JSONL and forwards assistant text/thinking/tool-use/tool-result +# events to /v1/setup/setups/{id}/events so the dashboard can render a live +# activity log. Best-effort, never blocks. +# --------------------------------------------------------------------------- +TRANSCRIPT_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '') + +if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + STREAMER_PID_FILE="autonoma/.streamer.pid" + STREAMER_LOG="autonoma/.streamer.log" + STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py" + + # If a prior streamer is still alive (e.g. from a previous session in this + # project dir), replace it — the transcript path has changed. + if [ -s "$STREAMER_PID_FILE" ]; then + existing_pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null || echo '') + if [ -n "$existing_pid" ] && kill -0 "$existing_pid" 2>/dev/null; then + kill "$existing_pid" 2>/dev/null || true + fi + fi + + if [ -f "$STREAMER_SCRIPT" ]; then + nohup python3 "$STREAMER_SCRIPT" \ + "$TRANSCRIPT_PATH" \ + "$GENERATION_ID" \ + "$AUTONOMA_API_URL" \ + "$AUTONOMA_API_KEY" \ + >> "$STREAMER_LOG" 2>&1 </dev/null & + STREAMER_PID=$! + echo "$STREAMER_PID" > "$STREAMER_PID_FILE" + disown "$STREAMER_PID" 2>/dev/null || true + echo "[autonoma pipeline-kickoff] Transcript streamer started. 
pid=${STREAMER_PID} transcript=${TRANSCRIPT_PATH}" >&2 + fi +fi + +exit 0 diff --git a/hooks/pretool-heartbeat.sh b/hooks/pretool-heartbeat.sh new file mode 100755 index 0000000..7dd4bf2 --- /dev/null +++ b/hooks/pretool-heartbeat.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Emits a lightweight "activity" event for every tool call so the dashboard +# can show Claude is still alive. Best-effort — failures never block the +# pipeline. Only fires when a generation is active (autonoma/.generation-id +# exists) and the Autonoma API is reachable. + +set -u + +INPUT=$(cat) + +# Guard: only fire during an active generation. +GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '') +[ -z "$GENERATION_ID" ] && exit 0 +[ -z "${AUTONOMA_API_URL:-}" ] && exit 0 +[ -z "${AUTONOMA_API_KEY:-}" ] && exit 0 + +# --------------------------------------------------------------------------- +# Streamer liveness check + auto-revive. If the transcript streamer daemon +# has died (crash, OS restart, etc.) re-launch it so the dashboard keeps +# receiving events. kill -0 is nearly free when the process is alive. +# Skipped when the plugin's streamer.py is missing (e.g. older plugin cache). +# --------------------------------------------------------------------------- +STREAMER_PID_FILE="autonoma/.streamer.pid" +STREAMER_LOG="autonoma/.streamer.log" +STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py" + +streamer_alive() { + [ -s "$STREAMER_PID_FILE" ] || return 1 + local pid + pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null) + [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null +} + +if ! 
streamer_alive && [ -f "$STREAMER_SCRIPT" ]; then + TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '') + if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then + nohup python3 "$STREAMER_SCRIPT" \ + "$TRANSCRIPT_PATH" \ + "$GENERATION_ID" \ + "$AUTONOMA_API_URL" \ + "$AUTONOMA_API_KEY" \ + >> "$STREAMER_LOG" 2>&1 </dev/null & + NEW_PID=$! + echo "$NEW_PID" > "$STREAMER_PID_FILE" + disown "$NEW_PID" 2>/dev/null || true + echo "[$(date +%H:%M:%S)] streamer revived by pretool-heartbeat pid=$NEW_PID transcript=$TRANSCRIPT_PATH" >> "$STREAMER_LOG" + fi +fi + +# Build the payload: tool name + a short preview of the most informative arg. +# Heavy args (full file contents from Write/Edit) are never forwarded. +PAYLOAD=$(printf '%s' "$INPUT" | python3 -c " +import json, sys +try: + data = json.load(sys.stdin) +except Exception: + sys.exit(0) +tool = data.get('tool_name') or '' +if not tool: + sys.exit(0) +inp = data.get('tool_input') or {} +# Pick the first informative string field; never forward large blobs. +preview = '' +for key in ('command', 'description', 'file_path', 'pattern', 'path', 'query', 'prompt', 'url'): + v = inp.get(key) + if isinstance(v, str) and v.strip(): + preview = v.replace('\n', ' ').strip()[:200] + break +print(json.dumps({'type': 'activity', 'data': {'tool': tool, 'preview': preview}})) +" 2>/dev/null) + +[ -z "$PAYLOAD" ] && exit 0 + +# Short timeout — the hook runs before every tool call, never block the session. 
+curl --max-time 2 -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" >/dev/null 2>&1 || true + +exit 0 diff --git a/hooks/transcript-streamer.py b/hooks/transcript-streamer.py new file mode 100755 index 0000000..be496ca --- /dev/null +++ b/hooks/transcript-streamer.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""Streams Claude Code session transcript events to the Autonoma dashboard. + +Spawned as a detached background process by pipeline-kickoff.sh when a +/generate-tests run starts. Tails the session JSONL as Claude appends to it, +extracts assistant text + thinking + tool calls + tool results, and POSTs +each as a `transcript` event to /v1/setup/setups/{id}/events so the dashboard +can render a live activity log. + +Self-terminates after IDLE_SECONDS of no new transcript data. Safe to kill +at any time — the daemon is stateless and holds no locks. + +Usage: + python3 transcript-streamer.py <transcript_path> <generation_id> <api_url> <api_key> +""" + +from __future__ import annotations + +import json +import os +import sys +import time +import urllib.error +import urllib.request +from pathlib import Path + +POLL_INTERVAL = 0.75 +IDLE_SECONDS = 1800 # 30 min with no new lines → daemon exits +MAX_TEXT_CHARS = 4000 +MAX_PREVIEW_CHARS = 500 +HTTP_TIMEOUT = 2.0 + + +def main() -> None: + if len(sys.argv) != 5: + sys.exit(2) + transcript_path, generation_id, api_url, api_key = sys.argv[1:5] + if not all([transcript_path, generation_id, api_url, api_key]): + sys.exit(0) + + path = Path(transcript_path) + # Start at end of file. Anything written before this daemon launched was + # already visible in the terminal before the dashboard existed — don't + # replay it. 
+ last_size = path.stat().st_size if path.exists() else 0 + idle = 0.0 + log(f"streamer up transcript={transcript_path} generation_id={generation_id} api_url={api_url} start_offset={last_size}") + + while idle < IDLE_SECONDS: + if not path.exists(): + time.sleep(POLL_INTERVAL) + idle += POLL_INTERVAL + continue + + size = path.stat().st_size + if size < last_size: + # File was rotated/truncated — reset. + last_size = 0 + if size == last_size: + time.sleep(POLL_INTERVAL) + idle += POLL_INTERVAL + continue + + idle = 0.0 + with path.open("r", encoding="utf-8", errors="replace") as fh: + fh.seek(last_size) + for line in fh: + line = line.strip() + if not line: + continue + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + payload = extract_event(entry) + if payload is not None: + forward(payload, generation_id, api_url, api_key) + last_size = fh.tell() + + +def extract_event(entry: dict) -> dict | None: + """Turn a transcript line into a dashboard event, or None to skip.""" + etype = entry.get("type") + is_sidechain = bool(entry.get("isSidechain", False)) + uuid = entry.get("uuid") + + if etype == "assistant": + msg = entry.get("message") or {} + content = msg.get("content") or [] + texts: list[str] = [] + tool_uses: list[dict] = [] + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + t = (block.get("text") or "").strip() + if t: + texts.append(t) + elif btype == "thinking": + t = (block.get("thinking") or "").strip() + if t: + texts.append(f"[thinking] {t}") + elif btype == "tool_use": + tool_uses.append({ + "name": block.get("name") or "unknown", + "input_preview": _preview(block.get("input") or {}), + }) + if not texts and not tool_uses: + return None + data: dict = {"role": "assistant", "is_sidechain": is_sidechain} + if uuid: + data["uuid"] = uuid + if texts: + data["text"] = "\n".join(texts)[:MAX_TEXT_CHARS] + if tool_uses: + data["tool_uses"] = tool_uses + return 
{"type": "transcript", "data": data} + + if etype == "user": + msg = entry.get("message") or {} + content = msg.get("content") + # Tool results arrive as user messages whose content is a list of + # tool_result blocks. Raw text user messages (the original prompt) + # are skipped — they're already visible to the dashboard. + if not isinstance(content, list): + return None + results: list[dict] = [] + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") != "tool_result": + continue + body = _flatten_tool_result(block.get("content")) + entry_out: dict = {"is_error": bool(block.get("is_error"))} + if body: + entry_out["preview"] = body[:MAX_PREVIEW_CHARS] + results.append(entry_out) + if not results: + return None + data = {"role": "tool_result", "is_sidechain": is_sidechain, "results": results} + if uuid: + data["uuid"] = uuid + return {"type": "transcript", "data": data} + + return None + + +def _flatten_tool_result(raw) -> str: + if isinstance(raw, str): + return raw + if isinstance(raw, list): + parts: list[str] = [] + for c in raw: + if isinstance(c, dict) and c.get("type") == "text": + parts.append(c.get("text", "")) + elif isinstance(c, str): + parts.append(c) + return "\n".join(parts) + return "" + + +def _preview(obj) -> str: + try: + s = json.dumps(obj, default=str, ensure_ascii=False) + except Exception: + s = str(obj) + return s[:MAX_PREVIEW_CHARS] + + +def forward(payload: dict, generation_id: str, api_url: str, api_key: str) -> None: + url = f"{api_url.rstrip('/')}/v1/setup/setups/{generation_id}/events" + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + method="POST", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + ) + try: + with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp: + resp.read() + log(f"POST {resp.status} {payload.get('type')} {_summarize(payload)}") + except urllib.error.HTTPError as e: + body = "" 
+ try: + body = e.read().decode("utf-8", errors="replace")[:300] + except Exception: + pass + log(f"POST {e.code} {payload.get('type')} body={body}") + except (urllib.error.URLError, TimeoutError, ConnectionError) as e: + log(f"POST network-error {payload.get('type')} err={e!r}") + except Exception as e: + log(f"POST unknown-error {payload.get('type')} err={e!r}") + + +def _summarize(payload: dict) -> str: + data = payload.get("data") or {} + role = data.get("role") + if role == "assistant": + snippet = (data.get("text") or "").replace("\n", " ")[:80] + tools = ",".join(t.get("name", "?") for t in data.get("tool_uses") or []) + return f"role=assistant text={snippet!r} tools=[{tools}]" + if role == "tool_result": + return f"role=tool_result n_results={len(data.get('results') or [])}" + return "" + + +def log(msg: str) -> None: + # Emit to stderr which is redirected to autonoma/.streamer.log by the kickoff hook. + try: + print(f"[{time.strftime('%H:%M:%S')}] {msg}", file=sys.stderr, flush=True) + except Exception: + pass + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + pass + except Exception: + # Daemon must never propagate — swallow and exit clean so nothing + # surfaces in the user's terminal. + pass diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh index ba71260..071d7d7 100755 --- a/hooks/validate-pipeline-output.sh +++ b/hooks/validate-pipeline-output.sh @@ -1,70 +1,264 @@ #!/bin/bash -# Validates pipeline output files after Write tool use. +# Validates pipeline output files after Write tool use and emits lifecycle +# events + artifact uploads to the Autonoma dashboard on successful artifact +# production. All backend reporting lives here so the agent can never forget. 
+# +# Exit 0 = allow (file is valid or not a pipeline file) # Exit 2 = block and send error message to Claude +set -u + INPUT=$(cat) -# Extract the file path from the tool input FILE_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tool_input',{}).get('file_path',''))" 2>/dev/null) if [ -z "$FILE_PATH" ]; then exit 0 fi -# Resolve the validators directory relative to this script +# ---------------------------------------------------------------------------- +# Lifecycle emission helpers +# ---------------------------------------------------------------------------- +_reporting_ready() { + local generation_id + generation_id=$(cat autonoma/.generation-id 2>/dev/null || echo '') + [ -n "$generation_id" ] && [ -n "${AUTONOMA_API_URL:-}" ] && [ -n "${AUTONOMA_API_KEY:-}" ] +} + +# emit_step_event <step> <action> [<name>] — idempotent via marker. +emit_step_event() { + local step="$1" + local action="$2" + local name="${3:-}" + local marker="autonoma/.step-${step}-${action}" + + [ -f "$marker" ] && return 0 + mkdir -p autonoma 2>/dev/null || true + touch "$marker" + + _reporting_ready || return 0 + local generation_id + generation_id=$(cat autonoma/.generation-id) + + local payload + if [ -n "$name" ]; then + payload=$(printf '{"type":"step.%s","data":{"step":%s,"name":"%s"}}' "$action" "$step" "$name") + else + payload=$(printf '{"type":"step.%s","data":{"step":%s}}' "$action" "$step") + fi + + curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/events" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "$payload" >/dev/null 2>&1 || true +} + +# upload_skills — bundle autonoma/skills/*.md and POST to /artifacts. Idempotent. 
+upload_skills() { + local marker="autonoma/.skills-uploaded" + [ -f "$marker" ] && return 0 + _reporting_ready || return 0 + [ -d autonoma/skills ] || return 0 + + local generation_id + generation_id=$(cat autonoma/.generation-id) + + python3 -c " +import os, json +skills = [] +d = 'autonoma/skills' +if os.path.isdir(d): + for f in sorted(os.listdir(d)): + if f.endswith('.md'): + with open(os.path.join(d, f)) as fh: + skills.append({'name': f, 'content': fh.read()}) +print(json.dumps({'skills': skills})) +" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- >/dev/null 2>&1 || true + + touch "$marker" +} + +# upload_test_cases — bundle autonoma/qa-tests/**/*.md (except INDEX) and POST. Idempotent. +upload_test_cases() { + local marker="autonoma/.test-cases-uploaded" + [ -f "$marker" ] && return 0 + _reporting_ready || return 0 + [ -d autonoma/qa-tests ] || return 0 + + local generation_id + generation_id=$(cat autonoma/.generation-id) + + python3 -c " +import os, json +test_cases = [] +for root, dirs, files in os.walk('autonoma/qa-tests'): + for f in sorted(files): + if f.endswith('.md') and f != 'INDEX.md': + path = os.path.join(root, f) + folder = os.path.relpath(root, 'autonoma/qa-tests') + with open(path) as fh: + content = fh.read() + entry = {'name': f, 'content': content} + if folder != '.': + entry['folder'] = folder + test_cases.append(entry) +print(json.dumps({'testCases': test_cases})) +" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/artifacts" \ + -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ + -H "Content-Type: application/json" \ + -d @- >/dev/null 2>&1 || true + + touch "$marker" +} + +# ---------------------------------------------------------------------------- +# Sentinel files: no validation, just event emission. 
+# - autonoma/.endpoint-implemented — env-factory agent writes this after the +# discover smoke test + factory-integrity check pass; signals step 3 complete. +# - autonoma/.endpoint-validated — scenario-validator writes this after the full +# up/down lifecycle passes for every scenario; signals step 4 complete AND +# unlocks the gate that allows qa-tests/*.md to be written. +# - autonoma/.step-<N>-ack — orchestrator writes this AFTER the user has +# confirmed via AskUserQuestion; this is the *only* path that emits +# step.started for step N. The UI can therefore show "waiting for +# confirmation" in the gap between step.completed (N-1) and step.started N. +# ---------------------------------------------------------------------------- +STEP_NAMES=("Knowledge Base" "Entity Audit" "Scenarios" "Implement" "Validate" "E2E Tests") + +case "$FILE_PATH" in + */autonoma/.endpoint-implemented) + # Hook-level factory-integrity gate. The env-factory agent's self-policed + # check has proven insufficient — see the post-mortem in the plugin repo. + # This validator parses autonoma/entity-audit.md, opens the handler named + # in the sentinel body, and blocks the write when any factory for an + # independently_created: true model contains an inline ORM write. + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + # Gate 1 — cheap syntactic checks (grep, mount, audit-flip cap). + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_endpoint_implemented.py" "$FILE_PATH" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 2 — creation_file immutability (catches the audit-rewrite attack + # without needing an LLM call). Cheap, fast, deterministic. + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_creation_file_immutable.py" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 3 — semantic per-model fidelity via claude -p fan-out. Reads the + # rubric from the docs URL at runtime (updatable without plugin changes). 
+ # Blocks on hard failures; transient errors + missing config are + # warning-only so a broken docs endpoint does not freeze the pipeline. + if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_factory_fidelity.py" "$FILE_PATH" 2>&1); then + printf '%s\n' "$OUTPUT" >&2 + exit 2 + fi + # Gate 3 prints progress to stderr even on success; surface it so the + # user sees the validator actually ran. + printf '%s\n' "$OUTPUT" >&2 + emit_step_event 3 completed "Implement" + exit 0 + ;; + */autonoma/.endpoint-validated) + emit_step_event 4 completed "Validate" + exit 0 + ;; + */autonoma/.pipeline-complete) + emit_step_event 5 completed "E2E Tests" + exit 0 + ;; + */autonoma/.step-*-ack) + ack_num=$(basename "$FILE_PATH" | sed -E 's/^\.step-([0-9]+)-ack$/\1/') + if [[ "$ack_num" =~ ^[0-9]+$ ]] && [ "$ack_num" -ge 0 ] && [ "$ack_num" -lt ${#STEP_NAMES[@]} ]; then + emit_step_event "$ack_num" started "${STEP_NAMES[$ack_num]}" + fi + # Snapshot entity-audit.md the moment the user confirms the audit is + # accepted (step-2-ack = "Scenarios starting", which fires AFTER the user + # approves the Entity Audit). This snapshot is diffed against the current + # audit at .endpoint-implemented time to detect the env-factory agent + # gaming the factory-integrity check by mass-flipping independently_created + # true -> false. See the post-mortem in the plugin repo. + if [ "$ack_num" = "2" ] && [ -f "autonoma/entity-audit.md" ] && [ ! -f "autonoma/.entity-audit-step2.md" ]; then + cp autonoma/entity-audit.md autonoma/.entity-audit-step2.md 2>/dev/null || true + fi + exit 0 + ;; +esac + +# ---------------------------------------------------------------------------- +# Validation gate: test files (INDEX.md or any qa-tests/*.md) cannot be written +# until the scenario-validator writes autonoma/.endpoint-validated. This +# prevents step 6 from generating tests against an unproven endpoint. 
+# ---------------------------------------------------------------------------- +case "$FILE_PATH" in + */autonoma/qa-tests/INDEX.md|*/autonoma/qa-tests/*.md) + if [ ! -f "autonoma/.endpoint-validated" ]; then + echo "VALIDATION GATE: Cannot write $FILE_PATH — autonoma/.endpoint-validated is missing. Complete Step 5 (scenario-validator) first. The validator must run discover/up/down against every scenario and write the sentinel before test generation is allowed." >&2 + exit 2 + fi + ;; +esac + +# ---------------------------------------------------------------------------- +# Validation routing +# ---------------------------------------------------------------------------- SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" VALIDATORS_DIR="$SCRIPT_DIR/validators" -# Persist the plugin root so orchestrator/subagent bash snippets can find plugin-local scripts. -# This hook is the earliest reliable place where we know the plugin directory. -PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -echo "$PLUGIN_ROOT" > /tmp/autonoma-plugin-root - -# Ensure PyYAML is available (required for frontmatter parsing) python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null -# Only validate pipeline output files +STEP_COMPLETED="" +STEP_COMPLETED_NAME="" +STEP_STARTED="" +STEP_STARTED_NAME="" +POST_UPLOAD="" + case "$FILE_PATH" in */autonoma/AUTONOMA.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_kb.py" VALIDATOR_NAME="validate-kb" - ;; - */autonoma/discover.json) - VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_discover.py" - VALIDATOR_NAME="validate-discover" - ;; - */autonoma/.sdk-endpoint) - VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_endpoint.py" - VALIDATOR_NAME="validate-sdk-endpoint" - ;; - */autonoma/.sdk-integration.json) - VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_integration.py" - VALIDATOR_NAME="validate-sdk-integration" + STEP_COMPLETED=0 + STEP_COMPLETED_NAME="Knowledge Base" + STEP_STARTED=1 + STEP_STARTED_NAME="Entity Audit" + POST_UPLOAD="skills" ;; 
*/autonoma/features.json) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_features.py" VALIDATOR_NAME="validate-features" ;; + */autonoma/entity-audit.md) + VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_entity_audit.py" + VALIDATOR_NAME="validate-entity-audit" + STEP_COMPLETED=1 + STEP_COMPLETED_NAME="Entity Audit" + STEP_STARTED=2 + STEP_STARTED_NAME="Scenarios" + ;; */autonoma/scenarios.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenarios.py" VALIDATOR_NAME="validate-scenarios" - ;; - */autonoma/.scenario-validation.json) - VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py" - VALIDATOR_NAME="validate-scenario-validation" + STEP_COMPLETED=2 + STEP_COMPLETED_NAME="Scenarios" + STEP_STARTED=3 + STEP_STARTED_NAME="Implement" ;; */autonoma/scenario-recipes.json) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_recipes.py" VALIDATOR_NAME="validate-scenario-recipes" ;; + */autonoma/.scenario-validation.json) + VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py" + VALIDATOR_NAME="validate-scenario-validation" + ;; */autonoma/qa-tests/INDEX.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py" VALIDATOR_NAME="validate-test-index" - ;; - */autonoma/qa-tests/*/INDEX.md) - VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py" - VALIDATOR_NAME="validate-adhoc-test-index" + STEP_COMPLETED=5 + STEP_COMPLETED_NAME="E2E Tests" + POST_UPLOAD="test_cases" ;; */autonoma/qa-tests/*/[!I]*.md) VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py" @@ -75,25 +269,21 @@ case "$FILE_PATH" in ;; esac -# Check file exists if [ ! -f "$FILE_PATH" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: File does not exist: $FILE_PATH" >&2 exit 2 fi -# Check file is non-empty if [ ! -s "$FILE_PATH" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: File is empty: $FILE_PATH" >&2 exit 2 fi -# Check validator script exists if [ ! 
-f "$VALIDATOR_SCRIPT" ]; then echo "VALIDATION FAILED [$VALIDATOR_NAME]: Validator script not found: $VALIDATOR_SCRIPT" >&2 exit 2 fi -# Run the validator RESULT=$(python3 "$VALIDATOR_SCRIPT" "$FILE_PATH" 2>&1) EXIT_CODE=$? @@ -102,26 +292,6 @@ if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then exit 2 fi -# scenario-recipes.json must also pass live endpoint preflight. This is the -# only deterministic check that the generated create payload actually works -# against the current SDK contract. -if [ "$VALIDATOR_NAME" = "validate-scenario-recipes" ]; then - PREFLIGHT_SCRIPT="$SCRIPT_DIR/preflight_scenario_recipes.py" - if [ ! -f "$PREFLIGHT_SCRIPT" ]; then - echo "VALIDATION FAILED [scenario-recipes-preflight]: Script not found: $PREFLIGHT_SCRIPT" >&2 - exit 2 - fi - - PREFLIGHT_RESULT=$(python3 "$PREFLIGHT_SCRIPT" "$FILE_PATH" 2>&1) - PREFLIGHT_EXIT=$? - if [ $PREFLIGHT_EXIT -ne 0 ]; then - echo "VALIDATION FAILED [scenario-recipes-preflight]: $PREFLIGHT_RESULT" >&2 - exit 2 - fi -fi - -# For root INDEX.md only, also validate directory structure -# (subfolder INDEX.md from adhoc runs uses validate-adhoc-test-index and skips this check) if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py" DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1) @@ -132,4 +302,17 @@ if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then fi fi +# Validation passed — emit lifecycle events and upload artifacts. +# Note: step.started for the NEXT step is NOT emitted here. It fires only when +# the orchestrator writes autonoma/.step-<N>-ack after the user confirms via +# AskUserQuestion. That gap gives the UI its "waiting for confirmation" banner.
def load_audit(path: Path) -> dict[str, dict]:
    """Return ``{model_name: normalized_entry}`` read from the audit's YAML frontmatter.

    The file must start with a ``---`` line-delimited frontmatter block whose
    ``models`` key holds a list of mapping entries. Every entry that has a
    ``name`` (or ``model``) key is passed through ``_normalize`` and stored
    under ``str(name)``.

    Returns an empty dict, never raising, when the audit is missing or
    malformed: the path is not a regular file, the bytes are not valid UTF-8,
    the frontmatter delimiters are absent, the YAML does not parse, the
    frontmatter is not a mapping, or ``models`` is not a list.
    """
    # is_file() rather than exists(): a directory at this path counts as
    # "missing" instead of blowing up inside read_text().
    if not path.is_file():
        return {}
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return {}
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # Empty frontmatter parses to None and a bare scalar/list to a non-dict;
    # neither has .get(), so treat both as malformed.
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        return {}
    out: dict[str, dict] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        out[str(name)] = _normalize(entry)
    return out


def _normalize(entry: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of ``entry`` with ``independently_created`` and ``created_by`` set.

    - If ``independently_created`` is already present the entry is v2 and the
      value is kept as-is.
    - Otherwise the v1 ``has_creation_code`` flag is coerced to bool and used.
    - A missing or ``None`` ``created_by`` becomes ``[]``.

    The input mapping is not modified.
    """
    out = dict(entry)
    if "independently_created" not in out:
        out["independently_created"] = bool(out.get("has_creation_code"))
    if out.get("created_by") is None:
        out["created_by"] = []
    return out


def is_independently_created(entry: dict[str, Any]) -> bool:
    """Return True when the model has its own standalone creation path.

    Works on both normalized and raw entries: the v2 ``independently_created``
    key wins when present, otherwise the v1 ``has_creation_code`` flag is used.
    """
    if "independently_created" in entry:
        return bool(entry["independently_created"])
    return bool(entry.get("has_creation_code"))
+ +## Fixture schema + +```json +{ + "model": "", + "expected_verdict": "pass" | "fail", + "expected_fail_criteria": [1, 2, 3, 4], + "step2_audit_entry": "", + "current_audit_entry": "", + "handler_path": "", + "factory_block": "", + "helper_section": "File: \\nFunction: \\n\\n```\\n\\n```", + "original_creation_file": "", + "original_creation_snippet": "" +} +``` + +Keep fixtures generic — placeholder names (`UserService`, `src/users/...`) only, +no references to real Autonoma-internal codebases. The rubric itself is generic; +evals that leak specific names would mask rubric bias. + +## When to add a fixture + +- New failure mode observed in the wild → add a `bad_*.json` that captures it + with the smallest reproduction, and confirm the current rubric catches it. +- Rubric edit → run the full suite against the new rubric. A fixture flipping + verdict is a signal that the criteria are ambiguous; tighten the wording. diff --git a/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json new file mode 100644 index 0000000..de57863 --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json @@ -0,0 +1,12 @@ +{ + "model": "Session", + "expected_verdict": "fail", + "expected_fail_criteria": [3], + "step2_audit_entry": "- name: Session\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.createSession\n side_effects:\n - Signs session token\n - Records session in audit log\n", + "current_audit_entry": "- name: Session\n has_creation_code: true\n creation_file: src/routes/autonoma/autonoma-factories.ts\n creation_function: createSession\n side_effects:\n - Signs session token\n - Records session in audit log\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "Session: defineFactory({\n async create(data, ctx) {\n return createSession(ctx.executor, data);\n },\n}),", + "helper_section": "File: 
src/routes/autonoma/autonoma-factories.ts\nFunction: createSession\n\n```\n// Thin wrapper around buildAuth.createSession — preserves signing + audit.\nexport async function createSession(db, data) {\n const auth = buildAuth(db);\n return auth.createSession(data);\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = (db) => betterAuth({\n database: prismaAdapter(db),\n createSession: async (data) => {\n const token = signSessionToken(data);\n const session = await db.session.create({ data: { ...data, token } });\n await auditLog.record('session.created', { sessionId: session.id });\n return session;\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/bad_missing_owner.json b/hooks/validators/evals/fixtures/bad_missing_owner.json new file mode 100644 index 0000000..69e20aa --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_missing_owner.json @@ -0,0 +1,7 @@ +{ + "kind": "audit_validator", + "note": "Dependent whose created_by owner doesn't exist in the audit. The audit VALIDATOR (not the fidelity validator) must reject. 
This fixture is asserted via subprocess against validate_entity_audit.py.", + "audit_frontmatter": "model_count: 2\nfactory_count: 1\nmodels:\n - name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects: []\n created_by: []\n - name: Branch\n independently_created: false\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Minted inline — but Application is not in this audit.\"\n", + "expected_exit": 1, + "expected_stderr_substring": "owner='Application' does not match any model" +} diff --git a/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json new file mode 100644 index 0000000..9eb5f41 --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 2], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return ctx.executor.user.create({ data });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input, deps) {\n const hashed = await 
bcrypt.hash(input.password, 10);\n const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n await ensureOrgMembership(user, deps);\n await analytics.capture('user_signed_up', { userId: user.id });\n return user;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json new file mode 100644 index 0000000..82c3daf --- /dev/null +++ b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 2, 4], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.databaseHooks.user.create\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n - Fires signup webhook\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/auth.ts\n creation_function: buildAuth.databaseHooks.user.create\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n - Fires signup webhook\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return createUser(ctx.executor, data);\n },\n}),", + "helper_section": "File: src/routes/autonoma/autonoma-factories.ts\nFunction: createUser\n\n```\n// better-auth's internal adapter does the same thing — no business logic\n// beyond the raw insert.\nexport async function createUser(db, data) {\n return db.user.create({ data, select: { id: true } });\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = (db) => 
betterAuth({\n database: prismaAdapter(db),\n databaseHooks: {\n user: {\n create: async (user) => {\n const created = await db.user.create({ data: user });\n await ensureOrgMembership(created, { db });\n await ensureBillingProvisioning(created, { db });\n await analytics.capture('user_signed_up', { userId: created.id });\n await fireSignupWebhook(created);\n return created;\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/dependent_skipped.json b/hooks/validators/evals/fixtures/dependent_skipped.json new file mode 100644 index 0000000..1e131cd --- /dev/null +++ b/hooks/validators/evals/fixtures/dependent_skipped.json @@ -0,0 +1,7 @@ +{ + "kind": "audit_filter", + "note": "Pure dependent (independently_created:false) must be silently skipped by the fidelity validator — no factory, no claude -p call. This fixture is evaluated by checking validate_factory_fidelity's model list, not by calling the LLM.", + "model": "BranchDeployment", + "expected_verdict": "skip", + "step2_audit_entry": "- name: BranchDeployment\n independently_created: false\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Minted inside the Application transaction so the default branch has a deployment row wired up from the start.\"\n" +} diff --git a/hooks/validators/evals/fixtures/dual_judged_on_standalone.json b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json new file mode 100644 index 0000000..569a5cd --- /dev/null +++ b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json @@ -0,0 +1,13 @@ +{ + "note": "Dual model (independently_created:true AND in someone's created_by). 
Must be judged ONLY on its standalone factory; the via-owner relationship must not affect the verdict.", + "model": "Branch", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: Branch\n independently_created: true\n creation_file: src/branches/branch.service.ts\n creation_function: BranchService.create\n side_effects:\n - Writes a default BranchSettings row\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n", + "current_audit_entry": "- name: Branch\n independently_created: true\n creation_file: src/branches/branch.service.ts\n creation_function: BranchService.create\n side_effects:\n - Writes a default BranchSettings row\n created_by:\n - owner: Application\n via: ApplicationsService.createApplication\n why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "Branch: defineFactory({\n async create(data, ctx) {\n return BranchService.create(data, { executor: ctx.executor });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/branches/branch.service.ts", + "original_creation_snippet": "export const BranchService = {\n async create(input, deps) {\n const branch = await deps.executor.branch.create({ data: input });\n await deps.executor.branchSettings.create({ data: { branchId: branch.id, theme: 'default' } });\n return branch;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json new file mode 100644 index 0000000..73934fb --- /dev/null +++ b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + 
"expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return createUserWithOnboarding(db, data, { signupHooks, platformEvents });\n },\n}),", + "helper_section": "File: src/auth/create-user-with-onboarding.ts\nFunction: createUserWithOnboarding\n\n```\nexport async function createUserWithOnboarding(db, data, { signupHooks, platformEvents }) {\n const user = await db.user.create({ data: { name: data.name, email: data.email } });\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n return user;\n}\n```", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = () => betterAuth({\n databaseHooks: {\n user: {\n create: {\n after: async (user) => {\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n },\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json new file mode 100644 index 0000000..f3eccef --- /dev/null +++ 
b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "fail", + "expected_fail_criteria": [1, 4], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/auth/auth.ts\n creation_function: betterAuth.databaseHooks.user.create\n needs_extraction: true\n extracted_to: src/auth/create-user-with-onboarding.ts\n side_effects:\n - ensureOrgMembership\n - signupHooks.run\n - platformEvents.emit\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return db.user.create({ data: { name: data.name, email: data.email } });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/auth/auth.ts", + "original_creation_snippet": "export const buildAuth = () => betterAuth({\n databaseHooks: {\n user: {\n create: {\n after: async (user) => {\n await ensureOrgMembership(db, user.id);\n await signupHooks.run(user);\n await platformEvents.emit('user_signed_up', { userId: user.id });\n },\n },\n },\n },\n});" +} diff --git a/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json new file mode 100644 index 0000000..b2c2078 --- /dev/null +++ b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: 
src/auth/create-user.ts\n creation_function: createUser\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/auth/create-user.ts\n creation_function: createUser\n extracted_to: src/auth/create-user.ts\n side_effects:\n - Calls ensureOrgMembership (creates Organization + Member)\n - Calls ensureBillingProvisioning (creates BillingCustomer)\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return createUser(data, { db: ctx.executor, analytics, billing });\n },\n}),", + "helper_section": "File: src/auth/create-user.ts\nFunction: createUser\n\n```\n// Extracted from the databaseHooks.user.create closure for Environment\n// Factory reuse (preserves Org + Member + billing provisioning).\nexport async function createUser(input, deps) {\n const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n await ensureOrgMembership(user, deps);\n await ensureBillingProvisioning(user, deps);\n await deps.analytics.capture('user_signed_up', { userId: user.id });\n return user;\n}\n```", + "original_creation_file": "src/auth/create-user.ts", + "original_creation_snippet": "export async function createUser(input, deps) {\n const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n await ensureOrgMembership(user, deps);\n await ensureBillingProvisioning(user, deps);\n await deps.analytics.capture('user_signed_up', { userId: user.id });\n return user;\n}" +} diff --git a/hooks/validators/evals/fixtures/good_uses_service.json b/hooks/validators/evals/fixtures/good_uses_service.json new file mode 100644 index 0000000..86684bd --- /dev/null +++ 
b/hooks/validators/evals/fixtures/good_uses_service.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "pass", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "current_audit_entry": "- name: User\n has_creation_code: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password via bcrypt\n - Creates sibling Organization + Member rows\n - Emits user_signed_up analytics event\n", + "handler_path": "src/routes/autonoma/autonoma.handler.ts", + "factory_block": "User: defineFactory({\n async create(data, ctx) {\n return UserService.create(data, { executor: ctx.executor });\n },\n}),", + "helper_section": "(The factory does not call an external helper.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input, deps) {\n const hashed = await bcrypt.hash(input.password, 10);\n const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n await ensureOrgMembership(user, deps);\n await analytics.capture('user_signed_up', { userId: user.id });\n return user;\n },\n};" +} diff --git a/hooks/validators/evals/fixtures/helper_unresolvable_errors.json b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json new file mode 100644 index 0000000..552741a --- /dev/null +++ b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json @@ -0,0 +1,12 @@ +{ + "model": "User", + "expected_verdict": "error", + "expected_fail_criteria": [], + "step2_audit_entry": "- name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password\n - 
Provisions Org + Member\n created_by: []\n", + "current_audit_entry": "- name: User\n independently_created: true\n creation_file: src/users/user.service.ts\n creation_function: UserService.create\n side_effects:\n - Hashes password\n - Provisions Org + Member\n created_by: []\n", + "handler_path": "src/autonoma/handler.ts", + "factory_block": "User: defineFactory({\n create: async (data) => {\n return createUserWithMystery(data);\n },\n}),", + "helper_section": "(The factory calls identifiers that were not resolvable as named imports: createUserWithMystery. Treat this as missing-context, not as evidence of a raw-write factory.)", + "original_creation_file": "src/users/user.service.ts", + "original_creation_snippet": "export const UserService = {\n async create(input) {\n return db.user.create({ data: input });\n },\n};" +} diff --git a/hooks/validators/evals/run_evals.py b/hooks/validators/evals/run_evals.py new file mode 100755 index 0000000..fc695ed --- /dev/null +++ b/hooks/validators/evals/run_evals.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +"""Evals for the semantic factory-fidelity validator + the entity-audit +validator's schema invariants. + +Each fixture is a self-contained JSON blob. The kind of fixture is chosen by +`expected_verdict` (or by the `kind` field for non-LLM fixtures): + +- `expected_verdict: "pass" | "fail" | "error"` — LLM fixture. Feeds the + prompt to `claude -p`, parses the JSON verdict, and asserts verdict + + failing criteria match. `error` is used when a fixture deliberately + withholds context (e.g. helper unresolvable) and the LLM should decline + to fail-judge rather than falsely fail. +- `expected_verdict: "skip"` — filter fixture. Asserts that the fidelity + validator's model selector would NOT include this model (i.e. the audit + entry is pure dependent / legacy false). No LLM call, no cost. +- `kind: "audit_validator"` — audit-validator fixture. 
def load_fixture(path: Path) -> dict:
    """Parse one fixture file and return the decoded JSON object.

    Read as UTF-8 explicitly: fixtures contain non-ASCII punctuation, and the
    locale default is not UTF-8 on every platform.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def render_prompt(fixture: dict, rubric: str, tpl: str) -> str:
    """Fill the prompt template ``tpl`` from ``fixture`` and ``rubric``.

    Placeholders are substituted one after another in the order listed below,
    so text inserted by an earlier substitution is still visible to later
    ones. ``model``, ``step2_audit_entry``, ``current_audit_entry`` and
    ``factory_block`` are required keys (``KeyError`` when absent); the other
    fields fall back to the defaults shown.
    """
    substitutions = [
        ("{{RUBRIC}}", rubric),
        ("{{MODEL}}", fixture["model"]),
        ("{{STEP2_AUDIT_ENTRY}}", fixture["step2_audit_entry"]),
        ("{{CURRENT_AUDIT_ENTRY}}", fixture["current_audit_entry"]),
        ("{{HANDLER_PATH}}", fixture.get("handler_path", "(fixture)")),
        ("{{FACTORY_BLOCK}}", fixture["factory_block"]),
        ("{{HELPER_SECTION}}", fixture.get("helper_section", "(The factory does not call an external helper.)")),
        ("{{ORIGINAL_CREATION_FILE}}", fixture.get("original_creation_file", "(unknown)")),
        ("{{ORIGINAL_CREATION_SNIPPET}}", fixture.get("original_creation_snippet", "")),
    ]
    prompt = tpl
    for placeholder, value in substitutions:
        prompt = prompt.replace(placeholder, value)
    return prompt


def run_skip_fixture(fixture: dict) -> tuple[bool, str]:
    """Check that a ``skip`` fixture's audit entry is not independently created.

    Parses ``step2_audit_entry`` as a YAML list, takes its first entry, and
    succeeds only when ``is_independently_created`` returns False for it.
    Returns ``(ok, detail)``; ``detail`` is ``"ok"`` on success, otherwise the
    reason for the failure.
    """
    import yaml
    try:
        parsed = yaml.safe_load(fixture["step2_audit_entry"])
    except yaml.YAMLError as e:
        return False, f"could not parse step2_audit_entry: {e}"
    if not isinstance(parsed, list) or not parsed or not isinstance(parsed[0], dict):
        return False, "step2_audit_entry must be a single-entry YAML list"
    entry = parsed[0]
    if is_independently_created(entry):
        return False, (
            f"fidelity validator would NOT skip this model — is_independently_created "
            f"returned True for entry {entry!r}"
        )
    return True, "ok"


def run_audit_validator_fixture(fixture: dict) -> tuple[bool, str]:
    """Run ``validate_entity_audit.py`` on an audit synthesised from the fixture.

    Writes ``audit_frontmatter`` between ``---`` delimiters into a temporary
    ``entity-audit.md``, runs the validator on it as a subprocess, and compares
    the exit code with ``expected_exit`` (default 1). When
    ``expected_stderr_substring`` is non-empty it must occur in the combined
    stdout + stderr. Returns ``(ok, detail)``.
    """
    fm = fixture["audit_frontmatter"]
    # Without a trailing newline the closing delimiter would be glued onto the
    # last YAML line and the frontmatter would never terminate.
    if not fm.endswith("\n"):
        fm += "\n"
    expected_exit = int(fixture.get("expected_exit", 1))
    expected_substr = fixture.get("expected_stderr_substring", "")
    timeout_s = 30
    with tempfile.TemporaryDirectory() as td:
        audit = Path(td) / "entity-audit.md"
        audit.write_text("---\n" + fm + "---\nBody\n", encoding="utf-8")
        try:
            proc = subprocess.run(
                [sys.executable, str(VALIDATORS / "validate_entity_audit.py"), str(audit)],
                capture_output=True, text=True, timeout=timeout_s,
            )
        except subprocess.TimeoutExpired:
            # Report a hung validator as one failed fixture instead of letting
            # the exception abort the whole eval run.
            return False, f"validate_entity_audit.py did not finish within {timeout_s}s"
    if proc.returncode != expected_exit:
        return False, (
            f"exit mismatch: expected={expected_exit} observed={proc.returncode} "
            f"stdout={proc.stdout!r} stderr={proc.stderr!r}"
        )
    combined = (proc.stdout or "") + (proc.stderr or "")
    if expected_substr and expected_substr not in combined:
        return False, f"expected stderr substring {expected_substr!r} not in output:\n{combined}"
    return True, "ok"
if f.stem == args.only] + if not fixtures: + print(f"no fixture named {args.only}", file=sys.stderr) + return 1 + + # Only fetch rubric if we have any LLM fixtures left in the run list + needs_llm = any( + load_fixture(fp).get("expected_verdict") in ("pass", "fail", "error") + for fp in fixtures + ) + rubric = tpl = None + try: + if needs_llm: + pair = v.fetch_rubric() + if not pair: + print("could not fetch rubric — set AUTONOMA_DOCS_URL", file=sys.stderr) + return 1 + rubric, tpl = pair + finally: + if restore: + url_file.write_text(prior or "") + elif docs: + try: + url_file.unlink() + except OSError: + pass + + fails: list[str] = [] + for fp in fixtures: + fixture = load_fixture(fp) + kind = fixture.get("kind") + expected = fixture.get("expected_verdict") + + if kind == "audit_validator": + ok, detail = run_audit_validator_fixture(fixture) + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: audit_validator") + if not ok: + print(f" reason: {detail}") + fails.append(fp.stem) + continue + + if expected == "skip": + ok, detail = run_skip_fixture(fixture) + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: expected=skip observed={'skip' if ok else 'NOT-skipped'}") + if not ok: + print(f" reason: {detail}") + fails.append(fp.stem) + continue + + # LLM fixture + if args.write_prompt: + print(f"── {fp.stem} ──") + print(render_prompt(fixture, rubric, tpl)) + print() + continue + verdict = v.run_claude(render_prompt(fixture, rubric, tpl)) + observed = verdict.get("verdict", "error") + matched = observed == expected + detail_ok = True + if expected == "fail" and observed == "fail": + expected_fails = set(fixture.get("expected_fail_criteria") or []) + if expected_fails: + observed_fails = {c.get("id") for c in (verdict.get("criteria") or []) if c.get("status") == "fail"} + missing = expected_fails - observed_fails + if missing: + detail_ok = False + ok = matched and detail_ok + tag = "PASS" if ok else "FAIL" + print(f"{tag} {fp.stem}: expected={expected} 
observed={observed}") + if not ok: + print(f" reason: expected criteria={fixture.get('expected_fail_criteria')} observed={[c for c in (verdict.get('criteria') or [])]}") + print(f" fix_hint: {verdict.get('fix_hint','')}") + fails.append(fp.stem) + + if fails: + print(f"\n{len(fails)} eval failure(s): {', '.join(fails)}", file=sys.stderr) + return 1 + print("\nall evals passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hooks/validators/validate_creation_file_immutable.py b/hooks/validators/validate_creation_file_immutable.py new file mode 100755 index 0000000..3bfdf87 --- /dev/null +++ b/hooks/validators/validate_creation_file_immutable.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Validator: `creation_file` must be immutable after Step 2. + +Rationale — Run 4 post-mortem. The env-factory agent evaded the factory +integrity hook by (a) extracting stubs into a new file under the handler's +directory and (b) rewriting `creation_file` in the audit to point at the stub, +so every downstream check validated against fabricated ground truth. + +Rule: for every model with `has_creation_code: true` in BOTH the Step 2 +snapshot AND the current audit, the `creation_file` column must not change. +Allowed transitions: + - row removed from current (not a change, model dropped) + - has_creation_code flipped true -> false (covered by the audit-flip cap) + - a new model added in current (snapshot has no row to compare) + +Exit 0 = clean. Exit 2 with actionable message on violation. 
def load_audit(path: Path) -> dict[str, dict]:
    """Load an entity-audit markdown file into ``{model_name: entry}``.

    The file must begin with ``---`` and contain a later line starting with
    ``---``; the text between them is parsed as YAML and its ``models`` list
    is indexed by each entry's ``name`` (or ``model``) key.

    This loader is deliberately forgiving: a missing file, missing or
    unterminated frontmatter, invalid YAML, frontmatter that is not a
    mapping (``yaml.safe_load`` yields ``None`` for an empty document), or a
    ``models`` value that is not a list all produce an empty dict rather
    than an exception.

    Args:
        path: Location of the audit file.

    Returns:
        Mapping of model name to its raw audit entry; empty when nothing
        usable could be read.
    """
    if not path.exists():
        return {}
    text = path.read_text()
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # Empty or scalar/sequence frontmatter has no .get(); treat as "no audit".
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        return {}
    out: dict[str, dict] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        out[str(name)] = entry
    return out
Overwriting it " + "to point at a file the factory agent created is the audit-rewrite " + "attack from the Run 4 post-mortem — it makes every downstream check " + "validate against fabricated ground truth.", + "", + "Violations (model: snapshot_path -> current_path):", + ] + for name, s, c in violations[:40]: + lines.append(f" - {name}: {s} -> {c}") + if len(violations) > 40: + lines.append(f" ... and {len(violations) - 40} more") + lines.append("") + lines.append( + "To fix: restore the original `creation_file` values from " + "autonoma/.entity-audit-step2.md. If you extracted the creation code " + "into a new helper, record that in an `extracted_to:` field — do NOT " + "overwrite `creation_file`. The audit's creation_file must continue " + "to name the file where the real business logic originally lives." + ) + sys.stderr.write("\n".join(lines) + "\n") + sys.exit(2) + + +if __name__ == "__main__": + main() diff --git a/hooks/validators/validate_endpoint_implemented.py b/hooks/validators/validate_endpoint_implemented.py new file mode 100755 index 0000000..4e27ac4 --- /dev/null +++ b/hooks/validators/validate_endpoint_implemented.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python3 +"""Validator for autonoma/.endpoint-implemented. + +Blocks the sentinel write when the handler file contains an inline ORM write +inside a defineFactory({ create }) body for a model the entity audit marked +has_creation_code: true. This is the #1 bug the env-factory agent ships and +the agent's self-policing factory-integrity check has proven insufficient. + +Inputs: path to .endpoint-implemented (via validate-pipeline-output.sh). 
+Reads: + - autonoma/entity-audit.md (frontmatter: models with has_creation_code true/false) + - the handler file path recorded in .endpoint-implemented body (first match of "handler: ") + +Exit codes: + 0 — clean + 2 — anti-pattern found; prints a Claude-facing error message on stderr + +The regex set mirrors the language list in agents/env-factory-generator.md's +"The one thing you MUST NOT do" section. Raw SQL literal INSERTs are not +matched here because distinguishing them from teardown DELETE strings in the +same factory block requires full parsing — the grep-level anti-pattern +detection catches the >95% case. +""" + +from __future__ import annotations + +import os +import re +import sys +from pathlib import Path + +import yaml # type: ignore + +SENTINEL_PATH = sys.argv[1] if len(sys.argv) > 1 else "" + +# Max number of models allowed to flip from has_creation_code: true to false +# between the Step 2 snapshot and the audit at .endpoint-implemented time. +# Overridable via env for unusual migrations; default 5 matches the agent's +# own recommendation in the third-run post-mortem. +AUDIT_FLIP_CAP = int(os.environ.get("AUTONOMA_AUDIT_FLIP_CAP", "5")) + +# Standalone server patterns: when the handler directory contains a file that +# starts its own HTTP server instead of exporting a router mounted on the main +# app, we block. This is the second bug from the third-run post-mortem. +STANDALONE_SERVER_PATTERNS = [ + re.compile(r"\bserve\s*\(\s*\{[^}]*\bfetch\b", re.DOTALL), # @hono/node-server + re.compile(r"\bapp\.listen\s*\("), # express / hono-node + re.compile(r"\bhttp\.createServer\s*\("), # raw node + re.compile(r"\buvicorn\.run\s*\("), # python + re.compile(r"\bFlask\s*\([^)]*\)[^\n]*\.run\s*\("), # flask + re.compile(r"\brun!\s*$", re.MULTILINE), # ruby sinatra-ish +] + +# Anti-pattern: ORM create/insert/upsert calls that almost certainly belong to +# a raw ORM write rather than a service/repository method call. 
def find_matching_brace(src: str, open_idx: int) -> int:
    """Return the index of the `}` that balances the `{` at ``open_idx``.

    This is a plain nesting counter: braces inside string literals or
    comments are counted like any other, which is acceptable for generated
    handler files that follow the standard shape.

    Args:
        src: Source text to scan.
        open_idx: Index at which scanning starts (expected to be a `{`).

    Returns:
        Index of the closing brace that brings the nesting level back to
        zero, or -1 when the text ends first.
    """
    nesting = 0
    for pos in range(open_idx, len(src)):
        ch = src[pos]
        if ch == "{":
            nesting += 1
        elif ch == "}":
            nesting -= 1
            if nesting == 0:
                return pos
    return -1
def parse_audit() -> dict[str, bool]:
    """Return ``{model_name: creates_independently}`` from the entity audit.

    Reads the YAML frontmatter of ``autonoma/entity-audit.md`` (relative to
    the current working directory). Both audit schemas are understood:

    - v2: the ``independently_created`` key, when present, decides.
    - v1 (legacy): falls back to ``has_creation_code``.

    Reading only the legacy key would classify every model of a v2 audit as
    False, and callers would then skip every factory check.

    Any structural problem (missing file, missing or unterminated
    frontmatter, invalid YAML, frontmatter that is not a mapping) is
    reported through ``fail``, which writes to stderr and exits with
    status 2.

    Returns:
        Mapping of model name to a boolean classification. Entries that are
        not mappings or that lack a ``name``/``model`` key are ignored.
    """
    audit_path = Path("autonoma/entity-audit.md")
    if not audit_path.exists():
        fail("Missing autonoma/entity-audit.md — cannot verify factory integrity.")
    text = audit_path.read_text()
    if not text.startswith("---"):
        fail("autonoma/entity-audit.md missing YAML frontmatter.")
    end = text.find("\n---", 3)
    if end < 0:
        fail("autonoma/entity-audit.md frontmatter not terminated.")
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError as e:
        fail(f"autonoma/entity-audit.md frontmatter not valid YAML: {e}")
    # yaml.safe_load yields None for an empty document and a list/scalar for
    # non-mapping documents; neither has .get().
    if not isinstance(fm, dict):
        fail("autonoma/entity-audit.md frontmatter must be a YAML mapping.")
    models = fm.get("models") or []
    out: dict[str, bool] = {}
    for entry in models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not name:
            continue
        if "independently_created" in entry:
            flag = entry["independently_created"]
        else:
            flag = entry.get("has_creation_code")
        out[str(name)] = bool(flag)
    return out
def check_audit_flip() -> list[str]:
    """Compare the Step 2 snapshot to the current audit; return error lines.

    Enforces a cap on how many models may flip from has_creation_code: true
    to false between Step 2 ack and .endpoint-implemented. If no snapshot
    exists (older projects that started before this hook shipped) we skip
    silently — the snapshot is created automatically on .step-2-ack.

    A model counts as "true" when its ``independently_created`` key (v2
    schema) is truthy, or, when that key is absent, when the legacy
    ``has_creation_code`` key is truthy. Reading only the legacy key would
    make both sets empty for v2 audits and the cap could never trigger.

    Returns:
        An empty list when the number of flipped models is within
        ``AUDIT_FLIP_CAP`` (or either file is missing); otherwise the lines
        of an explanatory error message.
    """
    snapshot = Path("autonoma/.entity-audit-step2.md")
    current = Path("autonoma/entity-audit.md")
    if not snapshot.exists() or not current.exists():
        return []

    def _true_set(path: Path) -> set[str]:
        # Names of the models the audit at `path` marks as having their own
        # creation code. Unreadable or malformed frontmatter -> empty set.
        text = path.read_text()
        if not text.startswith("---"):
            return set()
        end = text.find("\n---", 3)
        if end < 0:
            return set()
        try:
            fm = yaml.safe_load(text[3:end])
        except yaml.YAMLError:
            return set()
        if not isinstance(fm, dict):
            # Empty or non-mapping frontmatter has no .get().
            return set()
        out: set[str] = set()
        for entry in (fm.get("models") or []):
            if not isinstance(entry, dict):
                continue
            name = entry.get("name") or entry.get("model")
            if "independently_created" in entry:
                flag = entry["independently_created"]
            else:
                flag = entry.get("has_creation_code")
            if name and bool(flag):
                out.add(str(name))
        return out

    before = _true_set(snapshot)
    after = _true_set(current)
    flipped = sorted(before - after)
    if len(flipped) <= AUDIT_FLIP_CAP:
        return []

    lines = [
        f"AUDIT FLIP CAP EXCEEDED — {len(flipped)} models flipped from "
        f"has_creation_code: true to false since Step 2 (cap: {AUDIT_FLIP_CAP}).",
        "",
        "The env-factory agent is editing ground truth to dodge the factory "
        "integrity check. Branch 3 (\"audit is factually wrong\") is for cases "
        "where the audit's creation_function does NOT exist or creates NOTHING "
        "— not for cases where calling it is inconvenient (complex DI, external "
        "side effects, Temporal workflows, bulk orchestrators). Those are "
        "Branch 2 problems: extract helpers, wire constructor deps, or guard "
        "external calls in the service itself.",
        "",
        "Models flipped (showing first 40):",
    ]
    for name in flipped[:40]:
        lines.append(f" - {name}")
    if len(flipped) > 40:
        lines.append(f" ... and {len(flipped) - 40} more")
    lines.append("")
    lines.append(
        "To proceed: (a) restore has_creation_code: true for the models above "
        "and write real factories per the Per-model decision tree, or (b) if "
        "you truly believe a subset should flip, ask the user to raise "
        "AUTONOMA_AUDIT_FLIP_CAP and confirm the diff."
    )
    return lines
We use the + # last two path segments (parent-dir/file-stem) to avoid false positives + # from unrelated packages that happen to share the parent-dir name (e.g. + # `@autonoma/logger` vs the local `autonoma/handler`). + handler_basename = handler_path.stem # e.g. "handler" + handler_parent_dir = handler_dir.name # e.g. "autonoma" + specific_fragment = f"{handler_parent_dir}/{handler_basename}" # "autonoma/handler" + # Also accept any file in the same parent directory (routes on the router + # file next to handler.ts still count as mounting — e.g. autonoma/router.ts + # is imported by app.ts and imports handler.ts). + import_patterns = [ + re.compile(rf"['\"][^'\"]*{re.escape(specific_fragment)}(?:['\"]|\.[a-z]+['\"])"), + re.compile(rf"\bfrom\s+[\w.]*{re.escape(handler_parent_dir)}\.{re.escape(handler_basename)}\b"), # python + ] + found_import = False + root = Path.cwd() + # Only scan source dirs with reasonable extensions. + source_exts = {".ts", ".tsx", ".js", ".mjs", ".cjs", ".py", ".rb", ".go", ".rs", ".java", ".ex", ".exs", ".php"} + skip_dirs = {"node_modules", ".git", "dist", "build", ".next", ".turbo", "target", "vendor", "__pycache__", "autonoma"} + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")] + for fn in filenames: + if not any(fn.endswith(ext) for ext in source_exts): + continue + fp = Path(dirpath) / fn + if fp.resolve() == handler_path.resolve(): + continue + if fp.parent.resolve() == handler_path.parent.resolve(): + # Don't count imports inside the handler's own directory — the + # standalone server.ts imports handler.ts but that isn't + # "reachable from the main app". 
+ continue + try: + text = fp.read_text() + except OSError: + continue + for pat in import_patterns: + if pat.search(text): + found_import = True + break + if found_import: + break + if found_import: + break + + if not found_import: + errors.append( + f"HANDLER NOT MOUNTED — no file outside {handler_dir} imports the " + f"Autonoma handler. The endpoint is unreachable from the main " + f"application's routes." + ) + errors.append("") + errors.append( + "Fix: import the handler (or its router) from the main app's entry " + "file (e.g. apps/api/src/app.ts) and mount it on a route. The " + "Autonoma platform sends HMAC-signed requests to the main API's " + "public URL — a handler that nothing imports is dead code." + ) + errors.append("") + + return errors + + +def main() -> None: + audit = parse_audit() + handler_path = resolve_handler_path() + src = handler_path.read_text() + + violations: list[tuple[str, int, str]] = [] + factories = extract_factory_bodies(src) + + seen_models: set[str] = set() + for model, factory_src in factories: + seen_models.add(model) + if not audit.get(model): + # has_creation_code: false or unknown — ORM fallback is legitimate. + continue + create_body = extract_create_body(factory_src) + if not create_body: + continue + for m in ORM_ANTI_PATTERN.finditer(create_body): + line_no = create_body[: m.start()].count("\n") + 1 + snippet = create_body.splitlines()[line_no - 1].strip() + violations.append((model, line_no, snippet)) + for m in DRIZZLE_INSERT.finditer(create_body): + line_no = create_body[: m.start()].count("\n") + 1 + snippet = create_body.splitlines()[line_no - 1].strip() + violations.append((model, line_no, snippet)) + + # Flag audited models missing a factory entirely. 
+ missing_factories = [ + name for name, has_code in audit.items() if has_code and name not in seen_models + ] + + audit_flip_errors = check_audit_flip() + mount_errors = check_handler_mount(handler_path) + + if not violations and not missing_factories and not audit_flip_errors and not mount_errors: + sys.exit(0) + + lines = [ + "FACTORY INTEGRITY CHECK FAILED — .endpoint-implemented will NOT be written.", + "", + f"Handler inspected: {handler_path}", + "", + ] + if violations: + lines.append( + "The following factories contain inline ORM writes for models the audit " + "marked has_creation_code: true. This is the #1 trap the env-factory " + "agent is warned about. You MUST call the audited creation_function " + "(extracting it first if needs_extraction: true). See the Per-model " + "decision tree and DI playbook in the env-factory prompt." + ) + lines.append("") + for model, line_no, snippet in violations: + lines.append(f" - {model} factory body: line {line_no}: {snippet}") + lines.append("") + if missing_factories: + lines.append( + "The following models are has_creation_code: true in the audit but have " + "no defineFactory registration in the handler:" + ) + for name in missing_factories: + lines.append(f" - {name}") + lines.append("") + if audit_flip_errors: + lines.extend(audit_flip_errors) + if mount_errors: + lines.extend(mount_errors) + if violations or missing_factories: + lines.append( + "To fix: re-run the Per-model decision tree for every failing model. If the " + "creation function is inline in a route/framework hook, extract it into a " + "named exported function, update entity-audit.md in place (clear " + "needs_extraction), then call the new function from the factory." 
+ ) + fail("\n".join(lines)) + + +if __name__ == "__main__": + main() diff --git a/hooks/validators/validate_entity_audit.py b/hooks/validators/validate_entity_audit.py new file mode 100644 index 0000000..ee65369 --- /dev/null +++ b/hooks/validators/validate_entity_audit.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""Validates entity-audit.md frontmatter format. + +Supports two schemas: + +- v2 (current): each model has `independently_created: bool` and + `created_by: [{owner, via, why}]`. When `independently_created: true` the + entry must also have `creation_file`, `creation_function`, and optionally + `side_effects`. Dependents (`independently_created: false`) must have a + non-empty `created_by` pointing at a model that exists in the audit. + +- v1 (legacy): each model has `has_creation_code: bool`. We still accept it + and translate on read (see _audit_schema.py). v1 audits cannot express + `created_by`, so the dependent-has-owner invariant is vacuously satisfied. +""" +import sys +import yaml +from pathlib import Path + +filepath = sys.argv[1] +content = open(filepath).read() + +if not content.startswith('---'): + print('File must start with YAML frontmatter (---)') + sys.exit(1) + +parts = content.split('---', 2) +if len(parts) < 3: + print('Missing closing --- for frontmatter') + sys.exit(1) + +try: + fm = yaml.safe_load(parts[1]) +except Exception as e: + print(f'Invalid YAML in frontmatter: {e}') + sys.exit(1) + +if not isinstance(fm, dict): + print('Frontmatter must be a YAML mapping') + sys.exit(1) + +required = ['model_count', 'factory_count', 'models'] +missing = [f for f in required if f not in fm] +if missing: + print(f'Missing required frontmatter fields: {missing}') + sys.exit(1) + +for count_field in ['model_count', 'factory_count']: + val = fm.get(count_field) + if not isinstance(val, int) or val < 0: + print(f'{count_field} must be a non-negative integer') + sys.exit(1) + +if fm['model_count'] < 1: + print('model_count must be at least 1 — no 
def is_indep(model):
    """Classify a model entry under either audit schema.

    The v2 key ``independently_created`` takes precedence whenever it is
    present (even if falsy); otherwise the legacy v1 key
    ``has_creation_code`` is consulted. The chosen value is coerced to bool.
    """
    key = 'independently_created' if 'independently_created' in model else 'has_creation_code'
    return bool(model.get(key))
'side_effects' in model and not isinstance(model['side_effects'], list): + print(f'models[{i}] ({name}) side_effects must be a list when present') + sys.exit(1) + + # created_by invariants (v2 only — v1 has no such field) + cb = model.get('created_by') + if cb is None: + # v1 audits don't have it; v2 requires it (empty allowed for roots) + if has_v2: + print(f'models[{i}] ({name}) missing required field: created_by (list, may be empty)') + sys.exit(1) + continue + + if not isinstance(cb, list): + print(f'models[{i}] ({name}).created_by must be a list') + sys.exit(1) + + if not indep and len(cb) == 0: + print( + f'models[{i}] ({name}) is marked independently_created=false but has no ' + 'created_by entries. Every dependent must have at least one owner — ' + 'either find the creation path, or mark the model independently_created=true.' + ) + sys.exit(1) + + for j, owner_entry in enumerate(cb): + if not isinstance(owner_entry, dict): + print(f'models[{i}] ({name}).created_by[{j}] must be a mapping') + sys.exit(1) + for req in ('owner', 'via', 'why'): + val = owner_entry.get(req) + if not isinstance(val, str) or not val.strip(): + print( + f'models[{i}] ({name}).created_by[{j}].{req} must be a non-empty string' + ) + sys.exit(1) + if owner_entry['owner'] not in names: + print( + f'models[{i}] ({name}).created_by[{j}].owner={owner_entry["owner"]!r} ' + f'does not match any model in the audit. Check the owner name or add the owner model.' + ) + sys.exit(1) + if owner_entry['owner'] == name: + print(f'models[{i}] ({name}).created_by[{j}].owner cannot be the model itself') + sys.exit(1) + +if factory_count != fm['factory_count']: + # Autofix the count instead of blocking. Count-drift is bookkeeping, not a + # structural bug — the previous behaviour made the agent oscillate between + # stale counts on every edit. Warn loudly but keep the pipeline moving. 
+ import sys as _sys + _sys.stderr.write( + f'[validate-entity-audit] autofixing factory_count: was ' + f'{fm["factory_count"]}, now {factory_count}\n' + ) + # Rewrite the file in place, preserving the body. + fm['factory_count'] = factory_count + new_fm = yaml.safe_dump(fm, sort_keys=False).rstrip() + "\n" + rewritten = '---\n' + new_fm + '---' + parts[2] + Path(filepath).write_text(rewritten) + +print('OK') diff --git a/hooks/validators/validate_factory_fidelity.py b/hooks/validators/validate_factory_fidelity.py new file mode 100755 index 0000000..c4f03da --- /dev/null +++ b/hooks/validators/validate_factory_fidelity.py @@ -0,0 +1,585 @@ +#!/usr/bin/env python3 +"""Validator: semantic per-model factory fidelity using claude -p. + +Rationale — Run 4 post-mortem. Heuristic hooks have been bypassed three runs +in a row. The agent found factorings that satisfy every regex while still +producing bare-insert stubs. Only a model that can read the diff between the +Step 2 snapshot and the current code can tell a faithful extraction apart +from a stub. + +How it works: + 1. Fetch the factory-fidelity rubric + prompt template from + $(cat autonoma/.docs-url)/llms/test-planner/factory-fidelity-rubric.txt + 2. Load the Step 2 audit snapshot (ground truth) and the current audit. + 3. For every model with independently_created: true in the snapshot, build a + prompt with: Step 2 entry, current entry, factory block, helper (if + imported), original creation_function snippet. + 4. Run `claude -p --output-format json ""` in parallel (bounded + concurrency). Each subprocess inherits the parent's model/provider + config via env. + 5. Parse JSON verdicts. If any fail, block the sentinel and return the + compiled feedback to the env-factory agent. + +Exit 0 = all verdicts pass (or no models to check). +Exit 2 = one or more verdicts failed; stderr contains the feedback the + agent should use to self-correct. 
def load_audit(path: Path) -> dict[str, dict]:
    """Index an entity-audit file's ``models`` entries by model name.

    The YAML frontmatter is the text between the leading ``---`` and the
    next line that starts with ``---``. Each mapping in its ``models`` list
    is keyed by its ``name`` (or ``model``) value.

    Nothing here raises for a bad audit: a missing file, absent or
    unterminated frontmatter, invalid YAML, frontmatter that is not a
    mapping (``yaml.safe_load`` yields ``None`` for an empty document), or a
    non-list ``models`` value all yield an empty dict.

    Args:
        path: Audit file to read.

    Returns:
        ``{model_name: raw_entry}``; empty when the audit is unusable.
    """
    if not path.exists():
        return {}
    text = path.read_text()
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}
    try:
        fm = yaml.safe_load(text[3:end])
    except yaml.YAMLError:
        return {}
    # Guard before .get(): empty/scalar/sequence frontmatter is not a mapping.
    if not isinstance(fm, dict):
        return {}
    models = fm.get("models")
    if not isinstance(models, list):
        return {}
    out: dict[str, dict] = {}
    for entry in models:
        if isinstance(entry, dict):
            name = entry.get("name") or entry.get("model")
            if name:
                out[str(name)] = entry
    return out
rubric from {url}: {e} — skipping.") + return None + # Split at "## Prompt template" + parts = content.split("## Prompt template", 1) + if len(parts) != 2: + warn("rubric page is missing '## Prompt template' section — skipping.") + return None + rubric_md = parts[0] + # The prompt template lives between explicit HTML-comment delimiters to + # avoid clashing with the inner ``` fences the template itself contains. + tpl_match = re.search( + r"\s*\n(.*?)\n", + parts[1], + re.DOTALL, + ) + if not tpl_match: + warn("rubric page missing / markers — skipping.") + return None + return rubric_md.strip(), tpl_match.group(1) + + +def resolve_handler_path(sentinel_path: str) -> Optional[Path]: + body = Path(sentinel_path).read_text() + m = re.search(r"handler(?:_path)?:\s*(\S+)", body, re.IGNORECASE) + candidates: list[str] = [] + if m: + candidates.append(m.group(1).rstrip(".,;:")) + for tok in re.findall(r"[\w./\\-]+\.(?:ts|tsx|js|mjs|cjs|py|rb|php|java|go|rs|ex|exs)", body): + candidates.append(tok.rstrip(".,;:")) + for cand in candidates: + p = Path(cand) + if not p.is_absolute(): + p = Path.cwd() / cand + if p.is_file(): + return p + return None + + +def find_factory_block(handler_src: str, model: str) -> str: + header = re.search(rf"\b{re.escape(model)}\s*:\s*defineFactory\s*\(\s*\{{", handler_src) + if not header: + return "" + brace = handler_src.find("{", header.end() - 1) + if brace < 0: + return "" + depth = 0 + i = brace + n = len(handler_src) + while i < n: + c = handler_src[i] + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + start = handler_src.rfind("\n", 0, header.start()) + 1 + return handler_src[start : i + 1] + i += 1 + return "" + + +def _load_tsconfig_paths(cwd: Path) -> list[tuple[str, list[str]]]: + """Best-effort parse of tsconfig.json compilerOptions.paths for alias + resolution. Walks up a few ancestors so apps/api/ monorepos pick up the + root tsconfig. 
Silently returns [] on any parse error.""" + roots: list[Path] = [cwd] + cur = cwd + for _ in range(4): + cur = cur.parent + roots.append(cur) + seen: set[Path] = set() + out: list[tuple[str, list[str]]] = [] + for root in roots: + for name in ("tsconfig.json", "tsconfig.base.json"): + p = root / name + if p in seen or not p.is_file(): + continue + seen.add(p) + try: + raw = p.read_text() + raw = re.sub(r"//[^\n]*", "", raw) + raw = re.sub(r",\s*([}\]])", r"\1", raw) + data = json.loads(raw) + except Exception: + continue + co = (data.get("compilerOptions") or {}) + base_url = co.get("baseUrl") or "." + base_dir = (p.parent / base_url).resolve() + for prefix, resolutions in (co.get("paths") or {}).items(): + if not isinstance(resolutions, list): + continue + resolved = [str((base_dir / r).resolve()) for r in resolutions if isinstance(r, str)] + out.append((prefix, resolved)) + return out + + +def _resolve_import_path(rel: str, handler_path: Path, alias_map: list[tuple[str, list[str]]]) -> Optional[Path]: + """Resolve an import specifier to a filesystem path. 
Handles relative + imports and TS path aliases with trailing /*.""" + candidates: list[Path] = [] + if rel.startswith("."): + candidates.append((handler_path.parent / rel).resolve()) + elif rel.startswith("/"): + candidates.append(Path(rel)) + else: + for prefix, resolutions in alias_map: + pref = prefix.rstrip("*").rstrip("/") + if rel == pref or rel.startswith(pref + "/"): + tail = rel[len(pref):].lstrip("/") + for r in resolutions: + root = r.rstrip("*").rstrip("/") + candidates.append(Path(root) / tail if tail else Path(root)) + for c in candidates: + for ext in (".ts", ".tsx", ".js", ".mjs", ""): + p = Path(str(c) + ext) + if p.is_file(): + return p + for idx in ("index.ts", "index.tsx", "index.js"): + p = c / idx + if p.is_file(): + return p + return None + + +_IDENT_BLOCKLIST = { + "if", "for", "while", "switch", "return", "await", "async", "new", + "Date", "String", "Number", "Boolean", "Object", "Array", "Error", + "Promise", "Map", "Set", "JSON", "Math", "console", "typeof", "function", + "require", "import", "catch", "throw", "void", "delete", "instanceof", +} + + +def find_helpers(handler_src: str, handler_path: Path, factory_block: str) -> list[tuple[Path, str, str]]: + """Return every (helper_path, helper_fn_name, helper_source) the factory + block invokes via a named import in the handler. 
Strips string/template + literals first so identifiers inside quotes don't produce false calls.""" + if not factory_block: + return [] + stripped = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"|`[^`]*`", "''", factory_block) + candidates = set(re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", stripped)) - _IDENT_BLOCKLIST + alias_map = _load_tsconfig_paths(Path.cwd()) + imports: dict[str, str] = {} + for m in re.finditer( + r"import\s+(?:type\s+)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]", + handler_src, + ): + spec = m.group(2) + for name in m.group(1).split(","): + name = name.strip() + if " as " in name: + name = name.split(" as ", 1)[1].strip() + if name: + imports[name] = spec + out: list[tuple[Path, str, str]] = [] + seen: set[Path] = set() + for name in sorted(candidates): + spec = imports.get(name) + if not spec: + continue + resolved = _resolve_import_path(spec, handler_path, alias_map) + if not resolved or resolved in seen: + continue + seen.add(resolved) + try: + text = resolved.read_text() + except OSError: + continue + snippet = extract_fn_snippet(text, name) or text[:4000] + out.append((resolved, name, snippet)) + return out + + +def find_helper(handler_src: str, handler_path: Path, model: str, factory_block: str) -> Optional[tuple[Path, str, str]]: + """Legacy single-helper accessor kept for backwards compat.""" + helpers = find_helpers(handler_src, handler_path, factory_block) + return helpers[0] if helpers else None + + +def _unresolved_calls(handler_src: str, factory_block: str, resolved: list[tuple[Path, str, str]]) -> list[str]: + """Identifiers called in the factory block that weren't in resolved + not in the blocklist.""" + if not factory_block: + return [] + stripped = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"|`[^`]*`", "''", factory_block) + calls = set(re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", stripped)) - _IDENT_BLOCKLIST + resolved_names = {name for _, name, _ in resolved} + # Also strip anything that looks like a member access call (obj.method() captured as 
"method") + # by requiring the name to appear as a named import too. + imported = set(re.findall( + r"import\s+(?:type\s+)?\{([^}]+)\}\s+from\s+['\"][^'\"]+['\"]", + handler_src, + )) + imported_names: set[str] = set() + for group in imported: + for n in group.split(","): + n = n.strip() + if " as " in n: + n = n.split(" as ", 1)[1].strip() + if n: + imported_names.add(n) + return sorted((calls & imported_names) - resolved_names) + + +def extract_fn_snippet(src: str, fn_name: str) -> str: + """Find `export (async )?function fn_name(` or `fn_name =` and return body.""" + patterns = [ + rf"export\s+(?:async\s+)?function\s+{re.escape(fn_name)}\s*\(", + rf"export\s+const\s+{re.escape(fn_name)}\s*=", + rf"(?:async\s+)?function\s+{re.escape(fn_name)}\s*\(", + ] + for pat in patterns: + m = re.search(pat, src) + if not m: + continue + # Grab until the matching closing brace of the first "{" after m.end() + brace = src.find("{", m.end()) + if brace < 0: + continue + depth = 0 + i = brace + n = len(src) + while i < n: + c = src[i] + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + start = src.rfind("\n", 0, m.start()) + 1 + snippet = src[start : i + 1] + return "\n".join(snippet.splitlines()[:SNIPPET_MAX_LINES]) + i += 1 + return "" + + +def load_original_snippet(snap_entry: dict) -> tuple[str, str]: + """Return (file_path_str, snippet).""" + cfile = (snap_entry.get("creation_file") or "").strip() + cfn = (snap_entry.get("creation_function") or "").strip() + if not cfile: + return "", "(Step 2 audit did not record a creation_file)" + p = Path(cfile) + if not p.is_absolute(): + p = Path.cwd() / cfile + if not p.is_file(): + return cfile, f"(file not found at {p})" + try: + text = p.read_text() + except OSError as e: + return cfile, f"(could not read file: {e})" + if cfn: + snip = extract_fn_snippet(text, cfn) + if snip: + return cfile, snip + return cfile, "\n".join(text.splitlines()[:SNIPPET_MAX_LINES]) + + +def yaml_entry(entry: dict) -> str: + 
return yaml.safe_dump([entry], sort_keys=False).rstrip() + + +def fill_template( + tpl: str, + rubric: str, + model: str, + snap_entry: dict, + cur_entry: Optional[dict], + handler_path: Path, + factory_block: str, + helpers: list[tuple[Path, str, str]], + unresolved_calls: list[str], + orig_path: str, + orig_snippet: str, +) -> str: + if helpers: + blocks = [] + for p, name, body in helpers: + blocks.append(f"File: {p}\nFunction: {name}\n\n```\n{body}\n```") + helper_section = "\n\n".join(blocks) + if unresolved_calls: + helper_section += ( + "\n\n(Additional identifiers called by the factory were not resolvable " + f"as imports and may or may not be helpers: {', '.join(unresolved_calls)})" + ) + elif unresolved_calls: + helper_section = ( + "(The factory calls identifiers that were not resolvable as named imports: " + f"{', '.join(unresolved_calls)}. Treat this as missing-context, not as evidence " + "of a raw-write factory.)" + ) + else: + helper_section = "(The factory does not call an external helper.)" + + needs_extraction = "true" if snap_entry.get("needs_extraction") else "false" + extracted_to = str(snap_entry.get("extracted_to") or "").strip() or "(not set)" + + return ( + tpl.replace("{{RUBRIC}}", rubric) + .replace("{{MODEL}}", model) + .replace("{{STEP2_AUDIT_ENTRY}}", yaml_entry(snap_entry)) + .replace( + "{{CURRENT_AUDIT_ENTRY}}", + yaml_entry(cur_entry) if cur_entry else "(model not present in current audit)", + ) + .replace("{{HANDLER_PATH}}", str(handler_path)) + .replace("{{FACTORY_BLOCK}}", factory_block or "(factory registration not found)") + .replace("{{HELPER_SECTION}}", helper_section) + .replace("{{NEEDS_EXTRACTION}}", needs_extraction) + .replace("{{EXTRACTED_TO}}", extracted_to) + .replace("{{ORIGINAL_CREATION_FILE}}", orig_path or "(unknown)") + .replace("{{ORIGINAL_CREATION_SNIPPET}}", orig_snippet) + ) + + +def run_claude(prompt: str) -> dict: + """Spawn `claude -p --output-format json` with the prompt on stdin. 
+ + Model is configurable via AUTONOMA_FIDELITY_MODEL (defaults to "sonnet", + which is cheap, fast, and reliable for bounded rubric tasks). Set to empty + string to inherit whatever model the CLI picks. + """ + cmd = ["claude", "-p", "--output-format", "json"] + model = os.environ.get("AUTONOMA_FIDELITY_MODEL", "sonnet") + if model: + cmd.extend(["--model", model]) + try: + proc = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=PER_MODEL_TIMEOUT, + ) + except subprocess.TimeoutExpired: + return {"verdict": "error", "error": "timeout"} + except FileNotFoundError: + return {"verdict": "error", "error": "claude CLI not found"} + if proc.returncode != 0: + return {"verdict": "error", "error": f"claude exit {proc.returncode}: {proc.stderr[:400]}"} + out = proc.stdout.strip() + # Outer envelope from `claude -p --output-format json` wraps the assistant + # response in a JSON object with a "result" field containing the text. + try: + envelope = json.loads(out) + except json.JSONDecodeError: + # Assume raw stdout is the JSON we asked for. 
+ return parse_verdict(out) + inner = envelope.get("result") or envelope.get("text") or envelope.get("output") or "" + if isinstance(inner, list): + inner = "\n".join(str(x) for x in inner) + return parse_verdict(str(inner)) + + +def parse_verdict(text: str) -> dict: + text = text.strip() + if text.startswith("```"): + text = re.sub(r"^```[a-zA-Z]*\n", "", text) + text = re.sub(r"\n```\s*$", "", text) + try: + return json.loads(text) + except json.JSONDecodeError: + m = re.search(r"\{.*\}", text, re.DOTALL) + if m: + try: + return json.loads(m.group(0)) + except json.JSONDecodeError: + pass + return {"verdict": "error", "error": f"could not parse verdict: {text[:300]}"} + + +def validate_one(task: dict) -> dict: + verdict = run_claude(task["prompt"]) + verdict["model"] = task["model"] + return verdict + + +def main() -> None: + if os.environ.get("AUTONOMA_SKIP_FIDELITY") == "1": + warn("AUTONOMA_SKIP_FIDELITY=1 — skipping.") + sys.exit(0) + + if shutil.which("claude") is None: + warn("`claude` CLI not on PATH — skipping semantic validation.") + sys.exit(0) + + if len(sys.argv) < 2: + warn("no sentinel path provided") + sys.exit(0) + sentinel = sys.argv[1] + + rubric_pair = fetch_rubric() + if not rubric_pair: + sys.exit(0) + rubric, tpl = rubric_pair + + snap = load_audit(Path("autonoma/.entity-audit-step2.md")) + cur = load_audit(Path("autonoma/entity-audit.md")) + if not snap: + warn("Step 2 snapshot missing — skipping.") + sys.exit(0) + + handler_path = resolve_handler_path(sentinel) + if handler_path is None: + warn("handler path not resolvable from sentinel — skipping.") + sys.exit(0) + handler_src = handler_path.read_text() + + models = [name for name, entry in snap.items() if is_independently_created(entry)] + if not models: + sys.exit(0) + if len(models) > MAX_MODELS: + warn(f"truncating from {len(models)} to {MAX_MODELS} models (override via AUTONOMA_FIDELITY_MAX_MODELS).") + models = models[:MAX_MODELS] + + tasks = [] + for model in models: + snap_entry = 
snap[model] + cur_entry = cur.get(model) + factory_block = find_factory_block(handler_src, model) + helpers = find_helpers(handler_src, handler_path, factory_block) if factory_block else [] + unresolved = _unresolved_calls(handler_src, factory_block, helpers) if factory_block else [] + orig_path, orig_snippet = load_original_snippet(snap_entry) + prompt = fill_template( + tpl, rubric, model, snap_entry, cur_entry, handler_path, + factory_block, helpers, unresolved, orig_path, orig_snippet, + ) + tasks.append({"model": model, "prompt": prompt}) + + t0 = time.time() + warn(f"running semantic validation for {len(tasks)} models (concurrency={CONCURRENCY}).") + + results: list[dict] = [] + with futures.ThreadPoolExecutor(max_workers=CONCURRENCY) as ex: + for res in ex.map(validate_one, tasks): + results.append(res) + + elapsed = time.time() - t0 + warn(f"semantic validation complete in {elapsed:.1f}s.") + + failures = [r for r in results if r.get("verdict") == "fail"] + errors = [r for r in results if r.get("verdict") == "error"] + passes = [r for r in results if r.get("verdict") == "pass"] + + warn(f"results: {len(passes)} pass, {len(failures)} fail, {len(errors)} error.") + + if errors and not failures: + # Don't block on our own infra errors; log and allow. + warn("no hard failures; transient errors will not block the sentinel.") + for e in errors[:5]: + warn(f" - {e.get('model','?')}: {e.get('error','')[:200]}") + sys.exit(0) + + if not failures: + sys.exit(0) + + lines = [ + f"FACTORY FIDELITY CHECK FAILED — {len(failures)} of {len(results)} models " + "do not faithfully reproduce their Step 2 creation behaviour.", + "", + "This is the semantic check. 
It reads the Step 2 snapshot (ground truth), " + "the current audit, the factory registration, and the original creation " + "function, then applies the rubric at:", + " $(cat autonoma/.docs-url)/llms/test-planner/factory-fidelity-rubric.txt", + "", + "Per-model feedback:", + "", + ] + for r in failures: + model = r.get("model", "?") + lines.append(f"── {model} ──") + for c in r.get("criteria", []) or []: + if c.get("status") == "fail": + lines.append(f" ✗ Criterion {c.get('id')}: {c.get('reason','')}") + fix = r.get("fix_hint", "") + if fix: + lines.append(f" → Fix: {fix}") + lines.append("") + lines.append( + "To fix: for each failing model, either (a) call the original " + "creation_function from the Step 2 audit (the one in the APPLICATION " + "codebase, not the helper the factory wrote), or (b) make the helper a " + "thin wrapper that calls that function. Do NOT leave bare ORM inserts " + "in the helper. If a side effect truly conflicts with the SDK's " + "scenario tree (e.g. sibling rows get created twice), document in a " + "comment which sibling factory owns that row and reference it." 
+ ) + sys.stderr.write("\n".join(lines) + "\n") + sys.exit(2) + + +if __name__ == "__main__": + main() diff --git a/hooks/validators/validate_scenarios.py b/hooks/validators/validate_scenarios.py index 8580715..b080522 100644 --- a/hooks/validators/validate_scenarios.py +++ b/hooks/validators/validate_scenarios.py @@ -26,7 +26,7 @@ sys.exit(1) # Required fields -required = ['scenario_count', 'scenarios', 'entity_types', 'discover', 'variable_fields', 'planning_sections'] +required = ['scenario_count', 'scenarios', 'entity_types'] missing = [f for f in required if f not in fm] if missing: print(f'Missing required frontmatter fields: {missing}') @@ -73,37 +73,12 @@ print(f'entity_types[{i}] must be a mapping with at least a "name" field') sys.exit(1) -# Validate discover metadata -discover = fm.get('discover') -if not isinstance(discover, dict): - print('discover must be a mapping') +# Validate variable_fields (required, may be empty list) +if 'variable_fields' not in fm: + print('Missing required frontmatter field: variable_fields (use [] if none)') sys.exit(1) -for field in ['source', 'model_count', 'edge_count', 'relation_count', 'scope_field']: - if field not in discover: - print(f'discover missing required field: {field}') - sys.exit(1) - -if discover.get('source') != 'sdk': - print('discover.source must be exactly "sdk"') - sys.exit(1) - -for field in ['model_count', 'edge_count', 'relation_count']: - value = discover.get(field) - if not isinstance(value, int) or value < 0: - print(f'discover.{field} must be a non-negative integer') - sys.exit(1) - -scope_field = discover.get('scope_field') -if not isinstance(scope_field, str) or len(scope_field.strip()) == 0: - print('discover.scope_field must be a non-empty string') - sys.exit(1) - -if discover.get('model_count') == 0: - print('discover.model_count must be greater than 0') - sys.exit(1) - -# Validate variable_fields +scenario_name_set = {s['name'] for s in scenarios} variable_fields = 
fm.get('variable_fields') if not isinstance(variable_fields, list): print('variable_fields must be a list') @@ -129,51 +104,29 @@ print(f'variable_fields[{i}].{field} must be a non-empty string') sys.exit(1) - if 'generator' in variable: - generator = variable.get('generator') - if not isinstance(generator, str) or len(generator.strip()) == 0: - print(f'variable_fields[{i}].generator must be a non-empty string if present') - sys.exit(1) - - scenario_names = variable.get('scenarios') - if not isinstance(scenario_names, list) or len(scenario_names) == 0: + vscenarios = variable.get('scenarios') + if not isinstance(vscenarios, list) or len(vscenarios) == 0: print(f'variable_fields[{i}].scenarios must be a non-empty list') sys.exit(1) - unknown_names = [name for name in scenario_names if name not in found_names] - if unknown_names: - print(f'variable_fields[{i}].scenarios has unknown scenario names: {unknown_names}') - sys.exit(1) + for name in vscenarios: + if name not in scenario_name_set: + print(f'variable_fields[{i}].scenarios references unknown scenario: {name}') + sys.exit(1) -# Validate planning_sections metadata -planning_sections = fm.get('planning_sections') -if not isinstance(planning_sections, list) or len(planning_sections) == 0: - print('planning_sections must be a non-empty list') +# Validate planning_sections (required; must contain the four core sections) +if 'planning_sections' not in fm: + print('Missing required frontmatter field: planning_sections') sys.exit(1) -required_sections = { - 'sdk_discover', - 'schema_summary', - 'relationship_map', - 'variable_data_strategy', -} -optional_sections = { - 'scoping_analysis', -} -allowed_sections = required_sections | optional_sections - -unknown_sections = [section for section in planning_sections if not isinstance(section, str) or len(section.strip()) == 0] -if unknown_sections: - print('planning_sections must contain only non-empty strings') +planning = fm.get('planning_sections') +if not 
isinstance(planning, list) or len(planning) == 0: + print('planning_sections must be a non-empty list') sys.exit(1) -missing_sections = required_sections - set(planning_sections) +required_sections = {'schema_summary', 'relationship_map', 'variable_data_strategy'} +missing_sections = required_sections - set(planning) if missing_sections: - print(f'Missing required planning_sections: {missing_sections}') + print(f'planning_sections missing required entries: {sorted(missing_sections)}') sys.exit(1) -for section in planning_sections: - if section not in allowed_sections: - print(f'planning_sections contains unknown value: {section}') - sys.exit(1) - print('OK') diff --git a/hooks/validators/validate_sdk_integration.py b/hooks/validators/validate_sdk_integration.py deleted file mode 100644 index fde09df..0000000 --- a/hooks/validators/validate_sdk_integration.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -"""Validates autonoma/.sdk-integration.json.""" -import json -import sys -from urllib.parse import urlparse - - -filepath = sys.argv[1] - - -def fail(message: str) -> None: - print(message) - sys.exit(1) - - -try: - with open(filepath) as fh: - payload = json.load(fh) -except Exception as exc: - fail(f"Invalid JSON: {exc}") - -if not isinstance(payload, dict): - fail("Root must be a JSON object") - -required = [ - "status", - "endpointUrl", - "endpointPath", - "stack", - "packagesInstalled", - "sharedSecretPresent", - "signingSecretPresent", - "devServer", - "verification", - "branch", - "blockingIssues", -] -missing = [field for field in required if field not in payload] -if missing: - fail(f"Missing required fields: {missing}") - -status = payload.get("status") -if status not in {"ok", "failed"}: - fail('status must be "ok" or "failed"') - -endpoint_url = payload.get("endpointUrl") -if not isinstance(endpoint_url, str) or not endpoint_url.strip(): - fail("endpointUrl must be a non-empty string") -parsed = urlparse(endpoint_url) -if parsed.scheme not in 
{"http", "https"} or not parsed.netloc: - fail("endpointUrl must be an absolute http/https URL") - -endpoint_path = payload.get("endpointPath") -if not isinstance(endpoint_path, str) or not endpoint_path.strip(): - fail("endpointPath must be a non-empty string") - -stack = payload.get("stack") -if not isinstance(stack, dict): - fail("stack must be an object") -for field in ["language", "framework", "orm", "packageManager"]: - if field not in stack: - fail(f"stack.{field} is required") - if stack[field] is not None and not isinstance(stack[field], str): - fail(f"stack.{field} must be a string or null") - -packages = payload.get("packagesInstalled") -if not isinstance(packages, list) or not all(isinstance(item, str) and item.strip() for item in packages): - fail("packagesInstalled must be a list of non-empty strings") - -for field in ["sharedSecretPresent", "signingSecretPresent"]: - if not isinstance(payload.get(field), bool): - fail(f"{field} must be a boolean") - -dev_server = payload.get("devServer") -if not isinstance(dev_server, dict): - fail("devServer must be an object") -if not isinstance(dev_server.get("startedByPlugin"), bool): - fail("devServer.startedByPlugin must be a boolean") -pid = dev_server.get("pid") -if pid is not None and not isinstance(pid, int): - fail("devServer.pid must be an integer or null") - -verification = payload.get("verification") -if not isinstance(verification, dict): - fail("verification must be an object") -for key in ["discover", "up", "down"]: - section = verification.get(key) - if not isinstance(section, dict): - fail(f"verification.{key} must be an object") - if section.get("status") not in {"ok", "failed"}: - fail(f'verification.{key}.status must be "ok" or "failed"') - -if not isinstance(verification.get("discover", {}).get("validatedByPlugin"), bool): - fail("verification.discover.validatedByPlugin must be a boolean") - -branch = payload.get("branch") -if not isinstance(branch, dict) or not isinstance(branch.get("name"), 
str) or not branch.get("name", "").strip(): - fail("branch.name must be a non-empty string") - -pr = payload.get("pr") -if pr is not None: - if not isinstance(pr, dict): - fail("pr must be an object or null") - url = pr.get("url") - if url is not None: - if not isinstance(url, str) or not url.strip(): - fail("pr.url must be a non-empty string or null") - -blocking = payload.get("blockingIssues") -if not isinstance(blocking, list) or not all(isinstance(item, str) for item in blocking): - fail("blockingIssues must be a list of strings") - -print("OK") diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md index 7f0bbc2..4ccc236 100644 --- a/skills/generate-tests/SKILL.md +++ b/skills/generate-tests/SKILL.md @@ -9,644 +9,183 @@ description: > # Autonoma E2E Test Generation Pipeline -You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent. +You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent. **Every step MUST complete successfully and pass validation before the next step begins.** Do NOT skip steps. Do NOT proceed if validation fails. -## User Confirmation Between Steps +## CRITICAL: User Confirmation Between Steps -By default, after each step (1, 2, 3, and 4), present the summary and automatically proceed to the -next step once validation passes. +After steps 1, 2, 3, 4, and 5 you MUST present the summary and ask the user for confirmation +using `AskUserQuestion`. After calling it, wait for the response. Only proceed after they confirm. -**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after -Steps 1-4. +## How lifecycle reporting works -**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`, -that means auto-advance as well. +You do NOT issue `curl` commands to report step start/complete/uploads. 
Plugin hooks do that: -If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation -using the `AskUserQuestion` tool. - -After calling `AskUserQuestion`, wait for the user's response. -Only proceed to the next step after they confirm. +- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`. +- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. It validates output, + emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate + (test files cannot be written until `autonoma/.endpoint-validated` exists). ## Before Starting -Create the output directory and save the project root: - -```bash -AUTONOMA_ROOT="$(pwd)" -echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root -mkdir -p autonoma autonoma/skills autonoma/qa-tests -cleanup_dev_server() { - DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '') - if [ -n "$DEV_SERVER_PID" ]; then - kill "$DEV_SERVER_PID" 2>/dev/null || true - rm -f /tmp/autonoma-dev-server-pid - echo "Dev server (PID $DEV_SERVER_PID) stopped." 
- fi -} -``` - -The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write: - -```bash -PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '') -``` - -Read the environment variables required for reporting progress back to Autonoma: -- `AUTONOMA_API_KEY` -- `AUTONOMA_PROJECT_ID` -- `AUTONOMA_API_URL` -- `AUTONOMA_AUTO_ADVANCE` — optional, canonical -- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias - -Add shared helpers before running the pipeline: - ```bash -auto_advance_enabled() { - if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then - return 0 - fi - if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then - return 0 - fi - return 1 -} - -refresh_generation_id() { - AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') - GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -} - -build_event_payload() { - python3 - "$1" "$2" "$3" <<'PY' -import json -import sys - -event_type, key, value = sys.argv[1:4] -print(json.dumps({"type": event_type, "data": {key: json.loads(value)}})) -PY -} - -build_step_payload() { - python3 - "$1" "$2" "$3" <<'PY' -import json -import sys - -event_type, step, name = sys.argv[1:4] -print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}})) -PY -} - -post_setup_event_blocking() { - refresh_generation_id - payload="$1" - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - for attempt in 1 2 3; do - if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null; then - return 0 - fi - sleep "$attempt" - done - echo "ERROR: Failed to post blocking setup event after retries: $payload" - return 1 -} - -post_setup_log() { - refresh_generation_id - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - 
payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")") - curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null || true -} - -patch_setup_status_blocking() { - refresh_generation_id - status="$1" - message="$2" - if [ -z "$GENERATION_ID" ]; then - return 0 - fi - payload=$(python3 - "$status" "$message" <<'PY' -import json -import sys - -body = {"status": sys.argv[1]} -if sys.argv[2]: - body["errorMessage"] = sys.argv[2] -print(json.dumps(body)) -PY -) - for attempt in 1 2 3; do - if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "$payload" >/dev/null; then - return 0 - fi - sleep "$attempt" - done - echo "ERROR: Failed to patch setup status after retries: $status" - return 1 -} - -report_error_and_exit() { - message="$1" - preserve_dev_server="${2:-false}" - payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")") - post_setup_event_blocking "$payload" || true - echo "ERROR: $message" - if [ "$preserve_dev_server" != "true" ]; then - cleanup_dev_server - fi - exit 1 -} - -report_partial_failure_and_exit() { - message="$1" - post_setup_log "$message" - patch_setup_status_blocking "partial_failure" "$message" || true - echo "ERROR: $message" - cleanup_dev_server - exit 1 -} - -rehydrate_sdk_env() { - AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') - AUTONOMA_SDK_ENDPOINT=$(tr -d '\n' < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" 2>/dev/null || echo '') - AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-) - AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 
2>/dev/null | tail -n 1 | cut -d= -f2-) - export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET - if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then - return 1 - fi - return 0 -} +mkdir -p autonoma/skills autonoma/qa-tests ``` -Prepare the SDK reference repo for Step 1: +The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`. -```bash -SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}" -if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then - echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir -else - SDK_REF_DIR="$(mktemp -d)/autonoma-sdk" - if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then - echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir - else - echo "ERROR: Unable to prepare the SDK reference repo." - cleanup_dev_server - exit 1 - fi -fi -``` - -Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug. 
+## Step 1: Generate Knowledge Base -Create the generation record so the dashboard can track progress in real time: +Spawn `kb-generator`: -```bash -RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}") -HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) -BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') -echo "Setup API response (HTTP $HTTP_STATUS): $BODY" -GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '') -echo "$GENERATION_ID" > autonoma/.generation-id -echo "Generation ID: $GENERATION_ID" -``` +> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML +> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill +> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals). +> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`. -If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway. +After completion: verify files exist, present core_flows table, `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single character body). 
-## Step 1: SDK Integration +## Step 2: Entity Creation Audit -Report step start: +Spawn `entity-audit-generator`: -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start." -post_setup_log "Detecting stack and integrating the Autonoma SDK..." -``` - -Spawn the `sdk-integrator` subagent with the following task: - -> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context. -> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with -> a `mailto:support@autonoma.app` link if unsupported. -> Create a branch, install the SDK from package managers only, implement the SDK endpoint following -> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` -> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server, -> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and -> `autonoma/.sdk-integration.json`, commit with -> `feat: integrate autonoma sdk`, and create a PR if `gh` is available. -> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models. - -**After the subagent completes:** -1. Verify `autonoma/.sdk-endpoint` exists and is non-empty -2. Verify `autonoma/.sdk-integration.json` exists and is non-empty -3. Read and export `AUTONOMA_SDK_ENDPOINT` from that file -4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env` -5. Confirm the endpoint is reachable with a signed `discover` request -6. 
Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes -7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available - -Load the endpoint and secrets: - -```bash -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \ - || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \ - || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true - -rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true - -BODY='{"action":"discover"}' -SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') -HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY") -if [ "$HTTP_STATUS" != "200" ]; then - report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true -fi -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \ - || report_error_and_exit "Step 1 discover response did not match the required schema." true -``` +> Read the knowledge base. Audit how each database model is created. For every model, find the +> dedicated creation function in a service/repository/helper. Classify as `independently_created: true` +> (factory) or `false` (raw SQL fallback). Independently record `created_by` — every owner whose +> creation flow produces the model as a side effect — even when `independently_created` is true. Output +> `autonoma/entity-audit.md` with frontmatter listing each model. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`.
-Report step complete: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true -``` - -7. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here." - - options: ["Yes, proceed to Step 2", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 2. - -## Step 2: Generate Knowledge Base - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start." -post_setup_log "Analyzing codebase structure and identifying features..." -``` - -Spawn the `kb-generator` subagent with the following task: - -> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md` -> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with -> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count. -> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered. -> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first. 
- -**After the subagent completes:** -1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty -2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically -3. Read the file and present the frontmatter to the user — specifically the core_flows table - -Report step complete and upload skills: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ') -post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..." -post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion." -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -skills = [] -d = os.path.join(root, 'autonoma/skills') -if os.path.isdir(d): - for f in os.listdir(d): - if f.endswith('.md'): - with open(os.path.join(d, f)) as fh: - skills.append({'name': f, 'content': fh.read()}) -print(json.dumps({'skills': skills})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` - -4. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this core flows table look correct? These flows determine how the test budget is distributed." - - options: ["Yes, proceed to Step 3", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 3. 
+After completion: present the audit, `AskUserQuestion`, `Write` `autonoma/.step-2-ack`. ## Step 3: Generate Scenarios -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start." -post_setup_log "Mapping data model and designing test data environments..." -``` - -Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`. -This step assumes Step 1 already produced: -- `AUTONOMA_SDK_ENDPOINT` -- `AUTONOMA_SHARED_SECRET` - -Fetch and validate the artifact: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -mkdir -p "$AUTONOMA_ROOT/autonoma" -rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1." -BODY='{"action":"discover"}' -SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //') -RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \ - -H "Content-Type: application/json" \ - -H "x-signature: $SIG" \ - -d "$BODY") -HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) -DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d') -if [ "$HTTP_STATUS" != "200" ]; then - report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY" -fi -printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json" -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \ - || report_error_and_exit "Step 3 discover artifact did not pass validation." 
-``` - -Spawn the `scenario-generator` subagent with the following task: - -> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover -> artifact from `autonoma/discover.json`. -> Generate test data scenarios. Write the output to `autonoma/scenarios.md`. -> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types, -> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a -> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before -> introducing a variable placeholder. Use variable fields only for truly dynamic values such as -> backend-generated or time-based fields. `generator` is optional and must not default to `faker`. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first. - -**After the subagent completes:** -1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty -2. Validate `autonoma/discover.json` using the plugin's validator -3. The PostToolUse hook will have validated the frontmatter format automatically -4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic - -Report step complete: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional." -post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion." -``` - -4. 
**If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests." - - options: ["Yes, proceed to Step 4", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 4. - -## Step 4: Generate E2E Test Cases - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start." -post_setup_log "Generating E2E test cases from knowledge base and scenarios..." -``` - -Spawn the `test-case-generator` subagent with the following task: - -> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`, -> and scenarios from `autonoma/scenarios.md`. -> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`. -> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests, -> total_folders, folder breakdown, and coverage_correlation. -> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow. -> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify -> scenario counts, seeded inventories, or Environment Factory correctness. Only reference -> scenario data when it is needed to test a real user-facing app behavior. -> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first. - -**After the subagent completes:** -1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty -2. Verify at least one non-`INDEX.md` test file exists -3. 
Verify actual test count matches `INDEX.md` -4. Verify folder breakdown matches `INDEX.md` -5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter -6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation - -Enforce the file-count postconditions: - -```bash -INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md" -[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md." -TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') -[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files." -python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md." -import sys -from pathlib import Path -import yaml - -index_path = Path(sys.argv[1]) -actual_count = int(sys.argv[2]) -qa_dir = Path(sys.argv[3]) - -content = index_path.read_text() -parts = content.split('---', 2) -if len(parts) < 3: - raise SystemExit('INDEX.md is missing YAML frontmatter') -frontmatter = yaml.safe_load(parts[1]) - -if frontmatter.get('total_tests') != actual_count: - raise SystemExit( - f'total_tests ({frontmatter.get("total_tests")}) does not match actual test files ({actual_count})' - ) - -actual_folders = {} -for path in qa_dir.rglob('*.md'): - if path.name == 'INDEX.md': - continue - folder = path.parent.relative_to(qa_dir).as_posix() - actual_folders[folder] = actual_folders.get(folder, 0) + 1 - -declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])} -if actual_folders != declared_folders: - raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}') -print('OK') -PY -``` - -Report step complete and upload test cases: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 
2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ') -post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..." -post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion." -[ -n "$GENERATION_ID" ] && python3 -c " -import os, json -proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.' -qa_dir = os.path.join(proj_root, 'autonoma/qa-tests') -test_cases = [] -for root, dirs, files in os.walk(qa_dir): - for f in files: - if f.endswith('.md') and f != 'INDEX.md': - path = os.path.join(root, f) - folder = os.path.relpath(root, qa_dir) - with open(path) as fh: - content = fh.read() - entry = {'name': f, 'content': content} - if folder != '.': - entry['folder'] = folder - test_cases.append(entry) -print(json.dumps({'testCases': test_cases})) -" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @- || true -``` - -4. **If auto-advance is disabled:** Call `AskUserQuestion` with: - - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app." - - options: ["Yes, proceed to Step 5", "I want to suggest changes"] - Wait for the user's response before proceeding. - **Otherwise:** Skip the prompt and proceed directly to Step 5. 
- -## Step 5: Scenario Validation - -Report step start: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start." -post_setup_log "Validating planned scenarios against the live SDK endpoint..." -``` - -Spawn the `scenario-validator` subagent with the following task: - -> Read `autonoma/discover.json` and `autonoma/scenarios.md`. -> Validate the planned scenarios against the existing live SDK endpoint without editing backend code. -> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`, -> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact -> `autonoma/.scenario-validation.json`, and run: -> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json` -> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs. - -**After the subagent completes:** -1. Rehydrate SDK env from Step 1 artifacts -2. Verify `autonoma/.scenario-validation.json` exists and is non-empty -3. Validate `autonoma/.scenario-validation.json` -4. Require `status == "ok"` and `preflightPassed == true` -5. Verify `autonoma/scenario-recipes.json` exists and is non-empty -6. Run the preflight helper if the subagent did not already do so -7. If preflight fails, stop and report the failure without attempting code changes -8. 
Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues - -Run and enforce preflight: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1." -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \ - || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact." -python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state." -import json -import sys - -payload = json.load(open(sys.argv[1])) -if payload.get("status") != "ok": - raise SystemExit(f'status must be "ok", got {payload.get("status")!r}') -if payload.get("preflightPassed") is not True: - raise SystemExit('preflightPassed must be true before Step 5 can upload recipes') -print('OK') -PY -[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \ - || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact." -python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \ - || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5." -``` - -Report step complete and upload scenario recipes: - -```bash -AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.') -GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '') -echo "GENERATION_ID=${GENERATION_ID:-}" -post_setup_log "Uploading validated scenario recipes to setup..." 
-if [ -n "$GENERATION_ID" ]; then - RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" - if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then - report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete." - fi - UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \ - -H "Content-Type: application/json" \ - -d @"$RECIPE_PATH") - UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) - UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') - echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" - if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then - report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." - fi - - VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \ - -H "Authorization: Bearer ${AUTONOMA_API_KEY}") - VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) - VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d') - if [ "$VERIFY_STATUS" != "200" ]; then - report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)." - fi -fi -post_setup_log "Scenario validation completed." -post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion." -cleanup_dev_server -``` +Spawn `scenario-generator`: + +> Read the knowledge base and `autonoma/entity-audit.md`. Generate test data scenarios. Write +> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types, +> variable_fields, planning_sections). 
Mark values as variable only when they must vary across +> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural +> per-run isolation). Design entity tables so they serialise as nested trees rooted at the +> scope entity. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`. + +After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`. + +## Step 4: Implement Environment Factory + +Spawn `env-factory-generator`: + +> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure +> the handler. Register a factory for every model with `independently_created: true` (call the audit's +> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback +> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity +> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. Do NOT run `up`/`down` +> — that is step 5. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"` +> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`. +> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names. + +After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary, +`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`. + +## Step 5: Validate Scenario Lifecycle + +Spawn `scenario-validator`: + +> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4. +> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5 +> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails +> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. 
On +> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at +> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe +> per scenario), run `preflight_scenario_recipes.py` against it, and write +> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write` +> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and +> report — do NOT write the sentinel. +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`. +> Verify: every audited model appears in `discover.schema.models`, every `independently_created` +> model has a registered factory, `auth` is non-empty, DB state is correct before and after +> `down`, and preflight exits 0. + +After completion: +1. If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON + AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`: + enforce the terminal artifact contract, re-run preflight, and upload the recipes to the dashboard, then ack. + + ```bash + AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}" + VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json" + RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json" + + # Enforce terminal artifact contract + python3 - "$VALIDATION_ARTIFACT" <<'PY' + import json, sys + payload = json.load(open(sys.argv[1])) + if payload.get("status") != "ok": + raise SystemExit("status must be ok before Step 5 can upload recipes") + if payload.get("preflightPassed") is not True: + raise SystemExit("preflightPassed must be true before Step 5 can upload recipes") + PY + + [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty"; exit 1; } + python3 -c "import json; json.load(open('$RECIPE_PATH'))" \ + || { echo "scenario-recipes.json is not valid JSON"; exit 1; } + + # Re-run preflight at the orchestrator level for belt-and-suspenders safety.
+ python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \ + || { echo "Preflight failed at orchestrator gate"; exit 1; } + + # Upload to dashboard + GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id") + UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \ + "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${AUTONOMA_API_TOKEN}" \ + -d @"$RECIPE_PATH") + UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2) + UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d') + echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY" + if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then + echo "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." >&2 + exit 1 + fi + ``` + + Then present validation summary (scenarios passed, any edits made to `scenarios.md`, + recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`. + +2. If any of those artifacts are missing/invalid: the agent failed — surface the failure + report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook + will also block test file writes. + +## Step 6: Generate E2E Test Cases + +Spawn `test-case-generator`: + +> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (the latter has +> been reconciled with reality in step 5 — use it as the source of truth). Parse the +> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any +> variable value (typed, asserted, or navigated to), never the hardcoded literal. +> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests +> that "audit" seeded counts or fixture existence. +> Generate test cases in `autonoma/qa-tests/`. 
Write `autonoma/qa-tests/INDEX.md` with +> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test +> file needs frontmatter (title, description, criticality, scenario, flow). +> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`. + +After completion: +1. Verify `autonoma/qa-tests/INDEX.md` exists +2. Present INDEX summary +3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed` + for the final step, marking the setup complete. ## Completion -After all steps complete, summarize: -- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available -- **Step 2**: knowledge base location and core flow count -- **Step 3**: scenario count and entity types covered -- **Step 4**: total test count, folder breakdown, coverage correlation -- **Step 5**: scenario validation results, smoke-test status, and recipe upload status - -If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID. -For terminal failures after later steps begin, clean up the dev server before returning control to the user. 
+Summarize each step: +- **Step 1**: KB location, core flows +- **Step 2**: entity audit — factories vs raw SQL +- **Step 3**: scenarios generated +- **Step 4**: endpoint implemented (handler path, packages, factories registered) +- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any) +- **Step 6**: test count, folder breakdown diff --git a/tests/test_validate_pipeline_output.py b/tests/test_validate_pipeline_output.py deleted file mode 100644 index b40bc26..0000000 --- a/tests/test_validate_pipeline_output.py +++ /dev/null @@ -1,321 +0,0 @@ -"""Tests for hooks/validate-pipeline-output.sh.""" -import json -import os -import subprocess -import tempfile -import threading -from contextlib import contextmanager -from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer -from pathlib import Path - - -ROOT = Path(__file__).resolve().parents[1] -SCRIPT = ROOT / 'hooks' / 'validate-pipeline-output.sh' - -VALID_DISCOVER = { - 'schema': { - 'models': [ - { - 'name': 'Organization', - 'fields': [ - { - 'name': 'name', - 'type': 'String', - 'isRequired': True, - 'isId': False, - 'hasDefault': False, - }, - ], - }, - ], - 'edges': [], - 'relations': [], - 'scopeField': 'organizationId', - }, -} - -VALID_RECIPES = { - 'version': 1, - 'source': { - 'discoverPath': 'autonoma/discover.json', - 'scenariosPath': 'autonoma/scenarios.md', - }, - 'validationMode': 'sdk-check', - 'recipes': [ - { - 'name': 'standard', - 'description': 'Standard baseline', - 'create': {'Organization': [{'name': 'Acme Standard'}]}, - 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, - }, - { - 'name': 'empty', - 'description': 'Empty workspace', - 'create': {'Organization': [{'name': 'Acme Empty'}]}, - 'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}, - }, - { - 'name': 'large', - 'description': 'Large workspace', - 'create': {'Organization': [{'name': 'Acme 
Large'}]}, - 'validation': {'status': 'validated', 'method': 'endpoint-up-down', 'phase': 'ok'}, - }, - ], -} - - -def test_sdk_endpoint_hook_accepts_valid_url(): - env = os.environ.copy() - - code, out, err = _run_hook( - { - 'autonoma/.sdk-endpoint': 'http://127.0.0.1:3000/api/autonoma\n', - }, - 'autonoma/.sdk-endpoint', - env, - ) - - assert code == 0 - assert out == '' - assert err == '' - - -def test_sdk_endpoint_hook_blocks_invalid_url(): - env = os.environ.copy() - - code, _, err = _run_hook( - { - 'autonoma/.sdk-endpoint': '/api/autonoma\n', - }, - 'autonoma/.sdk-endpoint', - env, - ) - - assert code == 2 - assert 'validate-sdk-endpoint' in err - assert 'http or https' in err - - -def test_sdk_integration_hook_accepts_valid_json(): - env = os.environ.copy() - - code, out, err = _run_hook( - { - 'autonoma/.sdk-integration.json': json.dumps( - { - 'status': 'ok', - 'endpointUrl': 'http://127.0.0.1:3000/api/autonoma', - 'endpointPath': '/api/autonoma', - 'stack': { - 'language': 'TypeScript', - 'framework': 'Express', - 'orm': 'Prisma', - 'packageManager': 'pnpm', - }, - 'packagesInstalled': ['@autonoma-ai/sdk'], - 'sharedSecretPresent': True, - 'signingSecretPresent': True, - 'devServer': {'startedByPlugin': True, 'pid': 1234}, - 'verification': { - 'discover': {'status': 'ok', 'validatedByPlugin': True}, - 'up': {'status': 'ok'}, - 'down': {'status': 'ok'}, - }, - 'branch': {'name': 'autonoma/feat-autonoma-sdk'}, - 'pr': {'url': 'https://github.com/example/repo/pull/1'}, - 'blockingIssues': [], - } - ), - }, - 'autonoma/.sdk-integration.json', - env, - ) - - assert code == 0 - assert out == '' - assert err == '' - - -def test_sdk_integration_hook_blocks_invalid_json(): - env = os.environ.copy() - - code, _, err = _run_hook( - { - 'autonoma/.sdk-integration.json': json.dumps({'status': 'ok'}), - }, - 'autonoma/.sdk-integration.json', - env, - ) - - assert code == 2 - assert 'validate-sdk-integration' in err - assert 'Missing required fields' in err - - -def 
test_scenario_validation_hook_accepts_valid_json(): - env = os.environ.copy() - - code, out, err = _run_hook( - { - 'autonoma/.scenario-validation.json': json.dumps( - { - 'status': 'ok', - 'preflightPassed': True, - 'smokeTestPassed': True, - 'validatedScenarios': ['standard', 'empty', 'large'], - 'failedScenarios': [], - 'blockingIssues': [], - 'recipePath': 'autonoma/scenario-recipes.json', - 'validationMode': 'sdk-check', - 'endpointUrl': 'http://127.0.0.1:3000/api/autonoma', - } - ), - }, - 'autonoma/.scenario-validation.json', - env, - ) - - assert code == 0 - assert out == '' - assert err == '' - - -def test_scenario_validation_hook_blocks_invalid_json(): - env = os.environ.copy() - - code, _, err = _run_hook( - { - 'autonoma/.scenario-validation.json': json.dumps( - { - 'status': 'failed', - 'preflightPassed': False, - } - ), - }, - 'autonoma/.scenario-validation.json', - env, - ) - - assert code == 2 - assert 'validate-scenario-validation' in err - assert 'Missing required fields' in err - - -def _run_hook(files: dict[str, str], target: str, env: dict[str, str]) -> tuple[int, str, str]: - with tempfile.TemporaryDirectory() as tmpdir: - for relpath, content in files.items(): - fullpath = Path(tmpdir) / relpath - fullpath.parent.mkdir(parents=True, exist_ok=True) - fullpath.write_text(content) - - target_path = str(Path(tmpdir) / target) - payload = json.dumps({'tool_input': {'file_path': target_path}}) - result = subprocess.run( - ['bash', str(SCRIPT)], - input=payload, - text=True, - capture_output=True, - env=env, - ) - return result.returncode, result.stdout.strip(), result.stderr.strip() - - -@contextmanager -def _sdk_server(up_status: int = 200, down_status: int = 200): - class Handler(BaseHTTPRequestHandler): - def do_POST(self): - length = int(self.headers.get('Content-Length', '0')) - body = json.loads(self.rfile.read(length) or '{}') - action = body.get('action') - - if action == 'up': - status = up_status - response = {'auth': {}, 'refs': 
{'organization': ['org_1']}, 'refsToken': 'token_1'} - if status >= 400: - response = {'error': 'up failed'} - elif action == 'down': - status = down_status - response = {'ok': True} - if status >= 400: - response = {'error': 'down failed'} - else: - status = 400 - response = {'error': 'unknown action'} - - encoded = json.dumps(response).encode() - self.send_response(status) - self.send_header('Content-Type', 'application/json') - self.send_header('Content-Length', str(len(encoded))) - self.end_headers() - self.wfile.write(encoded) - - def log_message(self, format, *args): - return - - server = ThreadingHTTPServer(('127.0.0.1', 0), Handler) - thread = threading.Thread(target=server.serve_forever, daemon=True) - thread.start() - try: - yield f'http://127.0.0.1:{server.server_address[1]}' - finally: - server.shutdown() - thread.join() - - -def test_scenario_recipes_hook_requires_preflight_env(): - env = os.environ.copy() - env.pop('AUTONOMA_SDK_ENDPOINT', None) - env.pop('AUTONOMA_SHARED_SECRET', None) - - code, _, err = _run_hook( - { - 'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES), - 'autonoma/discover.json': json.dumps(VALID_DISCOVER), - }, - 'autonoma/scenario-recipes.json', - env, - ) - - assert code == 2 - assert 'scenario-recipes-preflight' in err - assert 'AUTONOMA_SDK_ENDPOINT is not set' in err - - -def test_scenario_recipes_hook_runs_preflight_successfully(): - with _sdk_server() as endpoint: - env = os.environ.copy() - env['AUTONOMA_SDK_ENDPOINT'] = endpoint - env['AUTONOMA_SHARED_SECRET'] = 'test-secret' - - code, out, err = _run_hook( - { - 'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES), - 'autonoma/discover.json': json.dumps(VALID_DISCOVER), - }, - 'autonoma/scenario-recipes.json', - env, - ) - - assert code == 0 - assert out == '' - assert err == '' - - -def test_scenario_recipes_hook_blocks_failed_preflight(): - with _sdk_server(up_status=500) as endpoint: - env = os.environ.copy() - env['AUTONOMA_SDK_ENDPOINT'] = endpoint 
- env['AUTONOMA_SHARED_SECRET'] = 'test-secret' - - code, _, err = _run_hook( - { - 'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES), - 'autonoma/discover.json': json.dumps(VALID_DISCOVER), - }, - 'autonoma/scenario-recipes.json', - env, - ) - - assert code == 2 - assert 'scenario-recipes-preflight' in err - assert 'HTTP 500' in err diff --git a/tests/test_validate_scenarios.py b/tests/test_validate_scenarios.py index 40c55c0..100de96 100644 --- a/tests/test_validate_scenarios.py +++ b/tests/test_validate_scenarios.py @@ -9,70 +9,27 @@ scenarios: - name: standard description: Typical usage - entity_types: 2 + entity_types: [user, task] total_entities: 10 - name: empty description: No data - entity_types: 0 + entity_types: [user] total_entities: 0 - name: large description: Stress test - entity_types: 3 + entity_types: [user, task, project] total_entities: 1000 entity_types: - name: user - name: task -discover: - source: sdk - model_count: 4 - edge_count: 3 - relation_count: 2 - scope_field: organizationId -variable_fields: - - token: "{{project_title}}" - entity: Project.title - scenarios: - - standard - - large - reason: title must be unique per test run - test_reference: ({{project_title}} variable) +variable_fields: [] planning_sections: - - sdk_discover - schema_summary - relationship_map - variable_data_strategy --- # Scenarios - -## SDK Discover - -Models: 4 - -## Schema Summary - -- User -- Task - -## Relationship Map - -- User.organizationId -> Organization.id - -## Variable Data Strategy - -- `{{project_title}}` is generated. - -## Scenario: `standard` - -Standard details. - -## Scenario: `empty` - -Empty details. - -## Scenario: `large` - -Large details. 
""" @@ -95,23 +52,6 @@ def test_missing_required_fields(): assert 'Missing required frontmatter fields' in out -def test_missing_discover_field(): - content = VALID.replace( - "discover:\n source: sdk\n model_count: 4\n edge_count: 3\n relation_count: 2\n scope_field: organizationId\n", - "", - ) - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert "discover" in out - - -def test_discover_source_must_be_sdk(): - content = VALID.replace('source: sdk', 'source: codebase') - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'discover.source must be exactly "sdk"' in out - - def test_scenario_count_too_low(): content = VALID.replace('scenario_count: 3', 'scenario_count: 2') code, out = run_validator(SCRIPT, content) @@ -127,6 +67,7 @@ def test_scenario_count_mismatch(): def test_missing_required_scenario_name(): + # Replace 'large' with 'extra' — now 'large' is missing content = VALID.replace('name: large', 'name: extra') code, out = run_validator(SCRIPT, content) assert code == 1 @@ -135,6 +76,7 @@ def test_missing_required_scenario_name(): def test_scenario_missing_field(): + # Remove description from first scenario content = VALID.replace( ' - name: standard\n description: Typical usage', ' - name: standard', @@ -162,73 +104,3 @@ def test_entity_type_missing_name(): code, out = run_validator(SCRIPT, content) assert code == 1 assert 'must be a mapping with at least a "name" field' in out - - -def test_variable_token_must_use_double_curly_braces(): - content = VALID.replace('token: "{{project_title}}"', 'token: project_title') - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'must use double curly braces' in out - - -def test_variable_generator_is_optional(): - code, out = run_validator(SCRIPT, VALID) - assert code == 0 - assert out == 'OK' - - -def test_non_faker_generator_is_accepted(): - content = VALID.replace( - ' reason: title must be unique per test run\n', - ' generator: derived from testRunId\n 
reason: title must be unique per test run\n', - ) - code, out = run_validator(SCRIPT, content) - assert code == 0 - assert out == 'OK' - - -def test_empty_generator_fails_if_present(): - content = VALID.replace( - ' reason: title must be unique per test run\n', - ' generator: ""\n reason: title must be unique per test run\n', - ) - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'generator must be a non-empty string if present' in out - - -def test_variable_scenarios_must_be_known(): - content = VALID.replace(' - large', ' - invalid') - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'unknown scenario names' in out - - -def test_missing_required_planning_section(): - content = VALID.replace( - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n - variable_data_strategy\n', - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n', - ) - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'Missing required planning_sections' in out - - -def test_scoping_analysis_optional_section_accepted(): - content = VALID.replace( - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n - variable_data_strategy\n', - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n - variable_data_strategy\n - scoping_analysis\n', - ) - code, out = run_validator(SCRIPT, content) - assert code == 0 - assert out == 'OK' - - -def test_unknown_planning_section_rejected(): - content = VALID.replace( - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n - variable_data_strategy\n', - 'planning_sections:\n - sdk_discover\n - schema_summary\n - relationship_map\n - variable_data_strategy\n - made_up_section\n', - ) - code, out = run_validator(SCRIPT, content) - assert code == 1 - assert 'planning_sections contains unknown value: made_up_section' in out diff --git a/tests/test_validate_sdk_integration.py 
b/tests/test_validate_sdk_integration.py deleted file mode 100644 index 73fab81..0000000 --- a/tests/test_validate_sdk_integration.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Tests for validate_sdk_integration.py.""" -import json - -from conftest import run_validator - - -SCRIPT = "validate_sdk_integration.py" - - -def valid_payload(**overrides): - payload = { - "status": "ok", - "endpointUrl": "http://127.0.0.1:3000/api/autonoma", - "endpointPath": "/api/autonoma", - "stack": { - "language": "TypeScript", - "framework": "Express", - "orm": "Prisma", - "packageManager": "pnpm", - }, - "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma"], - "sharedSecretPresent": True, - "signingSecretPresent": True, - "devServer": {"startedByPlugin": True, "pid": 1234}, - "verification": { - "discover": {"status": "ok", "validatedByPlugin": True}, - "up": {"status": "ok"}, - "down": {"status": "ok"}, - }, - "branch": {"name": "autonoma/feat-autonoma-sdk"}, - "pr": {"url": "https://github.com/example/repo/pull/1"}, - "blockingIssues": [], - } - payload.update(overrides) - return payload - - -def test_accepts_valid_payload(): - code, out = run_validator(SCRIPT, json.dumps(valid_payload()), filename=".sdk-integration.json") - assert code == 0 - assert out == "OK" - - -def test_rejects_missing_required_field(): - payload = valid_payload() - payload.pop("verification") - code, out = run_validator(SCRIPT, json.dumps(payload), filename=".sdk-integration.json") - assert code == 1 - assert "Missing required fields" in out - - -def test_rejects_invalid_endpoint_url(): - code, out = run_validator( - SCRIPT, - json.dumps(valid_payload(endpointUrl="/api/autonoma")), - filename=".sdk-integration.json", - ) - assert code == 1 - assert "absolute http/https URL" in out - - -def test_accepts_failed_status_with_blocking_issues(): - code, out = run_validator( - SCRIPT, - json.dumps( - valid_payload( - status="failed", - verification={ - "discover": {"status": "failed", "validatedByPlugin": 
False}, - "up": {"status": "failed"}, - "down": {"status": "failed"}, - }, - blockingIssues=["discover request failed"], - ) - ), - filename=".sdk-integration.json", - ) - assert code == 0 - assert out == "OK"