From bd04ab1467b08e8ae1aaaf13495813a2b825f1da Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Tue, 7 Apr 2026 19:24:26 -0300
Subject: [PATCH 01/33] feat: SDK discover schema and base infrastructure

Add discover.json validator, expand scenarios validator, update plugin
manifest for marketplace installability, and document scenario recipes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json        |   6 +-
 .claude-plugin/plugin.json             |   7 +-
 .github/workflows/tests.yml            |   2 +-
 CLAUDE.md                              |  10 ++-
 README.md                              | 101 ++++++++++++++++++++----
 hooks/validate-pipeline-output.sh      |   9 +++
 hooks/validators/validate_discover.py  | 102 +++++++++++++++++++++++++
 hooks/validators/validate_scenarios.py | 100 +++++++++++++++++++++++-
 8 files changed, 311 insertions(+), 26 deletions(-)
 create mode 100644 hooks/validators/validate_discover.py

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 8119710..0cec48e 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -9,7 +9,11 @@
   "plugins": [
     {
       "name": "autonoma-test-planner",
-      "source": "./",
+      "source": {
+        "source": "url",
+        "url": "https://github.com/IgnacioPardo/test-planner-plugin-sc-v2.git",
+        "ref": "IgnacioPardo/sdk-scenarios"
+      },
       "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
     }
   ]
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index bade427..2de57c6 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,11 +1,8 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.1.0",
+  "version": "1.2.1",
   "author": {
     "name": "Autonoma"
-  },
-  "commands": [
-    "./commands"
-  ]
+  }
 }
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 73754fe..f2c1c4d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -14,5 +14,5 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.11"
-      - run: pip install pytest pyyaml
+      - run: pip install pytest pyyaml Faker
       - run: pytest tests/ -v
diff --git a/CLAUDE.md b/CLAUDE.md
index 3822134..c7642a5 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -10,9 +10,9 @@ commands/generate-tests.md  # Entry point — dispatches the 4-step pipeline
 skills/generate-tests/SKILL.md  # Orchestrator skill
 agents/                   # Isolated subagents (one per step)
   kb-generator.md         # Step 1: Knowledge base → autonoma/AUTONOMA.md + features.json
-  scenario-generator.md   # Step 2: Scenarios → autonoma/scenarios.md
+  scenario-generator.md   # Step 2: Discover + scenarios → autonoma/discover.json + autonoma/scenarios.md
   test-case-generator.md  # Step 3: Tests → autonoma/qa-tests/INDEX.md + test files
-  env-factory-generator.md # Step 4: Environment factory endpoint
+  env-factory-generator.md # Step 4: Environment Factory implementation/integration + scenario validation
 hooks/
   hooks.json              # PostToolUse hook config (triggers on Write)
   validate-pipeline-output.sh  # Bash dispatcher → routes to Python validators
@@ -23,7 +23,7 @@ hooks/
 
 Each step spawns an isolated subagent. After each Write, the PostToolUse hook in `hooks/hooks.json` runs `validate-pipeline-output.sh`, which pattern-matches the file path and runs the appropriate Python validator. Validators exit 0 (OK) or 2 (block with error message).
 
-Steps 1-3 require user confirmation before advancing. Step 4 is the final step (no gate).
+Steps 1-3 require user confirmation before advancing. Step 4 is the final step.
 
 ## Validation
 
@@ -32,8 +32,10 @@ Validators are in `hooks/validators/`. They parse YAML frontmatter and check req
 | Validator | File matched | Key checks |
 |-----------|-------------|------------|
 | `validate_kb.py` | `*/autonoma/AUTONOMA.md` | app_name, app_description (≥20 chars), core_flows with at least one `core: true` |
+| `validate_discover.py` | `*/autonoma/discover.json` | schema object, models, edges, relations, scopeField |
 | `validate_features.py` | `*/autonoma/features.json` | features array length matches total_features, valid types, at least one core feature |
-| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario_count ≥ 3, standard/empty/large scenarios present, entity_types |
+| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario_count ≥ 3, standard/empty/large scenarios present, entity_types, discover metadata, variable field strategy |
+| `validate_scenario_recipes.py` | `*/autonoma/scenario-recipes.json` | approved recipe file, validation mode, standard/empty/large present, lifecycle status |
 | `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals match folder sums, criticality sums, cross-checks against features.json |
 | `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | title, description, criticality (critical/high/mid/low), scenario, flow |
 
diff --git a/README.md b/README.md
index d264d6d..176ea66 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,9 @@ Analyzes your frontend codebase and produces `autonoma/AUTONOMA.md` — a user-p
 
 ### Step 2: Scenarios
 
-Reads the knowledge base and your backend data model to design three test data environments: `standard` (realistic variety), `empty` (empty states), and `large` (pagination/performance). Outputs `autonoma/scenarios.md` with frontmatter summarizing each scenario.
+Reads the knowledge base and the SDK `discover` response from your backend Environment Factory to design three test data environments: `standard` (realistic variety), `empty` (empty states), and `large` (pagination/performance). Outputs `autonoma/discover.json` plus `autonoma/scenarios.md`, preserving the legacy scenario summary while adding schema metadata and minimal variable-field planning.
 
-**You review**: entity names, counts, and relationships. These become hard assertions in your tests.
+**You review**: entity names, counts, relationships, and which values truly must stay generated. Fixed values are preferred because they become stable test assertions; if uniqueness is needed, the planner should first prefer concrete hardcoded values with a discriminator. Variable fields are exceptions used only for genuinely dynamic values. Generator hints are optional and are not tied to `faker`.
 
 ### Step 3: E2E Tests
 
@@ -48,13 +48,81 @@ Generates markdown test files organized by feature in `autonoma/qa-tests/`. Each
 
 An `INDEX.md` tracks total test count, folder breakdown, and coverage correlation with your codebase size.
 
+`scenarios.md` is fixture input for this step, not the subject under test. Step 3 should not spend test budget verifying seeded counts or Environment Factory correctness.
+
 **You review**: test distribution and coverage correlation. Test count should roughly match 3-5x your route/feature count.
 
 ### Step 4: Environment Factory
 
-Implements an endpoint in your backend that creates and tears down isolated test data for each scenario. Handles `discover`, `up`, and `down` actions with HMAC-SHA256 request signing and JWT-signed refs for safe teardown.
+Implements or completes the backend Environment Factory so the planned scenarios can actually be created and torn down through the current SDK contract. Step 4 includes backend wiring plus validation: `discover`, `up`, `down`, request signing, refs signing, a smoke-tested lifecycle, and validation of the planned scenarios with `autonoma/scenario-recipes.json`. After validation, the plugin uploads the parsed recipe document to the setup API through the dedicated `scenario-recipe-versions` route so Step 04 in `agent` can persist normalized scenario data directly.
+
+**You review**: where the Environment Factory lives, what changed, whether a smoke `discover` → `up` → `down` check passed, and whether `standard`, `empty`, and `large` all passed lifecycle validation.
+
+## Scenario Recipes
+
+`autonoma/scenario-recipes.json` is the validated handoff between planning and execution. It is produced in Step 4 after the Environment Factory has been implemented or verified and after each scenario has passed lifecycle validation.
+
+The file contains:
+
+- top-level metadata: `version`, `source`, and `validationMode`
+- one recipe per named scenario, usually `standard`, `empty`, and `large`
+- for each recipe:
+  - `name` and `description`
+  - `create`: the inline data graph Autonoma will send to the SDK `up` action
+  - `validation`: proof that the recipe passed `checkScenario`, `checkAllScenarios`, or endpoint lifecycle validation
+
+Conceptually, a scenario recipe is not a test case. It is a data fixture definition for the Environment Factory. The `create` payload describes which records should exist before a run starts, including nested records and references such as `_alias` and `_ref`.
+
+Example shape:
+
+```json
+{
+  "version": 1,
+  "source": {
+    "discoverPath": "autonoma/discover.json",
+    "scenariosPath": "autonoma/scenarios.md"
+  },
+  "validationMode": "sdk-check",
+  "recipes": [
+    {
+      "name": "standard",
+      "description": "Realistic baseline workspace",
+      "create": {
+        "User": [{ "email": "{{owner_email}}" }]
+      },
+      "variables": {
+        "owner_email": {
+          "strategy": "derived",
+          "source": "testRunId",
+          "format": "owner+{testRunId}@example.com"
+        }
+      },
+      "validation": {
+        "status": "validated",
+        "method": "checkScenario",
+        "phase": "ok"
+      }
+    }
+  ]
+}
+```
+
+Persisted recipes store tokenized `create` payloads plus `variables` metadata — never resolved concrete values. The `variables` field defines how each `{{token}}` is resolved at runtime using one of three strategies: `literal`, `derived` (from `testRunId`), or `faker`. This allows the `agent` to resolve the same tokens later for real runs.
+
+During Step 4, the plugin runs a preflight check that resolves tokens into transient concrete payloads and sends signed `up`/`down` requests to the live SDK endpoint. The write hook also enforces that same preflight before a final `autonoma/scenario-recipes.json` write is accepted. These transient values are never persisted.
+
+Storage semantics:
 
-**You review**: implementation plan before any code is written. The endpoint never modifies existing data.
+- in this plugin repo, `autonoma/scenario-recipes.json` is a local output artifact so the user and validators can inspect it
+- when uploaded to `agent`, the backend does not keep the raw JSON file as text
+- instead, `agent` parses the document and stores the approved scenario recipe data in its scenario JSONB storage through the `scenario-recipe-versions` setup endpoint
+
+Runtime semantics:
+
+- the planner still thinks in named scenarios like `standard`, `empty`, and `large`
+- the SDK protocol does not require those names on the wire
+- before a run, Autonoma resolves the active stored recipe version for the selected scenario and sends its `create` payload to the Environment Factory `up` action
+- after the run, Autonoma calls `down` using the returned teardown refs/token
 
 ## Validation
 
@@ -63,24 +131,26 @@ Every output file has YAML frontmatter validated by shell scripts (not prompts).
 | File | What's validated |
 |------|-----------------|
 | `AUTONOMA.md` | core_flows table, app description, feature/skill counts |
-| `scenarios.md` | scenario count, required scenarios (standard/empty/large), entity types |
+| `discover.json` | SDK discover schema shape: models, edges, relations, scopeField, and supported `type` formats |
+| `scenarios.md` | scenario count, required scenarios (standard/empty/large), entity types, discover metadata, minimal variable fields |
+| `scenario-recipes.json` | validated recipe file, discover-aware model/field/type parity, required scenarios, optional variables consistency, and mandatory live endpoint preflight |
 | `INDEX.md` | test totals match folder sums, criticality counts sum correctly, test count within expected range |
 | Each test file | title, description, criticality (critical/high/mid/low), scenario, flow |
 
-## Environment Variables (Step 4)
+## Environment Variables
 
-Step 4 requires two secrets for the Environment Factory endpoint:
+Step 2 and Step 4 use the live SDK endpoint when fetching `discover` or validating through HTTP:
 
 ```bash
-# Generate secrets
-openssl rand -hex 32  # AUTONOMA_SIGNING_SECRET
-openssl rand -hex 32  # AUTONOMA_JWT_SECRET
+AUTONOMA_SDK_ENDPOINT=<your sdk endpoint url>
+AUTONOMA_SHARED_SECRET=<shared HMAC secret>
 ```
 
-Add to your `.env`:
-```
-AUTONOMA_SIGNING_SECRET=<first-value>
-AUTONOMA_JWT_SECRET=<second-value>
+Step 4 backend implementation uses the current SDK secret names:
+
+```bash
+AUTONOMA_SHARED_SECRET=<shared HMAC secret>
+AUTONOMA_SIGNING_SECRET=<private refs signing secret>
 ```
 
 ## Requirements
@@ -115,8 +185,11 @@ autonoma-test-planner/
 ├── hooks/
 │   ├── hooks.json                      # PostToolUse hook config
 │   ├── validate-pipeline-output.sh     # Validation dispatcher
+│   ├── preflight_scenario_recipes.py   # Preflight resolver + endpoint lifecycle checker
 │   └── validators/
 │       ├── validate_kb.py
+│       ├── validate_discover.py
+│       ├── validate_scenario_recipes.py
 │       ├── validate_scenarios.py
 │       ├── validate_test_index.py
 │       └── validate_test_file.py
diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh
index 5fda0fe..c64d763 100755
--- a/hooks/validate-pipeline-output.sh
+++ b/hooks/validate-pipeline-output.sh
@@ -16,6 +16,11 @@ fi
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 VALIDATORS_DIR="$SCRIPT_DIR/validators"
 
+# Persist the plugin root so orchestrator/subagent bash snippets can find plugin-local scripts.
+# This hook is the earliest reliable place where we know the plugin directory.
+PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+echo "$PLUGIN_ROOT" > /tmp/autonoma-plugin-root
+
 # Ensure PyYAML is available (required for frontmatter parsing)
 python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null
 
@@ -25,6 +30,10 @@ case "$FILE_PATH" in
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_kb.py"
     VALIDATOR_NAME="validate-kb"
     ;;
+  */autonoma/discover.json)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_discover.py"
+    VALIDATOR_NAME="validate-discover"
+    ;;
   */autonoma/features.json)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_features.py"
     VALIDATOR_NAME="validate-features"
diff --git a/hooks/validators/validate_discover.py b/hooks/validators/validate_discover.py
new file mode 100644
index 0000000..102cc8c
--- /dev/null
+++ b/hooks/validators/validate_discover.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Validate autonoma/discover.json: print 'OK' if the structure is valid, else print an error and exit 1."""
+import json
+import re
+import sys
+
+
+TYPE_PATTERN = re.compile(r"^(?:[A-Za-z][A-Za-z0-9_]*|enum\([^()]+\))(?:\[\])?$")  # identifier or enum(...), optionally followed by []
+
+
+filepath = sys.argv[1]  # path of the file to validate, taken from the first CLI argument
+
+try:
+    with open(filepath, encoding='utf-8') as fh:  # explicit UTF-8: do not depend on the locale default encoding
+        payload = json.load(fh)
+except Exception as e:  # script boundary: any read/decode/parse failure is reported as a validation error
+    print(f'Invalid JSON: {e}')
+    sys.exit(1)
+
+if not isinstance(payload, dict):
+    print('discover.json must contain a JSON object')
+    sys.exit(1)
+
+schema = payload.get('schema')
+if not isinstance(schema, dict):
+    print('discover.json must contain a "schema" object')
+    sys.exit(1)
+
+required_schema_fields = ['models', 'edges', 'relations', 'scopeField']
+missing = [f for f in required_schema_fields if f not in schema]
+if missing:
+    print(f'schema is missing required fields: {missing}')
+    sys.exit(1)
+
+models = schema.get('models')
+if not isinstance(models, list) or len(models) == 0:
+    print('schema.models must be a non-empty list')
+    sys.exit(1)
+
+for i, model in enumerate(models):  # each model needs a non-blank name and a list of field descriptors
+    if not isinstance(model, dict):
+        print(f'schema.models[{i}] must be an object')
+        sys.exit(1)
+    if not isinstance(model.get('name'), str) or not model.get('name', '').strip():
+        print(f'schema.models[{i}].name must be a non-empty string')
+        sys.exit(1)
+    fields = model.get('fields')
+    if not isinstance(fields, list):
+        print(f'schema.models[{i}].fields must be a list')
+        sys.exit(1)
+    for j, field in enumerate(fields):
+        if not isinstance(field, dict):
+            print(f'schema.models[{i}].fields[{j}] must be an object')
+            sys.exit(1)
+        for key in ['name', 'type', 'isRequired', 'isId', 'hasDefault']:  # presence only; just 'type' is checked further
+            if key not in field:
+                print(f'schema.models[{i}].fields[{j}] missing required field: {key}')
+                sys.exit(1)
+        field_type = field.get('type')
+        if not isinstance(field_type, str) or len(field_type.strip()) == 0:
+            print(f'schema.models[{i}].fields[{j}].type must be a non-empty string')
+            sys.exit(1)
+        if TYPE_PATTERN.match(field_type.strip()) is None:  # surrounding whitespace is tolerated
+            print(
+                f'schema.models[{i}].fields[{j}].type must use a supported type format, got: {field_type}'
+            )
+            sys.exit(1)
+
+edges = schema.get('edges')
+if not isinstance(edges, list):  # an empty list is accepted
+    print('schema.edges must be a list')
+    sys.exit(1)
+
+for i, edge in enumerate(edges):
+    if not isinstance(edge, dict):
+        print(f'schema.edges[{i}] must be an object')
+        sys.exit(1)
+    for key in ['from', 'to', 'localField', 'foreignField', 'nullable']:  # presence only, values are not inspected
+        if key not in edge:
+            print(f'schema.edges[{i}] missing required field: {key}')
+            sys.exit(1)
+
+relations = schema.get('relations')
+if not isinstance(relations, list):  # an empty list is accepted
+    print('schema.relations must be a list')
+    sys.exit(1)
+
+for i, relation in enumerate(relations):
+    if not isinstance(relation, dict):
+        print(f'schema.relations[{i}] must be an object')
+        sys.exit(1)
+    for key in ['parentModel', 'childModel', 'parentField', 'childField']:  # presence only, values are not inspected
+        if key not in relation:
+            print(f'schema.relations[{i}] missing required field: {key}')
+            sys.exit(1)
+
+scope_field = schema.get('scopeField')
+if not isinstance(scope_field, str) or len(scope_field.strip()) == 0:
+    print('schema.scopeField must be a non-empty string')
+    sys.exit(1)
+
+print('OK')
diff --git a/hooks/validators/validate_scenarios.py b/hooks/validators/validate_scenarios.py
index eb77f5c..9bbbaec 100644
--- a/hooks/validators/validate_scenarios.py
+++ b/hooks/validators/validate_scenarios.py
@@ -26,7 +26,7 @@
     sys.exit(1)
 
 # Required fields
-required = ['scenario_count', 'scenarios', 'entity_types']
+required = ['scenario_count', 'scenarios', 'entity_types', 'discover', 'variable_fields', 'planning_sections']
 missing = [f for f in required if f not in fm]
 if missing:
     print(f'Missing required frontmatter fields: {missing}')
@@ -73,4 +73,102 @@
         print(f'entity_types[{i}] must be a mapping with at least a "name" field')
         sys.exit(1)
 
+# Validate discover metadata: required keys, sdk source, integer counts, non-blank scope field
+discover = fm.get('discover')
+if not isinstance(discover, dict):
+    print('discover must be a mapping')
+    sys.exit(1)
+
+for field in ['source', 'model_count', 'edge_count', 'relation_count', 'scope_field']:
+    if field not in discover:
+        print(f'discover missing required field: {field}')
+        sys.exit(1)
+
+if discover.get('source') != 'sdk':
+    print('discover.source must be exactly "sdk"')
+    sys.exit(1)
+
+for field in ['model_count', 'edge_count', 'relation_count']:
+    value = discover.get(field)
+    if isinstance(value, bool) or not isinstance(value, int) or value < 0:  # bool subclasses int; YAML true/false is not a count
+        print(f'discover.{field} must be a non-negative integer')
+        sys.exit(1)
+
+scope_field = discover.get('scope_field')
+if not isinstance(scope_field, str) or len(scope_field.strip()) == 0:
+    print('discover.scope_field must be a non-empty string')
+    sys.exit(1)
+
+if discover.get('model_count') == 0:  # edge/relation counts may be zero, the model count may not
+    print('discover.model_count must be greater than 0')
+    sys.exit(1)
+
+# Validate variable_fields: a list (possibly empty) of token descriptors tied to known scenarios
+variable_fields = fm.get('variable_fields')
+if not isinstance(variable_fields, list):
+    print('variable_fields must be a list')
+    sys.exit(1)
+
+for i, variable in enumerate(variable_fields):
+    if not isinstance(variable, dict):
+        print(f'variable_fields[{i}] must be a mapping')
+        sys.exit(1)
+    for field in ['token', 'entity', 'scenarios', 'reason', 'test_reference']:
+        if field not in variable:
+            print(f'variable_fields[{i}] missing required field: {field}')
+            sys.exit(1)
+
+    token = variable.get('token')
+    if not isinstance(token, str) or len(token) < 5 or not token.startswith('{{') or not token.endswith('}}'):
+        print(f'variable_fields[{i}].token must use double curly braces, e.g. {{{{title}}}}')  # quadrupled braces print literal {{title}}
+        sys.exit(1)
+
+    for field in ['entity', 'reason', 'test_reference']:
+        value = variable.get(field)
+        if not isinstance(value, str) or len(value.strip()) == 0:
+            print(f'variable_fields[{i}].{field} must be a non-empty string')
+            sys.exit(1)
+
+    if 'generator' in variable:  # optional hint; only validated when the key is present
+        generator = variable.get('generator')
+        if not isinstance(generator, str) or len(generator.strip()) == 0:
+            print(f'variable_fields[{i}].generator must be a non-empty string if present')
+            sys.exit(1)
+
+    scenario_names = variable.get('scenarios')
+    if not isinstance(scenario_names, list) or len(scenario_names) == 0:
+        print(f'variable_fields[{i}].scenarios must be a non-empty list')
+        sys.exit(1)
+    unknown_names = [name for name in scenario_names if name not in found_names]  # found_names is built earlier in this script
+    if unknown_names:
+        print(f'variable_fields[{i}].scenarios has unknown scenario names: {unknown_names}')
+        sys.exit(1)
+
+# Validate planning_sections metadata: must contain exactly the required section identifiers
+planning_sections = fm.get('planning_sections')
+if not isinstance(planning_sections, list) or len(planning_sections) == 0:
+    print('planning_sections must be a non-empty list')
+    sys.exit(1)
+
+required_sections = {
+    'sdk_discover',
+    'schema_summary',
+    'relationship_map',
+    'variable_data_strategy',
+}
+invalid_sections = [section for section in planning_sections if not isinstance(section, str) or len(section.strip()) == 0]
+if invalid_sections:  # reject non-string/blank entries first so the set() below only sees strings
+    print('planning_sections must contain only non-empty strings')
+    sys.exit(1)
+
+missing_sections = required_sections - set(planning_sections)
+if missing_sections:
+    print(f'Missing required planning_sections: {sorted(missing_sections)}')  # sorted: stable message across runs
+    sys.exit(1)
+
+for section in planning_sections:
+    if section not in required_sections:
+        print(f'planning_sections contains unknown value: {section}')
+        sys.exit(1)
+
 print('OK')

From 804059b8ae0ae061743421dd69ae5d3a2bfc64f3 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Tue, 7 Apr 2026 19:24:40 -0300
Subject: [PATCH 02/33] feat: scenario recipe creation and preflight validation

Add token resolution engine with literal/derived/faker strategies,
recipe schema validator, live endpoint preflight, and updated agent
guidance for scenario generation flow.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 agents/env-factory-generator.md               | 352 +++++++++++-------
 agents/scenario-generator.md                  | 118 +++++-
 agents/test-case-generator.md                 |  25 +-
 commands/generate-tests.md                    | 156 ++++++--
 hooks/preflight_scenario_recipes.py           | 319 ++++++++++++++++
 hooks/validate-pipeline-output.sh             |  22 ++
 hooks/validators/validate_scenario_recipes.py | 332 +++++++++++++++++
 skills/generate-tests/SKILL.md                | 248 +++++++++---
 8 files changed, 1335 insertions(+), 237 deletions(-)
 create mode 100644 hooks/preflight_scenario_recipes.py
 create mode 100644 hooks/validators/validate_scenario_recipes.py

diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md
index 85d6ba7..5e18487 100644
--- a/agents/env-factory-generator.md
+++ b/agents/env-factory-generator.md
@@ -1,8 +1,8 @@
 ---
 description: >
-  Implements the Autonoma Environment Factory endpoint in the project's backend.
-  Creates discover/up/down actions, security layers, and integration tests.
-  Tests the implementation within the session before completing.
+  Implements or completes the Autonoma Environment Factory in the project's backend.
+  Extends an existing SDK integration when possible, wires discover/up/down behavior to the
+  planned scenarios, then validates the planned scenarios against the lifecycle before completing.
 tools:
   - Read
   - Glob
@@ -17,8 +17,23 @@ maxTurns: 60
 
 # Environment Factory Generator
 
-You implement the Autonoma Environment Factory endpoint in the project's backend.
-Your input is `autonoma/scenarios.md`. Your output is working endpoint code with tests.
+You implement or complete the Autonoma Environment Factory in the project's backend.
+Your inputs are `autonoma/discover.json`, `autonoma/scenarios.md`, and the backend codebase.
+Your output is working backend code plus validated scenario recipes.
+
+## Goal
+
+Step 2 already proved that the backend can answer `discover`, or at least that there is enough
+of an Environment Factory integration to expose schema metadata. Step 4's job is to finish the
+real backend implementation for scenario creation and teardown, then validate the planned scenarios
+against that implementation:
+
+1. make sure the backend exposes the current SDK protocol
+2. make sure `up` can create scenario data from inline `create` recipes
+3. make sure `down` can delete only the data created by `up`
+4. smoke-test the lifecycle in-session
+5. validate `standard`, `empty`, and `large`
+6. persist approved recipes to `autonoma/scenario-recipes.json`
 
 ## Instructions
 
@@ -28,156 +43,219 @@ Your input is `autonoma/scenarios.md`. Your output is working endpoint code with
    - `https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt`
    - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt`
 
-   Follow those instructions for how to implement the endpoint.
+   Follow the current SDK protocol from those docs. If the docs lag behind the repo, prefer the
+   real SDK contract already visible in the backend codebase.
 
-2. Read `autonoma/scenarios.md` — parse the frontmatter and full scenario data.
+2. Read `autonoma/discover.json` and `autonoma/scenarios.md`.
+   - `discover.json` is the schema source of truth
+   - `scenarios.md` is the planning layer that defines what `standard`, `empty`, and `large`
+     should look like
 
-3. Explore the backend codebase to understand:
-   - Framework (Next.js, Express, Elixir/Phoenix, etc.)
-   - Database layer (Prisma, Drizzle, raw SQL, Ecto, etc.)
-   - Authentication mechanism (session cookies, JWT, etc.)
-   - Existing route/endpoint patterns
+3. Explore the backend codebase to determine:
+   - whether the Autonoma SDK is already installed
+   - where the Environment Factory endpoint lives
+   - which parts already exist: `discover`, `up`, `down`, auth callback, teardown helpers
+   - what framework and ORM patterns the backend already uses
 
 ## CRITICAL: Before Writing Any Code
 
-**Ask the user for confirmation** before implementing. Present your plan:
+Ask the user for confirmation before implementing. Present a short plan:
 
-> "I'm about to implement the Autonoma Environment Factory endpoint. Here's what I'll do:
+> "I'm about to implement or complete the Autonoma Environment Factory. Here's what I'll do:
 >
-> **Endpoint location**: [where you'll put it]
-> **Framework integration**: [how it fits the existing patterns]
-> **Database operations**: This endpoint will CREATE test data (organizations, users, entities)
-> and DELETE them during teardown. It will NOT modify or delete any existing data.
-> **Security**: HMAC-SHA256 request signing + JWT-signed refs for safe teardown
+> **Endpoint location**: [route / handler path]
+> **Current state**: [what already exists vs what is missing]
+> **Step 4 scope**: make discover/up/down work with the current SDK contract and validate the planned scenarios against it
+> **Database operations**: `up` will create isolated test data and `down` will delete only those created refs
+> **Security**: HMAC-SHA256 request signing with `AUTONOMA_SHARED_SECRET` plus signed refs tokens with `AUTONOMA_SIGNING_SECRET`
 >
 > **Environment variables needed**:
-> - `AUTONOMA_SIGNING_SECRET` — shared secret for HMAC request verification
-> - `AUTONOMA_JWT_SECRET` — secret for signing/verifying refs tokens
->
-> To generate these secrets, run:
-> ```bash
-> openssl rand -hex 32
-> ```
-> Run this command TWICE — once for each secret. Use DIFFERENT values for each.
-> Set them in your `.env` file (or equivalent):
-> ```
-> AUTONOMA_SIGNING_SECRET=<first-value>
-> AUTONOMA_JWT_SECRET=<second-value>
-> ```
+> - `AUTONOMA_SHARED_SECRET`
+> - `AUTONOMA_SIGNING_SECRET`
 >
 > Shall I proceed?"
 
-**Do NOT proceed until the user confirms.**
+Do NOT proceed until the user confirms.
 
 ## Implementation Requirements
 
-### Always Implement on the Backend
-
-Find the project's backend and implement the endpoint there. Look for:
-- API route directories (e.g., `app/api/`, `pages/api/`, `src/routes/`, `lib/`)
-- Existing endpoint patterns to match
-- If it's a monorepo, find the backend package/app
-
-If you can't find the backend, ask the user where it is.
-
-### Environment Variables
-
-Always use these exact names:
-- `AUTONOMA_SIGNING_SECRET` — for HMAC-SHA256 request verification
-- `AUTONOMA_JWT_SECRET` — for JWT signing of refs tokens
-
-### Security Layers (All Required)
-
-1. **Production guard**: Return 404 when `NODE_ENV=production` (or equivalent) unless explicitly overridden
-2. **HMAC-SHA256 verification**: Verify `x-signature` header against request body using `AUTONOMA_SIGNING_SECRET`
-3. **Signed refs (JWT)**: Sign refs in `up` response, verify in `down` request using `AUTONOMA_JWT_SECRET`
-
-### Creation and Teardown Order
-
-- **Up**: Create parent entities before children (org → users → projects → tests → runs)
-- **Down**: Delete in REVERSE order (runs → tests → projects → users → org)
-- Do NOT rely on ORM cascade behavior — explicit deletion is safer
-- Use `testRunId` in all unique fields to prevent parallel test collisions
-
-### Endpoint Actions
-
-| Action     | Purpose                        |
-|------------|-------------------------------|
-| `discover` | Return available scenarios     |
-| `up`       | Create scenario data, return auth + refs |
-| `down`     | Verify refs token, delete data |
-
-## CRITICAL: Test Within the Session
-
-After implementing the endpoint, you MUST test it to verify it works:
-
-1. **Check if the dev server is running** or start it
-2. **Generate temporary secrets** for testing:
-   ```bash
-   export AUTONOMA_SIGNING_SECRET=$(openssl rand -hex 32)
-   export AUTONOMA_JWT_SECRET=$(openssl rand -hex 32)
-   ```
-
-3. **Test the discover action**:
-   ```bash
-   BODY='{"action":"discover"}'
-   SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //')
-   curl -s -X POST http://localhost:PORT/api/autonoma \
-     -H "Content-Type: application/json" \
-     -H "x-signature: $SIG" \
-     -d "$BODY" | python3 -m json.tool
-   ```
-
-4. **Test the up action** (for each scenario):
-   ```bash
-   BODY='{"action":"up","environment":"standard","testRunId":"test-001"}'
-   SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //')
-   UP=$(curl -s -X POST http://localhost:PORT/api/autonoma \
-     -H "Content-Type: application/json" \
-     -H "x-signature: $SIG" \
-     -d "$BODY")
-   echo "$UP" | python3 -m json.tool
-   ```
-
-5. **Test the down action** using refs from up:
-   ```bash
-   REFS=$(echo "$UP" | python3 -c "import sys,json; print(json.dumps(json.load(sys.stdin)['refs']))")
-   TOKEN=$(echo "$UP" | python3 -c "import sys,json; print(json.load(sys.stdin)['refsToken'])")
-   BODY=$(python3 -c "import json; print(json.dumps({'action':'down','testRunId':'test-001','refs':json.loads('$REFS'),'refsToken':'$TOKEN'}))")
-   SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SIGNING_SECRET" | sed 's/.*= //')
-   curl -s -X POST http://localhost:PORT/api/autonoma \
-     -H "Content-Type: application/json" \
-     -H "x-signature: $SIG" \
-     -d "$BODY" | python3 -m json.tool
-   ```
-
-6. **Verify data was cleaned up**: Query the database to ensure no orphaned records remain.
-
-If any test fails, fix the implementation and re-test.
+### Build on the existing backend
+
+- Prefer extending the existing Environment Factory endpoint over replacing it
+- Match the backend's framework, ORM, and route conventions
+- Do not create a separate throwaway server
+
+### Current SDK contract
+
+Implement or preserve these actions:
+
+| Action | Purpose |
+|--------|---------|
+| `discover` | Return schema metadata: version, sdk info, models, edges, relations, scopeField |
+| `up` | Accept inline `create` payloads plus optional `testRunId`, create data, return `auth`, `refs`, and `refsToken` |
+| `down` | Accept `refsToken`, verify it, and tear down the created data |
+
+### Security requirements
+
+Use these exact environment variable names:
+- `AUTONOMA_SHARED_SECRET` — HMAC request verification secret shared with Autonoma
+- `AUTONOMA_SIGNING_SECRET` — private secret for signing and verifying refs tokens
+
+Required protections:
+1. production guard: reject requests in production environments unless explicitly allowed
+2. HMAC-SHA256 verification of the `x-signature` header
+3. signed refs tokens for teardown
+
+### Scenario implementation guidance
+
+- Use `autonoma/scenarios.md` to decide what data the backend needs to support
+- Preserve generated fields as generated values; do not force everything into static literals
+- Make unique fields depend on `testRunId` when needed
+- Prefer explicit create and teardown ordering based on the schema
+- If `discover` already works but `up` / `down` do not, keep the introspection path and finish the lifecycle
+
+## CRITICAL: Smoke-Test and Validate Within the Session
+
+After implementing, test the lifecycle in-session.
+
+At minimum:
+1. confirm `discover` still works
+2. send one signed `up` request with a small inline `create` payload compatible with the schema
+3. send the corresponding signed `down` request using the returned `refsToken`
+4. verify cleanup succeeds
+
+After the wiring works, validate `standard`, `empty`, and `large` against the backend.
+Prefer:
+1. backend-local `checkScenario` / `checkAllScenarios`
+2. signed endpoint `up` / `down` validation if local SDK checks are not practical
+
+Write the approved results to `autonoma/scenario-recipes.json`.
+
+## CRITICAL: scenario-recipes.json must match the current setup API schema
+
+The file must be a JSON object in this exact logical shape:
+
+```json
+{
+  "version": 1,
+  "source": {
+    "discoverPath": "autonoma/discover.json",
+    "scenariosPath": "autonoma/scenarios.md"
+  },
+  "validationMode": "sdk-check",
+  "recipes": [
+    {
+      "name": "standard",
+      "description": "Realistic dataset for core flows",
+      "create": {
+        "User": [
+          {
+            "email": "{{owner_email}}"
+          }
+        ]
+      },
+      "variables": {
+        "owner_email": {
+          "strategy": "derived",
+          "source": "testRunId",
+          "format": "owner+{testRunId}@example.com"
+        }
+      },
+      "validation": {
+        "status": "validated",
+        "method": "checkScenario",
+        "phase": "ok",
+        "up_ms": 12,
+        "down_ms": 8
+      }
+    }
+  ]
+}
+```
+
+Required rules:
+- top-level keys must be `version`, `source`, `validationMode`, and `recipes`
+- `version` must be the integer `1`
+- `source.discoverPath` must be `autonoma/discover.json`
+- `source.scenariosPath` must be `autonoma/scenarios.md`
+- `validationMode` must be `sdk-check` or `endpoint-lifecycle`
+- `recipes` must include `standard`, `empty`, and `large`
+- every recipe must contain `name`, `description`, `create`, and `validation`
+- every `validation` object must contain:
+  - `status: "validated"`
+  - `method`: one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`
+  - `phase: "ok"`
+  - optional `up_ms` / `down_ms` as non-negative integers
+
+### Per-recipe `variables` (required when `create` uses tokens)
+
+If `create` contains `{{token}}` placeholders, the recipe MUST include a `variables` object that
+defines how each token is resolved. The persisted `create` remains tokenized — concrete values are
+never stored. The `variables` field stores the planned generation logic so the `agent` can resolve
+tokens at runtime.
+
+Allowed strategies:
+- `literal` — `{ "strategy": "literal", "value": <scalar> }`
+- `derived` — `{ "strategy": "derived", "source": "testRunId", "format": "<template>" }`
+- `faker` — `{ "strategy": "faker", "generator": "<generator_id>" }`
+
+Allowed faker generators: `person.firstName`, `person.lastName`, `internet.email`, `company.name`, `lorem.words`.
+
+Rules:
+- every `{{token}}` in `create` must have a matching key in `variables`
+- every key in `variables` must be used as a `{{token}}` in `create`
+- fully concrete recipes (no tokens) do not need `variables`
+
+Do not write the old shape. In particular, do not use:
+- top-level `generatedAt`
+- top-level `scenarios`
+- per-recipe `validated`
+- per-recipe `timing`
+
+If you need timing data, map it into `validation.up_ms` and `validation.down_ms`.
+
+If any smoke test fails, fix the implementation and re-test.
+
+## CRITICAL: Preflight Endpoint Validation
+
+After generating tokenized recipes with `variables`, you MUST run a preflight check before
+treating `autonoma/scenario-recipes.json` as final. This is mandatory — backend-local
+`checkScenario` alone is NOT sufficient to complete Step 4.
+
+The preflight flow for each recipe:
+1. Generate a synthetic `testRunId`: `autonoma-preflight-<scenario>-<unix_ms>-<short_suffix>`
+2. Resolve all `{{token}}` placeholders using the `variables` definitions and the synthetic `testRunId`
+3. Send a signed `up` request to `AUTONOMA_SDK_ENDPOINT` with the resolved `create` payload
+4. Verify `up` returns `auth`, `refs`, and `refsToken`
+5. Send a signed `down` request with the returned `refs` and `refsToken`
+6. Verify `down` succeeds
+
+Run the preflight helper script:
+```bash
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json
+```
+
+This script requires the `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` environment variables.
+
+If preflight fails, do NOT upload the recipe file. Fix the recipe or backend issue and re-run.
+The transient concrete values used during preflight are never persisted.
 
 ## What to Explain to the User
 
-After implementation, explain:
-
-1. **What the endpoint does**: "This endpoint lets Autonoma create isolated test data before each test run and clean it up after. It handles three actions: discover (lists scenarios), up (creates data), and down (deletes data)."
-
-2. **Why it's secure**: "Three security layers protect your data:
-   - Production guard: The endpoint returns 404 in production
-   - Request signing: Every request is verified with HMAC-SHA256 using your signing secret
-   - Signed refs: Teardown can only delete data that was actually created by the endpoint, verified by JWT"
-
-3. **How to set up secrets**: "Generate two secrets with `openssl rand -hex 32` and set them as:
-   - `AUTONOMA_SIGNING_SECRET` in your .env file
-   - `AUTONOMA_JWT_SECRET` in your .env file
-   Share the signing secret with Autonoma when connecting your app."
-
-4. **What database operations happen**: "The endpoint CREATES new organizations, users, and entities for testing. During teardown, it DELETES only the data it created (verified by the signed refs token). It never modifies or deletes existing data."
+When finished, explain:
+1. where the Environment Factory lives in the backend
+2. what was added or fixed
+3. what env vars are required:
+   - `AUTONOMA_SHARED_SECRET`
+   - `AUTONOMA_SIGNING_SECRET`
+4. what smoke tests were run and whether the lifecycle succeeded
+5. whether `standard`, `empty`, and `large` validated successfully
+6. where `autonoma/scenario-recipes.json` was written
 
 ## Important
 
-- Always prefer implementing in the project's existing backend — don't create a standalone server
-- Match existing code patterns and conventions in the project
-- Use the same ORM/database layer the project already uses
-- Handle circular foreign keys with transaction-wrapped deletion
-- Always use `testRunId` to make unique fields (emails, org names) to prevent parallel test collisions
-- Test the FULL lifecycle (discover → up → down) within the session
+- Do not remove or rewrite existing working discover logic just because Step 2 now consumes it
+- Treat `discover.json` as the schema contract and `scenarios.md` as the scenario intent
+- Step 4 is both Environment Factory implementation/integration and scenario validation
+- Keep backend changes minimal and consistent with the repo's style
+- Do not claim rollback semantics unless the backend actually implements rollback
diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md
index cfb7aa3..bc033e9 100644
--- a/agents/scenario-generator.md
+++ b/agents/scenario-generator.md
@@ -1,7 +1,7 @@
 ---
 description: >
   Generates test data scenarios from a knowledge base.
-  Reads AUTONOMA.md and produces scenarios.md with three named test data environments.
+  Reads AUTONOMA.md plus SDK discover output and produces scenarios.md with three named test data environments.
   Output has YAML frontmatter with scenario summaries for deterministic validation.
 tools:
   - Read
@@ -16,8 +16,9 @@ maxTurns: 40
 
 # Scenario Generator
 
-You generate test data scenarios from a knowledge base. Your input is `autonoma/AUTONOMA.md`
-and `autonoma/skills/`. Your output MUST be written to `autonoma/scenarios.md` with YAML frontmatter.
+You generate test data scenarios from a knowledge base. Your inputs are `autonoma/AUTONOMA.md`,
+`autonoma/skills/`, and `autonoma/discover.json`. Your output MUST be written to
+`autonoma/scenarios.md` with YAML frontmatter.
 
 ## Instructions
 
@@ -28,13 +29,58 @@ and `autonoma/skills/`. Your output MUST be written to `autonoma/scenarios.md` w
 
 2. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types.
 
-3. Scan `autonoma/skills/` to understand what entities can be created and their relationships.
+3. Read `autonoma/discover.json`. Treat the SDK `discover` response as the source of truth for:
+   - database models
+   - fields and requiredness
+   - foreign key edges
+   - parent/child relations
+   - scope field
 
-4. Explore the backend codebase to map the data model (database schema, API routes, types).
+   If `autonoma/discover.json` is missing or malformed, stop and tell the user that Step 2 now
+   requires a valid SDK discover artifact before scenario generation can continue.
 
-5. Design three scenarios: `standard`, `empty`, `large`.
+4. Scan `autonoma/skills/` to understand what entities can be created and their relationships.
 
-6. Write the output to `autonoma/scenarios.md`.
+5. Use the SDK discover schema plus the knowledge base to design three scenarios: `standard`, `empty`, `large`.
+
+6. Prefer hardcoded values when they make the resulting tests simpler, more reviewable, and more stable.
+   If a field needs run-level uniqueness but can still be expressed as a concrete literal, prefer a planner-chosen
+   hardcoded value with a discriminator suffix or prefix over introducing a variable placeholder.
+   Example: prefer a concrete scenario value such as `Acme Project qa-17` (a literal plus a discriminator suffix)
+   over turning the whole field into `{{project_title}}` unless later tests truly need the placeholder.
+
+   Use variable fields sparingly. Only mark a value as variable when at least one of these is true:
+   - the field must be globally unique or is highly collision-prone across runs
+   - the backend or SDK generates the value at runtime
+   - the value is inherently time-based, unstable, or nondeterministic
+   - hardcoding it would make later tests misleading or brittle
+
+   Do not mark a field as variable just because:
+   - it is user-facing text
+   - it could be unique in theory
+   - you want to avoid choosing a concrete literal
+
+   Every variable field must have:
+   - a double-curly token such as `{{project_title}}`
+   - the entity field it belongs to, such as `Project.title`
+   - the scenario names that use it
+   - a reason explaining why it truly must vary
+   - a plain-language test reference such as `({{project_title}} variable)`
+
+   `generator` is optional. If you include it, use a short free-form strategy note such as
+   `derived from testRunId`, `planner literal plus discriminator`, `backend-generated`, `UUID suffix`,
+   or `timestamp-based`.
+   Do not default to `faker`. Prefer deterministic derivation from stable inputs, and use `faker`
+   only as a last resort when deterministic strategies are not practical.
+
+   Good:
+   - use a concrete value such as `Acme Workspace qa-17` when the planner can safely choose it and append a discriminator
+   - only `{{owner_email}}` is variable because login requires uniqueness across runs
+
+   Bad:
+   - every user name, organization name, and label is variable with `faker.*` by default
+
+7. Write the output to `autonoma/scenarios.md`.
 
 ## CRITICAL: Output Format
 
@@ -62,6 +108,26 @@ entity_types:
   - name: "Test"
   - name: "Run"
   - name: "Folder"
+discover:
+  source: sdk
+  model_count: 12
+  edge_count: 18
+  relation_count: 16
+  scope_field: "organizationId"
+variable_fields:
+  - token: "{{project_title}}"
+    entity: "Project.title"
+    scenarios:
+      - standard
+      - large
+    generator: "planner literal plus discriminator"
+    reason: "title must be unique per test run"
+    test_reference: "({{project_title}} variable)"
+planning_sections:
+  - sdk_discover
+  - schema_summary
+  - relationship_map
+  - variable_data_strategy
 ---
 ```
 
@@ -75,10 +141,31 @@ entity_types:
   - `total_entities`: Total count of entities created in this scenario
 - **entity_types**: List of ALL entity types discovered in the data model. Each has:
   - `name`: Entity type name (e.g., "User", "Project", "Run")
+- **discover**: Summary of the SDK discover artifact. It must include:
+  - `source`: exactly `sdk`
+  - `model_count`, `edge_count`, `relation_count`: counts from `autonoma/discover.json`
+  - `scope_field`: scope field name from `autonoma/discover.json`
+- **variable_fields**: List of generated or per-run values that tests must not treat as hardcoded literals.
+  Each entry has:
+  - `token`: double-curly placeholder such as `{{project_title}}`
+  - `entity`: entity field path such as `Project.title`
+  - `scenarios`: list of scenario names that use this variable
+  - `reason`: why this field must be generated
+  - `test_reference`: how tests should refer to the value in natural language
+  - optional `generator`: free-form generation hint such as `derived from testRunId` or `backend-generated`
+- **planning_sections**: A list describing which planning artifacts are present. It must include:
+  - `sdk_discover`
+  - `schema_summary`
+  - `relationship_map`
+  - `variable_data_strategy`
 
 ### After the frontmatter
 
 The rest of the file follows the standard scenarios.md format from the fetched instructions:
+- Include a `## SDK Discover` section summarizing the schema counts and scope field.
+- Include a `## Schema Summary` section listing the key models and required fields that drive the scenarios.
+- Include a `## Relationship Map` section describing the important parent/child and FK relationships.
+- Include a `## Variable Data Strategy` section explaining which values are generated and how tests should reference them.
 - Scenario: `standard` (credentials, entity tables with concrete data, aggregate counts)
 - Scenario: `empty` (credentials, all entity types listed as None)
 - Scenario: `large` (credentials, high-volume data described in aggregate)
@@ -90,17 +177,26 @@ you'll receive an error message. Fix the issue and rewrite the file.
 
 The validation checks:
 - File starts with `---` (YAML frontmatter)
-- Frontmatter contains scenario_count, scenarios, entity_types
+- Frontmatter contains scenario_count, scenarios, entity_types, discover, variable_fields
+- Frontmatter contains planning_sections metadata
 - scenarios list length matches scenario_count
 - Required scenarios (standard, empty, large) are present
 - Each scenario has name, description, entity_types, total_entities
 - entity_types is a non-empty list with name fields
+- discover includes sdk source, schema counts, and scope field
+- variable_fields entries use double-curly tokens and known scenario names
+- planning_sections includes sdk_discover, schema_summary, relationship_map, and variable_data_strategy
 
 ## Important
 
-- **The scenario data is a contract.** Tests will assert against these exact values.
+- **The scenario data is a contract.** Fixed values are hard assertions; variable fields are explicit placeholders.
+- Prefer concrete literals for seed data unless the field truly must vary across runs.
+- Use variables sparingly. A smaller, justified variable list is better than marking every identity field dynamic.
+- Do not default to `faker`. Prefer deterministic strategies such as planner-chosen literals with stable discriminator conventions, deriving from `testRunId`, or backend-generated values.
+- If a field can safely be a concrete literal for review and testing, keep it concrete.
+- Only include `generator` when the generation mechanism is important to communicate.
 - Every value must be concrete — not "some applications" but "3 applications: Marketing Website, Android App, iOS App"
 - Every relationship must be explicit — which entities belong to which
 - Every enum value must be covered in `standard`
-- Use subagents to parallelize data model discovery
-- If you can't find the database schema, ask the user where the backend is
+- Use the SDK discover output instead of re-deriving the schema from local code
+- If the discover artifact is missing, ask the user to provide a working SDK discover response
diff --git a/agents/test-case-generator.md b/agents/test-case-generator.md
index f4b8ec5..ea05c12 100644
--- a/agents/test-case-generator.md
+++ b/agents/test-case-generator.md
@@ -37,17 +37,30 @@ Your output is a directory `autonoma/qa-tests/` containing:
    - All files in `autonoma/skills/`
    - `autonoma/scenarios.md` — parse the frontmatter to get scenarios and entity_types
 
-3. Count the routes/features/pages in the codebase to establish the coverage correlation.
+3. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
+   The scenarios exist only to provide preconditions and known data for app behavior tests.
+   Do NOT generate tests whose purpose is to verify:
+   - that the scenario contains the documented entity counts
+   - that every scenario row, seed, or example value exists
+   - that the Environment Factory created data correctly
+   - that `standard`, `empty`, or `large` themselves are "correct" as artifacts
+
+   Only reference scenario data when it is necessary to exercise a real user-facing flow.
+   Example:
+   - good: "open the project `{{project_title}}` and verify editing works"
+   - bad: "verify the scenario created 12 projects and 3 users"
+
+4. Count the routes/features/pages in the codebase to establish the coverage correlation.
    The total test count should roughly correlate:
    - Rule of thumb: 3-5 tests per route/feature for supporting flows
    - Rule of thumb: 8-15 tests per core flow
    - This is approximate — use judgment, but the INDEX must declare the correlation
 
-4. Generate test files organized in subdirectories by feature/flow.
+5. Generate test files organized in subdirectories by feature/flow.
 
-5. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
+6. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
 
-6. Write individual test files into subdirectories.
+7. Write individual test files into subdirectories.
 
 ## CRITICAL: INDEX.md Format
 
@@ -145,6 +158,9 @@ The body follows the standard Autonoma test format from the fetched instructions
 - Never write conditional steps — each test follows one deterministic path
 - Assertions must specify exact text, element, or visual state
 - Reference scenario data by exact values from scenarios.md
+- Do not spend test budget "auditing" scenario contents. Scenario data is setup, not the product behavior under test.
+- Do not write meta-tests such as "verify the seeded counts match scenarios.md" or "verify the Environment Factory created the right fixtures"
+- If a seeded value is not needed for a user-facing flow, do not assert it just because it exists in scenarios.md
 
 ## Validation
 
@@ -169,3 +185,4 @@ you'll receive an error message. Fix the issue and rewrite the file.
 - Use subagents to parallelize test generation across folders
 - Each test must be self-contained — no dependencies on other tests
 - Do not write code (no Playwright, no Cypress) — tests are markdown with natural language steps
+- Prefer testing visible user outcomes over seed correctness or fixture inventory
diff --git a/commands/generate-tests.md b/commands/generate-tests.md
index c750253..5782d5c 100644
--- a/commands/generate-tests.md
+++ b/commands/generate-tests.md
@@ -13,15 +13,19 @@ You are orchestrating a 4-step test generation pipeline. Each step runs as an is
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## CRITICAL: User Confirmation Between Steps
+## User Confirmation Between Steps
 
-After each step (1, 2, and 3), you MUST present the summary and then ask the user for
+By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
 confirmation using the `AskUserQuestion` tool. This creates an interactive
 UI prompt that makes it clear the user needs to respond before the pipeline continues.
 
 After calling `AskUserQuestion`, wait for the user's response.
 Only proceed to the next step after they confirm.
 
+**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
+skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
+the summary. The summaries are still displayed — only the confirmation prompt is skipped.
+
 ## Before Starting
 
 Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
@@ -31,10 +35,16 @@ echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
 mkdir -p autonoma/skills autonoma/qa-tests
 ```
 
+The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+```bash
+PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
+```
+
 Read the environment variables. These are required for reporting progress back to Autonoma:
 - `AUTONOMA_API_KEY` — your Autonoma API key
 - `AUTONOMA_PROJECT_ID` — your Autonoma project ID
 - `AUTONOMA_API_URL` — Autonoma API base URL
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
 
 Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
 
@@ -119,10 +129,11 @@ print(json.dumps({'skills': skills}))
   -d @- || true
 ```
 
-4. Call `AskUserQuestion` with:
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
    - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
 
 ## Step 2: Generate Scenarios
 
@@ -141,17 +152,56 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
   -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true
 ```
 
+Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
+Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
+State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+
+Fetch and validate the artifact:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+mkdir -p "$AUTONOMA_ROOT/autonoma"
+BODY='{"action":"discover"}'
+SIG=$(printf '%s' "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
+  -H "Content-Type: application/json" \
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
+  exit 1
+fi
+printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json"
+```
+
+If the fetch fails or validation fails, stop the pipeline at Step 2.
+Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
+
 Spawn the `scenario-generator` subagent with the following task:
 
-> Read the knowledge base from `autonoma/AUTONOMA.md` and `autonoma/skills/`.
+> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
+> artifact from `autonoma/discover.json`.
 > Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
-> The file MUST have YAML frontmatter with scenario_count, scenarios summary, and entity_types.
+> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
+> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
+> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
+> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
+> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
 
 **After the subagent completes:**
-1. Verify `autonoma/scenarios.md` exists and is non-empty
-2. The PostToolUse hook will have validated the frontmatter format automatically
-3. Read the file and present the frontmatter summary to the user — scenario names, entity counts, entity types
+1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
+2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
+3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically
+4. Read `autonoma/scenarios.md` and present the summary to the user — scenario names, entity counts, entity types,
+   discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
 ```bash
@@ -161,17 +211,18 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Scenarios generated. 3 test data environments defined (standard, empty, large)."}}' || true
+  -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
   -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
 ```
 
-4. Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? The standard scenario data becomes hard assertions in your tests."
+5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
    - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
 
 ## Step 3: Generate E2E Test Cases
 
@@ -198,6 +249,9 @@ Spawn the `test-case-generator` subagent with the following task:
 > You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests,
 > total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
+> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
+> scenario data when it is needed to test a real user-facing app behavior.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
@@ -244,12 +298,13 @@ print(json.dumps({'testCases': test_cases}))
   -d @- || true
 ```
 
-4. Call `AskUserQuestion` with:
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app."
    - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
 
-## Step 4: Implement Environment Factory
+## Step 4: Environment Factory
 
 Report step start:
 ```bash
@@ -263,22 +318,53 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Implementing Environment Factory endpoint in your backend..."}}' || true
+  -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true
 ```
 
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
+preflight validation. State plainly that both environment variables are mandatory.
+
 Spawn the `env-factory-generator` subagent with the following task:
 
-> Read the scenarios from `autonoma/scenarios.md` and implement the Autonoma Environment Factory
-> endpoint in the project's backend. The endpoint handles discover/up/down actions.
+> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
+> Implement or complete the Autonoma Environment Factory in the project's backend so it can
+> support the planned scenarios with the current SDK contract, then validate the planned scenarios
+> against that implementation.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
 > and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
-> After implementing, run integration tests to verify the endpoint works.
-> Use AUTONOMA_SIGNING_SECRET and AUTONOMA_JWT_SECRET as environment variable names.
+> Preserve the existing discover integration if it already works, and finish `up` / `down`
+> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
+> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
+> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
+> The recipe file must match the current setup API schema:
+> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
+> `name`, `description`, `create`, and `validation` with `status: "validated"`,
+> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
+> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
+> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
+> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
+> Persisted `create` must remain tokenized — never store resolved concrete values.
+> After writing the recipe file, run the preflight helper to validate all recipes against the
+> live SDK endpoint before uploading:
+> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
+> The preflight must pass for all three scenarios before Step 4 is considered complete.
 
 **After the subagent completes:**
-1. Verify the endpoint was created and tests pass
-2. Present the results to the user — what was implemented, where, test results
-3. Report any issues that need manual attention
+1. Verify the backend implementation or integration changes were made
+2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+3. Run the preflight helper if the subagent did not already do so:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+```
+If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
+4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
+5. Report which environment variables the backend now requires
+6. Report any backend issues that still need manual attention
 
 Report step complete:
 ```bash
@@ -288,7 +374,25 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Environment Factory implemented and verified."}}' || true
+  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+if [ -n "$GENERATION_ID" ]; then
+  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+  if python3 -c "import json, sys; json.load(open(sys.argv[1]))" "$RECIPE_PATH" 2>/dev/null; then
+    UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d @"$RECIPE_PATH")
+    UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+    UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+    echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  else
+    echo "WARNING: scenario-recipes.json is not valid JSON, skipping upload"
+  fi
+fi
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
@@ -301,4 +405,4 @@ After all steps complete, summarize:
 - **Step 1**: Knowledge base location and core flow count
 - **Step 2**: Scenario count and entity types covered
 - **Step 3**: Total test count, folder breakdown, coverage correlation
-- **Step 4**: Endpoint location, test results, env var setup instructions
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
diff --git a/hooks/preflight_scenario_recipes.py b/hooks/preflight_scenario_recipes.py
new file mode 100644
index 0000000..b2416c2
--- /dev/null
+++ b/hooks/preflight_scenario_recipes.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""Preflight resolver and endpoint lifecycle checker for scenario recipes.
+
+Reads autonoma/scenario-recipes.json, resolves tokenized recipes into transient
+concrete payloads, then sends signed up/down requests to AUTONOMA_SDK_ENDPOINT
+for each recipe. Exits non-zero on any failure. Never rewrites the recipe file.
+"""
+import hashlib
+import hmac
+import json
+import os
+import re
+import sys
+import time
+import urllib.request
+
+# ---------------------------------------------------------------------------
+# Variable resolution
+# ---------------------------------------------------------------------------
+
+ALLOWED_STRATEGIES = {'literal', 'derived', 'faker'}
+ALLOWED_FAKER_GENERATORS = {
+    'person.firstName',
+    'person.lastName',
+    'internet.email',
+    'company.name',
+    'lorem.words',
+}
+
+# Seeded Faker generators — deterministic: same (testRunId + ":" + tokenName) → same value.
+# Uses the `Faker` library (pip install Faker) for realistic data generation.
+
+def _seed_int(seed_str: str) -> int:
+    return int(hashlib.sha256(seed_str.encode()).hexdigest(), 16)
+
+
+def _get_faker(seed_str: str):
+    """Return a seeded Faker instance."""
+    from faker import Faker
+    fake = Faker()
+    fake.seed_instance(_seed_int(seed_str))
+    return fake
+
+
+# Map generator ids to Faker method calls.
+_FAKER_METHOD_MAP = {
+    'person.firstName': lambda f: f.first_name(),
+    'person.lastName':  lambda f: f.last_name(),
+    'internet.email':   lambda f: f.email(),
+    'company.name':     lambda f: f.company(),
+    'lorem.words':      lambda f: ' '.join(f.words(3)),
+}
+
+
+def _faker_generate(generator: str, seed_str: str) -> str:
+    method = _FAKER_METHOD_MAP.get(generator)
+    if method is None:
+        raise ValueError(f'Unsupported faker generator: {generator}')
+    fake = _get_faker(seed_str)
+    return method(fake)
+
+
+def resolve_variable(var_def: dict, test_run_id: str, token_name: str) -> object:
+    """Resolve a single variable definition to a concrete value.
+
+    Raises ValueError for any malformed definition (callers catch only that).
+    """
+    strategy = var_def.get('strategy') if isinstance(var_def, dict) else None
+    if not isinstance(strategy, str) or strategy not in ALLOWED_STRATEGIES:
+        raise ValueError(f'Unsupported variable strategy: {strategy}')
+    if strategy == 'literal':
+        if 'value' not in var_def:
+            raise ValueError('literal.value is required')
+        return var_def['value']
+
+    if strategy == 'derived':
+        source = var_def.get('source')
+        if source != 'testRunId':
+            raise ValueError(f'derived.source must be "testRunId", got: {source}')
+        fmt = var_def.get('format')
+        if not fmt or not isinstance(fmt, str):
+            raise ValueError('derived.format must be a non-empty string')
+        return fmt.replace('{testRunId}', test_run_id)
+    # Only 'faker' remains once the strategy has been validated above.
+    generator = var_def.get('generator')
+    if not generator or not isinstance(generator, str):
+        raise ValueError('faker.generator must be a non-empty string')
+    if generator not in ALLOWED_FAKER_GENERATORS:
+        raise ValueError(f'Unsupported faker generator: {generator}')
+    return _faker_generate(generator, f'{test_run_id}:{token_name}')
+
+
+def _find_tokens(obj) -> set:
+    """Find all {{token}} placeholders in a JSON-like structure."""
+    tokens = set()
+    if isinstance(obj, str):
+        tokens.update(re.findall(r'\{\{(\w+)\}\}', obj))
+    elif isinstance(obj, list):
+        for item in obj:
+            tokens.update(_find_tokens(item))
+    elif isinstance(obj, dict):
+        for v in obj.values():
+            tokens.update(_find_tokens(v))
+    return tokens
+
+
+def _resolve_value(val, resolved_vars: dict):
+    """Deep-resolve a single value, replacing {{token}} patterns."""
+    if isinstance(val, str):
+        # Check for full-string replacement (entire string is one token)
+        m = re.fullmatch(r'\{\{(\w+)\}\}', val)
+        if m:
+            token = m.group(1)
+            if token not in resolved_vars:
+                raise ValueError(f'Unresolved token: {{{{{token}}}}}')
+            return resolved_vars[token]
+        # Embedded replacement
+        def _replace(match):
+            token = match.group(1)
+            if token not in resolved_vars:
+                raise ValueError(f'Unresolved token: {{{{{token}}}}}')
+            return str(resolved_vars[token])
+        result = re.sub(r'\{\{(\w+)\}\}', _replace, val)
+        return result
+    if isinstance(val, list):
+        return [_resolve_value(item, resolved_vars) for item in val]
+    if isinstance(val, dict):
+        return {k: _resolve_value(v, resolved_vars) for k, v in val.items()}
+    return val
+
+
+def resolve_recipe(recipe: dict, test_run_id: str) -> dict:
+    """Resolve a tokenized recipe create payload into a concrete payload.
+
+    Returns the resolved create dict. Raises ValueError on any resolution failure.
+    """
+    create = recipe.get('create', {})
+    # A JSON null "variables" means "no variables", same as omitting the key.
+    variables = recipe.get('variables') or {}
+    if not isinstance(variables, dict):
+        raise ValueError('variables must be an object')
+
+    # Validate: tokens in create and variable definitions must match one-to-one.
+    # Names are sorted in messages so the output is stable between runs.
+    tokens_in_create = _find_tokens(create)
+    var_keys = set(variables)
+    missing = tokens_in_create - var_keys
+    if missing:
+        raise ValueError(f'Tokens without variable definitions: {sorted(missing)}')
+    unused = var_keys - tokens_in_create
+    if unused:
+        raise ValueError(f'Unused variable definitions: {sorted(unused)}')
+
+    # Resolve all variables, then deep-resolve the create payload
+    resolved = {
+        name: resolve_variable(var_def, test_run_id, name)
+        for name, var_def in variables.items()
+    }
+    resolved_create = _resolve_value(create, resolved)
+
+    # Final check: no unresolved tokens remain
+    remaining = _find_tokens(resolved_create)
+    if remaining:
+        raise ValueError(f'Unresolved tokens after resolution: {sorted(remaining)}')
+    return resolved_create
+
+
+# ---------------------------------------------------------------------------
+# Signed HTTP helpers
+# ---------------------------------------------------------------------------
+
+def _sign(body_bytes: bytes, secret: str) -> str:
+    return hmac.new(secret.encode(), body_bytes, hashlib.sha256).hexdigest()
+
+
+def _post(url: str, payload: dict, secret: str) -> tuple:
+    """POST JSON to url with HMAC signature. Returns (status, response_dict, elapsed_ms).
+
+    Status is 0 on transport failure; non-object bodies become {'error': excerpt}.
+    """
+    body = json.dumps(payload).encode()
+    headers = {'Content-Type': 'application/json', 'x-signature': _sign(body, secret)}
+    req = urllib.request.Request(url, data=body, headers=headers, method='POST')
+    start = time.time()
+    try:
+        # Bounded wait: a hung endpoint must not stall the validation hook forever.
+        with urllib.request.urlopen(req, timeout=120) as resp:
+            status, raw = resp.status, resp.read()
+    except urllib.error.HTTPError as e:
+        # Non-2xx replies still carry a status code and usually a JSON body.
+        status, raw = e.code, e.read()
+    except OSError as e:
+        return 0, {'error': str(e)}, int((time.time() - start) * 1000)
+    elapsed = int((time.time() - start) * 1000)
+    try:
+        data = json.loads(raw)
+    except ValueError:
+        data = None
+    if not isinstance(data, dict):
+        data = {'error': raw.decode('utf-8', 'replace')[:500]}
+    return status, data, elapsed
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def generate_test_run_id(scenario_name: str) -> str:
+    ms = int(time.time() * 1000)
+    suffix = hashlib.sha256(f'{scenario_name}{ms}'.encode()).hexdigest()[:6]
+    return f'autonoma-preflight-{scenario_name}-{ms}-{suffix}'
+
+
+def preflight(recipe_path: str, endpoint: str, secret: str) -> bool:
+    """Run preflight for all recipes. Returns True on success.
+
+    Each recipe is resolved, sent as a signed ``up``, checked for the
+    ``auth``/``refs``/``refsToken`` response fields, then torn down with a
+    signed ``down``. A file with no recipes fails rather than passing vacuously.
+    """
+    with open(recipe_path) as f:
+        data = json.load(f)
+
+    recipes = data.get('recipes') if isinstance(data, dict) else None
+    if not isinstance(recipes, list) or not recipes:
+        print('FAIL: no recipes found in recipe file')
+        return False
+
+    results = []
+
+    for recipe in recipes:
+        name = recipe.get('name', '<unnamed>')
+        test_run_id = generate_test_run_id(name)
+
+        # Step 1: Resolve
+        print(f'\n--- Preflight: {name} ---')
+        print(f'  testRunId: {test_run_id}')
+        try:
+            resolved_create = resolve_recipe(recipe, test_run_id)
+        except ValueError as e:
+            print(f'  FAIL (recipe compilation): {e}')
+            results.append({'name': name, 'status': 'fail', 'phase': 'compilation', 'error': str(e)})
+            continue
+
+        # Step 2: Signed up
+        up_payload = {'action': 'up', 'create': resolved_create, 'testRunId': test_run_id}
+        status, resp, up_ms = _post(endpoint, up_payload, secret)
+        print(f'  up: HTTP {status} ({up_ms}ms)')
+        if status < 200 or status >= 300:
+            print(f'  FAIL (endpoint up): HTTP {status} — {json.dumps(resp)}')
+            results.append({'name': name, 'status': 'fail', 'phase': 'up', 'http': status})
+            continue
+
+        # Validate up response; a non-object body is treated as missing every field.
+        absent = [
+            field for field in ('auth', 'refs', 'refsToken')
+            if not isinstance(resp, dict) or field not in resp
+        ]
+        if absent:
+            print(f'  FAIL (endpoint up): missing field "{absent[0]}" in response')
+            results.append({'name': name, 'status': 'fail', 'phase': 'up', 'error': f'missing {absent[0]}'})
+            continue
+
+        # Step 3: Signed down
+        down_payload = {
+            'action': 'down',
+            'refs': resp['refs'],
+            'refsToken': resp['refsToken'],
+            'testRunId': test_run_id,
+        }
+        d_status, d_resp, down_ms = _post(endpoint, down_payload, secret)
+        print(f'  down: HTTP {d_status} ({down_ms}ms)')
+        if d_status < 200 or d_status >= 300:
+            print(f'  FAIL (endpoint down): HTTP {d_status} — {json.dumps(d_resp)}')
+            results.append({'name': name, 'status': 'fail', 'phase': 'down', 'http': d_status})
+            continue
+
+        print(f'  OK (up: {up_ms}ms, down: {down_ms}ms)')
+        results.append({'name': name, 'status': 'ok', 'up_ms': up_ms, 'down_ms': down_ms})
+
+    print('\n--- Summary ---')
+    for r in results:
+        status_str = 'OK' if r['status'] == 'ok' else f"FAIL ({r.get('phase', '?')})"
+        print(f"  {r['name']}: {status_str}")
+
+    return all(r['status'] == 'ok' for r in results)
+
+
+def main():
+    if len(sys.argv) < 2:
+        print(f'Usage: {sys.argv[0]} <scenario-recipes.json>')
+        sys.exit(1)
+
+    recipe_path = sys.argv[1]
+
+    # Ensure Faker is available
+    try:
+        import faker  # noqa: F401
+    except ImportError:
+        import subprocess
+        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'Faker', '-q'],
+                              stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+    endpoint = os.environ.get('AUTONOMA_SDK_ENDPOINT')
+    secret = os.environ.get('AUTONOMA_SHARED_SECRET')
+
+    if not endpoint:
+        print('ERROR: AUTONOMA_SDK_ENDPOINT is not set')
+        sys.exit(1)
+    if not secret:
+        print('ERROR: AUTONOMA_SHARED_SECRET is not set')
+        sys.exit(1)
+
+    ok = preflight(recipe_path, endpoint, secret)
+    sys.exit(0 if ok else 1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh
index c64d763..dd7e3ec 100755
--- a/hooks/validate-pipeline-output.sh
+++ b/hooks/validate-pipeline-output.sh
@@ -42,6 +42,10 @@ case "$FILE_PATH" in
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenarios.py"
     VALIDATOR_NAME="validate-scenarios"
     ;;
+  */autonoma/scenario-recipes.json)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_recipes.py"
+    VALIDATOR_NAME="validate-scenario-recipes"
+    ;;
   */autonoma/qa-tests/INDEX.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
     VALIDATOR_NAME="validate-test-index"
@@ -82,6 +86,24 @@ if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then
   exit 2
 fi
 
+# scenario-recipes.json must also pass live endpoint preflight. This is the
+# only deterministic check that the generated create payload actually works
+# against the current SDK contract.
+if [ "$VALIDATOR_NAME" = "validate-scenario-recipes" ]; then
+  PREFLIGHT_SCRIPT="$SCRIPT_DIR/preflight_scenario_recipes.py"
+  if [ ! -f "$PREFLIGHT_SCRIPT" ]; then
+    echo "VALIDATION FAILED [scenario-recipes-preflight]: Script not found: $PREFLIGHT_SCRIPT" >&2
+    exit 2
+  fi
+
+  PREFLIGHT_RESULT=$(python3 "$PREFLIGHT_SCRIPT" "$FILE_PATH" 2>&1)
+  PREFLIGHT_EXIT=$?
+  if [ $PREFLIGHT_EXIT -ne 0 ]; then
+    echo "VALIDATION FAILED [scenario-recipes-preflight]: $PREFLIGHT_RESULT" >&2
+    exit 2
+  fi
+fi
+
 # For INDEX.md, also validate directory structure
 if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then
   DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py"
diff --git a/hooks/validators/validate_scenario_recipes.py b/hooks/validators/validate_scenario_recipes.py
new file mode 100644
index 0000000..14f7f50
--- /dev/null
+++ b/hooks/validators/validate_scenario_recipes.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+"""Validates autonoma/scenario-recipes.json schema."""
+import json
+import re
+import sys
+from pathlib import Path
+
+
+TYPE_PATTERN = re.compile(r"^(?:[A-Za-z][A-Za-z0-9_]*|enum\([^()]+\))(?:\[\])?$")
+TOKEN_OR_REF_PATTERN = re.compile(r"^(?:\{\{\w+\}\}|_ref:.+)$")
+
+
+def _parse_type(type_name):
+    if not isinstance(type_name, str):
+        return None
+
+    is_list = type_name.endswith('[]')
+    base = type_name[:-2] if is_list else type_name
+    if not TYPE_PATTERN.match(type_name):
+        return None
+
+    if base.startswith('enum(') and base.endswith(')'):
+        values = [value.strip() for value in base[5:-1].split(',') if value.strip()]
+        return {'kind': 'enum', 'values': values, 'is_list': is_list}
+
+    return {'kind': 'scalar', 'name': base, 'is_list': is_list}
+
+
+def _resolve_source_path(filepath, source_path):
+    recipe_dir = Path(filepath).resolve().parent
+    raw_path = Path(source_path)
+
+    if raw_path.is_absolute():
+        return raw_path
+
+    for base_dir in (recipe_dir, *recipe_dir.parents):
+        candidate = (base_dir / source_path).resolve()
+        if candidate.is_file():
+            return candidate
+
+    return (recipe_dir / source_path).resolve()
+
+
+def _load_discover_schema(filepath, source):
+    """Return (model_map, error) for source.discoverPath; (None, None) if no usable path is set."""
+    if not isinstance(source, dict):
+        return None, None
+
+    discover_path = source.get('discoverPath')
+    if not isinstance(discover_path, str) or len(discover_path.strip()) == 0:
+        return None, None
+
+    resolved_path = _resolve_source_path(filepath, discover_path)
+    if not resolved_path.is_file():
+        return None, f'source.discoverPath does not exist: {discover_path}'
+
+    try:
+        with open(resolved_path) as fh:
+            payload = json.load(fh)
+    except Exception as exc:
+        return None, f'source.discoverPath is not valid JSON: {exc}'
+
+    # A JSON root that is not an object (list, null, ...) has no "schema" at all.
+    schema = payload.get('schema') if isinstance(payload, dict) else None
+    if not isinstance(schema, dict):
+        return None, 'source.discoverPath must point to a discover file with a "schema" object'
+    models = schema.get('models')
+    if not isinstance(models, list):
+        return None, 'source.discoverPath schema.models must be a list'
+
+    model_map = {}
+    for model in models:
+        if not isinstance(model, dict):
+            continue
+        name = model.get('name')
+        fields = model.get('fields')
+        if not isinstance(name, str) or not isinstance(fields, list):
+            continue
+        # Keep only fields that declare both a string name and a string type.
+        model_map[name] = {
+            field['name']: field
+            for field in fields
+            if isinstance(field, dict)
+            and isinstance(field.get('name'), str)
+            and isinstance(field.get('type'), str)
+        }
+
+    return model_map, None
+
+
+def _validate_value_against_field(value, field, path):
+    parsed_type = _parse_type(field.get('type'))
+    if parsed_type is None:
+        return f'{path} has unsupported discover type: {field.get("type")}'
+
+    if isinstance(value, str) and TOKEN_OR_REF_PATTERN.match(value):
+        return None
+
+    if parsed_type['is_list']:
+        if not isinstance(value, list):
+            return f'{path} must be a list because discover type is {field.get("type")}'
+        return None
+
+    if isinstance(value, list):
+        return f'{path} must not be a list because discover type is {field.get("type")}'
+
+    if parsed_type['kind'] == 'enum' and isinstance(value, str):
+        if value not in parsed_type['values']:
+            return (
+                f'{path} has invalid enum value "{value}". '
+                f'Expected one of {parsed_type["values"]}'
+            )
+
+    return None
+
+
+def _validate_create_against_discover(create, model_map, recipe_index):
+    """Check one recipe's `create` payload against the discover model map.
+
+    Returns the first error message found, or None when every check passes.
+    A `model_map` of None disables the checks and always yields None.
+    """
+    if model_map is None:
+        return None
+    for model_name, entities in create.items():
+        model_path = f'recipes[{recipe_index}].create.{model_name}'
+        if model_name not in model_map:
+            return f'{model_path} is not present in discover schema'
+        if not isinstance(entities, list):
+            return f'{model_path} must be an array'
+        known_fields = model_map[model_name]
+        for entity_index, entity in enumerate(entities):
+            entity_path = f'{model_path}[{entity_index}]'
+            if not isinstance(entity, dict):
+                return f'{entity_path} must be an object'
+            for field_name, value in entity.items():
+                # Underscore-prefixed keys are skipped without a schema lookup.
+                if field_name.startswith('_'):
+                    continue
+                field_path = f'{entity_path}.{field_name}'
+                if field_name not in known_fields:
+                    return f'{field_path} is not present in discover schema'
+                problem = _validate_value_against_field(value, known_fields[field_name], field_path)
+                if problem is not None:
+                    return problem
+
+    return None
+
+filepath = sys.argv[1]  # JSON file to validate, given as the first CLI argument
+try:
+    with open(filepath, encoding='utf-8') as recipe_file:
+        data = json.load(recipe_file)
+except Exception as e:
+    print(f'Invalid JSON: {e}')
+    sys.exit(1)
+
+if not isinstance(data, dict):
+    print('Root must be a JSON object')
+    sys.exit(1)
+
+required = ['version', 'source', 'validationMode', 'recipes']
+missing = [f for f in required if f not in data]
+if missing:
+    print(f'Missing required fields: {missing}')
+    sys.exit(1)
+
+version = data.get('version')
+if isinstance(version, bool) or version != 1:  # JSON true == 1 in Python; reject it
+    print('version must be exactly 1')
+    sys.exit(1)
+
+source = data.get('source')
+if not isinstance(source, dict):
+    print('source must be an object')
+    sys.exit(1)
+
+for field in ['discoverPath', 'scenariosPath']:
+    value = source.get(field)
+    if not isinstance(value, str) or len(value.strip()) == 0:
+        print(f'source.{field} must be a non-empty string')
+        sys.exit(1)
+
+discover_model_map, discover_error = _load_discover_schema(filepath, source)
+if discover_error is not None:
+    print(discover_error)
+    sys.exit(1)
+
+validation_mode = data.get('validationMode')
+valid_modes = {'sdk-check', 'endpoint-lifecycle'}
+if not isinstance(validation_mode, str) or validation_mode not in valid_modes:  # non-str may be unhashable
+    print(f'validationMode must be one of {valid_modes}, got: {validation_mode}')
+    sys.exit(1)
+
+recipes = data.get('recipes')
+if not isinstance(recipes, list) or len(recipes) < 3:
+    print('recipes must be an array with at least 3 entries')
+    sys.exit(1)
+
+required_names = {'standard', 'empty', 'large'}
+found_names = set()
+
+for i, recipe in enumerate(recipes):
+    if not isinstance(recipe, dict):
+        print(f'recipes[{i}] must be an object')
+        sys.exit(1)
+
+    for field in ['name', 'description', 'create', 'validation']:
+        if field not in recipe:
+            print(f'recipes[{i}] missing required field: {field}')
+            sys.exit(1)
+
+    name = recipe.get('name')
+    if not isinstance(name, str) or len(name.strip()) == 0:
+        print(f'recipes[{i}].name must be a non-empty string')
+        sys.exit(1)
+    found_names.add(name)  # compared against required_names once the loop ends
+
+    description = recipe.get('description')
+    if not isinstance(description, str) or len(description.strip()) == 0:
+        print(f'recipes[{i}].description must be a non-empty string')
+        sys.exit(1)
+
+    create = recipe.get('create')
+    if not isinstance(create, dict) or len(create) == 0:
+        print(f'recipes[{i}].create must be a non-empty object')
+        sys.exit(1)
+    create_error = _validate_create_against_discover(create, discover_model_map, i)
+    if create_error is not None:
+        print(create_error)
+        sys.exit(1)
+
+    validation = recipe.get('validation')
+    if not isinstance(validation, dict):
+        print(f'recipes[{i}].validation must be an object')
+        sys.exit(1)
+
+    for field in ['status', 'method', 'phase']:
+        if field not in validation:
+            print(f'recipes[{i}].validation missing required field: {field}')
+            sys.exit(1)
+
+    if validation.get('status') != 'validated':
+        print(f'recipes[{i}].validation.status must be exactly "validated"')
+        sys.exit(1)
+
+    if validation.get('phase') != 'ok':
+        print(f'recipes[{i}].validation.phase must be exactly "ok"')
+        sys.exit(1)
+
+    method = validation.get('method')
+    valid_methods = {'checkScenario', 'checkAllScenarios', 'endpoint-up-down'}
+    if not isinstance(method, str) or method not in valid_methods:  # non-str may be unhashable
+        print(f'recipes[{i}].validation.method must be one of {valid_methods}, got: {method}')
+        sys.exit(1)
+
+    for field in ['up_ms', 'down_ms']:
+        if field in validation:
+            value = validation.get(field)
+            if isinstance(value, bool) or not isinstance(value, int) or value < 0:  # bool is an int subclass
+                print(f'recipes[{i}].validation.{field} must be a non-negative integer')
+                sys.exit(1)
+
+    # --- variables validation (optional) ---
+    variables = recipe.get('variables')
+    if variables is not None:
+        if not isinstance(variables, dict):
+            print(f'recipes[{i}].variables must be an object')
+            sys.exit(1)
+
+        # Find all tokens used in create
+        def _find_tokens(obj):
+            tokens = set()
+            if isinstance(obj, str):
+                tokens.update(re.findall(r'\{\{(\w+)\}\}', obj))
+            elif isinstance(obj, list):
+                for item in obj:
+                    tokens.update(_find_tokens(item))
+            elif isinstance(obj, dict):
+                for v in obj.values():
+                    tokens.update(_find_tokens(v))
+            return tokens
+
+        tokens_in_create = _find_tokens(create)
+        var_keys = set(variables.keys())
+
+        missing_vars = tokens_in_create - var_keys
+        if missing_vars:
+            print(f'recipes[{i}]: tokens without variable definitions: {sorted(missing_vars)}')
+            sys.exit(1)
+
+        unused_vars = var_keys - tokens_in_create
+        if unused_vars:
+            print(f'recipes[{i}]: unused variable definitions: {sorted(unused_vars)}')
+            sys.exit(1)
+
+        allowed_strategies = {'literal', 'derived', 'faker'}
+        for var_name, var_def in variables.items():
+            if not isinstance(var_def, dict):
+                print(f'recipes[{i}].variables.{var_name} must be an object')
+                sys.exit(1)
+            strategy = var_def.get('strategy')
+            if not isinstance(strategy, str) or strategy not in allowed_strategies:  # non-str may be unhashable
+                print(f'recipes[{i}].variables.{var_name}.strategy must be one of {allowed_strategies}, got: {strategy}')
+                sys.exit(1)
+            if strategy == 'literal':
+                if 'value' not in var_def:
+                    print(f'recipes[{i}].variables.{var_name}: literal must have "value"')
+                    sys.exit(1)
+                val = var_def['value']
+                if not isinstance(val, (str, int, float, bool)) and val is not None:
+                    print(f'recipes[{i}].variables.{var_name}: literal.value must be a scalar')
+                    sys.exit(1)
+            elif strategy == 'derived':
+                if var_def.get('source') != 'testRunId':
+                    print(f'recipes[{i}].variables.{var_name}: derived.source must be "testRunId"')
+                    sys.exit(1)
+                fmt = var_def.get('format')
+                if not isinstance(fmt, str) or len(fmt.strip()) == 0:
+                    print(f'recipes[{i}].variables.{var_name}: derived.format must be a non-empty string')
+                    sys.exit(1)
+            elif strategy == 'faker':
+                gen = var_def.get('generator')
+                if not isinstance(gen, str) or len(gen.strip()) == 0:
+                    print(f'recipes[{i}].variables.{var_name}: faker.generator must be a non-empty string')
+                    sys.exit(1)
+
+missing_names = required_names - found_names  # required recipe names that never appeared
+if missing_names:
+    print(f'Missing required recipes: {missing_names}')
+    sys.exit(1)
+
+print('OK')
diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md
index 509f3a5..15edf76 100644
--- a/skills/generate-tests/SKILL.md
+++ b/skills/generate-tests/SKILL.md
@@ -13,50 +13,69 @@ You are orchestrating a 4-step test generation pipeline. Each step runs as an is
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## CRITICAL: User Confirmation Between Steps
+## User Confirmation Between Steps
 
-After each step (1, 2, and 3), you MUST present the summary and then ask the user for
+By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
 confirmation using the `AskUserQuestion` tool. This creates an interactive
 UI prompt that makes it clear the user needs to respond before the pipeline continues.
 
 After calling `AskUserQuestion`, wait for the user's response.
 Only proceed to the next step after they confirm.
 
+**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
+skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
+the summary. The summaries are still displayed — only the confirmation prompt is skipped.
+
 ## Before Starting
 
-Create the output directory:
+Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
 ```bash
+AUTONOMA_ROOT="$(pwd)"
+echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
 mkdir -p autonoma/skills autonoma/qa-tests
 ```
 
+The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+```bash
+PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
+```
+
 Read the environment variables. These are required for reporting progress back to Autonoma:
 - `AUTONOMA_API_KEY` — your Autonoma API key
 - `AUTONOMA_PROJECT_ID` — your Autonoma project ID
 - `AUTONOMA_API_URL` — Autonoma API base URL
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
+
+Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
 
 Create the generation record so the dashboard can track progress in real time:
 ```bash
-RESPONSE=$(curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations" \
+RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\"}" 2>/dev/null || echo '{}')
-GENERATION_ID=$(echo "$RESPONSE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
+  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
+GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
 mkdir -p autonoma
 echo "$GENERATION_ID" > autonoma/.generation-id
 echo "Generation ID: $GENERATION_ID"
 ```
 
-If `GENERATION_ID` is empty, continue anyway — reporting is best-effort and must never block test generation.
+If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
 
 ## Step 1: Generate Knowledge Base
 
 Report step start:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' 2>/dev/null || true
+  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true
 ```
 
 Spawn the `kb-generator` subagent with the following task:
@@ -75,79 +94,128 @@ Spawn the `kb-generator` subagent with the following task:
 
 Report step complete and upload skills:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' 2>/dev/null || true
+  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true
 
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
+root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
 skills = []
-d = 'autonoma/skills'
+d = os.path.join(root, 'autonoma/skills')
 if os.path.isdir(d):
     for f in os.listdir(d):
         if f.endswith('.md'):
             with open(os.path.join(d, f)) as fh:
                 skills.append({'name': f, 'content': fh.read()})
 print(json.dumps({'skills': skills}))
-" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/artifacts" \
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d @- 2>/dev/null || true
+  -d @- || true
 ```
 
-4. Call `AskUserQuestion` with:
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
    - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
 
 ## Step 2: Generate Scenarios
 
 Report step start:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' 2>/dev/null || true
+  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true
 ```
 
+Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
+Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
+State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+
+Fetch and validate the artifact:
+```bash
+mkdir -p autonoma
+BODY='{"action":"discover"}'  # exact payload that is both signed and sent
+SIG=$(printf '%s' "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
+  -H "Content-Type: application/json" \
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+HTTP_STATUS=$(printf '%s\n' "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+DISCOVER_BODY=$(printf '%s\n' "$RESPONSE" | sed '/HTTP_STATUS:/d')
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
+  exit 1
+fi
+printf '%s\n' "$DISCOVER_BODY" > autonoma/discover.json
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" autonoma/discover.json
+```
+
+If the fetch fails or validation fails, stop the pipeline at Step 2.
+Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
+
 Spawn the `scenario-generator` subagent with the following task:
 
-> Read the knowledge base from `autonoma/AUTONOMA.md` and `autonoma/skills/`.
+> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
+> artifact from `autonoma/discover.json`.
 > Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
-> The file MUST have YAML frontmatter with scenario_count, scenarios summary, and entity_types.
+> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
+> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
+> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
+> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
+> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
 
 **After the subagent completes:**
-1. Verify `autonoma/scenarios.md` exists and is non-empty
-2. The PostToolUse hook will have validated the frontmatter format automatically
-3. Read the file and present the frontmatter summary to the user — scenario names, entity counts, entity types
+1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
+2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
+3. The PostToolUse hook will have validated the frontmatter format automatically
+4. Read the file and present the frontmatter summary to the user — scenario names, entity counts,
+   entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' 2>/dev/null || true
+  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
 ```
 
-4. Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? The standard scenario data becomes hard assertions in your tests."
+5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
    - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
 
 ## Step 3: Generate E2E Test Cases
 
 Report step start:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' 2>/dev/null || true
+  -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' || true
 ```
 
 Spawn the `test-case-generator` subagent with the following task:
@@ -158,6 +226,9 @@ Spawn the `test-case-generator` subagent with the following task:
 > You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests,
 > total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
+> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
+> scenario data when it is needed to test a real user-facing app behavior.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
@@ -167,20 +238,24 @@ Spawn the `test-case-generator` subagent with the following task:
 
 Report step complete and upload test cases:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' 2>/dev/null || true
+  -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' || true
 
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
+proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
+qa_dir = os.path.join(proj_root, 'autonoma/qa-tests')
 test_cases = []
-for root, dirs, files in os.walk('autonoma/qa-tests'):
+for root, dirs, files in os.walk(qa_dir):
     for f in files:
         if f.endswith('.md') and f != 'INDEX.md':
             path = os.path.join(root, f)
-            folder = os.path.relpath(root, 'autonoma/qa-tests')
+            folder = os.path.relpath(root, qa_dir)
             with open(path) as fh:
                 content = fh.read()
             entry = {'name': f, 'content': content}
@@ -188,49 +263,104 @@ for root, dirs, files in os.walk('autonoma/qa-tests'):
                 entry['folder'] = folder
             test_cases.append(entry)
 print(json.dumps({'testCases': test_cases}))
-" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/artifacts" \
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d @- 2>/dev/null || true
+  -d @- || true
 ```
 
-4. Call `AskUserQuestion` with:
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app."
    - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
-5. Wait for the user's response before proceeding.
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
 
-## Step 4: Implement Environment Factory
+## Step 4: Environment Factory
 
 Report step start:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' 2>/dev/null || true
+  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true
 ```
 
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
+preflight validation. State plainly that both environment variables are mandatory.
+
 Spawn the `env-factory-generator` subagent with the following task:
 
-> Read the scenarios from `autonoma/scenarios.md` and implement the Autonoma Environment Factory
-> endpoint in the project's backend. The endpoint handles discover/up/down actions.
+> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
+> Implement or complete the Autonoma Environment Factory in the project's backend so it can
+> support the planned scenarios with the current SDK contract, then validate the planned scenarios
+> against that implementation.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
 > and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
-> After implementing, run integration tests to verify the endpoint works.
-> Use AUTONOMA_SIGNING_SECRET and AUTONOMA_JWT_SECRET as environment variable names.
+> Preserve the existing discover integration if it already works, and finish `up` / `down`
+> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
+> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
+> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
+> The recipe file must match the current setup API schema:
+> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
+> `name`, `description`, `create`, and `validation` with `status: "validated"`,
+> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
+> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
+> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
+> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
+> Persisted `create` must remain tokenized — never store resolved concrete values.
+> After writing the recipe file, run the preflight helper to validate all recipes against the
+> live SDK endpoint before uploading:
+> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
+> The preflight must pass for all three scenarios before Step 4 is considered complete.
 
 **After the subagent completes:**
-1. Verify the endpoint was created and tests pass
-2. Present the results to the user — what was implemented, where, test results
-3. Report any issues that need manual attention
+1. Verify the backend implementation or integration changes were made
+2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+3. Run the preflight helper if the subagent did not already do so:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+```
+If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
+4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
+5. Report which environment variables the backend now requires
+6. Report any backend issues that still need manual attention
 
 Report step complete:
 ```bash
-GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
-[ -n "$GENERATION_ID" ] && curl -sf -X POST "${AUTONOMA_API_URL}/v1/generation/generations/${GENERATION_ID}/events" \
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+if [ -n "$GENERATION_ID" ]; then
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+  if python3 -c "import json, sys; json.load(open(sys.argv[1]))" "$RECIPE_PATH" 2>/dev/null; then
+    UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d @"$RECIPE_PATH")
+    UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+    UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+    echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  else
+    echo "WARNING: scenario-recipes.json is not valid JSON, skipping upload"
+  fi
+fi
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' 2>/dev/null || true
+  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true
 ```
 
 ## Completion
@@ -239,4 +369,4 @@ After all steps complete, summarize:
 - **Step 1**: Knowledge base location and core flow count
 - **Step 2**: Scenario count and entity types covered
 - **Step 3**: Total test count, folder breakdown, coverage correlation
-- **Step 4**: Endpoint location, test results, env var setup instructions
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results

From ac9ebf56e5ccbd998fa0a4d9faa1c99220f55508 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Tue, 7 Apr 2026 19:24:45 -0300
Subject: [PATCH 03/33] test: scenario recipe and validation test suite

Add comprehensive tests for preflight variable resolution, discover
schema validation, pipeline output routing, scenario recipe validation,
and expanded scenarios validator coverage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_preflight_scenario_recipes.py | 152 ++++++++++
 tests/test_validate_discover.py          |  98 ++++++
 tests/test_validate_pipeline_output.py   | 184 ++++++++++++
 tests/test_validate_scenario_recipes.py  | 362 +++++++++++++++++++++++
 tests/test_validate_scenarios.py         | 123 +++++++-
 5 files changed, 914 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_preflight_scenario_recipes.py
 create mode 100644 tests/test_validate_discover.py
 create mode 100644 tests/test_validate_pipeline_output.py
 create mode 100644 tests/test_validate_scenario_recipes.py

diff --git a/tests/test_preflight_scenario_recipes.py b/tests/test_preflight_scenario_recipes.py
new file mode 100644
index 0000000..6bb1b44
--- /dev/null
+++ b/tests/test_preflight_scenario_recipes.py
@@ -0,0 +1,152 @@
+"""Tests for hooks/preflight_scenario_recipes.py resolver logic."""
+import sys
+import os
+
+# Add hooks dir to path so we can import the module
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'hooks'))
+
+from preflight_scenario_recipes import (
+    resolve_variable,
+    resolve_recipe,
+    _find_tokens,
+    _faker_generate,
+)
+import pytest
+
+
+# --- resolve_variable tests ---
+
+def test_literal_string():
+    v = resolve_variable({'strategy': 'literal', 'value': 'hello'}, 'run1', 'tok')
+    assert v == 'hello'
+
+
+def test_literal_number():
+    v = resolve_variable({'strategy': 'literal', 'value': 42}, 'run1', 'tok')
+    assert v == 42
+
+
+def test_literal_null():
+    v = resolve_variable({'strategy': 'literal', 'value': None}, 'run1', 'tok')
+    assert v is None
+
+
+def test_derived():
+    v = resolve_variable(
+        {'strategy': 'derived', 'source': 'testRunId', 'format': 'user+{testRunId}@example.com'},
+        'abc-123', 'tok',
+    )
+    assert v == 'user+abc-123@example.com'
+
+
+def test_faker_deterministic():
+    """Same testRunId + token name → same value."""
+    v1 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run1', 'first')
+    v2 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run1', 'first')
+    assert v1 == v2
+    assert isinstance(v1, str) and len(v1) > 0
+
+
+def test_faker_different_run_id():
+    """Different testRunIds both resolve to strings; distinctness is not asserted."""
+    v1 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run-a', 'first')
+    v2 = resolve_variable({'strategy': 'faker', 'generator': 'person.firstName'}, 'run-b', 'first')
+    # Distinct values are likely but not guaranteed, so inequality is not
+    # asserted; only the type of each result is checked.
+    assert isinstance(v1, str)
+    assert isinstance(v2, str)
+
+
+def test_faker_email():
+    v = resolve_variable({'strategy': 'faker', 'generator': 'internet.email'}, 'run1', 'email')
+    assert '@' in v
+
+
+def test_faker_company():
+    v = resolve_variable({'strategy': 'faker', 'generator': 'company.name'}, 'run1', 'co')
+    assert isinstance(v, str) and len(v) > 0
+
+
+def test_faker_lorem():
+    v = resolve_variable({'strategy': 'faker', 'generator': 'lorem.words'}, 'run1', 'w')
+    assert ' ' in v  # multiple words
+
+
+def test_unsupported_faker_generator():
+    with pytest.raises(ValueError, match='Unsupported faker generator'):
+        resolve_variable({'strategy': 'faker', 'generator': 'address.city'}, 'run1', 'tok')
+
+
+def test_unsupported_strategy():
+    with pytest.raises(ValueError, match='Unsupported variable strategy'):
+        resolve_variable({'strategy': 'random'}, 'run1', 'tok')
+
+
+# --- resolve_recipe tests ---
+
+def test_resolve_full_recipe():
+    recipe = {
+        'create': {
+            'User': [{'email': '{{owner_email}}', 'name': '{{first_name}}'}],
+        },
+        'variables': {
+            'owner_email': {'strategy': 'derived', 'source': 'testRunId', 'format': 'owner+{testRunId}@example.com'},
+            'first_name': {'strategy': 'faker', 'generator': 'person.firstName'},
+        },
+    }
+    result = resolve_recipe(recipe, 'test-run-1')
+    assert result['User'][0]['email'] == 'owner+test-run-1@example.com'
+    assert isinstance(result['User'][0]['name'], str)
+
+
+def test_embedded_string_replacement():
+    recipe = {
+        'create': {
+            'Org': [{'name': 'Org-{{suffix}}'}],
+        },
+        'variables': {
+            'suffix': {'strategy': 'literal', 'value': 'acme'},
+        },
+    }
+    result = resolve_recipe(recipe, 'run1')
+    assert result['Org'][0]['name'] == 'Org-acme'
+
+
+def test_missing_variable_fails():
+    recipe = {
+        'create': {'User': [{'email': '{{missing}}'}]},
+        'variables': {},
+    }
+    with pytest.raises(ValueError, match='Tokens without variable definitions'):
+        resolve_recipe(recipe, 'run1')
+
+
+def test_unused_variable_fails():
+    recipe = {
+        'create': {'User': [{'email': 'static@example.com'}]},
+        'variables': {
+            'extra': {'strategy': 'literal', 'value': 'unused'},
+        },
+    }
+    with pytest.raises(ValueError, match='Unused variable definitions'):
+        resolve_recipe(recipe, 'run1')
+
+
+def test_concrete_recipe_no_variables():
+    """Recipe with no tokens and no variables should resolve fine."""
+    recipe = {
+        'create': {'Org': [{'name': 'Acme'}]},
+    }
+    result = resolve_recipe(recipe, 'run1')
+    assert result == {'Org': [{'name': 'Acme'}]}
+
+
+# --- _find_tokens tests ---
+
+def test_find_tokens_nested():
+    obj = {'a': [{'b': '{{x}} and {{y}}'}], 'c': '{{z}}'}
+    assert _find_tokens(obj) == {'x', 'y', 'z'}
+
+
+def test_find_tokens_no_tokens():
+    assert _find_tokens({'a': 'hello'}) == set()
diff --git a/tests/test_validate_discover.py b/tests/test_validate_discover.py
new file mode 100644
index 0000000..0a05909
--- /dev/null
+++ b/tests/test_validate_discover.py
@@ -0,0 +1,98 @@
+"""Tests for validate_discover.py."""
+from conftest import run_validator
+
+SCRIPT = 'validate_discover.py'
+
+VALID = """\
+{
+  "schema": {
+    "models": [
+      {
+        "name": "Organization",
+        "fields": [
+          {
+            "name": "id",
+            "type": "String",
+            "isRequired": true,
+            "isId": true,
+            "hasDefault": true
+          }
+        ]
+      }
+    ],
+    "edges": [
+      {
+        "from": "User",
+        "to": "Organization",
+        "localField": "organizationId",
+        "foreignField": "id",
+        "nullable": false
+      }
+    ],
+    "relations": [
+      {
+        "parentModel": "Organization",
+        "childModel": "User",
+        "parentField": "users",
+        "childField": "organizationId"
+      }
+    ],
+    "scopeField": "organizationId"
+  }
+}
+"""
+
+
+def test_valid_discover():
+    code, out = run_validator(SCRIPT, VALID, filename='discover.json')
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_invalid_json():
+    code, out = run_validator(SCRIPT, '{not-json', filename='discover.json')
+    assert code == 1
+    assert 'Invalid JSON' in out
+
+
+def test_missing_schema():
+    code, out = run_validator(SCRIPT, '{}', filename='discover.json')
+    assert code == 1
+    assert 'must contain a "schema" object' in out
+
+
+def test_missing_scope_field():  # a schema without scopeField must be rejected
+    content = VALID.replace('    "scopeField": "organizationId"\n', '')
+    content = content.replace('    ],\n  }\n}\n', '    ]\n  }\n}\n')  # drop the now-dangling comma so the JSON still parses
+    code, out = run_validator(SCRIPT, content, filename='discover.json')
+    assert code == 1
+    assert 'schema is missing required fields' in out
+
+
+def test_model_requires_fields():
+    content = VALID.replace('"fields": [', '"oops": [')
+    code, out = run_validator(SCRIPT, content, filename='discover.json')
+    assert code == 1
+    assert 'fields must be a list' in out
+
+
+def test_accepts_enum_and_list_type_formats():
+    content = VALID.replace(  # retype the fixture's only field (`id`) as enum(slack)
+        '"type": "String"',
+        '"type": "enum(slack)"',
+        1,
+    ).replace(  # splice a second field, `teamSlugs` typed String[], after the first one
+        '"hasDefault": true',
+        '"hasDefault": true\n          },\n          {\n            "name": "teamSlugs",\n            "type": "String[]",\n            "isRequired": true,\n            "isId": false,\n            "hasDefault": true',
+        1,
+    )
+    code, out = run_validator(SCRIPT, content, filename='discover.json')
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_rejects_unsupported_type_format():
+    content = VALID.replace('"type": "String"', '"type": "enum(slack"', 1)
+    code, out = run_validator(SCRIPT, content, filename='discover.json')
+    assert code == 1
+    assert 'must use a supported type format' in out
diff --git a/tests/test_validate_pipeline_output.py b/tests/test_validate_pipeline_output.py
new file mode 100644
index 0000000..3077ba2
--- /dev/null
+++ b/tests/test_validate_pipeline_output.py
@@ -0,0 +1,184 @@
+"""Tests for hooks/validate-pipeline-output.sh."""
+import json
+import os
+import subprocess
+import tempfile
+import threading
+from contextlib import contextmanager
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPT = ROOT / 'hooks' / 'validate-pipeline-output.sh'
+
+VALID_DISCOVER = {
+    'schema': {
+        'models': [
+            {
+                'name': 'Organization',
+                'fields': [
+                    {
+                        'name': 'name',
+                        'type': 'String',
+                        'isRequired': True,
+                        'isId': False,
+                        'hasDefault': False,
+                    },
+                ],
+            },
+        ],
+        'edges': [],
+        'relations': [],
+        'scopeField': 'organizationId',
+    },
+}
+
+VALID_RECIPES = {
+    'version': 1,
+    'source': {
+        'discoverPath': 'autonoma/discover.json',
+        'scenariosPath': 'autonoma/scenarios.md',
+    },
+    'validationMode': 'sdk-check',
+    'recipes': [
+        {
+            'name': 'standard',
+            'description': 'Standard baseline',
+            'create': {'Organization': [{'name': 'Acme Standard'}]},
+            'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+        },
+        {
+            'name': 'empty',
+            'description': 'Empty workspace',
+            'create': {'Organization': [{'name': 'Acme Empty'}]},
+            'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+        },
+        {
+            'name': 'large',
+            'description': 'Large workspace',
+            'create': {'Organization': [{'name': 'Acme Large'}]},
+            'validation': {'status': 'validated', 'method': 'endpoint-up-down', 'phase': 'ok'},
+        },
+    ],
+}
+
+
+def _run_hook(files: dict[str, str], target: str, env: dict[str, str]) -> tuple[int, str, str]:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        for relpath, content in files.items():
+            fullpath = Path(tmpdir) / relpath
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            fullpath.write_text(content)
+        # Fixture files are on disk now; the hook gets a JSON payload on stdin naming `target`.
+        target_path = str(Path(tmpdir) / target)
+        payload = json.dumps({'tool_input': {'file_path': target_path}})
+        result = subprocess.run(
+            ['bash', str(SCRIPT)],
+            input=payload,
+            text=True,
+            capture_output=True,
+            env=env,
+        )
+        return result.returncode, result.stdout.strip(), result.stderr.strip()  # (exit code, stdout, stderr)
+
+
+@contextmanager
+def _sdk_server(up_status: int = 200, down_status: int = 200):  # yields the stub server's base URL
+    class Handler(BaseHTTPRequestHandler):
+        def do_POST(self):
+            length = int(self.headers.get('Content-Length', '0'))
+            body = json.loads(self.rfile.read(length) or '{}')
+            action = body.get('action')
+            # Reply with the configured status; any status >= 400 swaps in an error body.
+            if action == 'up':
+                status = up_status
+                response = {'auth': {}, 'refs': {'organization': ['org_1']}, 'refsToken': 'token_1'}
+                if status >= 400:
+                    response = {'error': 'up failed'}
+            elif action == 'down':
+                status = down_status
+                response = {'ok': True}
+                if status >= 400:
+                    response = {'error': 'down failed'}
+            else:
+                status = 400
+                response = {'error': 'unknown action'}
+            # Serialize once so Content-Length matches the bytes actually written.
+            encoded = json.dumps(response).encode()
+            self.send_response(status)
+            self.send_header('Content-Type', 'application/json')
+            self.send_header('Content-Length', str(len(encoded)))
+            self.end_headers()
+            self.wfile.write(encoded)
+
+        def log_message(self, format, *args):
+            return  # keep per-request access logs out of the test output
+
+    # Port 0 asks for a free port; the with-block closes the listening socket on exit.
+    with ThreadingHTTPServer(('127.0.0.1', 0), Handler) as server:
+        thread = threading.Thread(target=server.serve_forever, daemon=True)
+        thread.start()
+        try:
+            yield f'http://127.0.0.1:{server.server_address[1]}'
+        finally:
+            server.shutdown()
+            thread.join()
+
+
+def test_scenario_recipes_hook_requires_preflight_env():
+    env = os.environ.copy()
+    env.pop('AUTONOMA_SDK_ENDPOINT', None)
+    env.pop('AUTONOMA_SHARED_SECRET', None)
+
+    code, _, err = _run_hook(
+        {
+            'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
+            'autonoma/discover.json': json.dumps(VALID_DISCOVER),
+        },
+        'autonoma/scenario-recipes.json',
+        env,
+    )
+
+    assert code == 2
+    assert 'scenario-recipes-preflight' in err
+    assert 'AUTONOMA_SDK_ENDPOINT is not set' in err
+
+
+def test_scenario_recipes_hook_runs_preflight_successfully():
+    with _sdk_server() as endpoint:
+        env = os.environ.copy()
+        env['AUTONOMA_SDK_ENDPOINT'] = endpoint
+        env['AUTONOMA_SHARED_SECRET'] = 'test-secret'
+
+        code, out, err = _run_hook(
+            {
+                'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
+                'autonoma/discover.json': json.dumps(VALID_DISCOVER),
+            },
+            'autonoma/scenario-recipes.json',
+            env,
+        )
+
+    assert code == 0
+    assert out == ''
+    assert err == ''
+
+
+def test_scenario_recipes_hook_blocks_failed_preflight():
+    with _sdk_server(up_status=500) as endpoint:
+        env = os.environ.copy()
+        env['AUTONOMA_SDK_ENDPOINT'] = endpoint
+        env['AUTONOMA_SHARED_SECRET'] = 'test-secret'
+
+        code, _, err = _run_hook(
+            {
+                'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
+                'autonoma/discover.json': json.dumps(VALID_DISCOVER),
+            },
+            'autonoma/scenario-recipes.json',
+            env,
+        )
+
+    assert code == 2
+    assert 'scenario-recipes-preflight' in err
+    assert 'HTTP 500' in err
diff --git a/tests/test_validate_scenario_recipes.py b/tests/test_validate_scenario_recipes.py
new file mode 100644
index 0000000..06ca47d
--- /dev/null
+++ b/tests/test_validate_scenario_recipes.py
@@ -0,0 +1,362 @@
+"""Tests for validate_scenario_recipes.py."""
+import json
+from conftest import run_validator, run_validator_with_dir
+
+SCRIPT = 'validate_scenario_recipes.py'
+
+VALID_DISCOVER = {
+    'schema': {
+        'models': [
+            {
+                'name': 'Organization',
+                'fields': [
+                    {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    {'name': 'communicationChannel', 'type': 'enum(slack)', 'isRequired': False, 'isId': False, 'hasDefault': False},
+                    {'name': 'teamSlugs', 'type': 'String[]', 'isRequired': True, 'isId': False, 'hasDefault': True},
+                ],
+            },
+            {
+                'name': 'User',
+                'fields': [
+                    {'name': 'email', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                ],
+            },
+        ],
+        'edges': [],
+        'relations': [],
+        'scopeField': 'organizationId',
+    }
+}
+
+VALID_DATA = {
+    'version': 1,
+    'source': {
+        'discoverPath': 'autonoma/discover.json',
+        'scenariosPath': 'autonoma/scenarios.md',
+    },
+    'validationMode': 'sdk-check',
+    'recipes': [
+        {
+            'name': 'standard',
+            'description': 'Realistic variety for core flows',
+            'create': {
+                'Organization': [{'name': 'Standard Org {{testRunId}}'}],
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'checkScenario',
+                'phase': 'ok',
+                'up_ms': 12,
+                'down_ms': 8,
+            },
+        },
+        {
+            'name': 'empty',
+            'description': 'Empty-state scenario',
+            'create': {
+                'Organization': [{'name': 'Empty Org {{testRunId}}'}],
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'checkScenario',
+                'phase': 'ok',
+            },
+        },
+        {
+            'name': 'large',
+            'description': 'High-volume scenario',
+            'create': {
+                'Organization': [{'name': 'Large Org {{testRunId}}'}],
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'endpoint-up-down',
+                'phase': 'ok',
+                'up_ms': 120,
+                'down_ms': 65,
+            },
+        },
+    ],
+}
+
+VALID_DATA_WITH_VARIABLES = {
+    'version': 1,
+    'source': {
+        'discoverPath': 'autonoma/discover.json',
+        'scenariosPath': 'autonoma/scenarios.md',
+    },
+    'validationMode': 'sdk-check',
+    'recipes': [
+        {
+            'name': 'standard',
+            'description': 'Realistic variety for core flows',
+            'create': {
+                'User': [{'email': '{{owner_email}}'}],
+            },
+            'variables': {
+                'owner_email': {
+                    'strategy': 'derived',
+                    'source': 'testRunId',
+                    'format': 'owner+{testRunId}@example.com',
+                },
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'checkScenario',
+                'phase': 'ok',
+            },
+        },
+        {
+            'name': 'empty',
+            'description': 'Empty-state scenario',
+            'create': {
+                'Organization': [{'name': 'Empty Org'}],
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'checkScenario',
+                'phase': 'ok',
+            },
+        },
+        {
+            'name': 'large',
+            'description': 'High-volume scenario',
+            'create': {
+                'Organization': [{'name': '{{company}}'}],
+            },
+            'variables': {
+                'company': {
+                    'strategy': 'faker',
+                    'generator': 'company.name',
+                },
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'endpoint-up-down',
+                'phase': 'ok',
+            },
+        },
+    ],
+}
+
+
+def _json(data):
+    return json.dumps(data)
+
+
+def _run_recipe_validator(data, discover=None):
+    if discover is None:
+        discover = VALID_DISCOVER
+    files = {
+        'autonoma/scenario-recipes.json': _json(data),
+        'autonoma/discover.json': _json(discover),
+    }
+    return run_validator_with_dir(SCRIPT, files, 'autonoma/scenario-recipes.json')
+
+
+def test_valid_scenario_recipes():
+    code, out = _run_recipe_validator(VALID_DATA)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_valid_with_variables():
+    code, out = _run_recipe_validator(VALID_DATA_WITH_VARIABLES)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_valid_concrete_without_variables():
+    """Fully concrete recipes (no tokens) should pass without variables."""
+    data = {
+        'version': 1,
+        'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'},
+        'validationMode': 'sdk-check',
+        'recipes': [
+            {'name': 'standard', 'description': 'Std', 'create': {'Organization': [{'name': 'Acme'}]},
+             'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}},
+            {'name': 'empty', 'description': 'Empty', 'create': {'Organization': [{'name': 'None'}]},
+             'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}},
+            {'name': 'large', 'description': 'Large', 'create': {'Organization': [{'name': 'Big'}]},
+             'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'}},
+        ],
+    }
+    code, out = _run_recipe_validator(data)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_invalid_json():
+    code, out = run_validator(SCRIPT, '{not json', 'scenario-recipes.json')
+    assert code == 1
+    assert 'Invalid JSON' in out
+
+
+def test_missing_required_fields():
+    code, out = _run_recipe_validator({'recipes': []})
+    assert code == 1
+    assert 'Missing required fields' in out
+
+
+def test_invalid_validation_mode():
+    data = {**VALID_DATA, 'validationMode': 'rollback'}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'validationMode must be one of' in out
+
+
+def test_missing_required_recipe_name():
+    data = {**VALID_DATA}
+    data['recipes'] = [
+        VALID_DATA['recipes'][0],
+        VALID_DATA['recipes'][1],
+        {
+            'name': 'custom',
+            'description': 'Extra recipe',
+            'create': {
+                'Organization': [{'name': 'Custom Org {{testRunId}}'}],
+            },
+            'validation': {
+                'status': 'validated',
+                'method': 'checkScenario',
+                'phase': 'ok',
+            },
+        },
+    ]
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'Missing required recipes' in out
+
+
+def test_recipe_requires_create():
+    data = {**VALID_DATA}
+    data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']]
+    data['recipes'][0]['create'] = {}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'create must be a non-empty object' in out
+
+
+def test_validation_status_must_be_validated():
+    data = {**VALID_DATA}
+    data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']]
+    data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation'])
+    data['recipes'][0]['validation']['status'] = 'draft'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'validation.status must be exactly "validated"' in out
+
+
+def test_validation_phase_must_be_ok():
+    data = {**VALID_DATA}
+    data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']]
+    data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation'])
+    data['recipes'][0]['validation']['phase'] = 'up'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'validation.phase must be exactly "ok"' in out
+
+
+def test_validation_method_must_be_known():
+    data = {**VALID_DATA}
+    data['recipes'] = [dict(recipe) for recipe in VALID_DATA['recipes']]
+    data['recipes'][0]['validation'] = dict(data['recipes'][0]['validation'])
+    data['recipes'][0]['validation']['method'] = 'custom'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'validation.method must be one of' in out
+
+
+# --- Variables validation tests ---
+
+def test_token_without_variable_definition():
+    """Token in create with no matching variable should fail."""
+    # Deep-copy the shared fixture via a JSON round-trip so it is never mutated.
+    data = json.loads(_json(VALID_DATA_WITH_VARIABLES))
+    # Add a token but no variable
+    data['recipes'][0]['create']['User'][0]['name'] = '{{missing_var}}'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'tokens without variable definitions' in out
+
+
+def test_unused_variable_definition():
+    """Variable defined but not used in create should fail."""
+    # Deep-copy the shared fixture via a JSON round-trip so it is never mutated.
+    data = json.loads(_json(VALID_DATA_WITH_VARIABLES))
+    data['recipes'][0]['variables']['extra_unused'] = {
+        'strategy': 'literal',
+        'value': 'oops',
+    }
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'unused variable definitions' in out
+
+
+def test_invalid_variable_strategy():
+    """Unknown strategy should fail."""
+    # Deep-copy the shared fixture via a JSON round-trip so it is never mutated.
+    data = json.loads(_json(VALID_DATA_WITH_VARIABLES))
+    data['recipes'][0]['variables']['owner_email']['strategy'] = 'random'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'strategy must be one of' in out
+
+
+def test_invalid_derived_shape():
+    """Derived variable with wrong source should fail."""
+    # Deep-copy the shared fixture via a JSON round-trip so it is never mutated.
+    data = json.loads(_json(VALID_DATA_WITH_VARIABLES))
+    data['recipes'][0]['variables']['owner_email']['source'] = 'userId'
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'derived.source must be "testRunId"' in out
+
+
+def test_invalid_literal_scalar():
+    """Literal with non-scalar value should fail."""
+    # Deep-copy the shared fixture via a JSON round-trip so it is never mutated.
+    data = json.loads(_json(VALID_DATA_WITH_VARIABLES))
+    data['recipes'][0]['create'] = {'User': [{'email': '{{owner_email}}'}]}
+    data['recipes'][0]['variables'] = {
+        'owner_email': {
+            'strategy': 'literal',
+            'value': [1, 2, 3],  # not scalar
+        },
+    }
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'literal.value must be a scalar' in out
+
+
+def test_rejects_unknown_model_from_discover():
+    data = json.loads(_json(VALID_DATA))
+    data['recipes'][0]['create'] = {'UnknownModel': [{'name': 'Acme'}]}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'is not present in discover schema' in out
+
+
+def test_rejects_unknown_field_from_discover():
+    data = json.loads(_json(VALID_DATA))
+    data['recipes'][0]['create'] = {'Organization': [{'unknownField': 'Acme'}]}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'unknownField is not present in discover schema' in out
+
+
+def test_rejects_invalid_enum_literal_from_discover():
+    data = json.loads(_json(VALID_DATA))
+    data['recipes'][0]['create'] = {'Organization': [{'communicationChannel': 'EMAIL'}]}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'invalid enum value "EMAIL"' in out
+
+
+def test_rejects_non_list_value_for_list_field():
+    data = json.loads(_json(VALID_DATA))
+    data['recipes'][0]['create'] = {'Organization': [{'teamSlugs': 'qa-team'}]}
+    code, out = _run_recipe_validator(data)
+    assert code == 1
+    assert 'must be a list because discover type is String[]' in out
diff --git a/tests/test_validate_scenarios.py b/tests/test_validate_scenarios.py
index 1101459..b8bcf3d 100644
--- a/tests/test_validate_scenarios.py
+++ b/tests/test_validate_scenarios.py
@@ -9,22 +9,70 @@
 scenarios:
   - name: standard
     description: Typical usage
-    entity_types: [user, task]
+    entity_types: 2
     total_entities: 10
   - name: empty
     description: No data
-    entity_types: [user]
+    entity_types: 0
     total_entities: 0
   - name: large
     description: Stress test
-    entity_types: [user, task, project]
+    entity_types: 3
     total_entities: 1000
 entity_types:
   - name: user
   - name: task
+discover:
+  source: sdk
+  model_count: 4
+  edge_count: 3
+  relation_count: 2
+  scope_field: organizationId
+variable_fields:
+  - token: "{{project_title}}"
+    entity: Project.title
+    scenarios:
+      - standard
+      - large
+    reason: title must be unique per test run
+    test_reference: ({{project_title}} variable)
+planning_sections:
+  - sdk_discover
+  - schema_summary
+  - relationship_map
+  - variable_data_strategy
 ---
 
 # Scenarios
+
+## SDK Discover
+
+Models: 4
+
+## Schema Summary
+
+- User
+- Task
+
+## Relationship Map
+
+- User.organizationId -> Organization.id
+
+## Variable Data Strategy
+
+- `{{project_title}}` is generated.
+
+## Scenario: `standard`
+
+Standard details.
+
+## Scenario: `empty`
+
+Empty details.
+
+## Scenario: `large`
+
+Large details.
 """
 
 
@@ -47,6 +95,23 @@ def test_missing_required_fields():
     assert 'Missing required frontmatter fields' in out
 
 
+def test_missing_discover_field():
+    content = VALID.replace(
+        "discover:\n  source: sdk\n  model_count: 4\n  edge_count: 3\n  relation_count: 2\n  scope_field: organizationId\n",
+        "",
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert "discover" in out
+
+
+def test_discover_source_must_be_sdk():
+    content = VALID.replace('source: sdk', 'source: codebase')
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'discover.source must be exactly "sdk"' in out
+
+
 def test_scenario_count_too_low():
     content = VALID.replace('scenario_count: 3', 'scenario_count: 2')
     code, out = run_validator(SCRIPT, content)
@@ -62,7 +127,6 @@ def test_scenario_count_mismatch():
 
 
 def test_missing_required_scenario_name():
-    # Replace 'large' with 'extra' — now 'large' is missing
     content = VALID.replace('name: large', 'name: extra')
     code, out = run_validator(SCRIPT, content)
     assert code == 1
@@ -71,7 +135,6 @@ def test_missing_required_scenario_name():
 
 
 def test_scenario_missing_field():
-    # Remove description from first scenario
     content = VALID.replace(
         '  - name: standard\n    description: Typical usage',
         '  - name: standard',
@@ -99,3 +162,53 @@ def test_entity_type_missing_name():
     code, out = run_validator(SCRIPT, content)
     assert code == 1
     assert 'must be a mapping with at least a "name" field' in out
+
+
+def test_variable_token_must_use_double_curly_braces():
+    content = VALID.replace('token: "{{project_title}}"', 'token: project_title')
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'must use double curly braces' in out
+
+
+def test_variable_generator_is_optional():
+    code, out = run_validator(SCRIPT, VALID)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_non_faker_generator_is_accepted():
+    content = VALID.replace(
+        '    reason: title must be unique per test run\n',
+        '    generator: derived from testRunId\n    reason: title must be unique per test run\n',
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_empty_generator_fails_if_present():
+    content = VALID.replace(
+        '    reason: title must be unique per test run\n',
+        '    generator: ""\n    reason: title must be unique per test run\n',
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'generator must be a non-empty string if present' in out
+
+
+def test_variable_scenarios_must_be_known():
+    content = VALID.replace('      - large', '      - invalid')
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'unknown scenario names' in out
+
+
+def test_missing_required_planning_section():
+    content = VALID.replace(
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n',
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'Missing required planning_sections' in out

From 72689e9de710f9985b110e201843d0c3de8ab9eb Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 11:44:53 -0300
Subject: [PATCH 04/33] fix: require nested tree structure in recipe create
 payloads

The Autonoma dashboard may reorder JSON object keys when forwarding
create payloads to SDK endpoints. Flat top-level model keys connected
by _ref break when children are processed before parents, causing
NOT NULL constraint violations on FK columns.

Recipes must now use nested trees rooted at the scope entity. Also
restricts template tokens to SDK built-ins ({{testRunId}}, {{index}},
etc.) since custom tokens like {{user_email_alice}} fail at runtime.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agents/env-factory-generator.md | 66 +++++++++++++++++++++++++++++----
 agents/scenario-generator.md    |  2 +
 2 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md
index 5e18487..c315bbd 100644
--- a/agents/env-factory-generator.md
+++ b/agents/env-factory-generator.md
@@ -114,6 +114,47 @@ Required protections:
 - Prefer explicit create and teardown ordering based on the schema
 - If `discover` already works but `up` / `down` do not, keep the introspection path and finish the lifecycle
 
+### CRITICAL: Use nested tree structure in `create` payloads
+
+Recipe `create` payloads MUST use a **nested tree** rooted at the scope entity (the model that
+owns `scopeField`). Do NOT use flat top-level model keys connected only by `_ref`.
+
+**Why:** The Autonoma dashboard may reorder JSON object keys when forwarding the `create` payload
+to the SDK endpoint. The SDK's `resolveTree` processes models in `Object.entries(create)` insertion
+order. If a child model (e.g. `Tasks`) appears before its parent (e.g. `Organizations`), `_ref`
+aliases are not yet registered, the INSERT runs without the FK value, and NOT NULL constraints fail.
+
+**How:** Nest children inside their parent using the SDK's relation field names from `discover.json`.
+Look at the `relations` array in the discover response — the `parentField` value is the nesting key.
+
+Instead of flat `_ref`:
+```json
+{
+  "Organizations": [{"_alias": "org1", "name": "Acme"}],
+  "Users": [{"name": "Alice", "organizationId": {"_ref": "org1"}}]
+}
+```
+
+Use nested tree:
+```json
+{
+  "Organizations": [{
+    "_alias": "org1",
+    "name": "Acme",
+    "userses": [{"_alias": "u1", "name": "Alice"}]
+  }]
+}
+```
+
+The SDK automatically sets the child FK (`organizationId`) when a child is nested under its parent.
+Use `_ref` only for **cross-branch** references that cannot be expressed by nesting (e.g. a Task
+nested under a Project that references a User nested under the same Organization via `assigneeId`).
+
+Only use SDK built-in template tokens in `create` values (typically `{{testRunId}}`) — do not invent
+custom tokens like `{{user_email_alice}}`. The SDK's template engine only resolves built-in
+expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`, etc.). Custom tokens cause a runtime
+error when the dashboard sends the payload directly to the endpoint.
+
 ## CRITICAL: Smoke-Test and Validate Within the Session
 
 After implementing, test the lifecycle in-session.
@@ -148,17 +189,24 @@ The file must be a JSON object in this exact logical shape:
       "name": "standard",
       "description": "Realistic dataset for core flows",
       "create": {
-        "User": [
-          {
-            "email": "{{owner_email}}"
-          }
-        ]
+        "Organization": [{
+          "_alias": "org1",
+          "name": "Acme Corp",
+          "userses": [
+            { "_alias": "owner", "email": "owner-{{testRunId}}@example.com" }
+          ],
+          "projectses": [
+            { "name": "Main Project", "taskses": [
+              { "title": "First task", "assigneeId": { "_ref": "owner" } }
+            ]}
+          ]
+        }]
       },
       "variables": {
-        "owner_email": {
+        "testRunId": {
           "strategy": "derived",
           "source": "testRunId",
-          "format": "owner+{testRunId}@example.com"
+          "format": "{testRunId}"
         }
       },
       "validation": {
@@ -173,6 +221,10 @@ The file must be a JSON object in this exact logical shape:
 }
 ```
 
+**Note:** The `create` payload uses a nested tree structure. Children are nested under parents using
+the relation field names from `discover.json` (e.g. `userses`, `projectses`, `taskses`). The SDK
+automatically fills in parent FK fields. Only cross-branch references use `_ref`.
+
 Required rules:
 - top-level keys must be `version`, `source`, `validationMode`, and `recipes`
 - `version` must be the integer `1`
diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md
index bc033e9..440cd44 100644
--- a/agents/scenario-generator.md
+++ b/agents/scenario-generator.md
@@ -200,3 +200,5 @@ The validation checks:
 - Every enum value must be covered in `standard`
 - Use the SDK discover output instead of re-deriving the schema from local code
 - If the discover artifact is missing, ask the user to provide a working SDK discover response
+- Only use SDK built-in template tokens (typically `{{testRunId}}`) — do not invent custom variable tokens like `{{user_email_alice}}`. The SDK template engine only resolves built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`, etc.). Custom tokens cause a runtime error when the dashboard sends the payload directly to the endpoint. If a field needs uniqueness, inline the testRunId directly: e.g. `alice-{{testRunId}}@test.local`
+- Design scenario entity tables so they can be expressed as a nested tree rooted at the scope entity. The Step 4 agent will convert scenarios into nested `create` payloads — flat structures that link top-level models only through `_ref` break when JSON key order is not preserved

From 940a48bb21d6276ae548b17aec8d8893c699b4e2 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 15:12:40 -0300
Subject: [PATCH 05/33] fix: require variable fields for time-sensitive and
 constrained values

Hardcoded dates, deadlines, and uniquely constrained fields cause test
failures when values expire or collide across runs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agents/scenario-generator.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md
index 440cd44..00e9f2b 100644
--- a/agents/scenario-generator.md
+++ b/agents/scenario-generator.md
@@ -55,6 +55,10 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom
    - the value is inherently time-based, unstable, or nondeterministic
    - hardcoding it would make later tests misleading or brittle
 
+   Fields that are time-sensitive (dates, deadlines, timestamps) or have any uniqueness/format
+   constraint enforced by the database or application **must** be variable — hardcoding them
+   will cause test failures when the hardcoded value expires or collides.
+
    Do not mark a field as variable just because:
    - it is user-facing text
    - it could be unique in theory

From 0b840dfb5e8f4715917872a80ddab13da8bcc299 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 15:41:21 -0300
Subject: [PATCH 06/33] fix: block step 4 completion when recipe upload fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The upload response was logged but never checked — step completion
events fired regardless of HTTP status. Now exit 1 when the recipe file
is not valid JSON or the upload returns anything other than HTTP 200/201,
so the pipeline stops and reports the failure.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 commands/generate-tests.md     | 24 ++++++++++++++----------
 skills/generate-tests/SKILL.md | 24 ++++++++++++++----------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/commands/generate-tests.md b/commands/generate-tests.md
index 5782d5c..325317e 100644
--- a/commands/generate-tests.md
+++ b/commands/generate-tests.md
@@ -377,16 +377,20 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
   -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
 if [ -n "$GENERATION_ID" ]; then
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
-  if python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d @"$RECIPE_PATH")
-    UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-    UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
-    echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
-  else
-    echo "WARNING: scenario-recipes.json is not valid JSON, skipping upload"
+  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
+    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
+    exit 1
+  fi
+  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @"$RECIPE_PATH")
+  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
+    exit 1
   fi
 fi
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md
index 15edf76..34e7da9 100644
--- a/skills/generate-tests/SKILL.md
+++ b/skills/generate-tests/SKILL.md
@@ -345,16 +345,20 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 if [ -n "$GENERATION_ID" ]; then
   AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
-  if python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d @"$RECIPE_PATH")
-    UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-    UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
-    echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
-  else
-    echo "WARNING: scenario-recipes.json is not valid JSON, skipping upload"
+  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
+    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
+    exit 1
+  fi
+  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @"$RECIPE_PATH")
+  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
+    exit 1
   fi
 fi
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \

From fb72ed181c532efca536b41b63639b6700d5abd9 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 15:46:07 -0300
Subject: [PATCH 07/33] fix: recipe validator now accepts nested tree creates
 with relation fields

The validator checked entity fields against discover schema columns, but
nested tree payloads use relation field names (e.g. userses, projectses)
as nesting keys. These are not columns and were rejected. Now the
validator extracts relation parentField names from discover and skips
them during field validation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 hooks/validators/validate_scenario_recipes.py | 24 ++++++--
 tests/test_validate_scenario_recipes.py       | 61 +++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/hooks/validators/validate_scenario_recipes.py b/hooks/validators/validate_scenario_recipes.py
index 14f7f50..7cefb87 100644
--- a/hooks/validators/validate_scenario_recipes.py
+++ b/hooks/validators/validate_scenario_recipes.py
@@ -85,7 +85,15 @@ def _load_discover_schema(filepath, source):
                 field_map[field_name] = field
         model_map[name] = field_map
 
-    return model_map, None
+    # Collect relation field names used as nesting keys in nested tree create payloads
+    relation_fields = set()
+    relations = schema.get('relations')
+    if isinstance(relations, list):
+        for rel in relations:
+            if isinstance(rel, dict) and isinstance(rel.get('parentField'), str):
+                relation_fields.add(rel['parentField'])
+
+    return {'models': model_map, 'relation_fields': relation_fields}, None
 
 
 def _validate_value_against_field(value, field, path):
@@ -114,10 +122,13 @@ def _validate_value_against_field(value, field, path):
     return None
 
 
-def _validate_create_against_discover(create, model_map, recipe_index):
-    if model_map is None:
+def _validate_create_against_discover(create, discover_info, recipe_index):
+    if discover_info is None:
         return None
 
+    model_map = discover_info['models']
+    relation_fields = discover_info['relation_fields']
+
     for model_name, entities in create.items():
         if model_name not in model_map:
             return f'recipes[{recipe_index}].create.{model_name} is not present in discover schema'
@@ -131,6 +142,9 @@ def _validate_create_against_discover(create, model_map, recipe_index):
             for field_name, value in entity.items():
                 if field_name.startswith('_'):
                     continue
+                # Skip relation nesting keys (e.g. userses, projectses)
+                if field_name in relation_fields:
+                    continue
                 if field_name not in field_map:
                     return (
                         f'recipes[{recipe_index}].create.{model_name}[{entity_index}].{field_name} '
@@ -180,7 +194,7 @@ def _validate_create_against_discover(create, model_map, recipe_index):
         print(f'source.{field} must be a non-empty string')
         sys.exit(1)
 
-discover_model_map, discover_error = _load_discover_schema(filepath, source)
+discover_info, discover_error = _load_discover_schema(filepath, source)
 if discover_error is not None:
     print(discover_error)
     sys.exit(1)
@@ -224,7 +238,7 @@ def _validate_create_against_discover(create, model_map, recipe_index):
     if not isinstance(create, dict) or len(create) == 0:
         print(f'recipes[{i}].create must be a non-empty object')
         sys.exit(1)
-    create_error = _validate_create_against_discover(create, discover_model_map, i)
+    create_error = _validate_create_against_discover(create, discover_info, i)
     if create_error is not None:
         print(create_error)
         sys.exit(1)
diff --git a/tests/test_validate_scenario_recipes.py b/tests/test_validate_scenario_recipes.py
index 06ca47d..6778337 100644
--- a/tests/test_validate_scenario_recipes.py
+++ b/tests/test_validate_scenario_recipes.py
@@ -360,3 +360,64 @@ def test_rejects_non_list_value_for_list_field():
     code, out = _run_recipe_validator(data)
     assert code == 1
     assert 'must be a list because discover type is String[]' in out
+
+
+def test_nested_tree_with_relation_fields():
+    """Nested tree creates using relation field names from discover should pass."""
+    discover = {
+        'schema': {
+            'models': [
+                {
+                    'name': 'Organization',
+                    'fields': [
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+                {
+                    'name': 'User',
+                    'fields': [
+                        {'name': 'email', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+            ],
+            'edges': [
+                {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False},
+            ],
+            'relations': [
+                {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'},
+            ],
+            'scopeField': 'organizationId',
+        }
+    }
+    data = {
+        'version': 1,
+        'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'},
+        'validationMode': 'sdk-check',
+        'recipes': [
+            {
+                'name': 'standard', 'description': 'Nested tree',
+                'create': {
+                    'Organization': [{
+                        'name': 'Acme',
+                        'users': [{'name': 'Alice', 'email': 'alice@test.com'}],
+                    }],
+                },
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'empty', 'description': 'Empty',
+                'create': {'Organization': []},
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'large', 'description': 'Large',
+                'create': {'Organization': [{'name': 'Big'}]},
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+        ],
+    }
+    code, out = _run_recipe_validator(data, discover=discover)
+    assert code == 0
+    assert out == 'OK'

From 55afa4936457d9cd3368f07ead7fd8d514f636e0 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 15:55:09 -0300
Subject: [PATCH 08/33] fix: reject flat _ref format in recipe validator to
 prevent null FK on dashboard dry runs

The dashboard may reorder JSON keys when forwarding create payloads. With flat
format (multiple top-level model arrays connected by _ref), child models can
appear before parents, causing unresolved aliases and NULL FK constraint failures.

The validator now detects when an entity uses {"_ref": "..."} for a FK field
whose parent model is also a top-level key in create, and rejects it with a
clear error directing the user to use nested tree structure instead.
Cross-branch _ref (e.g. assigneeId) remains allowed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 hooks/validators/validate_scenario_recipes.py |  45 ++++-
 tests/test_validate_scenario_recipes.py       | 154 ++++++++++++++++++
 2 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/hooks/validators/validate_scenario_recipes.py b/hooks/validators/validate_scenario_recipes.py
index 7cefb87..70ad4b1 100644
--- a/hooks/validators/validate_scenario_recipes.py
+++ b/hooks/validators/validate_scenario_recipes.py
@@ -87,13 +87,33 @@ def _load_discover_schema(filepath, source):
 
     # Collect relation field names used as nesting keys in nested tree create payloads
     relation_fields = set()
+    # Map child FK fields to their parent model for flat-format detection.
+    # e.g. { ("Users", "organizationId"): "Organizations" }
+    nestable_fk_edges = {}
     relations = schema.get('relations')
     if isinstance(relations, list):
         for rel in relations:
             if isinstance(rel, dict) and isinstance(rel.get('parentField'), str):
                 relation_fields.add(rel['parentField'])
-
-    return {'models': model_map, 'relation_fields': relation_fields}, None
+            # A relation where childField is an FK column on the child model means
+            # the child SHOULD be nested under the parent via the parentField key.
+            if (isinstance(rel, dict)
+                    and isinstance(rel.get('parentModel'), str)
+                    and isinstance(rel.get('childModel'), str)
+                    and isinstance(rel.get('childField'), str)
+                    and isinstance(rel.get('parentField'), str)):
+                child_model = rel['childModel']
+                child_fk = rel['childField']
+                parent_model = rel['parentModel']
+                # Only record edges where child FK is a real column (not the reverse relation)
+                if child_model in model_map and child_fk in model_map[child_model]:
+                    nestable_fk_edges[(child_model, child_fk)] = parent_model
+
+    return {
+        'models': model_map,
+        'relation_fields': relation_fields,
+        'nestable_fk_edges': nestable_fk_edges,
+    }, None
 
 
 def _validate_value_against_field(value, field, path):
@@ -128,6 +148,9 @@ def _validate_create_against_discover(create, discover_info, recipe_index):
 
     model_map = discover_info['models']
     relation_fields = discover_info['relation_fields']
+    nestable_fk_edges = discover_info.get('nestable_fk_edges', {})
+
+    top_level_models = set(create.keys())
 
     for model_name, entities in create.items():
         if model_name not in model_map:
@@ -150,6 +173,24 @@ def _validate_create_against_discover(create, discover_info, recipe_index):
                         f'recipes[{recipe_index}].create.{model_name}[{entity_index}].{field_name} '
                         'is not present in discover schema'
                     )
+
+                # Detect flat-format _ref on FK fields that should be nested.
+                # If an entity uses {"_ref": "..."} for a FK field whose parent
+                # model is also a top-level key in create, the recipe is using
+                # flat format instead of the required nested tree structure.
+                if (isinstance(value, dict)
+                        and '_ref' in value
+                        and len(value) == 1):
+                    parent_model = nestable_fk_edges.get((model_name, field_name))
+                    if parent_model and parent_model in top_level_models:
+                        return (
+                            f'recipes[{recipe_index}].create.{model_name}[{entity_index}].{field_name} '
+                            f'uses {{"_ref": "..."}} but {model_name} should be nested under '
+                            f'{parent_model} using the relation field instead of flat _ref. '
+                            f'The dashboard may reorder JSON keys, which breaks flat _ref resolution. '
+                            f'Use a nested tree structure rooted at the scope entity.'
+                        )
+
                 error = _validate_value_against_field(
                     value,
                     field_map[field_name],
diff --git a/tests/test_validate_scenario_recipes.py b/tests/test_validate_scenario_recipes.py
index 6778337..d34735d 100644
--- a/tests/test_validate_scenario_recipes.py
+++ b/tests/test_validate_scenario_recipes.py
@@ -421,3 +421,157 @@ def test_nested_tree_with_relation_fields():
     code, out = _run_recipe_validator(data, discover=discover)
     assert code == 0
     assert out == 'OK'
+
+
+def test_rejects_flat_ref_for_nestable_fk():
+    """Flat _ref for a FK that should be expressed via nesting must be rejected.
+
+    The dashboard may reorder JSON keys, breaking insertion-order-dependent _ref
+    resolution. Child models must be nested under their parent using relation
+    field names, not placed in separate top-level arrays with _ref.
+    """
+    discover = {
+        'schema': {
+            'models': [
+                {
+                    'name': 'Organization',
+                    'tableName': 'organizations',
+                    'fields': [
+                        {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True},
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+                {
+                    'name': 'User',
+                    'tableName': 'users',
+                    'fields': [
+                        {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True},
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+            ],
+            'edges': [
+                {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False},
+            ],
+            'relations': [
+                {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'},
+            ],
+            'scopeField': 'organizationId',
+        }
+    }
+    data = {
+        'version': 1,
+        'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'},
+        'validationMode': 'sdk-check',
+        'recipes': [
+            {
+                'name': 'standard', 'description': 'Flat format with _ref',
+                'create': {
+                    'Organization': [{'_alias': 'org1', 'name': 'Acme'}],
+                    'User': [{'name': 'Alice', 'organizationId': {'_ref': 'org1'}}],
+                },
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'empty', 'description': 'Empty',
+                'create': {'Organization': []},
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'large', 'description': 'Large flat',
+                'create': {
+                    'Organization': [{'_alias': 'org2', 'name': 'Big'}],
+                    'User': [{'name': 'Bob', 'organizationId': {'_ref': 'org2'}}],
+                },
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+        ],
+    }
+    code, out = _run_recipe_validator(data, discover=discover)
+    assert code == 1
+    assert 'should be nested under Organization' in out
+    assert 'flat _ref' in out
+
+
+def test_allows_cross_branch_ref_in_nested_tree():
+    """Cross-branch _ref (e.g. assigneeId pointing to a user) is allowed.
+
+    When a model is NOT a top-level key (it's nested under its parent), a _ref
+    to it from a sibling branch is the correct pattern and must not be rejected.
+    """
+    discover = {
+        'schema': {
+            'models': [
+                {
+                    'name': 'Organization',
+                    'tableName': 'organizations',
+                    'fields': [
+                        {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True},
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+                {
+                    'name': 'User',
+                    'tableName': 'users',
+                    'fields': [
+                        {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True},
+                        {'name': 'name', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+                {
+                    'name': 'Task',
+                    'tableName': 'tasks',
+                    'fields': [
+                        {'name': 'id', 'type': 'String', 'isRequired': True, 'isId': True, 'hasDefault': True},
+                        {'name': 'title', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'assigneeId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                        {'name': 'organizationId', 'type': 'String', 'isRequired': True, 'isId': False, 'hasDefault': False},
+                    ],
+                },
+            ],
+            'edges': [
+                {'from': 'User', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False},
+                {'from': 'Task', 'to': 'User', 'localField': 'assigneeId', 'foreignField': 'id', 'nullable': False},
+                {'from': 'Task', 'to': 'Organization', 'localField': 'organizationId', 'foreignField': 'id', 'nullable': False},
+            ],
+            'relations': [
+                {'parentModel': 'Organization', 'childModel': 'User', 'parentField': 'users', 'childField': 'organizationId'},
+                {'parentModel': 'User', 'childModel': 'Task', 'parentField': 'tasks', 'childField': 'assigneeId'},
+                {'parentModel': 'Organization', 'childModel': 'Task', 'parentField': 'orgTasks', 'childField': 'organizationId'},
+            ],
+            'scopeField': 'organizationId',
+        }
+    }
+    data = {
+        'version': 1,
+        'source': {'discoverPath': 'autonoma/discover.json', 'scenariosPath': 'autonoma/scenarios.md'},
+        'validationMode': 'sdk-check',
+        'recipes': [
+            {
+                'name': 'standard', 'description': 'Nested with cross-branch ref',
+                'create': {
+                    'Organization': [{
+                        'name': 'Acme',
+                        'users': [{'_alias': 'alice', 'name': 'Alice'}],
+                        'orgTasks': [{'title': 'Task 1', 'assigneeId': {'_ref': 'alice'}}],
+                    }],
+                },
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'empty', 'description': 'Empty',
+                'create': {'Organization': []},
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+            {
+                'name': 'large', 'description': 'Large nested',
+                'create': {'Organization': [{'name': 'Big'}]},
+                'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
+            },
+        ],
+    }
+    code, out = _run_recipe_validator(data, discover=discover)
+    assert code == 0
+    assert out == 'OK'

From 01101956b2b7d44b2c7993dd9f37e45e31c5c996 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 16:21:35 -0300
Subject: [PATCH 09/33] fix: test-case-generator must use {{token}}
 placeholders for variable fields

The test-case-generator had no awareness of variable_fields from
scenarios.md, so it would hardcode dynamic values (emails, dates) as
literals in test steps. At runtime those literals wouldn't match the
resolved values. Now the agent reads variable_fields and uses {{token}}
placeholders for any dynamic data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agents/test-case-generator.md | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/agents/test-case-generator.md b/agents/test-case-generator.md
index ea05c12..5eccfaf 100644
--- a/agents/test-case-generator.md
+++ b/agents/test-case-generator.md
@@ -35,9 +35,20 @@ Your output is a directory `autonoma/qa-tests/` containing:
 2. Read all input files:
    - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count
    - All files in `autonoma/skills/`
-   - `autonoma/scenarios.md` — parse the frontmatter to get scenarios and entity_types
+   - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields**
 
-3. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
+3. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter
+   declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has
+   a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`.
+   When writing test steps that involve a variable field value — typing it, asserting it, or
+   navigating to it — you MUST use the `{{token}}` placeholder, never the hardcoded literal from
+   the scenario body. At runtime the agent resolves these tokens to their actual values.
+
+   Example: if `variable_fields` includes `{{deadline_1}}` for `Tasks.deadline`:
+   - good: "assert the task deadline shows `{{deadline_1}}`"
+   - bad: "assert the task deadline shows 2025-06-15"
+
+4. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
    The scenarios exist only to provide preconditions and known data for app behavior tests.
    Do NOT generate tests whose purpose is to verify:
    - that the scenario contains the documented entity counts
@@ -50,17 +61,17 @@ Your output is a directory `autonoma/qa-tests/` containing:
    - good: "open the project `{{project_title}}` and verify editing works"
    - bad: "verify the scenario created 12 projects and 3 users"
 
-4. Count the routes/features/pages in the codebase to establish the coverage correlation.
+5. Count the routes/features/pages in the codebase to establish the coverage correlation.
    The total test count should roughly correlate:
    - Rule of thumb: 3-5 tests per route/feature for supporting flows
    - Rule of thumb: 8-15 tests per core flow
    - This is approximate — use judgment, but the INDEX must declare the correlation
 
-5. Generate test files organized in subdirectories by feature/flow.
+6. Generate test files organized in subdirectories by feature/flow.
 
-6. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
+7. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
 
-7. Write individual test files into subdirectories.
+8. Write individual test files into subdirectories.
 
 ## CRITICAL: INDEX.md Format
 
@@ -157,7 +168,7 @@ The body follows the standard Autonoma test format from the fetched instructions
 - **Administrative/settings**: 15-20% of tests, mostly `mid` and `low`
 - Never write conditional steps — each test follows one deterministic path
 - Assertions must specify exact text, element, or visual state
-- Reference scenario data by exact values from scenarios.md
+- Reference scenario data by exact values from scenarios.md, EXCEPT for variable fields — use `{{token}}` placeholders for those
 - Do not spend test budget "auditing" scenario contents. Scenario data is setup, not the product behavior under test.
 - Do not write meta-tests such as "verify the seeded counts match scenarios.md" or "verify the Environment Factory created the right fixtures"
 - If a seeded value is not needed for a user-facing flow, do not assert it just because it exists in scenarios.md

From d8e0e056230c423443635d3d68b8038e6c01ceda Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 19:26:39 -0300
Subject: [PATCH 10/33] feat: verify recipe persistence before sending
 step.completed

After uploading scenario recipes, call GET /setups/:id/scenarios
to verify every recipe name from scenario-recipes.json exists on
the dashboard with hasActiveRecipe: true. Only send step.completed(3)
after verification passes. If any recipe is missing, report the
mismatch and stop.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 skills/generate-tests/SKILL.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md
index 34e7da9..7d09efb 100644
--- a/skills/generate-tests/SKILL.md
+++ b/skills/generate-tests/SKILL.md
@@ -360,6 +360,36 @@ if [ -n "$GENERATION_ID" ]; then
     echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
     exit 1
   fi
+
+  # Verify recipes were persisted by fetching them back from the dashboard
+  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
+  if [ "$VERIFY_STATUS" != "200" ]; then
+    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
+    exit 1
+  fi
+  # Check every recipe name from the uploaded file against the dashboard's active scenarios in a
+  # single Python pass. The path and names travel as data (argv/stdin) rather than interpolated
+  # source, so names containing spaces or quotes cannot split or break the check.
+  MISSING=$(echo "$VERIFY_BODY" | python3 -c "
+import json, sys
+with open(sys.argv[1]) as fh:
+    expected = [r['name'] for r in json.load(fh)['recipes']]
+scenarios = json.load(sys.stdin).get('scenarios', [])
+active = {s.get('name') for s in scenarios if s.get('hasActiveRecipe')}
+# Each missing name is prefixed with a space to fit the error message printed below.
+print(''.join(' ' + name for name in expected if name not in active))
+" "$RECIPE_PATH" 2>/dev/null) || MISSING=" (verification failed: unreadable recipes file or response)"
+  # If Python itself fails, MISSING is set to a non-empty marker so the step cannot pass
+  # verification silently.
+  if [ -n "$MISSING" ]; then
+    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
+    echo "Step 4 cannot complete. Recipe upload may have partially failed."
+    exit 1
+  fi
+  echo "Verified: all scenario recipes persisted successfully on the dashboard."
 fi
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \

From 8b2bbe4a5b19c5f78cf4fda46570707de2361cae Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 19:27:44 -0300
Subject: [PATCH 11/33] chore: bump plugin version to 1.3.0

Includes recipe upload verification against dashboard before
sending step.completed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude-plugin/plugin.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 2de57c6..c6e0f1b 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.2.1",
+  "version": "1.3.0",
   "author": {
     "name": "Autonoma"
   }

From c62422b8dda9d3603546ff3e33a4cb5f100b87b9 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 9 Apr 2026 19:38:48 -0300
Subject: [PATCH 12/33] fix: pull plugin from main branch instead of
 sdk-scenarios

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 0cec48e..f3b9e9f 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -12,7 +12,7 @@
       "source": {
         "source": "url",
         "url": "https://github.com/IgnacioPardo/test-planner-plugin-sc-v2.git",
-        "ref": "IgnacioPardo/sdk-scenarios"
+        "ref": "main"
       },
       "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
     }

From bb21f1ce74915b4a65299d7eb00b6996259a08b8 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Fri, 10 Apr 2026 11:24:00 -0300
Subject: [PATCH 13/33] feat: testRunId scoping for non-multi-tenant apps

Agents now analyze the discover schema to detect whether the app has
natural per-run isolation. When it doesn't, the scenario generator
aggressively slugs all identifying fields (names, titles, descriptions)
with {{testRunId}} so parallel or sequential test runs never collide.

Bumps plugin version to 1.4.0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude-plugin/plugin.json             |  2 +-
 agents/env-factory-generator.md        | 12 ++++++++++++
 agents/scenario-generator.md           | 26 ++++++++++++++++++++++++++
 hooks/validators/validate_scenarios.py |  7 ++++++-
 tests/test_validate_scenarios.py       | 20 ++++++++++++++++++++
 5 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index c6e0f1b..a51c30a 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.3.0",
+  "version": "1.4.0",
   "author": {
     "name": "Autonoma"
   }
diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md
index c315bbd..45998c1 100644
--- a/agents/env-factory-generator.md
+++ b/agents/env-factory-generator.md
@@ -114,6 +114,18 @@ Required protections:
 - Prefer explicit create and teardown ordering based on the schema
 - If `discover` already works but `up` / `down` do not, keep the introspection path and finish the lifecycle
 
+### Per-run data isolation via testRunId
+
+When `scenarios.md` contains many variable fields with `generator: derived from testRunId` — especially
+on identifying fields like names, titles, and descriptions, not just emails — the app lacks natural
+multi-tenant isolation. The scenario generator slugged these fields so that parallel or sequential
+test runs never collide.
+
+Preserve all of these `{{testRunId}}` tokens in `create` payloads and map them to `derived` strategy
+entries in the recipe `variables` block. Do not collapse slugged fields back into concrete literals.
+For these apps, `testRunId` is effectively required for correct operation — note this in the summary
+you present to the user at the end of Step 4.
+
 ### CRITICAL: Use nested tree structure in `create` payloads
 
 Recipe `create` payloads MUST use a **nested tree** rooted at the scope entity (the model that
diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md
index 00e9f2b..342899a 100644
--- a/agents/scenario-generator.md
+++ b/agents/scenario-generator.md
@@ -36,6 +36,19 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom
    - parent/child relations
    - scope field
 
+   While reading the schema, assess whether the scope entity provides real **per-run data isolation**.
+   Ask yourself: does the scope entity parent most other models via required foreign keys? Can a new
+   scope entity be created per test run (i.e. it has creatable fields beyond just auto-generated IDs)?
+   Do most models in the graph eventually chain back to the scope entity?
+
+   If the answer is yes to all of these, the app has natural multi-tenant isolation — each test run
+   can create its own scope entity and all child data is automatically partitioned.
+
+   If the scope entity is a singleton, shared across users, or doesn't meaningfully partition data
+   across concurrent runs, the app **lacks natural per-run isolation**. In this case you must slug
+   all identifying fields with `{{testRunId}}` (see step 6 below) so that parallel or sequential
+   test runs never collide on lookup, search, or assertion values.
+
    If `autonoma/discover.json` is missing or malformed, stop and tell the user that Step 2 now
    requires a valid SDK discover artifact before scenario generation can continue.
 
@@ -49,11 +62,22 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom
    Example: prefer `Acme Project testRunId suffix` encoded as a concrete scenario value over turning the whole field
    into `{{project_name}}` unless later tests truly need the placeholder.
 
+   **Exception — apps without natural per-run isolation:** If your scoping analysis in step 3
+   determined the app lacks natural multi-tenant isolation, **reverse the default above**. Slug ALL
+   identifying fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline
+   `{{testRunId}}` so that every value a test might search for, type into a form, or assert on screen
+   is unique to that test run. Use the pattern `Concrete Value {{testRunId}}` (e.g.
+   `Acme Corp {{testRunId}}`, `Main Project {{testRunId}}`). Each slugged field becomes a
+   `variable_field` entry with `generator: derived from testRunId`. This prevents parallel or
+   sequential test runs from interfering with each other when there is no scope entity to partition
+   the data.
+
    Use variable fields sparingly. Only mark a value as variable when at least one of these is true:
    - the field must be globally unique or is highly collision-prone across runs
    - the backend or SDK generates the value at runtime
    - the value is inherently time-based, unstable, or nondeterministic
    - hardcoding it would make later tests misleading or brittle
+   - **the app lacks natural per-run isolation** and the field is used in lookups, searches, or assertions
 
    Fields that are time-sensitive (dates, deadlines, timestamps) or have any uniqueness/format
    constraint enforced by the database or application **must** be variable — hardcoding them
@@ -162,6 +186,7 @@ planning_sections:
   - `schema_summary`
   - `relationship_map`
   - `variable_data_strategy`
+  - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation and you need to explain why fields were aggressively slugged with `{{testRunId}}`
 
 ### After the frontmatter
 
@@ -170,6 +195,7 @@ The rest of the file follows the standard scenarios.md format from the fetched i
 - Include a `## Schema Summary` section listing the key models and required fields that drive the scenarios.
 - Include a `## Relationship Map` section describing the important parent/child and FK relationships.
 - Include a `## Variable Data Strategy` section explaining which values are generated and how tests should reference them.
+- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation — explain why fields were aggressively slugged with `{{testRunId}}` and what isolation boundary the slugging replaces.
 - Scenario: `standard` (credentials, entity tables with concrete data, aggregate counts)
 - Scenario: `empty` (credentials, all entity types listed as None)
 - Scenario: `large` (credentials, high-volume data described in aggregate)
diff --git a/hooks/validators/validate_scenarios.py b/hooks/validators/validate_scenarios.py
index 9bbbaec..8580715 100644
--- a/hooks/validators/validate_scenarios.py
+++ b/hooks/validators/validate_scenarios.py
@@ -156,6 +156,11 @@
     'relationship_map',
     'variable_data_strategy',
 }
+optional_sections = {
+    'scoping_analysis',
+}
+allowed_sections = required_sections | optional_sections
+
 unknown_sections = [section for section in planning_sections if not isinstance(section, str) or len(section.strip()) == 0]
 if unknown_sections:
     print('planning_sections must contain only non-empty strings')
@@ -167,7 +172,7 @@
     sys.exit(1)
 
 for section in planning_sections:
-    if section not in required_sections:
+    if section not in allowed_sections:
         print(f'planning_sections contains unknown value: {section}')
         sys.exit(1)
 
diff --git a/tests/test_validate_scenarios.py b/tests/test_validate_scenarios.py
index b8bcf3d..40c55c0 100644
--- a/tests/test_validate_scenarios.py
+++ b/tests/test_validate_scenarios.py
@@ -212,3 +212,23 @@ def test_missing_required_planning_section():
     code, out = run_validator(SCRIPT, content)
     assert code == 1
     assert 'Missing required planning_sections' in out
+
+
+def test_scoping_analysis_optional_section_accepted():
+    content = VALID.replace(
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n  - scoping_analysis\n',
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_unknown_planning_section_rejected():
+    content = VALID.replace(
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
+        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n  - made_up_section\n',
+    )
+    code, out = run_validator(SCRIPT, content)
+    assert code == 1
+    assert 'planning_sections contains unknown value: made_up_section' in out

From 27d3d8d464ba8dea0af0b4da1a5137088b9674b8 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <65306107+IgnacioPardo@users.noreply.github.com>
Date: Thu, 16 Apr 2026 18:33:44 -0300
Subject: [PATCH 14/33] fix: change plugin source to local path

Updated plugin source to local directory.
---
 .claude-plugin/marketplace.json | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f3b9e9f..d7eaa0b 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -10,9 +10,7 @@
     {
       "name": "autonoma-test-planner",
       "source": {
-        "source": "url",
-        "url": "https://github.com/IgnacioPardo/test-planner-plugin-sc-v2.git",
-        "ref": "main"
+        "source": "./",
       },
       "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
     }

From ef0bda0282e73c4530aba927b204f9b1d6b9a261 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <65306107+IgnacioPardo@users.noreply.github.com>
Date: Thu, 16 Apr 2026 18:34:14 -0300
Subject: [PATCH 15/33] chore: plugin version

---
 .claude-plugin/plugin.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index a51c30a..3be3ef2 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.4.0",
+  "version": "1.2.0",
   "author": {
     "name": "Autonoma"
   }

From 5306caf10121f1001798ab4f357b675c15f3e4bf Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Thu, 16 Apr 2026 19:20:25 -0300
Subject: [PATCH 16/33] feat: adhoc planner plugin

---
 .claude-plugin/marketplace.json               |  10 +
 adhoc/.claude-plugin/plugin.json              |   8 +
 adhoc/agents/focused-test-case-generator.md   | 191 +++++++++++++++
 adhoc/commands/generate-adhoc-tests.md        | 222 ++++++++++++++++++
 adhoc/hooks/hooks.json                        |  15 ++
 adhoc/hooks/validate-pipeline-output.sh       |  69 ++++++
 .../validate_directory_structure.py           |  44 ++++
 adhoc/hooks/validators/validate_test_file.py  |  46 ++++
 adhoc/hooks/validators/validate_test_index.py | 130 ++++++++++
 adhoc/skills/generate-adhoc-tests/SKILL.md    | 221 +++++++++++++++++
 10 files changed, 956 insertions(+)
 create mode 100644 adhoc/.claude-plugin/plugin.json
 create mode 100644 adhoc/agents/focused-test-case-generator.md
 create mode 100644 adhoc/commands/generate-adhoc-tests.md
 create mode 100644 adhoc/hooks/hooks.json
 create mode 100755 adhoc/hooks/validate-pipeline-output.sh
 create mode 100644 adhoc/hooks/validators/validate_directory_structure.py
 create mode 100644 adhoc/hooks/validators/validate_test_file.py
 create mode 100644 adhoc/hooks/validators/validate_test_index.py
 create mode 100644 adhoc/skills/generate-adhoc-tests/SKILL.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index e18269f..afbca07 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -20,6 +20,16 @@
       "name": "autonoma-test-planner-development",
       "source": "./",
       "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
+    },
+    {
+      "name": "autonoma-adhoc-planner",
+      "source": {
+        "source": "git-subdir",
+        "url": "https://github.com/Autonoma-AI/test-planner-plugin.git",
+        "path": "adhoc",
+        "ref": "feat/adhoc-planner-plugin"
+      },
+      "description": "Generates focused E2E tests for a user-defined topic or feature area with a custom system prompt"
     }
   ]
 }
diff --git a/adhoc/.claude-plugin/plugin.json b/adhoc/.claude-plugin/plugin.json
new file mode 100644
index 0000000..1da7c50
--- /dev/null
+++ b/adhoc/.claude-plugin/plugin.json
@@ -0,0 +1,8 @@
+{
+  "name": "autonoma-adhoc-planner",
+  "description": "Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline with deterministic validation at each step",
+  "version": "1.0.0",
+  "author": {
+    "name": "Autonoma"
+  }
+}
diff --git a/adhoc/agents/focused-test-case-generator.md b/adhoc/agents/focused-test-case-generator.md
new file mode 100644
index 0000000..1dea436
--- /dev/null
+++ b/adhoc/agents/focused-test-case-generator.md
@@ -0,0 +1,191 @@
+---
+description: >
+  Generates E2E test cases focused on a specific user-defined domain or feature area.
+  Reads knowledge base, scenarios, and existing tests to produce targeted, non-duplicating
+  test files with YAML frontmatter for deterministic validation.
+tools:
+  - Read
+  - Glob
+  - Grep
+  - Write
+  - Bash
+  - Agent
+  - WebFetch
+maxTurns: 80
+---
+
+# Focused E2E Test Case Generator
+
+You generate E2E test cases scoped to a specific domain or feature area.
+
+**Your primary directive is defined by the orchestrator and passed in the task description as `FOCUS_PROMPT`.** Every test you write must be relevant to that focus. Do not generate tests outside the requested scope.
+
+Your inputs are:
+- `FOCUS_PROMPT` — the user-defined focus (injected by the orchestrator in the task description)
+- `FOCUS_SLUG` — the output folder name (injected by the orchestrator)
+- `autonoma/AUTONOMA.md` (knowledge base with core flows) — if it exists
+- `autonoma/skills/` (skill files for navigation) — if they exist
+- `autonoma/scenarios.md` (test data scenarios) — if it exists
+- `EXISTING_TESTS` — a list of existing test titles/folders passed by the orchestrator (to avoid duplication)
+
+Your output is a directory `autonoma/qa-tests/{FOCUS_SLUG}/` containing:
+1. `INDEX.md` — index with test distribution metadata
+2. Individual test files organized in subdirectories by sub-feature
+
+## Instructions
+
+1. First, fetch the latest test generation instructions:
+
+   Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt`
+   and follow those instructions for how to generate tests — except scope all tests to the `FOCUS_PROMPT`.
+
+2. Read all available input files:
+   - `autonoma/AUTONOMA.md` — parse frontmatter for core_flows and feature_count (if exists)
+   - All files in `autonoma/skills/` (if exists)
+   - `autonoma/scenarios.md` — parse frontmatter for scenarios, entity_types, variable_fields (if exists)
+   - If neither `AUTONOMA.md` nor `scenarios.md` exists, scan the codebase for routes and features relevant to the focus area
+
+3. Review the `EXISTING_TESTS` list provided by the orchestrator. Do not generate tests whose title or
+   purpose substantially duplicates an existing test.
+
+4. **Variable fields** work exactly as in the main planner: if `variable_fields` are declared in
+   `scenarios.md`, use `{{token}}` placeholders for those fields in test steps — never hardcode the
+   literal value. If `scenarios.md` does not exist, write test steps without scenario data (the `scenario` frontmatter field still defaults to `standard`).
+
+5. Focus strictly on the `FOCUS_PROMPT`. If the focus is "signatures and documents", only generate
+   tests that exercise signing flows, document management, signature edge cases, etc. Do not generate
+   unrelated tests just to fill a quota.
+
+6. Count the routes/features/pages in the codebase relevant to the focus area to establish coverage.
+
+7. Write `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` FIRST (before individual test files).
+
+8. Write individual test files into subdirectories under `autonoma/qa-tests/{FOCUS_SLUG}/`.
+
+## CRITICAL: INDEX.md Format
+
+The file `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` MUST start with YAML frontmatter in this exact format:
+
+```yaml
+---
+total_tests: 18
+total_folders: 3
+folders:
+  - name: "sign-document"
+    description: "Signing a document from start to finish"
+    test_count: 8
+    critical: 3
+    high: 3
+    mid: 1
+    low: 1
+  - name: "signature-edge-cases"
+    description: "Edge cases in the signing flow"
+    test_count: 6
+    critical: 1
+    high: 2
+    mid: 2
+    low: 1
+  - name: "document-management"
+    description: "Document upload, deletion, and access control"
+    test_count: 4
+    critical: 0
+    high: 2
+    mid: 1
+    low: 1
+coverage_correlation:
+  routes_or_features: 6
+  expected_test_range_min: 18
+  expected_test_range_max: 30
+---
+```
+
+### INDEX Frontmatter Rules
+
+- **total_tests**: Sum of all tests across all folders. Must be a positive integer.
+- **total_folders**: Number of subdirectories. Must match the length of `folders` list.
+- **folders**: One entry per subdirectory. Each has:
+  - `name`: Folder name (kebab-case, matches the actual subdirectory name)
+  - `description`: What this folder covers within the focus area
+  - `test_count`: Number of test files in this folder
+  - `critical`, `high`, `mid`, `low`: Count of tests at each criticality level. **Must sum to test_count.**
+- **coverage_correlation**: Explains why the test count makes sense for the focus area.
+  - `routes_or_features`: Number of distinct routes/features relevant to the focus
+  - `expected_test_range_min`: Lower bound (routes_or_features * 3)
+  - `expected_test_range_max`: Upper bound (routes_or_features * 5, higher for core-heavy focus areas)
+  - **total_tests must fall within [expected_test_range_min, expected_test_range_max]**
+
+### After the INDEX frontmatter
+
+The body of INDEX.md should contain:
+- A human-readable summary of what the focused test suite covers
+- A table listing every folder with its test count and description
+- A table listing every test file with its title, criticality, scenario, and flow
+
+## CRITICAL: Individual Test File Format
+
+Each test file in `autonoma/qa-tests/{FOCUS_SLUG}/{folder-name}/` MUST start with YAML frontmatter:
+
+```yaml
+---
+title: "Sign a document with valid credentials"
+description: "Verify a user can complete the signing flow for a standard document"
+criticality: critical
+scenario: standard
+flow: "Document Signing"
+---
+```
+
+### Test File Frontmatter Rules
+
+- **title**: Short, descriptive test name (string, non-empty)
+- **description**: One sentence explaining what the test verifies (string, non-empty)
+- **criticality**: Exactly one of: `critical`, `high`, `mid`, `low`
+- **scenario**: Which scenario this test uses — `standard`, `empty`, or `large`. If `scenarios.md`
+  does not exist, use `standard` as the default.
+- **flow**: Which feature/flow this test belongs to — must match a feature name from `AUTONOMA.md`
+  frontmatter if that file exists, otherwise use a descriptive name for the focus sub-feature.
+
+### After the test frontmatter
+
+Follow the standard Autonoma test format from the fetched instructions:
+- **Setup**: Scenario reference and any preconditions
+- **Steps**: Numbered list using only: click, scroll, type, assert
+- **Expected Result**: What should be true when the test passes
+
+## Test Distribution Guidelines
+
+- Focus budget entirely on the `FOCUS_PROMPT` domain — do not pad with unrelated tests
+- Within the focus area, apply the same criticality distribution as the main planner:
+  - Core sub-flows of the focus: mostly `critical` and `high`
+  - Supporting sub-flows: mostly `high` and `mid`
+  - Settings/admin within the focus: mostly `mid` and `low`
+- Never write conditional steps — each test follows one deterministic path
+- Assertions must specify exact text, element, or visual state
+- Use `{{token}}` placeholders for variable fields; never hardcode dynamic values
+- Do not write meta-tests that verify scenario validity or Environment Factory correctness
+- Do not duplicate any test from `EXISTING_TESTS`
+
+## Validation
+
+Hook scripts will automatically validate your output when you write files. If validation fails,
+you'll receive an error message. Fix the issue and rewrite the file.
+
+**INDEX.md validation checks:**
+- Frontmatter contains total_tests, total_folders, folders, coverage_correlation
+- Folder criticality counts sum to test_count per folder
+- Sum of all folder test_counts equals total_tests
+- total_tests falls within [expected_test_range_min, expected_test_range_max]
+
+**Individual test file validation checks:**
+- Frontmatter contains title, description, criticality, scenario, flow
+- criticality is one of: critical, high, mid, low
+- All string fields are non-empty
+
+## Important
+
+- Write INDEX.md FIRST, then individual test files
+- The folder names in INDEX.md must match actual subdirectory names
+- Use subagents to parallelize test generation across folders
+- Each test must be self-contained — no dependencies on other tests
+- Do not write code (no Playwright, no Cypress) — tests are markdown with natural language steps
+- Stay within the focus scope — quality and relevance over quantity
diff --git a/adhoc/commands/generate-adhoc-tests.md b/adhoc/commands/generate-adhoc-tests.md
new file mode 100644
index 0000000..45b8417
--- /dev/null
+++ b/adhoc/commands/generate-adhoc-tests.md
@@ -0,0 +1,222 @@
+---
+name: generate-adhoc-tests
+description: >
+  Generates focused E2E test cases for a user-defined topic or feature area through a validated
+  pipeline. Accepts a focus description (e.g. "signatures and documents", "invoice edge cases")
+  and produces tests scoped to that domain. Reads existing knowledge base and scenarios if
+  available; falls back to a targeted codebase scan if not.
+---
+
+# Autonoma Ad Hoc Test Generation Pipeline
+
+You are orchestrating a focused test generation pipeline. The user has requested tests for a
+specific topic or feature area. You will resolve the focus, load context, spawn an isolated
+subagent to generate tests, and validate the output.
+
+**Every step MUST complete successfully and pass validation before proceeding.**
+Do NOT skip steps. Do NOT proceed if validation fails.
+
+## User Confirmation
+
+By default, after the test generation step, you MUST present the summary and ask the user for
+confirmation using the `AskUserQuestion` tool before uploading.
+
+**Auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE` is set to `true`, skip the confirmation prompt
+and proceed directly to upload after presenting the summary.
+
+## Before Starting
+
+Save the project root (subagents change working directory, so we need an absolute path reference):
+```bash
+AUTONOMA_ROOT="$(pwd)"
+echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
+mkdir -p autonoma/qa-tests
+```
+
+Read the environment variables:
+- `AUTONOMA_API_KEY` — your Autonoma API key
+- `AUTONOMA_PROJECT_ID` — your Autonoma project ID
+- `AUTONOMA_API_URL` — Autonoma API base URL
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip confirmation prompt
+
+Derive a clean human-readable application name:
+```bash
+APP_NAME=$(basename "$(git remote get-url origin 2>/dev/null || pwd)" .git)  # falls back to the directory name when there is no origin remote
+```
+
+Create the generation record so the dashboard can track progress:
+```bash
+RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
+GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
+mkdir -p autonoma
+echo "$GENERATION_ID" > autonoma/.generation-id
+echo "Generation ID: $GENERATION_ID"
+```
+
+If `GENERATION_ID` is empty, log it for debugging and continue — reporting is best-effort.
+
+## Step 1: Resolve Focus Prompt
+
+Read the user's input from the command invocation. The text after the command name is the focus
+description (e.g. `/autonoma-adhoc-planner:generate-adhoc-tests signatures and documents`).
+
+**If a focus description was provided on invocation**, use it directly. Derive `FOCUS_SLUG`:
+```bash
+FOCUS_PROMPT="<the user's description>"
+FOCUS_SLUG=$(printf '%s\n' "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed -e 's/[^a-z0-9]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//')  # portable: no GNU-only \| alternation
+echo "Focus: $FOCUS_PROMPT"
+echo "Slug: $FOCUS_SLUG"
+```
+
+**If no focus description was provided**, check available context and suggest focus areas:
+1. Read `autonoma/AUTONOMA.md` for `core_flows` if it exists
+2. Otherwise list top-level route/feature files in the codebase
+3. Call `AskUserQuestion` with 3–4 suggested focus areas drawn from what you found, plus an "Other"
+   option so the user can describe their own
+4. Wait for the user's response, then derive `FOCUS_SLUG` from their answer
+
+## Step 2: Load Context
+
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":0,"name":"Context"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Loading codebase context for focused test generation..."}}' || true
+```
+
+Prefer existing main-planner outputs; fall back to a targeted codebase scan if they are absent:
+
+```
+if autonoma/AUTONOMA.md exists → read it; extract core_flows, feature_count, app_name
+if autonoma/scenarios.md exists → read it; extract scenario names, entity_types, variable_fields
+if autonoma/skills/ exists → list all .md files in that directory
+if autonoma/qa-tests/ exists → list all existing test files (title + path) to avoid duplication
+if neither AUTONOMA.md nor scenarios.md exists → scan the codebase for routes, pages, and features relevant to FOCUS_PROMPT
+```
+
+Compile an `EXISTING_TESTS` summary: a flat list of "folder/filename: title" for every test that
+already exists under `autonoma/qa-tests/`. This will be passed to the subagent.
+
+Report step complete:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":0,"name":"Context"}}' || true
+```
+
+## Step 3: Generate Focused Tests
+
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":1,"name":"Focused Tests"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "$(python3 -c 'import json,sys; print(json.dumps({"type":"log","data":{"message":"Generating focused E2E tests for: "+sys.argv[1]}}))' "$FOCUS_PROMPT")" || true  # JSON-encode the prompt so quotes/backslashes in it cannot corrupt the payload
+```
+
+Spawn the `focused-test-case-generator` subagent with the following task (substitute actual values
+for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
+
+> **FOCUS_PROMPT**: <the user's focus description>
+> **FOCUS_SLUG**: <kebab-case slug>
+>
+> Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
+> Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
+>
+> Context available (use what exists, skip what doesn't):
+> - Knowledge base: `autonoma/AUTONOMA.md` (core_flows: <list>, feature_count: <n>)
+> - Scenarios: `autonoma/scenarios.md` (scenarios: <list>, variable_fields: <list>)
+> - Skills: `autonoma/skills/` (<n> files)
+>
+> EXISTING_TESTS (do not duplicate these):
+> <flat list of existing test paths and titles>
+>
+> You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
+> total_tests, total_folders, folder breakdown, and coverage_correlation.
+> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
+> Write INDEX.md FIRST, then individual test files.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` exists and is non-empty
+2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
+3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+
+Report step complete and upload test cases:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "$(python3 -c 'import json,sys; print(json.dumps({"type":"log","data":{"message":"Generated %s focused tests for \x27%s\x27. Preparing upload..." % (sys.argv[1], sys.argv[2])}}))' "$TEST_COUNT" "$FOCUS_PROMPT")" || true  # JSON-encode the prompt so quotes/backslashes in it cannot corrupt the payload
+```
+
+**If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+- question: "Do these focused tests look correct for your requested topic?"
+- options: ["Yes, upload to dashboard", "I want to suggest changes"]
+Wait for the user's response before uploading.
+**If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to upload.
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && python3 -c "
+import os, json
+proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
+qa_dir = os.path.join(proj_root, 'autonoma/qa-tests/${FOCUS_SLUG}')
+test_cases = []
+for root, dirs, files in os.walk(qa_dir):
+    for f in files:
+        if f.endswith('.md') and f != 'INDEX.md':
+            path = os.path.join(root, f)
+            folder = os.path.relpath(root, qa_dir)
+            with open(path) as fh:
+                content = fh.read()
+            entry = {'name': f, 'content': content}
+            if folder != '.':
+                entry['folder'] = '${FOCUS_SLUG}/' + folder
+            test_cases.append(entry)
+print(json.dumps({'testCases': test_cases}))
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d @- || true
+
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":1,"name":"Focused Tests"}}' || true
+```
+
+## Completion
+
+After all steps complete, summarize:
+- **Focus**: The topic tested and the focus slug used as the output folder
+- **Tests generated**: Total count, folder breakdown, coverage correlation
+- **Context used**: Whether AUTONOMA.md and scenarios.md were available or a codebase scan was used
+- **Output location**: `autonoma/qa-tests/{FOCUS_SLUG}/`
+- **Avoided duplicates**: How many existing tests were found and respected
diff --git a/adhoc/hooks/hooks.json b/adhoc/hooks/hooks.json
new file mode 100644
index 0000000..d694b5d
--- /dev/null
+++ b/adhoc/hooks/hooks.json
@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Write",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/validate-pipeline-output.sh"
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/adhoc/hooks/validate-pipeline-output.sh b/adhoc/hooks/validate-pipeline-output.sh
new file mode 100755
index 0000000..a33c95f
--- /dev/null
+++ b/adhoc/hooks/validate-pipeline-output.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Validates ad hoc planner output files after Write tool use.
+# Exit 0 = allow (file is valid or not a pipeline file)
+# Exit 2 = block and send error message to Claude
+
+INPUT=$(cat)
+
+FILE_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tool_input',{}).get('file_path',''))" 2>/dev/null)
+
+if [ -z "$FILE_PATH" ]; then
+  exit 0
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+VALIDATORS_DIR="$SCRIPT_DIR/validators"
+
+PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+echo "$PLUGIN_ROOT" > /tmp/autonoma-plugin-root
+
+python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null
+
+case "$FILE_PATH" in
+  */autonoma/qa-tests/*/INDEX.md)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
+    VALIDATOR_NAME="validate-test-index"
+    ;;
+  */autonoma/qa-tests/*/*.md)  # any other .md under a focus folder; INDEX.md is already taken by the arm above (first match wins)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py"
+    VALIDATOR_NAME="validate-test-file"
+    ;;
+  *)
+    exit 0
+    ;;
+esac
+
+if [ ! -f "$FILE_PATH" ]; then
+  echo "VALIDATION FAILED [$VALIDATOR_NAME]: File does not exist: $FILE_PATH" >&2
+  exit 2
+fi
+
+if [ ! -s "$FILE_PATH" ]; then
+  echo "VALIDATION FAILED [$VALIDATOR_NAME]: File is empty: $FILE_PATH" >&2
+  exit 2
+fi
+
+if [ ! -f "$VALIDATOR_SCRIPT" ]; then
+  echo "VALIDATION FAILED [$VALIDATOR_NAME]: Validator script not found: $VALIDATOR_SCRIPT" >&2
+  exit 2
+fi
+
+RESULT=$(python3 "$VALIDATOR_SCRIPT" "$FILE_PATH" 2>&1)
+EXIT_CODE=$?
+
+if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then
+  echo "VALIDATION FAILED [$VALIDATOR_NAME]: $RESULT" >&2
+  exit 2
+fi
+
+if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then  # NOTE(review): runs on every INDEX.md write, including the first one made before any test file exists — confirm the resulting failure is intended
+  DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py"
+  DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1)
+  DIR_EXIT=$?
+  if [ $DIR_EXIT -ne 0 ] || [ "$DIR_RESULT" != "OK" ]; then
+    echo "VALIDATION FAILED [validate-directory-structure]: $DIR_RESULT" >&2
+    exit 2
+  fi
+fi
+
+exit 0
diff --git a/adhoc/hooks/validators/validate_directory_structure.py b/adhoc/hooks/validators/validate_directory_structure.py
new file mode 100644
index 0000000..97d387f
--- /dev/null
+++ b/adhoc/hooks/validators/validate_directory_structure.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+"""Validates that the ad hoc focus folder is properly populated.
+
+For the ad hoc planner the index lives at autonoma/qa-tests/{focus-slug}/INDEX.md.
+We check that the focus folder contains at least one test file besides INDEX.md,
+and that every subfolder declared in the index also has at least one .md file.
+"""
+import os
+import sys
+import glob as globmod
+import yaml
+
+filepath = sys.argv[1]  # autonoma/qa-tests/{focus-slug}/INDEX.md
+focus_dir = os.path.dirname(filepath)  # autonoma/qa-tests/{focus-slug}/
+
+# Parse the INDEX frontmatter to get declared folder names
+content = open(filepath).read()
+parts = content.split('---', 2)
+try:
+    fm = yaml.safe_load(parts[1]) if len(parts) >= 3 else {}
+except Exception:
+    fm = {}
+
+declared_folders = [f.get('name') for f in fm.get('folders', []) if isinstance(f, dict) and f.get('name')]
+
+# Focus folder must contain at least one test file (not INDEX.md)
+test_files = [f for f in globmod.glob(os.path.join(focus_dir, '**', '*.md'), recursive=True)
+              if os.path.basename(f) != 'INDEX.md']
+if not test_files:
+    print(f'Focus folder has no test files: {focus_dir}')
+    sys.exit(1)
+
+# Every declared subfolder must exist and contain at least one .md file
+for name in declared_folders:
+    subdir = os.path.join(focus_dir, name)
+    if not os.path.isdir(subdir):
+        print(f'Declared folder "{name}" does not exist: {subdir}')
+        sys.exit(1)
+    md_files = globmod.glob(os.path.join(subdir, '*.md'))
+    if not md_files:
+        print(f'Declared folder "{name}" has no .md files: {subdir}')
+        sys.exit(1)
+
+print('OK')
diff --git a/adhoc/hooks/validators/validate_test_file.py b/adhoc/hooks/validators/validate_test_file.py
new file mode 100644
index 0000000..bea0726
--- /dev/null
+++ b/adhoc/hooks/validators/validate_test_file.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Validates individual test file frontmatter format."""
+import sys
+import yaml
+
+filepath = sys.argv[1]
+content = open(filepath).read()
+
+if not content.startswith('---'):
+    print('File must start with YAML frontmatter (---)')
+    sys.exit(1)
+
+parts = content.split('---', 2)
+if len(parts) < 3:
+    print('Missing closing --- for frontmatter')
+    sys.exit(1)
+
+try:
+    fm = yaml.safe_load(parts[1])
+except Exception as e:
+    print(f'Invalid YAML in frontmatter: {e}')
+    sys.exit(1)
+
+if not isinstance(fm, dict):
+    print('Frontmatter must be a YAML mapping')
+    sys.exit(1)
+
+required = ['title', 'description', 'criticality', 'scenario', 'flow']
+missing = [f for f in required if f not in fm]
+if missing:
+    print(f'Missing required frontmatter fields: {missing}')
+    sys.exit(1)
+
+valid_criticality = {'critical', 'high', 'mid', 'low'}
+crit = fm.get('criticality')
+if crit not in valid_criticality:
+    print(f'criticality must be one of {valid_criticality}, got: {crit}')
+    sys.exit(1)
+
+for field in ['title', 'description', 'scenario', 'flow']:
+    val = fm.get(field)
+    if not isinstance(val, str) or len(val.strip()) == 0:
+        print(f'{field} must be a non-empty string')
+        sys.exit(1)
+
+print('OK')
diff --git a/adhoc/hooks/validators/validate_test_index.py b/adhoc/hooks/validators/validate_test_index.py
new file mode 100644
index 0000000..2c9ddd3
--- /dev/null
+++ b/adhoc/hooks/validators/validate_test_index.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Validates qa-tests/{focus-slug}/INDEX.md frontmatter format.
+
+For the ad hoc planner the index lives one level deeper than the main planner
+(autonoma/qa-tests/{focus-slug}/INDEX.md), so path calculations are adjusted
+accordingly. features.json cross-check is optional: skipped if the file does
+not exist (ad hoc runs do not require Step 1 to have completed first).
+"""
+import sys
+import os
+import json as jsonlib
+import yaml
+
+filepath = sys.argv[1]
+content = open(filepath).read()
+
+if not content.startswith('---'):
+    print('File must start with YAML frontmatter (---)')
+    sys.exit(1)
+
+parts = content.split('---', 2)
+if len(parts) < 3:
+    print('Missing closing --- for frontmatter')
+    sys.exit(1)
+
+try:
+    fm = yaml.safe_load(parts[1])
+except Exception as e:
+    print(f'Invalid YAML in frontmatter: {e}')
+    sys.exit(1)
+
+if not isinstance(fm, dict):
+    print('Frontmatter must be a YAML mapping')
+    sys.exit(1)
+
+required = ['total_tests', 'total_folders', 'folders', 'coverage_correlation']
+missing = [f for f in required if f not in fm]
+if missing:
+    print(f'Missing required frontmatter fields: {missing}')
+    sys.exit(1)
+
+tt = fm.get('total_tests')
+if not isinstance(tt, int) or tt < 1:
+    print('total_tests must be a positive integer')
+    sys.exit(1)
+
+tf = fm.get('total_folders')
+if not isinstance(tf, int) or tf < 1:
+    print('total_folders must be a positive integer')
+    sys.exit(1)
+
+folders = fm.get('folders')
+if not isinstance(folders, list) or len(folders) != tf:
+    print(f'folders list length ({len(folders) if isinstance(folders, list) else "N/A"}) must match total_folders ({tf})')
+    sys.exit(1)
+
+computed_total = 0
+for i, f in enumerate(folders):
+    if not isinstance(f, dict):
+        print(f'folders[{i}] must be a mapping')
+        sys.exit(1)
+    for field in ['name', 'description', 'test_count', 'critical', 'high', 'mid', 'low']:
+        if field not in f:
+            print(f'folders[{i}] missing required field: {field}')
+            sys.exit(1)
+    tc = f.get('test_count')
+    if not isinstance(tc, int) or tc < 1:
+        print(f'folders[{i}].test_count must be a positive integer')
+        sys.exit(1)
+    crit_sum = 0
+    for level in ['critical', 'high', 'mid', 'low']:
+        val = f.get(level)
+        if not isinstance(val, int) or val < 0:
+            print(f'folders[{i}].{level} must be a non-negative integer')
+            sys.exit(1)
+        crit_sum += val
+    if crit_sum != tc:
+        print(f'folders[{i}]: criticality counts ({crit_sum}) do not sum to test_count ({tc})')
+        sys.exit(1)
+    computed_total += tc
+
+if computed_total != tt:
+    print(f'Sum of folder test_counts ({computed_total}) does not match total_tests ({tt})')
+    sys.exit(1)
+
+cc = fm.get('coverage_correlation')
+if not isinstance(cc, dict):
+    print('coverage_correlation must be a mapping')
+    sys.exit(1)
+for field in ['routes_or_features', 'expected_test_range_min', 'expected_test_range_max']:
+    if field not in cc:
+        print(f'coverage_correlation missing required field: {field}')
+        sys.exit(1)
+
+rf = cc.get('routes_or_features')
+if not isinstance(rf, int) or rf < 1:
+    print('coverage_correlation.routes_or_features must be a positive integer')
+    sys.exit(1)
+
+tmin = cc.get('expected_test_range_min')
+tmax = cc.get('expected_test_range_max')
+if not isinstance(tmin, int) or not isinstance(tmax, int):
+    print('expected_test_range_min and expected_test_range_max must be integers')
+    sys.exit(1)
+if tmin > tmax:
+    print('expected_test_range_min must be <= expected_test_range_max')
+    sys.exit(1)
+if tt < tmin:
+    print(f'total_tests ({tt}) is below minimum ({tmin}) for {rf} routes/features. Too few tests — add more coverage.')
+    sys.exit(1)
+
+# Optional cross-check against features.json.
+# Path: autonoma/qa-tests/{focus-slug}/INDEX.md → up three levels → autonoma/features.json
+focus_dir = os.path.dirname(filepath)       # autonoma/qa-tests/{focus-slug}/
+qa_tests_dir = os.path.dirname(focus_dir)   # autonoma/qa-tests/
+autonoma_dir = os.path.dirname(qa_tests_dir)  # autonoma/
+features_path = os.path.join(autonoma_dir, 'features.json')
+
+if os.path.isfile(features_path):
+    try:
+        features_data = jsonlib.load(open(features_path))
+        feature_count = features_data.get('total_features', 0)
+        if feature_count > 0 and tt < feature_count * 2:
+            print(f'total_tests ({tt}) is too low for {feature_count} features in features.json. '
+                  f'Expected at least {feature_count * 2} tests (2 per feature).')
+            sys.exit(1)
+    except Exception:
+        pass  # malformed features.json — skip cross-check
+
+print('OK')
diff --git a/adhoc/skills/generate-adhoc-tests/SKILL.md b/adhoc/skills/generate-adhoc-tests/SKILL.md
new file mode 100644
index 0000000..321dc20
--- /dev/null
+++ b/adhoc/skills/generate-adhoc-tests/SKILL.md
@@ -0,0 +1,221 @@
+---
+name: generate-adhoc-tests
+description: >
+  Generates focused E2E test cases for a user-defined topic or feature area through a validated
+  pipeline. Accepts a focus description (e.g. "signatures and documents", "invoice edge cases")
+  and produces tests scoped to that domain. Reads existing knowledge base and scenarios if
+  available; falls back to a targeted codebase scan if not.
+---
+
+# Autonoma Ad Hoc Test Generation Pipeline
+
+You are orchestrating a focused test generation pipeline. The user has requested tests for a
+specific topic or feature area. You will resolve the focus, load context, spawn an isolated
+subagent to generate tests, and validate the output.
+
+**Every step MUST complete successfully and pass validation before proceeding.**
+Do NOT skip steps. Do NOT proceed if validation fails.
+
+## User Confirmation
+
+By default, after the test generation step, you MUST present the summary and ask the user for
+confirmation using the `AskUserQuestion` tool before uploading.
+
+**Auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE` is set to `true`, skip the confirmation prompt
+and proceed directly to upload after presenting the summary.
+
+## Before Starting
+
+Save the project root (subagents change working directory, so we need an absolute path reference):
+```bash
+AUTONOMA_ROOT="$(pwd)"
+echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
+mkdir -p autonoma/qa-tests
+```
+
+Read the environment variables:
+- `AUTONOMA_API_KEY` — your Autonoma API key
+- `AUTONOMA_PROJECT_ID` — your Autonoma project ID
+- `AUTONOMA_API_URL` — Autonoma API base URL
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip confirmation prompt
+
+Derive a clean human-readable application name:
+```bash
+APP_NAME=$(basename "$(git remote get-url origin 2>/dev/null || pwd)" .git)  # falls back to the directory name when there is no origin remote
+```
+
+Create the generation record so the dashboard can track progress:
+```bash
+RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
+GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
+mkdir -p autonoma
+echo "$GENERATION_ID" > autonoma/.generation-id
+echo "Generation ID: $GENERATION_ID"
+```
+
+If `GENERATION_ID` is empty, log it for debugging and continue — reporting is best-effort.
+
+## Step 1: Resolve Focus Prompt
+
+Read the user's input. The text after the skill name is the focus description.
+
+**If a focus description was provided**, use it directly. Derive `FOCUS_SLUG`:
+```bash
+FOCUS_PROMPT="<the user's description>"
+FOCUS_SLUG=$(printf '%s\n' "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed -e 's/[^a-z0-9]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//')  # portable: no GNU-only \| alternation
+echo "Focus: $FOCUS_PROMPT"
+echo "Slug: $FOCUS_SLUG"
+```
+
+**If no focus description was provided**, check available context and suggest focus areas:
+1. Read `autonoma/AUTONOMA.md` for `core_flows` if it exists
+2. Otherwise list top-level route/feature files in the codebase
+3. Call `AskUserQuestion` with 3–4 suggested focus areas drawn from what you found, plus an "Other"
+   option so the user can describe their own
+4. Wait for the user's response, then derive `FOCUS_SLUG` from their answer
+
+## Step 2: Load Context
+
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":0,"name":"Context"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Loading codebase context for focused test generation..."}}' || true
+```
+
+Prefer existing main-planner outputs; fall back to a targeted codebase scan if they are absent:
+
+```
+if autonoma/AUTONOMA.md exists → read it; extract core_flows, feature_count, app_name
+if autonoma/scenarios.md exists → read it; extract scenario names, entity_types, variable_fields
+if autonoma/skills/ exists → list all .md files in that directory
+if autonoma/qa-tests/ exists → list all existing test files (title + path) to avoid duplication
+if neither AUTONOMA.md nor scenarios.md exists → scan the codebase for routes, pages, and features relevant to FOCUS_PROMPT
+```
+
+Compile an `EXISTING_TESTS` summary: a flat list of "folder/filename: title" for every test that
+already exists under `autonoma/qa-tests/`. This will be passed to the subagent.
+
+Report step complete:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":0,"name":"Context"}}' || true
+```
+
+## Step 3: Generate Focused Tests
+
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":1,"name":"Focused Tests"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "$(python3 -c 'import json,sys; print(json.dumps({"type":"log","data":{"message":"Generating focused E2E tests for: "+sys.argv[1]}}))' "$FOCUS_PROMPT")" || true  # JSON-encode the prompt so quotes/backslashes in it cannot corrupt the payload
+```
+
+Spawn the `focused-test-case-generator` subagent with the following task (substitute actual values
+for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
+
+> **FOCUS_PROMPT**: <the user's focus description>
+> **FOCUS_SLUG**: <kebab-case slug>
+>
+> Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
+> Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
+>
+> Context available (use what exists, skip what doesn't):
+> - Knowledge base: `autonoma/AUTONOMA.md` (core_flows: <list>, feature_count: <n>)
+> - Scenarios: `autonoma/scenarios.md` (scenarios: <list>, variable_fields: <list>)
+> - Skills: `autonoma/skills/` (<n> files)
+>
+> EXISTING_TESTS (do not duplicate these):
+> <flat list of existing test paths and titles>
+>
+> You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
+> total_tests, total_folders, folder breakdown, and coverage_correlation.
+> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
+> Write INDEX.md FIRST, then individual test files.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` exists and is non-empty
+2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
+3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+
+Report step complete and upload test cases:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "$(python3 -c 'import json,sys; print(json.dumps({"type":"log","data":{"message":"Generated %s focused tests for \x27%s\x27. Preparing upload..." % (sys.argv[1], sys.argv[2])}}))' "$TEST_COUNT" "$FOCUS_PROMPT")" || true  # JSON-encode the prompt so quotes/backslashes in it cannot corrupt the payload
+```
+
+**If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+- question: "Do these focused tests look correct for your requested topic?"
+- options: ["Yes, upload to dashboard", "I want to suggest changes"]
+Wait for the user's response before uploading.
+**If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to upload.
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+[ -n "$GENERATION_ID" ] && python3 -c "
+import os, json
+proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
+qa_dir = os.path.join(proj_root, 'autonoma/qa-tests/${FOCUS_SLUG}')
+test_cases = []
+for root, dirs, files in os.walk(qa_dir):
+    for f in files:
+        if f.endswith('.md') and f != 'INDEX.md':
+            path = os.path.join(root, f)
+            folder = os.path.relpath(root, qa_dir)
+            with open(path) as fh:
+                content = fh.read()
+            entry = {'name': f, 'content': content}
+            if folder != '.':
+                entry['folder'] = '${FOCUS_SLUG}/' + folder
+            test_cases.append(entry)
+print(json.dumps({'testCases': test_cases}))
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d @- || true
+
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":1,"name":"Focused Tests"}}' || true
+```
+
+## Completion
+
+After all steps complete, summarize:
+- **Focus**: The topic tested and the focus slug used as the output folder
+- **Tests generated**: Total count, folder breakdown, coverage correlation
+- **Context used**: Whether AUTONOMA.md and scenarios.md were available or a codebase scan was used
+- **Output location**: `autonoma/qa-tests/{FOCUS_SLUG}/`
+- **Avoided duplicates**: How many existing tests were found and respected

From 13f5c5fd810c1ea73a03eed7b82c0002534c4bb0 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Thu, 16 Apr 2026 19:29:49 -0300
Subject: [PATCH 17/33] fix: update ref in
 test-planner-plugin/.claude-plugin/marketplace.json

---
 .claude-plugin/marketplace.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index afbca07..329bfc6 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -27,7 +27,7 @@
         "source": "git-subdir",
         "url": "https://github.com/Autonoma-AI/test-planner-plugin.git",
         "path": "adhoc",
-        "ref": "feat/adhoc-planner-plugin"
+        "ref": "chiara-ciriani/adhoc-planner-plugin"
       },
       "description": "Generates focused E2E tests for a user-defined topic or feature area with a custom system prompt"
     }

From dc3c03c4d622e29526bd984cca2ee23ab353e319 Mon Sep 17 00:00:00 2001
From: axel <axescalada@gmail.com>
Date: Thu, 16 Apr 2026 21:33:29 -0300
Subject: [PATCH 18/33] feat: pass adhoc source

---
 .claude-plugin/marketplace.json | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 329bfc6..28b78b2 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -23,12 +23,7 @@
     },
     {
       "name": "autonoma-adhoc-planner",
-      "source": {
-        "source": "git-subdir",
-        "url": "https://github.com/Autonoma-AI/test-planner-plugin.git",
-        "path": "adhoc",
-        "ref": "chiara-ciriani/adhoc-planner-plugin"
-      },
+      "source": "./adhoc",
       "description": "Generates focused E2E tests for a user-defined topic or feature area with a custom system prompt"
     }
   ]

From 3f0589cd411a564266c51e860e01a73e7a97a8b3 Mon Sep 17 00:00:00 2001
From: axel escalada <87334103+axlEscalada@users.noreply.github.com>
Date: Thu, 16 Apr 2026 21:44:26 -0300
Subject: [PATCH 19/33] fix: replace git-subdir with relative path for adhoc
 planner plugin


From 6f70961f24aef20f6a8d1e74c2ed1f3ba13c3ad2 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Fri, 17 Apr 2026 12:50:41 -0300
Subject: [PATCH 20/33] feat: update README.md

---
 README.md                                  |  87 ++++-
 adhoc/commands/generate-adhoc-tests.md     | 410 ++++++++++++++++-----
 adhoc/skills/generate-adhoc-tests/SKILL.md | 409 +++++++++++++++-----
 3 files changed, 729 insertions(+), 177 deletions(-)

diff --git a/README.md b/README.md
index 176ea66..1020424 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,72 @@ Implements or completes the backend Environment Factory so the planned scenarios
 
 **You review**: where the Environment Factory lives, what changed, whether a smoke `discover` → `up` → `down` check passed, and whether `standard`, `empty`, and `large` all passed lifecycle validation.
 
+---
+
+## Autonoma Ad Hoc Planner
+
+This repository also ships a second plugin that runs the same 4-step pipeline but scopes Step 3 to a user-defined focus area. Use it when you want targeted test coverage for a specific feature without regenerating your full test suite.
+
+### Install
+
+**Step 1:** The marketplace is the same as above. If you've already added it, skip this:
+
+```
+/plugin marketplace add Autonoma-AI/test-planner-plugin
+```
+
+**Step 2:** Install the ad hoc plugin:
+
+```
+/plugin install autonoma-adhoc-planner@autonoma
+```
+
+### Usage
+
+Run the command from inside any project with Claude Code.
+
+Pass your focus description directly after the command:
+
+```
+/autonoma-adhoc-planner:generate-adhoc-tests <your focus description>
+```
+
+Or invoke without arguments and the plugin will suggest focus areas based on your codebase:
+
+```
+/autonoma-adhoc-planner:generate-adhoc-tests
+```
+
+The plugin walks you through 4 steps, asking for confirmation at each checkpoint before proceeding.
+
+## How it works
+
+### How it differs from the main planner
+
+Steps 1, 2, and 4 run identically to the main planner. Step 3 is scoped:
+
+| Step | Main planner | Ad hoc planner |
+|------|-------------|----------------|
+| 1 — Knowledge Base | Full codebase | Full codebase |
+| 2 — Scenarios | Full data model | Full data model |
+| 3 — E2E Tests | All features | **Focus area only** |
+| 4 — Environment Factory | All scenarios | All scenarios |
+
+Tests are written to `autonoma/qa-tests/{focus-slug}/` so they sit alongside your existing test suite without overwriting it. Running the ad hoc planner twice with different focus areas produces two separate subfolders.
+
+### Running multiple focus areas
+
+You can run the ad hoc planner multiple times for different topics, including simultaneously. Each run writes to its own subfolder and tracks its own generation ID file.
+
+```
+autonoma/qa-tests/
+├── canvas-interactions/      ← autonoma/.generation-id-canvas-interactions
+└── signatures-and-documents/ ← autonoma/.generation-id-signatures-and-documents
+```
+
+
+---
+
 ## Scenario Recipes
 
 `autonoma/scenario-recipes.json` is the validated handoff between planning and execution. It is produced in Step 4 after the Environment Factory has been implemented or verified and after each scenario has passed lifecycle validation.
@@ -174,9 +240,10 @@ claude plugin validate ./
 ```
 autonoma-test-planner/
 ├── .claude-plugin/
-│   ├── plugin.json                     # Plugin manifest
-│   └── marketplace.json                # Marketplace catalog
+│   ├── plugin.json                     # Plugin manifest (autonoma-test-planner)
+│   └── marketplace.json                # Marketplace catalog (lists both plugins)
 ├── skills/generate-tests/SKILL.md      # /generate-tests orchestrator
+├── commands/generate-tests.md          # /generate-tests command
 ├── agents/
 │   ├── kb-generator.md                 # Step 1 subagent
 │   ├── scenario-generator.md           # Step 2 subagent
@@ -193,6 +260,22 @@ autonoma-test-planner/
 │       ├── validate_scenarios.py
 │       ├── validate_test_index.py
 │       └── validate_test_file.py
+├── adhoc/                              # autonoma-adhoc-planner plugin root
+│   ├── .claude-plugin/
+│   │   └── plugin.json                 # Plugin manifest (autonoma-adhoc-planner)
+│   ├── skills/generate-adhoc-tests/
+│   │   └── SKILL.md                    # /generate-adhoc-tests orchestrator
+│   ├── commands/
+│   │   └── generate-adhoc-tests.md     # /generate-adhoc-tests command
+│   ├── agents/
+│   │   └── focused-test-case-generator.md  # Step 3 focused subagent
+│   └── hooks/
+│       ├── hooks.json                  # PostToolUse hook config
+│       ├── validate-pipeline-output.sh # Validation dispatcher
+│       └── validators/
+│           ├── validate_test_file.py
+│           ├── validate_test_index.py
+│           └── validate_directory_structure.py
 ├── LICENSE
 └── README.md
 ```
diff --git a/adhoc/commands/generate-adhoc-tests.md b/adhoc/commands/generate-adhoc-tests.md
index 45b8417..3a570d9 100644
--- a/adhoc/commands/generate-adhoc-tests.md
+++ b/adhoc/commands/generate-adhoc-tests.md
@@ -1,50 +1,67 @@
 ---
 name: generate-adhoc-tests
 description: >
-  Generates focused E2E test cases for a user-defined topic or feature area through a validated
-  pipeline. Accepts a focus description (e.g. "signatures and documents", "invoice edge cases")
-  and produces tests scoped to that domain. Reads existing knowledge base and scenarios if
-  available; falls back to a targeted codebase scan if not.
+  Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline.
+  Each step runs in an isolated subagent and must pass deterministic validation before the next
+  step begins. Steps 1, 2, and 4 run as normal; Step 3 scopes test generation to the requested
+  topic. Use when you want targeted test coverage for a specific feature area.
 ---
 
-# Autonoma Ad Hoc Test Generation Pipeline
+# Autonoma Ad Hoc E2E Test Generation Pipeline
 
-You are orchestrating a focused test generation pipeline. The user has requested tests for a
-specific topic or feature area. You will resolve the focus, load context, spawn an isolated
-subagent to generate tests, and validate the output.
-
-**Every step MUST complete successfully and pass validation before proceeding.**
+You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+**Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## User Confirmation
+## User Confirmation Between Steps
+
+By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
+confirmation using the `AskUserQuestion` tool. This creates an interactive
+UI prompt that makes it clear the user needs to respond before the pipeline continues.
 
-By default, after the test generation step, you MUST present the summary and ask the user for
-confirmation using the `AskUserQuestion` tool before uploading.
+After calling `AskUserQuestion`, wait for the user's response.
+Only proceed to the next step after they confirm.
 
-**Auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE` is set to `true`, skip the confirmation prompt
-and proceed directly to upload after presenting the summary.
+**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
+skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
+the summary. The summaries are still displayed — only the confirmation prompt is skipped.
 
 ## Before Starting
 
-Save the project root (subagents change working directory, so we need an absolute path reference):
+Resolve the focus prompt from the user's input (the text after the command name):
+
+```bash
+FOCUS_PROMPT="<the user's focus description>"
+FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-//; s/-$//')
+echo "Focus: $FOCUS_PROMPT"
+echo "Slug:  $FOCUS_SLUG"
+```
+
+If no focus description was provided, list top-level route/feature directories in the codebase,
+call `AskUserQuestion` with 3–4 suggested focus areas plus an "Other" option, wait for the user's
+response, then derive `FOCUS_SLUG` from their answer.
+
+Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
 ```bash
 AUTONOMA_ROOT="$(pwd)"
 echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma/qa-tests
+mkdir -p autonoma/skills autonoma/qa-tests
+```
+
+The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+```bash
+PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
 ```
 
-Read the environment variables:
+Read the environment variables. These are required for reporting progress back to Autonoma:
 - `AUTONOMA_API_KEY` — your Autonoma API key
 - `AUTONOMA_PROJECT_ID` — your Autonoma project ID
 - `AUTONOMA_API_URL` — Autonoma API base URL
-- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip confirmation prompt
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
 
-Derive a clean human-readable application name:
-```bash
-APP_NAME=$(git remote get-url origin 2>/dev/null | sed 's/.*\///' | sed 's/\.git//' || basename "$(pwd)")
-```
+Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
 
-Create the generation record so the dashboard can track progress:
+Create the generation record so the dashboard can track progress in real time:
 ```bash
 RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -55,108 +72,204 @@ BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
 mkdir -p autonoma
-echo "$GENERATION_ID" > autonoma/.generation-id
+echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}"
 echo "Generation ID: $GENERATION_ID"
 ```
 
-If `GENERATION_ID` is empty, log it for debugging and continue — reporting is best-effort.
+If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
 
-## Step 1: Resolve Focus Prompt
+## Step 1: Generate Knowledge Base
 
-Read the user's input from the command invocation. The text after the command name is the focus
-description (e.g. `/autonoma-adhoc-planner:generate-adhoc-tests signatures and documents`).
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true
+```
 
-**If a focus description was provided on invocation**, use it directly. Derive `FOCUS_SLUG`:
+Spawn the `kb-generator` subagent with the following task:
+
+> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md`
+> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with
+> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count.
+> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
+> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty
+2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically
+3. Read the file and present the frontmatter to the user — specifically the core_flows table
+
+Report step complete and upload skills:
 ```bash
-FOCUS_PROMPT="<the user's description>"
-FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g')
-echo "Focus: $FOCUS_PROMPT"
-echo "Slug: $FOCUS_SLUG"
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true
+
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true
+
+[ -n "$GENERATION_ID" ] && python3 -c "
+import os, json, sys
+root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
+skills = []
+d = os.path.join(root, 'autonoma/skills')
+if os.path.isdir(d):
+    for f in os.listdir(d):
+        if f.endswith('.md'):
+            with open(os.path.join(d, f)) as fh:
+                skills.append({'name': f, 'content': fh.read()})
+print(json.dumps({'skills': skills}))
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d @- || true
 ```
 
-**If no focus description was provided**, check available context and suggest focus areas:
-1. Read `autonoma/AUTONOMA.md` for `core_flows` if it exists
-2. Otherwise list top-level route/feature files in the codebase
-3. Call `AskUserQuestion` with 3–4 suggested focus areas drawn from what you found, plus an "Other"
-   option so the user can describe their own
-4. Wait for the user's response, then derive `FOCUS_SLUG` from their answer
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
+   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
 
-## Step 2: Load Context
+## Step 2: Generate Scenarios
 
 Report step start:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":0,"name":"Context"}}' || true
+  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Loading codebase context for focused test generation..."}}' || true
+  -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true
 ```
 
-Prefer existing main-planner outputs; fall back to a targeted codebase scan if they are absent:
+Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
 
+If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
+Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
+State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+
+Fetch and validate the artifact:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+mkdir -p "$AUTONOMA_ROOT/autonoma"
+BODY='{"action":"discover"}'
+SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
+  -H "Content-Type: application/json" \
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
+  exit 1
+fi
+printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json"
 ```
-if autonoma/AUTONOMA.md exists → read it; extract core_flows, feature_count, app_name
-if autonoma/scenarios.md exists → read it; extract scenario names, entity_types, variable_fields
-if autonoma/skills/ exists → list all .md files in that directory
-if autonoma/qa-tests/ exists → list all existing test files (title + path) to avoid duplication
-else → scan the codebase for routes, pages, and features relevant to FOCUS_PROMPT
-```
 
-Compile an `EXISTING_TESTS` summary: a flat list of "folder/filename: title" for every test that
-already exists under `autonoma/qa-tests/`. This will be passed to the subagent.
+If the fetch fails or validation fails, stop the pipeline at Step 2.
+Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
+
+Spawn the `scenario-generator` subagent with the following task:
+
+> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
+> artifact from `autonoma/discover.json`.
+> Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
+> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
+> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
+> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
+> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
+> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
+2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
+3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically
+4. Read the file and present the summary to the user — scenario names, entity counts, entity types,
+   discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":0,"name":"Context"}}' || true
+  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
 ```
 
-## Step 3: Generate Focused Tests
+5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
+   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
+
+## Step 3: Generate Focused E2E Test Cases
 
 Report step start:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":1,"name":"Focused Tests"}}' || true
+  -d '{"type":"step.started","data":{"step":2,"name":"Focused E2E Tests"}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generating focused E2E tests for: ${FOCUS_PROMPT}\"}}" || true
+  -d '{"type":"log","data":{"message":"Generating focused E2E test cases from knowledge base and scenarios..."}}' || true
 ```
 
-Spawn the `focused-test-case-generator` subagent with the following task (substitute actual values
-for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
+Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual
+values for FOCUS_PROMPT and FOCUS_SLUG before spawning):
 
 > **FOCUS_PROMPT**: <the user's focus description>
 > **FOCUS_SLUG**: <kebab-case slug>
 >
+> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
+> and scenarios from `autonoma/scenarios.md`.
 > Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
 > Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
->
-> Context available (use what exists, skip what doesn't):
-> - Knowledge base: `autonoma/AUTONOMA.md` (core_flows: <list>, feature_count: <n>)
-> - Scenarios: `autonoma/scenarios.md` (scenarios: <list>, variable_fields: <list>)
-> - Skills: `autonoma/skills/` (<n> files)
->
-> EXISTING_TESTS (do not duplicate these):
-> <flat list of existing test paths and titles>
->
 > You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
 > total_tests, total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Write INDEX.md FIRST, then individual test files.
+> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
+> scenario data when it is needed to test a real user-facing app behavior within the focus area.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
@@ -167,23 +280,19 @@ for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
 Report step complete and upload test cases:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused tests for '${FOCUS_PROMPT}'. Preparing upload...\"}}" || true
-```
+  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused test cases. Uploading to dashboard...\"}}" || true
 
-**If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-- question: "Do these focused tests look correct for your requested topic?"
-- options: ["Yes, upload to dashboard", "I want to suggest changes"]
-Wait for the user's response before uploading.
-**If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to upload.
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":2,"name":"Focused E2E Tests"}}' || true
 
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
 proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
@@ -205,18 +314,143 @@ print(json.dumps({'testCases': test_cases}))
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
   -d @- || true
+```
+
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly."
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+
+## Step 4: Environment Factory
 
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true
+```
+
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
+preflight validation. State plainly that both environment variables are mandatory.
+
+Spawn the `env-factory-generator` subagent with the following task:
+
+> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
+> Implement or complete the Autonoma Environment Factory in the project's backend so it can
+> support the planned scenarios with the current SDK contract, then validate the planned scenarios
+> against that implementation.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
+> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
+> Preserve the existing discover integration if it already works, and finish `up` / `down`
+> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
+> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
+> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
+> The recipe file must match the current setup API schema:
+> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
+> `name`, `description`, `create`, and `validation` with `status: "validated"`,
+> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
+> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
+> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
+> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
+> Persisted `create` must remain tokenized — never store resolved concrete values.
+> After writing the recipe file, run the preflight helper to validate all recipes against the
+> live SDK endpoint before uploading:
+> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
+> The preflight must pass for all three scenarios before Step 4 is considered complete.
+
+**After the subagent completes:**
+1. Verify the backend implementation or integration changes were made
+2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+3. Run the preflight helper if the subagent did not already do so:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+```
+If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
+4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
+5. Report which environment variables the backend now requires
+6. Report any backend issues that still need manual attention
+
+Report step complete:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+if [ -n "$GENERATION_ID" ]; then
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
+    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
+    exit 1
+  fi
+  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @"$RECIPE_PATH")
+  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
+    exit 1
+  fi
+
+  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
+  if [ "$VERIFY_STATUS" != "200" ]; then
+    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
+    exit 1
+  fi
+  EXPECTED_NAMES=$(python3 -c "import json; data=json.load(open('$RECIPE_PATH')); print('\n'.join(r['name'] for r in data['recipes']))")
+  MISSING=""
+  for NAME in $EXPECTED_NAMES; do
+    HAS_ACTIVE=$(echo "$VERIFY_BODY" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+match = [s for s in data.get('scenarios', []) if s['name'] == '$NAME' and s.get('hasActiveRecipe')]
+print('yes' if match else 'no')
+" 2>/dev/null || echo "no")
+    if [ "$HAS_ACTIVE" != "yes" ]; then
+      MISSING="$MISSING $NAME"
+    fi
+  done
+  if [ -n "$MISSING" ]; then
+    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
+    echo "Step 4 cannot complete. Recipe upload may have partially failed."
+    exit 1
+  fi
+  echo "Verified: all scenario recipes persisted successfully on the dashboard."
+fi
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":1,"name":"Focused Tests"}}' || true
+  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true
 ```
 
 ## Completion
 
 After all steps complete, summarize:
-- **Focus**: The topic tested and the focus slug used as the output folder
-- **Tests generated**: Total count, folder breakdown, coverage correlation
-- **Context used**: Whether AUTONOMA.md and scenarios.md were available or a codebase scan was used
-- **Output location**: `autonoma/qa-tests/{FOCUS_SLUG}/`
-- **Avoided duplicates**: How many existing tests were found and respected
+- **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`)
+- **Step 1**: Knowledge base location and core flow count
+- **Step 2**: Scenario count and entity types covered
+- **Step 3**: Total focused test count, folder breakdown, coverage correlation
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
diff --git a/adhoc/skills/generate-adhoc-tests/SKILL.md b/adhoc/skills/generate-adhoc-tests/SKILL.md
index 321dc20..c0e55e7 100644
--- a/adhoc/skills/generate-adhoc-tests/SKILL.md
+++ b/adhoc/skills/generate-adhoc-tests/SKILL.md
@@ -1,50 +1,67 @@
 ---
 name: generate-adhoc-tests
 description: >
-  Generates focused E2E test cases for a user-defined topic or feature area through a validated
-  pipeline. Accepts a focus description (e.g. "signatures and documents", "invoice edge cases")
-  and produces tests scoped to that domain. Reads existing knowledge base and scenarios if
-  available; falls back to a targeted codebase scan if not.
+  Generates E2E test cases for a codebase, focused on a user-defined topic, through a validated multi-step pipeline.
+  Each step runs in an isolated subagent and must pass deterministic validation before the next
+  step begins. Steps 1, 2, and 4 run as normal; Step 3 scopes test generation to the requested
+  topic. Use when you want targeted test coverage for a specific feature area.
 ---
 
-# Autonoma Ad Hoc Test Generation Pipeline
+# Autonoma Ad Hoc E2E Test Generation Pipeline
 
-You are orchestrating a focused test generation pipeline. The user has requested tests for a
-specific topic or feature area. You will resolve the focus, load context, spawn an isolated
-subagent to generate tests, and validate the output.
-
-**Every step MUST complete successfully and pass validation before proceeding.**
+You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+**Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## User Confirmation
+## User Confirmation Between Steps
+
+By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
+confirmation using the `AskUserQuestion` tool. This creates an interactive
+UI prompt that makes it clear the user needs to respond before the pipeline continues.
 
-By default, after the test generation step, you MUST present the summary and ask the user for
-confirmation using the `AskUserQuestion` tool before uploading.
+After calling `AskUserQuestion`, wait for the user's response.
+Only proceed to the next step after they confirm.
 
-**Auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE` is set to `true`, skip the confirmation prompt
-and proceed directly to upload after presenting the summary.
+**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
+skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
+the summary. The summaries are still displayed — only the confirmation prompt is skipped.
 
 ## Before Starting
 
-Save the project root (subagents change working directory, so we need an absolute path reference):
+Resolve the focus prompt from the user's input (the text after the command name):
+
+```bash
+FOCUS_PROMPT="<the user's focus description>"
+FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g')
+echo "Focus: $FOCUS_PROMPT"
+echo "Slug:  $FOCUS_SLUG"
+```
+
+If no focus description was provided, list top-level route/feature directories in the codebase,
+call `AskUserQuestion` with 3–4 suggested focus areas plus an "Other" option, and wait for the user's
+response. Then set `FOCUS_PROMPT` to their answer and derive `FOCUS_SLUG` from it with the command above.
+
+Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
 ```bash
 AUTONOMA_ROOT="$(pwd)"
 echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma/qa-tests
+mkdir -p autonoma/skills autonoma/qa-tests
+```
+
+The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+```bash
+PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
 ```
 
-Read the environment variables:
+Read the environment variables. These are required for reporting progress back to Autonoma:
 - `AUTONOMA_API_KEY` — your Autonoma API key
 - `AUTONOMA_PROJECT_ID` — your Autonoma project ID
 - `AUTONOMA_API_URL` — Autonoma API base URL
-- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip confirmation prompt
+- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
 
-Derive a clean human-readable application name:
-```bash
-APP_NAME=$(git remote get-url origin 2>/dev/null | sed 's/.*\///' | sed 's/\.git//' || basename "$(pwd)")
-```
+Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
 
-Create the generation record so the dashboard can track progress:
+Create the generation record so the dashboard can track progress in real time:
 ```bash
 RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -55,107 +72,204 @@ BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
 mkdir -p autonoma
-echo "$GENERATION_ID" > autonoma/.generation-id
+echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}"
 echo "Generation ID: $GENERATION_ID"
 ```
 
-If `GENERATION_ID` is empty, log it for debugging and continue — reporting is best-effort.
+If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
 
-## Step 1: Resolve Focus Prompt
+## Step 1: Generate Knowledge Base
 
-Read the user's input. The text after the skill name is the focus description.
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true
+```
 
-**If a focus description was provided**, use it directly. Derive `FOCUS_SLUG`:
+Spawn the `kb-generator` subagent with the following task:
+
+> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md`
+> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with
+> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count.
+> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
+> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty
+2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically
+3. Read the file and present the frontmatter to the user — specifically the core_flows table
+
+Report step complete and upload skills:
 ```bash
-FOCUS_PROMPT="<the user's description>"
-FOCUS_SLUG=$(echo "$FOCUS_PROMPT" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g')
-echo "Focus: $FOCUS_PROMPT"
-echo "Slug: $FOCUS_SLUG"
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true
+
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true
+
+[ -n "$GENERATION_ID" ] && python3 -c "
+import os, json, sys
+root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
+skills = []
+d = os.path.join(root, 'autonoma/skills')
+if os.path.isdir(d):
+    for f in os.listdir(d):
+        if f.endswith('.md'):
+            with open(os.path.join(d, f)) as fh:
+                skills.append({'name': f, 'content': fh.read()})
+print(json.dumps({'skills': skills}))
+" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d @- || true
 ```
 
-**If no focus description was provided**, check available context and suggest focus areas:
-1. Read `autonoma/AUTONOMA.md` for `core_flows` if it exists
-2. Otherwise list top-level route/feature files in the codebase
-3. Call `AskUserQuestion` with 3–4 suggested focus areas drawn from what you found, plus an "Other"
-   option so the user can describe their own
-4. Wait for the user's response, then derive `FOCUS_SLUG` from their answer
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
+   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
 
-## Step 2: Load Context
+## Step 2: Generate Scenarios
 
 Report step start:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":0,"name":"Context"}}' || true
+  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Loading codebase context for focused test generation..."}}' || true
+  -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true
 ```
 
-Prefer existing main-planner outputs; fall back to a targeted codebase scan if they are absent:
+Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
 
+If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
+Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
+State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+
+Fetch and validate the artifact:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+mkdir -p "$AUTONOMA_ROOT/autonoma"
+BODY='{"action":"discover"}'
+SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
+  -H "Content-Type: application/json" \
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
+  exit 1
+fi
+printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json"
 ```
-if autonoma/AUTONOMA.md exists → read it; extract core_flows, feature_count, app_name
-if autonoma/scenarios.md exists → read it; extract scenario names, entity_types, variable_fields
-if autonoma/skills/ exists → list all .md files in that directory
-if autonoma/qa-tests/ exists → list all existing test files (title + path) to avoid duplication
-else → scan the codebase for routes, pages, and features relevant to FOCUS_PROMPT
-```
 
-Compile an `EXISTING_TESTS` summary: a flat list of "folder/filename: title" for every test that
-already exists under `autonoma/qa-tests/`. This will be passed to the subagent.
+If the fetch fails or validation fails, stop the pipeline at Step 2.
+Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
+
+Spawn the `scenario-generator` subagent with the following task:
+
+> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
+> artifact from `autonoma/discover.json`.
+> Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
+> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
+> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
+> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
+> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
+> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
+
+**After the subagent completes:**
+1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
+2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
+3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically
+4. Read the file and present the summary to the user — scenario names, entity counts, entity types,
+   discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":0,"name":"Context"}}' || true
+  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
 ```
 
-## Step 3: Generate Focused Tests
+5. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
+   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
+
+## Step 3: Generate Focused E2E Test Cases
 
 Report step start:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":1,"name":"Focused Tests"}}' || true
+  -d '{"type":"step.started","data":{"step":2,"name":"Focused E2E Tests"}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generating focused E2E tests for: ${FOCUS_PROMPT}\"}}" || true
+  -d '{"type":"log","data":{"message":"Generating focused E2E test cases from knowledge base and scenarios..."}}' || true
 ```
 
-Spawn the `focused-test-case-generator` subagent with the following task (substitute actual values
-for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
+Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual
+values for FOCUS_PROMPT and FOCUS_SLUG before spawning):
 
 > **FOCUS_PROMPT**: <the user's focus description>
 > **FOCUS_SLUG**: <kebab-case slug>
 >
+> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
+> and scenarios from `autonoma/scenarios.md`.
 > Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
 > Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
->
-> Context available (use what exists, skip what doesn't):
-> - Knowledge base: `autonoma/AUTONOMA.md` (core_flows: <list>, feature_count: <n>)
-> - Scenarios: `autonoma/scenarios.md` (scenarios: <list>, variable_fields: <list>)
-> - Skills: `autonoma/skills/` (<n> files)
->
-> EXISTING_TESTS (do not duplicate these):
-> <flat list of existing test paths and titles>
->
 > You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
 > total_tests, total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Write INDEX.md FIRST, then individual test files.
+> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
+> scenario data when it is needed to test a real user-facing app behavior within the focus area.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
@@ -166,23 +280,19 @@ for `FOCUS_PROMPT`, `FOCUS_SLUG`, and the loaded context before spawning):
 Report step complete and upload test cases:
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests/${FOCUS_SLUG}" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused tests for '${FOCUS_PROMPT}'. Preparing upload...\"}}" || true
-```
+  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} focused test cases. Uploading to dashboard...\"}}" || true
 
-**If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-- question: "Do these focused tests look correct for your requested topic?"
-- options: ["Yes, upload to dashboard", "I want to suggest changes"]
-Wait for the user's response before uploading.
-**If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to upload.
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.completed","data":{"step":2,"name":"Focused E2E Tests"}}' || true
 
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
 proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
@@ -204,18 +314,143 @@ print(json.dumps({'testCases': test_cases}))
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
   -d @- || true
+```
+
+4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+   - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly."
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+
+## Step 4: Environment Factory
 
+Report step start:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true
+```
+
+This step requires these environment variables:
+- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
+- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
+
+If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
+preflight validation. State plainly that both environment variables are mandatory.
+
+Spawn the `env-factory-generator` subagent with the following task:
+
+> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
+> Implement or complete the Autonoma Environment Factory in the project's backend so it can
+> support the planned scenarios with the current SDK contract, then validate the planned scenarios
+> against that implementation.
+> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
+> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
+> Preserve the existing discover integration if it already works, and finish `up` / `down`
+> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
+> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
+> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
+> The recipe file must match the current setup API schema:
+> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
+> `name`, `description`, `create`, and `validation` with `status: "validated"`,
+> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
+> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
+> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
+> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
+> Persisted `create` must remain tokenized — never store resolved concrete values.
+> After writing the recipe file, run the preflight helper to validate all recipes against the
+> live SDK endpoint before uploading:
+> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
+> The preflight must pass for all three scenarios before Step 4 is considered complete.
+
+**After the subagent completes:**
+1. Verify the backend implementation or integration changes were made
+2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+3. Run the preflight helper if the subagent did not already do so:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+```
+If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
+4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
+5. Report which environment variables the backend now requires
+6. Report any backend issues that still need manual attention
+
+Report step complete:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+if [ -n "$GENERATION_ID" ]; then
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
+    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
+    exit 1
+  fi
+  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @"$RECIPE_PATH")
+  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
+    exit 1
+  fi
+
+  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
+  if [ "$VERIFY_STATUS" != "200" ]; then
+    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
+    exit 1
+  fi
+  EXPECTED_NAMES=$(python3 -c "import json; data=json.load(open('$RECIPE_PATH')); print('\n'.join(r['name'] for r in data['recipes']))")
+  MISSING=""
+  for NAME in $EXPECTED_NAMES; do
+    HAS_ACTIVE=$(echo "$VERIFY_BODY" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+match = [s for s in data.get('scenarios', []) if s['name'] == '$NAME' and s.get('hasActiveRecipe')]
+print('yes' if match else 'no')
+" 2>/dev/null || echo "no")
+    if [ "$HAS_ACTIVE" != "yes" ]; then
+      MISSING="$MISSING $NAME"
+    fi
+  done
+  if [ -n "$MISSING" ]; then
+    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
+    echo "Step 4 cannot complete. Recipe upload may have partially failed."
+    exit 1
+  fi
+  echo "Verified: all scenario recipes persisted successfully on the dashboard."
+fi
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":1,"name":"Focused Tests"}}' || true
+  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true
 ```
 
 ## Completion
 
 After all steps complete, summarize:
-- **Focus**: The topic tested and the focus slug used as the output folder
-- **Tests generated**: Total count, folder breakdown, coverage correlation
-- **Context used**: Whether AUTONOMA.md and scenarios.md were available or a codebase scan was used
-- **Output location**: `autonoma/qa-tests/{FOCUS_SLUG}/`
-- **Avoided duplicates**: How many existing tests were found and respected
+- **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`)
+- **Step 1**: Knowledge base location and core flow count
+- **Step 2**: Scenario count and entity types covered
+- **Step 3**: Total focused test count, folder breakdown, coverage correlation
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results

From 7ed7b3073db83ac19616b9c1c135a1bd691da420 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <65306107+IgnacioPardo@users.noreply.github.com>
Date: Fri, 17 Apr 2026 15:52:11 -0300
Subject: [PATCH 21/33] feat: SDK integration step & hardened pipeline (#28)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 .claude-plugin/marketplace.json               |   4 +-
 .claude-plugin/plugin.json                    |   2 +-
 CLAUDE.md                                     |  73 ++-
 DEVELOPMENT.md                                |  21 +-
 README.md                                     | 214 +++----
 agents/env-factory-generator.md               | 325 ----------
 agents/scenario-validator.md                  | 217 +++++++
 agents/sdk-integrator.md                      | 272 ++++++++
 commands/generate-tests.md                    | 582 +++++++++++++-----
 hooks/validate-pipeline-output.sh             |  12 +
 .../validate_scenario_validation.py           |  67 ++
 hooks/validators/validate_sdk_endpoint.py     |  29 +
 hooks/validators/validate_sdk_integration.py  | 113 ++++
 skills/generate-tests/SKILL.md                | 568 ++++++++++++-----
 tests/test_validate_pipeline_output.py        | 137 +++++
 tests/test_validate_scenario_validation.py    |  65 ++
 tests/test_validate_sdk_endpoint.py           |  35 ++
 tests/test_validate_sdk_integration.py        |  79 +++
 18 files changed, 1982 insertions(+), 833 deletions(-)
 delete mode 100644 agents/env-factory-generator.md
 create mode 100644 agents/scenario-validator.md
 create mode 100644 agents/sdk-integrator.md
 create mode 100644 hooks/validators/validate_scenario_validation.py
 create mode 100644 hooks/validators/validate_sdk_endpoint.py
 create mode 100644 hooks/validators/validate_sdk_integration.py
 create mode 100644 tests/test_validate_scenario_validation.py
 create mode 100644 tests/test_validate_sdk_endpoint.py
 create mode 100644 tests/test_validate_sdk_integration.py

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index e18269f..33de4f0 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -14,12 +14,12 @@
         "repo": "Autonoma-AI/test-planner-plugin",
         "ref": "production"
       },
-      "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
+      "description": "Generates comprehensive E2E test cases through a validated multi-step pipeline with deterministic validation"
     },
     {
       "name": "autonoma-test-planner-development",
       "source": "./",
-      "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
+      "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated multi-step pipeline with deterministic validation"
     }
   ]
 }
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 3be3ef2..2de57c6 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "author": {
     "name": "Autonoma"
   }
diff --git a/CLAUDE.md b/CLAUDE.md
index c7642a5..7685d42 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,58 +1,67 @@
 # Autonoma Test Planner Plugin
 
-Claude Code plugin that generates E2E test suites through a 4-step deterministic pipeline.
+Claude Code plugin that generates E2E test suites through a deterministic 5-step pipeline.
 
 ## Project Structure
 
-```
-.claude-plugin/           # Plugin manifest (plugin.json, marketplace.json)
-commands/generate-tests.md  # Entry point — dispatches the 4-step pipeline
-skills/generate-tests/SKILL.md  # Orchestrator skill
-agents/                   # Isolated subagents (one per step)
-  kb-generator.md         # Step 1: Knowledge base → autonoma/AUTONOMA.md + features.json
-  scenario-generator.md   # Step 2: Discover + scenarios → autonoma/discover.json + autonoma/scenarios.md
-  test-case-generator.md  # Step 3: Tests → autonoma/qa-tests/INDEX.md + test files
-  env-factory-generator.md # Step 4: Environment Factory implementation/integration + scenario validation
+```text
+.claude-plugin/              # Plugin manifest
+commands/generate-tests.md   # Command entry
+skills/generate-tests/SKILL.md
+agents/
+  sdk-integrator.md          # Step 1: SDK integration
+  kb-generator.md            # Step 2: Knowledge base
+  scenario-generator.md      # Step 3: Scenarios
+  test-case-generator.md     # Step 4: E2E tests
+  scenario-validator.md      # Step 5: Scenario validation
 hooks/
-  hooks.json              # PostToolUse hook config (triggers on Write)
-  validate-pipeline-output.sh  # Bash dispatcher → routes to Python validators
-  validators/             # Python scripts that validate YAML frontmatter
+  hooks.json
+  validate-pipeline-output.sh
+  preflight_scenario_recipes.py
+  validators/
+tests/
 ```
 
-## How the Pipeline Works
+## Pipeline
 
-Each step spawns an isolated subagent. After each Write, the PostToolUse hook in `hooks/hooks.json` runs `validate-pipeline-output.sh`, which pattern-matches the file path and runs the appropriate Python validator. Validators exit 0 (OK) or 2 (block with error message).
+1. SDK Integration
+2. Knowledge Base
+3. Scenarios
+4. E2E Tests
+5. Scenario Validation
 
-Steps 1-3 require user confirmation before advancing. Step 4 is the final step.
+The canonical launch mode is `AUTONOMA_AUTO_ADVANCE=true`. The older flag setting
+`AUTONOMA_REQUIRE_CONFIRMATION=false` is still accepted as an equivalent alias. Step 5 is the final step.
 
 ## Validation
 
-Validators are in `hooks/validators/`. They parse YAML frontmatter and check required fields, types, and cross-file consistency. All validators print "OK" on success or an error message on failure.
+Validators are in `hooks/validators/`.
 
 | Validator | File matched | Key checks |
 |-----------|-------------|------------|
-| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | app_name, app_description (≥20 chars), core_flows with at least one `core: true` |
+| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | app_name, app_description, core_flows |
 | `validate_discover.py` | `*/autonoma/discover.json` | schema object, models, edges, relations, scopeField |
-| `validate_features.py` | `*/autonoma/features.json` | features array length matches total_features, valid types, at least one core feature |
-| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario_count ≥ 3, standard/empty/large scenarios present, entity_types, discover metadata, variable field strategy |
-| `validate_scenario_recipes.py` | `*/autonoma/scenario-recipes.json` | approved recipe file, validation mode, standard/empty/large present, lifecycle status |
-| `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals match folder sums, criticality sums, cross-checks against features.json |
-| `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | title, description, criticality (critical/high/mid/low), scenario, flow |
+| `validate_sdk_endpoint.py` | `*/autonoma/.sdk-endpoint` | absolute http/https URL |
+| `validate_sdk_integration.py` | `*/autonoma/.sdk-integration.json` | Step 1 handoff contract |
+| `validate_features.py` | `*/autonoma/features.json` | feature inventory schema |
+| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario count and metadata |
+| `validate_scenario_validation.py` | `*/autonoma/.scenario-validation.json` | Step 5 terminal-state contract |
+| `validate_scenario_recipes.py` | `*/autonoma/scenario-recipes.json` | recipe schema |
+| `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals and folder sums |
+| `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | test frontmatter |
+
+Scenario recipes also run live endpoint preflight through `hooks/preflight_scenario_recipes.py`.
 
 ## Development
 
 ```bash
-# Run plugin locally without installing
 claude --plugin-dir ./
-
-# Validate plugin structure
 claude plugin validate ./
+pytest
 ```
 
-## Dependencies
-
-- Python 3 + PyYAML (auto-installed by the hook if missing)
-
-## Known Issues
+## Notes
 
-- `commands/generate-tests.md` has unresolved merge conflicts between the AskUserQuestion approach and the end-turn approach for user confirmation between steps. Resolve before merging to main.
+- Step 1 installs the SDK from package managers only.
+- The SDK reference repo is read-only context.
+- Step 5 validates the live integration and does not edit backend code.
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index b6726f6..84b49ff 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -4,8 +4,8 @@ This guide explains how to test changes from a branch without publishing to the
 
 ## Prerequisites
 
-- [Claude Code](https://claude.ai/code) installed
-- Your branch pushed to GitHub
+- [Claude Code](https://claude.ai/code)
+- your branch pushed to GitHub
 
 ## Install from a branch
 
@@ -31,15 +31,22 @@ Push new commits to your branch, then reinstall:
 
 ## Environment variables
 
-The plugin requires three environment variables to be set in the project where you run it:
+The plugin itself requires these values in the target project session:
 
 | Variable | Description |
 | --- | --- |
-| `AUTONOMA_API_KEY` | Your Autonoma API key (get it from the dashboard under Settings > API Keys) |
-| `AUTONOMA_PROJECT_ID` | The application ID from the Autonoma dashboard |
-| `AUTONOMA_API_URL` | API base URL - use `http://localhost:4000` for local dev |
+| `AUTONOMA_API_KEY` | Autonoma API key |
+| `AUTONOMA_PROJECT_ID` | Application ID from the Autonoma dashboard |
+| `AUTONOMA_API_URL` | API base URL, for example `http://localhost:4000` in local dev |
 
-Add them to the `.env` file or export them in your shell before running Claude Code in the target project.
+You do **not** need to pre-set `AUTONOMA_SDK_ENDPOINT`, `AUTONOMA_SHARED_SECRET`, or `AUTONOMA_SIGNING_SECRET`.
+Step 1 creates or discovers those values in the target repo by editing `.env` and `.env.example`.
+
+Use `AUTONOMA_AUTO_ADVANCE=true` as the canonical launch mode while testing. The older
+confirmation flag setting `AUTONOMA_REQUIRE_CONFIRMATION=false` is still accepted as an
+equivalent alias for auto-advance.
+
+After the generated PR is merged, the user still needs to deploy those env changes.
 
 ## References
 
diff --git a/README.md b/README.md
index 176ea66..9da57e3 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,13 @@
 # Autonoma Test Planner
 
-A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 4-step pipeline.
+A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 5-step pipeline.
 
-Each step runs in an isolated subagent with deterministic validation — shell scripts check the output format before the pipeline advances. No hallucinated validations, no cascading errors.
+Each step runs in an isolated subagent with deterministic validation. The first step now integrates the Autonoma SDK directly into the target project, and the final step validates scenarios against that live endpoint without editing backend code.
 
 ## Install
 
-**Step 1:** Add the marketplace:
-
-```
+```text
 /plugin marketplace add Autonoma-AI/test-planner-plugin
-```
-
-**Step 2:** Install the plugin:
-
-```
 /plugin install autonoma-test-planner@autonoma
 ```
 
@@ -22,185 +15,138 @@ Each step runs in an isolated subagent with deterministic validation — shell s
 
 Inside any project with Claude Code:
 
-```
+```text
 /autonoma-test-planner:generate-tests
 ```
 
-The plugin walks you through 4 steps, asking for confirmation at each checkpoint before proceeding.
+The canonical launch mode is `AUTONOMA_AUTO_ADVANCE=true`, which advances automatically after
+Steps 1-4 instead of waiting for confirmation. The older confirmation flag setting
+`AUTONOMA_REQUIRE_CONFIRMATION=false` is still accepted as an equivalent alias.
 
-## How it works
+## Pipeline
 
-### Step 1: Knowledge Base
+### Step 1: SDK Integration
 
-Analyzes your frontend codebase and produces `autonoma/AUTONOMA.md` — a user-perspective map of every page, flow, and feature. The file includes YAML frontmatter with a core flows table that determines how test coverage is distributed.
+Detects the project stack, installs the Autonoma SDK from package managers, wires the endpoint, ensures secrets exist, starts or reuses a local dev server, verifies signed `discover` / `up` / `down`, and writes `autonoma/.sdk-endpoint` plus `autonoma/.sdk-integration.json`.
 
-**You review**: the core flows table. If a flow is marked `core: true`, it gets 50-60% of test coverage.
+It may also create a branch, commit the integration, and open a PR when `gh` is available.
 
-### Step 2: Scenarios
+**You review**: detected stack, installed packages, endpoint URL, generated env vars, and PR status.
 
-Reads the knowledge base and the SDK `discover` response from your backend Environment Factory to design three test data environments: `standard` (realistic variety), `empty` (empty states), and `large` (pagination/performance). Outputs `autonoma/discover.json` plus `autonoma/scenarios.md`, preserving the legacy scenario summary while adding schema metadata and minimal variable-field planning.
+### Step 2: Knowledge Base
 
-**You review**: entity names, counts, relationships, and which values truly must stay generated. Fixed values are preferred because they become stable test assertions; if uniqueness is needed, the planner should first prefer concrete hardcoded values with a discriminator. Variable fields are exceptions used only for genuinely dynamic values. Generator hints are optional and are not tied to `faker`.
+Analyzes the app and produces `autonoma/AUTONOMA.md` and `autonoma/features.json`.
 
-### Step 3: E2E Tests
+**You review**: the core flows table.
 
-Generates markdown test files organized by feature in `autonoma/qa-tests/`. Each test has frontmatter (title, description, criticality, scenario, flow) and uses only natural-language steps: click, scroll, type, assert.
+### Step 3: Scenarios
 
-An `INDEX.md` tracks total test count, folder breakdown, and coverage correlation with your codebase size.
+Fetches `discover` from the Step 1 endpoint and produces `autonoma/discover.json` plus `autonoma/scenarios.md`.
 
-`scenarios.md` is fixture input for this step, not the subject under test. Step 3 should not spend test budget verifying seeded counts or Environment Factory correctness.
+**You review**: entity names, counts, relationships, and which values should stay concrete versus variable.
 
-**You review**: test distribution and coverage correlation. Test count should roughly match 3-5x your route/feature count.
+### Step 4: E2E Tests
 
-### Step 4: Environment Factory
+Generates markdown test files in `autonoma/qa-tests/` plus `INDEX.md`.
 
-Implements or completes the backend Environment Factory so the planned scenarios can actually be created and torn down through the current SDK contract. Step 4 includes backend wiring plus validation: `discover`, `up`, `down`, request signing, refs signing, a smoke-tested lifecycle, and validation of the planned scenarios with `autonoma/scenario-recipes.json`. After validation, the plugin uploads the parsed recipe document to the setup API through the dedicated `scenario-recipe-versions` route so Step 04 in `agent` can persist normalized scenario data directly.
+**You review**: test distribution and coverage correlation.
 
-**You review**: where the Environment Factory lives, what changed, whether a smoke `discover` → `up` → `down` check passed, and whether `standard`, `empty`, and `large` all passed lifecycle validation.
+### Step 5: Scenario Validation
 
-## Scenario Recipes
+Validates `standard`, `empty`, and `large` against the live SDK endpoint, writes `autonoma/scenario-recipes.json` plus `autonoma/.scenario-validation.json`, runs endpoint preflight, and uploads the approved recipes to the setup API only after all checks pass.
 
-`autonoma/scenario-recipes.json` is the validated handoff between planning and execution. It is produced in Step 4 after the Environment Factory has been implemented or verified and after each scenario has passed lifecycle validation.
+This step does **not** implement backend code. It only validates the existing integration.
 
-The file contains:
+## Key Outputs
 
-- top-level metadata: `version`, `source`, and `validationMode`
-- one recipe per named scenario, usually `standard`, `empty`, and `large`
-- for each recipe:
-  - `name` and `description`
-  - `create`: the inline data graph Autonoma will send to the SDK `up` action
-  - `validation`: proof that the recipe passed `checkScenario`, `checkAllScenarios`, or endpoint lifecycle validation
+- `autonoma/.sdk-endpoint`: validated SDK endpoint URL
+- `autonoma/.sdk-integration.json`: Step 1 machine-readable handoff
+- `autonoma/AUTONOMA.md`
+- `autonoma/features.json`
+- `autonoma/discover.json`
+- `autonoma/scenarios.md`
+- `autonoma/qa-tests/INDEX.md`
+- `autonoma/.scenario-validation.json`: Step 5 terminal-state artifact
+- `autonoma/scenario-recipes.json`
 
-Conceptually, a scenario recipe is not a test case. It is a data fixture definition for the Environment Factory. The `create` payload describes which records should exist before a run starts, including nested records and references such as `_alias` and `_ref`.
+## Environment Variables
 
-Example shape:
+Provide these before running the plugin:
 
-```json
-{
-  "version": 1,
-  "source": {
-    "discoverPath": "autonoma/discover.json",
-    "scenariosPath": "autonoma/scenarios.md"
-  },
-  "validationMode": "sdk-check",
-  "recipes": [
-    {
-      "name": "standard",
-      "description": "Realistic baseline workspace",
-      "create": {
-        "User": [{ "email": "{{owner_email}}" }]
-      },
-      "variables": {
-        "owner_email": {
-          "strategy": "derived",
-          "source": "testRunId",
-          "format": "owner+{testRunId}@example.com"
-        }
-      },
-      "validation": {
-        "status": "validated",
-        "method": "checkScenario",
-        "phase": "ok"
-      }
-    }
-  ]
-}
+```bash
+AUTONOMA_API_KEY=<api key>
+AUTONOMA_PROJECT_ID=<application id>
+AUTONOMA_API_URL=<setup api base url>
 ```
 
-Persisted recipes store tokenized `create` payloads plus `variables` metadata — never resolved concrete values. The `variables` field defines how each `{{token}}` is resolved at runtime using one of three strategies: `literal`, `derived` (from `testRunId`), or `faker`. This allows the `agent` to resolve the same tokens later for real runs.
-
-During Step 4, the plugin runs a preflight check that resolves tokens into transient concrete payloads and sends signed `up`/`down` requests to the live SDK endpoint. The write hook also enforces that same preflight before a final `autonoma/scenario-recipes.json` write is accepted. These transient values are never persisted.
-
-Storage semantics:
-
-- in this plugin repo, `autonoma/scenario-recipes.json` is a local output artifact so the user and validators can inspect it
-- when uploaded to `agent`, the backend does not keep the raw JSON file as text
-- instead, `agent` parses the document and stores the approved scenario recipe data in its scenario JSONB storage through the `scenario-recipe-versions` setup endpoint
-
-Runtime semantics:
-
-- the planner still thinks in named scenarios like `standard`, `empty`, and `large`
-- the SDK protocol does not require those names on the wire
-- before a run, Autonoma resolves the active stored recipe version for the selected scenario and sends its `create` payload to the Environment Factory `up` action
-- after the run, Autonoma calls `down` using the returned teardown refs/token
+Canonical:
 
-## Validation
-
-Every output file has YAML frontmatter validated by shell scripts (not prompts). If validation fails, Claude sees the error and must fix it before proceeding.
-
-| File | What's validated |
-|------|-----------------|
-| `AUTONOMA.md` | core_flows table, app description, feature/skill counts |
-| `discover.json` | SDK discover schema shape: models, edges, relations, scopeField, and supported `type` formats |
-| `scenarios.md` | scenario count, required scenarios (standard/empty/large), entity types, discover metadata, minimal variable fields |
-| `scenario-recipes.json` | validated recipe file, discover-aware model/field/type parity, required scenarios, optional variables consistency, and mandatory live endpoint preflight |
-| `INDEX.md` | test totals match folder sums, criticality counts sum correctly, test count within expected range |
-| Each test file | title, description, criticality (critical/high/mid/low), scenario, flow |
-
-## Environment Variables
+```bash
+AUTONOMA_AUTO_ADVANCE=true
+```
 
-Step 2 and Step 4 use the live SDK endpoint when fetching `discover` or validating through HTTP:
+Compatibility alias:
 
 ```bash
-AUTONOMA_SDK_ENDPOINT=<your sdk endpoint url>
-AUTONOMA_SHARED_SECRET=<shared HMAC secret>
+AUTONOMA_REQUIRE_CONFIRMATION=false
 ```
 
-Step 4 backend implementation uses the current SDK secret names:
+You no longer need to pre-provide `AUTONOMA_SDK_ENDPOINT`, `AUTONOMA_SHARED_SECRET`, or `AUTONOMA_SIGNING_SECRET`. Step 1 creates or discovers them in the target project.
+
+The integration step updates `.env` and `.env.example` in the target repo with:
 
 ```bash
-AUTONOMA_SHARED_SECRET=<shared HMAC secret>
-AUTONOMA_SIGNING_SECRET=<private refs signing secret>
+AUTONOMA_SHARED_SECRET=<shared hmac secret>
+AUTONOMA_SIGNING_SECRET=<private signing secret>
 ```
 
-## Requirements
+Those changes still need to be deployed after PR creation or merge.
+
+## Validation
 
-- Claude Code
-- Python 3 (ships with macOS/Linux)
-- PyYAML (auto-installed if missing)
+Every pipeline output is validated by shell-dispatched Python validators.
+
+| File | Validation |
+| --- | --- |
+| `AUTONOMA.md` | frontmatter and core-flow structure |
+| `features.json` | feature inventory schema |
+| `discover.json` | SDK discover schema |
+| `.sdk-endpoint` | absolute `http` or `https` URL |
+| `.sdk-integration.json` | Step 1 handoff contract |
+| `scenarios.md` | scenario schema and required sections |
+| `.scenario-validation.json` | Step 5 terminal-state contract |
+| `scenario-recipes.json` | recipe schema plus live endpoint preflight |
+| `INDEX.md` | test totals and folder breakdown |
+| test files | required frontmatter |
 
 ## Local Development
 
 ```bash
-# Test locally without installing
 claude --plugin-dir ./
-
-# Validate plugin structure
 claude plugin validate ./
+pytest
 ```
 
 ## Project Structure
 
-```
+```text
 autonoma-test-planner/
 ├── .claude-plugin/
-│   ├── plugin.json                     # Plugin manifest
-│   └── marketplace.json                # Marketplace catalog
-├── skills/generate-tests/SKILL.md      # /generate-tests orchestrator
+├── commands/generate-tests.md
+├── skills/generate-tests/SKILL.md
 ├── agents/
-│   ├── kb-generator.md                 # Step 1 subagent
-│   ├── scenario-generator.md           # Step 2 subagent
-│   ├── test-case-generator.md          # Step 3 subagent
-│   └── env-factory-generator.md        # Step 4 subagent
+│   ├── sdk-integrator.md
+│   ├── kb-generator.md
+│   ├── scenario-generator.md
+│   ├── test-case-generator.md
+│   └── scenario-validator.md
 ├── hooks/
-│   ├── hooks.json                      # PostToolUse hook config
-│   ├── validate-pipeline-output.sh     # Validation dispatcher
-│   ├── preflight_scenario_recipes.py   # Preflight resolver + endpoint lifecycle checker
+│   ├── validate-pipeline-output.sh
+│   ├── preflight_scenario_recipes.py
 │   └── validators/
-│       ├── validate_kb.py
-│       ├── validate_discover.py
-│       ├── validate_scenario_recipes.py
-│       ├── validate_scenarios.py
-│       ├── validate_test_index.py
-│       └── validate_test_file.py
-├── LICENSE
-└── README.md
+└── tests/
 ```
 
-## Documentation
-
-Full prompt documentation: [docs.agent.autonoma.app/llms.txt](https://docs.agent.autonoma.app/llms.txt)
-
 ## License
 
 MIT
diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md
deleted file mode 100644
index 45998c1..0000000
--- a/agents/env-factory-generator.md
+++ /dev/null
@@ -1,325 +0,0 @@
----
-description: >
-  Implements or completes the Autonoma Environment Factory in the project's backend.
-  Extends an existing SDK integration when possible, wires discover/up/down behavior to the
-  planned scenarios, then validates the planned scenarios against the lifecycle before completing.
-tools:
-  - Read
-  - Glob
-  - Grep
-  - Write
-  - Edit
-  - Bash
-  - Agent
-  - WebFetch
-maxTurns: 60
----
-
-# Environment Factory Generator
-
-You implement or complete the Autonoma Environment Factory in the project's backend.
-Your inputs are `autonoma/discover.json`, `autonoma/scenarios.md`, and the backend codebase.
-Your output is working backend code plus validated scenario recipes.
-
-## Goal
-
-Step 2 already proved that the backend can answer `discover`, or at least that there is enough
-of an Environment Factory integration to expose schema metadata. Step 4's job is to finish the
-real backend implementation for scenario creation and teardown, then validate the planned scenarios
-against that implementation:
-
-1. make sure the backend exposes the current SDK protocol
-2. make sure `up` can create scenario data from inline `create` recipes
-3. make sure `down` can delete only the data created by `up`
-4. smoke-test the lifecycle in-session
-5. validate `standard`, `empty`, and `large`
-6. persist approved recipes to `autonoma/scenario-recipes.json`
-
-## Instructions
-
-1. First, fetch the latest implementation instructions:
-
-   Use WebFetch to read BOTH of these:
-   - `https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt`
-   - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt`
-
-   Follow the current SDK protocol from those docs. If the docs lag behind the repo, prefer the
-   real SDK contract already visible in the backend codebase.
-
-2. Read `autonoma/discover.json` and `autonoma/scenarios.md`.
-   - `discover.json` is the schema source of truth
-   - `scenarios.md` is the planning layer that defines what `standard`, `empty`, and `large`
-     should look like
-
-3. Explore the backend codebase to determine:
-   - whether the Autonoma SDK is already installed
-   - where the Environment Factory endpoint lives
-   - which parts already exist: `discover`, `up`, `down`, auth callback, teardown helpers
-   - what framework and ORM patterns the backend already uses
-
-## CRITICAL: Before Writing Any Code
-
-Ask the user for confirmation before implementing. Present a short plan:
-
-> "I'm about to implement or complete the Autonoma Environment Factory. Here's what I'll do:
->
-> **Endpoint location**: [route / handler path]
-> **Current state**: [what already exists vs what is missing]
-> **Step 4 scope**: make discover/up/down work with the current SDK contract and validate the planned scenarios against it
-> **Database operations**: `up` will create isolated test data and `down` will delete only those created refs
-> **Security**: HMAC-SHA256 request signing with `AUTONOMA_SHARED_SECRET` plus signed refs tokens with `AUTONOMA_SIGNING_SECRET`
->
-> **Environment variables needed**:
-> - `AUTONOMA_SHARED_SECRET`
-> - `AUTONOMA_SIGNING_SECRET`
->
-> Shall I proceed?"
-
-Do NOT proceed until the user confirms.
-
-## Implementation Requirements
-
-### Build on the existing backend
-
-- Prefer extending the existing Environment Factory endpoint over replacing it
-- Match the backend's framework, ORM, and route conventions
-- Do not create a separate throwaway server
-
-### Current SDK contract
-
-Implement or preserve these actions:
-
-| Action | Purpose |
-|--------|---------|
-| `discover` | Return schema metadata: version, sdk info, models, edges, relations, scopeField |
-| `up` | Accept inline `create` payloads plus optional `testRunId`, create data, return `auth`, `refs`, and `refsToken` |
-| `down` | Accept `refsToken`, verify it, and tear down the created data |
-
-### Security requirements
-
-Use these exact environment variable names:
-- `AUTONOMA_SHARED_SECRET` — HMAC request verification secret shared with Autonoma
-- `AUTONOMA_SIGNING_SECRET` — private secret for signing and verifying refs tokens
-
-Required protections:
-1. production guard unless explicitly allowed
-2. HMAC-SHA256 verification of the `x-signature` header
-3. signed refs tokens for teardown
-
-### Scenario implementation guidance
-
-- Use `autonoma/scenarios.md` to decide what data the backend needs to support
-- Preserve generated fields as generated values; do not force everything into static literals
-- Make unique fields depend on `testRunId` when needed
-- Prefer explicit create and teardown ordering based on the schema
-- If `discover` already works but `up` / `down` do not, keep the introspection path and finish the lifecycle
-
-### Per-run data isolation via testRunId
-
-When `scenarios.md` contains many variable fields with `generator: derived from testRunId` — especially
-on identifying fields like names, titles, and descriptions, not just emails — the app lacks natural
-multi-tenant isolation. The scenario generator slugged these fields so that parallel or sequential
-test runs never collide.
-
-Preserve all of these `{{testRunId}}` tokens in `create` payloads and map them to `derived` strategy
-entries in the recipe `variables` block. Do not collapse slugged fields back into concrete literals.
-For these apps, `testRunId` is effectively required for correct operation — note this in the summary
-you present to the user at the end of Step 4.
-
-### CRITICAL: Use nested tree structure in `create` payloads
-
-Recipe `create` payloads MUST use a **nested tree** rooted at the scope entity (the model that
-owns `scopeField`). Do NOT use flat top-level model keys connected only by `_ref`.
-
-**Why:** The Autonoma dashboard may reorder JSON object keys when forwarding the `create` payload
-to the SDK endpoint. The SDK's `resolveTree` processes models in `Object.entries(create)` insertion
-order. If a child model (e.g. `Tasks`) appears before its parent (e.g. `Organizations`), `_ref`
-aliases are not yet registered, the INSERT runs without the FK value, and NOT NULL constraints fail.
-
-**How:** Nest children inside their parent using the SDK's relation field names from `discover.json`.
-Look at the `relations` array in the discover response — the `parentField` value is the nesting key.
-
-Instead of flat `_ref`:
-```json
-{
-  "Organizations": [{"_alias": "org1", "name": "Acme"}],
-  "Users": [{"name": "Alice", "organizationId": {"_ref": "org1"}}]
-}
-```
-
-Use nested tree:
-```json
-{
-  "Organizations": [{
-    "_alias": "org1",
-    "name": "Acme",
-    "userses": [{"_alias": "u1", "name": "Alice"}]
-  }]
-}
-```
-
-The SDK automatically sets the child FK (`organizationId`) when a child is nested under its parent.
-Use `_ref` only for **cross-branch** references that cannot be expressed by nesting (e.g. a Task
-nested under a Project that references a User nested under the same Organization via `assigneeId`).
-
-Only use `{{testRunId}}` as a template token in `create` values — do not invent custom tokens like
-`{{user_email_alice}}`. The SDK's template engine only resolves built-in expressions
-(`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`, etc.). Custom tokens cause a runtime error when the
-dashboard sends the payload directly to the endpoint.
-
-## CRITICAL: Smoke-Test and Validate Within the Session
-
-After implementing, test the lifecycle in-session.
-
-At minimum:
-1. confirm `discover` still works
-2. send one signed `up` request with a small inline `create` payload compatible with the schema
-3. send the corresponding signed `down` request using the returned `refsToken`
-4. verify cleanup succeeds
-
-After the wiring works, validate `standard`, `empty`, and `large` against the backend.
-Prefer:
-1. backend-local `checkScenario` / `checkAllScenarios`
-2. signed endpoint `up` / `down` validation if local SDK checks are not practical
-
-Write the approved results to `autonoma/scenario-recipes.json`.
-
-## CRITICAL: scenario-recipes.json must match the current setup API schema
-
-The file must be a JSON object in this exact logical shape:
-
-```json
-{
-  "version": 1,
-  "source": {
-    "discoverPath": "autonoma/discover.json",
-    "scenariosPath": "autonoma/scenarios.md"
-  },
-  "validationMode": "sdk-check",
-  "recipes": [
-    {
-      "name": "standard",
-      "description": "Realistic dataset for core flows",
-      "create": {
-        "Organization": [{
-          "_alias": "org1",
-          "name": "Acme Corp",
-          "userses": [
-            { "_alias": "owner", "email": "owner-{{testRunId}}@example.com" }
-          ],
-          "projectses": [
-            { "name": "Main Project", "taskses": [
-              { "title": "First task", "assigneeId": { "_ref": "owner" } }
-            ]}
-          ]
-        }]
-      },
-      "variables": {
-        "testRunId": {
-          "strategy": "derived",
-          "source": "testRunId",
-          "format": "{testRunId}"
-        }
-      },
-      "validation": {
-        "status": "validated",
-        "method": "checkScenario",
-        "phase": "ok",
-        "up_ms": 12,
-        "down_ms": 8
-      }
-    }
-  ]
-}
-```
-
-**Note:** The `create` payload uses a nested tree structure. Children are nested under parents using
-the relation field names from `discover.json` (e.g. `userses`, `projectses`, `taskses`). The SDK
-automatically fills in parent FK fields. Only cross-branch references use `_ref`.
-
-Required rules:
-- top-level keys must be `version`, `source`, `validationMode`, and `recipes`
-- `version` must be the integer `1`
-- `source.discoverPath` must be `autonoma/discover.json`
-- `source.scenariosPath` must be `autonoma/scenarios.md`
-- `validationMode` must be `sdk-check` or `endpoint-lifecycle`
-- `recipes` must include `standard`, `empty`, and `large`
-- every recipe must contain `name`, `description`, `create`, and `validation`
-- every `validation` object must contain:
-  - `status: "validated"`
-  - `method`: one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`
-  - `phase: "ok"`
-  - optional `up_ms` / `down_ms` as non-negative integers
-
-### Per-recipe `variables` (required when `create` uses tokens)
-
-If `create` contains `{{token}}` placeholders, the recipe MUST include a `variables` object that
-defines how each token is resolved. The persisted `create` remains tokenized — concrete values are
-never stored. The `variables` field stores the planned generation logic so the `agent` can resolve
-tokens at runtime.
-
-Allowed strategies:
-- `literal` — `{ "strategy": "literal", "value": <scalar> }`
-- `derived` — `{ "strategy": "derived", "source": "testRunId", "format": "<template>" }`
-- `faker` — `{ "strategy": "faker", "generator": "<generator_id>" }`
-
-Allowed faker generators: `person.firstName`, `person.lastName`, `internet.email`, `company.name`, `lorem.words`.
-
-Rules:
-- every `{{token}}` in `create` must have a matching key in `variables`
-- every key in `variables` must be used as a `{{token}}` in `create`
-- fully concrete recipes (no tokens) do not need `variables`
-
-Do not write the old shape. In particular, do not use:
-- top-level `generatedAt`
-- top-level `scenarios`
-- per-recipe `validated`
-- per-recipe `timing`
-
-If you need timing data, map it into `validation.up_ms` and `validation.down_ms`.
-
-If any smoke test fails, fix the implementation and re-test.
-
-## CRITICAL: Preflight Endpoint Validation
-
-After generating tokenized recipes with `variables`, you MUST run a preflight check before
-writing the final `autonoma/scenario-recipes.json`. This is mandatory — backend-local
-`checkScenario` alone is NOT sufficient to complete Step 4.
-
-The preflight flow for each recipe:
-1. Generate a synthetic `testRunId`: `autonoma-preflight-<scenario>-<unix_ms>-<short_suffix>`
-2. Resolve all `{{token}}` placeholders using the `variables` definitions and the synthetic `testRunId`
-3. Send a signed `up` request to `AUTONOMA_SDK_ENDPOINT` with the resolved `create` payload
-4. Verify `up` returns `auth`, `refs`, and `refsToken`
-5. Send a signed `down` request with the returned `refs` and `refsToken`
-6. Verify `down` succeeds
-
-Run the preflight helper script:
-```bash
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json
-```
-
-This script requires `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` environment variables.
-
-If preflight fails, do NOT upload the recipe file. Fix the recipe or backend issue and re-run.
-The transient concrete values used during preflight are never persisted.
-
-## What to Explain to the User
-
-When finished, explain:
-1. where the Environment Factory lives in the backend
-2. what was added or fixed
-3. what env vars are required:
-   - `AUTONOMA_SHARED_SECRET`
-   - `AUTONOMA_SIGNING_SECRET`
-4. what smoke tests were run and whether the lifecycle succeeded
-5. whether `standard`, `empty`, and `large` validated successfully
-6. where `autonoma/scenario-recipes.json` was written
-
-## Important
-
-- Do not remove or rewrite existing working discover logic just because Step 2 now consumes it
-- Treat `discover.json` as the schema contract and `scenarios.md` as the scenario intent
-- Step 4 is both Environment Factory implementation/integration and scenario validation
-- Keep backend changes minimal and consistent with the repo's style
-- Do not claim rollback semantics unless the backend actually implements rollback
diff --git a/agents/scenario-validator.md b/agents/scenario-validator.md
new file mode 100644
index 0000000..b91a8b5
--- /dev/null
+++ b/agents/scenario-validator.md
@@ -0,0 +1,217 @@
+---
+description: >
+  Validates planned scenarios against a live Autonoma SDK endpoint and writes
+  approved scenario recipes. Assumes SDK integration is already complete.
+tools:
+  - Read
+  - Glob
+  - Grep
+  - Write
+  - Edit
+  - Bash
+  - Agent
+  - WebFetch
+maxTurns: 60
+---
+
+# Scenario Validator
+
+You validate the planned scenarios against an already-working Autonoma SDK endpoint.
+Your inputs are `autonoma/discover.json`, `autonoma/scenarios.md`, and the existing backend behavior.
+Your output is `autonoma/scenario-recipes.json`.
+You MUST also leave a terminal artifact in `autonoma/.scenario-validation.json`.
+
+## Goal
+
+Step 1 already handled SDK installation, endpoint wiring, secrets, branch creation, and any PR work.
+This step is validation-only. Your job is to:
+
+1. read the schema contract from `autonoma/discover.json`
+2. read the scenario intent from `autonoma/scenarios.md`
+3. smoke-test `discover`, `up`, and `down` against the live endpoint
+4. validate `standard`, `empty`, and `large`
+5. persist approved recipes to `autonoma/scenario-recipes.json`
+
+## Strict Prohibitions
+
+- Do NOT install packages.
+- Do NOT edit backend code.
+- Do NOT modify SDK source code.
+- Do NOT modify database schemas or migrations.
+- Do NOT create branches, commits, or PRs.
+- Do NOT try to "fix" validation failures by changing the SDK contract.
+
+If validation fails, report the backend or recipe issue clearly and stop. Treat failures as integration or scenario issues, not coding tasks for this step.
+On failure, still write `autonoma/.scenario-validation.json` with `status: "failed"` and all blocking issues.
+
+## Instructions
+
+1. Fetch the current SDK protocol reference:
+   - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt`
+
+2. Read:
+   - `autonoma/discover.json`
+   - `autonoma/scenarios.md`
+
+3. Read `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` from the environment.
+   - If either variable is missing or the endpoint is unreachable, stop and tell the user to check Step 1 or the local dev server status.
+   - Do not try to implement or repair the endpoint in this step.
+
+## Validation Requirements
+
+### Smoke-test the live endpoint
+
+At minimum:
+1. confirm `discover` works
+2. send one signed `up` request with a small inline `create` payload compatible with the schema
+3. send the corresponding signed `down` request using the returned `refsToken`
+4. verify cleanup succeeds
+
+### Scenario validation
+
+After the smoke test works, validate `standard`, `empty`, and `large` against the current backend.
+
+Prefer:
+1. backend-local `checkScenario` / `checkAllScenarios` if already available without code changes
+2. signed endpoint `up` / `down` validation otherwise
+
+Do not change the backend if validation fails. Report the failure and stop.
+
+## Recipe Shape Requirements
+
+Write `autonoma/scenario-recipes.json` in this exact logical shape:
+
+```json
+{
+  "version": 1,
+  "source": {
+    "discoverPath": "autonoma/discover.json",
+    "scenariosPath": "autonoma/scenarios.md"
+  },
+  "validationMode": "sdk-check",
+  "recipes": [
+    {
+      "name": "standard",
+      "description": "Realistic dataset for core flows",
+      "create": {
+        "Organization": [{
+          "_alias": "org1",
+          "name": "Acme Corp {{testRunId}}"
+        }]
+      },
+      "variables": {
+        "testRunId": {
+          "strategy": "derived",
+          "source": "testRunId",
+          "format": "{testRunId}"
+        }
+      },
+      "validation": {
+        "status": "validated",
+        "method": "checkScenario",
+        "phase": "ok",
+        "up_ms": 12,
+        "down_ms": 8
+      }
+    }
+  ]
+}
+```
+
+Required rules:
+- top-level keys must be `version`, `source`, `validationMode`, and `recipes`
+- `version` must be integer `1`
+- `source.discoverPath` must be `autonoma/discover.json`
+- `source.scenariosPath` must be `autonoma/scenarios.md`
+- `validationMode` must be `sdk-check` or `endpoint-lifecycle`
+- `recipes` must include `standard`, `empty`, and `large`
+- every recipe must contain `name`, `description`, `create`, and `validation`
+- every `validation` object must contain:
+  - `status: "validated"`
+  - `method`: one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`
+  - `phase: "ok"`
+  - optional `up_ms` / `down_ms` as non-negative integers
+
+### Nested tree requirement
+
+Recipe `create` payloads MUST use a nested tree rooted at the scope entity.
+Do NOT use flat top-level model keys connected only by `_ref`.
+
+Children must be nested under their parent using the relation field names from `discover.json`.
+Use `_ref` only for cross-branch references that cannot be expressed through nesting.
+
+### Variables requirement
+
+If `create` contains `{{token}}` placeholders, include a `variables` object for that recipe.
+
+Allowed strategies:
+- `literal`
+- `derived`
+- `faker`
+
+Rules:
+- every `{{token}}` in `create` must have a matching key in `variables`
+- every key in `variables` must be used in `create`
+- fully concrete recipes do not need `variables`
+- if the backend requires explicit scalar foreign-key values in addition to nested trees, include those scalar assignments using `_ref`-resolved values
+- any collision-prone unique value must be derived from `testRunId`
+
+Do not write the old shape. In particular, do not use:
+- top-level `generatedAt`
+- top-level `scenarios`
+- per-recipe `validated`
+- per-recipe `timing`
+
+## Preflight Endpoint Validation
+
+After writing `autonoma/scenario-recipes.json`, you MUST run:
+
+```bash
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json
+```
+
+This requires:
+- `AUTONOMA_SDK_ENDPOINT`
+- `AUTONOMA_SHARED_SECRET`
+
+If preflight fails, do NOT rewrite backend code. Report the failure clearly and stop.
+
+Before returning, always write `autonoma/.scenario-validation.json` with this shape:
+
+```json
+{
+  "status": "ok",
+  "preflightPassed": true,
+  "smokeTestPassed": true,
+  "validatedScenarios": ["standard", "empty", "large"],
+  "failedScenarios": [],
+  "blockingIssues": [],
+  "recipePath": "autonoma/scenario-recipes.json",
+  "validationMode": "sdk-check",
+  "endpointUrl": "http://localhost:3000/api/autonoma"
+}
+```
+
+If the step fails, keep the same shape but set:
+- `status: "failed"`
+- `preflightPassed: false` and/or `smokeTestPassed: false` for whichever check did not pass
+- `failedScenarios` to the scenarios that failed
+- `blockingIssues` to the concrete validation/runtime blockers
+
+## What to Explain to the User
+
+When finished, explain:
+1. the endpoint that was validated
+2. whether the smoke `discover -> up -> down` lifecycle passed
+3. whether `standard`, `empty`, and `large` validated successfully
+4. what validation method was used
+5. where `autonoma/scenario-recipes.json` was written
+6. where `autonoma/.scenario-validation.json` was written
+7. any remaining manual deployment or backend issues that need attention
+
+## Important
+
+- Treat `discover.json` as the schema contract and `scenarios.md` as the scenario intent.
+- Assume SDK integration is already complete.
+- If the endpoint is down, tell the user to restart or redeploy the Step 1 integration instead of attempting code edits here.
+- The orchestrator must be able to trust `autonoma/.scenario-validation.json` as the only terminal-state signal for this step.
diff --git a/agents/sdk-integrator.md b/agents/sdk-integrator.md
new file mode 100644
index 0000000..a0d47b7
--- /dev/null
+++ b/agents/sdk-integrator.md
@@ -0,0 +1,272 @@
+---
+description: >
+  Detects the project stack, installs the Autonoma SDK from package managers,
+  wires the endpoint, starts a local dev server, verifies discover/up/down, and
+  opens a PR when possible.
+tools:
+  - Read
+  - Glob
+  - Grep
+  - Write
+  - Edit
+  - Bash
+  - Agent
+  - WebFetch
+maxTurns: 60
+---
+
+# SDK Integrator
+
+You implement the Autonoma SDK integration as the first step of the planner pipeline.
+
+## Goal
+
+Detect the stack, install the SDK from package managers, add a minimal endpoint following the matching example or SDK README, ensure secrets exist, start a dev server, verify `discover`, `up`, and `down`, and prepare the repo for user review.
+
+The SDK reference repo path is provided by the orchestrator in `/tmp/autonoma-sdk-ref-dir`. Treat that repo as read-only reference material only.
+
+## Strict Rules
+
+- Install the SDK from package managers only. Never vendor, copy, or link SDK source into the user's app.
+- Do NOT modify the SDK reference repo.
+- Do NOT modify database schemas, migrations, or models.
+- Keep integration changes minimal and aligned with the project's existing conventions.
+- Do NOT commit `.env`.
+- Do NOT commit anything under `autonoma/`.
+- You MUST leave a machine-readable terminal artifact in `autonoma/.sdk-integration.json` whether the step succeeds or fails.
+- Do NOT report success unless both `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` have been written.
+
+## Required Order
+
+### 1. Detect the stack
+
+Inspect the repo for:
+- `package.json`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`
+- `pyproject.toml`, `requirements.txt`, `Pipfile`
+- `mix.exs`
+- `composer.json`
+- `pom.xml`, `build.gradle`
+- `Gemfile`
+- `Cargo.toml`
+- `go.mod`
+
+Determine:
+- language
+- server framework
+- ORM or DB adapter
+- package manager
+
+### 2. Map the stack to the SDK docs matrix
+
+Use the matching runnable example from the SDK reference repo when available.
+Otherwise use the documented SDK package combinations from SDK READMEs.
+
+Supported docs matrix:
+- TypeScript: `@autonoma-ai/sdk` plus the matching ORM/server packages
+- Python: `autonoma-sdk[...]`
+- Elixir: `autonoma_sdk`
+- PHP: `autonoma-ai/sdk`
+- Java: `com.autonoma.ai:autonoma-sdk`
+- Ruby: `autonoma-ai`
+- Rust: `autonoma-sdk`
+- Go: `github.com/autonoma-ai/sdk-go`
+
+### 3. Stop immediately if unsupported
+
+If the detected stack is not supported, stop and output a `mailto:` link to `support@autonoma.app`.
+
+The mailto body must include:
+- detected language
+- detected framework
+- detected ORM or DB layer
+- detected package manager
+- repo name or directory name
+
+### 4. Create a branch
+
+Create a branch in the user repo:
+- preferred base name: `autonoma/feat-autonoma-sdk`
+- if it already exists, append `-2`, `-3`, and so on
+
+### 5. Install SDK packages
+
+Use the project's package manager.
+
+Examples:
+- TypeScript + Express + Prisma:
+  - `npm install @autonoma-ai/sdk @autonoma-ai/sdk-prisma @autonoma-ai/server-express`
+- TypeScript + Next.js + Drizzle:
+  - `pnpm add @autonoma-ai/sdk @autonoma-ai/sdk-drizzle @autonoma-ai/server-web`
+- Python + FastAPI + SQLAlchemy:
+  - `pip install "autonoma-sdk[sqlalchemy,fastapi]"`
+- Python + Django:
+  - `pip install "autonoma-sdk[django]"`
+- Elixir + Phoenix + Ecto:
+  - add `{:autonoma_sdk, "~> 0.1"}`
+
+### 6. Implement the endpoint
+
+Follow the matching example or README pattern with minimal project-specific glue.
+
+Requirements:
+- match the repo's routing conventions
+- preserve existing auth/session patterns if the SDK auth callback needs them
+- implement the current SDK contract for `discover`, `up`, and `down`
+- do not create a throwaway second app or server if the project already has one
+
+### 7. Ensure secrets exist
+
+Check `.env` first if present.
+
+Ensure:
+- `AUTONOMA_SHARED_SECRET`
+- `AUTONOMA_SIGNING_SECRET`
+
+If missing:
+- generate with `openssl rand -hex 32`
+- ensure the two secrets differ
+- append or update `.env`
+- append or update `.env.example` with placeholder values and short comments
+
+Suggested comments:
+
+```env
+# Autonoma SDK - shared secret for HMAC request signing
+AUTONOMA_SHARED_SECRET=your-shared-secret-here
+# Autonoma SDK - private secret for signing refs tokens
+AUTONOMA_SIGNING_SECRET=your-signing-secret-here
+```
+
+### 8. Ensure planner artifacts are not committed
+
+If `/autonoma/` is not already ignored, add it to `.git/info/exclude`.
+
+### 9. Detect and run the dev server
+
+Prefer the repo's existing dev/start script or command.
+
+Examples to inspect:
+- package scripts: `dev`, `start:dev`, `start`
+- `Makefile`
+- `Procfile`
+- Django `manage.py runserver`
+- Phoenix `mix phx.server`
+
+If a suitable server is already running and the expected endpoint responds, reuse it.
+Otherwise start one in the background and persist its PID to:
+
+```text
+/tmp/autonoma-dev-server-pid
+```
+
+### 10. Verify endpoint behavior
+
+Run signed checks against the live endpoint:
+1. `discover`
+2. minimal `up`
+3. `down` using returned `refsToken`
+
+Do not continue if any of these fail.
+
+### 11. Write the verified endpoint URL
+
+Write the final endpoint URL to:
+
+```text
+autonoma/.sdk-endpoint
+```
+
+The file must contain only one absolute URL.
+
+### 12. Write the integration handoff artifact
+
+Write `autonoma/.sdk-integration.json` with this shape:
+
+```json
+{
+  "status": "ok",
+  "endpointUrl": "http://localhost:3000/api/autonoma",
+  "endpointPath": "/api/autonoma",
+  "stack": {
+    "language": "TypeScript",
+    "framework": "Express",
+    "orm": "Prisma",
+    "packageManager": "pnpm"
+  },
+  "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma", "@autonoma-ai/server-express"],
+  "sharedSecretPresent": true,
+  "signingSecretPresent": true,
+  "devServer": {
+    "startedByPlugin": true,
+    "pid": 12345
+  },
+  "verification": {
+    "discover": { "status": "ok", "validatedByPlugin": true },
+    "up": { "status": "ok" },
+    "down": { "status": "ok" }
+  },
+  "branch": {
+    "name": "autonoma/feat-autonoma-sdk"
+  },
+  "pr": {
+    "url": "https://github.com/..."
+  },
+  "blockingIssues": []
+}
+```
+
+If the step fails after doing any work, still write `autonoma/.sdk-integration.json` with:
+- `status: "failed"`
+- the best known values for stack, endpoint, server pid, and branch
+- failed verification statuses
+- every blocking issue listed in `blockingIssues`
+
+### 13. Commit only integration changes
+
+Stage only the SDK integration changes, such as:
+- route or handler files
+- package-manager manifests and lockfiles
+- `.env.example`
+- any required config files
+
+Do NOT stage:
+- `.env`
+- `autonoma/`
+
+Commit message:
+
+```text
+feat: integrate autonoma sdk
+```
+
+### 14. Create a PR when possible
+
+If `gh` is available:
+- push the branch
+- create a PR
+
+Include a summary, required env vars, deployment reminder, and:
+
+```text
+Co-authored-by: Autonoma <noreply@autonoma.app>
+```
+
+If `gh` is unavailable, report the exact manual next steps instead.
+
+### 15. Final report
+
+Explain:
+1. detected stack
+2. installed packages
+3. endpoint path and URL
+4. where secrets were added
+5. dev server PID
+6. PR URL or manual push/PR steps
+7. where `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` were written
+
+## Verification Notes
+
+- Use the SDK reference repo in `/tmp/autonoma-sdk-ref-dir` only for examples and package-selection guidance.
+- Prefer existing project conventions over generic examples when file placement differs.
+- If the project already contains a partial SDK integration, extend it rather than replacing it.
+- If lifecycle verification passes but artifact writing fails, the step is still incomplete.
diff --git a/commands/generate-tests.md b/commands/generate-tests.md
index 325317e..7f0bbc2 100644
--- a/commands/generate-tests.md
+++ b/commands/generate-tests.md
@@ -9,46 +9,213 @@ description: >
 
 # Autonoma E2E Test Generation Pipeline
 
-You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
 ## User Confirmation Between Steps
 
-By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
-confirmation using the `AskUserQuestion` tool. This creates an interactive
-UI prompt that makes it clear the user needs to respond before the pipeline continues.
+By default, after each step (1, 2, 3, and 4), present the summary and automatically proceed to the
+next step once validation passes.
+
+**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after
+Steps 1-4.
+
+**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`,
+that means auto-advance as well.
+
+If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation
+using the `AskUserQuestion` tool.
 
 After calling `AskUserQuestion`, wait for the user's response.
 Only proceed to the next step after they confirm.
 
-**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
-skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
-the summary. The summaries are still displayed — only the confirmation prompt is skipped.
-
 ## Before Starting
 
-Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
+Create the output directory and save the project root:
+
 ```bash
 AUTONOMA_ROOT="$(pwd)"
 echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma/skills autonoma/qa-tests
+mkdir -p autonoma autonoma/skills autonoma/qa-tests
+cleanup_dev_server() {
+  DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '')
+  if [ -n "$DEV_SERVER_PID" ]; then
+    kill "$DEV_SERVER_PID" 2>/dev/null || true
+    rm -f /tmp/autonoma-dev-server-pid
+    echo "Dev server (PID $DEV_SERVER_PID) stopped."
+  fi
+}
 ```
 
-The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write. Bash snippets that need plugin-local files read it back:
+
 ```bash
 PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
 ```
 
-Read the environment variables. These are required for reporting progress back to Autonoma:
-- `AUTONOMA_API_KEY` — your Autonoma API key
-- `AUTONOMA_PROJECT_ID` — your Autonoma project ID
-- `AUTONOMA_API_URL` — Autonoma API base URL
-- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
+Read the environment variables required for reporting progress back to Autonoma:
+- `AUTONOMA_API_KEY`
+- `AUTONOMA_PROJECT_ID`
+- `AUTONOMA_API_URL`
+- `AUTONOMA_AUTO_ADVANCE` — optional, canonical
+- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias
+
+Add shared helpers before running the pipeline:
+
+```bash
+auto_advance_enabled() {
+  if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then
+    return 0
+  fi
+  if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then
+    return 0
+  fi
+  return 1
+}
+
+refresh_generation_id() {
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+}
+
+build_event_payload() {
+  python3 - "$1" "$2" "$3" <<'PY'
+import json
+import sys
+
+event_type, key, value = sys.argv[1:4]
+print(json.dumps({"type": event_type, "data": {key: json.loads(value)}}))
+PY
+}
+
+build_step_payload() {
+  python3 - "$1" "$2" "$3" <<'PY'
+import json
+import sys
+
+event_type, step, name = sys.argv[1:4]
+print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}}))
+PY
+}
+
+post_setup_event_blocking() {
+  refresh_generation_id
+  payload="$1"
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  for attempt in 1 2 3; do
+    if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d "$payload" >/dev/null; then
+      return 0
+    fi
+    sleep "$attempt"
+  done
+  echo "ERROR: Failed to post blocking setup event after retries: $payload"
+  return 1
+}
+
+post_setup_log() {
+  refresh_generation_id
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")")
+  curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d "$payload" >/dev/null || true
+}
+
+patch_setup_status_blocking() {
+  refresh_generation_id
+  status="$1"
+  message="$2"
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  payload=$(python3 - "$status" "$message" <<'PY'
+import json
+import sys
+
+body = {"status": sys.argv[1]}
+if sys.argv[2]:
+    body["errorMessage"] = sys.argv[2]
+print(json.dumps(body))
+PY
+)
+  for attempt in 1 2 3; do
+    if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d "$payload" >/dev/null; then
+      return 0
+    fi
+    sleep "$attempt"
+  done
+  echo "ERROR: Failed to patch setup status after retries: $status"
+  return 1
+}
+
+report_error_and_exit() {
+  message="$1"
+  preserve_dev_server="${2:-false}"
+  payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")")
+  post_setup_event_blocking "$payload" || true
+  echo "ERROR: $message"
+  if [ "$preserve_dev_server" != "true" ]; then
+    cleanup_dev_server
+  fi
+  exit 1
+}
+
+report_partial_failure_and_exit() {
+  message="$1"
+  post_setup_log "$message"
+  patch_setup_status_blocking "partial_failure" "$message" || true
+  echo "ERROR: $message"
+  cleanup_dev_server
+  exit 1
+}
+
+rehydrate_sdk_env() {
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  AUTONOMA_SDK_ENDPOINT=$(tr -d '\n' < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" 2>/dev/null || echo '')
+  AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
+  AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
+  export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET
+  if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then
+    return 1
+  fi
+  return 0
+}
+```
+
+Prepare the SDK reference repo for Step 1:
 
-Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
+```bash
+SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}"
+if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then
+  echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
+else
+  SDK_REF_DIR="$(mktemp -d)/autonoma-sdk"
+  if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then
+    echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
+  else
+    echo "ERROR: Unable to prepare the SDK reference repo."
+    cleanup_dev_server
+    exit 1
+  fi
+fi
+```
+
+Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug.
 
 Create the generation record so the dashboard can track progress in real time:
+
 ```bash
 RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -58,28 +225,95 @@ HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
-mkdir -p autonoma
 echo "$GENERATION_ID" > autonoma/.generation-id
 echo "Generation ID: $GENERATION_ID"
 ```
 
-If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
+If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway.
 
-## Step 1: Generate Knowledge Base
+## Step 1: SDK Integration
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start."
+post_setup_log "Detecting stack and integrating the Autonoma SDK..."
+```
+
+Spawn the `sdk-integrator` subagent with the following task:
+
+> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context.
+> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with
+> a `mailto:support@autonoma.app` link if unsupported.
+> Create a branch, install the SDK from package managers only, implement the SDK endpoint following
+> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`
+> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server,
+> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and
+> `autonoma/.sdk-integration.json`, commit with
+> `feat: integrate autonoma sdk`, and create a PR if `gh` is available.
+> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models.
+
+**After the subagent completes:**
+1. Verify `autonoma/.sdk-endpoint` exists and is non-empty
+2. Verify `autonoma/.sdk-integration.json` exists and is non-empty
+3. Read and export `AUTONOMA_SDK_ENDPOINT` from `autonoma/.sdk-endpoint`
+4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env`
+5. Confirm the endpoint is reachable with a signed `discover` request
+6. Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes
+7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available
+
+Load the endpoint and secrets:
+
+```bash
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \
+  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \
+  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true
+
+rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true
+
+BODY='{"action":"discover"}'
+SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Analyzing codebase structure and identifying features..."}}' || true
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+if [ "$HTTP_STATUS" != "200" ]; then
+  report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true
+fi
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \
+  || report_error_and_exit "Step 1 discover response did not match the required schema." true
+```
+
+Report step complete:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true
+```
+
+8. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here."
+   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **Otherwise:** Skip the prompt and proceed directly to Step 2.
+
+## Step 2: Generate Knowledge Base
+
+Report step start:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start."
+post_setup_log "Analyzing codebase structure and identifying features..."
 ```
 
 Spawn the `kb-generator` subagent with the following task:
@@ -97,23 +331,16 @@ Spawn the `kb-generator` subagent with the following task:
 3. Read the file and present the frontmatter to the user — specifically the core_flows table
 
 Report step complete and upload skills:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard...\"}}" || true
-
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true
-
+post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..."
+post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion."
 [ -n "$GENERATION_ID" ] && python3 -c "
-import os, json, sys
+import os, json
 root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
 skills = []
 d = os.path.join(root, 'autonoma/skills')
@@ -129,42 +356,35 @@ print(json.dumps({'skills': skills}))
   -d @- || true
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
    - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
-   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
+   **Otherwise:** Skip the prompt and proceed directly to Step 3.
 
-## Step 2: Generate Scenarios
+## Step 3: Generate Scenarios
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Mapping data model and designing test data environments..."}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start."
+post_setup_log "Mapping data model and designing test data environments..."
 ```
 
-Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
-This step requires these environment variables:
-- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
-- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
-
-If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
-Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
-State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step assumes Step 1 already produced:
+- `AUTONOMA_SDK_ENDPOINT`
+- `AUTONOMA_SHARED_SECRET`
 
 Fetch and validate the artifact:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 mkdir -p "$AUTONOMA_ROOT/autonoma"
+rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1."
 BODY='{"action":"discover"}'
 SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
 RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
@@ -174,16 +394,13 @@ RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPO
 HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 if [ "$HTTP_STATUS" != "200" ]; then
-  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
-  exit 1
+  report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY"
 fi
 printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json"
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \
+  || report_error_and_exit "Step 3 discover artifact did not pass validation."
 ```
 
-If the fetch fails or validation fails, stop the pipeline at Step 2.
-Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
-
 Spawn the `scenario-generator` subagent with the following task:
 
 > Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
@@ -198,47 +415,36 @@ Spawn the `scenario-generator` subagent with the following task:
 
 **After the subagent completes:**
 1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
-2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
-3. The PostToolUse hook will have validated the `scenarios.md` frontmatter format automatically
-4. Read the file and present the summary to the user — scenario names, entity counts, entity types,
-   discover schema counts, and the minimal variable field tokens that remain dynamic
+2. Validate `autonoma/discover.json` using the plugin's validator
+3. The PostToolUse hook will have validated the `autonoma/scenarios.md` frontmatter format automatically
+4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
+post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."
+post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion."
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
-   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
+5. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests."
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
+   **Otherwise:** Skip the prompt and proceed directly to Step 4.
 
-## Step 3: Generate E2E Test Cases
+## Step 4: Generate E2E Test Cases
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Generating E2E test cases from knowledge base and scenarios..."}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start."
+post_setup_log "Generating E2E test cases from knowledge base and scenarios..."
 ```
 
 Spawn the `test-case-generator` subagent with the following task:
@@ -256,25 +462,62 @@ Spawn the `test-case-generator` subagent with the following task:
 
 **After the subagent completes:**
 1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty
-2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
-3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+2. Verify at least one non-`INDEX.md` test file exists
+3. Verify actual test count matches `INDEX.md`
+4. Verify folder breakdown matches `INDEX.md`
+5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
+6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+
+Enforce the file-count postconditions:
+
+```bash
+INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md"
+[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md."
+TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
+[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files."
+python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md."
+import sys
+from pathlib import Path
+import yaml
+
+index_path = Path(sys.argv[1])
+actual_count = int(sys.argv[2])
+qa_dir = Path(sys.argv[3])
+
+content = index_path.read_text()
+parts = content.split('---', 2)
+if len(parts) < 3:
+    raise SystemExit('INDEX.md is missing YAML frontmatter')
+frontmatter = yaml.safe_load(parts[1])
+
+if frontmatter.get('total_tests') != actual_count:
+    raise SystemExit(
+        f'total_tests ({frontmatter.get("total_tests")}) does not match actual test files ({actual_count})'
+    )
+
+actual_folders = {}
+for path in qa_dir.rglob('*.md'):
+    if path.name == 'INDEX.md':
+        continue
+    folder = path.parent.relative_to(qa_dir).as_posix()
+    actual_folders[folder] = actual_folders.get(folder, 0) + 1
+
+declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])}
+if actual_folders != declared_folders:
+    raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}')
+print('OK')
+PY
+```
 
 Report step complete and upload test cases:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d "{\"type\":\"log\",\"data\":{\"message\":\"Generated ${TEST_COUNT} test cases. Uploading to dashboard...\"}}" || true
-
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' || true
-
+post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..."
+post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion."
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
 proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
@@ -298,88 +541,79 @@ print(json.dumps({'testCases': test_cases}))
   -d @- || true
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app."
+   - options: ["Yes, proceed to Step 5", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+   **Otherwise:** Skip the prompt and proceed directly to Step 5.
 
-## Step 4: Environment Factory
+## Step 5: Scenario Validation
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Implementing or completing the Environment Factory and validating planned scenarios..."}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start."
+post_setup_log "Validating planned scenarios against the live SDK endpoint..."
 ```
 
-This step requires these environment variables:
-- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
-- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
-
-If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
-preflight validation. State plainly that both environment variables are mandatory.
-
-Spawn the `env-factory-generator` subagent with the following task:
+Spawn the `scenario-validator` subagent with the following task:
 
 > Read `autonoma/discover.json` and `autonoma/scenarios.md`.
-> Implement or complete the Autonoma Environment Factory in the project's backend so it can
-> support the planned scenarios with the current SDK contract, then validate the planned scenarios
-> against that implementation.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
-> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
-> Preserve the existing discover integration if it already works, and finish `up` / `down`
-> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
-> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
-> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
-> The recipe file must match the current setup API schema:
-> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
-> `name`, `description`, `create`, and `validation` with `status: "validated"`,
-> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
-> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
-> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
-> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
-> Persisted `create` must remain tokenized — never store resolved concrete values.
-> After writing the recipe file, run the preflight helper to validate all recipes against the
-> live SDK endpoint before uploading:
+> Validate the planned scenarios against the existing live SDK endpoint without editing backend code.
+> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`,
+> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact
+> `autonoma/.scenario-validation.json`, and run:
 > `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
-> The preflight must pass for all three scenarios before Step 4 is considered complete.
+> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs.
 
 **After the subagent completes:**
-1. Verify the backend implementation or integration changes were made
-2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
-3. Run the preflight helper if the subagent did not already do so:
+1. Rehydrate SDK env from Step 1 artifacts
+2. Verify `autonoma/.scenario-validation.json` exists and is non-empty
+3. Validate `autonoma/.scenario-validation.json`
+4. Require `status == "ok"` and `preflightPassed == true`
+5. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+6. Run the preflight helper if the subagent did not already do so
+7. If preflight fails, stop and report the failure without attempting code changes
+8. Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues
+
+Run and enforce preflight:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1."
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \
+  || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact."
+python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state."
+import json
+import sys
+
+payload = json.load(open(sys.argv[1]))
+if payload.get("status") != "ok":
+    raise SystemExit(f'status must be "ok", got {payload.get("status")!r}')
+if payload.get("preflightPassed") is not True:
+    raise SystemExit('preflightPassed must be true before Step 5 can upload recipes')
+print('OK')
+PY
+[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \
+  || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact."
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \
+  || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5."
 ```
-If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
-4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
-5. Report which environment variables the backend now requires
-6. Report any backend issues that still need manual attention
 
-Report step complete:
+Report step complete and upload scenario recipes:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+post_setup_log "Uploading validated scenario recipes to setup..."
 if [ -n "$GENERATION_ID" ]; then
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
   if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
-    exit 1
+    report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete."
   fi
   UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
     -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -389,24 +623,30 @@ if [ -n "$GENERATION_ID" ]; then
   UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
   echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
   if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
-    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
-    exit 1
+    report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete."
+  fi
+
+  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
+  if [ "$VERIFY_STATUS" != "200" ]; then
+    report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)."
   fi
 fi
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true
+post_setup_log "Scenario validation completed."
+post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion."
+cleanup_dev_server
 ```
 
 ## Completion
 
 After all steps complete, summarize:
-- **Step 1**: Knowledge base location and core flow count
-- **Step 2**: Scenario count and entity types covered
-- **Step 3**: Total test count, folder breakdown, coverage correlation
-- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
+- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available
+- **Step 2**: knowledge base location and core flow count
+- **Step 3**: scenario count and entity types covered
+- **Step 4**: total test count, folder breakdown, coverage correlation
+- **Step 5**: scenario validation results, smoke-test status, and recipe upload status
+
+If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID.
+For terminal failures after later steps begin, clean up the dev server before returning control to the user.
diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh
index dd7e3ec..2bf4bfc 100755
--- a/hooks/validate-pipeline-output.sh
+++ b/hooks/validate-pipeline-output.sh
@@ -34,6 +34,14 @@ case "$FILE_PATH" in
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_discover.py"
     VALIDATOR_NAME="validate-discover"
     ;;
+  */autonoma/.sdk-endpoint)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_endpoint.py"
+    VALIDATOR_NAME="validate-sdk-endpoint"
+    ;;
+  */autonoma/.sdk-integration.json)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_integration.py"
+    VALIDATOR_NAME="validate-sdk-integration"
+    ;;
   */autonoma/features.json)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_features.py"
     VALIDATOR_NAME="validate-features"
@@ -42,6 +50,10 @@ case "$FILE_PATH" in
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenarios.py"
     VALIDATOR_NAME="validate-scenarios"
     ;;
+  */autonoma/.scenario-validation.json)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py"
+    VALIDATOR_NAME="validate-scenario-validation"
+    ;;
   */autonoma/scenario-recipes.json)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_recipes.py"
     VALIDATOR_NAME="validate-scenario-recipes"
diff --git a/hooks/validators/validate_scenario_validation.py b/hooks/validators/validate_scenario_validation.py
new file mode 100644
index 0000000..1339352
--- /dev/null
+++ b/hooks/validators/validate_scenario_validation.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Validates autonoma/.scenario-validation.json."""
+import json
+import sys
+from urllib.parse import urlparse
+
+
+filepath = sys.argv[1]
+
+
+def fail(message: str) -> None:
+    print(message)
+    sys.exit(1)
+
+
+try:
+    with open(filepath) as fh:
+        payload = json.load(fh)
+except Exception as exc:
+    fail(f"Invalid JSON: {exc}")
+
+if not isinstance(payload, dict):
+    fail("Root must be a JSON object")
+
+required = [
+    "status",
+    "preflightPassed",
+    "smokeTestPassed",
+    "validatedScenarios",
+    "failedScenarios",
+    "blockingIssues",
+    "recipePath",
+    "validationMode",
+    "endpointUrl",
+]
+missing = [field for field in required if field not in payload]
+if missing:
+    fail(f"Missing required fields: {missing}")
+
+if payload.get("status") not in {"ok", "failed"}:
+    fail('status must be "ok" or "failed"')
+
+for field in ["preflightPassed", "smokeTestPassed"]:
+    if not isinstance(payload.get(field), bool):
+        fail(f"{field} must be a boolean")
+
+for field in ["validatedScenarios", "failedScenarios", "blockingIssues"]:
+    value = payload.get(field)
+    if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
+        fail(f"{field} must be a list of strings")
+
+recipe_path = payload.get("recipePath")
+if not isinstance(recipe_path, str) or not recipe_path.strip():
+    fail("recipePath must be a non-empty string")
+
+validation_mode = payload.get("validationMode")
+if validation_mode not in {"sdk-check", "endpoint-lifecycle"}:
+    fail('validationMode must be "sdk-check" or "endpoint-lifecycle"')
+
+endpoint_url = payload.get("endpointUrl")
+if not isinstance(endpoint_url, str) or not endpoint_url.strip():
+    fail("endpointUrl must be a non-empty string")
+parsed = urlparse(endpoint_url)
+if parsed.scheme not in {"http", "https"} or not parsed.netloc:
+    fail("endpointUrl must be an absolute http/https URL")
+
+print("OK")
diff --git a/hooks/validators/validate_sdk_endpoint.py b/hooks/validators/validate_sdk_endpoint.py
new file mode 100644
index 0000000..fd7df1e
--- /dev/null
+++ b/hooks/validators/validate_sdk_endpoint.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+"""Validates autonoma/.sdk-endpoint."""
+import sys
+from urllib.parse import urlparse
+
+
+filepath = sys.argv[1]
+
+try:
+    with open(filepath) as fh:
+        value = fh.read().strip()
+except Exception as exc:
+    print(f'Unable to read file: {exc}')
+    sys.exit(1)
+
+if not value:
+    print('.sdk-endpoint must contain a non-empty URL')
+    sys.exit(1)
+
+parsed = urlparse(value)
+if parsed.scheme not in {'http', 'https'}:
+    print('.sdk-endpoint must use http or https')
+    sys.exit(1)
+
+if not parsed.netloc:
+    print('.sdk-endpoint must include a host')
+    sys.exit(1)
+
+print('OK')
diff --git a/hooks/validators/validate_sdk_integration.py b/hooks/validators/validate_sdk_integration.py
new file mode 100644
index 0000000..fde09df
--- /dev/null
+++ b/hooks/validators/validate_sdk_integration.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Validates autonoma/.sdk-integration.json."""
+import json
+import sys
+from urllib.parse import urlparse
+
+
+filepath = sys.argv[1]
+
+
+def fail(message: str) -> None:
+    print(message)
+    sys.exit(1)
+
+
+try:
+    with open(filepath) as fh:
+        payload = json.load(fh)
+except Exception as exc:
+    fail(f"Invalid JSON: {exc}")
+
+if not isinstance(payload, dict):
+    fail("Root must be a JSON object")
+
+required = [
+    "status",
+    "endpointUrl",
+    "endpointPath",
+    "stack",
+    "packagesInstalled",
+    "sharedSecretPresent",
+    "signingSecretPresent",
+    "devServer",
+    "verification",
+    "branch",
+    "blockingIssues",
+]
+missing = [field for field in required if field not in payload]
+if missing:
+    fail(f"Missing required fields: {missing}")
+
+status = payload.get("status")
+if status not in {"ok", "failed"}:
+    fail('status must be "ok" or "failed"')
+
+endpoint_url = payload.get("endpointUrl")
+if not isinstance(endpoint_url, str) or not endpoint_url.strip():
+    fail("endpointUrl must be a non-empty string")
+parsed = urlparse(endpoint_url)
+if parsed.scheme not in {"http", "https"} or not parsed.netloc:
+    fail("endpointUrl must be an absolute http/https URL")
+
+endpoint_path = payload.get("endpointPath")
+if not isinstance(endpoint_path, str) or not endpoint_path.strip():
+    fail("endpointPath must be a non-empty string")
+
+stack = payload.get("stack")
+if not isinstance(stack, dict):
+    fail("stack must be an object")
+for field in ["language", "framework", "orm", "packageManager"]:
+    if field not in stack:
+        fail(f"stack.{field} is required")
+    if stack[field] is not None and not isinstance(stack[field], str):
+        fail(f"stack.{field} must be a string or null")
+
+packages = payload.get("packagesInstalled")
+if not isinstance(packages, list) or not all(isinstance(item, str) and item.strip() for item in packages):
+    fail("packagesInstalled must be a list of non-empty strings")
+
+for field in ["sharedSecretPresent", "signingSecretPresent"]:
+    if not isinstance(payload.get(field), bool):
+        fail(f"{field} must be a boolean")
+
+dev_server = payload.get("devServer")  # devServer section: must be an object with startedByPlugin (bool) and an optional pid
+if not isinstance(dev_server, dict):
+    fail("devServer must be an object")
+if not isinstance(dev_server.get("startedByPlugin"), bool):
+    fail("devServer.startedByPlugin must be a boolean")
+pid = dev_server.get("pid")  # None covers both a JSON null and a missing key
+if pid is not None and (isinstance(pid, bool) or not isinstance(pid, int)):  # bool subclasses int, so reject JSON true/false explicitly
+    fail("devServer.pid must be an integer or null")
+
+verification = payload.get("verification")
+if not isinstance(verification, dict):
+    fail("verification must be an object")
+for key in ["discover", "up", "down"]:
+    section = verification.get(key)
+    if not isinstance(section, dict):
+        fail(f"verification.{key} must be an object")
+    if section.get("status") not in {"ok", "failed"}:
+        fail(f'verification.{key}.status must be "ok" or "failed"')
+
+if not isinstance(verification.get("discover", {}).get("validatedByPlugin"), bool):
+    fail("verification.discover.validatedByPlugin must be a boolean")
+
+branch = payload.get("branch")
+if not isinstance(branch, dict) or not isinstance(branch.get("name"), str) or not branch.get("name", "").strip():
+    fail("branch.name must be a non-empty string")
+
+pr = payload.get("pr")
+if pr is not None:
+    if not isinstance(pr, dict):
+        fail("pr must be an object or null")
+    url = pr.get("url")
+    if url is not None:
+        if not isinstance(url, str) or not url.strip():
+            fail("pr.url must be a non-empty string or null")
+
+blocking = payload.get("blockingIssues")
+if not isinstance(blocking, list) or not all(isinstance(item, str) for item in blocking):
+    fail("blockingIssues must be a list of strings")
+
+print("OK")
diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md
index 7d09efb..7f0bbc2 100644
--- a/skills/generate-tests/SKILL.md
+++ b/skills/generate-tests/SKILL.md
@@ -9,46 +9,213 @@ description: >
 
 # Autonoma E2E Test Generation Pipeline
 
-You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
 ## User Confirmation Between Steps
 
-By default, after each step (1, 2, and 3), you MUST present the summary and then ask the user for
-confirmation using the `AskUserQuestion` tool. This creates an interactive
-UI prompt that makes it clear the user needs to respond before the pipeline continues.
+After each step (1, 2, 3, and 4), present the summary. Once validation passes, either proceed
+automatically or ask for confirmation, depending on the auto-advance setting described below.
+
+**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after
+Steps 1-4.
+
+**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`,
+that means auto-advance as well.
+
+If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation
+using the `AskUserQuestion` tool.
 
 After calling `AskUserQuestion`, wait for the user's response.
 Only proceed to the next step after they confirm.
 
-**Auto-advance mode:** If the environment variable `AUTONOMA_AUTO_ADVANCE` is set to `true`,
-skip the `AskUserQuestion` calls and automatically proceed to the next step after presenting
-the summary. The summaries are still displayed — only the confirmation prompt is skipped.
-
 ## Before Starting
 
-Create the output directory and save the project root (subagents change working directory, so we need an absolute path reference):
+Create the output directory and save the project root:
+
 ```bash
 AUTONOMA_ROOT="$(pwd)"
 echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma/skills autonoma/qa-tests
+mkdir -p autonoma autonoma/skills autonoma/qa-tests
+cleanup_dev_server() {
+  DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '')
+  if [ -n "$DEV_SERVER_PID" ]; then
+    kill "$DEV_SERVER_PID" 2>/dev/null || true
+    rm -f /tmp/autonoma-dev-server-pid
+    echo "Dev server (PID $DEV_SERVER_PID) stopped."
+  fi
+}
 ```
 
-The plugin root path (where hooks, validators, and helper scripts live) is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse validation hook on the first Write. All bash snippets that need plugin-local files read it back:
+The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write:
+
 ```bash
 PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
 ```
 
-Read the environment variables. These are required for reporting progress back to Autonoma:
-- `AUTONOMA_API_KEY` — your Autonoma API key
-- `AUTONOMA_PROJECT_ID` — your Autonoma project ID
-- `AUTONOMA_API_URL` — Autonoma API base URL
-- `AUTONOMA_AUTO_ADVANCE` — (optional) set to `true` to skip user confirmation prompts between steps
+Read the environment variables required for reporting progress back to Autonoma:
+- `AUTONOMA_API_KEY`
+- `AUTONOMA_PROJECT_ID`
+- `AUTONOMA_API_URL`
+- `AUTONOMA_AUTO_ADVANCE` — optional, canonical
+- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias
+
+Add shared helpers before running the pipeline:
+
+```bash
+auto_advance_enabled() {
+  if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then
+    return 0
+  fi
+  if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then
+    return 0
+  fi
+  return 1
+}
+
+refresh_generation_id() {
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+}
+
+build_event_payload() {
+  python3 - "$1" "$2" "$3" <<'PY'
+import json
+import sys
+
+event_type, key, value = sys.argv[1:4]
+print(json.dumps({"type": event_type, "data": {key: json.loads(value)}}))
+PY
+}
+
+build_step_payload() {
+  python3 - "$1" "$2" "$3" <<'PY'
+import json
+import sys
+
+event_type, step, name = sys.argv[1:4]
+print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}}))
+PY
+}
+
+post_setup_event_blocking() {
+  refresh_generation_id
+  payload="$1"
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  for attempt in 1 2 3; do
+    if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d "$payload" >/dev/null; then
+      return 0
+    fi
+    sleep "$attempt"
+  done
+  echo "ERROR: Failed to post blocking setup event after retries: $payload"
+  return 1
+}
+
+post_setup_log() {
+  refresh_generation_id
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")")
+  curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d "$payload" >/dev/null || true
+}
+
+patch_setup_status_blocking() {
+  refresh_generation_id
+  status="$1"
+  message="$2"
+  if [ -z "$GENERATION_ID" ]; then
+    return 0
+  fi
+  payload=$(python3 - "$status" "$message" <<'PY'
+import json
+import sys
+
+body = {"status": sys.argv[1]}
+if sys.argv[2]:
+    body["errorMessage"] = sys.argv[2]
+print(json.dumps(body))
+PY
+)
+  for attempt in 1 2 3; do
+    if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \
+      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+      -H "Content-Type: application/json" \
+      -d "$payload" >/dev/null; then
+      return 0
+    fi
+    sleep "$attempt"
+  done
+  echo "ERROR: Failed to patch setup status after retries: $status"
+  return 1
+}
+
+report_error_and_exit() {
+  message="$1"
+  preserve_dev_server="${2:-false}"
+  payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")")
+  post_setup_event_blocking "$payload" || true
+  echo "ERROR: $message"
+  if [ "$preserve_dev_server" != "true" ]; then
+    cleanup_dev_server
+  fi
+  exit 1
+}
+
+report_partial_failure_and_exit() {
+  message="$1"
+  post_setup_log "$message"
+  patch_setup_status_blocking "partial_failure" "$message" || true
+  echo "ERROR: $message"
+  cleanup_dev_server
+  exit 1
+}
+
+rehydrate_sdk_env() {  # Re-export the SDK endpoint and both secrets from on-disk artifacts; returns 1 if any of them is empty.
+  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+  AUTONOMA_SDK_ENDPOINT=$(tr -d '[:space:]' 2>/dev/null < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" || echo '')  # 2> precedes < so a missing file stays silent; strips CR/LF and stray blanks
+  AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
+  AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
+  export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET
+  if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then
+    return 1
+  fi
+  return 0
+}
+```
+
+Prepare the SDK reference repo for Step 1:
+
+```bash
+SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}"
+if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then
+  echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
+else
+  SDK_REF_DIR="$(mktemp -d)/autonoma-sdk"
+  if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then
+    echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
+  else
+    echo "ERROR: Unable to prepare the SDK reference repo."
+    cleanup_dev_server
+    exit 1
+  fi
+fi
+```
 
-Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug (e.g. "My App" not "my-app-v2-final"). Store it in `APP_NAME`.
+Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug.
 
 Create the generation record so the dashboard can track progress in real time:
+
 ```bash
 RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -58,24 +225,95 @@ HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
-mkdir -p autonoma
 echo "$GENERATION_ID" > autonoma/.generation-id
 echo "Generation ID: $GENERATION_ID"
 ```
 
-If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
+If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway.
 
-## Step 1: Generate Knowledge Base
+## Step 1: SDK Integration
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start."
+post_setup_log "Detecting stack and integrating the Autonoma SDK..."
+```
+
+Spawn the `sdk-integrator` subagent with the following task:
+
+> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context.
+> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with
+> a `mailto:support@autonoma.app` link if unsupported.
+> Create a branch, install the SDK from package managers only, implement the SDK endpoint following
+> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`
+> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server,
+> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and
+> `autonoma/.sdk-integration.json`, commit with
+> `feat: integrate autonoma sdk`, and create a PR if `gh` is available.
+> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models.
+
+**After the subagent completes:**
+1. Verify `autonoma/.sdk-endpoint` exists and is non-empty
+2. Verify `autonoma/.sdk-integration.json` exists and is non-empty
+3. Read and export `AUTONOMA_SDK_ENDPOINT` from `autonoma/.sdk-endpoint`
+4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env`
+5. Confirm the endpoint is reachable with a signed `discover` request
+6. Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes
+7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available
+
+Load the endpoint and secrets:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')  # re-derive here; do not rely on a variable set by an earlier snippet
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \
+  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \
+  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true
+rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true
+
+BODY='{"action":"discover"}'
+SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
   -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' || true
+  -H "x-signature: $SIG" \
+  -d "$BODY")
+if [ "$HTTP_STATUS" != "200" ]; then
+  report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true
+fi
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \
+  || report_error_and_exit "Step 1 discover response did not match the required schema." true
+```
+
+Report step complete:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true
+```
+
+8. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here."
+   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   Wait for the user's response before proceeding.
+   **Otherwise:** Skip the prompt and proceed directly to Step 2.
+
+## Step 2: Generate Knowledge Base
+
+Report step start:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
+post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start."
+post_setup_log "Analyzing codebase structure and identifying features..."
 ```
 
 Spawn the `kb-generator` subagent with the following task:
@@ -93,15 +331,14 @@ Spawn the `kb-generator` subagent with the following task:
 3. Read the file and present the frontmatter to the user — specifically the core_flows table
 
 Report step complete and upload skills:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":0,"name":"Knowledge Base"}}' || true
-
+SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
+post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..."
+post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion."
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
 root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
@@ -119,37 +356,35 @@ print(json.dumps({'skills': skills}))
   -d @- || true
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
+4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
    - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
-   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
+   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 2.
+   **Otherwise:** Skip the prompt and proceed directly to Step 3.
 
-## Step 2: Generate Scenarios
+## Step 3: Generate Scenarios
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":1,"name":"Scenarios"}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start."
+post_setup_log "Mapping data model and designing test data environments..."
 ```
 
-Before spawning the Step 2 subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
-This step requires these environment variables:
-- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
-- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
-
-If either variable is missing, stop and tell the user that Step 2 now requires SDK discover access.
-Do not suggest skipping ahead, reordering the pipeline, or continuing without a working Environment Factory endpoint.
-State plainly that the endpoint and both environment variables are mandatory prerequisites for Step 2.
+Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
+This step assumes Step 1 already produced:
+- `AUTONOMA_SDK_ENDPOINT`
+- `AUTONOMA_SHARED_SECRET`
 
 Fetch and validate the artifact:
+
 ```bash
-mkdir -p autonoma
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+mkdir -p "$AUTONOMA_ROOT/autonoma"
+rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1."
 BODY='{"action":"discover"}'
 SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
 RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
@@ -159,16 +394,13 @@ RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPO
 HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 if [ "$HTTP_STATUS" != "200" ]; then
-  echo "SDK discover failed (HTTP $HTTP_STATUS): $DISCOVER_BODY"
-  exit 1
+  report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY"
 fi
-printf '%s\n' "$DISCOVER_BODY" > autonoma/discover.json
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" autonoma/discover.json
+printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \
+  || report_error_and_exit "Step 3 discover artifact did not pass validation."
 ```
 
-If the fetch fails or validation fails, stop the pipeline at Step 2.
-Do not suggest skipping ahead. Tell the user to provide a working SDK endpoint and correct shared secret, then rerun the command.
-
 Spawn the `scenario-generator` subagent with the following task:
 
 > Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
@@ -183,39 +415,36 @@ Spawn the `scenario-generator` subagent with the following task:
 
 **After the subagent completes:**
 1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
-2. Validate `autonoma/discover.json` using the plugin's validator (path saved in `/tmp/autonoma-plugin-root`)
+2. Validate `autonoma/discover.json` using the plugin's validator
 3. The PostToolUse hook will have validated the frontmatter format automatically
-4. Read the file and present the frontmatter summary to the user — scenario names, entity counts,
-   entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
+4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
 
 Report step complete:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":1,"name":"Scenarios"}}' || true
+post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."
+post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion."
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? Most seed values should stay concrete, ideally as planner-chosen literals with discriminators, and only truly dynamic values should remain variable for later tests."
-   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
+5. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests."
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 3.
+   **Otherwise:** Skip the prompt and proceed directly to Step 4.
 
-## Step 3: Generate E2E Test Cases
+## Step 4: Generate E2E Test Cases
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":2,"name":"E2E Tests"}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start."
+post_setup_log "Generating E2E test cases from knowledge base and scenarios..."
 ```
 
 Spawn the `test-case-generator` subagent with the following task:
@@ -233,19 +462,62 @@ Spawn the `test-case-generator` subagent with the following task:
 
 **After the subagent completes:**
 1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty
-2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
-3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+2. Verify at least one non-`INDEX.md` test file exists
+3. Verify actual test count matches `INDEX.md`
+4. Verify folder breakdown matches `INDEX.md`
+5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
+6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
+
+Enforce the file-count postconditions:
+
+```bash
+# Re-derive the project root like the other snippets do, so this block does not depend on an earlier shell's variables.
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md"
+[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md."
+TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
+[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files."
+python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md."
+import sys
+from pathlib import Path
+import yaml
+
+index_path = Path(sys.argv[1])
+actual_count = int(sys.argv[2])
+qa_dir = Path(sys.argv[3])
+parts = index_path.read_text().split('---', 2)
+if len(parts) < 3:
+    raise SystemExit('INDEX.md is missing YAML frontmatter')
+# Empty frontmatter parses to None; fall back to {} so the checks below report a mismatch instead of crashing.
+frontmatter = yaml.safe_load(parts[1]) or {}
+
+declared_total = frontmatter.get('total_tests')
+if declared_total != actual_count:
+    raise SystemExit(f'total_tests ({declared_total}) does not match actual test files ({actual_count})')
+
+# Tally test files per folder (path relative to qa-tests), skipping every INDEX.md.
+actual_folders = {}
+for path in qa_dir.rglob('*.md'):
+    if path.name != 'INDEX.md':
+        folder = path.parent.relative_to(qa_dir).as_posix()
+        actual_folders[folder] = actual_folders.get(folder, 0) + 1
+
+declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])}
+if actual_folders != declared_folders:
+    raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}')
+print('OK')
+PY
+```
 
 Report step complete and upload test cases:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":2,"name":"E2E Tests"}}' || true
-
+TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
+post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..."
+post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion."
 [ -n "$GENERATION_ID" ] && python3 -c "
 import os, json
 proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
@@ -269,85 +541,79 @@ print(json.dumps({'testCases': test_cases}))
   -d @- || true
 ```
 
-4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
-   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes/features in your app."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+7. **If auto-advance is disabled:** Call `AskUserQuestion` with:
+   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app."
+   - options: ["Yes, proceed to Step 5", "I want to suggest changes"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+   **Otherwise:** Skip the prompt and proceed directly to Step 5.
 
-## Step 4: Environment Factory
+## Step 5: Scenario Validation
 
 Report step start:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.started","data":{"step":3,"name":"Environment Factory"}}' || true
+post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start."
+post_setup_log "Validating planned scenarios against the live SDK endpoint..."
 ```
 
-This step requires these environment variables:
-- `AUTONOMA_SDK_ENDPOINT` — full URL of the customer's SDK endpoint
-- `AUTONOMA_SHARED_SECRET` — the HMAC shared secret used by the SDK endpoint
-
-If either variable is missing, stop and tell the user that Step 4 requires SDK endpoint access for
-preflight validation. State plainly that both environment variables are mandatory.
-
-Spawn the `env-factory-generator` subagent with the following task:
+Spawn the `scenario-validator` subagent with the following task:
 
 > Read `autonoma/discover.json` and `autonoma/scenarios.md`.
-> Implement or complete the Autonoma Environment Factory in the project's backend so it can
-> support the planned scenarios with the current SDK contract, then validate the planned scenarios
-> against that implementation.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-4-implement-scenarios.txt
-> and https://docs.agent.autonoma.app/llms/guides/environment-factory.txt first.
-> Preserve the existing discover integration if it already works, and finish `up` / `down`
-> behavior using `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`.
-> Smoke-test the discover -> up -> down lifecycle in-session after implementing.
-> Then validate `standard`, `empty`, and `large`, and write approved recipes to `autonoma/scenario-recipes.json`.
-> The recipe file must match the current setup API schema:
-> top-level `version: 1`, `source`, `validationMode`, `recipes`; each recipe must use
-> `name`, `description`, `create`, and `validation` with `status: "validated"`,
-> a valid `method`, `phase: "ok"`, and optional `up_ms` / `down_ms`.
-> Do not use the old shape with top-level `scenarios`, `generatedAt`, or per-recipe `validated` / `timing`.
-> When `create` uses `{{token}}` placeholders, include a `variables` field per recipe that defines
-> how each token is resolved. Allowed strategies: `literal`, `derived`, `faker`.
-> Persisted `create` must remain tokenized — never store resolved concrete values.
-> After writing the recipe file, run the preflight helper to validate all recipes against the
-> live SDK endpoint before uploading:
+> Validate the planned scenarios against the existing live SDK endpoint without editing backend code.
+> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`,
+> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact
+> `autonoma/.scenario-validation.json`, and run:
 > `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
-> The preflight must pass for all three scenarios before Step 4 is considered complete.
+> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs.
 
 **After the subagent completes:**
-1. Verify the backend implementation or integration changes were made
-2. Verify `autonoma/scenario-recipes.json` exists and is non-empty
-3. Run the preflight helper if the subagent did not already do so:
+1. Rehydrate SDK env from Step 1 artifacts
+2. Verify `autonoma/.scenario-validation.json` exists and is non-empty
+3. Validate `autonoma/.scenario-validation.json`
+4. Require `status == "ok"` and `preflightPassed == true`
+5. Verify `autonoma/scenario-recipes.json` exists and is non-empty
+6. Run the preflight helper if the subagent did not already do so
+7. If preflight fails, stop and report the failure without attempting code changes
+8. Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues
+
+Run and enforce preflight:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1."
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \
+  || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact."
+python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state."
+import json
+import sys
+
+payload = json.load(open(sys.argv[1]))
+if payload.get("status") != "ok":
+    raise SystemExit(f'status must be "ok", got {payload.get("status")!r}')
+if payload.get("preflightPassed") is not True:
+    raise SystemExit('preflightPassed must be true before Step 5 can upload recipes')
+print('OK')
+PY
+[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \
+  || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact."
+python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \
+  || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5."
 ```
-If preflight fails, do NOT proceed to upload. Report the failure to the user and stop.
-4. Present the results to the user — endpoint location, what was implemented or fixed, smoke-test results, per-scenario preflight results
-5. Report which environment variables the backend now requires
-6. Report any backend issues that still need manual attention
 
-Report step complete:
+Report step complete and upload scenario recipes:
+
 ```bash
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
 echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
+post_setup_log "Uploading validated scenario recipes to setup..."
 if [ -n "$GENERATION_ID" ]; then
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
   if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
-    exit 1
+    report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete."
   fi
   UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
     -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
@@ -357,50 +623,30 @@ if [ -n "$GENERATION_ID" ]; then
   UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
   echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
   if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
-    echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
-    exit 1
+    report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete."
   fi
 
-  # Verify recipes were persisted by fetching them back from the dashboard
   VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
     -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
   VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
   VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
   if [ "$VERIFY_STATUS" != "200" ]; then
-    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
-    exit 1
-  fi
-  # Extract scenario names from the uploaded recipes file and verify each one exists with an active recipe
-  EXPECTED_NAMES=$(python3 -c "import json; data=json.load(open('$RECIPE_PATH')); print('\n'.join(r['name'] for r in data['recipes']))")
-  MISSING=""
-  for NAME in $EXPECTED_NAMES; do
-    HAS_ACTIVE=$(echo "$VERIFY_BODY" | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
-match = [s for s in data.get('scenarios', []) if s['name'] == '$NAME' and s.get('hasActiveRecipe')]
-print('yes' if match else 'no')
-" 2>/dev/null || echo "no")
-    if [ "$HAS_ACTIVE" != "yes" ]; then
-      MISSING="$MISSING $NAME"
-    fi
-  done
-  if [ -n "$MISSING" ]; then
-    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
-    echo "Step 4 cannot complete. Recipe upload may have partially failed."
-    exit 1
+    report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)."
   fi
-  echo "Verified: all scenario recipes persisted successfully on the dashboard."
 fi
-[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d '{"type":"step.completed","data":{"step":3,"name":"Environment Factory"}}' || true
+post_setup_log "Scenario validation completed."
+post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion."
+cleanup_dev_server
 ```
 
 ## Completion
 
 After all steps complete, summarize:
-- **Step 1**: Knowledge base location and core flow count
-- **Step 2**: Scenario count and entity types covered
-- **Step 3**: Total test count, folder breakdown, coverage correlation
-- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
+- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available
+- **Step 2**: knowledge base location and core flow count
+- **Step 3**: scenario count and entity types covered
+- **Step 4**: total test count, folder breakdown, coverage correlation
+- **Step 5**: scenario validation results, smoke-test status, and recipe upload status
+
+If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID.
+For terminal failures after later steps begin, clean up the dev server before returning control to the user.
diff --git a/tests/test_validate_pipeline_output.py b/tests/test_validate_pipeline_output.py
index 3077ba2..b40bc26 100644
--- a/tests/test_validate_pipeline_output.py
+++ b/tests/test_validate_pipeline_output.py
@@ -64,6 +64,143 @@
 }
 
 
+def test_sdk_endpoint_hook_accepts_valid_url():
+    env = os.environ.copy()
+
+    code, out, err = _run_hook(
+        {
+            'autonoma/.sdk-endpoint': 'http://127.0.0.1:3000/api/autonoma\n',
+        },
+        'autonoma/.sdk-endpoint',
+        env,
+    )
+
+    assert code == 0
+    assert out == ''
+    assert err == ''
+
+
+def test_sdk_endpoint_hook_blocks_invalid_url():
+    env = os.environ.copy()
+
+    code, _, err = _run_hook(
+        {
+            'autonoma/.sdk-endpoint': '/api/autonoma\n',
+        },
+        'autonoma/.sdk-endpoint',
+        env,
+    )
+
+    assert code == 2
+    assert 'validate-sdk-endpoint' in err
+    assert 'http or https' in err
+
+
+def test_sdk_integration_hook_accepts_valid_json():
+    env = os.environ.copy()
+
+    code, out, err = _run_hook(
+        {
+            'autonoma/.sdk-integration.json': json.dumps(
+                {
+                    'status': 'ok',
+                    'endpointUrl': 'http://127.0.0.1:3000/api/autonoma',
+                    'endpointPath': '/api/autonoma',
+                    'stack': {
+                        'language': 'TypeScript',
+                        'framework': 'Express',
+                        'orm': 'Prisma',
+                        'packageManager': 'pnpm',
+                    },
+                    'packagesInstalled': ['@autonoma-ai/sdk'],
+                    'sharedSecretPresent': True,
+                    'signingSecretPresent': True,
+                    'devServer': {'startedByPlugin': True, 'pid': 1234},
+                    'verification': {
+                        'discover': {'status': 'ok', 'validatedByPlugin': True},
+                        'up': {'status': 'ok'},
+                        'down': {'status': 'ok'},
+                    },
+                    'branch': {'name': 'autonoma/feat-autonoma-sdk'},
+                    'pr': {'url': 'https://github.com/example/repo/pull/1'},
+                    'blockingIssues': [],
+                }
+            ),
+        },
+        'autonoma/.sdk-integration.json',
+        env,
+    )
+
+    assert code == 0
+    assert out == ''
+    assert err == ''
+
+
+def test_sdk_integration_hook_blocks_invalid_json():
+    env = os.environ.copy()
+
+    code, _, err = _run_hook(
+        {
+            'autonoma/.sdk-integration.json': json.dumps({'status': 'ok'}),
+        },
+        'autonoma/.sdk-integration.json',
+        env,
+    )
+
+    assert code == 2
+    assert 'validate-sdk-integration' in err
+    assert 'Missing required fields' in err
+
+
+def test_scenario_validation_hook_accepts_valid_json():
+    env = os.environ.copy()
+
+    code, out, err = _run_hook(
+        {
+            'autonoma/.scenario-validation.json': json.dumps(
+                {
+                    'status': 'ok',
+                    'preflightPassed': True,
+                    'smokeTestPassed': True,
+                    'validatedScenarios': ['standard', 'empty', 'large'],
+                    'failedScenarios': [],
+                    'blockingIssues': [],
+                    'recipePath': 'autonoma/scenario-recipes.json',
+                    'validationMode': 'sdk-check',
+                    'endpointUrl': 'http://127.0.0.1:3000/api/autonoma',
+                }
+            ),
+        },
+        'autonoma/.scenario-validation.json',
+        env,
+    )
+
+    assert code == 0
+    assert out == ''
+    assert err == ''
+
+
+def test_scenario_validation_hook_blocks_invalid_json():
+    env = os.environ.copy()
+
+    code, _, err = _run_hook(
+        {
+            'autonoma/.scenario-validation.json': json.dumps(
+                {
+                    'status': 'failed',
+                    'preflightPassed': False,
+                }
+            ),
+        },
+        'autonoma/.scenario-validation.json',
+        env,
+    )
+
+    assert code == 2
+    assert 'validate-scenario-validation' in err
+    assert 'Missing required fields' in err
+
+
 def _run_hook(files: dict[str, str], target: str, env: dict[str, str]) -> tuple[int, str, str]:
     with tempfile.TemporaryDirectory() as tmpdir:
         for relpath, content in files.items():
diff --git a/tests/test_validate_scenario_validation.py b/tests/test_validate_scenario_validation.py
new file mode 100644
index 0000000..a7f7b07
--- /dev/null
+++ b/tests/test_validate_scenario_validation.py
@@ -0,0 +1,65 @@
+"""Tests for validate_scenario_validation.py."""
+import json
+
+from conftest import run_validator
+
+
+SCRIPT = "validate_scenario_validation.py"
+
+
+def valid_payload(**overrides):
+    payload = {
+        "status": "ok",
+        "preflightPassed": True,
+        "smokeTestPassed": True,
+        "validatedScenarios": ["standard", "empty", "large"],
+        "failedScenarios": [],
+        "blockingIssues": [],
+        "recipePath": "autonoma/scenario-recipes.json",
+        "validationMode": "sdk-check",
+        "endpointUrl": "http://127.0.0.1:3000/api/autonoma",
+    }
+    payload.update(overrides)
+    return payload
+
+
+def test_accepts_valid_payload():
+    code, out = run_validator(SCRIPT, json.dumps(valid_payload()), filename=".scenario-validation.json")
+    assert code == 0
+    assert out == "OK"
+
+
+def test_accepts_failed_status_payload():
+    code, out = run_validator(
+        SCRIPT,
+        json.dumps(
+            valid_payload(
+                status="failed",
+                preflightPassed=False,
+                validatedScenarios=["standard"],
+                failedScenarios=["empty", "large"],
+                blockingIssues=["duplicate email"],
+            )
+        ),
+        filename=".scenario-validation.json",
+    )
+    assert code == 0
+    assert out == "OK"
+
+
+def test_rejects_missing_required_field():
+    payload = valid_payload()
+    payload.pop("recipePath")
+    code, out = run_validator(SCRIPT, json.dumps(payload), filename=".scenario-validation.json")
+    assert code == 1
+    assert "Missing required fields" in out
+
+
+def test_rejects_invalid_endpoint_url():
+    code, out = run_validator(
+        SCRIPT,
+        json.dumps(valid_payload(endpointUrl="relative/path")),
+        filename=".scenario-validation.json",
+    )
+    assert code == 1
+    assert "absolute http/https URL" in out
diff --git a/tests/test_validate_sdk_endpoint.py b/tests/test_validate_sdk_endpoint.py
new file mode 100644
index 0000000..319e0fb
--- /dev/null
+++ b/tests/test_validate_sdk_endpoint.py
@@ -0,0 +1,35 @@
+"""Tests for validate_sdk_endpoint.py."""
+from conftest import run_validator
+
+
+SCRIPT = 'validate_sdk_endpoint.py'
+
+
+def test_accepts_localhost_url():
+    code, out = run_validator(SCRIPT, 'http://localhost:3000/api/autonoma\n', filename='.sdk-endpoint')
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_accepts_https_url():
+    code, out = run_validator(SCRIPT, 'https://example.com/autonoma', filename='.sdk-endpoint')
+    assert code == 0
+    assert out == 'OK'
+
+
+def test_rejects_empty_content():
+    code, out = run_validator(SCRIPT, '', filename='.sdk-endpoint')
+    assert code == 1
+    assert 'non-empty URL' in out
+
+
+def test_rejects_relative_path():
+    code, out = run_validator(SCRIPT, '/api/autonoma', filename='.sdk-endpoint')
+    assert code == 1
+    assert 'http or https' in out
+
+
+def test_rejects_malformed_url():
+    code, out = run_validator(SCRIPT, 'https:///missing-host', filename='.sdk-endpoint')
+    assert code == 1
+    assert 'include a host' in out
diff --git a/tests/test_validate_sdk_integration.py b/tests/test_validate_sdk_integration.py
new file mode 100644
index 0000000..73fab81
--- /dev/null
+++ b/tests/test_validate_sdk_integration.py
@@ -0,0 +1,79 @@
+"""Tests for validate_sdk_integration.py."""
+import json
+
+from conftest import run_validator
+
+
+SCRIPT = "validate_sdk_integration.py"
+
+
+def valid_payload(**overrides):
+    payload = {
+        "status": "ok",
+        "endpointUrl": "http://127.0.0.1:3000/api/autonoma",
+        "endpointPath": "/api/autonoma",
+        "stack": {
+            "language": "TypeScript",
+            "framework": "Express",
+            "orm": "Prisma",
+            "packageManager": "pnpm",
+        },
+        "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma"],
+        "sharedSecretPresent": True,
+        "signingSecretPresent": True,
+        "devServer": {"startedByPlugin": True, "pid": 1234},
+        "verification": {
+            "discover": {"status": "ok", "validatedByPlugin": True},
+            "up": {"status": "ok"},
+            "down": {"status": "ok"},
+        },
+        "branch": {"name": "autonoma/feat-autonoma-sdk"},
+        "pr": {"url": "https://github.com/example/repo/pull/1"},
+        "blockingIssues": [],
+    }
+    payload.update(overrides)
+    return payload
+
+
+def test_accepts_valid_payload():
+    code, out = run_validator(SCRIPT, json.dumps(valid_payload()), filename=".sdk-integration.json")
+    assert code == 0
+    assert out == "OK"
+
+
+def test_rejects_missing_required_field():
+    payload = valid_payload()
+    payload.pop("verification")
+    code, out = run_validator(SCRIPT, json.dumps(payload), filename=".sdk-integration.json")
+    assert code == 1
+    assert "Missing required fields" in out
+
+
+def test_rejects_invalid_endpoint_url():
+    code, out = run_validator(
+        SCRIPT,
+        json.dumps(valid_payload(endpointUrl="/api/autonoma")),
+        filename=".sdk-integration.json",
+    )
+    assert code == 1
+    assert "absolute http/https URL" in out
+
+
+def test_accepts_failed_status_with_blocking_issues():
+    code, out = run_validator(
+        SCRIPT,
+        json.dumps(
+            valid_payload(
+                status="failed",
+                verification={
+                    "discover": {"status": "failed", "validatedByPlugin": False},
+                    "up": {"status": "failed"},
+                    "down": {"status": "failed"},
+                },
+                blockingIssues=["discover request failed"],
+            )
+        ),
+        filename=".sdk-integration.json",
+    )
+    assert code == 0
+    assert out == "OK"

From 5ae0acc1a665849bbbf2dcdcf94b5e117d814b50 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Mon, 20 Apr 2026 11:03:23 -0300
Subject: [PATCH 22/33] feat: delete new autonoma-adhoc-planner plugin and
 replace it with new command generate-adhoc-tests

---
 .claude-plugin/marketplace.json               |   9 +-
 README.md                                     |  51 ++-----
 adhoc/.claude-plugin/plugin.json              |   8 -
 adhoc/hooks/hooks.json                        |  15 --
 adhoc/hooks/validate-pipeline-output.sh       |  69 ---------
 .../validate_directory_structure.py           |  44 ------
 adhoc/hooks/validators/validate_test_file.py  |  46 ------
 adhoc/hooks/validators/validate_test_index.py | 130 ----------------
 .../focused-test-case-generator.md            |  92 +++++++-----
 .../generate-adhoc-tests.md                   | 138 +++++++++++------
 hooks/validate-pipeline-output.sh             |   7 +-
 .../generate-adhoc-tests/SKILL.md             | 140 ++++++++++++------
 12 files changed, 269 insertions(+), 480 deletions(-)
 delete mode 100644 adhoc/.claude-plugin/plugin.json
 delete mode 100644 adhoc/hooks/hooks.json
 delete mode 100755 adhoc/hooks/validate-pipeline-output.sh
 delete mode 100644 adhoc/hooks/validators/validate_directory_structure.py
 delete mode 100644 adhoc/hooks/validators/validate_test_file.py
 delete mode 100644 adhoc/hooks/validators/validate_test_index.py
 rename {adhoc/agents => agents}/focused-test-case-generator.md (54%)
 rename {adhoc/commands => commands}/generate-adhoc-tests.md (84%)
 rename {adhoc/skills => skills}/generate-adhoc-tests/SKILL.md (84%)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 28b78b2..08cf736 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -14,17 +14,12 @@
         "repo": "Autonoma-AI/test-planner-plugin",
         "ref": "production"
       },
-      "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
+      "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands."
     },
     {
       "name": "autonoma-test-planner-development",
       "source": "./",
-      "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation"
-    },
-    {
-      "name": "autonoma-adhoc-planner",
-      "source": "./adhoc",
-      "description": "Generates focused E2E tests for a user-defined topic or feature area with a custom system prompt"
+      "description": "[DEVELOPMENT] Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands."
     }
   ]
 }
diff --git a/README.md b/README.md
index 1020424..ba4a030 100644
--- a/README.md
+++ b/README.md
@@ -60,60 +60,42 @@ Implements or completes the backend Environment Factory so the planned scenarios
 
 ---
 
-## Autonoma Ad Hoc Planner
+## Ad Hoc Test Generation
 
-A second plugin in this repository that runs the same 4-step pipeline but scopes Step 3 to a user-defined focus area. Use it when you want targeted test coverage for a specific feature without regenerating your full test suite.
-
-### Install
-
-**Step 1:** The marketplace is the same as above. If you've already added it, skip this:
-
-```
-/plugin marketplace add Autonoma-AI/test-planner-plugin
-```
-
-**Step 2:** Install the ad hoc plugin:
-
-```
-/plugin install autonoma-adhoc-planner@autonoma
-```
+The same plugin includes a `generate-adhoc-tests` command that generates tests focused on a specific topic without regenerating your full test suite.
 
 ### Usage
 
-Inside any project with Claude Code:
-
 Pass your focus description directly after the command:
 
 ```
-/autonoma-adhoc-planner:generate-adhoc-tests description
+/autonoma-test-planner:generate-adhoc-tests description
 ```
 
-Or invoke without arguments and the plugin will suggest focus areas based on your codebase:
+Or invoke without arguments and the command will suggest focus areas based on your codebase:
 
 ```
-/autonoma-adhoc-planner:generate-adhoc-tests
+/autonoma-test-planner:generate-adhoc-tests
 ```
 
-The plugin walks you through 4 steps, asking for confirmation at each checkpoint before proceeding.
-
-## How it works
+### How it works
 
-### How it differs from the main planner
+**Subsequent runs** (scenarios already configured in Autonoma): fetches scenarios and existing tests from the Autonoma API, then runs only focused test generation (Step 3). Steps 1, 2, and 4 are skipped.
 
-Steps 1, 2, and 4 run identically to the main planner. Step 3 is scoped:
+**First run** (no scenarios configured yet): runs the full 4-step pipeline with Step 3 scoped to the requested focus area.
 
-| Step | Main planner | Ad hoc planner |
-|------|-------------|----------------|
-| 1 — Knowledge Base | Full codebase | Full codebase |
-| 2 — Scenarios | Full data model | Full data model |
-| 3 — E2E Tests | All features | **Focus area only** |
-| 4 — Environment Factory | All scenarios | All scenarios |
+| Step | Full suite (`generate-tests`) | Ad hoc — first run | Ad hoc — subsequent run |
+|------|-------------------------------|-------------------|------------------------|
+| 1 — Knowledge Base | Always | Yes | Skipped |
+| 2 — Scenarios | Always | Yes | Skipped (fetched from API) |
+| 3 — E2E Tests | All features | Focus area only | Focus area only |
+| 4 — Environment Factory | Always | Yes | Skipped |
 
-Tests are written to `autonoma/qa-tests/{focus-slug}/` so they sit alongside your existing test suite without overwriting it. Running the ad hoc planner twice with different focus areas produces two separate subfolders.
+Tests are written to `autonoma/qa-tests/{focus-slug}/` so they sit alongside your existing test suite without overwriting it.
 
 ### Running multiple focus areas
 
-You can run the ad hoc planner multiple times for different topics, including simultaneously. Each run writes to its own subfolder and tracks its own generation ID file.
+Each focus area run writes to its own subfolder and tracks its own generation ID file. Multiple topics can run in parallel:
 
 ```
 autonoma/qa-tests/
@@ -121,7 +103,6 @@ autonoma/qa-tests/
 └── signatures-and-documents/ ← autonoma/.generation-id-signatures-and-documents
 ```
 
-
 ---
 
 ## Scenario Recipes
diff --git a/adhoc/.claude-plugin/plugin.json b/adhoc/.claude-plugin/plugin.json
deleted file mode 100644
index 1da7c50..0000000
--- a/adhoc/.claude-plugin/plugin.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "name": "autonoma-adhoc-planner",
-  "description": "Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.0.0",
-  "author": {
-    "name": "Autonoma"
-  }
-}
diff --git a/adhoc/hooks/hooks.json b/adhoc/hooks/hooks.json
deleted file mode 100644
index d694b5d..0000000
--- a/adhoc/hooks/hooks.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "hooks": {
-    "PostToolUse": [
-      {
-        "matcher": "Write",
-        "hooks": [
-          {
-            "type": "command",
-            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/validate-pipeline-output.sh"
-          }
-        ]
-      }
-    ]
-  }
-}
diff --git a/adhoc/hooks/validate-pipeline-output.sh b/adhoc/hooks/validate-pipeline-output.sh
deleted file mode 100755
index a33c95f..0000000
--- a/adhoc/hooks/validate-pipeline-output.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-# Validates ad hoc planner output files after Write tool use.
-# Exit 0 = allow (file is valid or not a pipeline file)
-# Exit 2 = block and send error message to Claude
-
-INPUT=$(cat)
-
-FILE_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tool_input',{}).get('file_path',''))" 2>/dev/null)
-
-if [ -z "$FILE_PATH" ]; then
-  exit 0
-fi
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-VALIDATORS_DIR="$SCRIPT_DIR/validators"
-
-PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-echo "$PLUGIN_ROOT" > /tmp/autonoma-plugin-root
-
-python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null
-
-case "$FILE_PATH" in
-  */autonoma/qa-tests/*/INDEX.md)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
-    VALIDATOR_NAME="validate-test-index"
-    ;;
-  */autonoma/qa-tests/*/[!I]*.md)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py"
-    VALIDATOR_NAME="validate-test-file"
-    ;;
-  *)
-    exit 0
-    ;;
-esac
-
-if [ ! -f "$FILE_PATH" ]; then
-  echo "VALIDATION FAILED [$VALIDATOR_NAME]: File does not exist: $FILE_PATH" >&2
-  exit 2
-fi
-
-if [ ! -s "$FILE_PATH" ]; then
-  echo "VALIDATION FAILED [$VALIDATOR_NAME]: File is empty: $FILE_PATH" >&2
-  exit 2
-fi
-
-if [ ! -f "$VALIDATOR_SCRIPT" ]; then
-  echo "VALIDATION FAILED [$VALIDATOR_NAME]: Validator script not found: $VALIDATOR_SCRIPT" >&2
-  exit 2
-fi
-
-RESULT=$(python3 "$VALIDATOR_SCRIPT" "$FILE_PATH" 2>&1)
-EXIT_CODE=$?
-
-if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then
-  echo "VALIDATION FAILED [$VALIDATOR_NAME]: $RESULT" >&2
-  exit 2
-fi
-
-if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then
-  DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py"
-  DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1)
-  DIR_EXIT=$?
-  if [ $DIR_EXIT -ne 0 ] || [ "$DIR_RESULT" != "OK" ]; then
-    echo "VALIDATION FAILED [validate-directory-structure]: $DIR_RESULT" >&2
-    exit 2
-  fi
-fi
-
-exit 0
diff --git a/adhoc/hooks/validators/validate_directory_structure.py b/adhoc/hooks/validators/validate_directory_structure.py
deleted file mode 100644
index 97d387f..0000000
--- a/adhoc/hooks/validators/validate_directory_structure.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-"""Validates that the ad hoc focus folder is properly populated.
-
-For the ad hoc planner the index lives at autonoma/qa-tests/{focus-slug}/INDEX.md.
-We check that the focus folder contains at least one test file besides INDEX.md,
-and that every subfolder declared in the index also has at least one .md file.
-"""
-import os
-import sys
-import glob as globmod
-import yaml
-
-filepath = sys.argv[1]  # autonoma/qa-tests/{focus-slug}/INDEX.md
-focus_dir = os.path.dirname(filepath)  # autonoma/qa-tests/{focus-slug}/
-
-# Parse the INDEX frontmatter to get declared folder names
-content = open(filepath).read()
-parts = content.split('---', 2)
-try:
-    fm = yaml.safe_load(parts[1]) if len(parts) >= 3 else {}
-except Exception:
-    fm = {}
-
-declared_folders = [f.get('name') for f in fm.get('folders', []) if isinstance(f, dict) and f.get('name')]
-
-# Focus folder must contain at least one test file (not INDEX.md)
-test_files = [f for f in globmod.glob(os.path.join(focus_dir, '**', '*.md'), recursive=True)
-              if os.path.basename(f) != 'INDEX.md']
-if not test_files:
-    print(f'Focus folder has no test files: {focus_dir}')
-    sys.exit(1)
-
-# Every declared subfolder must exist and contain at least one .md file
-for name in declared_folders:
-    subdir = os.path.join(focus_dir, name)
-    if not os.path.isdir(subdir):
-        print(f'Declared folder "{name}" does not exist: {subdir}')
-        sys.exit(1)
-    md_files = globmod.glob(os.path.join(subdir, '*.md'))
-    if not md_files:
-        print(f'Declared folder "{name}" has no .md files: {subdir}')
-        sys.exit(1)
-
-print('OK')
diff --git a/adhoc/hooks/validators/validate_test_file.py b/adhoc/hooks/validators/validate_test_file.py
deleted file mode 100644
index bea0726..0000000
--- a/adhoc/hooks/validators/validate_test_file.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python3
-"""Validates individual test file frontmatter format."""
-import sys
-import yaml
-
-filepath = sys.argv[1]
-content = open(filepath).read()
-
-if not content.startswith('---'):
-    print('File must start with YAML frontmatter (---)')
-    sys.exit(1)
-
-parts = content.split('---', 2)
-if len(parts) < 3:
-    print('Missing closing --- for frontmatter')
-    sys.exit(1)
-
-try:
-    fm = yaml.safe_load(parts[1])
-except Exception as e:
-    print(f'Invalid YAML in frontmatter: {e}')
-    sys.exit(1)
-
-if not isinstance(fm, dict):
-    print('Frontmatter must be a YAML mapping')
-    sys.exit(1)
-
-required = ['title', 'description', 'criticality', 'scenario', 'flow']
-missing = [f for f in required if f not in fm]
-if missing:
-    print(f'Missing required frontmatter fields: {missing}')
-    sys.exit(1)
-
-valid_criticality = {'critical', 'high', 'mid', 'low'}
-crit = fm.get('criticality')
-if crit not in valid_criticality:
-    print(f'criticality must be one of {valid_criticality}, got: {crit}')
-    sys.exit(1)
-
-for field in ['title', 'description', 'scenario', 'flow']:
-    val = fm.get(field)
-    if not isinstance(val, str) or len(val.strip()) == 0:
-        print(f'{field} must be a non-empty string')
-        sys.exit(1)
-
-print('OK')
diff --git a/adhoc/hooks/validators/validate_test_index.py b/adhoc/hooks/validators/validate_test_index.py
deleted file mode 100644
index 2c9ddd3..0000000
--- a/adhoc/hooks/validators/validate_test_index.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-"""Validates qa-tests/{focus-slug}/INDEX.md frontmatter format.
-
-For the ad hoc planner the index lives one level deeper than the main planner
-(autonoma/qa-tests/{focus-slug}/INDEX.md), so path calculations are adjusted
-accordingly. features.json cross-check is optional: skipped if the file does
-not exist (ad hoc runs do not require Step 1 to have completed first).
-"""
-import sys
-import os
-import json as jsonlib
-import yaml
-
-filepath = sys.argv[1]
-content = open(filepath).read()
-
-if not content.startswith('---'):
-    print('File must start with YAML frontmatter (---)')
-    sys.exit(1)
-
-parts = content.split('---', 2)
-if len(parts) < 3:
-    print('Missing closing --- for frontmatter')
-    sys.exit(1)
-
-try:
-    fm = yaml.safe_load(parts[1])
-except Exception as e:
-    print(f'Invalid YAML in frontmatter: {e}')
-    sys.exit(1)
-
-if not isinstance(fm, dict):
-    print('Frontmatter must be a YAML mapping')
-    sys.exit(1)
-
-required = ['total_tests', 'total_folders', 'folders', 'coverage_correlation']
-missing = [f for f in required if f not in fm]
-if missing:
-    print(f'Missing required frontmatter fields: {missing}')
-    sys.exit(1)
-
-tt = fm.get('total_tests')
-if not isinstance(tt, int) or tt < 1:
-    print('total_tests must be a positive integer')
-    sys.exit(1)
-
-tf = fm.get('total_folders')
-if not isinstance(tf, int) or tf < 1:
-    print('total_folders must be a positive integer')
-    sys.exit(1)
-
-folders = fm.get('folders')
-if not isinstance(folders, list) or len(folders) != tf:
-    print(f'folders list length ({len(folders) if isinstance(folders, list) else "N/A"}) must match total_folders ({tf})')
-    sys.exit(1)
-
-computed_total = 0
-for i, f in enumerate(folders):
-    if not isinstance(f, dict):
-        print(f'folders[{i}] must be a mapping')
-        sys.exit(1)
-    for field in ['name', 'description', 'test_count', 'critical', 'high', 'mid', 'low']:
-        if field not in f:
-            print(f'folders[{i}] missing required field: {field}')
-            sys.exit(1)
-    tc = f.get('test_count')
-    if not isinstance(tc, int) or tc < 1:
-        print(f'folders[{i}].test_count must be a positive integer')
-        sys.exit(1)
-    crit_sum = 0
-    for level in ['critical', 'high', 'mid', 'low']:
-        val = f.get(level)
-        if not isinstance(val, int) or val < 0:
-            print(f'folders[{i}].{level} must be a non-negative integer')
-            sys.exit(1)
-        crit_sum += val
-    if crit_sum != tc:
-        print(f'folders[{i}]: criticality counts ({crit_sum}) do not sum to test_count ({tc})')
-        sys.exit(1)
-    computed_total += tc
-
-if computed_total != tt:
-    print(f'Sum of folder test_counts ({computed_total}) does not match total_tests ({tt})')
-    sys.exit(1)
-
-cc = fm.get('coverage_correlation')
-if not isinstance(cc, dict):
-    print('coverage_correlation must be a mapping')
-    sys.exit(1)
-for field in ['routes_or_features', 'expected_test_range_min', 'expected_test_range_max']:
-    if field not in cc:
-        print(f'coverage_correlation missing required field: {field}')
-        sys.exit(1)
-
-rf = cc.get('routes_or_features')
-if not isinstance(rf, int) or rf < 1:
-    print('coverage_correlation.routes_or_features must be a positive integer')
-    sys.exit(1)
-
-tmin = cc.get('expected_test_range_min')
-tmax = cc.get('expected_test_range_max')
-if not isinstance(tmin, int) or not isinstance(tmax, int):
-    print('expected_test_range_min and expected_test_range_max must be integers')
-    sys.exit(1)
-if tmin > tmax:
-    print('expected_test_range_min must be <= expected_test_range_max')
-    sys.exit(1)
-if tt < tmin:
-    print(f'total_tests ({tt}) is below minimum ({tmin}) for {rf} routes/features. Too few tests — add more coverage.')
-    sys.exit(1)
-
-# Optional cross-check against features.json.
-# Path: autonoma/qa-tests/{focus-slug}/INDEX.md → up three levels → autonoma/features.json
-focus_dir = os.path.dirname(filepath)       # autonoma/qa-tests/{focus-slug}/
-qa_tests_dir = os.path.dirname(focus_dir)   # autonoma/qa-tests/
-autonoma_dir = os.path.dirname(qa_tests_dir)  # autonoma/
-features_path = os.path.join(autonoma_dir, 'features.json')
-
-if os.path.isfile(features_path):
-    try:
-        features_data = jsonlib.load(open(features_path))
-        feature_count = features_data.get('total_features', 0)
-        if feature_count > 0 and tt < feature_count * 2:
-            print(f'total_tests ({tt}) is too low for {feature_count} features in features.json. '
-                  f'Expected at least {feature_count * 2} tests (2 per feature).')
-            sys.exit(1)
-    except Exception:
-        pass  # malformed features.json — skip cross-check
-
-print('OK')
diff --git a/adhoc/agents/focused-test-case-generator.md b/agents/focused-test-case-generator.md
similarity index 54%
rename from adhoc/agents/focused-test-case-generator.md
rename to agents/focused-test-case-generator.md
index 1dea436..4cda3a5 100644
--- a/adhoc/agents/focused-test-case-generator.md
+++ b/agents/focused-test-case-generator.md
@@ -1,8 +1,8 @@
 ---
 description: >
-  Generates E2E test cases focused on a specific user-defined domain or feature area.
-  Reads knowledge base, scenarios, and existing tests to produce targeted, non-duplicating
-  test files with YAML frontmatter for deterministic validation.
+  Generates E2E test cases focused on a specific user-defined topic or feature area as markdown files from the knowledge base and scenarios.
+  Creates an INDEX.md with test distribution metadata and individual test files
+  with YAML frontmatter for deterministic validation.
 tools:
   - Read
   - Glob
@@ -16,21 +16,17 @@ maxTurns: 80
 
 # Focused E2E Test Case Generator
 
-You generate E2E test cases scoped to a specific domain or feature area.
-
-**Your primary directive is defined by the orchestrator and passed in the task description as `FOCUS_PROMPT`.** Every test you write must be relevant to that focus. Do not generate tests outside the requested scope.
-
-Your inputs are:
-- `FOCUS_PROMPT` — the user-defined focus (injected by the orchestrator in the task description)
-- `FOCUS_SLUG` — the output folder name (injected by the orchestrator)
-- `autonoma/AUTONOMA.md` (knowledge base with core flows) — if it exists
+You generate E2E test cases scoped to a specific topic or feature area as markdown files. Your inputs are:
+- `FOCUS_PROMPT` — the user-defined focus topic. **Every test you write must be relevant to this topic. Do not generate tests outside the requested scope.**
+- `FOCUS_SLUG` — the output folder name
+- `autonoma/AUTONOMA.md` (knowledge base with core flows in frontmatter) — if it exists
 - `autonoma/skills/` (skill files for navigation) — if they exist
-- `autonoma/scenarios.md` (test data scenarios) — if it exists
-- `EXISTING_TESTS` — a list of existing test titles/folders passed by the orchestrator (to avoid duplication)
+- `autonoma/scenarios.md` (test data scenarios with frontmatter) — if it exists
+- `EXISTING_TESTS` — a list of existing test titles (to avoid duplication) — if provided
 
 Your output is a directory `autonoma/qa-tests/{FOCUS_SLUG}/` containing:
 1. `INDEX.md` — index with test distribution metadata
-2. Individual test files organized in subdirectories by sub-feature
+2. Subdirectories organized by sub-feature within the focus area, each containing test files
 
 ## Instructions
 
@@ -40,27 +36,45 @@ Your output is a directory `autonoma/qa-tests/{FOCUS_SLUG}/` containing:
    and follow those instructions for how to generate tests — except scope all tests to the `FOCUS_PROMPT`.
 
 2. Read all available input files:
-   - `autonoma/AUTONOMA.md` — parse frontmatter for core_flows and feature_count (if exists)
-   - All files in `autonoma/skills/` (if exists)
-   - `autonoma/scenarios.md` — parse frontmatter for scenarios, entity_types, variable_fields (if exists)
-   - If neither `AUTONOMA.md` nor `scenarios.md` exists, scan the codebase for routes and features relevant to the focus area
+   - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count (if it exists)
+   - All files in `autonoma/skills/` (if they exist)
+   - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields** (if it exists)
+   - If neither `autonoma/AUTONOMA.md` nor `autonoma/scenarios.md` exists, scan the codebase for routes and features relevant to the focus area
+
+3. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter
+   declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has
+   a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`.
+   When writing test steps that involve a variable field value — typing it, asserting it, or
+   navigating to it — you MUST use the `{{token}}` placeholder, never the hardcoded literal from
+   the scenario body. At runtime the agent resolves these tokens to their actual values.
+
+   Example: if `variable_fields` includes `{{deadline_1}}` for `Tasks.deadline`:
+   - good: "assert the task deadline shows `{{deadline_1}}`"
+   - bad: "assert the task deadline shows 2025-06-15"
+
+4. Review the `EXISTING_TESTS` list provided (if any). Do not generate tests
+   whose title or purpose substantially duplicates an existing test.
 
-3. Review the `EXISTING_TESTS` list provided by the orchestrator. Do not generate tests whose title or
-   purpose substantially duplicates an existing test.
+5. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
+   The scenarios exist only to provide preconditions and known data for app behavior tests.
+   Do NOT generate tests whose purpose is to verify:
+   - that the scenario contains the documented entity counts
+   - that every scenario row, seed, or example value exists
+   - that the Environment Factory created data correctly
+   - that `standard`, `empty`, or `large` themselves are "correct" as artifacts
 
-4. **Variable fields** work exactly as in the main planner: if `variable_fields` are declared in
-   `scenarios.md`, use `{{token}}` placeholders for those fields in test steps — never hardcode the
-   literal value. If `scenarios.md` does not exist, write tests without scenario references.
+   Only reference scenario data when it is necessary to exercise a real user-facing flow within
+   the focus area.
 
-5. Focus strictly on the `FOCUS_PROMPT`. If the focus is "signatures and documents", only generate
-   tests that exercise signing flows, document management, signature edge cases, etc. Do not generate
-   unrelated tests just to fill a quota.
+6. Count the routes/features/pages in the codebase relevant to the focus area to establish the
+   coverage correlation. Focus strictly on what belongs to `FOCUS_PROMPT` — do not pad with
+   unrelated tests.
 
-6. Count the routes/features/pages in the codebase relevant to the focus area to establish coverage.
+7. Generate test files organized in subdirectories by sub-feature within the focus area.
 
-7. Write `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` FIRST (before individual test files).
+8. Write `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` FIRST (before individual test files).
 
-8. Write individual test files into subdirectories under `autonoma/qa-tests/{FOCUS_SLUG}/`.
+9. Write individual test files into subdirectories.
 
 ## CRITICAL: INDEX.md Format
 
@@ -110,8 +124,8 @@ coverage_correlation:
   - `critical`, `high`, `mid`, `low`: Count of tests at each criticality level. **Must sum to test_count.**
 - **coverage_correlation**: Explains why the test count makes sense for the focus area.
   - `routes_or_features`: Number of distinct routes/features relevant to the focus
-  - `expected_test_range_min`: Lower bound (routes_or_features * 3)
-  - `expected_test_range_max`: Upper bound (routes_or_features * 5, higher for core-heavy focus areas)
+  - `expected_test_range_min`: Lower bound of expected tests (routes_or_features * 3)
+  - `expected_test_range_max`: Upper bound of expected tests (routes_or_features * 5, or higher for core-heavy focus areas)
   - **total_tests must fall within [expected_test_range_min, expected_test_range_max]**
 
 ### After the INDEX frontmatter
@@ -142,27 +156,29 @@ flow: "Document Signing"
 - **criticality**: Exactly one of: `critical`, `high`, `mid`, `low`
 - **scenario**: Which scenario this test uses — `standard`, `empty`, or `large`. If `scenarios.md`
   does not exist, use `standard` as the default.
-- **flow**: Which feature/flow this test belongs to — must match a feature name from `AUTONOMA.md`
+- **flow**: Which feature/flow this test belongs to — must match a feature name from AUTONOMA.md
   frontmatter if that file exists, otherwise use a descriptive name for the focus sub-feature.
 
 ### After the test frontmatter
 
-Follow the standard Autonoma test format from the fetched instructions:
+The body follows the standard Autonoma test format from the fetched instructions:
 - **Setup**: Scenario reference and any preconditions
 - **Steps**: Numbered list using only: click, scroll, type, assert
 - **Expected Result**: What should be true when the test passes
 
 ## Test Distribution Guidelines
 
-- Focus budget entirely on the `FOCUS_PROMPT` domain — do not pad with unrelated tests
-- Within the focus area, apply the same criticality distribution as the main planner:
-  - Core sub-flows of the focus: mostly `critical` and `high`
+- Focus budget entirely on the `FOCUS_PROMPT` domain — every test must belong to the focus topic
+- Within the focus area, apply the following criticality distribution:
+  - Core sub-flows of the focus (from AUTONOMA.md where `core: true`, scoped to the topic): mostly `critical` and `high`
   - Supporting sub-flows: mostly `high` and `mid`
   - Settings/admin within the focus: mostly `mid` and `low`
 - Never write conditional steps — each test follows one deterministic path
 - Assertions must specify exact text, element, or visual state
-- Use `{{token}}` placeholders for variable fields; never hardcode dynamic values
-- Do not write meta-tests that verify scenario validity or Environment Factory correctness
+- Reference scenario data by exact values from scenarios.md, EXCEPT for variable fields — use `{{token}}` placeholders for those
+- Do not spend test budget "auditing" scenario contents. Scenario data is setup, not the product behavior under test.
+- Do not write meta-tests such as "verify the seeded counts match scenarios.md" or "verify the Environment Factory created the right fixtures"
+- If a seeded value is not needed for a user-facing flow within the focus area, do not assert it just because it exists in scenarios.md
 - Do not duplicate any test from `EXISTING_TESTS`
 
 ## Validation
diff --git a/adhoc/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
similarity index 84%
rename from adhoc/commands/generate-adhoc-tests.md
rename to commands/generate-adhoc-tests.md
index 3a570d9..03c80a1 100644
--- a/adhoc/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -3,13 +3,14 @@ name: generate-adhoc-tests
 description: >
   Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline.
   Each step runs in an isolated subagent and must pass deterministic validation before the next
-  step begins. Steps 1, 2, and 4 run as normal; Step 3 scopes test generation to the requested
-  topic. Use when you want targeted test coverage for a specific feature area.
+  step begins. When scenarios already exist in Autonoma, fetches context from the API and runs only
+  Step 3 scoped to the topic. On a first run, executes the full 4-step pipeline with Step 3 focused.
+  Use when you want targeted test coverage for a specific feature or domain.
 ---
 
-# Autonoma Ad Hoc E2E Test Generation Pipeline
+# Autonoma Focused E2E Test Generation Pipeline
 
-You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a focused test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
@@ -78,6 +79,73 @@ echo "Generation ID: $GENERATION_ID"
 
 If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
 
+## Checking Existing Setup
+
+Check whether scenarios with active recipes already exist in Autonoma for this application:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+HAS_SCENARIOS="no"
+SCENARIOS_RESPONSE=""
+if [ -n "$GENERATION_ID" ]; then
+  SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+active = [s for s in data.get('scenarios', []) if s.get('hasActiveRecipe')]
+print('yes' if active else 'no')
+" 2>/dev/null || echo "no")
+fi
+echo "Has active scenarios: $HAS_SCENARIOS"
+```
+
+**If `HAS_SCENARIOS=yes`** — scenarios and tests already exist. Fetch context from the API and run only Step 3:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/existing-tests" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+
+SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+lines = ['## Available Scenarios', '']
+for s in data.get('scenarios', []):
+    status = 'active' if s.get('hasActiveRecipe') else 'no recipe'
+    lines.append(f\"- **{s['name']}** ({status})\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+
+TESTS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+tests = data.get('tests', [])
+lines = [f'## Existing Tests ({len(tests)} total)', '']
+for t in tests:
+    lines.append(f\"- {t['name']} (slug: {t['slug']})\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+
+SKILLS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+skills = data.get('skills', [])
+lines = [f'## Available Skills ({len(skills)} total)', '']
+for s in skills:
+    lines.append(f\"- {s['name']}: {s['description']}\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+```
+
+Skip to **Step 3: Generate Focused E2E Test Cases** and pass the fetched context inline in the subagent task — do not run Steps 1, 2, or 4.
+
+**If `HAS_SCENARIOS=no`** — this is a first run. Continue with the full pipeline below (Steps 1 through 4).
+
+---
+
 ## Step 1: Generate Knowledge Base
 
 Report step start:
@@ -251,29 +319,36 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Generating focused E2E test cases from knowledge base and scenarios..."}}' || true
+  -d '{"type":"log","data":{"message":"Generating focused E2E test cases..."}}' || true
 ```
 
 Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual
-values for FOCUS_PROMPT and FOCUS_SLUG before spawning):
+values for FOCUS_PROMPT, FOCUS_SLUG, and — when coming from the API-fetch path — the context
+variables SCENARIOS_CONTEXT, TESTS_CONTEXT, and SKILLS_CONTEXT before spawning):
 
 > **FOCUS_PROMPT**: <the user's focus description>
 > **FOCUS_SLUG**: <kebab-case slug>
 >
+> *(API-fetch path only — omit this block when running the full pipeline)*
+> Context fetched from the Autonoma API (use this instead of reading local files):
+> <SCENARIOS_CONTEXT>
+> <TESTS_CONTEXT>
+> <SKILLS_CONTEXT>
+>
 > Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
-> and scenarios from `autonoma/scenarios.md`.
+> and scenarios from `autonoma/scenarios.md` (if they exist and no inline context was provided above).
 > Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
 > Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
 > You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
 > total_tests, total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> Treat scenario data as fixture input only. Do not generate tests whose purpose is to verify
 > scenario counts, seeded inventories, or Environment Factory correctness. Only reference
 > scenario data when it is needed to test a real user-facing app behavior within the focus area.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
-1. Verify `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` exists and is non-empty
+1. Verify `autonoma/qa-tests/${FOCUS_SLUG}/INDEX.md` exists and is non-empty
 2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
 3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
 
@@ -318,9 +393,11 @@ print(json.dumps({'testCases': test_cases}))
 
 4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes", "Done — skip Step 4 (scenarios already exist)"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4 (or stop here if coming from the API-fetch path).
+
+If coming from the **API-fetch path** (scenarios already existed), stop here after uploading. Step 4 is not needed.
 
 ## Step 4: Environment Factory
 
@@ -394,7 +471,6 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
   -H "Content-Type: application/json" \
   -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
 if [ -n "$GENERATION_ID" ]; then
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
   if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
     echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
@@ -411,35 +487,11 @@ if [ -n "$GENERATION_ID" ]; then
     echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
     exit 1
   fi
-
-  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
-  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
-  if [ "$VERIFY_STATUS" != "200" ]; then
-    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
-    exit 1
-  fi
-  EXPECTED_NAMES=$(python3 -c "import json; data=json.load(open('$RECIPE_PATH')); print('\n'.join(r['name'] for r in data['recipes']))")
-  MISSING=""
-  for NAME in $EXPECTED_NAMES; do
-    HAS_ACTIVE=$(echo "$VERIFY_BODY" | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
-match = [s for s in data.get('scenarios', []) if s['name'] == '$NAME' and s.get('hasActiveRecipe')]
-print('yes' if match else 'no')
-" 2>/dev/null || echo "no")
-    if [ "$HAS_ACTIVE" != "yes" ]; then
-      MISSING="$MISSING $NAME"
-    fi
-  done
-  if [ -n "$MISSING" ]; then
-    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
-    echo "Step 4 cannot complete. Recipe upload may have partially failed."
-    exit 1
-  fi
-  echo "Verified: all scenario recipes persisted successfully on the dashboard."
 fi
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
@@ -450,7 +502,7 @@ fi
 
 After all steps complete, summarize:
 - **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`)
-- **Step 1**: Knowledge base location and core flow count
-- **Step 2**: Scenario count and entity types covered
+- **Step 1**: Knowledge base location and core flow count *(full pipeline only)*
+- **Step 2**: Scenario count and entity types covered *(full pipeline only)*
 - **Step 3**: Total focused test count, folder breakdown, coverage correlation
-- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results *(full pipeline only)*
diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh
index dd7e3ec..6485484 100755
--- a/hooks/validate-pipeline-output.sh
+++ b/hooks/validate-pipeline-output.sh
@@ -50,6 +50,10 @@ case "$FILE_PATH" in
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
     VALIDATOR_NAME="validate-test-index"
     ;;
+  */autonoma/qa-tests/*/INDEX.md)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
+    VALIDATOR_NAME="validate-adhoc-test-index"
+    ;;
   */autonoma/qa-tests/*/[!I]*.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py"
     VALIDATOR_NAME="validate-test-file"
@@ -104,7 +108,8 @@ if [ "$VALIDATOR_NAME" = "validate-scenario-recipes" ]; then
   fi
 fi
 
-# For INDEX.md, also validate directory structure
+# For root INDEX.md only, also validate directory structure
+# (subfolder INDEX.md from adhoc runs uses validate-adhoc-test-index and skips this check)
 if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then
   DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py"
   DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1)
diff --git a/adhoc/skills/generate-adhoc-tests/SKILL.md b/skills/generate-adhoc-tests/SKILL.md
similarity index 84%
rename from adhoc/skills/generate-adhoc-tests/SKILL.md
rename to skills/generate-adhoc-tests/SKILL.md
index c0e55e7..03c80a1 100644
--- a/adhoc/skills/generate-adhoc-tests/SKILL.md
+++ b/skills/generate-adhoc-tests/SKILL.md
@@ -1,15 +1,16 @@
 ---
 name: generate-adhoc-tests
 description: >
-  Generates focused E2E test cases for a codebase with a user-defined topic through a validated multi-step pipeline.
+  Generates focused E2E test cases for a user-defined topic through a validated multi-step pipeline.
   Each step runs in an isolated subagent and must pass deterministic validation before the next
-  step begins. Steps 1, 2, and 4 run as normal; Step 3 scopes test generation to the requested
-  topic. Use when you want targeted test coverage for a specific feature area.
+  step begins. When scenarios already exist in Autonoma, fetches context from the API and runs only
+  Step 3 scoped to the topic. On a first run, executes the full 4-step pipeline with Step 3 focused.
+  Use when you want targeted test coverage for a specific feature or domain.
 ---
 
-# Autonoma Ad Hoc E2E Test Generation Pipeline
+# Autonoma Focused E2E Test Generation Pipeline
 
-You are orchestrating a 4-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a focused test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
@@ -78,6 +79,73 @@ echo "Generation ID: $GENERATION_ID"
 
 If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
 
+## Checking Existing Setup
+
+Check whether scenarios with active recipes already exist in Autonoma for this application:
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+HAS_SCENARIOS="no"
+SCENARIOS_RESPONSE=""
+if [ -n "$GENERATION_ID" ]; then
+  SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+  HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+active = [s for s in data.get('scenarios', []) if s.get('hasActiveRecipe')]
+print('yes' if active else 'no')
+" 2>/dev/null || echo "no")
+fi
+echo "Has active scenarios: $HAS_SCENARIOS"
+```
+
+**If `HAS_SCENARIOS=yes`** — scenarios and tests already exist. Fetch context from the API and run only Step 3:
+
+```bash
+AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
+GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
+
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/existing-tests" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+
+SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+lines = ['## Available Scenarios', '']
+for s in data.get('scenarios', []):
+    status = 'active' if s.get('hasActiveRecipe') else 'no recipe'
+    lines.append(f\"- **{s['name']}** ({status})\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+
+TESTS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+tests = data.get('tests', [])
+lines = [f'## Existing Tests ({len(tests)} total)', '']
+for t in tests:
+    lines.append(f\"- {t['name']} (slug: {t['slug']})\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+
+SKILLS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
+import json, sys
+data = json.loads(sys.stdin.read())
+skills = data.get('skills', [])
+lines = [f'## Available Skills ({len(skills)} total)', '']
+for s in skills:
+    lines.append(f\"- {s['name']}: {s['description']}\")
+print('\n'.join(lines))
+" 2>/dev/null || echo "")
+```
+
+Skip to **Step 3: Generate Focused E2E Test Cases** and pass the fetched context inline in the subagent task — do not run Steps 1, 2, or 4.
+
+**If `HAS_SCENARIOS=no`** — this is a first run. Continue with the full pipeline below (Steps 1 through 4).
+
+---
+
 ## Step 1: Generate Knowledge Base
 
 Report step start:
@@ -251,29 +319,36 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
-  -d '{"type":"log","data":{"message":"Generating focused E2E test cases from knowledge base and scenarios..."}}' || true
+  -d '{"type":"log","data":{"message":"Generating focused E2E test cases..."}}' || true
 ```
 
 Spawn the `focused-test-case-generator` subagent with the following task (substitute the actual
-values for FOCUS_PROMPT and FOCUS_SLUG before spawning):
+values for FOCUS_PROMPT, FOCUS_SLUG, and — when coming from the API-fetch path — the context
+variables SCENARIOS_CONTEXT, TESTS_CONTEXT, and SKILLS_CONTEXT before spawning):
 
 > **FOCUS_PROMPT**: <the user's focus description>
 > **FOCUS_SLUG**: <kebab-case slug>
 >
+> *(API-fetch path only — omit this block when running the full pipeline)*
+> Context fetched from the Autonoma API (use this instead of reading local files):
+> <SCENARIOS_CONTEXT>
+> <TESTS_CONTEXT>
+> <SKILLS_CONTEXT>
+>
 > Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
-> and scenarios from `autonoma/scenarios.md`.
+> and scenarios from `autonoma/scenarios.md` (if they exist and no inline context was provided above).
 > Generate E2E test cases focused exclusively on the topic described in FOCUS_PROMPT.
 > Write tests to `autonoma/qa-tests/{FOCUS_SLUG}/`.
 > You MUST create `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` with frontmatter containing
 > total_tests, total_folders, folder breakdown, and coverage_correlation.
 > Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
+> Treat scenario data as fixture input only. Do not generate tests whose purpose is to verify
 > scenario counts, seeded inventories, or Environment Factory correctness. Only reference
 > scenario data when it is needed to test a real user-facing app behavior within the focus area.
 > Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
 
 **After the subagent completes:**
-1. Verify `autonoma/qa-tests/{FOCUS_SLUG}/INDEX.md` exists and is non-empty
+1. Verify `autonoma/qa-tests/${FOCUS_SLUG}/INDEX.md` exists and is non-empty
 2. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
 3. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
 
@@ -318,9 +393,11 @@ print(json.dumps({'testCases': test_cases}))
 
 4. **If `AUTONOMA_AUTO_ADVANCE` is not `true`:** Call `AskUserQuestion` with:
    - question: "Does this focused test distribution look correct? The tests should cover the requested topic thoroughly."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
+   - options: ["Yes, proceed to Step 4", "I want to suggest changes", "Done — skip Step 4 (scenarios already exist)"]
    Wait for the user's response before proceeding.
-   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4.
+   **If `AUTONOMA_AUTO_ADVANCE=true`:** Skip the prompt and proceed directly to Step 4 (or stop here if coming from the API-fetch path).
+
+If coming from the **API-fetch path** (scenarios already existed), stop here after uploading. Step 4 is not needed.
 
 ## Step 4: Environment Factory
 
@@ -394,7 +471,6 @@ echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
   -H "Content-Type: application/json" \
   -d '{"type":"log","data":{"message":"Uploading validated scenario recipes to setup..."}}' || true
 if [ -n "$GENERATION_ID" ]; then
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
   if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
     echo "ERROR: scenario-recipes.json is not valid JSON. Step 4 cannot complete."
@@ -411,35 +487,11 @@ if [ -n "$GENERATION_ID" ]; then
     echo "ERROR: Recipe upload failed (HTTP $UPLOAD_STATUS). Step 4 cannot complete."
     exit 1
   fi
-
-  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
-  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
-  if [ "$VERIFY_STATUS" != "200" ]; then
-    echo "ERROR: Failed to verify scenarios (HTTP $VERIFY_STATUS). Step 4 cannot complete."
-    exit 1
-  fi
-  EXPECTED_NAMES=$(python3 -c "import json; data=json.load(open('$RECIPE_PATH')); print('\n'.join(r['name'] for r in data['recipes']))")
-  MISSING=""
-  for NAME in $EXPECTED_NAMES; do
-    HAS_ACTIVE=$(echo "$VERIFY_BODY" | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
-match = [s for s in data.get('scenarios', []) if s['name'] == '$NAME' and s.get('hasActiveRecipe')]
-print('yes' if match else 'no')
-" 2>/dev/null || echo "no")
-    if [ "$HAS_ACTIVE" != "yes" ]; then
-      MISSING="$MISSING $NAME"
-    fi
-  done
-  if [ -n "$MISSING" ]; then
-    echo "ERROR: The following scenarios are missing or lack an active recipe on the dashboard:$MISSING"
-    echo "Step 4 cannot complete. Recipe upload may have partially failed."
-    exit 1
-  fi
-  echo "Verified: all scenario recipes persisted successfully on the dashboard."
 fi
+[ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"log","data":{"message":"Environment Factory implementation and scenario validation completed."}}' || true
 [ -n "$GENERATION_ID" ] && curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
   -H "Content-Type: application/json" \
@@ -450,7 +502,7 @@ fi
 
 After all steps complete, summarize:
 - **Focus**: The user-defined topic and output location (`autonoma/qa-tests/{FOCUS_SLUG}/`)
-- **Step 1**: Knowledge base location and core flow count
-- **Step 2**: Scenario count and entity types covered
+- **Step 1**: Knowledge base location and core flow count *(full pipeline only)*
+- **Step 2**: Scenario count and entity types covered *(full pipeline only)*
 - **Step 3**: Total focused test count, folder breakdown, coverage correlation
-- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results
+- **Step 4**: Environment Factory location, backend changes, smoke-test results, required secrets, and per-scenario lifecycle results *(full pipeline only)*

From aede7f62470157f793a64fc33671c0abedc358a2 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Mon, 20 Apr 2026 22:20:09 -0300
Subject: [PATCH 23/33] fix: point marketplace ref to
 chiara-ciriani/adhoc-planner for testing

---
 .claude-plugin/marketplace.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 08cf736..1372453 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -12,7 +12,7 @@
       "source": {
         "source": "github",
         "repo": "Autonoma-AI/test-planner-plugin",
-        "ref": "production"
+        "ref": "chiara-ciriani/adhoc-planner"
       },
       "description": "Generates comprehensive E2E test cases through a validated 4-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands."
     },

From 98baa7559629d921a97316717d46202c7e80cd25 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 16:21:47 -0300
Subject: [PATCH 24/33] fix: update EXISTING_CONTEXT URL

---
 commands/generate-adhoc-tests.md     | 2 +-
 skills/generate-adhoc-tests/SKILL.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
index 03c80a1..aaab97e 100644
--- a/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -106,7 +106,7 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/existing-tests" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
 SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
diff --git a/skills/generate-adhoc-tests/SKILL.md b/skills/generate-adhoc-tests/SKILL.md
index 03c80a1..aaab97e 100644
--- a/skills/generate-adhoc-tests/SKILL.md
+++ b/skills/generate-adhoc-tests/SKILL.md
@@ -106,7 +106,7 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/existing-tests" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
 SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "

From 542f758baf83e7fb12fcc87ce76ebfa4966321bd Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 16:31:41 -0300
Subject: [PATCH 25/33] fix: change getTestSuiteForSetup to
 getTestSuiteForApplication

---
 commands/generate-adhoc-tests.md     | 2 +-
 skills/generate-adhoc-tests/SKILL.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
index aaab97e..4f6dd14 100644
--- a/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -106,7 +106,7 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/test-suite" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
 SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
diff --git a/skills/generate-adhoc-tests/SKILL.md b/skills/generate-adhoc-tests/SKILL.md
index aaab97e..4f6dd14 100644
--- a/skills/generate-adhoc-tests/SKILL.md
+++ b/skills/generate-adhoc-tests/SKILL.md
@@ -106,7 +106,7 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/test-suite" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
 SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "

From 936bedada2cc0a6d3b3d5f8942c7915213a4eb21 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 17:29:57 -0300
Subject: [PATCH 26/33] feat: add APPLICATION_ID in generate-adhoc-tests

---
 commands/generate-adhoc-tests.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
index 4f6dd14..8adbfb1 100644
--- a/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -72,9 +72,11 @@ HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
+APPLICATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('applicationId',''))" 2>/dev/null || echo "$AUTONOMA_PROJECT_ID")
 mkdir -p autonoma
 echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}"
 echo "Generation ID: $GENERATION_ID"
+echo "Application ID: $APPLICATION_ID"
 ```
 
 If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
@@ -106,7 +108,7 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${APPLICATION_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
 SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "

From 01402e2b1f841019361f15181ce3ed13e29b5820 Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 17:47:54 -0300
Subject: [PATCH 27/33] feat: update generate-adhoc-tests.md

---
 commands/generate-adhoc-tests.md | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
index 8adbfb1..59e68d1 100644
--- a/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -72,11 +72,9 @@ HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
 BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
 echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
 GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
-APPLICATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('applicationId',''))" 2>/dev/null || echo "$AUTONOMA_PROJECT_ID")
 mkdir -p autonoma
 echo "$GENERATION_ID" > "autonoma/.generation-id-${FOCUS_SLUG}"
 echo "Generation ID: $GENERATION_ID"
-echo "Application ID: $APPLICATION_ID"
 ```
 
 If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway — reporting is best-effort and must never block test generation.
@@ -85,20 +83,15 @@ If `GENERATION_ID` is empty, log the HTTP status and response body above for deb
 
 Check whether scenarios with active recipes already exist in Autonoma for this application:
 ```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
-HAS_SCENARIOS="no"
-SCENARIOS_RESPONSE=""
-if [ -n "$GENERATION_ID" ]; then
-  SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
-  HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+SCENARIOS_RESPONSE=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/scenarios" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+HAS_SCENARIOS=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
 import json, sys
 data = json.loads(sys.stdin.read())
 active = [s for s in data.get('scenarios', []) if s.get('hasActiveRecipe')]
 print('yes' if active else 'no')
 " 2>/dev/null || echo "no")
-fi
+echo "$SCENARIOS_RESPONSE" > /tmp/autonoma-scenarios-response.json
 echo "Has active scenarios: $HAS_SCENARIOS"
 ```
 
@@ -108,10 +101,10 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${APPLICATION_ID}/test-suite" \
+EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
   -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
 
-SCENARIOS_CONTEXT=$(echo "$SCENARIOS_RESPONSE" | python3 -c "
+SCENARIOS_CONTEXT=$(cat /tmp/autonoma-scenarios-response.json 2>/dev/null | python3 -c "
 import json, sys
 data = json.loads(sys.stdin.read())
 lines = ['## Available Scenarios', '']

From e7b4dcd53a790ad4b1f4dc2f8009161cdcbcb75c Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 21:37:05 -0300
Subject: [PATCH 28/33] feat: generate-adhoc-tests.md improvements

---
 commands/generate-adhoc-tests.md | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/commands/generate-adhoc-tests.md b/commands/generate-adhoc-tests.md
index 59e68d1..da61432 100644
--- a/commands/generate-adhoc-tests.md
+++ b/commands/generate-adhoc-tests.md
@@ -101,12 +101,13 @@ echo "Has active scenarios: $HAS_SCENARIOS"
 AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
 GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id-${FOCUS_SLUG}" 2>/dev/null || echo '')
 
-EXISTING_CONTEXT=$(curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
+curl -s "${AUTONOMA_API_URL}/v1/setup/applications/${AUTONOMA_PROJECT_ID}/test-suite" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" > /tmp/autonoma-test-suite.json
 
-SCENARIOS_CONTEXT=$(cat /tmp/autonoma-scenarios-response.json 2>/dev/null | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
+SCENARIOS_CONTEXT=$(python3 -c "
+import json
+with open('/tmp/autonoma-scenarios-response.json') as f:
+    data = json.load(f)
 lines = ['## Available Scenarios', '']
 for s in data.get('scenarios', []):
     status = 'active' if s.get('hasActiveRecipe') else 'no recipe'
@@ -114,9 +115,10 @@ for s in data.get('scenarios', []):
 print('\n'.join(lines))
 " 2>/dev/null || echo "")
 
-TESTS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
+TESTS_CONTEXT=$(python3 -c "
+import json
+with open('/tmp/autonoma-test-suite.json') as f:
+    data = json.load(f)
 tests = data.get('tests', [])
 lines = [f'## Existing Tests ({len(tests)} total)', '']
 for t in tests:
@@ -124,9 +126,10 @@ for t in tests:
 print('\n'.join(lines))
 " 2>/dev/null || echo "")
 
-SKILLS_CONTEXT=$(echo "$EXISTING_CONTEXT" | python3 -c "
-import json, sys
-data = json.loads(sys.stdin.read())
+SKILLS_CONTEXT=$(python3 -c "
+import json
+with open('/tmp/autonoma-test-suite.json') as f:
+    data = json.load(f)
 skills = data.get('skills', [])
 lines = [f'## Available Skills ({len(skills)} total)', '']
 for s in skills:

From e8fd9b5a7b46a1947f6f5b715242b7f8adbc0cac Mon Sep 17 00:00:00 2001
From: chiara-ciriani <chiara@autonoma.app>
Date: Tue, 21 Apr 2026 21:51:35 -0300
Subject: [PATCH 29/33] fix: update marketplace.json with correct ref

---
 .claude-plugin/marketplace.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 098fea4..37dd33b 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -12,7 +12,7 @@
       "source": {
         "source": "github",
         "repo": "Autonoma-AI/test-planner-plugin",
-        "ref": "chiara-ciriani/adhoc-planner"
+        "ref": "production"
       },
       "description": "Generates comprehensive E2E test cases through a validated multi-step pipeline with deterministic validation. Includes generate-tests (full suite) and generate-adhoc-tests (focused topic) commands."
     },

From 4470971fe1c53b05efa58bafc4871a24267994de Mon Sep 17 00:00:00 2001
From: tomaspiaggio <tomas.piaggio12@gmail.com>
Date: Tue, 21 Apr 2026 18:28:31 -0700
Subject: [PATCH 30/33] docs(agent): enforce backend discovery + no-sidecar
 rule in sdk-integrator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Strengthens the Strict Rules and rewrites Step 1 so the agent locates
the project's existing backend — and matches the SDK to its language —
before installing anything.

Strict Rules additions:
- Never create a standalone server / sidecar (no new FastAPI/express/
  Flask/Gin instance, no root-level `start-*.py`/`main.go` launcher,
  no separate port). Integrate as a new route inside the existing
  backend.
- SDK language MUST match backend language. Don't install the Python
  SDK into a TS/NestJS project or vice versa. If no matching SDK
  exists, stop per Step 3 — never fall back to a different-language
  sidecar.
- Never scaffold at repo root when a backend directory exists,
  including non-standard names like `core-app-backend/`, `apps/api/`,
  `services/core/`.

Step 1 rewrite:
- 1a: enumerate candidate backend directories via Glob/ls with a
  broad list (backend/, server/, api/, *-backend/, core-*/, apps/*,
  services/*, packages/*, root) — no hardcoded `backend/`.
- 1b: identify the backend by manifest file (package.json,
  pyproject.toml, go.mod, pom.xml/build.gradle, Cargo.toml, Gemfile,
  composer.json, mix.exs) and derive the language from it.
- 1c: state-and-confirm with the user before writing any code or
  installing packages.
- 1d: determine framework / ORM / package manager from inside the
  identified backend.

Motivation: recent runs installed the Python SDK and created a
standalone FastAPI sidecar next to a NestJS/TypeScript backend
because discovery assumed `backend/` as the directory name and the
SDK was picked before the backend language was detected.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 agents/sdk-integrator.md | 51 +++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/agents/sdk-integrator.md b/agents/sdk-integrator.md
index a0d47b7..addd455 100644
--- a/agents/sdk-integrator.md
+++ b/agents/sdk-integrator.md
@@ -28,6 +28,9 @@ The SDK reference repo path is provided by the orchestrator in `/tmp/autonoma-sd
 ## Strict Rules
 
 - Install the SDK from package managers only. Never vendor, copy, or link SDK source into the user's app.
+- **Never create a standalone server or sidecar.** The endpoint lives as a new route inside the project's existing backend. Do NOT create a new `FastAPI()` / `express()` / `Flask(__name__)` / `gin.Default()` instance, a new `main.py` / `server.py` / `start-*.py` / `main.go` launcher, or open a separate port. If the project already has a backend, integrate into it.
+- **SDK language must match backend language.** Detect the backend's language from its manifest file BEFORE picking an SDK. Do not install the Python SDK into a TypeScript/NestJS project (or vice versa). If no matching SDK exists for the backend language, stop per Step 3 — do NOT fall back to a sidecar in a different language.
+- **Never scaffold at repo root when a backend directory exists**, including non-standard names like `core-app-backend/`, `apps/api/`, `services/core/`. Locate the backend first (Step 1).
 - Do NOT modify the SDK reference repo.
 - Do NOT modify database schemas, migrations, or models.
 - Keep integration changes minimal and aligned with the project's existing conventions.
@@ -38,19 +41,45 @@ The SDK reference repo path is provided by the orchestrator in `/tmp/autonoma-sd
 
 ## Required Order
 
-### 1. Detect the stack
+### 1. Locate the backend directory and detect the stack
 
-Inspect the repo for:
-- `package.json`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock`
-- `pyproject.toml`, `requirements.txt`, `Pipfile`
-- `mix.exs`
-- `composer.json`
-- `pom.xml`, `build.gradle`
-- `Gemfile`
-- `Cargo.toml`
-- `go.mod`
+**Do this BEFORE picking an SDK.** The SDK must match the backend's language, so the backend must be located first.
 
-Determine:
+#### 1a. Enumerate candidate backend directories
+
+Use Glob / `ls`. Do NOT hardcode the name `backend/`. Real projects use many conventions:
+
+- `backend/`, `server/`, `api/`, `service/`, `services/`
+- `*-backend/`, `*-api/`, `*-server/`, `core-*/`, `app-*/` (e.g. `core-app-backend/`)
+- Monorepo layouts: `apps/*`, `packages/*`, `services/*`
+- Single-repo backends at the workspace root
+
+#### 1b. Identify the backend by manifest file
+
+For each candidate, look for one of these manifest files or lockfiles — the ecosystem the file belongs to determines the language of the SDK you install:
+
+- `package.json`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock` → TypeScript/JavaScript
+- `pyproject.toml`, `requirements.txt`, `Pipfile` → Python
+- `mix.exs` → Elixir
+- `composer.json` → PHP
+- `pom.xml`, `build.gradle` → Java
+- `Gemfile`, `*.gemspec` → Ruby
+- `Cargo.toml` → Rust
+- `go.mod` → Go
+
+Pick exactly one backend. If multiple plausible candidates exist, STOP and ask the user which one — do not guess, do not implement in more than one.
+
+#### 1c. Confirm with the user before writing any code
+
+State your finding (work out the framework, ORM, and package manager per 1d first, so the statement is complete):
+
+> "I found the backend at `<path>` (language: `<lang>`, framework: `<framework>`, ORM: `<orm>`, package manager: `<pm>`). I'll integrate the SDK there. Is that the right location?"
+
+Wait for confirmation before installing packages or writing files.
+
+#### 1d. Determine the rest of the stack
+
+From the identified backend directory, determine:
 - language
 - server framework
 - ORM or DB adapter

From 85719b54c9e23b7d6430cc3e14402405068e4141 Mon Sep 17 00:00:00 2001
From: Tom Piaggio <tomas.piaggio12@gmail.com>
Date: Wed, 22 Apr 2026 18:31:33 -0700
Subject: [PATCH 31/33] feat: scenarios v2.1 (#31)

---
 .claude-plugin/plugin.json                    |   2 +-
 agents/entity-audit-generator.md              | 241 ++++++
 agents/env-factory-generator.md               | 710 ++++++++++++++++
 agents/kb-generator.md                        |  30 +-
 agents/scenario-generator.md                  | 200 ++---
 agents/scenario-validator.md                  | 450 +++++-----
 agents/sdk-integrator.md                      | 301 -------
 agents/test-case-generator.md                 |  37 +-
 commands/generate-tests.md                    | 767 ++++--------------
 hooks/hooks.json                              |  22 +-
 hooks/pipeline-kickoff.sh                     | 111 +++
 hooks/pretool-heartbeat.sh                    |  80 ++
 hooks/transcript-streamer.py                  | 228 ++++++
 hooks/validate-pipeline-output.sh             | 291 +++++--
 hooks/validators/_audit_schema.py             |  67 ++
 hooks/validators/evals/README.md              |  53 ++
 .../fixtures/bad_audit_rewrite_only.json      |  12 +
 .../evals/fixtures/bad_missing_owner.json     |   7 +
 .../fixtures/bad_raw_orm_in_factory.json      |  12 +
 .../bad_stub_helper_in_handler_dir.json       |  12 +
 .../evals/fixtures/dependent_skipped.json     |   7 +
 .../fixtures/dual_judged_on_standalone.json   |  13 +
 .../framework_hook_extraction_pass.json       |  12 +
 .../framework_hook_raw_write_fail.json        |  12 +
 .../good_thin_wrapper_after_extraction.json   |  12 +
 .../evals/fixtures/good_uses_service.json     |  12 +
 .../fixtures/helper_unresolvable_errors.json  |  12 +
 hooks/validators/evals/run_evals.py           | 209 +++++
 .../validate_creation_file_immutable.py       | 112 +++
 .../validate_endpoint_implemented.py          | 451 ++++++++++
 hooks/validators/validate_entity_audit.py     | 172 ++++
 hooks/validators/validate_factory_fidelity.py | 585 +++++++++++++
 hooks/validators/validate_scenarios.py        |  87 +-
 hooks/validators/validate_sdk_integration.py  | 113 ---
 skills/generate-tests/SKILL.md                | 767 ++++--------------
 tests/test_validate_pipeline_output.py        | 321 --------
 tests/test_validate_scenarios.py              | 140 +---
 tests/test_validate_sdk_integration.py        |  79 --
 38 files changed, 4137 insertions(+), 2612 deletions(-)
 create mode 100644 agents/entity-audit-generator.md
 create mode 100644 agents/env-factory-generator.md
 delete mode 100644 agents/sdk-integrator.md
 create mode 100755 hooks/pipeline-kickoff.sh
 create mode 100755 hooks/pretool-heartbeat.sh
 create mode 100755 hooks/transcript-streamer.py
 create mode 100644 hooks/validators/_audit_schema.py
 create mode 100644 hooks/validators/evals/README.md
 create mode 100644 hooks/validators/evals/fixtures/bad_audit_rewrite_only.json
 create mode 100644 hooks/validators/evals/fixtures/bad_missing_owner.json
 create mode 100644 hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json
 create mode 100644 hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json
 create mode 100644 hooks/validators/evals/fixtures/dependent_skipped.json
 create mode 100644 hooks/validators/evals/fixtures/dual_judged_on_standalone.json
 create mode 100644 hooks/validators/evals/fixtures/framework_hook_extraction_pass.json
 create mode 100644 hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json
 create mode 100644 hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json
 create mode 100644 hooks/validators/evals/fixtures/good_uses_service.json
 create mode 100644 hooks/validators/evals/fixtures/helper_unresolvable_errors.json
 create mode 100755 hooks/validators/evals/run_evals.py
 create mode 100755 hooks/validators/validate_creation_file_immutable.py
 create mode 100755 hooks/validators/validate_endpoint_implemented.py
 create mode 100644 hooks/validators/validate_entity_audit.py
 create mode 100755 hooks/validators/validate_factory_fidelity.py
 delete mode 100644 hooks/validators/validate_sdk_integration.py
 delete mode 100644 tests/test_validate_pipeline_output.py
 delete mode 100644 tests/test_validate_sdk_integration.py

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 2de57c6..e43d1e1 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "autonoma-test-planner",
   "description": "Generates comprehensive E2E test cases for a codebase through a validated multi-step pipeline with deterministic validation at each step",
-  "version": "1.2.1",
+  "version": "1.13.1",
   "author": {
     "name": "Autonoma"
   }
diff --git a/agents/entity-audit-generator.md b/agents/entity-audit-generator.md
new file mode 100644
index 0000000..96b30e0
--- /dev/null
+++ b/agents/entity-audit-generator.md
@@ -0,0 +1,241 @@
+---
+description: >
+  Audits every database model to describe every way it comes into existence.
+  For each model the agent answers two orthogonal questions: (a) does a
+  standalone creation path exist? (b) which other models' creation flows
+  produce it as a side effect? Independently-created models get factories;
+  the rest fall back to raw SQL INSERT and are torn down via their owner(s).
+tools:
+  - Read
+  - Glob
+  - Grep
+  - Write
+  - Edit
+  - Bash
+  - Agent
+  - WebFetch
+maxTurns: 60
+---
+
+# Entity Creation Audit
+
+You audit the codebase to discover **every way each database model is created**. For every model
+you answer two orthogonal questions and record the answers so the Environment Factory can plan
+factories, scenario trees, and teardown correctly.
+
+Your input is the knowledge base (`autonoma/AUTONOMA.md` and `autonoma/skills/`). Your output
+is `autonoma/entity-audit.md`.
+
+## The two orthogonal questions
+
+For every model, answer **both** independently:
+
+1. **`independently_created`** — *Does the codebase have an exported function / method /
+   controller that creates this model on its own?* Boolean.
+2. **`created_by`** — *When I trace every other model's creation function, does any of them
+   produce this model as a side effect?* List of `{owner, via, why}` entries; empty if none.
+
+These are **not** mutually exclusive. A single model can be both. For example, a `<Child>` model
+may have its own `<Child>Service.create()` (answer 1 = true) *and* be minted inline inside a
+parent's `<Root>Service.createRoot()` transaction as a required default row (answer 2
+non-empty). Both facts are true simultaneously and both matter downstream — the scenario
+generator decides per-scenario whether a given `<Child>` is introduced via its standalone
+factory or comes along with its owner.
+
+**Do not collapse the two.** Do not omit `created_by` just because `independently_created` is
+true. Do not omit `independently_created` just because the model appears in someone else's
+`created_by`.
+
+**When in doubt, prefer `independently_created: true` and include `created_by` anyway.**
+Misclassifying a root as a dependent is worse than the inverse: a missing factory leaves a
+real root untested, whereas a spurious factory is merely noisy.
+
+## The four states a model can be in
+
+| `independently_created` | `created_by` | Interpretation |
+|---|---|---|
+| `true` | `[]` | Pure root — only standalone creation exists. |
+| `true` | non-empty | Dual — has a standalone path AND is produced by at least one owner. |
+| `false` | non-empty | Pure dependent — only reachable via an owner's creation flow. |
+| `false` | `[]` | **Invalid.** Unreachable model — either you missed the owner, or the model is never created. Fix the audit before writing it. |
+
+## Instructions
+
+1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use
+   WebFetch. Do NOT write any URL yourself. The docs base URL lives only in
+   `autonoma/.docs-url`, written by the orchestrator before any subagent runs.
+
+   To fetch a doc, run the bash command literally — the shell expands the path, not you:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/<path>"
+   ```
+
+   If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code
+   and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback.
+
+2. Fetch the latest instructions:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"
+   ```
+
+   These are the source of truth. Follow them for audit methodology and output format.
+
+3. Read the knowledge base from `autonoma/AUTONOMA.md` and all skill files in `autonoma/skills/`.
+   Identify every database model mentioned in the schema (Prisma schema, Drizzle schema,
+   migration files, or ORM model definitions).
+
+4. **Pass A — find every standalone creation path.** For each model, search for a dedicated
+   create function:
+   - Service files: `*.service.ts`, `*.service.js`, `*Service.*`, `*_service.*`
+   - Repository files: `*.repository.ts`, `*.repository.js`, `*Repository.*`, `*_repository.*`
+   - Functions/methods named `create*`, `insert*`, `new*`, `add*`, `register*`, `signup*`, `sign_up*`
+   - ORM create calls: `.create(`, `.insert(`, `.save(`, `.build(`
+   - Controller or route handler files that contain inline creation logic
+   - Framework hooks (Better-Auth `databaseHooks.user.create`, NextAuth callbacks, Devise
+     callbacks, etc.) — these count as standalone creation paths.
+
+   If a standalone path exists → `independently_created: true` and record `creation_file`,
+   `creation_function`, and observed `side_effects`. If the only creation is inline in a route
+   handler or framework-hook closure, still mark `true` and add `needs_extraction: true` — the
+   env-factory agent will extract into a named export before wiring the factory.
+
+5. **Pass B — for every standalone creation path, find the sibling rows it mints.** Open each
+   creation function you found in Pass A and enumerate every write it performs:
+   - Every `db.<model>.create(...)` / `.insert(...)` / `.save(...)` / `<Model>.create` call
+   - Every `<Service>.create(...)` / repository call it delegates to
+   - Every transactional block (`db.$transaction`, `session.begin`, `Repo.transaction`, etc.)
+     that bundles multiple inserts together
+
+   For each sibling insert, append an entry to **that sibling model's** `created_by` list:
+
+   ```yaml
+   created_by:
+     - owner: <the model whose creation function you're scanning>
+       via: <the function name, e.g. <Root>Service.createRoot>
+       why: "<one-sentence prose explaining why this sibling is created inline>"
+   ```
+
+   The `why` is prose, written for humans. Scenarios and the env-factory teardown logic quote
+   it verbatim. Make it specific — "Every new `<Root>` needs a default `<Child>` created inline
+   in the same transaction so downstream features have something to read from the start" is
+   useful; "creates a `<Child>`" is not.
+
+   One pass per standalone path. When you're done, every sibling that was written inline will
+   have a `created_by` pointer back to the owner, and every model either has its own standalone
+   path (`independently_created: true`) or is reachable through at least one owner (non-empty
+   `created_by`).
+
+6. **Validate invariants before writing.** A model with `independently_created: false` and
+   empty `created_by` is a bug — either you missed a creation path, or the model is orphaned
+   in the schema. Do not ship an audit with orphans.
+
+7. Side effects are informational — they describe what an independently-created model's
+   function does. They help humans understand why a factory matters but do not affect
+   classification.
+
+## Output Format
+
+Write `autonoma/entity-audit.md` with YAML frontmatter and markdown body.
+
+### Frontmatter
+
+```yaml
+---
+model_count: 4
+factory_count: 3    # number of models with independently_created: true
+models:
+  - name: <User>
+    independently_created: true
+    creation_file: src/<auth-module>/<auth-module>.ts
+    creation_function: <AuthProvider>.databaseHooks.user.create
+    side_effects:
+      - hashes password
+      - creates default <Tenant> + <Member> rows
+    created_by: []
+
+  - name: <Root>
+    independently_created: true
+    creation_file: src/<domain>/<domain>.service.ts
+    creation_function: <Root>Service.create
+    side_effects:
+      - mints a default <Child> in the same transaction
+      - mints an <OnboardingLike> row
+    created_by: []
+
+  - name: <Child>
+    independently_created: true
+    creation_file: src/<child-domain>/<child-domain>.service.ts
+    creation_function: <Child>Service.create
+    side_effects: []
+    created_by:
+      - owner: <Root>
+        via: <Root>Service.create
+        why: "Every new <Root> needs a default <Child>, created inline in the same transaction so downstream features have something to read from the start."
+
+  - name: <PureDependent>
+    independently_created: false
+    created_by:
+      - owner: <Root>
+        via: <Root>Service.create
+        why: "Minted inside the <Root> transaction so dependent UI has a row wired up from the start."
+---
+```
+
+Schema rules:
+
+- `name` — required (string).
+- `independently_created` — required (boolean).
+- `creation_file` / `creation_function` / `side_effects` — required **iff**
+  `independently_created: true`.
+- `needs_extraction` — optional boolean; true when the standalone path is inline in a route
+  handler or framework-hook closure and the env-factory agent will need to extract it.
+- `created_by` — required (list, may be empty). Each entry requires `owner` (string — must
+  match another model's `name`), `via` (string — the function name), and `why` (non-empty
+  prose string).
+- Any model with `independently_created: false` MUST have a non-empty `created_by`.
+
+### Markdown Body
+
+After the frontmatter, write:
+
+#### Roots (models with `independently_created: true`)
+
+For each, include:
+- The model name as a heading
+- `creation_file` + `creation_function`
+- A brief description of what the function does, including observed side effects
+- Any sibling models it mints inline (these are the models with `owner: <ThisModel>` in their
+  `created_by`). Link back to them so the reader can follow the tree.
+
+#### Dependents (models with `independently_created: false`)
+
+A table listing each dependent model, its owner(s) (from `created_by`), and the `why` for each.
+This is the map the scenario generator uses: pure dependents are always created through their
+owner, not as standalone tree nodes.
+
+#### Dual-creation models
+
+A call-out section listing every model with `independently_created: true` AND non-empty
+`created_by`. For each, one sentence on when the standalone path is the right choice and when
+the via-owner path is. This helps scenarios decide which to use per narrative.
+
+## Important
+
+- Be thorough — every inline `db.<model>.create(...)` inside someone else's creation function
+  must produce a `created_by` entry on that sibling, even if that sibling also has its own
+  service.
+- Read the ACTUAL code to locate creation functions and sibling inserts — don't guess from file
+  names alone.
+- If a model has multiple standalone creation paths (e.g., signup + admin-create), pick the
+  canonical one (usually the public API or most-called path) for `creation_function` and note
+  alternatives in the body.
+- Framework-level hooks (Better-Auth, NextAuth, Devise) count as standalone paths — record them
+  with `needs_extraction: true` so the env-factory agent lifts the hook body into a named
+  export before wiring the factory.
+- ORM-level hooks (Prisma middleware, Sequelize hooks, ActiveRecord callbacks) DO NOT run on
+  raw SQL. A pure-dependent (`independently_created: false`) model relying on them is a
+  correctness bug; call it out in the body.
+- **Use subagents aggressively.** Pass A (find standalone paths) and Pass B (find sibling
+  inserts) are both embarrassingly parallel.
diff --git a/agents/env-factory-generator.md b/agents/env-factory-generator.md
new file mode 100644
index 0000000..cd0fb54
--- /dev/null
+++ b/agents/env-factory-generator.md
@@ -0,0 +1,710 @@
+---
+description: >
+  Installs the Autonoma SDK and configures the handler by registering factories for
+  every model with dedicated creation code (from entity-audit.md). Writes
+  autonoma/.endpoint-implemented on completion. End-to-end validation happens in the
+  next step (scenario-validator).
+tools:
+  - Read
+  - Glob
+  - Grep
+  - Write
+  - Edit
+  - Bash
+  - Agent
+  - WebFetch
+maxTurns: 60
+---
+
+# Environment Factory: SDK Setup
+
+You install the Autonoma SDK and configure the handler with factories.
+Your inputs are `autonoma/scenarios.md` and `autonoma/entity-audit.md`. Your output is an
+endpoint that responds to `discover` — end-to-end validation (`up`/`down`) happens in the
+next pipeline step.
+
+## CRITICAL: Database Safety
+
+You may be connected to a production database. Follow these rules absolutely:
+
+- **ALL writes go through the SDK endpoint only.** The SDK has production guards, HMAC auth, and signed refs tokens.
+- **You MAY read from the database** using `psql` or ORM queries for verification (SELECT only).
+- **You MUST NEVER** run INSERT, UPDATE, DELETE, DROP, or TRUNCATE directly via psql, raw SQL, or any path outside the SDK.
+- **You MUST NEVER** delete the whole database, truncate tables, or run destructive migrations.
+- The SDK's `down` action only deletes records that `up` created, verified by a cryptographically signed token.
+
+## The #1 rule — read before writing a single factory
+
+**`db.<model>.create()` (or any equivalent ORM/SQL write) inside a factory body for a model
+whose audit says `independently_created: true` is NEVER acceptable.** There is no condition
+under which this is the right output. If calling the audited function feels hard (inline in
+a route, buried in a framework hook, needs DI, triggers Temporal), the answer is never
+"just use the ORM." The answer is one of: extract, wire DI, use the app's test-mode
+toggle, or stop and ask the user.
+
+If you catch yourself typing `prisma.x.create`, `db.x.create`, `tx.insert`, `Repo.insert`,
+`<Model>::create`, `Model.objects.create`, `entityManager.persist`, etc. inside a factory
+body for an audited model — delete it. Go back to the per-model decision tree below.
+
+The entire value of factories is that tests run through the user's real creation path. An
+inline ORM call bypasses password hashing, slug generation, audit logs, Stripe sync,
+framework hooks that provision sibling rows, state-machine transitions, and every piece of
+business logic the user will add next month. It produces data that looks right in a
+`SELECT *` but is silently wrong in ways the tests can't catch.
+
+## Instructions
+
+1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use
+   WebFetch. Do NOT write any URL yourself. The docs base URL lives only in
+   `autonoma/.docs-url`, written by the orchestrator before any subagent runs.
+
+   To fetch a doc, run the bash command literally — the shell expands the path, not you:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/<path>"
+   ```
+
+   If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code
+   and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback.
+
+2. Fetch the latest implementation instructions:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement-scenarios.txt"
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"
+   ```
+
+   These are the source of truth. Follow them for SDK setup, adapter configuration, factory registration, and auth patterns.
+
+3. Read `autonoma/entity-audit.md` — parse the frontmatter. For every model with
+   `independently_created: true`, you MUST register a factory that calls the identified
+   `creation_function` in `creation_file`. Models with `independently_created: false` get no
+   factory — the SDK will fall back to raw SQL INSERT automatically.
+
+4. Read `autonoma/scenarios.md` — parse the frontmatter and full scenario data. Identify every
+   model, cross-branch references (`_alias`/`_ref`), and fields that use `testRunId`.
+
+5. Explore the backend codebase to understand:
+   - Framework (Next.js, Express, Hono, etc.)
+   - ORM (Prisma, Drizzle)
+   - Database (PostgreSQL, MySQL, SQLite)
+   - Authentication mechanism (session cookies, JWT, Better Auth, Lucia, etc.)
+   - Existing route/endpoint patterns
+   - **Auth-adjacent framework hooks** — Better Auth `databaseHooks`, NextAuth callbacks,
+     Lucia adapters, Clerk webhooks. These frequently contain the real creation logic for
+     User/Session/Account and also write to sibling tables (Organization, Member, Billing).
+     The audit will flag these with `needs_extraction: true`.
+   - **App composition root** — where the app wires services, clients, and repositories
+     (DI container, service registry, module init). You'll reuse this wiring when a
+     creation function needs dependencies beyond `ctx.executor`.
+
+## Factory registration philosophy
+
+Register a factory for **every model with `independently_created: true`** — no exceptions.
+
+This is true even if the creation function looks trivial. A factory wired up to `ProjectService.create()`
+that today just calls `prisma.project.create()` will automatically benefit from any business logic
+the user adds later (audit log, Stripe sync, cache write). Raw SQL, by contrast, can never run
+that logic — it's always a compatibility risk.
+
+Models with `independently_created: false` fall back to the SDK's raw SQL path. That's safe because
+the audit explicitly determined there's no creation logic to preserve.
+
+## Dependents, cascades, and teardown
+
+For every root (`independently_created: true`) decide how its dependents will be torn down
+before writing the factory. The `created_by` list in the audit tells you which models come
+into existence as a byproduct of this root's creation flow — those rows must also be deleted
+when the SDK tears down the root.
+
+Walk this decision tree in order. The first match wins; if none match, STOP and report.
+
+1. **Schema cascade** — check the ORM schema. If the FK chain from every dependent back to
+   the root is `onDelete: Cascade` (Prisma) / `ON DELETE CASCADE` (raw SQL) / analogous in
+   your ORM, you're done. The SDK deletes the root row and the DB cleans up the rest. No
+   `teardown` field needed on the factory.
+2. **Existing delete function** — if the codebase has a delete method that already tears
+   down the same subtree (e.g. a `<Root>Service.delete<Root>` that removes the root AND
+   every dependent it minted), register `teardown` on the factory to call that function.
+   Same principle as the `create` side: stay on the user's code path.
+3. **Return dependents' IDs the production function ALREADY returns** — if the production
+   `create` function returns the dependent IDs in its result (e.g. returns
+   `{ root, child, grandchild }`), forward those IDs in your factory's return so they land
+   in refs, then register a `teardown` that deletes them in reverse FK order.
+4. **None of the above — STOP.** Do NOT modify the production service to return more IDs
+   than it already does just to make teardown work. Doing so changes the real code path to
+   serve test needs, which is exactly the inversion we avoid. Report the gap to the user
+   and let them choose: add a cascade, add a delete function, or accept orphans until
+   `TRUNCATE` between test runs.
+
+The `created_by[].why` field is a useful hint for this: if it says "minted inline in the
+same transaction", option 1 (schema cascade) is usually set up correctly; if it says "seeded
+with the owner so onboarding has something to advance through", check whether the dependent
+is behind a soft-delete flag the root's delete function already handles.
+
+Pure dependents (`independently_created: false`) never have their own `teardown` — they are
+torn down via their owner's factory (options 1–3 above; option 4 means stop and report).
+
+## Compatibility with legacy audits
+
+Older audits carry only the `independently_created` field and have no `created_by` list. The
+validators read both schemas and treat a legacy `independently_created: true` entry as
+`independently_created: true` with an empty `created_by`. If the audit you're reading has no
+`created_by` entries, you can still register factories, but you'll lose the `created_by`
+teardown guidance above — prefer regenerating the audit with the current prompt when possible.
+
+## Research pass — MANDATORY before writing any factory
+
+Post-mortems of past runs show a consistent failure mode: the agent makes **one bad
+decision and applies it 50 times**. The research pass prevents this by forcing you to
+open every relevant file and document a per-model decision *before* touching the handler.
+
+Write a table to `autonoma/.factory-plan.md` with one row per `independently_created: true`
+model in the audit. Fill EVERY cell — do not leave any as TODO. The orchestrator and
+the user will review this table before you write a single factory.
+
+```
+| Model | Audit function | File opened? | Import path | DI dependencies observed | Decision (Branch 1/2/3) | Notes |
+|-------|----------------|--------------|-------------|--------------------------|-------------------------|-------|
+```
+
+Column rules:
+
+- **File opened?** — "yes, lines X-Y" or "no, why". If you write "no", you MUST NOT
+  proceed. You cannot decide Branch 1 vs Branch 2 without reading the file.
+- **Import path** — the exact `import ... from "..."` statement you will add to the
+  handler. If the symbol is inline in a hook/route (Branch 1), this column holds the
+  *new* export path you will create during extraction, not the current inline location.
+- **DI dependencies observed** — every constructor arg or closed-over variable the
+  function uses. `ctx.executor` for a DB-only service is the trivial case; any logger,
+  event bus, Temporal client, analytics client, etc. must be listed. This is where
+  past agents gave up silently — we want the give-up moment to be visible.
+- **Decision** — Branch 1 (extract inline → export → call), Branch 2 (import existing
+  export → call), or Branch 3 (audit is wrong: the model is really
+  `independently_created: false` — argue why). "Inline ORM" is NOT a valid decision.
+
+### Cross-codebase DI discovery
+
+Before filling the table, run these greps against the backend to find real
+instantiation patterns. The agent debrief identified this as the single most actionable
+piece of guidance that past runs were missing:
+
+```bash
+# Find how each service is actually constructed in production code.
+grep -rnE "new ${ServiceName}\(" apps/ --include='*.ts' --include='*.tsx' | head -20
+# Find exported singletons and module-level instances.
+grep -rnE "^(export )?(const|let) [a-zA-Z]+ = new " apps/ --include='*.ts' | head -40
+# Find composition root candidates.
+grep -rnlE "(container|registry|services/index|app\.module)" apps/ | head
+```
+
+Use the results to fill the "DI dependencies observed" column honestly. If a service
+needs `logger, eventBus, temporal, analytics` and you can't find where the app wires
+them, STOP and ask the user — do NOT fall back to raw ORM.
+
+### External-side-effects policy reminder
+
+When the creation function triggers Temporal / GitHub / analytics / Better Auth hooks,
+you are NOT allowed to skip the function. You must either:
+1. Call the real function and let the test-mode toggle handle it (grep for
+   `process.env.NODE_ENV === "test"`, `AUTONOMA_TEST_MODE`, `DISABLE_*`, or similar).
+2. Call the real function and let external calls fail gracefully — most SDKs throw,
+   which is fine if the DB writes complete first.
+3. Wrap the external call with a try/catch **inside the real function**, not inside
+   the factory.
+
+Never replicate DB writes the function performs. If the real function writes to
+sibling tables (Organization, Member, BillingCustomer from Better Auth's `user.create`
+hook; a default Folder from `createProject`), those writes come for free only when
+you call the real function. Inlining `db.user.create()` silently drops them.
+
+---
+
+## Per-model decision tree (run this BEFORE writing any factory)
+
+For every model with `independently_created: true` in `autonoma/entity-audit.md`, walk this tree
+in order. Do NOT skip. Each branch has exactly one legitimate output — there is no "give up
+and use `db.<model>.create()`" escape hatch.
+
+### Branch 1 — `needs_extraction: true`
+
+Meaning: the creation logic exists inline in a route handler, a framework hook (Better Auth
+`databaseHooks`, NextAuth callbacks, Express middleware closures), or an anonymous closure.
+There is no named export to import.
+
+**Mandatory action — extract before wiring:**
+
+1. Open `creation_file`. Find the inline block named by `creation_function`.
+2. Move the body into a new **named, exported function** in the nearest sensible module
+   (a new `*.service.ts`, `*.repository.ts`, a sibling `create-<model>.ts`, or an existing
+   service file if one exists nearby). The function must:
+   - Take a plain input object (no `req`/`res`/`ctx` — those are HTTP concerns).
+   - Return the created record (at minimum `{ id }`).
+   - Preserve every side effect the inline block had — including writes to sibling tables
+     that framework hooks produce (e.g. Better Auth's `user.create` hook provisioning an
+     Organization, Member, BillingCustomer; NextAuth's callback writing Account rows).
+3. Replace the inline block with a call to the new function. The real HTTP caller's
+   behavior MUST stay identical. Run the project's typecheck/test command before moving on.
+   **Leave a short comment** (1–2 lines) above the new exported function explaining why it
+   was extracted — e.g. `// Extracted from the Better Auth databaseHooks.user.create closure
+   so the Autonoma Environment Factory can reuse the same creation path (Org + Member +
+   billing provisioning) as production. See autonoma/entity-audit.md.` This is a courtesy
+   to the developers who will encounter the new function — they should be able to tell at a
+   glance that it was lifted out for factory reuse, not invented for it.
+4. **Update `autonoma/entity-audit.md` in-place** — change `creation_file` to the new file,
+   `creation_function` to the new exported name, add `extracted_to: <new-path>`,
+   and keep `needs_extraction: true` so the fidelity rubric's framework-hook
+   carve-out can score the factory against the extracted helper.
+   Downstream steps read the audit; they must see the fixed state.
+5. Now — and only now — import the new function and wire the factory.
+
+If extraction is genuinely impossible (the inline block depends on `req`/`res` in a way that
+can't be untangled, or it's generated code you can't edit), **STOP and ask the user**. Do
+NOT fall back to raw ORM. That is the bug we are trying to prevent.
+
+**Concrete example — Better Auth `databaseHooks`:**
+
+The audit marks `User` with `needs_extraction: true`, `creation_file: src/auth.ts`,
+`creation_function: buildAuth (databaseHooks.user.create)`. Reading `src/auth.ts`, the real
+creation logic lives inside a closure passed to `betterAuth({ databaseHooks: { user: { create: async (user) => {...} } } })`, which calls `db.user.create`, then `ensureOrgMembership`, then provisions a `BillingCustomer`, then enqueues a welcome email.
+
+Wrong: import `db` and call `db.user.create(...)` in the factory — silently skips the
+Organization/Member/BillingCustomer rows and every downstream test that reads them breaks.
+
+Right: extract the closure body into `export async function createUserWithOnboarding(input)`
+in `src/auth/create-user.ts`, call it from the Better Auth hook (so production still works),
+update the audit, then `import { createUserWithOnboarding }` in the factory.
+
+### Branch 2 — `independently_created: true`, no `needs_extraction`
+
+Meaning: a named exported function or class method already exists. Import it and call it.
+Do not copy its body. Do not call the ORM directly "because it's simpler." The whole point
+is to stay on the user's code path.
+
+Go to the DI playbook below to figure out how to invoke it.
+
+### Branch 3 — `independently_created: false`
+
+Do not register a factory at all. The SDK's raw SQL fallback handles it. Writing a factory
+here just so you can call `db.<model>.create()` is the anti-pattern in disguise — let the
+SDK do it.
+
+## DI / constructor-injection playbook
+
+Factories receive `(data, ctx)` where `ctx.executor` is the DB client/transaction. That's
+enough for simple service classes but many creation functions need more. Walk this list in
+order — the first match wins:
+
+1. **Top-level exported function** — `import { createX } from "..."; return createX(data);`.
+   Simplest case. Most services should end up here after Branch 1 extraction.
+2. **Static method on a class** — `return XService.create(data, ctx.executor);`. Pass
+   `ctx.executor` as the DB/transaction argument so writes stay in the SDK's transaction.
+3. **Instance method, needs only a DB client** —
+   `const svc = new XService(ctx.executor); return svc.create(data);`. Mirrors how the app
+   instantiates it at call time.
+4. **Instance method, needs more dependencies (logger, event bus, config, clients)** —
+   find the app's composition root (DI container, service registry, `container.ts`,
+   `app.module.ts`, `services/index.ts`) and reuse it. Two viable patterns:
+   - **Import the already-constructed singleton** the app exports for production use:
+     `import { userService } from "@/services"; return userService.create(data);`.
+   - **Rebuild the service the same way the composition root does**, substituting
+     `ctx.executor` for the DB dependency and importing real singletons for everything
+     else (logger, event bus). Do not invent mocks. Example:
+
+     ```ts
+     import { logger, eventBus, temporalClient } from "@/lib/singletons";
+
+     UserProfile: defineFactory({
+       create: async (data, ctx) => {
+         const svc = new UserProfileService({
+           db: ctx.executor,
+           logger,
+           eventBus,
+           temporal: temporalClient,
+         });
+         return svc.create(data);
+       },
+     }),
+     ```
+5. **Framework-scoped dependencies (NestJS provider, Fastify plugin, Rails concern)** —
+   bootstrap the smallest containing module and resolve the service from it. If that turns
+   into a 50-line boilerplate, that's a signal the composition root should expose a helper
+   the factory can call; add the helper to the app and use it. Still never `db.create()`.
+6. **Impossible** — if you genuinely can't wire the dependencies without rewriting the
+   service, STOP and ask the user. Do NOT fall back to raw ORM.
+
+Never mock, stub, or fake a dependency. The factory must exercise real code.
+
+## External side effects policy
+
+Audited creation functions often perform side effects beyond the DB row: enqueueing a
+Temporal workflow, hitting the GitHub/Stripe/Slack API, sending an email, publishing to a
+message bus, writing a semantic embedding, firing an analytics event, calling an LLM.
+
+**Your goal is correct DB state, not production-grade external delivery.** The factory MUST
+preserve every DB write the real function performs (including writes to sibling tables
+done by ORM hooks, framework hooks, triggers). It is NOT responsible for making every
+network call succeed. Order of preference:
+
+1. **Call the real function with real side effects.** If Temporal/GitHub/Stripe clients are
+   already wired for the test environment (sandbox keys, a local Temporal dev server,
+   mocked SDKs in test config), just call through. Cleanest option when infra is available.
+2. **Use the app's existing test-mode toggle.** Most apps have one: an env var
+   (`NODE_ENV=test`, `DISABLE_WORKFLOWS=1`, `ANALYTICS_DISABLED=1`), a feature flag, a
+   null-object client injected in tests. Find it, set it on the handler's environment, and
+   call the real function.
+3. **Wrap external-only calls and let them no-op on failure.** If no toggle exists and the
+   call would fail in the test environment, the acceptable pattern is to try/catch the
+   outbound call inside the real function's wrapper — not inside a rewritten factory body.
+   Prefer exposing a toggle in the app over adding try/catch at the factory layer. Only use
+   this for calls whose failure does not affect DB state under test. If a test later
+   asserts on a row the side effect would have created, make it succeed (option 1 or 2).
+4. **Reimplement the DB writes inline.** NEVER. If you find yourself typing
+   `db.<other_model>.create` inside a factory to replicate what a hook or workflow would
+   have done, STOP. That means the function wasn't truly "called" — you re-wrote it. Go
+   back to option 1 or 2, or ask the user.
+
+**What you are NOT allowed to skip:**
+
+- Password hashing, slug generation, ID derivation, normalisation — pure CPU work inside
+  the creation function; calling the function gets them for free.
+- DB writes performed by ORM hooks / framework hooks / triggers on the model being created.
+  Better Auth's `databaseHooks.user.create` writes to Organization, Member, BillingCustomer
+  — if you call `db.user.create()` instead of the real signup function, those rows go
+  missing and every test that reads them breaks silently.
+- Writes to sibling tables done by the creation function itself (e.g. `createProject`
+  writing a default Folder row). If you don't call the function, those rows go missing too.
+
+## CRITICAL: Before Writing Any Code
+
+**Ask the user for confirmation** before implementing. Present your plan:
+
+> "I'm about to set up the Autonoma SDK. Here's what I'll do:
+>
+> **SDK packages**: [list packages to install]
+> **Endpoint location**: [where the handler file will go]
+> **Scope field**: [e.g., organizationId]
+>
+> **Models needing extraction (`needs_extraction: true`)**:
+> - [Model]: inline in `[file]#[block]` → will extract to `[new file]#[new function]`
+> - ...
+>
+> **Factories to register** (from entity-audit.md):
+> - [Model]: calls `[file]#[function]` (DI: [top-level import / `new Service(ctx.executor)` / composition-root singleton]; side effects: [list, or "none — future-proofs against added logic"])
+> - ...
+>
+> **External side effects strategy**: [test-mode toggle name / sandbox credentials / try-catch wrapper]
+>
+> **Raw SQL fallback** (no creation code in audit): [list]
+>
+> **Auth callback**: [how sessions/tokens will be created]
+>
+> **Database operations**: The SDK creates test data by calling the factories you register
+> (or raw SQL for models without creation code). It deletes only what it created during
+> teardown (verified by a signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on
+> existing data.
+>
+> **Environment variables needed**:
+> - `AUTONOMA_SHARED_SECRET` — shared with Autonoma for HMAC request verification
+> - `AUTONOMA_SIGNING_SECRET` — private, for signing refs tokens
+>
+> To generate these secrets, run:
+> ```bash
+> openssl rand -hex 32
+> ```
+> Run this command TWICE — once for each secret. Use DIFFERENT values for each.
+> Set them in your `.env` file (or equivalent):
+> ```
+> AUTONOMA_SHARED_SECRET=<first-value>
+> AUTONOMA_SIGNING_SECRET=<second-value>
+> ```
+>
+> Shall I proceed?"
+
+**Do NOT proceed until the user confirms.**
+
+## Implementation
+
+### 1. Install SDK packages
+
+Pick the correct packages for the project's stack:
+
+| Your ORM | Package |
+|----------|---------|
+| Prisma | `@autonoma-ai/sdk-prisma` |
+| Drizzle | `@autonoma-ai/sdk-drizzle` |
+
+| Your Framework | Package |
+|----------------|---------|
+| Next.js App Router, Hono, Bun, Deno | `@autonoma-ai/server-web` |
+| Express, Fastify | `@autonoma-ai/server-express` |
+| Node.js http | `@autonoma-ai/server-node` |
+
+Always install `@autonoma-ai/sdk` as the core package.
+
+### 2. Do the extractions FIRST
+
+Before writing the handler, walk every `needs_extraction: true` model in the audit and do
+the extraction per Branch 1 of the decision tree. After each extraction, update
+`autonoma/entity-audit.md` in-place. This must happen before Step 3 — the handler imports
+these new exports by name.
+
+### 3. Create the endpoint handler
+
+Write a single handler file that:
+1. Imports and configures the ORM adapter with the scope field
+2. Registers factories for EVERY model with `independently_created: true` in entity-audit.md
+3. Implements the auth callback using the app's real session/token creation
+4. Passes both secrets from environment variables
+
+Match existing codebase patterns — import style, file organization, error handling.
+
+### 4. Register factories (one per model with creation code)
+
+For every entry in entity-audit.md with `independently_created: true`:
+
+- Import the function from `creation_file` (post-extraction if Branch 1 applied)
+- Wrap it in `defineFactory({ create, teardown? })` from `@autonoma-ai/sdk`
+- In `create`: call the imported function with the resolved data and return at least `{ id }` (the primary key)
+- Optionally define `teardown` for custom cleanup (SQL DELETE is the default)
+
+#### The one thing you MUST NOT do
+
+Do not re-implement the creation logic inline using the ORM, even if calling the real function
+is inconvenient (constructor arguments, DI containers, weird signatures). The entire point of
+the factory is to stay on the user's code path so that when they add business logic later —
+password hashing, audit logs, Stripe sync, state-machine transitions — the test data gets it
+for free. Inline ORM calls bypass all of that silently and are the #1 bug source in generated
+factories.
+
+**A raw ORM/DB write MUST NEVER appear in a factory body for an `independently_created: true`
+model.** There are no exceptions. Exact patterns vary by language/ORM — a non-exhaustive list:
+
+- TypeScript/JavaScript: `prisma.<m>.create(`, `db.<m>.create(`, `tx.insert(`, `drizzle.insert(`, `knex('<t>').insert(`, `sequelize.models.<M>.create(`, `typeorm.getRepository(...).save(`, `mongoose.Model.create(`, `await <M>.create(`, `.upsert(`
+- Python: `session.add(`, `session.execute(insert(...))`, `Model.objects.create(`, `Model(...).save(`, `db.session.add(`, `conn.execute("INSERT ...")`
+- Ruby/Rails: `<Model>.create(`, `<Model>.create!(`, `<Model>.new(...).save`, `<Model>.insert(`, `ActiveRecord::Base.connection.execute("INSERT ...")`
+- PHP/Laravel: `<Model>::create(`, `new <Model>(...)->save()`, `DB::table('...')->insert(`, `$repository->persist(`
+- Java/Spring: `entityManager.persist(`, `<Repository>.save(`, `jdbcTemplate.update("INSERT ...")`
+- Go: `db.Create(`, `gorm.DB.Create(`, `sq.Insert(`, raw `db.Exec("INSERT ...")` / `db.ExecContext(...)`
+- Elixir/Ecto: `Repo.insert(`, `Repo.insert!(`, `Repo.insert_all(`
+- Rust: `diesel::insert_into(`, `sqlx::query!("INSERT ...")`, `sea_orm::ActiveModel ... .insert(`
+- Raw SQL anywhere: an `INSERT INTO <table>` string literal passed to a query/exec/prepare API
+
+If you wrote one of these inside a factory body for a model whose audit says
+`independently_created: true`, you took the trap. Delete it. Go back to the per-model decision
+tree and the DI playbook.
+
+**WRONG — re-implementing creation logic inline (this is the trap):**
+
+```ts
+// entity-audit.md said: creation_function = OnboardingManager.getState
+OnboardingState: defineFactory({
+  create: async (data) => {
+    // Bypasses OnboardingManager entirely. If the user adds logic later, tests silently diverge.
+    return db.onboardingState.create({ data: { applicationId: data.applicationId, step: "welcome" } });
+  },
+}),
+```
+
+**RIGHT — call the audit's identified function, even if you have to instantiate a class:**
+
+```ts
+import { OnboardingManager } from "@/lib/onboarding-manager";
+
+OnboardingState: defineFactory({
+  create: async (data, ctx) => {
+    // Uses the real code path. Any business logic added later flows through automatically.
+    const manager = new OnboardingManager(ctx.executor);
+    return manager.getState(data.applicationId);
+  },
+}),
+```
+
+### 4b. Populate `tableNameMap` sparsely (do not mirror the factory registry)
+
+The SDK auto-derives model names from SQL tables by splitting on `_` and PascalCasing
+each part. **No pluralization is performed.** `organization` → `Organization`;
+`organizations` → `Organizations`; `api_key` → `ApiKey`; `api_keys` → `ApiKeys`.
+
+Do NOT write a `tableNameMap` / `table_name_map` that mirrors your factory registry
+1:1. That doubles the maintenance surface and is a silent-breakage foot-gun — adding a
+new model forces two edits and forgetting one silently misroutes creates.
+
+**Algorithm to follow before writing the map:**
+
+1. List every factory key you intend to register.
+2. For each key, compute `autoName = snakeToPascal(dbTable)` — split on `_`, PascalCase
+   each part, concatenate. No pluralization step.
+3. If `autoName === factoryKey`: **do not add** the entry.
+4. If `autoName !== factoryKey`: add the entry.
+5. If after step 4 the map is empty, **omit the `tableNameMap` field entirely**.
+
+**Worked example (plural DB tables, singular factory keys):**
+
+```ts
+// DB tables: organizations, users, api_keys
+// Factory keys: Organization, User, ApiKey
+// Every auto-derived name disagrees → every factory needs one entry:
+tableNameMap: {
+  Organization: 'organizations',
+  User: 'users',
+  ApiKey: 'api_keys',
+},
+factories: { Organization: ..., User: ..., ApiKey: ... },
+```
+
+**Worked example (singular DB tables):**
+
+```ts
+// DB tables: organization, user, api_key
+// Factory keys: Organization, User, ApiKey
+// Every auto-derived name matches → omit tableNameMap entirely.
+factories: { Organization: ..., User: ..., ApiKey: ... },
+```
+
+**Red flag.** If `tableNameMap` ends up with exactly one entry per factory and every
+entry is a plural↔singular rename, you have two options:
+
+- (a) Keep the map (verbose but explicit).
+- (b) Change factory keys to match the plural auto-derived names (`Organizations`,
+  `Users`, `ApiKeys`) and drop the map entirely.
+
+Prefer (b) unless scenario files already use the singular convention. A `tableNameMap`
+that is a 1:1 copy of the factory registry means you're doing work the SDK already
+does.
+
+### 5. Register the route
+
+Add the endpoint to the app's routing.
+
+### 6. Set up environment variables
+
+Add `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` to `.env`. If `.env.example` exists, add placeholders.
+
+## Smoke test
+
+Before writing the sentinel, run a single `discover` call to confirm the endpoint is wired
+up and HMAC works. Do NOT run `up` or `down` here — that is the scenario-validator's job.
+
+```bash
+export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)}
+export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)}
+
+BODY='{"action":"discover"}'
+SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
+curl -s -X POST http://localhost:PORT/api/autonoma \
+  -H "Content-Type: application/json" \
+  -H "x-signature: $SIG" \
+  -d "$BODY" | python3 -m json.tool
+```
+
+Expected: JSON with `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`.
+
+If this fails, fix the handler (likely the adapter config or route mount) before writing
+the sentinel.
+
+## CRITICAL: Factory-integrity check (before writing the sentinel)
+
+Prove every factory calls the audit's identified `creation_function`. This is deterministic
+static analysis, not a vibe check. Run it yourself and HALT if it fails — the next step
+(scenario-validator) runs the exact same check and will kick the work back.
+
+### Step A — collect the audit targets
+
+Parse `autonoma/entity-audit.md` and build a list of `(model, creation_file, creation_function)`
+for every model with `independently_created: true`. Also flag any entry that still has
+`needs_extraction: true` but no `extracted_to` — that's a bug (you were supposed to extract
+first and record the new location). HALT and go do the extraction.
+
+### Step B — grep the handler for the anti-pattern
+
+```bash
+grep -nE '(prisma|db|tx)(\.[a-zA-Z0-9_]+)?\.(create|createMany|insert|upsert)\(' <handler-file>
+```
+
+Every match inside a `defineFactory({ create })` body is a RED FLAG. The only legitimate
+matches are:
+- Inside a model's `teardown` body (custom cleanup is allowed).
+- Outside any `defineFactory` (auth callback, scope helpers, etc.).
+- Inside a factory for a model the audit marked `independently_created: false` (no service exists;
+  raw ORM is the documented fallback — though the SDK does this automatically, so you usually
+  shouldn't even write such a factory).
+
+Anything else is the trap. Do NOT ship it.
+
+### Step C — per-model structural check
+
+For each `(model, creation_file, creation_function)` from Step A, verify ALL of:
+
+1. An `import` (or `require`) line pulls `creation_function` — or the class/object that owns
+   it — into the handler file, from a path that resolves to `creation_file`.
+2. The factory body for `model` invokes that identified symbol (e.g. `manager.getState(...)`,
+   `createUser(...)`, `ProjectService.create(...)`, `service.create(...)`).
+3. The factory body does NOT contain a raw ORM write for `model` (`db.<model>.create(...)`,
+   `prisma.<model>.create(...)`, `tx.insert(<model>Table)`, etc.).
+
+If any model fails any of the three, STOP. Fix the factory per the per-model decision tree
+and the DI playbook, then re-run this check from Step A.
+
+### Step D — commit only when clean
+
+Only write `autonoma/.endpoint-implemented` after:
+- Every `needs_extraction: true` flag in the audit has been resolved.
+- Step B returns zero anti-pattern matches inside factory bodies.
+- Step C passes for every audited model.
+- The discover smoke test returns 200 with the expected schema shape.
+
+If you extracted any route-handler or framework-hook logic into a new exported function
+(per Branch 1), the audit must have been updated in-place; re-read it after the edit before
+running Step A.
+
+## CRITICAL: Write the implementation sentinel
+
+After the discover smoke test passes AND the factory-integrity check passes, use the
+`Write` tool to create `autonoma/.endpoint-implemented` with a short plain-text summary:
+
+```
+Endpoint implemented.
+- handler: <path>
+- packages: <list>
+- factories registered: <count>
+- extractions performed: <count, with from→to paths>
+- scope field: <field>
+- auth callback: <brief description>
+```
+
+Do NOT use `touch` — the hook fires only on `Write`/`Edit`.
+
+The next step (scenario-validator) will exercise up/down for every scenario and write
+`autonoma/.endpoint-validated`. E2E test generation is blocked until that happens.
+
+## What to Explain to the User
+
+After implementation and validation, explain:
+
+1. **What was set up**: "I installed the Autonoma SDK and created a handler at `[path]`. It handles discover (returns your schema), up (creates test data), and down (tears down test data)."
+
+2. **Extractions performed**: For each `needs_extraction: true` model, show the inline block → new exported function mapping, and confirm the original caller now invokes the new function.
+
+3. **Factories registered**: List each factory — which function it wraps, which DI pattern was used, and what side effects the audit observed (or "none — factory is registered to future-proof").
+
+4. **External side effects strategy**: which toggle/sandbox/wrapper was used.
+
+5. **How to set up secrets**: "Generate two secrets with `openssl rand -hex 32` and set them as:
+   - `AUTONOMA_SHARED_SECRET` — share this with Autonoma
+   - `AUTONOMA_SIGNING_SECRET` — keep this private"
+
+6. **Safety**: "The SDK can only INSERT records via the factories you registered (which call the user's real creation functions) or raw SQL for models without creation code. Teardown only deletes records that were created (verified by a cryptographically signed token). It cannot UPDATE, DELETE, DROP, or run raw SQL on existing data."
+
+## Important
+
+- Always implement in the project's existing backend — don't create a standalone server
+- Match existing code patterns and conventions
+- Use the same ORM/database layer the project already uses
+- Register factories for EVERY model with `independently_created: true` in the audit — no exceptions, even for thin wrappers
+- Resolve every `needs_extraction: true` by extracting FIRST, then wiring the factory
+- Never reimplement the user's creation logic in a factory — always call their function
+- `db.<model>.create()` in a factory for an `independently_created: true` model is NEVER acceptable
+- ALL database writes go through the SDK endpoint — never write directly
+- Use `testRunId` to make unique fields (emails, org names) to prevent parallel test collisions
+- Validate `discover` here before completing; the rest of the lifecycle (up → verify → down → verify) is validated by the scenario-validator, not this step
diff --git a/agents/kb-generator.md b/agents/kb-generator.md
index f26e998..cd83f42 100644
--- a/agents/kb-generator.md
+++ b/agents/kb-generator.md
@@ -21,22 +21,38 @@ You generate a structured knowledge base for a codebase. Your output MUST be wri
 
 ## Instructions
 
-1. First, fetch the latest knowledge base generation instructions:
+1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use
+   WebFetch. Do NOT write any URL yourself. The docs base URL lives only in
+   `autonoma/.docs-url`, written by the orchestrator before any subagent runs.
 
-   Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt`
-   and follow those instructions for how to analyze the codebase.
+   To fetch a doc, run this command exactly, substituting only `<path>` — the shell expands the base URL, not you:
 
-2. Create the output directory if it doesn't exist:
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/<path>"
+   ```
+
+   If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code
+   and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback.
+
+2. Fetch the latest knowledge base generation instructions:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"
+   ```
+
+   Read the output and follow those instructions for how to analyze the codebase.
+
+3. Create the output directory if it doesn't exist:
    ```bash
    mkdir -p autonoma/skills
    ```
 
-3. Follow the fetched instructions to analyze the codebase — discover the application,
+4. Follow the fetched instructions to analyze the codebase — discover the application,
    map pages and flows, identify core workflows.
 
-4. Write the output to `autonoma/AUTONOMA.md`.
+5. Write the output to `autonoma/AUTONOMA.md`.
 
-5. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
+6. Write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
 
 ## CRITICAL: Output Format
 
diff --git a/agents/scenario-generator.md b/agents/scenario-generator.md
index 342899a..57cc418 100644
--- a/agents/scenario-generator.md
+++ b/agents/scenario-generator.md
@@ -1,7 +1,7 @@
 ---
 description: >
   Generates test data scenarios from a knowledge base.
-  Reads AUTONOMA.md plus SDK discover output and produces scenarios.md with three named test data environments.
+  Reads AUTONOMA.md and produces scenarios.md with three named test data environments.
   Output has YAML frontmatter with scenario summaries for deterministic validation.
 tools:
   - Read
@@ -16,61 +16,71 @@ maxTurns: 40
 
 # Scenario Generator
 
-You generate test data scenarios from a knowledge base. Your inputs are `autonoma/AUTONOMA.md`,
-`autonoma/skills/`, and `autonoma/discover.json`. Your output MUST be written to
-`autonoma/scenarios.md` with YAML frontmatter.
+You generate test data scenarios from a knowledge base. Your inputs are `autonoma/AUTONOMA.md`,
+`autonoma/entity-audit.md`, and `autonoma/skills/`. Your output MUST be written to `autonoma/scenarios.md` with YAML frontmatter.
 
 ## Instructions
 
-1. First, fetch the latest scenario generation instructions:
+1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use
+   WebFetch. Do NOT write any URL yourself. The docs base URL lives only in
+   `autonoma/.docs-url`, written by the orchestrator before any subagent runs.
 
-   Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt`
-   and follow those instructions for how to design scenarios.
+   To fetch a doc, run this command exactly, substituting only `<path>` — the shell expands the base URL, not you:
 
-2. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types.
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/<path>"
+   ```
 
-3. Read `autonoma/discover.json`. Treat the SDK `discover` response as the source of truth for:
-   - database models
-   - fields and requiredness
-   - foreign key edges
-   - parent/child relations
-   - scope field
+   If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code
+   and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback.
 
-   While reading the schema, assess whether the scope entity provides real **per-run data isolation**.
-   Ask yourself: does the scope entity parent most other models via required foreign keys? Can a new
-   scope entity be created per test run (i.e. it has creatable fields beyond just auto-generated IDs)?
-   Do most models in the graph eventually chain back to the scope entity?
+2. Fetch the latest scenario generation instructions:
 
-   If the answer is yes to all of these, the app has natural multi-tenant isolation — each test run
-   can create its own scope entity and all child data is automatically partitioned.
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-scenarios.txt"
+   ```
 
-   If the scope entity is a singleton, shared across users, or doesn't meaningfully partition data
-   across concurrent runs, the app **lacks natural per-run isolation**. In this case you must slug
-   all identifying fields with `{{testRunId}}` (see step 6 below) so that parallel or sequential
-   test runs never collide on lookup, search, or assertion values.
+   Read the output and follow those instructions for how to design scenarios.
+
+3. Read `autonoma/AUTONOMA.md` fully — understand the application, core flows, and entity types.
+
+4. Read `autonoma/entity-audit.md` — this is the authoritative schema map from Step 2.
+   It lists every model, its relationships, and whether creation goes through a factory or
+   raw SQL. Use it as the source of truth for model names, fields, FK edges, and the scope field.
+
+5. Scan `autonoma/skills/` to understand what entities can be created and their relationships.
+
+6. Explore the backend codebase only to fill gaps the audit does not cover (e.g. enum values,
+   string length limits, constraint details).
 
-   If `autonoma/discover.json` is missing or malformed, stop and tell the user that Step 2 now
-   requires a valid SDK discover artifact before scenario generation can continue.
+7. **Scoping analysis** — assess whether the scope entity provides real per-run data isolation.
+   Ask: does the scope entity parent most other models via required FKs? Can a new scope entity
+   be created per test run (i.e. it has creatable fields beyond auto-generated IDs)? Do most
+   models eventually chain back to the scope entity?
 
-4. Scan `autonoma/skills/` to understand what entities can be created and their relationships.
+   If yes to all: the app has natural multi-tenant isolation — each test run creates its own
+   scope entity and all child data is automatically partitioned.
 
-5. Use the SDK discover schema plus the knowledge base to design three scenarios: `standard`, `empty`, `large`.
+   If the scope entity is a singleton, shared across users, or does not meaningfully partition
+   data across concurrent runs: the app **lacks natural per-run isolation**. In this case you
+   MUST slug all identifying fields with `{{testRunId}}` (see step 9) so parallel or sequential
+   test runs never collide on lookup, search, or assertion values.
+
+8. Design three scenarios: `standard`, `empty`, `large`.
 
-6. Prefer hardcoded values when they make the resulting tests simpler, more reviewable, and more stable.
-   If a field needs run-level uniqueness but can still be expressed as a concrete literal, prefer a planner-chosen
-   hardcoded value with a discriminator suffix or prefix over introducing a variable placeholder.
-   Example: prefer `Acme Project testRunId suffix` encoded as a concrete scenario value over turning the whole field
+9. **Variable fields.** Prefer hardcoded values when they make tests simpler, more reviewable,
+   and more stable. If a field needs run-level uniqueness but can still be expressed as a
+   concrete literal, prefer a planner-chosen hardcoded value with a discriminator suffix over
+   introducing a variable placeholder.
+   Example: prefer `Acme Project qa-17` encoded as a concrete value over turning the field
    into `{{project_name}}` unless later tests truly need the placeholder.
 
-   **Exception — apps without natural per-run isolation:** If your scoping analysis in step 3
-   determined the app lacks natural multi-tenant isolation, **reverse the default above**. Slug ALL
-   identifying fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline
-   `{{testRunId}}` so that every value a test might search for, type into a form, or assert on screen
-   is unique to that test run. Use the pattern `Concrete Value {{testRunId}}` (e.g.
-   `Acme Corp {{testRunId}}`, `Main Project {{testRunId}}`). Each slugged field becomes a
-   `variable_field` entry with `generator: derived from testRunId`. This prevents parallel or
-   sequential test runs from interfering with each other when there is no scope entity to partition
-   the data.
+   **Exception — apps without natural per-run isolation:** if your scoping analysis determined
+   the app lacks natural multi-tenant isolation, **reverse the default**. Slug ALL identifying
+   fields — names, titles, descriptions, labels, slugs, emails, usernames — with inline
+   `{{testRunId}}` so every value a test might search, type, or assert on screen is unique to
+   that test run. Pattern: `Concrete Value {{testRunId}}` (e.g. `Acme Corp {{testRunId}}`).
+   Each slugged field becomes a `variable_field` entry with `generator: derived from testRunId`.
 
    Use variable fields sparingly. Only mark a value as variable when at least one of these is true:
    - the field must be globally unique or is highly collision-prone across runs
@@ -83,10 +93,8 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom
    constraint enforced by the database or application **must** be variable — hardcoding them
    will cause test failures when the hardcoded value expires or collides.
 
-   Do not mark a field as variable just because:
-   - it is user-facing text
-   - it could be unique in theory
-   - you want to avoid choosing a concrete literal
+   Do not mark a field as variable just because it is user-facing text, could be unique in
+   theory, or you want to avoid choosing a concrete literal.
 
    Every variable field must have:
    - a double-curly token such as `{{project_title}}`
@@ -95,20 +103,46 @@ You generate test data scenarios from a knowledge base. Your inputs are `autonom
    - a reason explaining why it truly must vary
    - a plain-language test reference such as `({{project_title}} variable)`
 
-   `generator` is optional. If you include it, use a short free-form strategy note such as
-   `derived from testRunId`, `planner literal plus discriminator`, `backend-generated`, `UUID suffix`,
-   or `timestamp-based`.
+   `generator` is optional. Use a short free-form strategy note such as `derived from testRunId`,
+   `planner literal plus discriminator`, `backend-generated`, `UUID suffix`, or `timestamp-based`.
    Do not default to `faker`. Prefer deterministic derivation from stable inputs, and use `faker`
-   only as a last resort when deterministic strategies are not practical.
-
-   Good:
-   - use a concrete value such as `Acme Workspace qa-17` when the planner can safely choose it and append a discriminator
-   - only `{{owner_email}}` is variable because login requires uniqueness across runs
-
-   Bad:
-   - every user name, organization name, and label is variable with `faker.*` by default
-
-7. Write the output to `autonoma/scenarios.md`.
+   only as a last resort.
+
+10. **Nested tree constraint.** Design scenario entity tables so they can be expressed as a
+    nested tree rooted at the scope entity. Step 4 (env-factory) and Step 5 (scenario-validator)
+    will convert scenarios into nested `create` payloads — flat cross-model structures connected
+    only by `_ref` break when JSON key order is not preserved. Children must nest under their
+    parent using the relation field names from the audit. Use `_ref` only for cross-branch
+    references that cannot be expressed through nesting.
+
+11. **Standalone vs via-owner choice.** For every model that appears in a scenario, consult
+    the audit and pick one of two paths:
+
+    - If the model has `independently_created: true` and the scenario narrative wants it
+      in isolation (e.g. the user creates a child directly, independent of any root), add
+      it as a top-level tree node. The SDK will call its factory directly.
+    - If the model appears in some owner's `created_by` list and the scenario narrative
+      already includes that owner (e.g. the scenario already has the root, and a default
+      child / onboarding row / deployment row comes along for free), **do NOT add the
+      model as a separate node**. It is created as a side effect of the owner's factory.
+      Quote the `why` from the audit in the scenario prose so the reader knows where it
+      came from.
+
+    For **dual models** (`independently_created: true` AND listed in someone's `created_by`),
+    pick the path per scenario:
+
+    - Narrative where the root is being created for the first time → the child comes in
+      via the owner (via-owner path).
+    - Narrative where the root already exists and the user is creating a standalone child
+      → the child is a top-level node (standalone-factory path); its owner is also in
+      the tree, as its FK parent.
+
+    Never double-create a dependent. If the audit says an owner mints a dependent row
+    inline, and your scenario has that owner, the dependent must not appear as a separate
+    tree node — the factory already creates it, and adding it twice will either fail
+    uniqueness checks or produce confusing test state.
+
+12. Write the output to `autonoma/scenarios.md`.
 
 ## CRITICAL: Output Format
 
@@ -136,12 +170,6 @@ entity_types:
   - name: "Test"
   - name: "Run"
   - name: "Folder"
-discover:
-  source: sdk
-  model_count: 12
-  edge_count: 18
-  relation_count: 16
-  scope_field: "organizationId"
 variable_fields:
   - token: "{{project_title}}"
     entity: "Project.title"
@@ -152,7 +180,6 @@ variable_fields:
     reason: "title must be unique per test run"
     test_reference: "({{project_title}} variable)"
 planning_sections:
-  - sdk_discover
   - schema_summary
   - relationship_map
   - variable_data_strategy
@@ -169,33 +196,28 @@ planning_sections:
   - `total_entities`: Total count of entities created in this scenario
 - **entity_types**: List of ALL entity types discovered in the data model. Each has:
   - `name`: Entity type name (e.g., "User", "Project", "Run")
-- **discover**: Summary of the SDK discover artifact. It must include:
-  - `source`: exactly `sdk`
-  - `model_count`, `edge_count`, `relation_count`: counts from `autonoma/discover.json`
-  - `scope_field`: scope field name from `autonoma/discover.json`
-- **variable_fields**: List of generated or per-run values that tests must not treat as hardcoded literals.
-  Each entry has:
+- **variable_fields**: List of generated or per-run values that tests must not treat as
+  hardcoded literals. May be `[]` if no variable fields are needed. Each entry has:
   - `token`: double-curly placeholder such as `{{project_title}}`
   - `entity`: entity field path such as `Project.title`
   - `scenarios`: list of scenario names that use this variable
   - `reason`: why this field must be generated
   - `test_reference`: how tests should refer to the value in natural language
-  - optional `generator`: free-form generation hint such as `derived from testRunId` or `backend-generated`
+  - optional `generator`: free-form generation hint such as `derived from testRunId`
 - **planning_sections**: A list describing which planning artifacts are present. It must include:
-  - `sdk_discover`
   - `schema_summary`
   - `relationship_map`
   - `variable_data_strategy`
-  - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation and you need to explain why fields were aggressively slugged with `{{testRunId}}`
+  - (optional) `scoping_analysis` — include this when the app lacks natural per-run isolation
+    and you need to explain why fields were aggressively slugged with `{{testRunId}}`
 
 ### After the frontmatter
 
 The rest of the file follows the standard scenarios.md format from the fetched instructions:
-- Include a `## SDK Discover` section summarizing the schema counts and scope field.
-- Include a `## Schema Summary` section listing the key models and required fields that drive the scenarios.
-- Include a `## Relationship Map` section describing the important parent/child and FK relationships.
-- Include a `## Variable Data Strategy` section explaining which values are generated and how tests should reference them.
-- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation — explain why fields were aggressively slugged with `{{testRunId}}` and what isolation boundary the slugging replaces.
+- Include a `## Schema Summary` section listing the key models and required fields driving the scenarios.
+- Include a `## Relationship Map` section describing parent/child and FK relationships.
+- Include a `## Variable Data Strategy` section explaining which values are generated and how tests reference them.
+- (Optional) Include a `## Scoping Analysis` section if the app lacks natural per-run isolation.
 - Scenario: `standard` (credentials, entity tables with concrete data, aggregate counts)
 - Scenario: `empty` (credentials, all entity types listed as None)
 - Scenario: `large` (credentials, high-volume data described in aggregate)
@@ -207,28 +229,24 @@ you'll receive an error message. Fix the issue and rewrite the file.
 
 The validation checks:
 - File starts with `---` (YAML frontmatter)
-- Frontmatter contains scenario_count, scenarios, entity_types, discover, variable_fields
-- Frontmatter contains planning_sections metadata
+- Frontmatter contains scenario_count, scenarios, entity_types, variable_fields, planning_sections
 - scenarios list length matches scenario_count
 - Required scenarios (standard, empty, large) are present
 - Each scenario has name, description, entity_types, total_entities
 - entity_types is a non-empty list with name fields
-- discover includes sdk source, schema counts, and scope field
 - variable_fields entries use double-curly tokens and known scenario names
-- planning_sections includes sdk_discover, schema_summary, relationship_map, and variable_data_strategy
+- planning_sections includes schema_summary, relationship_map, and variable_data_strategy
 
 ## Important
 
 - **The scenario data is a contract.** Fixed values are hard assertions; variable fields are explicit placeholders.
-- Prefer concrete literals for seed data unless the field truly must vary across runs.
+- Prefer concrete literals unless the field truly must vary across runs.
 - Use variables sparingly. A smaller, justified variable list is better than marking every identity field dynamic.
-- Do not default to `faker`. Prefer deterministic strategies such as planner-chosen literals with stable discriminator conventions, deriving from `testRunId`, or backend-generated values.
-- If a field can safely be a concrete literal for review and testing, keep it concrete.
-- Only include `generator` when the generation mechanism is important to communicate.
+- Do not default to `faker`. Prefer deterministic strategies — planner-chosen literals with stable discriminators, derivation from `testRunId`, or backend-generated values.
 - Every value must be concrete — not "some applications" but "3 applications: Marketing Website, Android App, iOS App"
 - Every relationship must be explicit — which entities belong to which
 - Every enum value must be covered in `standard`
-- Use the SDK discover output instead of re-deriving the schema from local code
-- If the discover artifact is missing, ask the user to provide a working SDK discover response
-- Only use `{{testRunId}}` as a template token — do not invent custom variable tokens like `{{user_email_alice}}`. The SDK template engine only resolves built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`, etc.). Custom tokens cause a runtime error when the dashboard sends the payload directly to the endpoint. If a field needs uniqueness, inline the testRunId directly: e.g. `alice-{{testRunId}}@test.local`
-- Design scenario entity tables so they can be expressed as a nested tree rooted at the scope entity. The Step 4 agent will convert scenarios into nested `create` payloads — flat cross-model `_ref` only structures break when JSON key order is not preserved
+- Use subagents to parallelize data model discovery
+- Only use `{{testRunId}}` as a template token in scenario BODIES (field values). Custom tokens like `{{user_email_alice}}` are only valid in `variable_fields` declarations — when the SDK resolves payloads at runtime it only knows built-in expressions (`{{testRunId}}`, `{{index}}`, `{{cycle(...)}}`). If a field needs uniqueness inside the scenario body, inline testRunId: e.g. `alice-{{testRunId}}@test.local`.
+- Design scenarios so each entity table can be serialised as a nested tree rooted at the scope entity. Flat cross-model `_ref`-only structures break when JSON key order is not preserved.
+- If the audit does not describe a model you need, ask the user rather than guessing.
diff --git a/agents/scenario-validator.md b/agents/scenario-validator.md
index b91a8b5..f5ec61c 100644
--- a/agents/scenario-validator.md
+++ b/agents/scenario-validator.md
@@ -1,7 +1,9 @@
 ---
 description: >
-  Validates planned scenarios against a live Autonoma SDK endpoint and writes
-  approved scenario recipes. Assumes SDK integration is already complete.
+  Validates the Environment Factory endpoint end-to-end by running discover/up/down
+  against every scenario, iteratively fixing handler bugs and reconciling scenarios.md
+  with the real behavior. Writes autonoma/.endpoint-validated on success. Hard gate
+  before E2E test generation.
 tools:
   - Read
   - Glob
@@ -11,207 +13,247 @@ tools:
   - Bash
   - Agent
   - WebFetch
-maxTurns: 60
+maxTurns: 120
 ---
 
-# Scenario Validator
-
-You validate the planned scenarios against an already-working Autonoma SDK endpoint.
-Your inputs are `autonoma/discover.json`, `autonoma/scenarios.md`, and the existing backend behavior.
-Your output is `autonoma/scenario-recipes.json`.
-You MUST also leave a terminal artifact in `autonoma/.scenario-validation.json`.
-
-## Goal
-
-Step 1 already handled SDK installation, endpoint wiring, secrets, branch creation, and any PR work.
-This step is validation-only. Your job is to:
-
-1. read the schema contract from `autonoma/discover.json`
-2. read the scenario intent from `autonoma/scenarios.md`
-3. smoke-test `discover`, `up`, and `down` against the live endpoint
-4. validate `standard`, `empty`, and `large`
-5. persist approved recipes to `autonoma/scenario-recipes.json`
-
-## Strict Prohibitions
-
-- Do NOT install packages.
-- Do NOT edit backend code.
-- Do NOT modify SDK source code.
-- Do NOT modify database schemas or migrations.
-- Do NOT create branches, commits, or PRs.
-- Do NOT try to "fix" validation failures by changing the SDK contract.
-
-If validation fails, report the backend or recipe issue clearly and stop. Treat failures as integration or scenario issues, not coding tasks for this step.
-On failure, still write `autonoma/.scenario-validation.json` with `status: "failed"` and all blocking issues.
-
-## Instructions
-
-1. Fetch the current SDK protocol reference:
-   - `https://docs.agent.autonoma.app/llms/guides/environment-factory.txt`
-
-2. Read:
-   - `autonoma/discover.json`
-   - `autonoma/scenarios.md`
-
-3. Read `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` from the environment.
-   - If `AUTONOMA_SDK_ENDPOINT` is missing or the endpoint is unreachable, stop and tell the user to check Step 1 or the local dev server status.
-   - Do not try to implement or repair the endpoint in this step.
-
-## Validation Requirements
-
-### Smoke-test the live endpoint
-
-At minimum:
-1. confirm `discover` works
-2. send one signed `up` request with a small inline `create` payload compatible with the schema
-3. send the corresponding signed `down` request using the returned `refsToken`
-4. verify cleanup succeeds
-
-### Scenario validation
-
-After the smoke test works, validate `standard`, `empty`, and `large` against the current backend.
-
-Prefer:
-1. backend-local `checkScenario` / `checkAllScenarios` if already available without code changes
-2. signed endpoint `up` / `down` validation otherwise
-
-Do not change the backend if validation fails. Report the failure and stop.
-
-## Recipe Shape Requirements
-
-Write `autonoma/scenario-recipes.json` in this exact logical shape:
-
-```json
-{
-  "version": 1,
-  "source": {
-    "discoverPath": "autonoma/discover.json",
-    "scenariosPath": "autonoma/scenarios.md"
-  },
-  "validationMode": "sdk-check",
-  "recipes": [
-    {
-      "name": "standard",
-      "description": "Realistic dataset for core flows",
-      "create": {
-        "Organization": [{
-          "_alias": "org1",
-          "name": "Acme Corp"
-        }]
-      },
-      "variables": {
-        "testRunId": {
-          "strategy": "derived",
-          "source": "testRunId",
-          "format": "{testRunId}"
-        }
-      },
-      "validation": {
-        "status": "validated",
-        "method": "checkScenario",
-        "phase": "ok",
-        "up_ms": 12,
-        "down_ms": 8
-      }
-    }
-  ]
-}
-```
-
-Required rules:
-- top-level keys must be `version`, `source`, `validationMode`, and `recipes`
-- `version` must be integer `1`
-- `source.discoverPath` must be `autonoma/discover.json`
-- `source.scenariosPath` must be `autonoma/scenarios.md`
-- `validationMode` must be `sdk-check` or `endpoint-lifecycle`
-- `recipes` must include `standard`, `empty`, and `large`
-- every recipe must contain `name`, `description`, `create`, and `validation`
-- every `validation` object must contain:
-  - `status: "validated"`
-  - `method`: one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`
-  - `phase: "ok"`
-  - optional `up_ms` / `down_ms` as non-negative integers
-
-### Nested tree requirement
-
-Recipe `create` payloads MUST use a nested tree rooted at the scope entity.
-Do NOT use flat top-level model keys connected only by `_ref`.
-
-Children must be nested under their parent using the relation field names from `discover.json`.
-Use `_ref` only for cross-branch references that cannot be expressed through nesting.
-
-### Variables requirement
-
-If `create` contains `{{token}}` placeholders, include a `variables` object for that recipe.
-
-Allowed strategies:
-- `literal`
-- `derived`
-- `faker`
-
-Rules:
-- every `{{token}}` in `create` must have a matching key in `variables`
-- every key in `variables` must be used in `create`
-- fully concrete recipes do not need `variables`
-- if the backend requires explicit scalar foreign-key values in addition to nested trees, include those scalar assignments using `_ref`-resolved values
-- any collision-prone unique value must be derived from `testRunId`
-
-Do not write the old shape. In particular, do not use:
-- top-level `generatedAt`
-- top-level `scenarios`
-- per-recipe `validated`
-- per-recipe `timing`
-
-## Preflight Endpoint Validation
-
-After writing `autonoma/scenario-recipes.json`, you MUST run:
-
-```bash
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json
-```
-
-This requires:
-- `AUTONOMA_SDK_ENDPOINT`
-- `AUTONOMA_SHARED_SECRET`
-
-If preflight fails, do NOT rewrite backend code. Report the failure clearly and stop.
-
-Before returning, always write `autonoma/.scenario-validation.json` with this shape:
-
-```json
-{
-  "status": "ok",
-  "preflightPassed": true,
-  "smokeTestPassed": true,
-  "validatedScenarios": ["standard", "empty", "large"],
-  "failedScenarios": [],
-  "blockingIssues": [],
-  "recipePath": "autonoma/scenario-recipes.json",
-  "validationMode": "sdk-check",
-  "endpointUrl": "http://localhost:3000/api/autonoma"
-}
-```
-
-If the step fails, keep the same shape but set:
-- `status: "failed"`
-- `preflightPassed: false` when preflight did not pass
-- `failedScenarios` to the scenarios that failed
-- `blockingIssues` to the concrete validation/runtime blockers
-
-## What to Explain to the User
-
-When finished, explain:
-1. the endpoint that was validated
-2. whether the smoke `discover -> up -> down` lifecycle passed
-3. whether `standard`, `empty`, and `large` validated successfully
-4. what validation method was used
-5. where `autonoma/scenario-recipes.json` was written
-6. where `autonoma/.scenario-validation.json` was written
-7. any remaining manual deployment or backend issues that need attention
-
-## Important
-
-- Treat `discover.json` as the schema contract and `scenarios.md` as the scenario intent.
-- Assume SDK integration is already complete.
-- If the endpoint is down, tell the user to restart or redeploy the Step 1 integration instead of attempting code edits here.
-- The orchestrator must be able to trust `autonoma/.scenario-validation.json` as the only terminal-state signal for this step.
+# Scenario Validator: iterative fix loop + reality reconciliation
+
+The Environment Factory endpoint exists (step 4 wrote `autonoma/.endpoint-implemented`).
+Your job is to prove it actually works and keep iterating until it does. The E2E test
+generator (step 6) is gated on your sentinel — if you do not write
+`autonoma/.endpoint-validated`, no tests get generated.
+
+## Database Safety (absolute)
+
+- ALL writes go through the SDK endpoint only. Never INSERT/UPDATE/DELETE/DROP/TRUNCATE via psql or raw SQL.
+- You MAY run SELECT via psql / ORM read queries to verify data.
+- The SDK's `down` action deletes only what `up` created (signed refs token).
+
+## Inputs
+
+- `autonoma/entity-audit.md` — every model and whether it needs a factory
+- `autonoma/scenarios.md` — scenario definitions (may contain mistakes you will correct)
+- The handler file created in step 4
+- A running dev server (start one if it is not up — ask the user for the port)
+- `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` (for HMAC signing + preflight)
+
+## Outputs
+
+- `autonoma/scenario-recipes.json` — validated nested `create` trees per scenario
+- `autonoma/.scenario-validation.json` — terminal artifact the orchestrator reads
+- `autonoma/.endpoint-validated` — sentinel that gates Step 6 (test generation)
+
+## The loop
+
+Repeat until all three actions succeed for every scenario OR you exhaust 5 iterations
+(if you hit 5, STOP and report — do not fake success):
+
+1. Fetch the protocol docs (first iteration only):
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/protocol.txt"
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/scenarios.txt"
+   ```
+
+   If curl fails, STOP and report — do not fabricate a URL.
+
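+2. Export working secrets. They MUST be the same values the handler reads (e.g. from `.env`); the random fallback below only works if the dev server is started with these same exported values, otherwise HMAC verification fails: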
+
+   ```bash
+   export AUTONOMA_SHARED_SECRET=${AUTONOMA_SHARED_SECRET:-$(openssl rand -hex 32)}
+   export AUTONOMA_SIGNING_SECRET=${AUTONOMA_SIGNING_SECRET:-$(openssl rand -hex 32)}
+   ```
+
+3. Run `discover` via curl with proper HMAC.
+   - The response MUST contain `schema.models`, `schema.edges`, `schema.relations`, `schema.scopeField`.
+   - **Coverage check**: every model in `entity-audit.md` MUST appear in `schema.models`. If one is missing, fix the handler's model filter / adapter config and restart the loop.
+   - **Factory coverage check**: open the handler file(s), extract the registered factory names. Every model with `independently_created: true` in the audit MUST be registered.
+   - **Factory-body integrity check (deterministic, MANDATORY)**: this is the check the env-factory agent is supposed to run before writing its sentinel. Re-run it here; do not trust the upstream. Steps:
+     1. Grep the handler file(s) for raw DB/ORM writes. The pattern set must cover every
+        language and ORM the SDK supports — any of these appearing inside a factory body for a
+        model with `independently_created: true` is a FAIL:
+        ```bash
+        # TypeScript/JavaScript — Prisma, Drizzle, Knex, Sequelize, TypeORM, Mongoose
+        grep -nE '(prisma|db|tx|trx)\.[a-zA-Z_]+\.(create|createMany|upsert)\(|\b(drizzle|db|tx)\.insert\(|\bknex\([^)]*\)\.insert\(|\.models\.[A-Za-z_]+\.create\(|getRepository\([^)]*\)\.save\(|\bMongoose.*\.create\(' <handler-file>
+
+        # Python — SQLAlchemy, Django ORM
+        grep -nE '\bsession\.(add|execute|bulk_insert_mappings)\(|\.objects\.create\(|\.save\(\)' <handler-file>
+
+        # Ruby/Rails — ActiveRecord
+        grep -nE '\b[A-Z][A-Za-z0-9]*\.(create|create!|insert|insert_all)\(|\.new\([^)]*\)\.save' <handler-file>
+
+        # PHP/Laravel — Eloquent, raw DB
+        grep -nE '\b[A-Z][A-Za-z0-9]*::create\(|->save\(\)|\bDB::table\([^)]*\)->insert\(' <handler-file>
+
+        # Java/Spring — JPA, JDBC
+        grep -nE '\bentityManager\.persist\(|\b[a-zA-Z]+Repository\.save\(|\bjdbcTemplate\.update\(' <handler-file>
+
+        # Go — GORM, database/sql, squirrel
+        grep -nE '\.Create\(|\bdb\.Exec(Context)?\(|\bsq\.Insert\(' <handler-file>
+
+        # Elixir/Ecto
+        grep -nE '\bRepo\.(insert|insert!|insert_all)\(' <handler-file>
+
+        # Rust — Diesel, SQLx, SeaORM
+        grep -nE '\bdiesel::insert_into\(|\bsqlx::query!?\("INSERT|ActiveModel[^{]*\.insert\(' <handler-file>
+
+        # Raw SQL INSERT in any language
+        grep -niE '"[^"]*INSERT\s+INTO\b|'"'"'[^'"'"']*INSERT\s+INTO\b' <handler-file>
+        ```
+        Use the pattern set appropriate for the project's stack (determined from the handler file
+        and `entity-audit.md`); include the raw-SQL pattern unconditionally. Any match that
+        falls inside a factory body for an `independently_created: true` model is a FAIL.
+     2. For each `(model, creation_file, creation_function)` from `entity-audit.md`, verify the handler contains both an `import` resolving to `creation_file` AND an invocation of `creation_function` inside that model's factory body.
+     3. If any model fails either check, this is a **handler bug** (path a of step 4.3 below). Fix by importing and calling the audited function. If the audit pointed at an inline route handler (no exported function), extract it into a named exported function in a nearby module, replace the route body with a call to the new function, update `entity-audit.md` in place with the new `creation_file`/`creation_function`, then restart this step.
+     4. The validator MUST NOT write `.endpoint-validated` while any factory body contains a raw ORM create for its own model.
+
+4. For each scenario in `scenarios.md`:
+   1. Build the `{action:"up", create:..., testRunId:"<scenario>-<iteration>"}` body from the scenario.
+   2. HMAC-sign and POST.
+   3. If non-200 or error body, pick one of three paths:
+      a. **Handler bug** (missing factory, bad FK handling, wrong adapter config) → fix the handler and restart.
+      b. **Scenario bug** (field does not exist on the model, FK target wrong, scope field missing) → edit `scenarios.md` to match reality and restart. Log the change.
+      c. **Unfeasible scenario** (requires data the app cannot produce) → REMOVE the scenario from `scenarios.md` with justification. Restart.
+   4. If 200: parse `auth`, `refs`, `refsToken`.
+      - **Auth check**: `auth` MUST be non-null and contain at least one of `{ cookies, headers, token, user }`. If empty, the auth callback is not wired — fix it and restart.
+      - **Refs check**: every top-level model in the `create` tree MUST appear in `refs`.
+   5. Verify DB state with a read-only `SELECT` for at least one refs id.
+   6. POST `{action:"down", refsToken}`. Expect `{ok:true}`.
+   7. Verify the refs rows are gone.
+
+5. After every scenario passes cleanly, emit the scenario recipes.
+
+   Write `autonoma/scenario-recipes.json` with this shape (recipes mirror the `create`
+   trees you just validated — one entry per scenario):
+
+   ```json
+   {
+     "version": 1,
+     "source": {
+       "scenariosPath": "autonoma/scenarios.md"
+     },
+     "validationMode": "endpoint-lifecycle",
+     "recipes": [
+       {
+         "name": "standard",
+         "description": "Realistic dataset for core flows",
+         "create": {
+           "Organization": [{
+             "_alias": "org1",
+             "name": "Acme Corp"
+           }]
+         },
+         "variables": {
+           "testRunId": {
+             "strategy": "derived",
+             "source": "testRunId",
+             "format": "{testRunId}"
+           }
+         },
+         "validation": {
+           "status": "validated",
+           "method": "endpoint-up-down",
+           "phase": "ok",
+           "up_ms": 12,
+           "down_ms": 8
+         }
+       }
+     ]
+   }
+   ```
+
+   Rules:
+   - top-level keys MUST be exactly `version`, `source`, `validationMode`, `recipes`
+   - `version` must be integer `1`
+   - `validationMode` must be `sdk-check` or `endpoint-lifecycle` (use `endpoint-lifecycle`
+     when you drove up/down via HTTP in the loop above)
+   - `recipes` MUST include `standard`, `empty`, and `large`
+   - every recipe MUST contain `name`, `description`, `create`, and `validation`
+   - every `validation` object MUST contain `status: "validated"`, `phase: "ok"`, and a
+     valid `method` (one of `checkScenario`, `checkAllScenarios`, `endpoint-up-down`)
+   - **Nested tree**: `create` MUST use a nested tree rooted at the scope entity. Do NOT
+     use flat top-level model keys connected only by `_ref`. Nest children under their
+     parent using relation field names. Use `_ref` only for cross-branch references that
+     cannot be expressed through nesting.
+   - **Variables**: if `create` contains `{{token}}` placeholders, include a `variables`
+     object. Every `{{token}}` in `create` must match a key in `variables`; every key
+     in `variables` must be used in `create`. Fully concrete recipes do not need `variables`.
+     Allowed strategies: `literal`, `derived`, `faker`. Any collision-prone unique value
+     must be derived from `testRunId`.
+   - Do NOT write the legacy shape — no top-level `generatedAt`, no top-level `scenarios`,
+     no per-recipe `validated`, no per-recipe `timing`.
+
+6. Run preflight on the emitted recipes:
+
+   ```bash
+   python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" \
+     autonoma/scenario-recipes.json
+   ```
+
+   This resolves tokenized payloads and re-runs signed up/down against the live endpoint.
+   Requires `AUTONOMA_SDK_ENDPOINT` and `AUTONOMA_SHARED_SECRET` in the environment.
+
+   If preflight exits non-zero, fix the failing recipe (or the corresponding scenario) and
+   re-run. Do NOT record `status: "ok"` in step 7 or write the step 8 sentinel until preflight passes.
+
+7. Write the terminal artifact `autonoma/.scenario-validation.json` with this shape:
+
+   ```json
+   {
+     "status": "ok",
+     "preflightPassed": true,
+     "smokeTestPassed": true,
+     "validatedScenarios": ["standard", "empty", "large"],
+     "failedScenarios": [],
+     "blockingIssues": [],
+     "recipePath": "autonoma/scenario-recipes.json",
+     "validationMode": "endpoint-lifecycle",
+     "endpointUrl": "http://localhost:3000/api/autonoma"
+   }
+   ```
+
+   On failure keep the same shape with `status: "failed"`, `preflightPassed: false` when
+   preflight did not pass, populated `failedScenarios`, and concrete `blockingIssues`.
+
+8. Write the sentinel `autonoma/.endpoint-validated` (only when step 7 recorded `status: "ok"`).
+
+   Use the `Write` tool (NOT `touch` — the hook fires only on `Write`/`Edit`) with a short
+   plain-text report:
+
+   ```
+   Validated N scenarios across M models.
+   - discover: all audited models present, all independently_created factories registered
+   - up: all N scenarios created successfully, auth returned {cookies|headers|token|user}
+   - down: all N scenarios cleaned up, no orphans
+   - recipes: autonoma/scenario-recipes.json emitted, preflight passed
+   - scenarios.md edits: <list of changes you made, or "none">
+   ```
+
+## Iteration discipline
+
+- One handler fix per iteration, then re-run everything. Do not chain fixes blindly.
+- If the same scenario fails twice in a row with the same error, the scenario itself is probably wrong — prefer editing `scenarios.md` over contorting the handler.
+- If you have edited `scenarios.md`, re-read it from disk after every edit.
+
+## When you hit the 5-iteration cap
+
+STOP, write `autonoma/.scenario-validation.json` with `status: "failed"` (same shape as step 7), and write a clear failure report. Do NOT write `.endpoint-validated`. Include:
+
+- the last failing curl body + response
+- which scenario(s) failed
+- which handler file + line range is most likely at fault
+
+The orchestrator will surface this to the user, who can intervene manually.
+
+## scenarios.md reconciliation rules
+
+When you edit `scenarios.md`, preserve the frontmatter shape (the validator hook checks
+it). Allowed:
+
+- Drop a scenario entirely (decrement `scenario_count`, update the `scenarios` summary).
+- Remove/rename fields on a model to match what `discover` reports.
+- Adjust FK aliases so they reference models that actually exist.
+- Flatten cross-branch references that the handler cannot resolve.
+
+Disallowed: silently changing a scenario's intent (e.g. renaming "admin with one project"
+to "user with one project" without reflecting that in the description).
diff --git a/agents/sdk-integrator.md b/agents/sdk-integrator.md
deleted file mode 100644
index addd455..0000000
--- a/agents/sdk-integrator.md
+++ /dev/null
@@ -1,301 +0,0 @@
----
-description: >
-  Detects the project stack, installs the Autonoma SDK from package managers,
-  wires the endpoint, starts a local dev server, verifies discover/up/down, and
-  opens a PR when possible.
-tools:
-  - Read
-  - Glob
-  - Grep
-  - Write
-  - Edit
-  - Bash
-  - Agent
-  - WebFetch
-maxTurns: 60
----
-
-# SDK Integrator
-
-You implement the Autonoma SDK integration as the first step of the planner pipeline.
-
-## Goal
-
-Detect the stack, install the SDK from package managers, add a minimal endpoint following the matching example or SDK README, ensure secrets exist, start a dev server, verify `discover`, `up`, and `down`, and prepare the repo for user review.
-
-The SDK reference repo path is provided by the orchestrator in `/tmp/autonoma-sdk-ref-dir`. Treat that repo as read-only reference material only.
-
-## Strict Rules
-
-- Install the SDK from package managers only. Never vendor, copy, or link SDK source into the user's app.
-- **Never create a standalone server or sidecar.** The endpoint lives as a new route inside the project's existing backend. Do NOT create a new `FastAPI()` / `express()` / `Flask(__name__)` / `Gin.Default()` instance, a new `main.py` / `server.py` / `start-*.py` / `main.go` launcher, or open a separate port. If the project already has a backend, integrate into it.
-- **SDK language must match backend language.** Detect the backend's language from its manifest file BEFORE picking an SDK. Do not install the Python SDK into a TypeScript/NestJS project (or vice versa). If no matching SDK exists for the backend language, stop per Step 3 — do NOT fall back to a sidecar in a different language.
-- **Never scaffold at repo root when a backend directory exists**, including non-standard names like `core-app-backend/`, `apps/api/`, `services/core/`. Locate the backend first (Step 1).
-- Do NOT modify the SDK reference repo.
-- Do NOT modify database schemas, migrations, or models.
-- Keep integration changes minimal and aligned with the project's existing conventions.
-- Do NOT commit `.env`.
-- Do NOT commit anything under `autonoma/`.
-- You MUST leave a machine-readable terminal artifact in `autonoma/.sdk-integration.json` whether the step succeeds or fails.
-- Do NOT report success unless both `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` have been written.
-
-## Required Order
-
-### 1. Locate the backend directory and detect the stack
-
-**Do this BEFORE picking an SDK.** The SDK must match the backend's language, so the backend must be located first.
-
-#### 1a. Enumerate candidate backend directories
-
-Use Glob / `ls`. Do NOT hardcode the name `backend/`. Real projects use many conventions:
-
-- `backend/`, `server/`, `api/`, `service/`, `services/`
-- `*-backend/`, `*-api/`, `*-server/`, `core-*/`, `app-*/` (e.g. `core-app-backend/`)
-- Monorepo layouts: `apps/*`, `packages/*`, `services/*`
-- Single-repo backends at the workspace root
-
-#### 1b. Identify the backend by manifest file
-
-For each candidate, look for one of these manifest files — the file's language determines the SDK you install:
-
-- `package.json`, `package-lock.json`, `pnpm-lock.yaml`, `yarn.lock` → TypeScript/JavaScript
-- `pyproject.toml`, `requirements.txt`, `Pipfile` → Python
-- `mix.exs` → Elixir
-- `composer.json` → PHP
-- `pom.xml`, `build.gradle` → Java
-- `Gemfile`, `*.gemspec` → Ruby
-- `Cargo.toml` → Rust
-- `go.mod` → Go
-
-Pick exactly one backend. If multiple plausible candidates exist, STOP and ask the user which one — do not guess, do not implement in more than one.
-
-#### 1c. Confirm with the user before writing any code
-
-State your finding:
-
-> "I found the backend at `<path>` (language: `<lang>`, framework: `<framework>`, ORM: `<orm>`, package manager: `<pm>`). I'll integrate the SDK there. Is that the right location?"
-
-Wait for confirmation before installing packages or writing files.
-
-#### 1d. Determine the rest of the stack
-
-From the identified backend directory, determine:
-- language
-- server framework
-- ORM or DB adapter
-- package manager
-
-### 2. Map the stack to the SDK docs matrix
-
-Use the matching runnable example from the SDK reference repo when available.
-Otherwise use the documented SDK package combinations from SDK READMEs.
-
-Supported docs matrix:
-- TypeScript: `@autonoma-ai/sdk` plus the matching ORM/server packages
-- Python: `autonoma-sdk[...]`
-- Elixir: `autonoma_sdk`
-- PHP: `autonoma-ai/sdk`
-- Java: `com.autonoma.ai:autonoma-sdk`
-- Ruby: `autonoma-ai`
-- Rust: `autonoma-sdk`
-- Go: `github.com/autonoma-ai/sdk-go`
-
-### 3. Stop immediately if unsupported
-
-If the detected stack is not supported, stop and output a `mailto:` link to `support@autonoma.app`.
-
-The mailto body must include:
-- detected language
-- detected framework
-- detected ORM or DB layer
-- detected package manager
-- repo name or directory name
-
-### 4. Create a branch
-
-Create a branch in the user repo:
-- preferred base name: `autonoma/feat-autonoma-sdk`
-- if it already exists, append `-2`, `-3`, and so on
-
-### 5. Install SDK packages
-
-Use the project's package manager.
-
-Examples:
-- TypeScript + Express + Prisma:
-  - `npm install @autonoma-ai/sdk @autonoma-ai/sdk-prisma @autonoma-ai/server-express`
-- TypeScript + Next.js + Drizzle:
-  - `pnpm add @autonoma-ai/sdk @autonoma-ai/sdk-drizzle @autonoma-ai/server-web`
-- Python + FastAPI + SQLAlchemy:
-  - `pip install "autonoma-sdk[sqlalchemy,fastapi]"`
-- Python + Django:
-  - `pip install "autonoma-sdk[django]"`
-- Elixir + Phoenix + Ecto:
-  - add `{:autonoma_sdk, "~> 0.1"}`
-
-### 6. Implement the endpoint
-
-Follow the matching example or README pattern with minimal project-specific glue.
-
-Requirements:
-- match the repo's routing conventions
-- preserve existing auth/session patterns if the SDK auth callback needs them
-- implement the current SDK contract for `discover`, `up`, and `down`
-- do not create a throwaway second app or server if the project already has one
-
-### 7. Ensure secrets exist
-
-Check `.env` first if present.
-
-Ensure:
-- `AUTONOMA_SHARED_SECRET`
-- `AUTONOMA_SIGNING_SECRET`
-
-If missing:
-- generate with `openssl rand -hex 32`
-- ensure the two secrets differ
-- append or update `.env`
-- append or update `.env.example` with placeholder values and short comments
-
-Suggested comments:
-
-```env
-# Autonoma SDK - shared secret for HMAC request signing
-AUTONOMA_SHARED_SECRET=your-shared-secret-here
-# Autonoma SDK - private secret for signing refs tokens
-AUTONOMA_SIGNING_SECRET=your-signing-secret-here
-```
-
-### 8. Ensure planner artifacts are not committed
-
-If `/autonoma/` is not already ignored, add it to `.git/info/exclude`.
-
-### 9. Detect and run the dev server
-
-Prefer the repo's existing dev/start script or command.
-
-Examples to inspect:
-- package scripts: `dev`, `start:dev`, `start`
-- `Makefile`
-- `Procfile`
-- Django `manage.py runserver`
-- Phoenix `mix phx.server`
-
-If a suitable server is already running and the expected endpoint responds, reuse it.
-Otherwise start one in the background and persist its PID to:
-
-```bash
-/tmp/autonoma-dev-server-pid
-```
-
-### 10. Verify endpoint behavior
-
-Run signed checks against the live endpoint:
-1. `discover`
-2. minimal `up`
-3. `down` using returned `refsToken`
-
-Do not continue if any of these fail.
-
-### 11. Write the verified endpoint URL
-
-Write the final endpoint URL to:
-
-```text
-autonoma/.sdk-endpoint
-```
-
-The file must contain only one absolute URL.
-
-### 12. Write the integration handoff artifact
-
-Write `autonoma/.sdk-integration.json` with this shape:
-
-```json
-{
-  "status": "ok",
-  "endpointUrl": "http://localhost:3000/api/autonoma",
-  "endpointPath": "/api/autonoma",
-  "stack": {
-    "language": "TypeScript",
-    "framework": "Express",
-    "orm": "Prisma",
-    "packageManager": "pnpm"
-  },
-  "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma"],
-  "sharedSecretPresent": true,
-  "signingSecretPresent": true,
-  "devServer": {
-    "startedByPlugin": true,
-    "pid": 12345
-  },
-  "verification": {
-    "discover": { "status": "ok", "validatedByPlugin": true },
-    "up": { "status": "ok" },
-    "down": { "status": "ok" }
-  },
-  "branch": {
-    "name": "autonoma/feat-autonoma-sdk"
-  },
-  "pr": {
-    "url": "https://github.com/..."
-  },
-  "blockingIssues": []
-}
-```
-
-If the step fails after doing any work, still write `autonoma/.sdk-integration.json` with:
-- `status: "failed"`
-- the best known values for stack, endpoint, server pid, and branch
-- failed verification statuses
-- every blocking issue listed in `blockingIssues`
-
-### 13. Commit only integration changes
-
-Stage only the SDK integration changes, such as:
-- route or handler files
-- package-manager manifests and lockfiles
-- `.env.example`
-- any required config files
-
-Do NOT stage:
-- `.env`
-- `autonoma/`
-
-Commit message:
-
-```text
-feat: integrate autonoma sdk
-```
-
-### 14. Create a PR when possible
-
-If `gh` is available:
-- push the branch
-- create a PR
-
-Include a summary, required env vars, deployment reminder, and:
-
-```text
-Co-authored-by: Autonoma <noreply@autonoma.app>
-```
-
-If `gh` is unavailable, report the exact manual next steps instead.
-
-### 15. Final report
-
-Explain:
-1. detected stack
-2. installed packages
-3. endpoint path and URL
-4. where secrets were added
-5. dev server PID
-6. PR URL or manual push/PR steps
-7. where `autonoma/.sdk-endpoint` and `autonoma/.sdk-integration.json` were written
-
-## Verification Notes
-
-- Use the SDK reference repo in `/tmp/autonoma-sdk-ref-dir` only for examples and package-selection guidance.
-- Prefer existing project conventions over generic examples when file placement differs.
-- If the project already contains a partial SDK integration, extend it rather than replacing it.
-- If lifecycle verification passes but artifact writing fails, the step is still incomplete.
diff --git a/agents/test-case-generator.md b/agents/test-case-generator.md
index 5eccfaf..ee951f0 100644
--- a/agents/test-case-generator.md
+++ b/agents/test-case-generator.md
@@ -27,17 +27,33 @@ Your output is a directory `autonoma/qa-tests/` containing:
 
 ## Instructions
 
-1. First, fetch the latest test generation instructions:
+1. All Autonoma documentation MUST be fetched via `curl` in the Bash tool. Do NOT use
+   WebFetch. Do NOT write any URL yourself. The docs base URL lives only in
+   `autonoma/.docs-url`, written by the orchestrator before any subagent runs.
 
-   Use WebFetch to read `https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt`
-   and follow those instructions for how to generate tests.
+   To fetch a doc, run the bash command literally — the shell expands the path, not you:
 
-2. Read all input files:
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/<path>"
+   ```
+
+   If `curl` exits non-zero for any reason, **STOP the pipeline** and report the exit code
+   and stderr. Do not invent a URL. Do not retry with a different host. There is no fallback.
+
+2. Fetch the latest test generation instructions:
+
+   ```bash
+   curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-e2e-tests.txt"
+   ```
+
+   Read the output and follow those instructions for how to generate tests.
+
+3. Read all input files:
    - `autonoma/AUTONOMA.md` — parse the frontmatter to get core_flows and feature_count
    - All files in `autonoma/skills/`
    - `autonoma/scenarios.md` — parse the frontmatter to get scenarios, entity_types, and **variable_fields**
 
-3. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter
+4. **Variable fields are dynamic data.** The `variable_fields` list in scenarios.md frontmatter
    declares which values change between test runs (e.g. emails, dates, deadlines). Each entry has
    a `token` (like `{{user_email_1}}`), the `entity` field it belongs to, and a `test_reference`.
    When writing test steps that involve a variable field value — typing it, asserting it, or
@@ -48,7 +64,7 @@ Your output is a directory `autonoma/qa-tests/` containing:
    - good: "assert the task deadline shows `{{deadline_1}}`"
    - bad: "assert the task deadline shows 2025-06-15"
 
-4. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
+5. Treat `autonoma/scenarios.md` as fixture input, not as the subject under test.
    The scenarios exist only to provide preconditions and known data for app behavior tests.
    Do NOT generate tests whose purpose is to verify:
    - that the scenario contains the documented entity counts
@@ -61,17 +77,17 @@ Your output is a directory `autonoma/qa-tests/` containing:
    - good: "open the project `{{project_title}}` and verify editing works"
    - bad: "verify the scenario created 12 projects and 3 users"
 
-5. Count the routes/features/pages in the codebase to establish the coverage correlation.
+6. Count the routes/features/pages in the codebase to establish the coverage correlation.
    The total test count should roughly correlate:
    - Rule of thumb: 3-5 tests per route/feature for supporting flows
    - Rule of thumb: 8-15 tests per core flow
    - This is approximate — use judgment, but the INDEX must declare the correlation
 
-6. Generate test files organized in subdirectories by feature/flow.
+7. Generate test files organized in subdirectories by feature/flow.
 
-7. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
+8. Write `autonoma/qa-tests/INDEX.md` FIRST (before individual test files).
 
-8. Write individual test files into subdirectories.
+9. Write individual test files into subdirectories.
 
 ## CRITICAL: INDEX.md Format
 
@@ -196,4 +212,3 @@ you'll receive an error message. Fix the issue and rewrite the file.
 - Use subagents to parallelize test generation across folders
 - Each test must be self-contained — no dependencies on other tests
 - Do not write code (no Playwright, no Cypress) — tests are markdown with natural language steps
-- Prefer testing visible user outcomes over seed correctness or fixture inventory
diff --git a/commands/generate-tests.md b/commands/generate-tests.md
index 7f0bbc2..4ccc236 100644
--- a/commands/generate-tests.md
+++ b/commands/generate-tests.md
@@ -9,644 +9,183 @@ description: >
 
 # Autonoma E2E Test Generation Pipeline
 
-You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## User Confirmation Between Steps
+## CRITICAL: User Confirmation Between Steps
 
-By default, after each step (1, 2, 3, and 4), present the summary and automatically proceed to the
-next step once validation passes.
+After each of steps 1, 2, 3, 4, and 5 you MUST present that step's summary and ask the user for
+confirmation using `AskUserQuestion`. Then wait for the response; only proceed once the user confirms.
 
-**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after
-Steps 1-4.
+## How lifecycle reporting works
 
-**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`,
-that means auto-advance as well.
+You do NOT issue `curl` commands to report step starts, step completions, or artifact uploads. Plugin hooks do that:
 
-If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation
-using the `AskUserQuestion` tool.
-
-After calling `AskUserQuestion`, wait for the user's response.
-Only proceed to the next step after they confirm.
+- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`.
+- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. It validates output,
+  emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate
+  (test files cannot be written until `autonoma/.endpoint-validated` exists).
 
 ## Before Starting
 
-Create the output directory and save the project root:
-
-```bash
-AUTONOMA_ROOT="$(pwd)"
-echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma autonoma/skills autonoma/qa-tests
-cleanup_dev_server() {
-  DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '')
-  if [ -n "$DEV_SERVER_PID" ]; then
-    kill "$DEV_SERVER_PID" 2>/dev/null || true
-    rm -f /tmp/autonoma-dev-server-pid
-    echo "Dev server (PID $DEV_SERVER_PID) stopped."
-  fi
-}
-```
-
-The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write:
-
-```bash
-PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
-```
-
-Read the environment variables required for reporting progress back to Autonoma:
-- `AUTONOMA_API_KEY`
-- `AUTONOMA_PROJECT_ID`
-- `AUTONOMA_API_URL`
-- `AUTONOMA_AUTO_ADVANCE` — optional, canonical
-- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias
-
-Add shared helpers before running the pipeline:
-
 ```bash
-auto_advance_enabled() {
-  if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then
-    return 0
-  fi
-  if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then
-    return 0
-  fi
-  return 1
-}
-
-refresh_generation_id() {
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-  GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-}
-
-build_event_payload() {
-  python3 - "$1" "$2" "$3" <<'PY'
-import json
-import sys
-
-event_type, key, value = sys.argv[1:4]
-print(json.dumps({"type": event_type, "data": {key: json.loads(value)}}))
-PY
-}
-
-build_step_payload() {
-  python3 - "$1" "$2" "$3" <<'PY'
-import json
-import sys
-
-event_type, step, name = sys.argv[1:4]
-print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}}))
-PY
-}
-
-post_setup_event_blocking() {
-  refresh_generation_id
-  payload="$1"
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  for attempt in 1 2 3; do
-    if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d "$payload" >/dev/null; then
-      return 0
-    fi
-    sleep "$attempt"
-  done
-  echo "ERROR: Failed to post blocking setup event after retries: $payload"
-  return 1
-}
-
-post_setup_log() {
-  refresh_generation_id
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")")
-  curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-    -H "Content-Type: application/json" \
-    -d "$payload" >/dev/null || true
-}
-
-patch_setup_status_blocking() {
-  refresh_generation_id
-  status="$1"
-  message="$2"
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  payload=$(python3 - "$status" "$message" <<'PY'
-import json
-import sys
-
-body = {"status": sys.argv[1]}
-if sys.argv[2]:
-    body["errorMessage"] = sys.argv[2]
-print(json.dumps(body))
-PY
-)
-  for attempt in 1 2 3; do
-    if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d "$payload" >/dev/null; then
-      return 0
-    fi
-    sleep "$attempt"
-  done
-  echo "ERROR: Failed to patch setup status after retries: $status"
-  return 1
-}
-
-report_error_and_exit() {
-  message="$1"
-  preserve_dev_server="${2:-false}"
-  payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")")
-  post_setup_event_blocking "$payload" || true
-  echo "ERROR: $message"
-  if [ "$preserve_dev_server" != "true" ]; then
-    cleanup_dev_server
-  fi
-  exit 1
-}
-
-report_partial_failure_and_exit() {
-  message="$1"
-  post_setup_log "$message"
-  patch_setup_status_blocking "partial_failure" "$message" || true
-  echo "ERROR: $message"
-  cleanup_dev_server
-  exit 1
-}
-
-rehydrate_sdk_env() {
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-  AUTONOMA_SDK_ENDPOINT=$(tr -d '\n' < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" 2>/dev/null || echo '')
-  AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
-  AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
-  export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET
-  if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then
-    return 1
-  fi
-  return 0
-}
+mkdir -p autonoma/skills autonoma/qa-tests
 ```
 
-Prepare the SDK reference repo for Step 1:
+The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`.
 
-```bash
-SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}"
-if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then
-  echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
-else
-  SDK_REF_DIR="$(mktemp -d)/autonoma-sdk"
-  if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then
-    echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
-  else
-    echo "ERROR: Unable to prepare the SDK reference repo."
-    cleanup_dev_server
-    exit 1
-  fi
-fi
-```
-
-Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug.
+## Step 1: Generate Knowledge Base
 
-Create the generation record so the dashboard can track progress in real time:
+Spawn `kb-generator`:
 
-```bash
-RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}")
-HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
-echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
-GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
-echo "$GENERATION_ID" > autonoma/.generation-id
-echo "Generation ID: $GENERATION_ID"
-```
+> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML
+> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill
+> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals).
+> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`.
 
-If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway.
+After completion: verify the files exist, present the core_flows table, call `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single-character body).
 
-## Step 1: SDK Integration
+## Step 2: Entity Creation Audit
 
-Report step start:
+Spawn `entity-audit-generator`:
 
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start."
-post_setup_log "Detecting stack and integrating the Autonoma SDK..."
-```
-
-Spawn the `sdk-integrator` subagent with the following task:
-
-> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context.
-> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with
-> a `mailto:support@autonoma.app` link if unsupported.
-> Create a branch, install the SDK from package managers only, implement the SDK endpoint following
-> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`
-> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server,
-> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and
-> `autonoma/.sdk-integration.json`, commit with
-> `feat: integrate autonoma sdk`, and create a PR if `gh` is available.
-> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models.
-
-**After the subagent completes:**
-1. Verify `autonoma/.sdk-endpoint` exists and is non-empty
-2. Verify `autonoma/.sdk-integration.json` exists and is non-empty
-3. Read and export `AUTONOMA_SDK_ENDPOINT` from that file
-4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env`
-5. Confirm the endpoint is reachable with a signed `discover` request
-6. Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes
-7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available
-
-Load the endpoint and secrets:
-
-```bash
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \
-  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \
-  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true
-
-rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true
-
-BODY='{"action":"discover"}'
-SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
-HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
-  -H "Content-Type: application/json" \
-  -H "x-signature: $SIG" \
-  -d "$BODY")
-if [ "$HTTP_STATUS" != "200" ]; then
-  report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true
-fi
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \
-  || report_error_and_exit "Step 1 discover response did not match the required schema." true
-```
+> Read the knowledge base. Audit how each database model is created. For every model, find the
+> dedicated creation function in a service/repository/helper. Classify as `independently_created: true`
+> (factory) or `false` (raw SQL fallback). Record side_effects (informational). Output
+> `autonoma/entity-audit.md` with frontmatter listing each model.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`.
 
-Report step complete:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true
-```
-
-7. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here."
-   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 2.
-
-## Step 2: Generate Knowledge Base
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start."
-post_setup_log "Analyzing codebase structure and identifying features..."
-```
-
-Spawn the `kb-generator` subagent with the following task:
-
-> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md`
-> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with
-> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count.
-> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
-> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty
-2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically
-3. Read the file and present the frontmatter to the user — specifically the core_flows table
-
-Report step complete and upload skills:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
-post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..."
-post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion."
-[ -n "$GENERATION_ID" ] && python3 -c "
-import os, json
-root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
-skills = []
-d = os.path.join(root, 'autonoma/skills')
-if os.path.isdir(d):
-    for f in os.listdir(d):
-        if f.endswith('.md'):
-            with open(os.path.join(d, f)) as fh:
-                skills.append({'name': f, 'content': fh.read()})
-print(json.dumps({'skills': skills}))
-" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d @- || true
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
-   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 3.
+After completion: present the audit, call `AskUserQuestion`, then `Write` `autonoma/.step-2-ack`.
 
 ## Step 3: Generate Scenarios
 
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start."
-post_setup_log "Mapping data model and designing test data environments..."
-```
-
-Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
-This step assumes Step 1 already produced:
-- `AUTONOMA_SDK_ENDPOINT`
-- `AUTONOMA_SHARED_SECRET`
-
-Fetch and validate the artifact:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-mkdir -p "$AUTONOMA_ROOT/autonoma"
-rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1."
-BODY='{"action":"discover"}'
-SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
-RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
-  -H "Content-Type: application/json" \
-  -H "x-signature: $SIG" \
-  -d "$BODY")
-HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
-if [ "$HTTP_STATUS" != "200" ]; then
-  report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY"
-fi
-printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \
-  || report_error_and_exit "Step 3 discover artifact did not pass validation."
-```
-
-Spawn the `scenario-generator` subagent with the following task:
-
-> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
-> artifact from `autonoma/discover.json`.
-> Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
-> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
-> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
-> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
-> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
-> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
-2. Validate `autonoma/discover.json` using the plugin's validator
-3. The PostToolUse hook will have validated the frontmatter format automatically
-4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
-
-Report step complete:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."
-post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion."
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 4.
-
-## Step 4: Generate E2E Test Cases
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start."
-post_setup_log "Generating E2E test cases from knowledge base and scenarios..."
-```
-
-Spawn the `test-case-generator` subagent with the following task:
-
-> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
-> and scenarios from `autonoma/scenarios.md`.
-> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`.
-> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests,
-> total_folders, folder breakdown, and coverage_correlation.
-> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
-> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
-> scenario data when it is needed to test a real user-facing app behavior.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty
-2. Verify at least one non-`INDEX.md` test file exists
-3. Verify actual test count matches `INDEX.md`
-4. Verify folder breakdown matches `INDEX.md`
-5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
-6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
-
-Enforce the file-count postconditions:
-
-```bash
-INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md"
-[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md."
-TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
-[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files."
-python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md."
-import sys
-from pathlib import Path
-import yaml
-
-index_path = Path(sys.argv[1])
-actual_count = int(sys.argv[2])
-qa_dir = Path(sys.argv[3])
-
-content = index_path.read_text()
-parts = content.split('---', 2)
-if len(parts) < 3:
-    raise SystemExit('INDEX.md is missing YAML frontmatter')
-frontmatter = yaml.safe_load(parts[1])
-
-if frontmatter.get('total_tests') != actual_count:
-    raise SystemExit(
-        f'total_tests ({frontmatter.get("total_tests")}) does not match actual test files ({actual_count})'
-    )
-
-actual_folders = {}
-for path in qa_dir.rglob('*.md'):
-    if path.name == 'INDEX.md':
-        continue
-    folder = path.parent.relative_to(qa_dir).as_posix()
-    actual_folders[folder] = actual_folders.get(folder, 0) + 1
-
-declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])}
-if actual_folders != declared_folders:
-    raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}')
-print('OK')
-PY
-```
-
-Report step complete and upload test cases:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
-post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..."
-post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion."
-[ -n "$GENERATION_ID" ] && python3 -c "
-import os, json
-proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
-qa_dir = os.path.join(proj_root, 'autonoma/qa-tests')
-test_cases = []
-for root, dirs, files in os.walk(qa_dir):
-    for f in files:
-        if f.endswith('.md') and f != 'INDEX.md':
-            path = os.path.join(root, f)
-            folder = os.path.relpath(root, qa_dir)
-            with open(path) as fh:
-                content = fh.read()
-            entry = {'name': f, 'content': content}
-            if folder != '.':
-                entry['folder'] = folder
-            test_cases.append(entry)
-print(json.dumps({'testCases': test_cases}))
-" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d @- || true
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app."
-   - options: ["Yes, proceed to Step 5", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 5.
-
-## Step 5: Scenario Validation
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start."
-post_setup_log "Validating planned scenarios against the live SDK endpoint..."
-```
-
-Spawn the `scenario-validator` subagent with the following task:
-
-> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
-> Validate the planned scenarios against the existing live SDK endpoint without editing backend code.
-> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`,
-> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact
-> `autonoma/.scenario-validation.json`, and run:
-> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
-> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs.
-
-**After the subagent completes:**
-1. Rehydrate SDK env from Step 1 artifacts
-2. Verify `autonoma/.scenario-validation.json` exists and is non-empty
-3. Validate `autonoma/.scenario-validation.json`
-4. Require `status == "ok"` and `preflightPassed == true`
-5. Verify `autonoma/scenario-recipes.json` exists and is non-empty
-6. Run the preflight helper if the subagent did not already do so
-7. If preflight fails, stop and report the failure without attempting code changes
-8. Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues
-
-Run and enforce preflight:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1."
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \
-  || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact."
-python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state."
-import json
-import sys
-
-payload = json.load(open(sys.argv[1]))
-if payload.get("status") != "ok":
-    raise SystemExit(f'status must be "ok", got {payload.get("status")!r}')
-if payload.get("preflightPassed") is not True:
-    raise SystemExit('preflightPassed must be true before Step 5 can upload recipes')
-print('OK')
-PY
-[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \
-  || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact."
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \
-  || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5."
-```
-
-Report step complete and upload scenario recipes:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_log "Uploading validated scenario recipes to setup..."
-if [ -n "$GENERATION_ID" ]; then
-  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
-  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete."
-  fi
-  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-    -H "Content-Type: application/json" \
-    -d @"$RECIPE_PATH")
-  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
-  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
-  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
-    report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete."
-  fi
-
-  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
-  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
-  if [ "$VERIFY_STATUS" != "200" ]; then
-    report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)."
-  fi
-fi
-post_setup_log "Scenario validation completed."
-post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion."
-cleanup_dev_server
-```
+Spawn `scenario-generator`:
+
+> Read the knowledge base and `autonoma/entity-audit.md`. Generate test data scenarios. Write
+> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types,
+> variable_fields, planning_sections). Mark values as variable only when they must vary across
+> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural
+> per-run isolation). Design entity tables so they serialise as nested trees rooted at the
+> scope entity.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`.
+
+After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`.
+
+## Step 4: Implement Environment Factory
+
+Spawn `env-factory-generator`:
+
+> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure
+> the handler. Register a factory for every model with `independently_created: true` (call the audit's
+> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback
+> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity
+> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. Do NOT run `up`/`down`
+> — that is step 5.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"`
+> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`.
+> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names.
+
+After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary,
+`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`.
+
+## Step 5: Validate Scenario Lifecycle
+
+Spawn `scenario-validator`:
+
+> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4.
+> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5
+> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails
+> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. On
+> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at
+> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe
+> per scenario), run `preflight_scenario_recipes.py` against it, and write
+> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write`
+> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and
+> report — do NOT write the sentinel.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`.
+> Verify: every audited model appears in `discover.schema.models`, every `independently_created`
+> model has a registered factory, `auth` is non-empty, DB state is correct before and after
+> `down`, and preflight exits 0.
+
+After completion:
+1. If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON
+   AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`:
+   enforce and upload the recipes to the dashboard, then ack.
+
+   ```bash
+   AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}"
+   VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json"
+   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+
+   # Enforce terminal artifact contract. The explicit `|| exit` is required:
+   # this snippet does not run under `set -e`, so without it a failed check
+   # would fall through to the upload below.
+   python3 - "$VALIDATION_ARTIFACT" <<'PY' || { echo "Scenario validation artifact failed the terminal contract" >&2; exit 1; }
+   import json, sys
+   payload = json.load(open(sys.argv[1]))
+   if payload.get("status") != "ok":
+       raise SystemExit("status must be ok before Step 5 can upload recipes")
+   if payload.get("preflightPassed") is not True:
+       raise SystemExit("preflightPassed must be true before Step 5 can upload recipes")
+   PY
+
+   [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty"; exit 1; }
+   # Pass the path as an argument instead of splicing it into Python source,
+   # so paths containing quotes cannot break (or alter) the check.
+   python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$RECIPE_PATH" \
+     || { echo "scenario-recipes.json is not valid JSON"; exit 1; }
+
+   # Re-run preflight at the orchestrator level for belt-and-suspenders safety.
+   python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \
+     || { echo "Preflight failed at orchestrator gate"; exit 1; }
+
+   # Upload to dashboard. Uses AUTONOMA_API_KEY, the same credential the
+   # kickoff and heartbeat hooks authenticate with.
+   GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
+   [ -n "$GENERATION_ID" ] || { echo "autonoma/.generation-id missing or empty; cannot upload recipes" >&2; exit 1; }
+   UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \
+     "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+     -d @"$RECIPE_PATH")
+   UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+   UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+   echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+   if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+     echo "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." >&2
+     exit 1
+   fi
+   ```
+
+   Then present validation summary (scenarios passed, any edits made to `scenarios.md`,
+   recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`.
+
+2. If any of those artifacts are missing/invalid: the agent failed — surface the failure
+   report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook
+   will also block test file writes.
+
+## Step 6: Generate E2E Test Cases
+
+Spawn `test-case-generator`:
+
+> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (the latter has
+> been reconciled with reality in step 5 — use it as the source of truth). Parse the
+> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any
+> variable value (typed, asserted, or navigated to), never the hardcoded literal.
+> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests
+> that "audit" seeded counts or fixture existence.
+> Generate test cases in `autonoma/qa-tests/`. Write `autonoma/qa-tests/INDEX.md` with
+> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test
+> file needs frontmatter (title, description, criticality, scenario, flow).
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`.
+
+After completion:
+1. Verify `autonoma/qa-tests/INDEX.md` exists
+2. Present INDEX summary
+3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed`
+   for the final step, marking the setup complete.
 
 ## Completion
 
-After all steps complete, summarize:
-- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available
-- **Step 2**: knowledge base location and core flow count
-- **Step 3**: scenario count and entity types covered
-- **Step 4**: total test count, folder breakdown, coverage correlation
-- **Step 5**: scenario validation results, smoke-test status, and recipe upload status
-
-If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID.
-For terminal failures after later steps begin, clean up the dev server before returning control to the user.
+Summarize each step:
+- **Step 1**: KB location, core flows
+- **Step 2**: entity audit — factories vs raw SQL
+- **Step 3**: scenarios generated
+- **Step 4**: endpoint implemented (handler path, packages, factories registered)
+- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any)
+- **Step 6**: test count, folder breakdown
diff --git a/hooks/hooks.json b/hooks/hooks.json
index d694b5d..310a20c 100644
--- a/hooks/hooks.json
+++ b/hooks/hooks.json
@@ -1,8 +1,28 @@
 {
   "hooks": {
+    "UserPromptSubmit": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pipeline-kickoff.sh"
+          }
+        ]
+      }
+    ],
+    "PreToolUse": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/pretool-heartbeat.sh"
+          }
+        ]
+      }
+    ],
     "PostToolUse": [
       {
-        "matcher": "Write",
+        "matcher": "Write|Edit",
         "hooks": [
           {
             "type": "command",
diff --git a/hooks/pipeline-kickoff.sh b/hooks/pipeline-kickoff.sh
new file mode 100755
index 0000000..29425b7
--- /dev/null
+++ b/hooks/pipeline-kickoff.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+# UserPromptSubmit hook. Fires on every user prompt, early-exits unless:
+#   1. The prompt invokes the generate-tests skill/command, AND
+#   2. The pipeline has not already been kicked off (no autonoma/.generation-id).
+#
+# When both conditions hold, this script owns pipeline startup so the agent
+# never has to remember to do it:
+#   - verifies required env vars (aborts kickoff with an error on stderr if AUTONOMA_DOCS_URL is unset, but still exits 0)
+#   - creates autonoma/ output dirs
+#   - writes autonoma/.docs-url
+#   - POSTs /v1/setup/setups to create the generation record
+#   - writes autonoma/.generation-id
+#   - emits step.started for step 0
+#
+# Exit 0 always (best-effort reporting must never block test generation).
+
+set -u
+
+INPUT=$(cat)
+
+PROMPT=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('prompt',''))" 2>/dev/null || echo '')
+
+# Match either the slash command or a direct mention of the skill name
+case "$PROMPT" in
+  */generate-tests*|*generate-tests*) ;;
+  *) exit 0 ;;
+esac
+
+# Idempotency: if we've already kicked off this project's pipeline, nothing to do.
+if [ -s autonoma/.generation-id ]; then
+  exit 0
+fi
+
+# Hard-require AUTONOMA_DOCS_URL — the plugin refuses to guess a docs URL.
+if [ -z "${AUTONOMA_DOCS_URL:-}" ]; then
+  echo "[autonoma pipeline-kickoff] ERROR: AUTONOMA_DOCS_URL is not set." >&2
+  echo "[autonoma pipeline-kickoff] Re-launch Claude using the onboarding command from the Autonoma dashboard (it exports AUTONOMA_DOCS_URL), or export it manually before running /generate-tests." >&2
+  exit 0
+fi
+
+mkdir -p autonoma/skills autonoma/qa-tests
+echo "$AUTONOMA_DOCS_URL" > autonoma/.docs-url
+
+# Nothing below this line should ever fail hard — we must not block the agent.
+if [ -z "${AUTONOMA_API_URL:-}" ] || [ -z "${AUTONOMA_API_KEY:-}" ] || [ -z "${AUTONOMA_PROJECT_ID:-}" ]; then
+  echo "[autonoma pipeline-kickoff] WARN: AUTONOMA_API_URL/AUTONOMA_API_KEY/AUTONOMA_PROJECT_ID not all set. Skipping dashboard reporting." >&2
+  exit 0
+fi
+
+# Derive a human-readable app name from the project dir (best-effort).
+APP_NAME=$(basename "$(pwd)")
+
+# Serialise the request body with json.dumps rather than string interpolation,
+# so a directory name or project id containing quotes or backslashes cannot
+# produce malformed JSON.
+SETUP_BODY=$(python3 -c 'import json, sys; print(json.dumps({"applicationId": sys.argv[1], "repoName": sys.argv[2]}))' \
+  "$AUTONOMA_PROJECT_ID" "$APP_NAME" 2>/dev/null || echo '')
+
+# Best-effort: any failure leaves RESPONSE as '{}', which yields an empty
+# GENERATION_ID and triggers the warning path below.
+RESPONSE='{}'
+if [ -n "$SETUP_BODY" ]; then
+  RESPONSE=$(curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d "$SETUP_BODY" 2>/dev/null || echo '{}')
+fi
+
+GENERATION_ID=$(echo "$RESPONSE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
+
+if [ -z "$GENERATION_ID" ]; then
+  echo "[autonoma pipeline-kickoff] WARN: setup creation returned no id. Dashboard will not reflect this run." >&2
+  exit 0
+fi
+
+echo "$GENERATION_ID" > autonoma/.generation-id
+echo "[autonoma pipeline-kickoff] Pipeline kickoff complete. generation_id=${GENERATION_ID}" >&2
+
+curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d '{"type":"step.started","data":{"step":0,"name":"Knowledge Base"}}' >/dev/null 2>&1 || true
+
+touch autonoma/.step-0-started
+
+# ---------------------------------------------------------------------------
+# Launch the transcript streamer as a detached background daemon. It tails
+# the session JSONL and forwards assistant text/thinking/tool-use/tool-result
+# events to /v1/setup/setups/{id}/events so the dashboard can render a live
+# activity log. Best-effort, never blocks.
+# ---------------------------------------------------------------------------
+TRANSCRIPT_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '')
+
+if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then
+  STREAMER_PID_FILE="autonoma/.streamer.pid"
+  STREAMER_LOG="autonoma/.streamer.log"
+  STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py"
+
+  # If a prior streamer is still alive (e.g. from a previous session in this
+  # project dir), replace it — the transcript path has changed.
+  if [ -s "$STREAMER_PID_FILE" ]; then
+    existing_pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null || echo '')
+    if [ -n "$existing_pid" ] && kill -0 "$existing_pid" 2>/dev/null; then
+      kill "$existing_pid" 2>/dev/null || true
+    fi
+  fi
+
+  if [ -f "$STREAMER_SCRIPT" ]; then
+    nohup python3 "$STREAMER_SCRIPT" \
+      "$TRANSCRIPT_PATH" \
+      "$GENERATION_ID" \
+      "$AUTONOMA_API_URL" \
+      "$AUTONOMA_API_KEY" \
+      >> "$STREAMER_LOG" 2>&1 </dev/null &
+    STREAMER_PID=$!
+    echo "$STREAMER_PID" > "$STREAMER_PID_FILE"
+    disown "$STREAMER_PID" 2>/dev/null || true
+    echo "[autonoma pipeline-kickoff] Transcript streamer started. pid=${STREAMER_PID} transcript=${TRANSCRIPT_PATH}" >&2
+  fi
+fi
+
+exit 0
diff --git a/hooks/pretool-heartbeat.sh b/hooks/pretool-heartbeat.sh
new file mode 100755
index 0000000..7dd4bf2
--- /dev/null
+++ b/hooks/pretool-heartbeat.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Emits a lightweight "activity" event for every tool call so the dashboard
+# can show Claude is still alive. Best-effort — failures never block the
+# pipeline. Only fires when a generation is active (autonoma/.generation-id
+# exists) and AUTONOMA_API_URL / AUTONOMA_API_KEY are both set.
+
+set -u
+
+INPUT=$(cat)
+
+# Guard: only fire during an active generation.
+GENERATION_ID=$(cat autonoma/.generation-id 2>/dev/null || echo '')
+[ -z "$GENERATION_ID" ] && exit 0
+[ -z "${AUTONOMA_API_URL:-}" ] && exit 0
+[ -z "${AUTONOMA_API_KEY:-}" ] && exit 0
+
+# ---------------------------------------------------------------------------
+# Streamer liveness check + auto-revive. If the transcript streamer daemon
+# has died (crash, OS restart, etc.) re-launch it so the dashboard keeps
+# receiving events. kill -0 is nearly free when the process is alive.
+# Skipped when the plugin's transcript-streamer.py is missing (e.g. older plugin cache).
+# ---------------------------------------------------------------------------
+STREAMER_PID_FILE="autonoma/.streamer.pid"
+STREAMER_LOG="autonoma/.streamer.log"
+STREAMER_SCRIPT="${CLAUDE_PLUGIN_ROOT:-$(dirname "$0")/..}/hooks/transcript-streamer.py"
+
+# Returns 0 when the PID file is non-empty and the recorded PID still accepts
+# signal 0; returns non-zero otherwise.
+# NOTE(review): a recycled PID now owned by an unrelated process would also
+# pass this check — confirm that is acceptable.
+streamer_alive() {
+  [ -s "$STREAMER_PID_FILE" ] || return 1
+  local pid
+  pid=$(cat "$STREAMER_PID_FILE" 2>/dev/null)
+  # kill -0 sends no signal; it only reports whether the process can be signalled.
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+if ! streamer_alive && [ -f "$STREAMER_SCRIPT" ]; then
+  TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('transcript_path',''))" 2>/dev/null || echo '')
+  if [ -n "$TRANSCRIPT_PATH" ] && [ -f "$TRANSCRIPT_PATH" ]; then
+    nohup python3 "$STREAMER_SCRIPT" \
+      "$TRANSCRIPT_PATH" \
+      "$GENERATION_ID" \
+      "$AUTONOMA_API_URL" \
+      "$AUTONOMA_API_KEY" \
+      >> "$STREAMER_LOG" 2>&1 </dev/null &
+    NEW_PID=$!
+    echo "$NEW_PID" > "$STREAMER_PID_FILE"
+    disown "$NEW_PID" 2>/dev/null || true
+    echo "[$(date +%H:%M:%S)] streamer revived by pretool-heartbeat pid=$NEW_PID transcript=$TRANSCRIPT_PATH" >> "$STREAMER_LOG"
+  fi
+fi
+
+# Build the payload: tool name + a short preview of the most informative arg.
+# Heavy args (full file contents from Write/Edit) are never forwarded.
+PAYLOAD=$(printf '%s' "$INPUT" | python3 -c "
+import json, sys
+try:
+    data = json.load(sys.stdin)
+except Exception:
+    sys.exit(0)
+tool = data.get('tool_name') or ''
+if not tool:
+    sys.exit(0)
+inp = data.get('tool_input') or {}
+# Pick the first informative string field; never forward large blobs.
+preview = ''
+for key in ('command', 'description', 'file_path', 'pattern', 'path', 'query', 'prompt', 'url'):
+    v = inp.get(key)
+    if isinstance(v, str) and v.strip():
+        preview = v.replace('\n', ' ').strip()[:200]
+        break
+print(json.dumps({'type': 'activity', 'data': {'tool': tool, 'preview': preview}}))
+" 2>/dev/null)
+
+[ -z "$PAYLOAD" ] && exit 0
+
+# Short timeout — the hook runs before every tool call, never block the session.
+curl --max-time 2 -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
+  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+  -H "Content-Type: application/json" \
+  -d "$PAYLOAD" >/dev/null 2>&1 || true
+
+exit 0
diff --git a/hooks/transcript-streamer.py b/hooks/transcript-streamer.py
new file mode 100755
index 0000000..be496ca
--- /dev/null
+++ b/hooks/transcript-streamer.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""Streams Claude Code session transcript events to the Autonoma dashboard.
+
+Spawned as a detached background process by pipeline-kickoff.sh when a
+/generate-tests run starts. Tails the session JSONL as Claude appends to it,
+extracts assistant text + thinking + tool calls + tool results, and POSTs
+each as a `transcript` event to /v1/setup/setups/{id}/events so the dashboard
+can render a live activity log.
+
+Self-terminates after IDLE_SECONDS of no new transcript data. Safe to kill
+at any time — the daemon is stateless and holds no locks.
+
+Usage:
+  python3 transcript-streamer.py <transcript_path> <generation_id> <api_url> <api_key>
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+POLL_INTERVAL = 0.75
+IDLE_SECONDS = 1800  # 30 min with no new lines → daemon exits
+MAX_TEXT_CHARS = 4000
+MAX_PREVIEW_CHARS = 500
+HTTP_TIMEOUT = 2.0
+
+
+def main() -> None:
+    if len(sys.argv) != 5:
+        sys.exit(2)
+    transcript_path, generation_id, api_url, api_key = sys.argv[1:5]
+    if not all([transcript_path, generation_id, api_url, api_key]):
+        sys.exit(0)
+
+    path = Path(transcript_path)
+    # Start at end of file. Anything written before this daemon launched was
+    # already visible in the terminal before the dashboard existed — don't
+    # replay it.
+    last_size = path.stat().st_size if path.exists() else 0
+    idle = 0.0
+    log(f"streamer up transcript={transcript_path} generation_id={generation_id} api_url={api_url} start_offset={last_size}")
+
+    while idle < IDLE_SECONDS:
+        if not path.exists():
+            time.sleep(POLL_INTERVAL)
+            idle += POLL_INTERVAL
+            continue
+
+        size = path.stat().st_size
+        if size < last_size:
+            # File was rotated/truncated — reset.
+            last_size = 0
+        if size == last_size:
+            time.sleep(POLL_INTERVAL)
+            idle += POLL_INTERVAL
+            continue
+
+        idle = 0.0
+        with path.open("r", encoding="utf-8", errors="replace") as fh:
+            fh.seek(last_size)
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                payload = extract_event(entry)
+                if payload is not None:
+                    forward(payload, generation_id, api_url, api_key)
+            last_size = fh.tell()
+
+
+def extract_event(entry: dict) -> dict | None:
+    """Turn a transcript line into a dashboard event, or None to skip."""
+    etype = entry.get("type")
+    is_sidechain = bool(entry.get("isSidechain", False))
+    uuid = entry.get("uuid")
+
+    if etype == "assistant":
+        msg = entry.get("message") or {}
+        content = msg.get("content") or []
+        texts: list[str] = []
+        tool_uses: list[dict] = []
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            btype = block.get("type")
+            if btype == "text":
+                t = (block.get("text") or "").strip()
+                if t:
+                    texts.append(t)
+            elif btype == "thinking":
+                t = (block.get("thinking") or "").strip()
+                if t:
+                    texts.append(f"[thinking] {t}")
+            elif btype == "tool_use":
+                tool_uses.append({
+                    "name": block.get("name") or "unknown",
+                    "input_preview": _preview(block.get("input") or {}),
+                })
+        if not texts and not tool_uses:
+            return None
+        data: dict = {"role": "assistant", "is_sidechain": is_sidechain}
+        if uuid:
+            data["uuid"] = uuid
+        if texts:
+            data["text"] = "\n".join(texts)[:MAX_TEXT_CHARS]
+        if tool_uses:
+            data["tool_uses"] = tool_uses
+        return {"type": "transcript", "data": data}
+
+    if etype == "user":
+        msg = entry.get("message") or {}
+        content = msg.get("content")
+        # Tool results arrive as user messages whose content is a list of
+        # tool_result blocks. Raw text user messages (the original prompt)
+        # are skipped — they're already visible to the dashboard.
+        if not isinstance(content, list):
+            return None
+        results: list[dict] = []
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            if block.get("type") != "tool_result":
+                continue
+            body = _flatten_tool_result(block.get("content"))
+            entry_out: dict = {"is_error": bool(block.get("is_error"))}
+            if body:
+                entry_out["preview"] = body[:MAX_PREVIEW_CHARS]
+            results.append(entry_out)
+        if not results:
+            return None
+        data = {"role": "tool_result", "is_sidechain": is_sidechain, "results": results}
+        if uuid:
+            data["uuid"] = uuid
+        return {"type": "transcript", "data": data}
+
+    return None
+
+
+def _flatten_tool_result(raw) -> str:
+    if isinstance(raw, str):
+        return raw
+    if isinstance(raw, list):
+        parts: list[str] = []
+        for c in raw:
+            if isinstance(c, dict) and c.get("type") == "text":
+                parts.append(c.get("text", ""))
+            elif isinstance(c, str):
+                parts.append(c)
+        return "\n".join(parts)
+    return ""
+
+
+def _preview(obj) -> str:
+    try:
+        s = json.dumps(obj, default=str, ensure_ascii=False)
+    except Exception:
+        s = str(obj)
+    return s[:MAX_PREVIEW_CHARS]
+
+
+def forward(payload: dict, generation_id: str, api_url: str, api_key: str) -> None:
+    """POST one event payload to the setup's events endpoint.
+
+    The request carries a Bearer ``api_key`` and a JSON body, and is bounded
+    by HTTP_TIMEOUT. HTTP and network errors raised by the request are logged
+    via ``log`` rather than raised, so a failed delivery drops that event.
+    """
+    url = f"{api_url.rstrip('/')}/v1/setup/setups/{generation_id}/events"
+    data = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(
+        url,
+        data=data,
+        method="POST",
+        headers={
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:
+            resp.read()
+            log(f"POST {resp.status} {payload.get('type')} {_summarize(payload)}")
+    # HTTPError is a URLError subclass, so it must be handled before the
+    # broader network-error clause below.
+    except urllib.error.HTTPError as e:
+        body = ""
+        try:
+            # Keep at most 300 chars of the error body for the log line.
+            body = e.read().decode("utf-8", errors="replace")[:300]
+        except Exception:
+            pass
+        log(f"POST {e.code} {payload.get('type')} body={body}")
+    except (urllib.error.URLError, TimeoutError, ConnectionError) as e:
+        log(f"POST network-error {payload.get('type')} err={e!r}")
+    except Exception as e:
+        log(f"POST unknown-error {payload.get('type')} err={e!r}")
+
+
+def _summarize(payload: dict) -> str:
+    data = payload.get("data") or {}
+    role = data.get("role")
+    if role == "assistant":
+        snippet = (data.get("text") or "").replace("\n", " ")[:80]
+        tools = ",".join(t.get("name", "?") for t in data.get("tool_uses") or [])
+        return f"role=assistant text={snippet!r} tools=[{tools}]"
+    if role == "tool_result":
+        return f"role=tool_result n_results={len(data.get('results') or [])}"
+    return ""
+
+
+def log(msg: str) -> None:
+    # Emit to stderr which is redirected to autonoma/.streamer.log by the kickoff hook.
+    try:
+        print(f"[{time.strftime('%H:%M:%S')}] {msg}", file=sys.stderr, flush=True)
+    except Exception:
+        pass
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        # Interrupted (e.g. SIGINT): exit quietly without a traceback.
+        pass
+    except Exception:
+        # Daemon must never propagate — swallow and exit clean so nothing
+        # surfaces in the user's terminal.
+        pass
diff --git a/hooks/validate-pipeline-output.sh b/hooks/validate-pipeline-output.sh
index ba71260..071d7d7 100755
--- a/hooks/validate-pipeline-output.sh
+++ b/hooks/validate-pipeline-output.sh
@@ -1,70 +1,264 @@
 #!/bin/bash
-# Validates pipeline output files after Write tool use.
+# Validates pipeline output files after Write tool use and emits lifecycle
+# events + artifact uploads to the Autonoma dashboard on successful artifact
+# production. All backend reporting lives here so the agent can never forget.
+#
 # Exit 0 = allow (file is valid or not a pipeline file)
 # Exit 2 = block and send error message to Claude
 
+set -u
+
 INPUT=$(cat)
 
-# Extract the file path from the tool input
 FILE_PATH=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('tool_input',{}).get('file_path',''))" 2>/dev/null)
 
 if [ -z "$FILE_PATH" ]; then
   exit 0
 fi
 
-# Resolve the validators directory relative to this script
+# ----------------------------------------------------------------------------
+# Lifecycle emission helpers
+# ----------------------------------------------------------------------------
+_reporting_ready() {  # succeeds only when a generation id, AUTONOMA_API_URL and AUTONOMA_API_KEY are all non-empty
+  local generation_id
+  generation_id=$(cat autonoma/.generation-id 2>/dev/null || echo '')
+  [ -n "$generation_id" ] && [ -n "${AUTONOMA_API_URL:-}" ] && [ -n "${AUTONOMA_API_KEY:-}" ]
+}
+
+# emit_step_event <step> <started|completed> [<name>] — idempotent via marker.
+emit_step_event() {
+  local step="$1"
+  local action="$2"
+  local name="${3:-}"
+  local marker="autonoma/.step-${step}-${action}"
+
+  [ -f "$marker" ] && return 0
+  mkdir -p autonoma 2>/dev/null || true
+  touch "$marker"
+  # The marker is written before the config check, so an event skipped for missing config is not sent later.
+  _reporting_ready || return 0
+  local generation_id
+  generation_id=$(cat autonoma/.generation-id)
+  # step and name are interpolated unescaped: callers must pass a number and JSON-safe text.
+  local payload
+  if [ -n "$name" ]; then
+    payload=$(printf '{"type":"step.%s","data":{"step":%s,"name":"%s"}}' "$action" "$step" "$name")
+  else
+    payload=$(printf '{"type":"step.%s","data":{"step":%s}}' "$action" "$step")
+  fi
+  # Best-effort POST: curl output and failures are discarded and never fail the hook.
+  curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/events" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d "$payload" >/dev/null 2>&1 || true
+}
+
+# upload_skills — bundle autonoma/skills/*.md and POST to /artifacts. Runs at most
+# once: the marker is written after the first attempt, whether or not it succeeded.
+upload_skills() {
+  local marker="autonoma/.skills-uploaded"
+  [ -f "$marker" ] && return 0
+  _reporting_ready || return 0
+  [ -d autonoma/skills ] || return 0
+
+  local generation_id
+  generation_id=$(cat autonoma/.generation-id)
+  # Decode as UTF-8 (not the hook's locale) so a non-ASCII skill cannot abort the bundle.
+  python3 -c "
+import os, json
+skills = []
+d = 'autonoma/skills'
+for f in (sorted(os.listdir(d)) if os.path.isdir(d) else []):
+    if f.endswith('.md'):
+        with open(os.path.join(d, f), encoding='utf-8', errors='replace') as fh:
+            skills.append({'name': f, 'content': fh.read()})
+print(json.dumps({'skills': skills}))
+" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/artifacts" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @- >/dev/null 2>&1 || true
+
+  touch "$marker"
+}
+
+# upload_test_cases — bundle autonoma/qa-tests/**/*.md (except INDEX) and POST. Runs at
+# most once: the marker is written after the first attempt, whether or not it succeeded.
+upload_test_cases() {
+  local marker="autonoma/.test-cases-uploaded"
+  [ -f "$marker" ] && return 0
+  _reporting_ready || return 0
+  [ -d autonoma/qa-tests ] || return 0
+
+  local generation_id
+  generation_id=$(cat autonoma/.generation-id)
+
+  # Decode as UTF-8 (not the hook's locale) so a non-ASCII test file cannot abort the bundle.
+  python3 -c "
+import os, json
+test_cases = []
+for root, dirs, files in os.walk('autonoma/qa-tests'):
+    for f in sorted(files):
+        if f.endswith('.md') and f != 'INDEX.md':
+            folder = os.path.relpath(root, 'autonoma/qa-tests')
+            with open(os.path.join(root, f), encoding='utf-8', errors='replace') as fh:
+                entry = {'name': f, 'content': fh.read()}
+            if folder != '.':
+                entry['folder'] = folder
+            test_cases.append(entry)
+print(json.dumps({'testCases': test_cases}))
+" | curl -sf -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${generation_id}/artifacts" \
+    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
+    -H "Content-Type: application/json" \
+    -d @- >/dev/null 2>&1 || true
+
+  touch "$marker"
+}
+
+# ----------------------------------------------------------------------------
+# Sentinel files: no validation, just event emission.
+#   - autonoma/.endpoint-implemented — env-factory agent writes this after the
+#     discover smoke test + factory-integrity check pass; signals step 3 complete.
+#   - autonoma/.endpoint-validated — scenario-validator writes this after the full
+#     up/down lifecycle passes for every scenario; signals step 4 complete AND
+#     unlocks the gate that allows qa-tests/*.md to be written.
+#   - autonoma/.step-<N>-ack — orchestrator writes this AFTER the user has
+#     confirmed via AskUserQuestion; this is the *only* path that emits
+#     step.started for step N. The UI can therefore show "waiting for
+#     confirmation" in the gap between step.completed (N-1) and step.started N.
+# ----------------------------------------------------------------------------
+STEP_NAMES=("Knowledge Base" "Entity Audit" "Scenarios" "Implement" "Validate" "E2E Tests")
+
+case "$FILE_PATH" in
+  */autonoma/.endpoint-implemented)
+    # Hook-level factory-integrity gate. The env-factory agent's self-policed
+    # check has proven insufficient — see the post-mortem in the plugin repo.
+    # This validator parses autonoma/entity-audit.md, opens the handler named
+    # in the sentinel body, and blocks the write when any factory for a
+    # independently_created: true model contains an inline ORM write.
+    SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+    # Gate 1 — cheap syntactic checks (grep, mount, audit-flip cap).
+    if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_endpoint_implemented.py" "$FILE_PATH" 2>&1); then
+      printf '%s\n' "$OUTPUT" >&2
+      exit 2
+    fi
+    # Gate 2 — creation_file immutability (catches the audit-rewrite attack
+    # without needing an LLM call). Cheap, fast, deterministic.
+    if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_creation_file_immutable.py" 2>&1); then
+      printf '%s\n' "$OUTPUT" >&2
+      exit 2
+    fi
+    # Gate 3 — semantic per-model fidelity via claude -p fan-out. Reads the
+    # rubric from the docs URL at runtime (updatable without plugin changes).
+    # Blocks on hard failures; transient errors + missing config are
+    # warning-only so a broken docs endpoint does not freeze the pipeline.
+    if ! OUTPUT=$(python3 "$SCRIPT_DIR/validators/validate_factory_fidelity.py" "$FILE_PATH" 2>&1); then
+      printf '%s\n' "$OUTPUT" >&2
+      exit 2
+    fi
+    # Gate 3 prints progress to stderr even on success; surface it so the
+    # user sees the validator actually ran.
+    printf '%s\n' "$OUTPUT" >&2
+    emit_step_event 3 completed "Implement"
+    exit 0
+    ;;
+  */autonoma/.endpoint-validated)
+    emit_step_event 4 completed "Validate"
+    exit 0
+    ;;
+  */autonoma/.pipeline-complete)
+    emit_step_event 5 completed "E2E Tests"
+    exit 0
+    ;;
+  */autonoma/.step-*-ack)
+    ack_num=$(basename "$FILE_PATH" | sed -E 's/^\.step-([0-9]+)-ack$/\1/')
+    if [[ "$ack_num" =~ ^[0-9]+$ ]] && [ "$ack_num" -ge 0 ] && [ "$ack_num" -lt ${#STEP_NAMES[@]} ]; then
+      emit_step_event "$ack_num" started "${STEP_NAMES[$ack_num]}"
+    fi
+    # Snapshot entity-audit.md the moment the user confirms the audit is
+    # accepted (step-2-ack = "Scenarios starting", which fires AFTER the user
+    # approves the Entity Audit). This snapshot is diffed against the current
+    # audit at .endpoint-implemented time to detect the env-factory agent
+    # gaming the factory-integrity check by mass-flipping independently_created
+    # true -> false. See the post-mortem in the plugin repo.
+    if [ "$ack_num" = "2" ] && [ -f "autonoma/entity-audit.md" ] && [ ! -f "autonoma/.entity-audit-step2.md" ]; then
+      cp autonoma/entity-audit.md autonoma/.entity-audit-step2.md 2>/dev/null || true
+    fi
+    exit 0
+    ;;
+esac
+
+# ----------------------------------------------------------------------------
+# Validation gate: test files (INDEX.md or any qa-tests/*.md) cannot be written
+# until the scenario-validator writes autonoma/.endpoint-validated. This keeps
+# "E2E Tests" (STEP_NAMES index 5) from generating tests against an unproven endpoint.
+# ----------------------------------------------------------------------------
+case "$FILE_PATH" in
+  */autonoma/qa-tests/INDEX.md|*/autonoma/qa-tests/*.md)
+    if [ ! -f "autonoma/.endpoint-validated" ]; then
+      echo "VALIDATION GATE: Cannot write $FILE_PATH — autonoma/.endpoint-validated is missing. Complete Step 5 (scenario-validator) first. The validator must run discover/up/down against every scenario and write the sentinel before test generation is allowed." >&2
+      exit 2
+    fi
+    ;;
+esac
+
+# ----------------------------------------------------------------------------
+# Validation routing
+# ----------------------------------------------------------------------------
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 VALIDATORS_DIR="$SCRIPT_DIR/validators"
 
-# Persist the plugin root so orchestrator/subagent bash snippets can find plugin-local scripts.
-# This hook is the earliest reliable place where we know the plugin directory.
-PLUGIN_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-echo "$PLUGIN_ROOT" > /tmp/autonoma-plugin-root
-
-# Ensure PyYAML is available (required for frontmatter parsing)
 python3 -c "import yaml" 2>/dev/null || pip3 install pyyaml -q 2>/dev/null
 
-# Only validate pipeline output files
+STEP_COMPLETED=""
+STEP_COMPLETED_NAME=""
+STEP_STARTED=""
+STEP_STARTED_NAME=""
+POST_UPLOAD=""
+
 case "$FILE_PATH" in
   */autonoma/AUTONOMA.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_kb.py"
     VALIDATOR_NAME="validate-kb"
-    ;;
-  */autonoma/discover.json)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_discover.py"
-    VALIDATOR_NAME="validate-discover"
-    ;;
-  */autonoma/.sdk-endpoint)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_endpoint.py"
-    VALIDATOR_NAME="validate-sdk-endpoint"
-    ;;
-  */autonoma/.sdk-integration.json)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_sdk_integration.py"
-    VALIDATOR_NAME="validate-sdk-integration"
+    STEP_COMPLETED=0
+    STEP_COMPLETED_NAME="Knowledge Base"
+    STEP_STARTED=1
+    STEP_STARTED_NAME="Entity Audit"
+    POST_UPLOAD="skills"
     ;;
   */autonoma/features.json)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_features.py"
     VALIDATOR_NAME="validate-features"
     ;;
+  */autonoma/entity-audit.md)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_entity_audit.py"
+    VALIDATOR_NAME="validate-entity-audit"
+    STEP_COMPLETED=1
+    STEP_COMPLETED_NAME="Entity Audit"
+    STEP_STARTED=2
+    STEP_STARTED_NAME="Scenarios"
+    ;;
   */autonoma/scenarios.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenarios.py"
     VALIDATOR_NAME="validate-scenarios"
-    ;;
-  */autonoma/.scenario-validation.json)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py"
-    VALIDATOR_NAME="validate-scenario-validation"
+    STEP_COMPLETED=2
+    STEP_COMPLETED_NAME="Scenarios"
+    STEP_STARTED=3
+    STEP_STARTED_NAME="Implement"
     ;;
   */autonoma/scenario-recipes.json)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_recipes.py"
     VALIDATOR_NAME="validate-scenario-recipes"
     ;;
+  */autonoma/.scenario-validation.json)
+    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_scenario_validation.py"
+    VALIDATOR_NAME="validate-scenario-validation"
+    ;;
   */autonoma/qa-tests/INDEX.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
     VALIDATOR_NAME="validate-test-index"
-    ;;
-  */autonoma/qa-tests/*/INDEX.md)
-    VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_index.py"
-    VALIDATOR_NAME="validate-adhoc-test-index"
+    STEP_COMPLETED=5
+    STEP_COMPLETED_NAME="E2E Tests"
+    POST_UPLOAD="test_cases"
     ;;
   */autonoma/qa-tests/*/[!I]*.md)
     VALIDATOR_SCRIPT="$VALIDATORS_DIR/validate_test_file.py"
@@ -75,25 +269,21 @@ case "$FILE_PATH" in
     ;;
 esac
 
-# Check file exists
 if [ ! -f "$FILE_PATH" ]; then
   echo "VALIDATION FAILED [$VALIDATOR_NAME]: File does not exist: $FILE_PATH" >&2
   exit 2
 fi
 
-# Check file is non-empty
 if [ ! -s "$FILE_PATH" ]; then
   echo "VALIDATION FAILED [$VALIDATOR_NAME]: File is empty: $FILE_PATH" >&2
   exit 2
 fi
 
-# Check validator script exists
 if [ ! -f "$VALIDATOR_SCRIPT" ]; then
   echo "VALIDATION FAILED [$VALIDATOR_NAME]: Validator script not found: $VALIDATOR_SCRIPT" >&2
   exit 2
 fi
 
-# Run the validator
 RESULT=$(python3 "$VALIDATOR_SCRIPT" "$FILE_PATH" 2>&1)
 EXIT_CODE=$?
 
@@ -102,26 +292,6 @@ if [ $EXIT_CODE -ne 0 ] || [ "$RESULT" != "OK" ]; then
   exit 2
 fi
 
-# scenario-recipes.json must also pass live endpoint preflight. This is the
-# only deterministic check that the generated create payload actually works
-# against the current SDK contract.
-if [ "$VALIDATOR_NAME" = "validate-scenario-recipes" ]; then
-  PREFLIGHT_SCRIPT="$SCRIPT_DIR/preflight_scenario_recipes.py"
-  if [ ! -f "$PREFLIGHT_SCRIPT" ]; then
-    echo "VALIDATION FAILED [scenario-recipes-preflight]: Script not found: $PREFLIGHT_SCRIPT" >&2
-    exit 2
-  fi
-
-  PREFLIGHT_RESULT=$(python3 "$PREFLIGHT_SCRIPT" "$FILE_PATH" 2>&1)
-  PREFLIGHT_EXIT=$?
-  if [ $PREFLIGHT_EXIT -ne 0 ]; then
-    echo "VALIDATION FAILED [scenario-recipes-preflight]: $PREFLIGHT_RESULT" >&2
-    exit 2
-  fi
-fi
-
-# For root INDEX.md only, also validate directory structure
-# (subfolder INDEX.md from adhoc runs uses validate-adhoc-test-index and skips this check)
 if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then
   DIR_SCRIPT="$VALIDATORS_DIR/validate_directory_structure.py"
   DIR_RESULT=$(python3 "$DIR_SCRIPT" "$FILE_PATH" 2>&1)
@@ -132,4 +302,17 @@ if [ "$VALIDATOR_NAME" = "validate-test-index" ]; then
   fi
 fi
 
+# Validation passed — emit lifecycle events and upload artifacts.
+# Note: step.started for the NEXT step is NOT emitted here. It fires only when
+# the orchestrator writes autonoma/.step-<N>-ack after the user confirms via
+# AskUserQuestion. That gap gives the UI its "waiting for confirmation" banner.
+if [ -n "$STEP_COMPLETED" ]; then
+  emit_step_event "$STEP_COMPLETED" completed "$STEP_COMPLETED_NAME"
+fi
+
+case "$POST_UPLOAD" in
+  skills) upload_skills ;;
+  test_cases) upload_test_cases ;;
+esac
+
 exit 0
diff --git a/hooks/validators/_audit_schema.py b/hooks/validators/_audit_schema.py
new file mode 100644
index 0000000..f66891e
--- /dev/null
+++ b/hooks/validators/_audit_schema.py
@@ -0,0 +1,67 @@
+"""Shared helpers for reading the entity audit with backwards compatibility.
+
+Two schemas exist on disk:
+
+- v1 (legacy): each model entry has `has_creation_code: bool` and, when true,
+  `creation_file` / `creation_function` / `side_effects`. Nothing about who
+  else mints the model.
+- v2 (current): each model entry has `independently_created: bool` plus a
+  `created_by: [{owner, via, why}]` list.
+
+The compat shim translates v1 into v2 on read so callers only reason about
+`independently_created`. We never rewrite the on-disk file here — that's the
+audit generator's job.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml  # type: ignore
+
+
+def load_audit(path: Path) -> dict[str, dict]:
+    """Return {model_name: normalized_entry}; {} when the file is missing or its frontmatter is malformed."""
+    if not path.exists():
+        return {}
+    text = path.read_text()
+    end = text.find("\n---", 3) if text.startswith("---") else -1
+    if end < 0:
+        return {}
+    try:
+        fm = yaml.safe_load(text[3:end])
+    except yaml.YAMLError:
+        return {}
+    # Empty frontmatter parses to None and a list/scalar has no .get(): treat both as malformed.
+    models = fm.get("models") if isinstance(fm, dict) else None
+    out: dict[str, dict] = {}
+    for entry in (models if isinstance(models, list) else []):
+        if not isinstance(entry, dict):
+            continue
+        name = entry.get("name") or entry.get("model")
+        if not name:
+            continue
+        out[str(name)] = _normalize(entry)
+    return out
+
+
+def _normalize(entry: dict[str, Any]) -> dict[str, Any]:
+    """Copy `entry` and guarantee the v2 keys `independently_created` and `created_by`.
+
+    - `independently_created`: kept as-is when present (v2); otherwise derived
+      from the truthiness of the v1 `has_creation_code` flag.
+    - `created_by`: kept when present and not None; otherwise set to [].
+    The input dict is never mutated.
+    """
+    normalized = dict(entry)
+    normalized.setdefault("independently_created", bool(entry.get("has_creation_code")))
+    if normalized.get("created_by") is None:
+        normalized["created_by"] = []
+    return normalized
+
+
+def is_independently_created(entry: dict[str, Any]) -> bool:
+    """Report whether the model is factory-worthy, i.e. has a standalone creation path.
+
+    Reads the v2 `independently_created` flag, else the v1 `has_creation_code` flag."""
+    return bool(entry.get("independently_created", entry.get("has_creation_code")))
diff --git a/hooks/validators/evals/README.md b/hooks/validators/evals/README.md
new file mode 100644
index 0000000..5c5c7f8
--- /dev/null
+++ b/hooks/validators/evals/README.md
@@ -0,0 +1,53 @@
+# Factory-fidelity evals
+
+Ad-hoc eval harness for the semantic validator in `../validate_factory_fidelity.py`.
+Each fixture simulates one model's Step 2 audit entry, current audit entry,
+factory block, helper (optional), and original creation snippet, then asserts
+the verdict the rubric should produce.
+
+## Run
+
+```bash
+# against a local Astro dev server
+AUTONOMA_DOCS_URL=http://localhost:4321 \
+    python3 hooks/validators/evals/run_evals.py
+
+# single fixture
+AUTONOMA_DOCS_URL=http://localhost:4321 \
+    python3 hooks/validators/evals/run_evals.py --only good_uses_service
+
+# dump the rendered prompt without calling claude (for debugging)
+AUTONOMA_DOCS_URL=http://localhost:4321 \
+    python3 hooks/validators/evals/run_evals.py --write-prompt
+```
+
+Requires the `claude` CLI on `PATH`. Model is configurable via
+`AUTONOMA_FIDELITY_MODEL` (defaults to `sonnet`).
+
+## Fixture schema
+
+```json
+{
+  "model": "<PascalCase model name>",
+  "expected_verdict": "pass" | "fail",
+  "expected_fail_criteria": [1, 2, 3, 4],
+  "step2_audit_entry": "<YAML list-item string for the snapshot>",
+  "current_audit_entry": "<YAML list-item string for the current audit>",
+  "handler_path": "<synthetic path>",
+  "factory_block": "<defineFactory registration snippet>",
+  "helper_section": "File: <path>\\nFunction: <name>\\n\\n```\\n<code>\\n```",
+  "original_creation_file": "<path>",
+  "original_creation_snippet": "<source of the Step 2 creation_function>"
+}
+```
+
+Keep fixtures generic — placeholder names (`UserService`, `src/users/...`) only,
+no references to real Autonoma-internal codebases. The rubric itself is generic;
+evals that leak specific names would mask rubric bias.
+
+## When to add a fixture
+
+- New failure mode observed in the wild → add a `bad_*.json` that captures it
+  with the smallest reproduction, and confirm the current rubric catches it.
+- Rubric edit → run the full suite against the new rubric. A fixture flipping
+  verdict is a signal that the criteria are ambiguous; tighten the wording.
diff --git a/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json
new file mode 100644
index 0000000..de57863
--- /dev/null
+++ b/hooks/validators/evals/fixtures/bad_audit_rewrite_only.json
@@ -0,0 +1,12 @@
+{
+  "model": "Session",
+  "expected_verdict": "fail",
+  "expected_fail_criteria": [3],
+  "step2_audit_entry": "- name: Session\n  has_creation_code: true\n  creation_file: src/auth/auth.ts\n  creation_function: buildAuth.createSession\n  side_effects:\n    - Signs session token\n    - Records session in audit log\n",
+  "current_audit_entry": "- name: Session\n  has_creation_code: true\n  creation_file: src/routes/autonoma/autonoma-factories.ts\n  creation_function: createSession\n  side_effects:\n    - Signs session token\n    - Records session in audit log\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "Session: defineFactory({\n    async create(data, ctx) {\n        return createSession(ctx.executor, data);\n    },\n}),",
+  "helper_section": "File: src/routes/autonoma/autonoma-factories.ts\nFunction: createSession\n\n```\n// Thin wrapper around buildAuth.createSession — preserves signing + audit.\nexport async function createSession(db, data) {\n    const auth = buildAuth(db);\n    return auth.createSession(data);\n}\n```",
+  "original_creation_file": "src/auth/auth.ts",
+  "original_creation_snippet": "export const buildAuth = (db) => betterAuth({\n    database: prismaAdapter(db),\n    createSession: async (data) => {\n        const token = signSessionToken(data);\n        const session = await db.session.create({ data: { ...data, token } });\n        await auditLog.record('session.created', { sessionId: session.id });\n        return session;\n    },\n});"
+}
diff --git a/hooks/validators/evals/fixtures/bad_missing_owner.json b/hooks/validators/evals/fixtures/bad_missing_owner.json
new file mode 100644
index 0000000..69e20aa
--- /dev/null
+++ b/hooks/validators/evals/fixtures/bad_missing_owner.json
@@ -0,0 +1,7 @@
+{
+  "kind": "audit_validator",
+  "note": "Dependent whose created_by owner doesn't exist in the audit. The audit VALIDATOR (not the fidelity validator) must reject. This fixture is asserted via subprocess against validate_entity_audit.py.",
+  "audit_frontmatter": "model_count: 2\nfactory_count: 1\nmodels:\n  - name: User\n    independently_created: true\n    creation_file: src/users/user.service.ts\n    creation_function: UserService.create\n    side_effects: []\n    created_by: []\n  - name: Branch\n    independently_created: false\n    created_by:\n      - owner: Application\n        via: ApplicationsService.createApplication\n        why: \"Minted inline — but Application is not in this audit.\"\n",
+  "expected_exit": 1,
+  "expected_stderr_substring": "owner='Application' does not match any model"
+}
diff --git a/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json
new file mode 100644
index 0000000..9eb5f41
--- /dev/null
+++ b/hooks/validators/evals/fixtures/bad_raw_orm_in_factory.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "fail",
+  "expected_fail_criteria": [1, 2],
+  "step2_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password via bcrypt\n    - Creates sibling Organization + Member rows\n    - Emits user_signed_up analytics event\n",
+  "current_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password via bcrypt\n    - Creates sibling Organization + Member rows\n    - Emits user_signed_up analytics event\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "User: defineFactory({\n    async create(data, ctx) {\n        return ctx.executor.user.create({ data });\n    },\n}),",
+  "helper_section": "(The factory does not call an external helper.)",
+  "original_creation_file": "src/users/user.service.ts",
+  "original_creation_snippet": "export const UserService = {\n    async create(input, deps) {\n        const hashed = await bcrypt.hash(input.password, 10);\n        const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n        await ensureOrgMembership(user, deps);\n        await analytics.capture('user_signed_up', { userId: user.id });\n        return user;\n    },\n};"
+}
diff --git a/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json
new file mode 100644
index 0000000..82c3daf
--- /dev/null
+++ b/hooks/validators/evals/fixtures/bad_stub_helper_in_handler_dir.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "fail",
+  "expected_fail_criteria": [1, 2, 4],
+  "step2_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/auth/auth.ts\n  creation_function: buildAuth.databaseHooks.user.create\n  side_effects:\n    - Calls ensureOrgMembership (creates Organization + Member)\n    - Calls ensureBillingProvisioning (creates BillingCustomer)\n    - Emits user_signed_up analytics event\n    - Fires signup webhook\n",
+  "current_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/auth/auth.ts\n  creation_function: buildAuth.databaseHooks.user.create\n  side_effects:\n    - Calls ensureOrgMembership (creates Organization + Member)\n    - Calls ensureBillingProvisioning (creates BillingCustomer)\n    - Emits user_signed_up analytics event\n    - Fires signup webhook\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "User: defineFactory({\n    async create(data, ctx) {\n        return createUser(ctx.executor, data);\n    },\n}),",
+  "helper_section": "File: src/routes/autonoma/autonoma-factories.ts\nFunction: createUser\n\n```\n// better-auth's internal adapter does the same thing — no business logic\n// beyond the raw insert.\nexport async function createUser(db, data) {\n    return db.user.create({ data, select: { id: true } });\n}\n```",
+  "original_creation_file": "src/auth/auth.ts",
+  "original_creation_snippet": "export const buildAuth = (db) => betterAuth({\n    database: prismaAdapter(db),\n    databaseHooks: {\n        user: {\n            create: async (user) => {\n                const created = await db.user.create({ data: user });\n                await ensureOrgMembership(created, { db });\n                await ensureBillingProvisioning(created, { db });\n                await analytics.capture('user_signed_up', { userId: created.id });\n                await fireSignupWebhook(created);\n                return created;\n            },\n        },\n    },\n});"
+}
diff --git a/hooks/validators/evals/fixtures/dependent_skipped.json b/hooks/validators/evals/fixtures/dependent_skipped.json
new file mode 100644
index 0000000..1e131cd
--- /dev/null
+++ b/hooks/validators/evals/fixtures/dependent_skipped.json
@@ -0,0 +1,7 @@
+{
+  "kind": "audit_filter",
+  "note": "Pure dependent (independently_created:false) must be silently skipped by the fidelity validator — no factory, no claude -p call. This fixture is evaluated by checking validate_factory_fidelity's model list, not by calling the LLM.",
+  "model": "BranchDeployment",
+  "expected_verdict": "skip",
+  "step2_audit_entry": "- name: BranchDeployment\n  independently_created: false\n  created_by:\n    - owner: Application\n      via: ApplicationsService.createApplication\n      why: \"Minted inside the Application transaction so the default branch has a deployment row wired up from the start.\"\n"
+}
diff --git a/hooks/validators/evals/fixtures/dual_judged_on_standalone.json b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json
new file mode 100644
index 0000000..569a5cd
--- /dev/null
+++ b/hooks/validators/evals/fixtures/dual_judged_on_standalone.json
@@ -0,0 +1,13 @@
+{
+  "note": "Dual model (independently_created:true AND in someone's created_by). Must be judged ONLY on its standalone factory; the via-owner relationship must not affect the verdict.",
+  "model": "Branch",
+  "expected_verdict": "pass",
+  "expected_fail_criteria": [],
+  "step2_audit_entry": "- name: Branch\n  independently_created: true\n  creation_file: src/branches/branch.service.ts\n  creation_function: BranchService.create\n  side_effects:\n    - Writes a default BranchSettings row\n  created_by:\n    - owner: Application\n      via: ApplicationsService.createApplication\n      why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n",
+  "current_audit_entry": "- name: Branch\n  independently_created: true\n  creation_file: src/branches/branch.service.ts\n  creation_function: BranchService.create\n  side_effects:\n    - Writes a default BranchSettings row\n  created_by:\n    - owner: Application\n      via: ApplicationsService.createApplication\n      why: \"Every new Application needs a default main branch, created inline in the same transaction.\"\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "Branch: defineFactory({\n    async create(data, ctx) {\n        return BranchService.create(data, { executor: ctx.executor });\n    },\n}),",
+  "helper_section": "(The factory does not call an external helper.)",
+  "original_creation_file": "src/branches/branch.service.ts",
+  "original_creation_snippet": "export const BranchService = {\n    async create(input, deps) {\n        const branch = await deps.executor.branch.create({ data: input });\n        await deps.executor.branchSettings.create({ data: { branchId: branch.id, theme: 'default' } });\n        return branch;\n    },\n};"
+}
diff --git a/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json
new file mode 100644
index 0000000..73934fb
--- /dev/null
+++ b/hooks/validators/evals/fixtures/framework_hook_extraction_pass.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "pass",
+  "expected_fail_criteria": [],
+  "step2_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/auth/auth.ts\n  creation_function: betterAuth.databaseHooks.user.create\n  needs_extraction: true\n  extracted_to: src/auth/create-user-with-onboarding.ts\n  side_effects:\n    - ensureOrgMembership\n    - signupHooks.run\n    - platformEvents.emit\n  created_by: []\n",
+  "current_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/auth/auth.ts\n  creation_function: betterAuth.databaseHooks.user.create\n  needs_extraction: true\n  extracted_to: src/auth/create-user-with-onboarding.ts\n  side_effects:\n    - ensureOrgMembership\n    - signupHooks.run\n    - platformEvents.emit\n  created_by: []\n",
+  "handler_path": "src/autonoma/handler.ts",
+  "factory_block": "User: defineFactory({\n    create: async (data) => {\n        return createUserWithOnboarding(db, data, { signupHooks, platformEvents });\n    },\n}),",
+  "helper_section": "File: src/auth/create-user-with-onboarding.ts\nFunction: createUserWithOnboarding\n\n```\nexport async function createUserWithOnboarding(db, data, { signupHooks, platformEvents }) {\n    const user = await db.user.create({ data: { name: data.name, email: data.email } });\n    await ensureOrgMembership(db, user.id);\n    await signupHooks.run(user);\n    await platformEvents.emit('user_signed_up', { userId: user.id });\n    return user;\n}\n```",
+  "original_creation_file": "src/auth/auth.ts",
+  "original_creation_snippet": "export const buildAuth = () => betterAuth({\n    databaseHooks: {\n        user: {\n            create: {\n                after: async (user) => {\n                    await ensureOrgMembership(db, user.id);\n                    await signupHooks.run(user);\n                    await platformEvents.emit('user_signed_up', { userId: user.id });\n                },\n            },\n        },\n    },\n});"
+}
diff --git a/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json
new file mode 100644
index 0000000..f3eccef
--- /dev/null
+++ b/hooks/validators/evals/fixtures/framework_hook_raw_write_fail.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "fail",
+  "expected_fail_criteria": [1, 4],
+  "step2_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/auth/auth.ts\n  creation_function: betterAuth.databaseHooks.user.create\n  needs_extraction: true\n  extracted_to: src/auth/create-user-with-onboarding.ts\n  side_effects:\n    - ensureOrgMembership\n    - signupHooks.run\n    - platformEvents.emit\n  created_by: []\n",
+  "current_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/auth/auth.ts\n  creation_function: betterAuth.databaseHooks.user.create\n  needs_extraction: true\n  extracted_to: src/auth/create-user-with-onboarding.ts\n  side_effects:\n    - ensureOrgMembership\n    - signupHooks.run\n    - platformEvents.emit\n  created_by: []\n",
+  "handler_path": "src/autonoma/handler.ts",
+  "factory_block": "User: defineFactory({\n    create: async (data) => {\n        return db.user.create({ data: { name: data.name, email: data.email } });\n    },\n}),",
+  "helper_section": "(The factory does not call an external helper.)",
+  "original_creation_file": "src/auth/auth.ts",
+  "original_creation_snippet": "export const buildAuth = () => betterAuth({\n    databaseHooks: {\n        user: {\n            create: {\n                after: async (user) => {\n                    await ensureOrgMembership(db, user.id);\n                    await signupHooks.run(user);\n                    await platformEvents.emit('user_signed_up', { userId: user.id });\n                },\n            },\n        },\n    },\n});"
+}
diff --git a/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json
new file mode 100644
index 0000000..b2c2078
--- /dev/null
+++ b/hooks/validators/evals/fixtures/good_thin_wrapper_after_extraction.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "pass",
+  "expected_fail_criteria": [],
+  "step2_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/auth/create-user.ts\n  creation_function: createUser\n  side_effects:\n    - Calls ensureOrgMembership (creates Organization + Member)\n    - Calls ensureBillingProvisioning (creates BillingCustomer)\n    - Emits user_signed_up analytics event\n",
+  "current_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/auth/create-user.ts\n  creation_function: createUser\n  extracted_to: src/auth/create-user.ts\n  side_effects:\n    - Calls ensureOrgMembership (creates Organization + Member)\n    - Calls ensureBillingProvisioning (creates BillingCustomer)\n    - Emits user_signed_up analytics event\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "User: defineFactory({\n    async create(data, ctx) {\n        return createUser(data, { db: ctx.executor, analytics, billing });\n    },\n}),",
+  "helper_section": "File: src/auth/create-user.ts\nFunction: createUser\n\n```\n// Extracted from the databaseHooks.user.create closure for Environment\n// Factory reuse (preserves Org + Member + billing provisioning).\nexport async function createUser(input, deps) {\n    const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n    await ensureOrgMembership(user, deps);\n    await ensureBillingProvisioning(user, deps);\n    await deps.analytics.capture('user_signed_up', { userId: user.id });\n    return user;\n}\n```",
+  "original_creation_file": "src/auth/create-user.ts",
+  "original_creation_snippet": "export async function createUser(input, deps) {\n    const user = await deps.db.user.create({ data: { ...input, password: hash(input.password) } });\n    await ensureOrgMembership(user, deps);\n    await ensureBillingProvisioning(user, deps);\n    await deps.analytics.capture('user_signed_up', { userId: user.id });\n    return user;\n}"
+}
diff --git a/hooks/validators/evals/fixtures/good_uses_service.json b/hooks/validators/evals/fixtures/good_uses_service.json
new file mode 100644
index 0000000..86684bd
--- /dev/null
+++ b/hooks/validators/evals/fixtures/good_uses_service.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "pass",
+  "expected_fail_criteria": [],
+  "step2_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password via bcrypt\n    - Creates sibling Organization + Member rows\n    - Emits user_signed_up analytics event\n",
+  "current_audit_entry": "- name: User\n  has_creation_code: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password via bcrypt\n    - Creates sibling Organization + Member rows\n    - Emits user_signed_up analytics event\n",
+  "handler_path": "src/routes/autonoma/autonoma.handler.ts",
+  "factory_block": "User: defineFactory({\n    async create(data, ctx) {\n        return UserService.create(data, { executor: ctx.executor });\n    },\n}),",
+  "helper_section": "(The factory does not call an external helper.)",
+  "original_creation_file": "src/users/user.service.ts",
+  "original_creation_snippet": "export const UserService = {\n    async create(input, deps) {\n        const hashed = await bcrypt.hash(input.password, 10);\n        const user = await deps.executor.user.create({ data: { ...input, password: hashed } });\n        await ensureOrgMembership(user, deps);\n        await analytics.capture('user_signed_up', { userId: user.id });\n        return user;\n    },\n};"
+}
diff --git a/hooks/validators/evals/fixtures/helper_unresolvable_errors.json b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json
new file mode 100644
index 0000000..552741a
--- /dev/null
+++ b/hooks/validators/evals/fixtures/helper_unresolvable_errors.json
@@ -0,0 +1,12 @@
+{
+  "model": "User",
+  "expected_verdict": "error",
+  "expected_fail_criteria": [],
+  "step2_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password\n    - Provisions Org + Member\n  created_by: []\n",
+  "current_audit_entry": "- name: User\n  independently_created: true\n  creation_file: src/users/user.service.ts\n  creation_function: UserService.create\n  side_effects:\n    - Hashes password\n    - Provisions Org + Member\n  created_by: []\n",
+  "handler_path": "src/autonoma/handler.ts",
+  "factory_block": "User: defineFactory({\n    create: async (data) => {\n        return createUserWithMystery(data);\n    },\n}),",
+  "helper_section": "(The factory calls identifiers that were not resolvable as named imports: createUserWithMystery. Treat this as missing-context, not as evidence of a raw-write factory.)",
+  "original_creation_file": "src/users/user.service.ts",
+  "original_creation_snippet": "export const UserService = {\n    async create(input) {\n        return db.user.create({ data: input });\n    },\n};"
+}
diff --git a/hooks/validators/evals/run_evals.py b/hooks/validators/evals/run_evals.py
new file mode 100755
index 0000000..fc695ed
--- /dev/null
+++ b/hooks/validators/evals/run_evals.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""Evals for the semantic factory-fidelity validator + the entity-audit
+validator's schema invariants.
+
+Each fixture is a self-contained JSON blob. The kind of fixture is chosen by
+`expected_verdict` (or by the `kind` field for non-LLM fixtures):
+
+- `expected_verdict: "pass" | "fail" | "error"` — LLM fixture. Feeds the
+  prompt to `claude -p`, parses the JSON verdict, and asserts verdict +
+  failing criteria match. `error` is used when a fixture deliberately
+  withholds context (e.g. helper unresolvable) and the LLM should decline
+  to fail-judge rather than falsely fail.
+- `expected_verdict: "skip"` — filter fixture. Asserts that the fidelity
+  validator's model selector would NOT include this model (i.e. the audit
+  entry is pure dependent / legacy false). No LLM call, no cost.
+- `kind: "audit_validator"` — audit-validator fixture. Synthesises a
+  minimal entity-audit.md from `audit_frontmatter`, runs
+  `validate_entity_audit.py` as a subprocess, and asserts the exit code +
+  a substring of the combined stdout/stderr output.
+
+Run:
+    AUTONOMA_DOCS_URL=http://localhost:4321 python3 hooks/validators/evals/run_evals.py
+
+    # single fixture:
+    ... run_evals.py --only good_uses_service
+
+Exits 0 on success, 1 on any mismatch.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+VALIDATORS = HERE.parent
+sys.path.insert(0, str(VALIDATORS))
+
+import validate_factory_fidelity as v  # noqa: E402
+from _audit_schema import is_independently_created  # noqa: E402
+
+
+def load_fixture(path: Path) -> dict:
+    return json.loads(path.read_text(encoding="utf-8"))  # JSON is UTF-8; don't use the locale default
+
+
+def render_prompt(fixture: dict, rubric: str, tpl: str) -> str:
+    """Fill every {{PLACEHOLDER}} in `tpl` from the fixture, one replace at a time in fixed order."""
+    no_helper = "(The factory does not call an external helper.)"
+    out = tpl.replace("{{RUBRIC}}", rubric).replace("{{MODEL}}", fixture["model"])
+    out = out.replace("{{STEP2_AUDIT_ENTRY}}", fixture["step2_audit_entry"])
+    out = out.replace("{{CURRENT_AUDIT_ENTRY}}", fixture["current_audit_entry"])
+    out = out.replace("{{HANDLER_PATH}}", fixture.get("handler_path", "(fixture)"))
+    out = out.replace("{{FACTORY_BLOCK}}", fixture["factory_block"])
+    out = out.replace("{{HELPER_SECTION}}", fixture.get("helper_section", no_helper))
+    out = out.replace("{{ORIGINAL_CREATION_FILE}}", fixture.get("original_creation_file", "(unknown)"))
+    out = out.replace("{{ORIGINAL_CREATION_SNIPPET}}", fixture.get("original_creation_snippet", ""))
+    return out
+
+
+def run_skip_fixture(fixture: dict) -> tuple[bool, str]:
+    """Expect a skip: step2_audit_entry's first item must not be independently created."""
+    import yaml
+    try:
+        entries = yaml.safe_load(fixture["step2_audit_entry"])
+    except yaml.YAMLError as e:
+        return False, f"could not parse step2_audit_entry: {e}"
+    well_formed = isinstance(entries, list) and entries and isinstance(entries[0], dict)
+    if not well_formed:
+        return False, "step2_audit_entry must be a single-entry YAML list"
+    entry = entries[0]
+    if not is_independently_created(entry):
+        return True, "ok"
+    return False, (
+        f"fidelity validator would NOT skip this model — is_independently_created "
+        f"returned True for entry {entry!r}"
+    )
+
+
+def run_audit_validator_fixture(fixture: dict) -> tuple[bool, str]:
+    """Run validate_entity_audit.py on a synthesised audit; check exit code and output text."""
+    fm = fixture["audit_frontmatter"]
+    expected_exit = int(fixture.get("expected_exit", 1))
+    expected_substr = fixture.get("expected_stderr_substring", "")
+    with tempfile.TemporaryDirectory() as td:
+        audit = Path(td) / "entity-audit.md"
+        # The closing `---` must start its own line even if the fixture omits a final newline.
+        audit.write_text("---\n" + fm + ("\n" if fm and not fm.endswith("\n") else "") + "---\nBody\n")
+        cmd = [sys.executable, str(VALIDATORS / "validate_entity_audit.py"), str(audit)]
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    if proc.returncode != expected_exit:
+        return False, (
+            f"exit mismatch: expected={expected_exit} observed={proc.returncode} "
+            f"stdout={proc.stdout!r} stderr={proc.stderr!r}"
+        )
+    combined = (proc.stdout or "") + (proc.stderr or "")
+    if expected_substr and expected_substr not in combined:
+        return False, f"expected stderr substring {expected_substr!r} not in output:\n{combined}"
+    return True, "ok"
+
+
+def main() -> int:
+    """Run the selected fixtures; return the exit code (0 = all passed, 1 = any mismatch or setup error)."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--only", help="Run a single fixture by name (no extension)")
+    ap.add_argument("--write-prompt", action="store_true", help="Write the rendered prompt for each LLM fixture to stdout and exit without calling claude")
+    args = ap.parse_args()
+
+    os.chdir(VALIDATORS.parent.parent)
+    Path("autonoma").mkdir(exist_ok=True)
+    url_file = Path("autonoma/.docs-url")
+    restore = url_file.exists()
+    prior = url_file.read_text() if restore else None
+    docs = os.environ.get("AUTONOMA_DOCS_URL")
+
+    fixtures_dir = HERE / "fixtures"
+    fixtures = sorted(fixtures_dir.glob("*.json"))
+    if args.only:
+        fixtures = [f for f in fixtures if f.stem == args.only]
+        if not fixtures:
+            print(f"no fixture named {args.only}", file=sys.stderr)
+            return 1
+
+    # Only fetch rubric if we have any LLM fixtures left in the run list
+    llm_verdicts = ("pass", "fail", "error")
+    needs_llm = any(load_fixture(fp).get("expected_verdict") in llm_verdicts for fp in fixtures)
+    rubric = tpl = None
+    try:
+        if needs_llm:
+            # Write the override inside the try so the finally below always undoes it.
+            if docs:
+                url_file.write_text(docs.strip())
+            pair = v.fetch_rubric()
+            if not pair:
+                print("could not fetch rubric — set AUTONOMA_DOCS_URL", file=sys.stderr)
+                return 1
+            rubric, tpl = pair
+    finally:
+        if restore:
+            url_file.write_text(prior or "")
+        elif docs:
+            try:
+                url_file.unlink()
+            except OSError:
+                pass
+
+    fails: list[str] = []
+    for fp in fixtures:
+        fixture = load_fixture(fp)
+        kind = fixture.get("kind")
+        expected = fixture.get("expected_verdict")
+
+        if kind == "audit_validator":
+            ok, detail = run_audit_validator_fixture(fixture)
+            tag = "PASS" if ok else "FAIL"
+            print(f"{tag} {fp.stem}: audit_validator")
+            if not ok:
+                print(f"    reason: {detail}")
+                fails.append(fp.stem)
+            continue
+
+        if expected == "skip":
+            ok, detail = run_skip_fixture(fixture)
+            tag = "PASS" if ok else "FAIL"
+            print(f"{tag} {fp.stem}: expected=skip observed={'skip' if ok else 'NOT-skipped'}")
+            if not ok:
+                print(f"    reason: {detail}")
+                fails.append(fp.stem)
+            continue
+
+        # LLM fixture
+        if args.write_prompt:
+            print(f"── {fp.stem} ──")
+            print(render_prompt(fixture, rubric, tpl))
+            print()
+            continue
+        verdict = v.run_claude(render_prompt(fixture, rubric, tpl))
+        observed = verdict.get("verdict", "error")
+        matched = observed == expected
+        detail_ok = True
+        if expected == "fail" and observed == "fail":
+            expected_fails = set(fixture.get("expected_fail_criteria") or [])
+            if expected_fails:
+                observed_fails = {c.get("id") for c in (verdict.get("criteria") or []) if c.get("status") == "fail"}
+                missing = expected_fails - observed_fails
+                if missing:
+                    detail_ok = False
+        ok = matched and detail_ok
+        tag = "PASS" if ok else "FAIL"
+        print(f"{tag} {fp.stem}: expected={expected} observed={observed}")
+        if not ok:
+            print(f"    reason: expected criteria={fixture.get('expected_fail_criteria')} observed={list(verdict.get('criteria') or [])}")
+            print(f"    fix_hint: {verdict.get('fix_hint','')}")
+            fails.append(fp.stem)
+
+    if fails:
+        print(f"\n{len(fails)} eval failure(s): {', '.join(fails)}", file=sys.stderr)
+        return 1
+    print("\nall evals passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/hooks/validators/validate_creation_file_immutable.py b/hooks/validators/validate_creation_file_immutable.py
new file mode 100755
index 0000000..3bfdf87
--- /dev/null
+++ b/hooks/validators/validate_creation_file_immutable.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""Validator: `creation_file` must be immutable after Step 2.
+
+Rationale — Run 4 post-mortem. The env-factory agent evaded the factory
+integrity hook by (a) extracting stubs into a new file under the handler's
+directory and (b) rewriting `creation_file` in the audit to point at the stub,
+so every downstream check validated against fabricated ground truth.
+
+Rule: for every model with `has_creation_code: true` in BOTH the Step 2
+snapshot AND the current audit, the `creation_file` column must not change.
+Allowed transitions:
+  - row removed from current (not a change, model dropped)
+  - has_creation_code flipped true -> false (covered by the audit-flip cap)
+  - a new model added in current (snapshot has no row to compare)
+
+Exit 0 = clean. Exit 2 with actionable message on violation.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import yaml  # type: ignore
+
+import sys as _sys
+from pathlib import Path as _Path
+_sys.path.insert(0, str(_Path(__file__).resolve().parent))
+from _audit_schema import is_independently_created  # noqa: E402
+
+
+def load_audit(path: Path) -> dict[str, dict]:
+    """Map model name -> entry from the file's YAML frontmatter ({} if absent or unusable)."""
+    if not path.exists():
+        return {}
+    text = path.read_text()
+    # Frontmatter is the text between the leading `---` and the next line starting with `---`.
+    end = text.find("\n---", 3) if text.startswith("---") else -1
+    if end < 0:
+        return {}
+    try:
+        fm = yaml.safe_load(text[3:end])
+    except yaml.YAMLError:
+        return {}
+    if not isinstance(fm, dict):  # empty or non-mapping frontmatter parses to None/scalar/list
+        return {}
+    out: dict[str, dict] = {}
+    for entry in (fm.get("models") or []):
+        if isinstance(entry, dict):
+            name = entry.get("name") or entry.get("model")
+            if name:
+                out[str(name)] = entry
+    return out
+
+
+def main() -> None:
+    """Exit 0, or exit 2 with a report when a `creation_file` changed since the Step 2 snapshot.
+
+    Only models that are independently created in both audits are compared.
+    """
+    snap = load_audit(Path("autonoma/.entity-audit-step2.md"))
+    cur = load_audit(Path("autonoma/entity-audit.md"))
+    if not snap:
+        # No usable snapshot to compare against — pass without output.
+        sys.exit(0)
+
+    violations: list[tuple[str, str, str]] = []
+    for name, before in snap.items():
+        if not is_independently_created(before):
+            continue
+        after = cur.get(name)
+        # Rows dropped from the current audit, or no longer independently created, are skipped.
+        if after is None or not is_independently_created(after):
+            continue
+        old_path = (before.get("creation_file") or "").strip()
+        new_path = (after.get("creation_file") or "").strip()
+        if old_path and new_path and old_path != new_path:
+            violations.append((name, old_path, new_path))
+
+    if not violations:
+        sys.exit(0)
+
+    report = [
+        f"CREATION_FILE IMMUTABILITY VIOLATED — {len(violations)} models had "
+        "their Step 2 `creation_file` column overwritten.",
+        "",
+        "The Step 2 audit is a statement about the existing codebase at "
+        "analysis time. Its `creation_file` column names where the real "
+        "creation logic lives BEFORE the factory was written. Overwriting it "
+        "to point at a file the factory agent created is the audit-rewrite "
+        "attack from the Run 4 post-mortem — it makes every downstream check "
+        "validate against fabricated ground truth.",
+        "",
+        "Violations (model: snapshot_path -> current_path):",
+        *(f"  - {name}: {s}  ->  {c}" for name, s, c in violations[:40]),
+    ]
+    if len(violations) > 40:
+        report.append(f"  ... and {len(violations) - 40} more")
+    report += [
+        "",
+        "To fix: restore the original `creation_file` values from "
+        "autonoma/.entity-audit-step2.md. If you extracted the creation code "
+        "into a new helper, record that in an `extracted_to:` field — do NOT "
+        "overwrite `creation_file`. The audit's creation_file must continue "
+        "to name the file where the real business logic originally lives.",
+    ]
+    sys.stderr.write("\n".join(report) + "\n")
+    sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hooks/validators/validate_endpoint_implemented.py b/hooks/validators/validate_endpoint_implemented.py
new file mode 100755
index 0000000..4e27ac4
--- /dev/null
+++ b/hooks/validators/validate_endpoint_implemented.py
@@ -0,0 +1,451 @@
+#!/usr/bin/env python3
+"""Validator for autonoma/.endpoint-implemented.
+
+Blocks the sentinel write when the handler file contains an inline ORM write
+inside a defineFactory({ create }) body for a model the entity audit marked
+has_creation_code: true. This is the #1 bug the env-factory agent ships and
+the agent's self-policing factory-integrity check has proven insufficient.
+
+Inputs: path to .endpoint-implemented (via validate-pipeline-output.sh).
+Reads:
+  - autonoma/entity-audit.md (frontmatter: models with has_creation_code true/false)
+  - the handler file path recorded in .endpoint-implemented body (first match of "handler: <path>")
+
+Exit codes:
+  0 — clean
+  2 — anti-pattern found; prints a Claude-facing error message on stderr
+
+The regex set mirrors the language list in agents/env-factory-generator.md's
+"The one thing you MUST NOT do" section. Raw SQL literal INSERTs are not
+matched here because distinguishing them from teardown DELETE strings in the
+same factory block requires full parsing — the grep-level anti-pattern
+detection catches the >95% case.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import sys
+from pathlib import Path
+
+import yaml  # type: ignore
+
+SENTINEL_PATH = sys.argv[1] if len(sys.argv) > 1 else ""
+
+# Max number of models allowed to flip from has_creation_code: true to false
+# between the Step 2 snapshot and the audit at .endpoint-implemented time.
+# Overridable via env for unusual migrations; default 5 matches the agent's
+# own recommendation in the third-run post-mortem.
+AUDIT_FLIP_CAP = int(os.environ.get("AUTONOMA_AUDIT_FLIP_CAP", "5"))
+
+# Standalone server patterns: when the handler directory contains a file that
+# starts its own HTTP server instead of exporting a router mounted on the main
+# app, we block. This is the second bug from the third-run post-mortem.
+STANDALONE_SERVER_PATTERNS = [
+    re.compile(r"\bserve\s*\(\s*\{[^}]*\bfetch\b", re.DOTALL),  # @hono/node-server
+    re.compile(r"\bapp\.listen\s*\("),                              # express / hono-node
+    re.compile(r"\bhttp\.createServer\s*\("),                       # raw node
+    re.compile(r"\buvicorn\.run\s*\("),                             # python
+    re.compile(r"\bFlask\s*\([^)]*\)[^\n]*\.run\s*\("),         # flask
+    re.compile(r"\brun!\s*$", re.MULTILINE),                          # ruby sinatra-ish
+]
+
+# Anti-pattern: ORM create/insert/upsert calls that almost certainly belong to
+# a raw ORM write rather than a service/repository method call.
+ORM_ANTI_PATTERN = re.compile(
+    r"\b(prisma|db|tx|ctx\.executor)\."        # ORM root
+    r"[a-zA-Z_][a-zA-Z0-9_]*\."                # model accessor
+    r"(create|createMany|insert|insertMany|upsert)\s*\(",
+    re.IGNORECASE,
+)
+
+# A second class: Drizzle-style `tx.insert(xTable)` / `db.insert(xTable)`.
+DRIZZLE_INSERT = re.compile(
+    r"\b(tx|db|ctx\.executor)\.insert\s*\(",
+)
+
+FACTORY_HEADER = re.compile(
+    r"([A-Z][A-Za-z0-9_]*)\s*:\s*defineFactory\s*\(\s*\{",
+)
+
+
+def fail(message: str) -> None:
+    print(message, file=sys.stderr)  # Claude-facing error text goes to stderr
+    raise SystemExit(2)  # exit code 2 = blocking validation failure
+
+
+def find_matching_brace(src: str, open_idx: int) -> int:
+    """Return the index of the `}` that balances the `{` at `open_idx`, or -1 if none does.
+
+    Only brace characters are counted: braces inside strings or comments are
+    not skipped. That is good enough for generated handler files that follow
+    the standard shape.
+    """
+    depth = 0
+    for pos in range(open_idx, len(src)):
+        ch = src[pos]
+        if ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                # Nesting is back to zero: this brace closes the opening one.
+                return pos
+    # Ran off the end of the source without balancing.
+    return -1
+
+
+def extract_factory_bodies(src: str) -> list[tuple[str, str]]:
+    """Return (model_name, factory_inner_src) for every `Name: defineFactory({ ... })`.
+
+    A header whose opening brace is never balanced is left out of the result.
+    """
+    bodies: list[tuple[str, str]] = []
+    for header in FACTORY_HEADER.finditer(src):
+        # The header match ends on the config object's `{`, so search from there.
+        start = src.find("{", header.end() - 1)
+        stop = find_matching_brace(src, start) if start >= 0 else -1
+        if stop >= 0:
+            bodies.append((header.group(1), src[start + 1 : stop]))
+    return bodies
+
+
+def extract_create_body(factory_src: str) -> str:
+    """Return the source inside the first `{ ... }` that follows `create(` or `create:`.
+
+    Handles `create(data, ctx) { ... }` and `create: [async] (data, ctx) => { ... }`.
+    NOTE: the first `{` wins, so a destructured parameter would be taken for the body.
+    Returns "" when there is no create key or no balanced brace pair after it.
+    """
+    head = re.search(r"\bcreate\s*[(:]", factory_src)
+    if head is None:
+        return ""
+    start = factory_src.find("{", head.end())
+    if start < 0:
+        return ""
+    stop = find_matching_brace(factory_src, start)
+    return "" if stop < 0 else factory_src[start + 1 : stop]
+
+
+def parse_audit() -> dict[str, bool]:
+    """Return {model_name: has_creation_code} from autonoma/entity-audit.md; calls fail() if unusable."""
+    audit_path = Path("autonoma/entity-audit.md")
+    if not audit_path.exists():
+        fail("Missing autonoma/entity-audit.md — cannot verify factory integrity.")
+    text = audit_path.read_text()
+    if not text.startswith("---"):
+        fail("autonoma/entity-audit.md missing YAML frontmatter.")
+    end = text.find("\n---", 3)
+    if end < 0:
+        fail("autonoma/entity-audit.md frontmatter not terminated.")
+    try:
+        fm = yaml.safe_load(text[3:end])
+    except yaml.YAMLError as e:
+        fail(f"autonoma/entity-audit.md frontmatter not valid YAML: {e}")
+    if not isinstance(fm, dict):  # e.g. empty frontmatter parses to None
+        fail("autonoma/entity-audit.md frontmatter must be a YAML mapping.")
+    out: dict[str, bool] = {}
+    for entry in fm.get("models") or []:
+        if not isinstance(entry, dict):
+            continue
+        name = entry.get("name") or entry.get("model")
+        if name:
+            out[str(name)] = bool(entry.get("has_creation_code"))
+    return out
+
+
+def resolve_handler_path() -> Path:
+    """Return the first existing handler file named in the .endpoint-implemented body.
+
+    Tries the `handler: <path>` value first, then any token ending in a known source
+    extension; relative paths resolve against the cwd. Calls fail() if none exists.
+    """
+    if not SENTINEL_PATH or not Path(SENTINEL_PATH).exists():
+        fail(".endpoint-implemented sentinel path not provided or missing.")
+    body = Path(SENTINEL_PATH).read_text()
+
+    candidates: list[str] = []
+    m = re.search(r"handler:\s*(\S+)", body, re.IGNORECASE)
+    if m:
+        candidates.append(m.group(1).rstrip(".,;:"))
+    # Fallback: path-looking tokens. The trailing \b keeps `ts` from matching inside `.tsx`.
+    for tok in re.findall(r"[\w./\\-]+\.(?:ts|tsx|js|mjs|cjs|py|rb|php|java|go|rs|ex|exs)\b", body):
+        candidates.append(tok.rstrip(".,;:"))
+
+    for cand in dict.fromkeys(candidates):  # de-duplicate, keeping first-seen order
+        p = Path(cand)
+        if not p.is_absolute():
+            p = Path.cwd() / cand
+        if p.exists() and p.is_file():
+            return p
+
+    fail(
+        ".endpoint-implemented body must name the handler file (e.g. a line "
+        "'handler: apps/api/src/routes/autonoma/autonoma.handler.ts') so the "
+        "factory-integrity validator can locate it. Checked: "
+        + ", ".join(candidates[:8] or ["(no path tokens found)"])
+    )
+    return Path()  # unreachable
+
+
+def check_audit_flip() -> list[str]:
+    """Compare the Step 2 snapshot to the current audit; return error lines.
+
+    Enforces a cap on how many models may flip from has_creation_code: true
+    to false between Step 2 ack and .endpoint-implemented. If no snapshot
+    exists (older projects that started before this hook shipped) we skip
+    silently — the snapshot is created automatically on .step-2-ack.
+    """
+    snapshot = Path("autonoma/.entity-audit-step2.md")
+    current = Path("autonoma/entity-audit.md")
+    if not snapshot.exists() or not current.exists():
+        return []
+
+    def _true_set(path: Path) -> set[str]:
+        """Names of models with a truthy has_creation_code; empty set if frontmatter is unusable."""
+        text = path.read_text()
+        end = text.find("\n---", 3) if text.startswith("---") else -1
+        if end < 0:
+            return set()
+        try:
+            fm = yaml.safe_load(text[3:end])
+        except yaml.YAMLError:
+            return set()
+        if not isinstance(fm, dict):  # empty frontmatter parses to None
+            return set()
+        out: set[str] = set()
+        for entry in (fm.get("models") or []):
+            if isinstance(entry, dict) and entry.get("has_creation_code"):
+                name = entry.get("name") or entry.get("model")
+                if name:
+                    out.add(str(name))
+        return out
+
+    before = _true_set(snapshot)
+    after = _true_set(current)
+    flipped = sorted(before - after)
+    if len(flipped) <= AUDIT_FLIP_CAP:
+        return []
+
+    lines = [
+        f"AUDIT FLIP CAP EXCEEDED — {len(flipped)} models flipped from "
+        f"has_creation_code: true to false since Step 2 (cap: {AUDIT_FLIP_CAP}).",
+        "",
+        "The env-factory agent is editing ground truth to dodge the factory "
+        "integrity check. Branch 3 (\"audit is factually wrong\") is for cases "
+        "where the audit's creation_function does NOT exist or creates NOTHING "
+        "— not for cases where calling it is inconvenient (complex DI, external "
+        "side effects, Temporal workflows, bulk orchestrators). Those are "
+        "Branch 2 problems: extract helpers, wire constructor deps, or guard "
+        "external calls in the service itself.",
+        "",
+        "Models flipped (showing first 40):",
+    ]
+    for name in flipped[:40]:
+        lines.append(f"  - {name}")
+    if len(flipped) > 40:
+        lines.append(f"  ... and {len(flipped) - 40} more")
+    lines.append("")
+    lines.append(
+        "To proceed: (a) restore has_creation_code: true for the models above "
+        "and write real factories per the Per-model decision tree, or (b) if "
+        "you truly believe a subset should flip, ask the user to raise "
+        "AUTONOMA_AUDIT_FLIP_CAP and confirm the diff."
+    )
+    return lines
+
+
+def check_handler_mount(handler_path: Path) -> list[str]:
+    """Return error lines if the handler isn't mounted on the main app.
+
+    Two checks:
+      1. No sibling file in the handler directory starts its own server.
+      2. Some source file under the current working directory, outside the
+         handler's own directory, imports `<parent-dir>/<file-stem>`.
+    """
+    handler_dir = handler_path.parent
+    errors: list[str] = []
+
+    # 1) Detect standalone server files in the handler directory.
+    standalone_hits: list[tuple[Path, str]] = []
+    for sibling in handler_dir.iterdir():
+        if not sibling.is_file():
+            continue
+        if sibling == handler_path:
+            continue
+        if sibling.name.endswith((".test.ts", ".test.js", ".spec.ts", ".spec.js")):
+            continue
+        if sibling.suffix not in {".ts", ".tsx", ".js", ".mjs", ".py", ".rb", ".go", ".rs", ".java"}:
+            continue
+        try:
+            text = sibling.read_text()
+        except (OSError, UnicodeDecodeError):  # unreadable or not valid text: skip, don't crash
+            continue
+        for pat in STANDALONE_SERVER_PATTERNS:
+            if pat.search(text):
+                standalone_hits.append((sibling, pat.pattern))
+                break
+
+    if standalone_hits:
+        errors.append(
+            "STANDALONE SERVER DETECTED — the Autonoma handler must be mounted "
+            "as a route on the existing application, not run as its own HTTP "
+            "server. The following files bind their own port:"
+        )
+        errors.append("")
+        for p, pat in standalone_hits:
+            errors.append(f"  - {p} (matched: {pat})")
+        errors.append("")
+        errors.append(
+            "Fix: delete the standalone server file and mount the handler as a "
+            "route on the main app, following the same pattern every other "
+            "feature uses (e.g. `app.route(\"/api/autonoma\", router)` in Hono, "
+            "`app.use(\"/api/autonoma\", router)` in Express, or the equivalent "
+            "for your framework). Read the main app entry file first and copy "
+            "its existing routing pattern."
+        )
+        errors.append("")
+
+    # 2) Verify the handler is imported from somewhere reachable. We use the
+    # last two path segments (parent-dir/file-stem) to avoid false positives
+    # from unrelated packages that happen to share the parent-dir name (e.g.
+    # `@autonoma/logger` vs the local `autonoma/handler`).
+    handler_basename = handler_path.stem              # e.g. "handler"
+    handler_parent_dir = handler_dir.name             # e.g. "autonoma"
+    specific_fragment = f"{handler_parent_dir}/{handler_basename}"  # "autonoma/handler"
+    # Also accept any file in the same parent directory (routes on the router
+    # file next to handler.ts still count as mounting — e.g. autonoma/router.ts
+    # is imported by app.ts and imports handler.ts).
+    import_patterns = [
+        re.compile(rf"['\"][^'\"]*{re.escape(specific_fragment)}(?:['\"]|\.[a-z]+['\"])"),
+        re.compile(rf"\bfrom\s+[\w.]*{re.escape(handler_parent_dir)}\.{re.escape(handler_basename)}\b"),  # python
+    ]
+    found_import = False
+    root = Path.cwd()
+    # Only scan source dirs with reasonable extensions (a tuple, so str.endswith takes it directly).
+    source_exts = (".ts", ".tsx", ".js", ".mjs", ".cjs", ".py", ".rb", ".go", ".rs", ".java", ".ex", ".exs", ".php")
+    skip_dirs = {"node_modules", ".git", "dist", "build", ".next", ".turbo", "target", "vendor", "__pycache__", "autonoma"}
+    for dirpath, dirnames, filenames in os.walk(root):
+        dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
+        for fn in filenames:
+            if not fn.endswith(source_exts):
+                continue
+            fp = Path(dirpath) / fn
+            if fp.resolve() == handler_path.resolve():
+                continue
+            if fp.parent.resolve() == handler_path.parent.resolve():
+                # Don't count imports inside the handler's own directory — the
+                # standalone server.ts imports handler.ts but that isn't
+                # "reachable from the main app".
+                continue
+            try:
+                text = fp.read_text()
+            except (OSError, UnicodeDecodeError):  # one undecodable file must not abort the scan
+                continue
+            for pat in import_patterns:
+                if pat.search(text):
+                    found_import = True
+                    break
+            if found_import:
+                break
+        if found_import:
+            break
+
+    if not found_import:
+        errors.append(
+            f"HANDLER NOT MOUNTED — no file outside {handler_dir} imports the "
+            f"Autonoma handler. The endpoint is unreachable from the main "
+            f"application's routes."
+        )
+        errors.append("")
+        errors.append(
+            "Fix: import the handler (or its router) from the main app's entry "
+            "file (e.g. apps/api/src/app.ts) and mount it on a route. The "
+            "Autonoma platform sends HMAC-signed requests to the main API's "
+            "public URL — a handler that nothing imports is dead code."
+        )
+        errors.append("")
+
+    return errors
+
+
+def main() -> None:
+    """Run every factory-integrity check against the resolved handler.
+
+    Exits 0 when nothing is flagged; otherwise hands the joined report to fail().
+    """
+    audit = parse_audit()
+    handler_path = resolve_handler_path()
+    handler_source = handler_path.read_text()
+
+    violations: list[tuple[str, int, str]] = []
+    registered: set[str] = set()
+    for model, factory_src in extract_factory_bodies(handler_source):
+        registered.add(model)
+        # A falsy audit entry means has_creation_code is false or unknown, so
+        # an ORM fallback is legitimate and the factory body is not scanned.
+        create_body = extract_create_body(factory_src) if audit.get(model) else None
+        if not create_body:
+            continue
+        body_lines = create_body.splitlines()
+        # Every ORM hit is recorded before any Drizzle insert hit.
+        for pattern in (ORM_ANTI_PATTERN, DRIZZLE_INSERT):
+            for hit in pattern.finditer(create_body):
+                line_no = create_body.count("\n", 0, hit.start()) + 1
+                violations.append((model, line_no, body_lines[line_no - 1].strip()))
+
+    # Models the audit marks has_creation_code but the handler never registers.
+    missing_factories = [
+        name for name, has_code in audit.items() if has_code and name not in registered
+    ]
+
+    audit_flip_errors = check_audit_flip()
+    mount_errors = check_handler_mount(handler_path)
+
+    if not (violations or missing_factories or audit_flip_errors or mount_errors):
+        sys.exit(0)
+
+    report = [
+        "FACTORY INTEGRITY CHECK FAILED — .endpoint-implemented will NOT be written.",
+        "",
+        f"Handler inspected: {handler_path}",
+        "",
+    ]
+    if violations:
+        report.append(
+            "The following factories contain inline ORM writes for models the audit "
+            "marked has_creation_code: true. This is the #1 trap the env-factory "
+            "agent is warned about. You MUST call the audited creation_function "
+            "(extracting it first if needs_extraction: true). See the Per-model "
+            "decision tree and DI playbook in the env-factory prompt."
+        )
+        report.append("")
+        report.extend(
+            f"  - {model} factory body: line {line_no}: {snippet}"
+            for model, line_no, snippet in violations
+        )
+        report.append("")
+    if missing_factories:
+        report.append(
+            "The following models are has_creation_code: true in the audit but have "
+            "no defineFactory registration in the handler:"
+        )
+        report.extend(f"  - {name}" for name in missing_factories)
+        report.append("")
+    if audit_flip_errors:
+        report.extend(audit_flip_errors)
+    if mount_errors:
+        report.extend(mount_errors)
+    if violations or missing_factories:
+        report.append(
+            "To fix: re-run the Per-model decision tree for every failing model. If the "
+            "creation function is inline in a route/framework hook, extract it into a "
+            "named exported function, update entity-audit.md in place (clear "
+            "needs_extraction), then call the new function from the factory."
+        )
+    fail("\n".join(report))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hooks/validators/validate_entity_audit.py b/hooks/validators/validate_entity_audit.py
new file mode 100644
index 0000000..ee65369
--- /dev/null
+++ b/hooks/validators/validate_entity_audit.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""Validates entity-audit.md frontmatter format.
+
+Supports two schemas:
+
+- v2 (current): each model has `independently_created: bool` and
+  `created_by: [{owner, via, why}]`. When `independently_created: true` the
+  entry must also have `creation_file`, `creation_function`, and optionally
+  `side_effects`. Dependents (`independently_created: false`) must have a
+  non-empty `created_by` pointing at a model that exists in the audit.
+
+- v1 (legacy): each model has `has_creation_code: bool`. We still accept it
+  and translate on read (see _audit_schema.py). v1 audits cannot express
+  `created_by`, so the dependent-has-owner invariant is vacuously satisfied.
+"""
+import sys
+import yaml
+from pathlib import Path
+
+filepath = sys.argv[1]  # path to entity-audit.md, passed by the hook
+content = Path(filepath).read_text()  # read_text closes the handle; open().read() left it to the GC
+
+if not content.startswith('---'):
+    print('File must start with YAML frontmatter (---)')
+    sys.exit(1)
+
+parts = content.split('---', 2)  # ['', frontmatter, body]; parts[2] is reused by the autofix rewrite
+if len(parts) < 3:
+    print('Missing closing --- for frontmatter')
+    sys.exit(1)
+
+try:
+    fm = yaml.safe_load(parts[1])
+except Exception as e:
+    print(f'Invalid YAML in frontmatter: {e}')
+    sys.exit(1)
+
+if not isinstance(fm, dict):
+    print('Frontmatter must be a YAML mapping')
+    sys.exit(1)
+
+required = ['model_count', 'factory_count', 'models']
+missing = [f for f in required if f not in fm]
+if missing:
+    print(f'Missing required frontmatter fields: {missing}')
+    sys.exit(1)
+
+for count_field in ['model_count', 'factory_count']:
+    val = fm.get(count_field)
+    if isinstance(val, bool) or not isinstance(val, int) or val < 0:  # bool is an int subclass; reject `true`
+        print(f'{count_field} must be a non-negative integer')
+        sys.exit(1)
+
+if fm['model_count'] < 1:
+    print('model_count must be at least 1 — no models were audited')
+    sys.exit(1)
+
+models = fm.get('models')
+if not isinstance(models, list) or len(models) == 0:
+    print('models must be a non-empty list')
+    sys.exit(1)
+
+if len(models) != fm['model_count']:
+    print(f'model_count ({fm["model_count"]}) does not match models array length ({len(models)})')
+    sys.exit(1)
+
+
+def is_indep(model):
+    """True when the model counts as independently created (the v2 key wins over v1 has_creation_code)."""
+    flag = 'independently_created' if 'independently_created' in model else 'has_creation_code'
+    return bool(model.get(flag))
+
+
+# First pass: sanity + collect names for cross-reference
+names = set()
+for i, model in enumerate(models):
+    if not isinstance(model, dict):
+        print(f'models[{i}] must be a mapping')
+        sys.exit(1)
+    if 'name' not in model or not isinstance(model['name'], str) or not model['name'].strip():
+        print(f'models[{i}].name must be a non-empty string')
+        sys.exit(1)
+    names.add(model['name'])
+
+# Second pass: schema checks per model
+factory_count = 0
+for i, model in enumerate(models):
+    name = model['name']
+    has_v2 = 'independently_created' in model
+    has_v1 = 'has_creation_code' in model
+    if not has_v2 and not has_v1:
+        print(f'models[{i}] ({name}) missing classification (independently_created or has_creation_code)')
+        sys.exit(1)
+    if has_v2 and not isinstance(model['independently_created'], bool):
+        print(f'models[{i}] ({name}).independently_created must be a boolean')
+        sys.exit(1)
+    if has_v1 and not isinstance(model['has_creation_code'], bool):
+        print(f'models[{i}] ({name}).has_creation_code must be a boolean')
+        sys.exit(1)
+
+    indep = is_indep(model)
+
+    if indep:
+        factory_count += 1
+        if not isinstance(model.get('creation_file'), str):  # a missing key yields None and fails too
+            print(f'models[{i}] ({name}) independently_created=true but missing creation_file')
+            sys.exit(1)
+        if not isinstance(model.get('creation_function'), str):
+            print(f'models[{i}] ({name}) independently_created=true but missing creation_function')
+            sys.exit(1)
+        if 'side_effects' in model and not isinstance(model['side_effects'], list):
+            print(f'models[{i}] ({name}) side_effects must be a list when present')
+            sys.exit(1)
+
+    # created_by invariants (v2 only — v1 has no such field)
+    cb = model.get('created_by')
+    if cb is None:
+        # v1 audits don't have it; v2 requires it (empty allowed for roots)
+        if has_v2:
+            print(f'models[{i}] ({name}) missing required field: created_by (list, may be empty)')
+            sys.exit(1)
+        continue
+
+    if not isinstance(cb, list):
+        print(f'models[{i}] ({name}).created_by must be a list')
+        sys.exit(1)
+
+    if not indep and len(cb) == 0:
+        print(
+            f'models[{i}] ({name}) is marked independently_created=false but has no '
+            'created_by entries. Every dependent must have at least one owner — '
+            'either find the creation path, or mark the model independently_created=true.'
+        )
+        sys.exit(1)
+
+    for j, owner_entry in enumerate(cb):
+        if not isinstance(owner_entry, dict):
+            print(f'models[{i}] ({name}).created_by[{j}] must be a mapping')
+            sys.exit(1)
+        for req in ('owner', 'via', 'why'):
+            val = owner_entry.get(req)
+            if not isinstance(val, str) or not val.strip():
+                print(
+                    f'models[{i}] ({name}).created_by[{j}].{req} must be a non-empty string'
+                )
+                sys.exit(1)
+        if owner_entry['owner'] not in names:
+            print(
+                f'models[{i}] ({name}).created_by[{j}].owner={owner_entry["owner"]!r} '
+                f'does not match any model in the audit. Check the owner name or add the owner model.'
+            )
+            sys.exit(1)
+        if owner_entry['owner'] == name:
+            print(f'models[{i}] ({name}).created_by[{j}].owner cannot be the model itself')
+            sys.exit(1)
+
+if factory_count != fm['factory_count']:
+    # Autofix the count instead of blocking. Count-drift is bookkeeping, not a
+    # structural bug — the previous behaviour made the agent oscillate between
+    # stale counts on every edit. Warn loudly but keep the pipeline moving.
+    # The warning goes to stderr so stdout still carries only the final 'OK'.
+    sys.stderr.write(
+        f'[validate-entity-audit] autofixing factory_count: was '
+        f'{fm["factory_count"]}, now {factory_count}\n'
+    )
+    # Rewrite the file in place, preserving the body.
+    fm['factory_count'] = factory_count
+    new_fm = yaml.safe_dump(fm, sort_keys=False).rstrip() + "\n"
+    rewritten = '---\n' + new_fm + '---' + parts[2]
+    Path(filepath).write_text(rewritten)
+
+print('OK')
diff --git a/hooks/validators/validate_factory_fidelity.py b/hooks/validators/validate_factory_fidelity.py
new file mode 100755
index 0000000..c4f03da
--- /dev/null
+++ b/hooks/validators/validate_factory_fidelity.py
@@ -0,0 +1,585 @@
+#!/usr/bin/env python3
+"""Validator: semantic per-model factory fidelity using claude -p.
+
+Rationale — Run 4 post-mortem. Heuristic hooks have been bypassed three runs
+in a row. The agent found factorings that satisfy every regex while still
+producing bare-insert stubs. Only a model that can read the diff between the
+Step 2 snapshot and the current code can tell a faithful extraction apart
+from a stub.
+
+How it works:
+  1. Fetch the factory-fidelity rubric + prompt template from
+     $(cat autonoma/.docs-url)/llms/test-planner/factory-fidelity-rubric.txt
+  2. Load the Step 2 audit snapshot (ground truth) and the current audit.
+  3. For every model with independently_created: true in the snapshot, build a
+     prompt with: Step 2 entry, current entry, factory block, helper (if
+     imported), original creation_function snippet.
+  4. Run `claude -p --output-format json` with the prompt on stdin, in
+     parallel (bounded concurrency). The model defaults to "sonnet" and is
+     overridable via AUTONOMA_FIDELITY_MODEL (empty = CLI default).
+  5. Parse JSON verdicts. If any fail, block the sentinel and return the
+     compiled feedback to the env-factory agent.
+
+Exit 0 = all verdicts pass (or no models to check).
+Exit 2 = one or more verdicts failed; stderr contains the feedback the
+         agent should use to self-correct.
+Exit 0 with a stderr warning = environment not configured to run the check
+         (missing docs URL, claude CLI not found). We do NOT block in that
+         case — the cheap hooks remain the primary gate.
+"""
+
+from __future__ import annotations
+
+import concurrent.futures as futures
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import time
+import urllib.request
+from pathlib import Path
+from typing import Optional
+
+import yaml  # type: ignore
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from _audit_schema import is_independently_created  # noqa: E402
+
+CONCURRENCY = int(os.environ.get("AUTONOMA_FIDELITY_CONCURRENCY", "6"))  # max_workers for the validation thread pool
+PER_MODEL_TIMEOUT = int(os.environ.get("AUTONOMA_FIDELITY_TIMEOUT", "180"))  # seconds allowed per `claude` subprocess
+MAX_MODELS = int(os.environ.get("AUTONOMA_FIDELITY_MAX_MODELS", "60"))  # models beyond this count are dropped with a warning
+SNIPPET_MAX_LINES = 200  # line cap applied to extracted function/file snippets
+DOCS_SLUG = "llms/test-planner/factory-fidelity-rubric.txt"  # appended to the base URL read from autonoma/.docs-url
+
+
+def warn(msg: str) -> None:
+    print(f"[fidelity-validator] {msg}", file=sys.stderr)  # tagged, newline-terminated diagnostic
+
+
+def load_audit(path: Path) -> dict[str, dict]:
+    """Map model name -> audit entry from the file's YAML frontmatter ({} if unusable)."""
+    try:
+        text = path.read_text()
+    except (OSError, UnicodeDecodeError):  # missing, a directory, unreadable, or not text
+        return {}
+    end = text.find("\n---", 3) if text.startswith("---") else -1
+    if end < 0:
+        return {}
+    try:
+        fm = yaml.safe_load(text[3:end])
+    except yaml.YAMLError:
+        return {}
+    models = fm.get("models") if isinstance(fm, dict) else None  # safe_load may yield None, a list, a str
+    out: dict[str, dict] = {}
+    for entry in (models if isinstance(models, list) else []):
+        name = (entry.get("name") or entry.get("model")) if isinstance(entry, dict) else None
+        if name:
+            out[str(name)] = entry
+    return out
+
+
+def fetch_rubric() -> Optional[tuple[str, str]]:
+    """Download the rubric page and return (rubric_text, prompt_template).
+
+    Returns None after warning when the URL file, download, or page layout is unusable.
+    """
+    docs_url_file = Path("autonoma/.docs-url")
+    if not docs_url_file.exists():
+        warn("autonoma/.docs-url missing — skipping semantic validation.")
+        return None
+    base_url = docs_url_file.read_text().strip().rstrip("/")
+    url = f"{base_url}/{DOCS_SLUG}"
+    try:
+        with urllib.request.urlopen(url, timeout=20) as resp:
+            page = resp.read().decode("utf-8")
+    except Exception as e:
+        warn(f"failed to fetch rubric from {url}: {e} — skipping.")
+        return None
+    rubric_md, heading, after_heading = page.partition("## Prompt template")
+    if not heading:
+        warn("rubric page is missing '## Prompt template' section — skipping.")
+        return None
+    # HTML-comment delimiters bound the template because it contains ``` fences of its own.
+    delimited = re.search(
+        r"<!--\s*prompt:begin\s*-->\s*\n(.*?)\n<!--\s*prompt:end\s*-->",
+        after_heading,
+        re.DOTALL,
+    )
+    if delimited is None:
+        warn("rubric page missing <!-- prompt:begin --> / <!-- prompt:end --> markers — skipping.")
+        return None
+    return rubric_md.strip(), delimited.group(1)
+
+
+def resolve_handler_path(sentinel_path: str) -> Optional[Path]:
+    """Return the first existing file named in the sentinel (relative paths use cwd), else None."""
+    body = Path(sentinel_path).read_text()
+    m = re.search(r"handler(?:_path)?:\s*(\S+)", body, re.IGNORECASE)
+    candidates: list[str] = [m.group(1)] if m else []
+    # \b after the extension stops `ts` from matching inside `.tsx` (or `js`
+    # inside `.json`), which used to truncate the path to a non-existent file.
+    candidates += re.findall(r"[\w./\\-]+\.(?:ts|tsx|js|mjs|cjs|py|rb|php|java|go|rs|ex|exs)\b", body)
+    for cand in candidates:
+        p = Path(cand.rstrip(".,;:"))
+        if not p.is_absolute():
+            p = Path.cwd() / p
+        if p.is_file():
+            return p
+    return None
+
+
+def find_factory_block(handler_src: str, model: str) -> str:
+    """Return the source text of `<model>: defineFactory({ ... })`, or "" if absent.
+
+    Starts at the line holding the registration; braces are counted naively.
+    """
+    registration = re.search(
+        rf"\b{re.escape(model)}\s*:\s*defineFactory\s*\(\s*\{{", handler_src
+    )
+    if registration is None:
+        return ""
+    opening = handler_src.find("{", registration.end() - 1)
+    if opening < 0:
+        return ""
+    balance = 0
+    for pos in range(opening, len(handler_src)):
+        ch = handler_src[pos]
+        balance += (ch == "{") - (ch == "}")
+        if ch == "}" and balance == 0:
+            line_start = handler_src.rfind("\n", 0, registration.start()) + 1
+            return handler_src[line_start : pos + 1]
+    return ""
+
+
+def _load_tsconfig_paths(cwd: Path) -> list[tuple[str, list[str]]]:
+    """Best-effort parse of tsconfig.json compilerOptions.paths for alias
+    resolution. Looks in cwd and up to four ancestors so apps/api/ monorepos
+    pick up the root tsconfig. Files that fail to parse are silently skipped."""
+    roots: list[Path] = [cwd, *list(cwd.parents)[:4]]
+    seen: set[Path] = set()
+    out: list[tuple[str, list[str]]] = []
+    for root in roots:
+        for name in ("tsconfig.json", "tsconfig.base.json"):
+            p = root / name
+            if p in seen or not p.is_file():
+                continue
+            seen.add(p)
+            try:
+                raw = p.read_text()
+                # Strip // and /* */ comments but keep string literals intact,
+                # so values such as "$schema": "https://..." survive the pass.
+                raw = re.sub(
+                    r'("(?:\\.|[^"\\])*")|//[^\n]*|/\*.*?\*/', lambda m: m.group(1) or "", raw, flags=re.DOTALL
+                )
+                raw = re.sub(r",\s*([}\]])", r"\1", raw)
+                data = json.loads(raw)
+            except Exception:
+                continue
+            co = (data.get("compilerOptions") if isinstance(data, dict) else None) or {}
+            base_url = co.get("baseUrl") or "."
+            base_dir = (p.parent / base_url).resolve()
+            for prefix, resolutions in (co.get("paths") or {}).items():
+                if not isinstance(resolutions, list):
+                    continue
+                resolved = [str((base_dir / r).resolve()) for r in resolutions if isinstance(r, str)]
+                out.append((prefix, resolved))
+    return out
+
+
+def _resolve_import_path(rel: str, handler_path: Path, alias_map: list[tuple[str, list[str]]]) -> Optional[Path]:
+    """Map an import specifier to an existing file, or None.
+
+    Relative specifiers resolve against the handler's directory, absolute ones
+    are used as-is, anything else is matched against alias prefixes (trailing
+    `*` and `/` ignored). Each base is tried with extensions, then index files.
+    """
+    if rel.startswith("."):
+        bases = [(handler_path.parent / rel).resolve()]
+    elif rel.startswith("/"):
+        bases = [Path(rel)]
+    else:
+        bases = []
+        for alias, targets in alias_map:
+            stem = alias.rstrip("*").rstrip("/")
+            if rel != stem and not rel.startswith(stem + "/"):
+                continue
+            remainder = rel[len(stem):].lstrip("/")
+            bases.extend(Path(root) / remainder if remainder else Path(root) for root in (t.rstrip("*").rstrip("/") for t in targets))
+    for base in bases:
+        probes = [Path(str(base) + ext) for ext in (".ts", ".tsx", ".js", ".mjs", "")]
+        probes += [base / index for index in ("index.ts", "index.tsx", "index.js")]
+        hit = next((probe for probe in probes if probe.is_file()), None)
+        if hit is not None:
+            return hit
+    return None
+
+
+_IDENT_BLOCKLIST = {  # keywords/builtins dropped from the "identifier followed by (" call candidates
+    "if", "for", "while", "switch", "return", "await", "async", "new",
+    "Date", "String", "Number", "Boolean", "Object", "Array", "Error",
+    "Promise", "Map", "Set", "JSON", "Math", "console", "typeof", "function",
+    "require", "import", "catch", "throw", "void", "delete", "instanceof",
+}
+
+
+def find_helpers(handler_src: str, handler_path: Path, factory_block: str) -> list[tuple[Path, str, str]]:
+    """Return every (helper_path, helper_fn_name, helper_source) the factory
+    block invokes via a named import in the handler. Strips string/template
+    literals first so identifiers inside quotes don't produce false calls.
+
+    helper_fn_name is the local (possibly aliased) name used in the handler;
+    the snippet is looked up under the name the helper module exports."""
+    if not factory_block:
+        return []
+    stripped = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"|`[^`]*`", "''", factory_block)
+    candidates = set(re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", stripped)) - _IDENT_BLOCKLIST
+    alias_map = _load_tsconfig_paths(Path.cwd())
+    imports: dict[str, tuple[str, str]] = {}  # local name -> (specifier, exported name)
+    for m in re.finditer(
+        r"import\s+(?:type\s+)?\{([^}]+)\}\s+from\s+['\"]([^'\"]+)['\"]",
+        handler_src,
+    ):
+        for item in m.group(1).split(","):
+            exported, _, local = item.strip().partition(" as ")
+            local = local.strip() or exported.strip()
+            if local:
+                imports[local] = (m.group(2), exported.strip())
+    out: list[tuple[Path, str, str]] = []
+    for name in sorted(candidates):
+        if name not in imports:
+            continue
+        spec, exported = imports[name]
+        resolved = _resolve_import_path(spec, handler_path, alias_map)
+        if not resolved:
+            continue
+        try:
+            text = resolved.read_text()
+        except (OSError, UnicodeDecodeError):
+            continue
+        # No per-path de-duplication: helpers that share a module are each reported.
+        snippet = extract_fn_snippet(text, exported) or text[:4000]
+        out.append((resolved, name, snippet))
+    return out
+
+
+def find_helper(handler_src: str, handler_path: Path, model: str, factory_block: str) -> Optional[tuple[Path, str, str]]:
+    """Backwards-compatible accessor: first entry of find_helpers(), else None (`model` is unused)."""
+    first_helper = next(iter(find_helpers(handler_src, handler_path, factory_block)), None)
+    return first_helper
+
+
+def _unresolved_calls(handler_src: str, factory_block: str, resolved: list[tuple[Path, str, str]]) -> list[str]:
+    """Sorted named imports that the factory block calls but `resolved` does not cover.
+
+    Requiring a matching named import drops member-access calls, since
+    obj.method() is captured by the call regex as plain "method".
+    """
+    if not factory_block:
+        return []
+    code_only = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"|`[^`]*`", "''", factory_block)
+    call_names = re.findall(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", code_only)
+    called = set(call_names) - _IDENT_BLOCKLIST
+    import_lists = re.findall(
+        r"import\s+(?:type\s+)?\{([^}]+)\}\s+from\s+['\"][^'\"]+['\"]",
+        handler_src,
+    )
+    # For `x as y`, the local binding `y` is what shows up at call sites.
+    local_names = {
+        item.split(" as ", 1)[1].strip() if " as " in item else item
+        for item in (raw.strip() for group in import_lists for raw in group.split(","))
+    }
+    already_resolved = {name for _, name, _ in resolved}
+    return sorted((called & local_names) - already_resolved)
+
+
+def extract_fn_snippet(src: str, fn_name: str) -> str:
+    """Return the source of `fn_name`, capped at SNIPPET_MAX_LINES lines, or "".
+
+    Recognises `export [async] function name(`, `export const name =` and a
+    bare `[async] function name(`, in that order. The snippet runs from the
+    start of the declaration's line to the brace closing the first `{` that
+    follows the match; braces are counted without regard to strings.
+    """
+    quoted = re.escape(fn_name)
+    declaration_forms = (
+        rf"export\s+(?:async\s+)?function\s+{quoted}\s*\(",
+        rf"export\s+const\s+{quoted}\s*=",
+        rf"(?:async\s+)?function\s+{quoted}\s*\(",
+    )
+    for form in declaration_forms:
+        found = re.search(form, src)
+        opening = src.find("{", found.end()) if found else -1
+        if opening < 0:
+            continue
+        nesting = 0
+        for pos in range(opening, len(src)):
+            if src[pos] == "{":
+                nesting += 1
+            elif src[pos] == "}":
+                nesting -= 1
+                if nesting == 0:
+                    line_start = src.rfind("\n", 0, found.start()) + 1
+                    block_lines = src[line_start : pos + 1].splitlines()
+                    return "\n".join(block_lines[:SNIPPET_MAX_LINES])
+    return ""
+
+
+def load_original_snippet(snap_entry: dict) -> tuple[str, str]:
+    """Return (creation_file as recorded, snippet or parenthesised placeholder).
+
+    The snippet is the audited creation_function when it can be extracted,
+    otherwise the first SNIPPET_MAX_LINES lines of the file.
+    """
+    recorded_file = (snap_entry.get("creation_file") or "").strip()
+    function_name = (snap_entry.get("creation_function") or "").strip()
+    if not recorded_file:
+        return "", "(Step 2 audit did not record a creation_file)"
+    source_path = Path(recorded_file)
+    source_path = source_path if source_path.is_absolute() else Path.cwd() / recorded_file
+    if not source_path.is_file():
+        return recorded_file, f"(file not found at {source_path})"
+    try:
+        text = source_path.read_text()
+    except OSError as e:
+        return recorded_file, f"(could not read file: {e})"
+    extracted = extract_fn_snippet(text, function_name) if function_name else ""
+    return recorded_file, extracted or "\n".join(text.splitlines()[:SNIPPET_MAX_LINES])
+
+
+def yaml_entry(entry: dict) -> str:  # render one audit entry as a single-item YAML list
+    return yaml.safe_dump([entry], sort_keys=False).rstrip()  # keys keep insertion order; trailing newline removed
+
+
+def fill_template(
+    tpl: str,
+    rubric: str,
+    model: str,
+    snap_entry: dict,
+    cur_entry: Optional[dict],
+    handler_path: Path,
+    factory_block: str,
+    helpers: list[tuple[Path, str, str]],
+    unresolved_calls: list[str],
+    orig_path: str,
+    orig_snippet: str,
+) -> str:
+    """Render the per-model prompt by filling the template's {{PLACEHOLDER}} tokens.
+
+    Substitution is a single pass over `tpl`, so placeholder-looking text
+    inside an inserted value (rubric, source snippets) is left untouched.
+    Unknown {{TOKENS}} are kept verbatim.
+    """
+    if helpers:
+        helper_section = "\n\n".join(
+            f"File: {p}\nFunction: {name}\n\n```\n{body}\n```" for p, name, body in helpers
+        )
+        if unresolved_calls:
+            helper_section += (
+                "\n\n(Additional identifiers called by the factory were not resolvable "
+                f"as imports and may or may not be helpers: {', '.join(unresolved_calls)})"
+            )
+    elif unresolved_calls:
+        helper_section = (
+            "(The factory calls identifiers that were not resolvable as named imports: "
+            f"{', '.join(unresolved_calls)}. Treat this as missing-context, not as evidence "
+            "of a raw-write factory.)"
+        )
+    else:
+        helper_section = "(The factory does not call an external helper.)"
+
+    values = {
+        "RUBRIC": rubric,
+        "MODEL": model,
+        "STEP2_AUDIT_ENTRY": yaml_entry(snap_entry),
+        "CURRENT_AUDIT_ENTRY": yaml_entry(cur_entry) if cur_entry else "(model not present in current audit)",
+        "HANDLER_PATH": str(handler_path),
+        "FACTORY_BLOCK": factory_block or "(factory registration not found)",
+        "HELPER_SECTION": helper_section,
+        "NEEDS_EXTRACTION": "true" if snap_entry.get("needs_extraction") else "false",
+        "EXTRACTED_TO": str(snap_entry.get("extracted_to") or "").strip() or "(not set)",
+        "ORIGINAL_CREATION_FILE": orig_path or "(unknown)",
+        "ORIGINAL_CREATION_SNIPPET": orig_snippet,
+    }
+    return re.sub(r"\{\{([A-Z0-9_]+)\}\}", lambda m: values.get(m.group(1), m.group(0)), tpl)
+
+
+def run_claude(prompt: str) -> dict:
+    """Spawn `claude -p --output-format json` with the prompt on stdin.
+
+    Model is configurable via AUTONOMA_FIDELITY_MODEL (defaults to "sonnet",
+    which is cheap, fast, and reliable for bounded rubric tasks). Set to empty
+    string to inherit whatever model the CLI picks.
+
+    Returns the parsed verdict; launch failures, timeouts and non-zero exits
+    come back as {"verdict": "error", "error": ...} instead of raising.
+    """
+    cmd = ["claude", "-p", "--output-format", "json"]
+    model = os.environ.get("AUTONOMA_FIDELITY_MODEL", "sonnet")
+    if model:
+        cmd.extend(["--model", model])
+    try:
+        proc = subprocess.run(cmd, input=prompt, capture_output=True, text=True, timeout=PER_MODEL_TIMEOUT)
+    except subprocess.TimeoutExpired:
+        return {"verdict": "error", "error": "timeout"}
+    except FileNotFoundError:
+        return {"verdict": "error", "error": "claude CLI not found"}
+    if proc.returncode != 0:
+        return {"verdict": "error", "error": f"claude exit {proc.returncode}: {proc.stderr[:400]}"}
+    out = proc.stdout.strip()
+    # Outer envelope from `claude -p --output-format json` wraps the assistant
+    # response in a JSON object with a "result" field containing the text.
+    try:
+        envelope = json.loads(out)
+    except json.JSONDecodeError:
+        envelope = None
+    # Not an envelope (unparseable, non-object JSON, or already the verdict
+    # object itself): treat raw stdout as the JSON we asked for.
+    if not isinstance(envelope, dict) or ("verdict" in envelope and "result" not in envelope):
+        return parse_verdict(out)
+    inner = envelope.get("result") or envelope.get("text") or envelope.get("output") or ""
+    if isinstance(inner, list):
+        inner = "\n".join(str(x) for x in inner)
+    return parse_verdict(str(inner))
+
+
+def parse_verdict(text: str) -> dict:
+    """Return the JSON verdict object in `text` (whole text, then outermost {...} span), else an error verdict."""
+    text = text.strip()
+    if text.startswith("```"):
+        text = re.sub(r"^```[a-zA-Z]*\n", "", text)
+        text = re.sub(r"\n```\s*$", "", text)
+    embedded = re.search(r"\{.*\}", text, re.DOTALL)
+    for candidate in filter(None, (text, embedded.group(0) if embedded else None)):
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):  # callers index the verdict by key, so lists/scalars don't count
+            return parsed
+    return {"verdict": "error", "error": f"could not parse verdict: {text[:300]}"}
+
+
+def validate_one(task: dict) -> dict:
+    """Run the semantic check for one task and tag the verdict with its model name."""
+    # The task's model name overrides any "model" key present in the verdict.
+    return {**run_claude(task["prompt"]), "model": task["model"]}
+
+
+def main() -> None:
+    """Run the semantic fidelity check: exit 0 on pass/skip/infra error, exit 2 with feedback on stderr on failure."""
+    if os.environ.get("AUTONOMA_SKIP_FIDELITY") == "1":
+        warn("AUTONOMA_SKIP_FIDELITY=1 — skipping.")
+        sys.exit(0)
+
+    if shutil.which("claude") is None:
+        warn("`claude` CLI not on PATH — skipping semantic validation.")
+        sys.exit(0)
+
+    if len(sys.argv) < 2:
+        warn("no sentinel path provided")
+        sys.exit(0)
+    sentinel = sys.argv[1]
+
+    rubric_pair = fetch_rubric()
+    if not rubric_pair:
+        sys.exit(0)
+    rubric, tpl = rubric_pair
+
+    snap = load_audit(Path("autonoma/.entity-audit-step2.md"))
+    cur = load_audit(Path("autonoma/entity-audit.md"))
+    if not snap:
+        warn("Step 2 snapshot missing — skipping.")
+        sys.exit(0)
+
+    handler_path = resolve_handler_path(sentinel)
+    if handler_path is None:
+        warn("handler path not resolvable from sentinel — skipping.")
+        sys.exit(0)
+    handler_src = handler_path.read_text()
+
+    models = [name for name, entry in snap.items() if is_independently_created(entry)]
+    if not models:
+        sys.exit(0)
+    if len(models) > MAX_MODELS:
+        warn(f"truncating from {len(models)} to {MAX_MODELS} models (override via AUTONOMA_FIDELITY_MAX_MODELS).")
+        models = models[:MAX_MODELS]
+
+    tasks = []
+    for model in models:
+        snap_entry = snap[model]
+        cur_entry = cur.get(model) if cur else None  # guard: cur is not checked for a falsy result the way snap is
+        factory_block = find_factory_block(handler_src, model)
+        helpers = find_helpers(handler_src, handler_path, factory_block) if factory_block else []
+        unresolved = _unresolved_calls(handler_src, factory_block, helpers) if factory_block else []
+        orig_path, orig_snippet = load_original_snippet(snap_entry)
+        prompt = fill_template(
+            tpl, rubric, model, snap_entry, cur_entry, handler_path,
+            factory_block, helpers, unresolved, orig_path, orig_snippet,
+        )
+        tasks.append({"model": model, "prompt": prompt})
+
+    t0 = time.time()
+    warn(f"running semantic validation for {len(tasks)} models (concurrency={CONCURRENCY}).")
+
+    # Executor.map yields results in task order, so results[i] corresponds to tasks[i].
+    with futures.ThreadPoolExecutor(max_workers=CONCURRENCY) as ex:
+        results: list[dict] = list(ex.map(validate_one, tasks))
+
+    elapsed = time.time() - t0
+    warn(f"semantic validation complete in {elapsed:.1f}s.")
+
+    failures = [r for r in results if r.get("verdict") == "fail"]
+    errors = [r for r in results if r.get("verdict") not in ("pass", "fail")]  # unknown/missing verdicts count as errors
+    passes = [r for r in results if r.get("verdict") == "pass"]
+
+    warn(f"results: {len(passes)} pass, {len(failures)} fail, {len(errors)} error.")
+
+    if errors and not failures:
+        # Don't block on our own infra errors; log and allow.
+        warn("no hard failures; transient errors will not block the sentinel.")
+        for e in errors[:5]:
+            warn(f"  - {e.get('model','?')}: {str(e.get('error',''))[:200]}")
+        sys.exit(0)
+
+    if not failures:
+        sys.exit(0)
+
+    lines = [
+        f"FACTORY FIDELITY CHECK FAILED — {len(failures)} of {len(results)} models "
+        "do not faithfully reproduce their Step 2 creation behaviour.",
+        "",
+        "This is the semantic check. It reads the Step 2 snapshot (ground truth), "
+        "the current audit, the factory registration, and the original creation "
+        "function, then applies the rubric at:",
+        "  $(cat autonoma/.docs-url)/llms/test-planner/factory-fidelity-rubric.txt",
+        "",
+        "Per-model feedback:",
+        "",
+    ]
+    for r in failures:
+        model = r.get("model", "?")
+        lines.append(f"── {model} ──")
+        for c in r.get("criteria", []) or []:
+            if isinstance(c, dict) and c.get("status") == "fail":  # skip malformed criteria entries
+                lines.append(f"  ✗ Criterion {c.get('id')}: {c.get('reason','')}")
+        fix = r.get("fix_hint", "")
+        if fix:
+            lines.append(f"  → Fix: {fix}")
+        lines.append("")
+    lines.append(
+        "To fix: for each failing model, either (a) call the original "
+        "creation_function from the Step 2 audit (the one in the APPLICATION "
+        "codebase, not the helper the factory wrote), or (b) make the helper a "
+        "thin wrapper that calls that function. Do NOT leave bare ORM inserts "
+        "in the helper. If a side effect truly conflicts with the SDK's "
+        "scenario tree (e.g. sibling rows get created twice), document in a "
+        "comment which sibling factory owns that row and reference it."
+    )
+    sys.stderr.write("\n".join(lines) + "\n")
+    sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hooks/validators/validate_scenarios.py b/hooks/validators/validate_scenarios.py
index 8580715..b080522 100644
--- a/hooks/validators/validate_scenarios.py
+++ b/hooks/validators/validate_scenarios.py
@@ -26,7 +26,7 @@
     sys.exit(1)
 
 # Required fields
-required = ['scenario_count', 'scenarios', 'entity_types', 'discover', 'variable_fields', 'planning_sections']
+required = ['scenario_count', 'scenarios', 'entity_types']
 missing = [f for f in required if f not in fm]
 if missing:
     print(f'Missing required frontmatter fields: {missing}')
@@ -73,37 +73,12 @@
         print(f'entity_types[{i}] must be a mapping with at least a "name" field')
         sys.exit(1)
 
-# Validate discover metadata
-discover = fm.get('discover')
-if not isinstance(discover, dict):
-    print('discover must be a mapping')
+# Validate variable_fields (required, may be empty list)
+if 'variable_fields' not in fm:
+    print('Missing required frontmatter field: variable_fields (use [] if none)')
     sys.exit(1)
 
-for field in ['source', 'model_count', 'edge_count', 'relation_count', 'scope_field']:
-    if field not in discover:
-        print(f'discover missing required field: {field}')
-        sys.exit(1)
-
-if discover.get('source') != 'sdk':
-    print('discover.source must be exactly "sdk"')
-    sys.exit(1)
-
-for field in ['model_count', 'edge_count', 'relation_count']:
-    value = discover.get(field)
-    if not isinstance(value, int) or value < 0:
-        print(f'discover.{field} must be a non-negative integer')
-        sys.exit(1)
-
-scope_field = discover.get('scope_field')
-if not isinstance(scope_field, str) or len(scope_field.strip()) == 0:
-    print('discover.scope_field must be a non-empty string')
-    sys.exit(1)
-
-if discover.get('model_count') == 0:
-    print('discover.model_count must be greater than 0')
-    sys.exit(1)
-
-# Validate variable_fields
+scenario_name_set = {s['name'] for s in scenarios}
 variable_fields = fm.get('variable_fields')
 if not isinstance(variable_fields, list):
     print('variable_fields must be a list')
@@ -129,51 +104,29 @@
             print(f'variable_fields[{i}].{field} must be a non-empty string')
             sys.exit(1)
 
-    if 'generator' in variable:
-        generator = variable.get('generator')
-        if not isinstance(generator, str) or len(generator.strip()) == 0:
-            print(f'variable_fields[{i}].generator must be a non-empty string if present')
-            sys.exit(1)
-
-    scenario_names = variable.get('scenarios')
-    if not isinstance(scenario_names, list) or len(scenario_names) == 0:
+    vscenarios = variable.get('scenarios')
+    if not isinstance(vscenarios, list) or len(vscenarios) == 0:
         print(f'variable_fields[{i}].scenarios must be a non-empty list')
         sys.exit(1)
-    unknown_names = [name for name in scenario_names if name not in found_names]
-    if unknown_names:
-        print(f'variable_fields[{i}].scenarios has unknown scenario names: {unknown_names}')
-        sys.exit(1)
+    for name in vscenarios:
+        if name not in scenario_name_set:
+            print(f'variable_fields[{i}].scenarios references unknown scenario: {name}')
+            sys.exit(1)
 
-# Validate planning_sections metadata
-planning_sections = fm.get('planning_sections')
-if not isinstance(planning_sections, list) or len(planning_sections) == 0:
-    print('planning_sections must be a non-empty list')
+# Validate planning_sections (required; must contain the three core sections)
+if 'planning_sections' not in fm:
+    print('Missing required frontmatter field: planning_sections')
     sys.exit(1)
 
-required_sections = {
-    'sdk_discover',
-    'schema_summary',
-    'relationship_map',
-    'variable_data_strategy',
-}
-optional_sections = {
-    'scoping_analysis',
-}
-allowed_sections = required_sections | optional_sections
-
-unknown_sections = [section for section in planning_sections if not isinstance(section, str) or len(section.strip()) == 0]
-if unknown_sections:
-    print('planning_sections must contain only non-empty strings')
+planning = fm.get('planning_sections')
+if not isinstance(planning, list) or len(planning) == 0:
+    print('planning_sections must be a non-empty list')
     sys.exit(1)
 
-missing_sections = required_sections - set(planning_sections)
+required_sections = {'schema_summary', 'relationship_map', 'variable_data_strategy'}
+missing_sections = required_sections - {s for s in planning if isinstance(s, str)}  # set(planning) would raise TypeError on unhashable YAML entries (mappings/lists)
 if missing_sections:
-    print(f'Missing required planning_sections: {missing_sections}')
+    print(f'planning_sections missing required entries: {sorted(missing_sections)}')
     sys.exit(1)
 
-for section in planning_sections:
-    if section not in allowed_sections:
-        print(f'planning_sections contains unknown value: {section}')
-        sys.exit(1)
-
 print('OK')
diff --git a/hooks/validators/validate_sdk_integration.py b/hooks/validators/validate_sdk_integration.py
deleted file mode 100644
index fde09df..0000000
--- a/hooks/validators/validate_sdk_integration.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python3
-"""Validates autonoma/.sdk-integration.json."""
-import json
-import sys
-from urllib.parse import urlparse
-
-
-filepath = sys.argv[1]
-
-
-def fail(message: str) -> None:
-    print(message)
-    sys.exit(1)
-
-
-try:
-    with open(filepath) as fh:
-        payload = json.load(fh)
-except Exception as exc:
-    fail(f"Invalid JSON: {exc}")
-
-if not isinstance(payload, dict):
-    fail("Root must be a JSON object")
-
-required = [
-    "status",
-    "endpointUrl",
-    "endpointPath",
-    "stack",
-    "packagesInstalled",
-    "sharedSecretPresent",
-    "signingSecretPresent",
-    "devServer",
-    "verification",
-    "branch",
-    "blockingIssues",
-]
-missing = [field for field in required if field not in payload]
-if missing:
-    fail(f"Missing required fields: {missing}")
-
-status = payload.get("status")
-if status not in {"ok", "failed"}:
-    fail('status must be "ok" or "failed"')
-
-endpoint_url = payload.get("endpointUrl")
-if not isinstance(endpoint_url, str) or not endpoint_url.strip():
-    fail("endpointUrl must be a non-empty string")
-parsed = urlparse(endpoint_url)
-if parsed.scheme not in {"http", "https"} or not parsed.netloc:
-    fail("endpointUrl must be an absolute http/https URL")
-
-endpoint_path = payload.get("endpointPath")
-if not isinstance(endpoint_path, str) or not endpoint_path.strip():
-    fail("endpointPath must be a non-empty string")
-
-stack = payload.get("stack")
-if not isinstance(stack, dict):
-    fail("stack must be an object")
-for field in ["language", "framework", "orm", "packageManager"]:
-    if field not in stack:
-        fail(f"stack.{field} is required")
-    if stack[field] is not None and not isinstance(stack[field], str):
-        fail(f"stack.{field} must be a string or null")
-
-packages = payload.get("packagesInstalled")
-if not isinstance(packages, list) or not all(isinstance(item, str) and item.strip() for item in packages):
-    fail("packagesInstalled must be a list of non-empty strings")
-
-for field in ["sharedSecretPresent", "signingSecretPresent"]:
-    if not isinstance(payload.get(field), bool):
-        fail(f"{field} must be a boolean")
-
-dev_server = payload.get("devServer")
-if not isinstance(dev_server, dict):
-    fail("devServer must be an object")
-if not isinstance(dev_server.get("startedByPlugin"), bool):
-    fail("devServer.startedByPlugin must be a boolean")
-pid = dev_server.get("pid")
-if pid is not None and not isinstance(pid, int):
-    fail("devServer.pid must be an integer or null")
-
-verification = payload.get("verification")
-if not isinstance(verification, dict):
-    fail("verification must be an object")
-for key in ["discover", "up", "down"]:
-    section = verification.get(key)
-    if not isinstance(section, dict):
-        fail(f"verification.{key} must be an object")
-    if section.get("status") not in {"ok", "failed"}:
-        fail(f'verification.{key}.status must be "ok" or "failed"')
-
-if not isinstance(verification.get("discover", {}).get("validatedByPlugin"), bool):
-    fail("verification.discover.validatedByPlugin must be a boolean")
-
-branch = payload.get("branch")
-if not isinstance(branch, dict) or not isinstance(branch.get("name"), str) or not branch.get("name", "").strip():
-    fail("branch.name must be a non-empty string")
-
-pr = payload.get("pr")
-if pr is not None:
-    if not isinstance(pr, dict):
-        fail("pr must be an object or null")
-    url = pr.get("url")
-    if url is not None:
-        if not isinstance(url, str) or not url.strip():
-            fail("pr.url must be a non-empty string or null")
-
-blocking = payload.get("blockingIssues")
-if not isinstance(blocking, list) or not all(isinstance(item, str) for item in blocking):
-    fail("blockingIssues must be a list of strings")
-
-print("OK")
diff --git a/skills/generate-tests/SKILL.md b/skills/generate-tests/SKILL.md
index 7f0bbc2..4ccc236 100644
--- a/skills/generate-tests/SKILL.md
+++ b/skills/generate-tests/SKILL.md
@@ -9,644 +9,183 @@ description: >
 
 # Autonoma E2E Test Generation Pipeline
 
-You are orchestrating a 5-step test generation pipeline. Each step runs as an isolated subagent.
+You are orchestrating a 6-step test generation pipeline. Each step runs as an isolated subagent.
 **Every step MUST complete successfully and pass validation before the next step begins.**
 Do NOT skip steps. Do NOT proceed if validation fails.
 
-## User Confirmation Between Steps
+## CRITICAL: User Confirmation Between Steps
 
-By default, after each step (1, 2, 3, and 4), present the summary and automatically proceed to the
-next step once validation passes.
+After steps 1, 2, 3, 4, and 5 you MUST present the summary and ask the user for confirmation
+using `AskUserQuestion`. After calling it, wait for the response. Only proceed after they confirm.
 
-**Canonical auto-advance mode:** If `AUTONOMA_AUTO_ADVANCE=true`, keep moving automatically after
-Steps 1-4.
+## How lifecycle reporting works
 
-**Compatibility alias:** If `AUTONOMA_AUTO_ADVANCE` is unset and `AUTONOMA_REQUIRE_CONFIRMATION=false`,
-that means auto-advance as well.
+You do NOT issue `curl` commands to report step start/complete/uploads. Plugin hooks do that:
 
-If auto-advance is disabled, you MUST present the summary and then ask the user for confirmation
-using the `AskUserQuestion` tool.
-
-After calling `AskUserQuestion`, wait for the user's response.
-Only proceed to the next step after they confirm.
+- `UserPromptSubmit` (`pipeline-kickoff.sh`) creates the setup record on `/generate-tests`.
+- `PostToolUse` (`validate-pipeline-output.sh`) runs after every `Write`. It validates output,
+  emits `step.completed`/`step.started`, uploads artifacts, and enforces the validation gate
+  (test files cannot be written until `autonoma/.endpoint-validated` exists).
 
 ## Before Starting
 
-Create the output directory and save the project root:
-
-```bash
-AUTONOMA_ROOT="$(pwd)"
-echo "$AUTONOMA_ROOT" > /tmp/autonoma-project-root
-mkdir -p autonoma autonoma/skills autonoma/qa-tests
-cleanup_dev_server() {
-  DEV_SERVER_PID=$(cat /tmp/autonoma-dev-server-pid 2>/dev/null || echo '')
-  if [ -n "$DEV_SERVER_PID" ]; then
-    kill "$DEV_SERVER_PID" 2>/dev/null || true
-    rm -f /tmp/autonoma-dev-server-pid
-    echo "Dev server (PID $DEV_SERVER_PID) stopped."
-  fi
-}
-```
-
-The plugin root path is persisted to `/tmp/autonoma-plugin-root` automatically by the PostToolUse hook on the first Write:
-
-```bash
-PLUGIN_ROOT=$(cat /tmp/autonoma-plugin-root 2>/dev/null || echo '')
-```
-
-Read the environment variables required for reporting progress back to Autonoma:
-- `AUTONOMA_API_KEY`
-- `AUTONOMA_PROJECT_ID`
-- `AUTONOMA_API_URL`
-- `AUTONOMA_AUTO_ADVANCE` — optional, canonical
-- `AUTONOMA_REQUIRE_CONFIRMATION` — optional legacy alias
-
-Add shared helpers before running the pipeline:
-
 ```bash
-auto_advance_enabled() {
-  if [ "${AUTONOMA_AUTO_ADVANCE:-}" = "true" ]; then
-    return 0
-  fi
-  if [ -z "${AUTONOMA_AUTO_ADVANCE:-}" ] && [ "${AUTONOMA_REQUIRE_CONFIRMATION:-}" = "false" ]; then
-    return 0
-  fi
-  return 1
-}
-
-refresh_generation_id() {
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-  GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-}
-
-build_event_payload() {
-  python3 - "$1" "$2" "$3" <<'PY'
-import json
-import sys
-
-event_type, key, value = sys.argv[1:4]
-print(json.dumps({"type": event_type, "data": {key: json.loads(value)}}))
-PY
-}
-
-build_step_payload() {
-  python3 - "$1" "$2" "$3" <<'PY'
-import json
-import sys
-
-event_type, step, name = sys.argv[1:4]
-print(json.dumps({"type": event_type, "data": {"step": int(step), "name": name}}))
-PY
-}
-
-post_setup_event_blocking() {
-  refresh_generation_id
-  payload="$1"
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  for attempt in 1 2 3; do
-    if curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d "$payload" >/dev/null; then
-      return 0
-    fi
-    sleep "$attempt"
-  done
-  echo "ERROR: Failed to post blocking setup event after retries: $payload"
-  return 1
-}
-
-post_setup_log() {
-  refresh_generation_id
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  payload=$(build_event_payload "log" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$1")")
-  curl -fsS -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/events" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-    -H "Content-Type: application/json" \
-    -d "$payload" >/dev/null || true
-}
-
-patch_setup_status_blocking() {
-  refresh_generation_id
-  status="$1"
-  message="$2"
-  if [ -z "$GENERATION_ID" ]; then
-    return 0
-  fi
-  payload=$(python3 - "$status" "$message" <<'PY'
-import json
-import sys
-
-body = {"status": sys.argv[1]}
-if sys.argv[2]:
-    body["errorMessage"] = sys.argv[2]
-print(json.dumps(body))
-PY
-)
-  for attempt in 1 2 3; do
-    if curl -fsS -X PATCH "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}" \
-      -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-      -H "Content-Type: application/json" \
-      -d "$payload" >/dev/null; then
-      return 0
-    fi
-    sleep "$attempt"
-  done
-  echo "ERROR: Failed to patch setup status after retries: $status"
-  return 1
-}
-
-report_error_and_exit() {
-  message="$1"
-  preserve_dev_server="${2:-false}"
-  payload=$(build_event_payload "error" "message" "$(python3 -c 'import json,sys; print(json.dumps(sys.argv[1]))' "$message")")
-  post_setup_event_blocking "$payload" || true
-  echo "ERROR: $message"
-  if [ "$preserve_dev_server" != "true" ]; then
-    cleanup_dev_server
-  fi
-  exit 1
-}
-
-report_partial_failure_and_exit() {
-  message="$1"
-  post_setup_log "$message"
-  patch_setup_status_blocking "partial_failure" "$message" || true
-  echo "ERROR: $message"
-  cleanup_dev_server
-  exit 1
-}
-
-rehydrate_sdk_env() {
-  AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-  AUTONOMA_SDK_ENDPOINT=$(tr -d '\n' < "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" 2>/dev/null || echo '')
-  AUTONOMA_SHARED_SECRET=$(grep '^AUTONOMA_SHARED_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
-  AUTONOMA_SIGNING_SECRET=$(grep '^AUTONOMA_SIGNING_SECRET=' "$AUTONOMA_ROOT/.env" 2>/dev/null | tail -n 1 | cut -d= -f2-)
-  export AUTONOMA_SDK_ENDPOINT AUTONOMA_SHARED_SECRET AUTONOMA_SIGNING_SECRET
-  if [ -z "$AUTONOMA_SDK_ENDPOINT" ] || [ -z "$AUTONOMA_SHARED_SECRET" ] || [ -z "$AUTONOMA_SIGNING_SECRET" ]; then
-    return 1
-  fi
-  return 0
-}
+mkdir -p autonoma/skills autonoma/qa-tests
 ```
 
-Prepare the SDK reference repo for Step 1:
+The kickoff hook has already written `autonoma/.docs-url` and `autonoma/.generation-id`.
 
-```bash
-SDK_REF_DIR="${AUTONOMA_SDK_REF_DIR:-}"
-if [ -n "$SDK_REF_DIR" ] && [ -d "$SDK_REF_DIR" ]; then
-  echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
-else
-  SDK_REF_DIR="$(mktemp -d)/autonoma-sdk"
-  if git clone --depth 1 https://github.com/Autonoma-AI/sdk.git "$SDK_REF_DIR"; then
-    echo "$SDK_REF_DIR" > /tmp/autonoma-sdk-ref-dir
-  else
-    echo "ERROR: Unable to prepare the SDK reference repo."
-    cleanup_dev_server
-    exit 1
-  fi
-fi
-```
-
-Before creating the record, derive a clean human-readable application name from the repository. Look at the git remote URL, the directory name, and any `package.json` / `pyproject.toml` / `README.md` to infer what the product is actually called. Prefer the product name over the repo slug.
+## Step 1: Generate Knowledge Base
 
-Create the generation record so the dashboard can track progress in real time:
+Spawn `kb-generator`:
 
-```bash
-RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d "{\"applicationId\":\"${AUTONOMA_PROJECT_ID}\",\"repoName\":\"${APP_NAME}\"}")
-HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
-echo "Setup API response (HTTP $HTTP_STATUS): $BODY"
-GENERATION_ID=$(echo "$BODY" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo '')
-echo "$GENERATION_ID" > autonoma/.generation-id
-echo "Generation ID: $GENERATION_ID"
-```
+> Analyze the codebase and generate the knowledge base. Write `autonoma/AUTONOMA.md` with YAML
+> frontmatter (app_name, app_description, core_flows, feature_count, skill_count), create skill
+> files in `autonoma/skills/`, and write `autonoma/features.json` (features array + totals).
+> Fetch instructions first: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-1-knowledge-base.txt"`.
 
-If `GENERATION_ID` is empty, log the HTTP status and response body above for debugging, then continue anyway.
+After completion: verify files exist, present core_flows table, `AskUserQuestion`, then `Write` `autonoma/.step-1-ack` (single character body).
 
-## Step 1: SDK Integration
+## Step 2: Entity Creation Audit
 
-Report step start:
+Spawn `entity-audit-generator`:
 
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-SDK_REF_DIR=$(cat /tmp/autonoma-sdk-ref-dir 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 start."
-post_setup_log "Detecting stack and integrating the Autonoma SDK..."
-```
-
-Spawn the `sdk-integrator` subagent with the following task:
-
-> Read the SDK reference repo path from `/tmp/autonoma-sdk-ref-dir` and use it as read-only context.
-> Detect the project stack, map it against the supported SDK docs matrix, and stop immediately with
-> a `mailto:support@autonoma.app` link if unsupported.
-> Create a branch, install the SDK from package managers only, implement the SDK endpoint following
-> the matching example or README pattern, ensure `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET`
-> exist in `.env`, update `.env.example`, keep `autonoma/` out of commits, start or reuse a dev server,
-> verify signed `discover`, `up`, and `down`, write `autonoma/.sdk-endpoint` and
-> `autonoma/.sdk-integration.json`, commit with
-> `feat: integrate autonoma sdk`, and create a PR if `gh` is available.
-> Do NOT modify the SDK source repo. Do NOT modify database schemas, migrations, or models.
-
-**After the subagent completes:**
-1. Verify `autonoma/.sdk-endpoint` exists and is non-empty
-2. Verify `autonoma/.sdk-integration.json` exists and is non-empty
-3. Read and export `AUTONOMA_SDK_ENDPOINT` from that file
-4. Read `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` from `.env`
-5. Confirm the endpoint is reachable with a signed `discover` request
-6. Retain `/tmp/autonoma-dev-server-pid` for cleanup after the pipeline finishes
-7. Present the summary to the user — detected stack, packages installed, endpoint URL, PR URL if available
-
-Load the endpoint and secrets:
-
-```bash
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_endpoint.py" "$AUTONOMA_ROOT/autonoma/.sdk-endpoint" \
-  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-endpoint artifact." true
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_sdk_integration.py" "$AUTONOMA_ROOT/autonoma/.sdk-integration.json" \
-  || report_error_and_exit "Step 1 did not produce a valid autonoma/.sdk-integration.json artifact." true
-
-rehydrate_sdk_env || report_error_and_exit "Step 1 did not leave a reusable SDK endpoint and both secrets in project files." true
-
-BODY='{"action":"discover"}'
-SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
-HTTP_STATUS=$(curl -sS -o /tmp/autonoma-sdk-discover-check.json -w "%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
-  -H "Content-Type: application/json" \
-  -H "x-signature: $SIG" \
-  -d "$BODY")
-if [ "$HTTP_STATUS" != "200" ]; then
-  report_error_and_exit "SDK discover check failed after Step 1 (HTTP $HTTP_STATUS)." true
-fi
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" /tmp/autonoma-sdk-discover-check.json \
-  || report_error_and_exit "Step 1 discover response did not match the required schema." true
-```
+> Read the knowledge base. Audit how each database model is created. For every model, find the
+> dedicated creation function in a service/repository/helper. Classify as `independently_created: true`
+> (factory) or `false` (raw SQL fallback). Record side_effects (informational). Output
+> `autonoma/entity-audit.md` with frontmatter listing each model.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-2-entity-audit.txt"`.
 
-Report step complete:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.completed" "0" "SDK Integration")" || report_error_and_exit "Failed to report Step 1 completion." true
-```
-
-7. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this SDK integration summary look correct? The next step will use the endpoint produced here."
-   - options: ["Yes, proceed to Step 2", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 2.
-
-## Step 2: Generate Knowledge Base
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 start."
-post_setup_log "Analyzing codebase structure and identifying features..."
-```
-
-Spawn the `kb-generator` subagent with the following task:
-
-> Analyze the codebase and generate the knowledge base. Write the output to `autonoma/AUTONOMA.md`
-> and create skill files in `autonoma/skills/`. The file MUST have YAML frontmatter with
-> app_name, app_description, core_flows (feature/description/core table), feature_count, and skill_count.
-> You MUST also write `autonoma/features.json` — a machine-readable inventory of every feature discovered.
-> It must have: features array (each with name, type, path, core), total_features, total_routes, total_api_routes.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-1-knowledge-base.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/AUTONOMA.md` and `autonoma/features.json` exist and are non-empty
-2. The PostToolUse hook will have validated the frontmatter and features.json schema automatically
-3. Read the file and present the frontmatter to the user — specifically the core_flows table
-
-Report step complete and upload skills:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-SKILL_COUNT=$(ls "$AUTONOMA_ROOT/autonoma/skills/"*.md 2>/dev/null | wc -l | tr -d ' ')
-post_setup_log "Knowledge base complete. Generated ${SKILL_COUNT} skills. Uploading to dashboard..."
-post_setup_event_blocking "$(build_step_payload "step.completed" "1" "Knowledge Base")" || report_error_and_exit "Failed to report Step 2 completion."
-[ -n "$GENERATION_ID" ] && python3 -c "
-import os, json
-root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
-skills = []
-d = os.path.join(root, 'autonoma/skills')
-if os.path.isdir(d):
-    for f in os.listdir(d):
-        if f.endswith('.md'):
-            with open(os.path.join(d, f)) as fh:
-                skills.append({'name': f, 'content': fh.read()})
-print(json.dumps({'skills': skills}))
-" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d @- || true
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this core flows table look correct? These flows determine how the test budget is distributed."
-   - options: ["Yes, proceed to Step 3", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 3.
+After completion: present the audit, `AskUserQuestion`, `Write` `autonoma/.step-2-ack`.
 
 ## Step 3: Generate Scenarios
 
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 start."
-post_setup_log "Mapping data model and designing test data environments..."
-```
-
-Before spawning the subagent, fetch the SDK discover artifact and save it to `autonoma/discover.json`.
-This step assumes Step 1 already produced:
-- `AUTONOMA_SDK_ENDPOINT`
-- `AUTONOMA_SHARED_SECRET`
-
-Fetch and validate the artifact:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-mkdir -p "$AUTONOMA_ROOT/autonoma"
-rehydrate_sdk_env || report_error_and_exit "Step 3 could not reload the SDK endpoint and secrets from Step 1."
-BODY='{"action":"discover"}'
-SIG=$(echo -n "$BODY" | openssl dgst -sha256 -hmac "$AUTONOMA_SHARED_SECRET" | sed 's/.*= //')
-RESPONSE=$(curl -sS -w "\nHTTP_STATUS:%{http_code}" -X POST "$AUTONOMA_SDK_ENDPOINT" \
-  -H "Content-Type: application/json" \
-  -H "x-signature: $SIG" \
-  -d "$BODY")
-HTTP_STATUS=$(echo "$RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-DISCOVER_BODY=$(echo "$RESPONSE" | sed '/HTTP_STATUS:/d')
-if [ "$HTTP_STATUS" != "200" ]; then
-  report_error_and_exit "SDK discover failed during Step 3 (HTTP $HTTP_STATUS): $DISCOVER_BODY"
-fi
-printf '%s\n' "$DISCOVER_BODY" > "$AUTONOMA_ROOT/autonoma/discover.json"
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_discover.py" "$AUTONOMA_ROOT/autonoma/discover.json" \
-  || report_error_and_exit "Step 3 discover artifact did not pass validation."
-```
-
-Spawn the `scenario-generator` subagent with the following task:
-
-> Read the knowledge base from `autonoma/AUTONOMA.md`, `autonoma/skills/`, and the SDK discover
-> artifact from `autonoma/discover.json`.
-> Generate test data scenarios. Write the output to `autonoma/scenarios.md`.
-> The file MUST have YAML frontmatter with scenario_count, scenarios summary, entity_types,
-> discover metadata, and variable_fields. Prefer fixed, reviewable seed values by default. If a
-> field needs uniqueness, prefer a planner-chosen hardcoded literal plus a discriminator before
-> introducing a variable placeholder. Use variable fields only for truly dynamic values such as
-> backend-generated or time-based fields. `generator` is optional and must not default to `faker`.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-2-scenarios.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/discover.json` and `autonoma/scenarios.md` exist and are non-empty
-2. Validate `autonoma/discover.json` using the plugin's validator
-3. The PostToolUse hook will have validated the frontmatter format automatically
-4. Read the file and present the summary to the user — scenario names, entity counts, entity types, discover schema counts, and the minimal variable field tokens that remain dynamic
-
-Report step complete:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_log "Scenarios generated from SDK discover. Preserved standard/empty/large plus schema metadata, keeping variable fields minimal and intentional."
-post_setup_event_blocking "$(build_step_payload "step.completed" "2" "Scenarios")" || report_error_and_exit "Failed to report Step 3 completion."
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Do these scenarios look correct? Most seed values should stay concrete, and only truly dynamic values should remain variable for later tests."
-   - options: ["Yes, proceed to Step 4", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 4.
-
-## Step 4: Generate E2E Test Cases
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 start."
-post_setup_log "Generating E2E test cases from knowledge base and scenarios..."
-```
-
-Spawn the `test-case-generator` subagent with the following task:
-
-> Read the knowledge base from `autonoma/AUTONOMA.md`, skills from `autonoma/skills/`,
-> and scenarios from `autonoma/scenarios.md`.
-> Generate complete E2E test cases as markdown files in `autonoma/qa-tests/`.
-> You MUST create `autonoma/qa-tests/INDEX.md` with frontmatter containing total_tests,
-> total_folders, folder breakdown, and coverage_correlation.
-> Each test file MUST have frontmatter with title, description, criticality, scenario, and flow.
-> Treat `scenarios.md` as fixture input only. Do not generate tests whose purpose is to verify
-> scenario counts, seeded inventories, or Environment Factory correctness. Only reference
-> scenario data when it is needed to test a real user-facing app behavior.
-> Fetch the latest instructions from https://docs.agent.autonoma.app/llms/test-planner/step-3-e2e-tests.txt first.
-
-**After the subagent completes:**
-1. Verify `autonoma/qa-tests/INDEX.md` exists and is non-empty
-2. Verify at least one non-`INDEX.md` test file exists
-3. Verify actual test count matches `INDEX.md`
-4. Verify folder breakdown matches `INDEX.md`
-5. The PostToolUse hook will have validated the INDEX frontmatter and individual test file frontmatter
-6. Read the INDEX.md and present the summary to the user — total tests, folder breakdown, coverage correlation
-
-Enforce the file-count postconditions:
-
-```bash
-INDEX_PATH="$AUTONOMA_ROOT/autonoma/qa-tests/INDEX.md"
-[ -s "$INDEX_PATH" ] || report_error_and_exit "Step 4 did not produce autonoma/qa-tests/INDEX.md."
-TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
-[ "$TEST_COUNT" -gt 0 ] || report_error_and_exit "Step 4 produced INDEX.md but no actual test files."
-python3 - "$INDEX_PATH" "$TEST_COUNT" "$AUTONOMA_ROOT/autonoma/qa-tests" <<'PY' || report_error_and_exit "Step 4 test inventory did not match INDEX.md."
-import sys
-from pathlib import Path
-import yaml
-
-index_path = Path(sys.argv[1])
-actual_count = int(sys.argv[2])
-qa_dir = Path(sys.argv[3])
-
-content = index_path.read_text()
-parts = content.split('---', 2)
-if len(parts) < 3:
-    raise SystemExit('INDEX.md is missing YAML frontmatter')
-frontmatter = yaml.safe_load(parts[1])
-
-if frontmatter.get('total_tests') != actual_count:
-    raise SystemExit(
-        f'total_tests ({frontmatter.get("total_tests")}) does not match actual test files ({actual_count})'
-    )
-
-actual_folders = {}
-for path in qa_dir.rglob('*.md'):
-    if path.name == 'INDEX.md':
-        continue
-    folder = path.parent.relative_to(qa_dir).as_posix()
-    actual_folders[folder] = actual_folders.get(folder, 0) + 1
-
-declared_folders = {entry['name']: entry['test_count'] for entry in frontmatter.get('folders', [])}
-if actual_folders != declared_folders:
-    raise SystemExit(f'folder breakdown mismatch: declared={declared_folders} actual={actual_folders}')
-print('OK')
-PY
-```
-
-Report step complete and upload test cases:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-TEST_COUNT=$(find "$AUTONOMA_ROOT/autonoma/qa-tests" -name '*.md' ! -name 'INDEX.md' 2>/dev/null | wc -l | tr -d ' ')
-post_setup_log "Generated ${TEST_COUNT} test cases. Uploading to dashboard..."
-post_setup_event_blocking "$(build_step_payload "step.completed" "3" "E2E Tests")" || report_error_and_exit "Failed to report Step 4 completion."
-[ -n "$GENERATION_ID" ] && python3 -c "
-import os, json
-proj_root = open('/tmp/autonoma-project-root').read().strip() if os.path.exists('/tmp/autonoma-project-root') else '.'
-qa_dir = os.path.join(proj_root, 'autonoma/qa-tests')
-test_cases = []
-for root, dirs, files in os.walk(qa_dir):
-    for f in files:
-        if f.endswith('.md') and f != 'INDEX.md':
-            path = os.path.join(root, f)
-            folder = os.path.relpath(root, qa_dir)
-            with open(path) as fh:
-                content = fh.read()
-            entry = {'name': f, 'content': content}
-            if folder != '.':
-                entry['folder'] = folder
-            test_cases.append(entry)
-print(json.dumps({'testCases': test_cases}))
-" | curl -f -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/artifacts" \
-  -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-  -H "Content-Type: application/json" \
-  -d @- || true
-```
-
-4. **If auto-advance is disabled:** Call `AskUserQuestion` with:
-   - question: "Does this test distribution look correct? The total test count should roughly correlate with the number of routes and features in your app."
-   - options: ["Yes, proceed to Step 5", "I want to suggest changes"]
-   Wait for the user's response before proceeding.
-   **Otherwise:** Skip the prompt and proceed directly to Step 5.
-
-## Step 5: Scenario Validation
-
-Report step start:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_event_blocking "$(build_step_payload "step.started" "4" "Scenario Validation")" || report_error_and_exit "Failed to report Step 5 start."
-post_setup_log "Validating planned scenarios against the live SDK endpoint..."
-```
-
-Spawn the `scenario-validator` subagent with the following task:
-
-> Read `autonoma/discover.json` and `autonoma/scenarios.md`.
-> Validate the planned scenarios against the existing live SDK endpoint without editing backend code.
-> Smoke-test the signed `discover -> up -> down` lifecycle, validate `standard`, `empty`, and `large`,
-> write approved recipes to `autonoma/scenario-recipes.json`, write the terminal artifact
-> `autonoma/.scenario-validation.json`, and run:
-> `python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" autonoma/scenario-recipes.json`
-> Do NOT install packages, edit backend code, modify SDK source, modify DB schemas or migrations, or create branches/commits/PRs.
-
-**After the subagent completes:**
-1. Rehydrate SDK env from Step 1 artifacts
-2. Verify `autonoma/.scenario-validation.json` exists and is non-empty
-3. Validate `autonoma/.scenario-validation.json`
-4. Require `status == "ok"` and `preflightPassed == true`
-5. Verify `autonoma/scenario-recipes.json` exists and is non-empty
-6. Run the preflight helper if the subagent did not already do so
-7. If preflight fails, stop and report the failure without attempting code changes
-8. Present the results to the user — endpoint validated, smoke-test results, per-scenario validation results, any remaining deployment issues
-
-Run and enforce preflight:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-rehydrate_sdk_env || report_partial_failure_and_exit "Step 5 could not reload the SDK endpoint and secrets from Step 1."
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/validators/validate_scenario_validation.py" "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" \
-  || report_partial_failure_and_exit "Scenario Validation did not produce a valid autonoma/.scenario-validation.json artifact."
-python3 - "$AUTONOMA_ROOT/autonoma/.scenario-validation.json" <<'PY' || report_partial_failure_and_exit "Scenario Validation finished without a successful terminal state."
-import json
-import sys
-
-payload = json.load(open(sys.argv[1]))
-if payload.get("status") != "ok":
-    raise SystemExit(f'status must be "ok", got {payload.get("status")!r}')
-if payload.get("preflightPassed") is not True:
-    raise SystemExit('preflightPassed must be true before Step 5 can upload recipes')
-print('OK')
-PY
-[ -s "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" ] \
-  || report_partial_failure_and_exit "Scenario Validation did not leave an authoritative autonoma/scenario-recipes.json artifact."
-python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$AUTONOMA_ROOT/autonoma/scenario-recipes.json" \
-  || report_partial_failure_and_exit "Scenario recipe preflight failed. Fix the live integration before retrying Step 5."
-```
-
-Report step complete and upload scenario recipes:
-
-```bash
-AUTONOMA_ROOT=$(cat /tmp/autonoma-project-root 2>/dev/null || echo '.')
-GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id" 2>/dev/null || echo '')
-echo "GENERATION_ID=${GENERATION_ID:-<empty>}"
-post_setup_log "Uploading validated scenario recipes to setup..."
-if [ -n "$GENERATION_ID" ]; then
-  RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
-  if ! python3 -c "import json; json.load(open('$RECIPE_PATH'))" 2>/dev/null; then
-    report_partial_failure_and_exit "scenario-recipes.json is not valid JSON. Step 5 cannot complete."
-  fi
-  UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}" \
-    -H "Content-Type: application/json" \
-    -d @"$RECIPE_PATH")
-  UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
-  echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
-  if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
-    report_partial_failure_and_exit "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete."
-  fi
-
-  VERIFY_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X GET "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenarios" \
-    -H "Authorization: Bearer ${AUTONOMA_API_KEY}")
-  VERIFY_STATUS=$(echo "$VERIFY_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
-  VERIFY_BODY=$(echo "$VERIFY_RESPONSE" | sed '/HTTP_STATUS:/d')
-  if [ "$VERIFY_STATUS" != "200" ]; then
-    report_partial_failure_and_exit "Failed to verify uploaded scenarios (HTTP $VERIFY_STATUS)."
-  fi
-fi
-post_setup_log "Scenario validation completed."
-post_setup_event_blocking "$(build_step_payload "step.completed" "4" "Scenario Validation")" || report_partial_failure_and_exit "Failed to report Step 5 completion."
-cleanup_dev_server
-```
+Spawn `scenario-generator`:
+
+> Read the knowledge base and `autonoma/entity-audit.md`. Generate test data scenarios. Write
+> `autonoma/scenarios.md` with frontmatter (scenario_count, scenarios summary, entity_types,
+> variable_fields, planning_sections). Mark values as variable only when they must vary across
+> runs (globally unique, time-sensitive, backend-generated, or when the app lacks natural
+> per-run isolation). Design entity tables so they serialise as nested trees rooted at the
+> scope entity.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-3-scenarios.txt"`.
+
+After completion: present scenarios, `AskUserQuestion`, `Write` `autonoma/.step-3-ack`.
+
+## Step 4: Implement Environment Factory
+
+Spawn `env-factory-generator`:
+
+> Read `autonoma/entity-audit.md` and `autonoma/scenarios.md`. Install SDK packages and configure
+> the handler. Register a factory for every model with `independently_created: true` (call the audit's
+> `creation_file`/`creation_function` — never reimplement inline). Implement the auth callback
+> using the app's real session/token creation. Run a `discover` smoke test. Run the factory-integrity
+> check. Then `Write` `autonoma/.endpoint-implemented` with a short summary. Do NOT run `up`/`down`
+> — that is step 5.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-4-implement.txt"`
+> and `curl -sSfL "$(cat autonoma/.docs-url)/llms/guides/environment-factory.txt"`.
+> Use `AUTONOMA_SHARED_SECRET` and `AUTONOMA_SIGNING_SECRET` as env var names.
+
+After completion: verify `autonoma/.endpoint-implemented` exists, present implementation summary,
+`AskUserQuestion` ("Ready to validate the full up/down lifecycle?"), `Write` `autonoma/.step-4-ack`.
+
+## Step 5: Validate Scenario Lifecycle
+
+Spawn `scenario-validator`:
+
+> Read `autonoma/entity-audit.md`, `autonoma/scenarios.md`, and the handler created in step 4.
+> Run `discover`/`up`/`down` against every scenario with HMAC-signed curl. Iterate (up to 5
+> times): if a scenario fails because of a handler bug, fix the handler and retry; if it fails
+> because the scenario itself is wrong/unfeasible, edit `scenarios.md` to match reality. On
+> success for every scenario, emit `autonoma/scenario-recipes.json` (nested tree rooted at
+> the scope entity; `variables` block for any `{{token}}` placeholders; one validated recipe
+> per scenario), run `preflight_scenario_recipes.py` against it, and write
+> `autonoma/.scenario-validation.json` as the terminal artifact. Then `Write`
+> `autonoma/.endpoint-validated`. If you hit the iteration cap OR preflight fails, STOP and
+> report — do NOT write the sentinel.
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-5-validate.txt"`.
+> Verify: every audited model appears in `discover.schema.models`, every `independently_created`
+> model has a registered factory, `auth` is non-empty, DB state is correct before and after
+> `down`, and preflight exits 0.
+
+After completion:
+1. If `autonoma/.endpoint-validated` exists AND `autonoma/scenario-recipes.json` is valid JSON
+   AND `autonoma/.scenario-validation.json` has `status: "ok"` with `preflightPassed: true`:
+   enforce the artifact contract, upload the recipes to the dashboard, then ack.
+
+   ```bash
+   AUTONOMA_ROOT="${AUTONOMA_ROOT:-.}"
+   VALIDATION_ARTIFACT="$AUTONOMA_ROOT/autonoma/.scenario-validation.json"
+   RECIPE_PATH="$AUTONOMA_ROOT/autonoma/scenario-recipes.json"
+
+   # Enforce terminal artifact contract: abort before any upload unless status is ok and preflight passed.
+   python3 - "$VALIDATION_ARTIFACT" <<'PY' || { echo "Scenario validation artifact failed the terminal contract" >&2; exit 1; }
+   import json, sys
+   payload = json.load(open(sys.argv[1]))
+   if payload.get("status") != "ok":
+       raise SystemExit("status must be ok before Step 5 can upload recipes")
+   if payload.get("preflightPassed") is not True:
+       raise SystemExit("preflightPassed must be true before Step 5 can upload recipes")
+   PY
+
+   [ -s "$RECIPE_PATH" ] || { echo "scenario-recipes.json missing or empty" >&2; exit 1; }
+   python3 -c "import json, sys; json.load(open(sys.argv[1]))" "$RECIPE_PATH" \
+     || { echo "scenario-recipes.json is not valid JSON" >&2; exit 1; }
+
+   # Re-run preflight at the orchestrator level so a stale or hand-edited recipe file cannot be uploaded.
+   python3 "$(cat /tmp/autonoma-plugin-root)/hooks/preflight_scenario_recipes.py" "$RECIPE_PATH" \
+     || { echo "Preflight failed at orchestrator gate" >&2; exit 1; }
+
+   # Upload to dashboard; the HTTP status is appended to the response so it can be split from the body below.
+   GENERATION_ID=$(cat "$AUTONOMA_ROOT/autonoma/.generation-id")
+   UPLOAD_RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" -X POST \
+     "${AUTONOMA_API_URL}/v1/setup/setups/${GENERATION_ID}/scenario-recipe-versions" \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer ${AUTONOMA_API_TOKEN}" \
+     -d @"$RECIPE_PATH")
+   UPLOAD_STATUS=$(echo "$UPLOAD_RESPONSE" | grep -o "HTTP_STATUS:[0-9]*" | cut -d: -f2)
+   UPLOAD_BODY=$(echo "$UPLOAD_RESPONSE" | sed '/HTTP_STATUS:/d')
+   echo "Scenario recipe upload response (HTTP $UPLOAD_STATUS): $UPLOAD_BODY"
+   if [ "$UPLOAD_STATUS" != "200" ] && [ "$UPLOAD_STATUS" != "201" ]; then
+     echo "Recipe upload failed (HTTP $UPLOAD_STATUS). Step 5 cannot complete." >&2
+     exit 1
+   fi
+   ```
+
+   Then present validation summary (scenarios passed, any edits made to `scenarios.md`,
+   recipes uploaded), `AskUserQuestion`, `Write` `autonoma/.step-5-ack`.
+
+2. If any of those artifacts are missing/invalid: the agent failed — surface the failure
+   report to the user and STOP. Do NOT proceed to step 6. The validation gate in the hook
+   will also block test file writes.
+
+## Step 6: Generate E2E Test Cases
+
+Spawn `test-case-generator`:
+
+> Read `autonoma/AUTONOMA.md`, `autonoma/skills/`, and `autonoma/scenarios.md` (which was
+> reconciled with reality in step 5 — use it as the source of truth). Parse the
+> `variable_fields` frontmatter — test steps MUST use the `{{token}}` placeholders for any
+> variable value (typed, asserted, or navigated to), never the hardcoded literal.
+> Treat scenarios as fixture input, not as the subject under test — do NOT generate meta-tests
+> that "audit" seeded counts or fixture existence.
+> Generate test cases in `autonoma/qa-tests/`. Write `autonoma/qa-tests/INDEX.md` with
+> frontmatter (total_tests, total_folders, folder breakdown, coverage_correlation). Each test
+> file needs frontmatter (title, description, criticality, scenario, flow).
+> Fetch: `curl -sSfL "$(cat autonoma/.docs-url)/llms/test-planner/step-6-e2e-tests.txt"`.
+
+After completion:
+1. Verify `autonoma/qa-tests/INDEX.md` exists
+2. Present INDEX summary
+3. `Write` `autonoma/.pipeline-complete` with a short summary. The hook emits `step.completed`
+   for the final step, marking the setup complete.
 
 ## Completion
 
-After all steps complete, summarize:
-- **Step 1**: detected stack, installed packages, endpoint URL, PR URL if available
-- **Step 2**: knowledge base location and core flow count
-- **Step 3**: scenario count and entity types covered
-- **Step 4**: total test count, folder breakdown, coverage correlation
-- **Step 5**: scenario validation results, smoke-test status, and recipe upload status
-
-If Step 1 already launched a dev server and its postconditions fail, preserve the server for diagnosis and report the PID.
-For terminal failures after later steps begin, clean up the dev server before returning control to the user.
+Summarize each step:
+- **Step 1**: KB location, core flows
+- **Step 2**: entity audit — factories vs raw SQL
+- **Step 3**: scenarios generated
+- **Step 4**: endpoint implemented (handler path, packages, factories registered)
+- **Step 5**: lifecycle validated, scenario-recipes.json emitted, preflight passed, recipes uploaded, scenarios.md edits (if any)
+- **Step 6**: test count, folder breakdown
diff --git a/tests/test_validate_pipeline_output.py b/tests/test_validate_pipeline_output.py
deleted file mode 100644
index b40bc26..0000000
--- a/tests/test_validate_pipeline_output.py
+++ /dev/null
@@ -1,321 +0,0 @@
-"""Tests for hooks/validate-pipeline-output.sh."""
-import json
-import os
-import subprocess
-import tempfile
-import threading
-from contextlib import contextmanager
-from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
-from pathlib import Path
-
-
-ROOT = Path(__file__).resolve().parents[1]
-SCRIPT = ROOT / 'hooks' / 'validate-pipeline-output.sh'
-
-VALID_DISCOVER = {
-    'schema': {
-        'models': [
-            {
-                'name': 'Organization',
-                'fields': [
-                    {
-                        'name': 'name',
-                        'type': 'String',
-                        'isRequired': True,
-                        'isId': False,
-                        'hasDefault': False,
-                    },
-                ],
-            },
-        ],
-        'edges': [],
-        'relations': [],
-        'scopeField': 'organizationId',
-    },
-}
-
-VALID_RECIPES = {
-    'version': 1,
-    'source': {
-        'discoverPath': 'autonoma/discover.json',
-        'scenariosPath': 'autonoma/scenarios.md',
-    },
-    'validationMode': 'sdk-check',
-    'recipes': [
-        {
-            'name': 'standard',
-            'description': 'Standard baseline',
-            'create': {'Organization': [{'name': 'Acme Standard'}]},
-            'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
-        },
-        {
-            'name': 'empty',
-            'description': 'Empty workspace',
-            'create': {'Organization': [{'name': 'Acme Empty'}]},
-            'validation': {'status': 'validated', 'method': 'checkScenario', 'phase': 'ok'},
-        },
-        {
-            'name': 'large',
-            'description': 'Large workspace',
-            'create': {'Organization': [{'name': 'Acme Large'}]},
-            'validation': {'status': 'validated', 'method': 'endpoint-up-down', 'phase': 'ok'},
-        },
-    ],
-}
-
-
-def test_sdk_endpoint_hook_accepts_valid_url():
-    env = os.environ.copy()
-
-    code, out, err = _run_hook(
-        {
-            'autonoma/.sdk-endpoint': 'http://127.0.0.1:3000/api/autonoma\n',
-        },
-        'autonoma/.sdk-endpoint',
-        env,
-    )
-
-    assert code == 0
-    assert out == ''
-    assert err == ''
-
-
-def test_sdk_endpoint_hook_blocks_invalid_url():
-    env = os.environ.copy()
-
-    code, _, err = _run_hook(
-        {
-            'autonoma/.sdk-endpoint': '/api/autonoma\n',
-        },
-        'autonoma/.sdk-endpoint',
-        env,
-    )
-
-    assert code == 2
-    assert 'validate-sdk-endpoint' in err
-    assert 'http or https' in err
-
-
-def test_sdk_integration_hook_accepts_valid_json():
-    env = os.environ.copy()
-
-    code, out, err = _run_hook(
-        {
-            'autonoma/.sdk-integration.json': json.dumps(
-                {
-                    'status': 'ok',
-                    'endpointUrl': 'http://127.0.0.1:3000/api/autonoma',
-                    'endpointPath': '/api/autonoma',
-                    'stack': {
-                        'language': 'TypeScript',
-                        'framework': 'Express',
-                        'orm': 'Prisma',
-                        'packageManager': 'pnpm',
-                    },
-                    'packagesInstalled': ['@autonoma-ai/sdk'],
-                    'sharedSecretPresent': True,
-                    'signingSecretPresent': True,
-                    'devServer': {'startedByPlugin': True, 'pid': 1234},
-                    'verification': {
-                        'discover': {'status': 'ok', 'validatedByPlugin': True},
-                        'up': {'status': 'ok'},
-                        'down': {'status': 'ok'},
-                    },
-                    'branch': {'name': 'autonoma/feat-autonoma-sdk'},
-                    'pr': {'url': 'https://github.com/example/repo/pull/1'},
-                    'blockingIssues': [],
-                }
-            ),
-        },
-        'autonoma/.sdk-integration.json',
-        env,
-    )
-
-    assert code == 0
-    assert out == ''
-    assert err == ''
-
-
-def test_sdk_integration_hook_blocks_invalid_json():
-    env = os.environ.copy()
-
-    code, _, err = _run_hook(
-        {
-            'autonoma/.sdk-integration.json': json.dumps({'status': 'ok'}),
-        },
-        'autonoma/.sdk-integration.json',
-        env,
-    )
-
-    assert code == 2
-    assert 'validate-sdk-integration' in err
-    assert 'Missing required fields' in err
-
-
-def test_scenario_validation_hook_accepts_valid_json():
-    env = os.environ.copy()
-
-    code, out, err = _run_hook(
-        {
-            'autonoma/.scenario-validation.json': json.dumps(
-                {
-                    'status': 'ok',
-                    'preflightPassed': True,
-                    'smokeTestPassed': True,
-                    'validatedScenarios': ['standard', 'empty', 'large'],
-                    'failedScenarios': [],
-                    'blockingIssues': [],
-                    'recipePath': 'autonoma/scenario-recipes.json',
-                    'validationMode': 'sdk-check',
-                    'endpointUrl': 'http://127.0.0.1:3000/api/autonoma',
-                }
-            ),
-        },
-        'autonoma/.scenario-validation.json',
-        env,
-    )
-
-    assert code == 0
-    assert out == ''
-    assert err == ''
-
-
-def test_scenario_validation_hook_blocks_invalid_json():
-    env = os.environ.copy()
-
-    code, _, err = _run_hook(
-        {
-            'autonoma/.scenario-validation.json': json.dumps(
-                {
-                    'status': 'failed',
-                    'preflightPassed': False,
-                }
-            ),
-        },
-        'autonoma/.scenario-validation.json',
-        env,
-    )
-
-    assert code == 2
-    assert 'validate-scenario-validation' in err
-    assert 'Missing required fields' in err
-
-
-def _run_hook(files: dict[str, str], target: str, env: dict[str, str]) -> tuple[int, str, str]:
-    with tempfile.TemporaryDirectory() as tmpdir:
-        for relpath, content in files.items():
-            fullpath = Path(tmpdir) / relpath
-            fullpath.parent.mkdir(parents=True, exist_ok=True)
-            fullpath.write_text(content)
-
-        target_path = str(Path(tmpdir) / target)
-        payload = json.dumps({'tool_input': {'file_path': target_path}})
-        result = subprocess.run(
-            ['bash', str(SCRIPT)],
-            input=payload,
-            text=True,
-            capture_output=True,
-            env=env,
-        )
-        return result.returncode, result.stdout.strip(), result.stderr.strip()
-
-
-@contextmanager
-def _sdk_server(up_status: int = 200, down_status: int = 200):
-    class Handler(BaseHTTPRequestHandler):
-        def do_POST(self):
-            length = int(self.headers.get('Content-Length', '0'))
-            body = json.loads(self.rfile.read(length) or '{}')
-            action = body.get('action')
-
-            if action == 'up':
-                status = up_status
-                response = {'auth': {}, 'refs': {'organization': ['org_1']}, 'refsToken': 'token_1'}
-                if status >= 400:
-                    response = {'error': 'up failed'}
-            elif action == 'down':
-                status = down_status
-                response = {'ok': True}
-                if status >= 400:
-                    response = {'error': 'down failed'}
-            else:
-                status = 400
-                response = {'error': 'unknown action'}
-
-            encoded = json.dumps(response).encode()
-            self.send_response(status)
-            self.send_header('Content-Type', 'application/json')
-            self.send_header('Content-Length', str(len(encoded)))
-            self.end_headers()
-            self.wfile.write(encoded)
-
-        def log_message(self, format, *args):
-            return
-
-    server = ThreadingHTTPServer(('127.0.0.1', 0), Handler)
-    thread = threading.Thread(target=server.serve_forever, daemon=True)
-    thread.start()
-    try:
-        yield f'http://127.0.0.1:{server.server_address[1]}'
-    finally:
-        server.shutdown()
-        thread.join()
-
-
-def test_scenario_recipes_hook_requires_preflight_env():
-    env = os.environ.copy()
-    env.pop('AUTONOMA_SDK_ENDPOINT', None)
-    env.pop('AUTONOMA_SHARED_SECRET', None)
-
-    code, _, err = _run_hook(
-        {
-            'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
-            'autonoma/discover.json': json.dumps(VALID_DISCOVER),
-        },
-        'autonoma/scenario-recipes.json',
-        env,
-    )
-
-    assert code == 2
-    assert 'scenario-recipes-preflight' in err
-    assert 'AUTONOMA_SDK_ENDPOINT is not set' in err
-
-
-def test_scenario_recipes_hook_runs_preflight_successfully():
-    with _sdk_server() as endpoint:
-        env = os.environ.copy()
-        env['AUTONOMA_SDK_ENDPOINT'] = endpoint
-        env['AUTONOMA_SHARED_SECRET'] = 'test-secret'
-
-        code, out, err = _run_hook(
-            {
-                'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
-                'autonoma/discover.json': json.dumps(VALID_DISCOVER),
-            },
-            'autonoma/scenario-recipes.json',
-            env,
-        )
-
-    assert code == 0
-    assert out == ''
-    assert err == ''
-
-
-def test_scenario_recipes_hook_blocks_failed_preflight():
-    with _sdk_server(up_status=500) as endpoint:
-        env = os.environ.copy()
-        env['AUTONOMA_SDK_ENDPOINT'] = endpoint
-        env['AUTONOMA_SHARED_SECRET'] = 'test-secret'
-
-        code, _, err = _run_hook(
-            {
-                'autonoma/scenario-recipes.json': json.dumps(VALID_RECIPES),
-                'autonoma/discover.json': json.dumps(VALID_DISCOVER),
-            },
-            'autonoma/scenario-recipes.json',
-            env,
-        )
-
-    assert code == 2
-    assert 'scenario-recipes-preflight' in err
-    assert 'HTTP 500' in err
diff --git a/tests/test_validate_scenarios.py b/tests/test_validate_scenarios.py
index 40c55c0..100de96 100644
--- a/tests/test_validate_scenarios.py
+++ b/tests/test_validate_scenarios.py
@@ -9,70 +9,27 @@
 scenarios:
   - name: standard
     description: Typical usage
-    entity_types: 2
+    entity_types: [user, task]
     total_entities: 10
   - name: empty
     description: No data
-    entity_types: 0
+    entity_types: [user]
     total_entities: 0
   - name: large
     description: Stress test
-    entity_types: 3
+    entity_types: [user, task, project]
     total_entities: 1000
 entity_types:
   - name: user
   - name: task
-discover:
-  source: sdk
-  model_count: 4
-  edge_count: 3
-  relation_count: 2
-  scope_field: organizationId
-variable_fields:
-  - token: "{{project_title}}"
-    entity: Project.title
-    scenarios:
-      - standard
-      - large
-    reason: title must be unique per test run
-    test_reference: ({{project_title}} variable)
+variable_fields: []
 planning_sections:
-  - sdk_discover
   - schema_summary
   - relationship_map
   - variable_data_strategy
 ---
 
 # Scenarios
-
-## SDK Discover
-
-Models: 4
-
-## Schema Summary
-
-- User
-- Task
-
-## Relationship Map
-
-- User.organizationId -> Organization.id
-
-## Variable Data Strategy
-
-- `{{project_title}}` is generated.
-
-## Scenario: `standard`
-
-Standard details.
-
-## Scenario: `empty`
-
-Empty details.
-
-## Scenario: `large`
-
-Large details.
 """
 
 
@@ -95,23 +52,6 @@ def test_missing_required_fields():
     assert 'Missing required frontmatter fields' in out
 
 
-def test_missing_discover_field():
-    content = VALID.replace(
-        "discover:\n  source: sdk\n  model_count: 4\n  edge_count: 3\n  relation_count: 2\n  scope_field: organizationId\n",
-        "",
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert "discover" in out
-
-
-def test_discover_source_must_be_sdk():
-    content = VALID.replace('source: sdk', 'source: codebase')
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'discover.source must be exactly "sdk"' in out
-
-
 def test_scenario_count_too_low():
     content = VALID.replace('scenario_count: 3', 'scenario_count: 2')
     code, out = run_validator(SCRIPT, content)
@@ -127,6 +67,7 @@ def test_scenario_count_mismatch():
 
 
 def test_missing_required_scenario_name():
+    # Replace 'large' with 'extra' — now 'large' is missing
     content = VALID.replace('name: large', 'name: extra')
     code, out = run_validator(SCRIPT, content)
     assert code == 1
@@ -135,6 +76,7 @@ def test_missing_required_scenario_name():
 
 
 def test_scenario_missing_field():
+    # Remove description from first scenario
     content = VALID.replace(
         '  - name: standard\n    description: Typical usage',
         '  - name: standard',
@@ -162,73 +104,3 @@ def test_entity_type_missing_name():
     code, out = run_validator(SCRIPT, content)
     assert code == 1
     assert 'must be a mapping with at least a "name" field' in out
-
-
-def test_variable_token_must_use_double_curly_braces():
-    content = VALID.replace('token: "{{project_title}}"', 'token: project_title')
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'must use double curly braces' in out
-
-
-def test_variable_generator_is_optional():
-    code, out = run_validator(SCRIPT, VALID)
-    assert code == 0
-    assert out == 'OK'
-
-
-def test_non_faker_generator_is_accepted():
-    content = VALID.replace(
-        '    reason: title must be unique per test run\n',
-        '    generator: derived from testRunId\n    reason: title must be unique per test run\n',
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 0
-    assert out == 'OK'
-
-
-def test_empty_generator_fails_if_present():
-    content = VALID.replace(
-        '    reason: title must be unique per test run\n',
-        '    generator: ""\n    reason: title must be unique per test run\n',
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'generator must be a non-empty string if present' in out
-
-
-def test_variable_scenarios_must_be_known():
-    content = VALID.replace('      - large', '      - invalid')
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'unknown scenario names' in out
-
-
-def test_missing_required_planning_section():
-    content = VALID.replace(
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n',
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'Missing required planning_sections' in out
-
-
-def test_scoping_analysis_optional_section_accepted():
-    content = VALID.replace(
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n  - scoping_analysis\n',
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 0
-    assert out == 'OK'
-
-
-def test_unknown_planning_section_rejected():
-    content = VALID.replace(
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n',
-        'planning_sections:\n  - sdk_discover\n  - schema_summary\n  - relationship_map\n  - variable_data_strategy\n  - made_up_section\n',
-    )
-    code, out = run_validator(SCRIPT, content)
-    assert code == 1
-    assert 'planning_sections contains unknown value: made_up_section' in out
diff --git a/tests/test_validate_sdk_integration.py b/tests/test_validate_sdk_integration.py
deleted file mode 100644
index 73fab81..0000000
--- a/tests/test_validate_sdk_integration.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""Tests for validate_sdk_integration.py."""
-import json
-
-from conftest import run_validator
-
-
-SCRIPT = "validate_sdk_integration.py"
-
-
-def valid_payload(**overrides):
-    payload = {
-        "status": "ok",
-        "endpointUrl": "http://127.0.0.1:3000/api/autonoma",
-        "endpointPath": "/api/autonoma",
-        "stack": {
-            "language": "TypeScript",
-            "framework": "Express",
-            "orm": "Prisma",
-            "packageManager": "pnpm",
-        },
-        "packagesInstalled": ["@autonoma-ai/sdk", "@autonoma-ai/sdk-prisma"],
-        "sharedSecretPresent": True,
-        "signingSecretPresent": True,
-        "devServer": {"startedByPlugin": True, "pid": 1234},
-        "verification": {
-            "discover": {"status": "ok", "validatedByPlugin": True},
-            "up": {"status": "ok"},
-            "down": {"status": "ok"},
-        },
-        "branch": {"name": "autonoma/feat-autonoma-sdk"},
-        "pr": {"url": "https://github.com/example/repo/pull/1"},
-        "blockingIssues": [],
-    }
-    payload.update(overrides)
-    return payload
-
-
-def test_accepts_valid_payload():
-    code, out = run_validator(SCRIPT, json.dumps(valid_payload()), filename=".sdk-integration.json")
-    assert code == 0
-    assert out == "OK"
-
-
-def test_rejects_missing_required_field():
-    payload = valid_payload()
-    payload.pop("verification")
-    code, out = run_validator(SCRIPT, json.dumps(payload), filename=".sdk-integration.json")
-    assert code == 1
-    assert "Missing required fields" in out
-
-
-def test_rejects_invalid_endpoint_url():
-    code, out = run_validator(
-        SCRIPT,
-        json.dumps(valid_payload(endpointUrl="/api/autonoma")),
-        filename=".sdk-integration.json",
-    )
-    assert code == 1
-    assert "absolute http/https URL" in out
-
-
-def test_accepts_failed_status_with_blocking_issues():
-    code, out = run_validator(
-        SCRIPT,
-        json.dumps(
-            valid_payload(
-                status="failed",
-                verification={
-                    "discover": {"status": "failed", "validatedByPlugin": False},
-                    "up": {"status": "failed"},
-                    "down": {"status": "failed"},
-                },
-                blockingIssues=["discover request failed"],
-            )
-        ),
-        filename=".sdk-integration.json",
-    )
-    assert code == 0
-    assert out == "OK"

From 630482030c40bf17324d358cfca4aab4db1ae0b2 Mon Sep 17 00:00:00 2001
From: Ignacio Pardo <ignacio.pardo@autonoma.app>
Date: Thu, 23 Apr 2026 09:51:09 -0300
Subject: [PATCH 32/33] docs: update pipeline documentation

---
 CLAUDE.md |  55 +++++++++++++---------
 README.md | 133 ++++++++++++++++++++++++++++--------------------------
 2 files changed, 104 insertions(+), 84 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 7685d42..7bb5b8b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,37 +1,45 @@
 # Autonoma Test Planner Plugin
 
-Claude Code plugin that generates E2E test suites through a deterministic 5-step pipeline.
+Claude Code plugin that generates E2E test suites through a deterministic multi-step pipeline.
 
 ## Project Structure
 
 ```text
 .claude-plugin/              # Plugin manifest
-commands/generate-tests.md   # Command entry
+commands/generate-tests.md   # Full pipeline command
+commands/generate-adhoc-tests.md
 skills/generate-tests/SKILL.md
+skills/generate-adhoc-tests/SKILL.md
 agents/
-  sdk-integrator.md          # Step 1: SDK integration
-  kb-generator.md            # Step 2: Knowledge base
-  scenario-generator.md      # Step 3: Scenarios
-  test-case-generator.md     # Step 4: E2E tests
-  scenario-validator.md      # Step 5: Scenario validation
+  kb-generator.md              # Step 1: Knowledge base
+  entity-audit-generator.md    # Step 2: Entity creation audit
+  scenario-generator.md        # Step 3: Scenarios
+  env-factory-generator.md     # Step 4: Environment Factory implementation
+  scenario-validator.md        # Step 5: Scenario lifecycle validation
+  test-case-generator.md       # Step 6: E2E tests
+  focused-test-case-generator.md
 hooks/
   hooks.json
+  pipeline-kickoff.sh
+  pretool-heartbeat.sh
+  transcript-streamer.py
   validate-pipeline-output.sh
   preflight_scenario_recipes.py
   validators/
+    evals/
 tests/
 ```
 
 ## Pipeline
 
-1. SDK Integration
-2. Knowledge Base
+1. Knowledge Base
+2. Entity Creation Audit
 3. Scenarios
-4. E2E Tests
-5. Scenario Validation
+4. Implement Environment Factory
+5. Validate Scenario Lifecycle
+6. Generate E2E Tests
 
-The canonical launch mode is `AUTONOMA_AUTO_ADVANCE=true`. If you are still using the older flag,
-`AUTONOMA_REQUIRE_CONFIRMATION=false` is treated as the same auto-advance behavior. Step 5 is final.
+The full pipeline is interactive. After each of steps 1-5, Claude presents that step's summary and waits for user confirmation before continuing. Lifecycle reporting is handled by plugin hooks, not by ad hoc agent curl calls.
 
 ## Validation
 
@@ -39,19 +47,23 @@ Validators are in `hooks/validators/`.
 
 | Validator | File matched | Key checks |
 |-----------|-------------|------------|
-| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | app_name, app_description, core_flows |
-| `validate_discover.py` | `*/autonoma/discover.json` | schema object, models, edges, relations, scopeField |
-| `validate_sdk_endpoint.py` | `*/autonoma/.sdk-endpoint` | absolute http/https URL |
-| `validate_sdk_integration.py` | `*/autonoma/.sdk-integration.json` | Step 1 handoff contract |
+| `validate_kb.py` | `*/autonoma/AUTONOMA.md` | frontmatter and core-flow structure |
 | `validate_features.py` | `*/autonoma/features.json` | feature inventory schema |
-| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario count and metadata |
+| `validate_entity_audit.py` | `*/autonoma/entity-audit.md` | model creation classification and owner links |
+| `validate_scenarios.py` | `*/autonoma/scenarios.md` | scenario count, metadata, required sections |
+| `validate_endpoint_implemented.py` | `*/autonoma/.endpoint-implemented` | handler path and factory integrity |
+| `validate_creation_file_immutable.py` | `*/autonoma/.endpoint-implemented` | accepted audit creation files were not rewritten unsafely |
+| `validate_factory_fidelity.py` | `*/autonoma/.endpoint-implemented` | semantic per-model factory fidelity |
 | `validate_scenario_validation.py` | `*/autonoma/.scenario-validation.json` | Step 5 terminal-state contract |
 | `validate_scenario_recipes.py` | `*/autonoma/scenario-recipes.json` | recipe schema |
 | `validate_test_index.py` | `*/autonoma/qa-tests/INDEX.md` | test totals and folder sums |
+| `validate_directory_structure.py` | `*/autonoma/qa-tests/INDEX.md` | test directory structure |
 | `validate_test_file.py` | `*/autonoma/qa-tests/*/[!I]*.md` | test frontmatter |
 
 Scenario recipes also run live endpoint preflight through `hooks/preflight_scenario_recipes.py`.
 
+Test file writes are blocked until `autonoma/.endpoint-validated` exists.
+
 ## Development
 
 ```bash
@@ -62,6 +74,7 @@ pytest
 
 ## Notes
 
-- Step 1 installs the SDK from package managers only.
-- The SDK reference repo is read-only context.
-- Step 5 validates the live integration and does not edit backend code.
+- Step 4 implements the Environment Factory and may edit target backend code.
+- Step 4 writes `autonoma/.endpoint-implemented` only after discover smoke and factory-integrity checks pass.
+- Step 5 validates signed `discover` / `up` / `down` for every scenario and may fix handler bugs or reconcile `scenarios.md`.
+- Step 6 is gated on `autonoma/.endpoint-validated`.
diff --git a/README.md b/README.md
index 6ef2b28..28fb2b5 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Autonoma Test Planner
 
-A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 5-step pipeline.
+A Claude Code plugin that generates comprehensive E2E test suites for your codebase through a validated 6-step pipeline.
 
-Each step runs in an isolated subagent with deterministic validation. The first step now integrates the Autonoma SDK directly into the target project, and the final step validates scenarios against that live endpoint without editing backend code.
+Each step runs in an isolated subagent with deterministic validation. The pipeline audits how application entities are created, implements an Autonoma Environment Factory against the target app, validates scenario lifecycles through the live endpoint, and only then generates E2E tests.
 
 ## Install
 
@@ -19,55 +19,73 @@ Inside any project with Claude Code:
 /autonoma-test-planner:generate-tests
 ```
 
-The canonical launch mode is `AUTONOMA_AUTO_ADVANCE=true`, which keeps the plugin moving after
-Steps 1-4. If you are still using the older confirmation flag, `AUTONOMA_REQUIRE_CONFIRMATION=false`
-is treated as the same auto-advance behavior.
+The full pipeline is interactive. After each of steps 1-5, Claude presents that step's summary and waits for your confirmation before continuing.
+
+Lifecycle reporting is hook-driven:
+
+- `hooks/pipeline-kickoff.sh` creates the setup record and writes `autonoma/.docs-url` plus `autonoma/.generation-id`.
+- `hooks/validate-pipeline-output.sh` validates artifacts, emits step events, uploads artifacts, and enforces the test-generation gate.
+- `hooks/pretool-heartbeat.sh` keeps dashboard activity reporting alive while tools are running.
 
 ## Pipeline
 
-### Step 1: SDK Integration
+### Step 1: Knowledge Base
 
-Detects the project stack, installs the Autonoma SDK from package managers, wires the endpoint, ensures secrets exist, starts or reuses a local dev server, verifies signed `discover` / `up` / `down`, and writes `autonoma/.sdk-endpoint` plus `autonoma/.sdk-integration.json`.
+Analyzes the app and produces `autonoma/AUTONOMA.md`, `autonoma/skills/*.md`, and `autonoma/features.json`.
 
-It may also create a branch, commit the integration, and open a PR when `gh` is available.
+**You review**: the core flows table.
 
-**You review**: detected stack, installed packages, endpoint URL, generated env vars, and PR status.
+### Step 2: Entity Creation Audit
 
-### Step 2: Knowledge Base
+Audits every database model and records how each model comes into existence in `autonoma/entity-audit.md`.
 
-Analyzes the app and produces `autonoma/AUTONOMA.md` and `autonoma/features.json`.
+Models marked `independently_created: true` become Environment Factory factories that call the app's real creation functions. Dependent-only models use the SDK's raw SQL fallback and are torn down through their owner model.
 
-**You review**: the core flows table.
+**You review**: factory-backed models, dependent-only models, and any dual-creation models.
 
 ### Step 3: Scenarios
 
-Fetches `discover` from the Step 1 endpoint and produces `autonoma/discover.json` plus `autonoma/scenarios.md`.
+Reads the knowledge base and `autonoma/entity-audit.md`, then produces `autonoma/scenarios.md`.
 
-**You review**: entity names, counts, relationships, and which values should stay concrete versus variable.
+Scenarios include `standard`, `empty`, and `large`, track variable fields that must vary across runs, and use nested create trees rooted at the scope entity.
 
-### Step 4: E2E Tests
+**You review**: entity names, counts, relationships, variable fields, and via-owner versus standalone creation choices.
 
-Generates markdown test files in `autonoma/qa-tests/` plus `INDEX.md`.
+### Step 4: Implement Environment Factory
 
-**You review**: test distribution and coverage correlation.
+Installs and configures the Autonoma SDK endpoint, then registers a factory for every `independently_created: true` model from `entity-audit.md`.
+
+This step runs a signed `discover` smoke test and factory-integrity checks, then writes `autonoma/.endpoint-implemented`. It does **not** run full `up` / `down`; lifecycle validation happens in Step 5.
 
-### Step 5: Scenario Validation
+**You review**: handler path, installed packages, factories registered, and required secrets.
 
-Validates `standard`, `empty`, and `large` against the live SDK endpoint, writes `autonoma/scenario-recipes.json` plus `autonoma/.scenario-validation.json`, runs endpoint preflight, and uploads the approved recipes to the setup API only after all checks pass.
+### Step 5: Validate Scenario Lifecycle
 
-This step does **not** implement backend code. It only validates the existing integration.
+Runs signed `discover` / `up` / `down` against every scenario. The validator may fix handler bugs or reconcile `autonoma/scenarios.md` with real endpoint behavior.
+
+On success, it writes `autonoma/scenario-recipes.json`, `autonoma/.scenario-validation.json`, and `autonoma/.endpoint-validated`. The `.endpoint-validated` sentinel gates Step 6; test files cannot be written before it exists.
+
+**You review**: scenarios passed, scenario edits, preflight result, and recipe upload status.
+
+### Step 6: Generate E2E Tests
+
+Generates markdown test files in `autonoma/qa-tests/` plus `autonoma/qa-tests/INDEX.md`.
+
+**You review**: test distribution and coverage correlation.
 
 ## Key Outputs
 
-- `autonoma/.sdk-endpoint`: validated SDK endpoint URL
-- `autonoma/.sdk-integration.json`: Step 1 machine-readable handoff
 - `autonoma/AUTONOMA.md`
+- `autonoma/skills/*.md`
 - `autonoma/features.json`
-- `autonoma/discover.json`
+- `autonoma/entity-audit.md`
 - `autonoma/scenarios.md`
-- `autonoma/qa-tests/INDEX.md`
-- `autonoma/.scenario-validation.json`: Step 5 terminal-state artifact
+- `autonoma/.factory-plan.md`
+- `autonoma/.endpoint-implemented`
 - `autonoma/scenario-recipes.json`
+- `autonoma/.scenario-validation.json`
+- `autonoma/.endpoint-validated`
+- `autonoma/qa-tests/INDEX.md`
 
 ## Ad Hoc Test Generation
 
@@ -89,7 +107,7 @@ Or invoke without arguments and the command will suggest focus areas based on yo
 
 ### How it works
 
-**Subsequent runs** (scenarios already configured in Autonoma): fetches scenarios and existing tests from the Autonoma, then runs only focused test generation (Step 3). Steps 1, 2, and 4 are skipped.
+**Subsequent runs** (active scenarios and recipes already exist in Autonoma): fetches existing scenario, skill, and test context from Autonoma, then runs only focused test generation for the requested topic.
 
 Tests are written to `autonoma/qa-tests/{focus-slug}/` so they sit alongside your existing test suite without overwriting it.
 
@@ -108,50 +126,40 @@ autonoma/qa-tests/
 Provide these before running the plugin:
 
 ```bash
+AUTONOMA_DOCS_URL=<docs base url>
 AUTONOMA_API_KEY=<api key>
 AUTONOMA_PROJECT_ID=<application id>
 AUTONOMA_API_URL=<setup api base url>
 ```
 
-Canonical:
-
-```bash
-AUTONOMA_AUTO_ADVANCE=true
-```
+`AUTONOMA_DOCS_URL` is required so subagents can fetch the latest Autonoma instructions. `AUTONOMA_API_KEY`, `AUTONOMA_PROJECT_ID`, and `AUTONOMA_API_URL` are required for dashboard setup records, lifecycle events, artifact uploads, and recipe uploads.
 
-Compatibility alias:
-
-```bash
-AUTONOMA_REQUIRE_CONFIRMATION=false
-```
-
-You no longer need to pre-provide `AUTONOMA_SDK_ENDPOINT` or `AUTONOMA_SHARED_SECRET`. Step 1 creates or discovers them in the target project.
-
-The integration step updates `.env` and `.env.example` in the target repo with:
+The Environment Factory step generates or discovers these target-app values and updates `.env` and `.env.example` when applicable:
 
 ```bash
 AUTONOMA_SHARED_SECRET=<shared hmac secret>
 AUTONOMA_SIGNING_SECRET=<private signing secret>
 ```
 
-Those changes still need to be deployed after PR creation or merge.
+`AUTONOMA_SDK_ENDPOINT` is needed by scenario validation and recipe preflight once the endpoint exists. Generated environment changes still need to be deployed with the target app.
 
 ## Validation
 
 Every pipeline output is validated by shell-dispatched Python validators.
 
-| File | Validation |
-| --- | --- |
-| `AUTONOMA.md` | frontmatter and core-flow structure |
-| `features.json` | feature inventory schema |
-| `discover.json` | SDK discover schema |
-| `.sdk-endpoint` | absolute `http` or `https` URL |
-| `.sdk-integration.json` | Step 1 handoff contract |
-| `scenarios.md` | scenario schema and required sections |
-| `.scenario-validation.json` | Step 5 terminal-state contract |
-| `scenario-recipes.json` | recipe schema plus live endpoint preflight |
-| `INDEX.md` | test totals and folder breakdown |
-| test files | required frontmatter |
+| File | Validator | Validation |
+| --- | --- | --- |
+| `AUTONOMA.md` | `validate_kb.py` | frontmatter and core-flow structure |
+| `features.json` | `validate_features.py` | feature inventory schema |
+| `entity-audit.md` | `validate_entity_audit.py` | model creation classification, factory counts, and owner links |
+| `scenarios.md` | `validate_scenarios.py` | scenario schema and required sections |
+| `.endpoint-implemented` | `validate_endpoint_implemented.py`, `validate_creation_file_immutable.py`, `validate_factory_fidelity.py` | handler path, factory integrity, immutable audit snapshot, and semantic factory fidelity |
+| `.scenario-validation.json` | `validate_scenario_validation.py` | Step 5 terminal-state contract |
+| `scenario-recipes.json` | `validate_scenario_recipes.py` | recipe schema plus live endpoint preflight |
+| `INDEX.md` | `validate_test_index.py`, `validate_directory_structure.py` | test totals, folder breakdown, and directory structure |
+| test files | `validate_test_file.py` | required frontmatter |
+
+Test files are blocked until `autonoma/.endpoint-validated` exists.
 
 ## Local Development
 
@@ -167,26 +175,25 @@ pytest
 autonoma-test-planner/
 ├── .claude-plugin/
 ├── commands/generate-tests.md
+├── commands/generate-adhoc-tests.md
 ├── skills/generate-tests/SKILL.md
+├── skills/generate-adhoc-tests/SKILL.md
 ├── agents/
-│   ├── sdk-integrator.md
 │   ├── kb-generator.md
+│   ├── entity-audit-generator.md
 │   ├── scenario-generator.md
+│   ├── env-factory-generator.md
 │   ├── test-case-generator.md
+│   ├── focused-test-case-generator.md
 │   └── scenario-validator.md
 ├── hooks/
+│   ├── pipeline-kickoff.sh
+│   ├── pretool-heartbeat.sh
+│   ├── transcript-streamer.py
 │   ├── validate-pipeline-output.sh
 │   ├── preflight_scenario_recipes.py
 │   └── validators/
-├── adhoc/
-│   ├── .claude-plugin/
-│   ├── skills/generate-adhoc-tests/SKILL.md
-│   ├── commands/generate-adhoc-tests.md
-│   ├── agents/focused-test-case-generator.md
-│   └── hooks/
-│       ├── hooks.json
-│       ├── validate-pipeline-output.sh
-│       └── validators/
+│       └── evals/
 └── tests/
 ```
 

From 3f875aab4b7f1501566c968675153b4d577790bf Mon Sep 17 00:00:00 2001
From: Tom Piaggio <tomas.piaggio12@gmail.com>
Date: Thu, 23 Apr 2026 09:23:07 -0700
Subject: [PATCH 33/33] chore: sync release-please manifest to 1.14.0 (#34)

---
 .release-please-manifest.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 4c313f9..2ef9a1c 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "1.4.0"
+  ".": "1.14.0"
 }