diff --git a/manifest.json b/manifest.json index bc7836a..8a1f3fc 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,12 @@ { "version": "2", - "updated_at": "2026-05-11T13:22:07Z", + "updated_at": "2026-05-18T12:22:38Z", "skills": { "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment (evaluates analytics vs synced tables data access)", "experimental": false, - "updated_at": "2026-05-11T13:22:01Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -33,7 +33,7 @@ "version": "0.1.0", "description": "Core Databricks skill for CLI, auth, and data exploration", "experimental": false, - "updated_at": "2026-05-11T10:22:59Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -48,7 +48,7 @@ "version": "0.0.0", "description": "Declarative Automation Bundles (DABs) for deploying and managing Databricks resources", "experimental": false, - "updated_at": "2026-05-05T15:31:42Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -66,7 +66,7 @@ "version": "0.1.0", "description": "Databricks Jobs orchestration and scheduling", "experimental": false, - "updated_at": "2026-05-07T15:19:50Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -78,7 +78,7 @@ "version": "0.1.0", "description": "Databricks Lakebase Postgres: projects, scaling, connectivity, synced tables, and Data API", "experimental": false, - "updated_at": "2026-05-11T10:23:05Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -93,7 +93,7 @@ "version": "0.1.0", "description": "Databricks Model Serving endpoint management", "experimental": false, - "updated_at": "2026-05-07T15:19:45Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -105,7 +105,7 @@ "version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", 
"experimental": false, - "updated_at": "2026-05-07T15:19:55Z", + "updated_at": "2026-05-18T12:11:44Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -152,7 +152,7 @@ "version": "0.1.0", "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "experimental": false, - "updated_at": "2026-05-07T15:19:59Z", + "updated_at": "2026-05-18T12:22:14Z", "files": [ "SKILL.md", "agents/openai.yaml", @@ -164,6 +164,22 @@ "references/networking-and-security.md", "references/streaming-migration.md" ] + }, + "databricks-serverless-storage-check": { + "version": "0.1.0", + "description": "Detect cross-task file-sharing antipatterns in serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks) and recommend UC Volumes or /Workspace handoff", + "experimental": false, + "updated_at": "2026-05-18T12:21:52Z", + "files": [ + "SKILL.md", + "agents/openai.yaml", + "assets/databricks.png", + "assets/databricks.svg", + "references/pattern-catalog.md", + "references/remediation-guide.md", + "scripts/preflight.py", + "scripts/test_preflight.py" + ] } } } diff --git a/scripts/skills.py b/scripts/skills.py index cdfdcf7..b1865b1 100644 --- a/scripts/skills.py +++ b/scripts/skills.py @@ -48,6 +48,10 @@ "description": "Migrate Databricks workloads from classic compute to serverless compute, including compatibility checks and concrete fixes", "experimental": False, }, + "databricks-serverless-storage-check": { + "description": "Detect cross-task file-sharing antipatterns in serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks) and recommend UC Volumes or /Workspace handoff", + "experimental": False, + }, } diff --git a/skills/databricks-serverless-migration/SKILL.md b/skills/databricks-serverless-migration/SKILL.md index 859d01a..97f69e4 100644 --- a/skills/databricks-serverless-migration/SKILL.md +++ 
b/skills/databricks-serverless-migration/SKILL.md @@ -118,7 +118,7 @@ Scan the code for patterns that are incompatible with the serverless compute arc | Pattern | Severity | Fix | |---------|----------|-----| | `dbfs:/` or `/dbfs/` paths (persistent data) | Blocker | Replace with `/Volumes/<catalog>/schema/volume/path` | -| `dbfs:/tmp/`, `/dbfs/tmp/`, paths with `cache`/`scratch`/`temp` | Warning | Use `/tmp/` or `/local_disk0/tmp/` (local driver disk) — do not use Volumes for temp files due to performance | +| `dbfs:/tmp/`, `/dbfs/tmp/`, paths with `cache`/`scratch`/`temp` | Warning | Use `/tmp/` or `/local_disk0/tmp/` (local driver disk) — do not use Volumes for temp files due to performance. **Per-task scratch only**: if another task (child notebook, sibling job task, or pipeline) needs to read the file, use UC Volumes or `/Workspace` — see [`databricks-serverless-storage-check`](../databricks-serverless-storage-check/SKILL.md). | | `file:///dbfs/` FUSE mount paths | Warning | Replace persistent paths with `/Volumes/...`; replace temp paths with `/local_disk0/tmp/` | | `dbutils.fs.mount(...)` | Blocker | Create UC external location + external volume | | `hive_metastore.db.table` | Warning | Migrate to UC or use HMS Federation: `CREATE FOREIGN CATALOG ... USING CONNECTION hms_connection` | diff --git a/skills/databricks-serverless-storage-check/SKILL.md b/skills/databricks-serverless-storage-check/SKILL.md new file mode 100644 index 0000000..9ecef20 --- /dev/null +++ b/skills/databricks-serverless-storage-check/SKILL.md @@ -0,0 +1,149 @@ +--- +name: databricks-serverless-storage-check +description: "Detect cross-task file-sharing antipatterns in Databricks serverless jobs (writes to /local_disk0, /tmp, or trustedTemp that are read by sibling or child tasks on potentially different compute nodes) and recommend UC Volumes or /Workspace for handoff. 
Use when a serverless job fails with `INTERNAL_ERROR: [Errno 13] Permission denied` on /local_disk0 paths, when parallel child notebooks fail intermittently, when reviewing a DAB job before deploying to serverless, or when the user mentions trustedTemp, fan-out, or cross-task file handoff. Complements databricks-serverless-migration (which covers single-notebook migration)." +compatibility: Requires databricks CLI (>= v0.292.0) for --job-id and --run-id modes; --notebook / --dir / --job-yaml modes have no external dependencies. +metadata: + version: "0.1.0" +parent: databricks-core +--- + +# Serverless Storage Check + +**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection. + +This skill detects a specific class of serverless failure: **cross-task file handoffs through local disk**. On serverless compute, each task may run on a different node, so a path written by a parent task to `/local_disk0`, `/tmp`, or a `trustedTemp` directory is not guaranteed to be visible to a child task. The typical symptom is: + +``` +INTERNAL_ERROR: [Errno 13] Permission denied: +'/local_disk0/spark-<uuid>/trustedTemp-<uuid>/tmp' +``` + +The fix is to move the handoff off local disk and onto durable, cross-node storage — UC Volumes (preferred) or `/Workspace` (fallback) — or replace the file handoff entirely with `dbutils.jobs.taskValues` for small payloads. + +This skill ships an executable preflight scanner (`scripts/preflight.py`) that statically detects these antipatterns and emits remediation guidance. It is intentionally narrow: it does **not** try to fix `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT`, which is a separate, platform-side intermittent issue (see "What this skill does NOT cover" below). 
+ +## When to use this skill + +Use this skill when any of these triggers appear: + +- A serverless job fails with `INTERNAL_ERROR: [Errno 13] Permission denied` on `/local_disk0`, `/tmp`, or a path containing `trustedTemp` +- Parallel child notebooks (`dbutils.notebook.run`) fail intermittently while the same logic succeeds when run sequentially in a single notebook +- A DAB job is about to be deployed to serverless and has multiple `notebook_task` or `pipeline_task` tasks +- The user mentions "trustedTemp", "fan-out", "cross-task file sharing", or `/local_disk0` +- A new serverless job design needs a sanity check before first run + +This skill is **complementary to**, not a replacement for, [`databricks-serverless-migration`](../databricks-serverless-migration/SKILL.md). That skill handles single-notebook migration and explicitly recommends `/local_disk0/tmp` for per-task scratch — which is correct *inside* a task. The boundary between the two skills: + +| Concern | Use skill | +|---------|-----------| +| Migrating one notebook from classic DBR to serverless | `databricks-serverless-migration` | +| Per-task scratch storage (intra-task) | `databricks-serverless-migration` (recommends `/local_disk0/tmp`) | +| **Cross-task file handoff between parent/child notebooks or sibling tasks** | **this skill** | +| Permission-denied on `/local_disk0` during a multi-task run | **this skill** | + +## Quick start + +Run the preflight scanner against any of: a single notebook, a directory, a DAB job YAML, a remote job, or a failed run. 
+ +```bash +# Single notebook +python3 scripts/preflight.py --notebook path/to/notebook.ipynb + +# Recursive scan of a directory +python3 scripts/preflight.py --dir path/to/repo/ + +# A DAB job YAML (auto-resolves referenced notebooks) +python3 scripts/preflight.py --job-yaml resources/my_job.job.yml + +# A remote job (pulls notebook source via databricks workspace export) +python3 scripts/preflight.py --job-id 123456789 --profile DEFAULT + +# A failed run (classifies the error trace as fan-out vs env-sync) +python3 scripts/preflight.py --run-id 987654321 --profile DEFAULT + +# Machine-readable output for CI gating +python3 scripts/preflight.py --dir . --json +``` + +## Interpreting the output + +The scanner prints findings grouped by severity. Each finding includes the pattern ID, file, line, code snippet, and a recommended fix snippet. + +| Severity | Meaning | Exit code | +|----------|---------|-----------| +| **Blocker** | Will fail on serverless. Must fix before deploy. | `2` | +| **Warning** | Likely to fail under parallel execution. Should fix. | `1` | +| **Info** | Awareness-only or escalation routing (e.g. env-sync error). | `0` | + +Clean scans exit `0`. Use `--json` for CI: pipe to `jq` or fail builds when blockers are found. + +## The core rule + +The boundary between safe and unsafe local-disk use on serverless: + +> **Local disk (`/local_disk0`, `/tmp`, `trustedTemp`) is per-task only.** Anything one task writes that another task reads MUST live on `/Volumes` or `/Workspace`. + +This is verbatim from the BSI thread guidance: when the parent task writes to local disk and the child task tries to read it, the child may be on a different node and the file won't exist (or will hit `Permission denied`). See [`references/remediation-guide.md`](references/remediation-guide.md) for concrete before/after patterns. 
+ +## Pattern catalog (summary) + +| ID | Severity | What it detects | +|----|----------|-----------------| +| `FANOUT001` | Blocker | Local-disk path written then passed to `dbutils.notebook.run`, `taskValues.set`, or job-task parameter | +| `FANOUT002` | Blocker | Child notebook reads from `/local_disk0` or `/tmp` via widget, parameter, or `taskValues.get` | +| `FANOUT003` | Warning | DAB job with multiple sibling tasks referencing the same local-disk path | +| `FANOUT004` | Warning | `pipeline_task` immediately downstream of a `notebook_task` that wrote to local temp | +| `FANOUT005` | Info | `dbutils.fs.cp` from local path to local path inside a multi-task job (heuristic) | +| `FANOUT006` | Blocker | Hardcoded path matching the BSI signature `/local_disk0/spark-*/trustedTemp/...` | +| `ENV001` | Info | Run output contains `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` — route to escalation | + +Full rules, sample matches, and per-pattern fixes are in [`references/pattern-catalog.md`](references/pattern-catalog.md). + +## Remediation summary + +When the scanner flags a finding, prefer fixes in this order: + +1. **UC Volumes** (preferred): `/Volumes/<catalog>/<schema>/<volume>/handoff/<run_id>/...` + - Durable, cross-node, UC-governed, works for any file size + - Requires `WRITE VOLUME` on the volume and a parent that creates the volume per run or per job + +2. **`/Workspace`** (fallback): `/Workspace/Shared/<project>/handoff/...` + - Durable and cross-node, no UC dependency + - Best for smaller files; subject to workspace storage limits + +3. **`dbutils.jobs.taskValues`** (small payloads only): no file at all + - For scalars and small JSON (each value's JSON representation must stay under 48 KiB) + - Replaces the file entirely — preferred when the handoff is just a parameter, config, or summary + +4. **Keep `/local_disk0/tmp`** for **intra-task scratch only**. Never for cross-task. + +Full before/after code is in [`references/remediation-guide.md`](references/remediation-guide.md). 
+ +## What this skill does NOT cover + +The original BSI thread combined two distinct failures. This skill addresses only the storage one. The other failure, `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` / "Virtual environment changed while syncing", is a rare, platform-side issue that the Databricks team treats as an engineering escalation. The scanner detects it in `--run-id` mode and emits an `ENV001` info finding routing the user to support, but does not attempt to fix it. + +If the scanner emits `ENV001`: + +1. Open a Databricks engineering support ticket (use the `/jira-actions` skill or `/support-escalation` if available) with the run ID and error trace +2. As a temporary mitigation, reduce dependency setup during child notebook startup (move heavy `%pip install` to the parent or a job-level environment spec) +3. Add retries on the affected task — the error is usually transient + +## Related skills + +- [`databricks-serverless-migration`](../databricks-serverless-migration/SKILL.md) — single-notebook classic-to-serverless migration. **Use that skill first** if the workload hasn't been migrated yet. +- [`databricks-dabs`](../databricks-dabs/SKILL.md) — DAB structure and resource definitions. Use when authoring or fixing the `job.yml` flagged by `FANOUT003` or `FANOUT004`. +- [`databricks-jobs`](../databricks-jobs/SKILL.md) — Lakeflow Jobs orchestration. Use when restructuring task dependencies to avoid the fan-out antipattern. +- [`databricks-core`](../databricks-core/SKILL.md) — parent skill for CLI auth and profile selection. 
+ +## Reference docs + +- [Pattern catalog](references/pattern-catalog.md) — all detection rules with examples +- [Remediation guide](references/remediation-guide.md) — before/after code for Volumes, Workspace, and taskValues handoffs + +## External documentation + +- [Serverless compute limitations](https://docs.databricks.com/en/compute/serverless/limitations) — official local-disk scoping rules +- [Unity Catalog volumes](https://docs.databricks.com/en/connect/unity-catalog/volumes.html) — the preferred handoff target +- [Workspace files](https://docs.databricks.com/en/files/workspace.html) — the fallback handoff target +- [`dbutils.jobs.taskValues`](https://docs.databricks.com/en/dev-tools/databricks-utils.html#task-values-utility-dbutilsjobstaskvalues) — for non-file handoffs diff --git a/skills/databricks-serverless-storage-check/agents/openai.yaml b/skills/databricks-serverless-storage-check/agents/openai.yaml new file mode 100644 index 0000000..082e5b9 --- /dev/null +++ b/skills/databricks-serverless-storage-check/agents/openai.yaml @@ -0,0 +1,7 @@ +interface: + display_name: "Databricks Serverless Storage Check" + short_description: "Detect cross-task local-disk handoffs in serverless jobs" + icon_small: "./assets/databricks.svg" + icon_large: "./assets/databricks.png" + brand_color: "#FF3621" + default_prompt: "Use $databricks-serverless-storage-check to scan a serverless job, notebook, or DAB for cross-task file handoffs through /local_disk0, /tmp, or trustedTemp." 
diff --git a/skills/databricks-serverless-storage-check/assets/databricks.png b/skills/databricks-serverless-storage-check/assets/databricks.png new file mode 100644 index 0000000..263fe98 Binary files /dev/null and b/skills/databricks-serverless-storage-check/assets/databricks.png differ diff --git a/skills/databricks-serverless-storage-check/assets/databricks.svg b/skills/databricks-serverless-storage-check/assets/databricks.svg new file mode 100644 index 0000000..9d19110 --- /dev/null +++ b/skills/databricks-serverless-storage-check/assets/databricks.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/skills/databricks-serverless-storage-check/eval/evaluation_results.json b/skills/databricks-serverless-storage-check/eval/evaluation_results.json new file mode 100644 index 0000000..631a9bf --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/evaluation_results.json @@ -0,0 +1,691 @@ +{ + "skill_name": "databricks-serverless-storage-check", + "composite_score": 0.8866666666666667, + "duration_seconds": 18.059252977371216, + "mlflow_run_id": "18e1d8edab42450590804ea904cc7a57", + "levels": { + "unit": { + "level": "L1", + "score": 1.0, + "passed": true, + "num_feedbacks": 43, + "feedbacks": [ + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_1", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_2", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_3", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/yaml_syntax/references/pattern-catalog.md:block_4", + "value": "pass", + "rationale": "Valid YAML syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_5", + "value": "pass", + "rationale": "Valid Python syntax", + "source": 
"CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_6", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/yaml_syntax/references/pattern-catalog.md:block_7", + "value": "pass", + "rationale": "Valid YAML syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_8", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_10", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/pattern-catalog.md:block_11", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/sql_syntax/references/remediation-guide.md:block_2", + "value": "pass", + "rationale": "Valid SQL syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_3", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_4", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_5", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_6", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_7", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_8", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_9", + "value": "pass", + "rationale": "Valid 
Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_10", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_11", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/yaml_syntax/references/remediation-guide.md:block_12", + "value": "pass", + "rationale": "Valid YAML syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_13", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_14", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_15", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_16", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_17", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/yaml_syntax/references/remediation-guide.md:block_18", + "value": "pass", + "rationale": "Valid YAML syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_19", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/python_syntax/references/remediation-guide.md:block_20", + "value": "pass", + "rationale": "Valid Python syntax", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`databricks-serverless-migration`", + "value": "pass", + "rationale": "Link to '../databricks-serverless-migration/SKILL.md' exists", + "source": "CODE" + }, + { + "name": 
"unit/link/SKILL.md/`references/remediation-guide.md`", + "value": "pass", + "rationale": "Link to 'references/remediation-guide.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`references/pattern-catalog.md`", + "value": "pass", + "rationale": "Link to 'references/pattern-catalog.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`references/remediation-guide.md`", + "value": "pass", + "rationale": "Link to 'references/remediation-guide.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`databricks-serverless-migration`", + "value": "pass", + "rationale": "Link to '../databricks-serverless-migration/SKILL.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`databricks-dabs`", + "value": "pass", + "rationale": "Link to '../databricks-dabs/SKILL.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`databricks-jobs`", + "value": "pass", + "rationale": "Link to '../databricks-jobs/SKILL.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/`databricks-core`", + "value": "pass", + "rationale": "Link to '../databricks-core/SKILL.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/Pattern catalog", + "value": "pass", + "rationale": "Link to 'references/pattern-catalog.md' exists", + "source": "CODE" + }, + { + "name": "unit/link/SKILL.md/Remediation guide", + "value": "pass", + "rationale": "Link to 'references/remediation-guide.md' exists", + "source": "CODE" + }, + { + "name": "unit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_exists", + "value": "pass", + "rationale": "Passed in 0.001s", + "source": "CODE" + }, + { + "name": "unit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_not_empty", + "value": "pass", + "rationale": "Passed in 0.001s", + "source": "CODE" + }, + { + "name": 
"unit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_has_frontmatter", + "value": "pass", + "rationale": "Passed in 0.001s", + "source": "CODE" + }, + { + "name": "unit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillContent::test_no_todo_placeholders", + "value": "pass", + "rationale": "Passed in 0.000s", + "source": "CODE" + } + ], + "task_results": null, + "artifacts": null, + "metadata": { + "code_blocks_tested": 29, + "syntax_errors": 0 + }, + "trace_ids": [], + "details": { + "checks": [ + { + "name": "pattern-catalog.md:block_1", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_2", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_3", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_4", + "passed": true, + "message": "Valid YAML syntax" + }, + { + "name": "pattern-catalog.md:block_5", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_6", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_7", + "passed": true, + "message": "Valid YAML syntax" + }, + { + "name": "pattern-catalog.md:block_8", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_10", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "pattern-catalog.md:block_11", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_2", + "passed": true, + "message": "Valid SQL syntax" + }, + { + "name": "remediation-guide.md:block_3", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_4", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_5", + "passed": true, + "message": "Valid 
Python syntax" + }, + { + "name": "remediation-guide.md:block_6", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_7", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_8", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_9", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_10", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_11", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_12", + "passed": true, + "message": "Valid YAML syntax" + }, + { + "name": "remediation-guide.md:block_13", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_14", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_15", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_16", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_17", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_18", + "passed": true, + "message": "Valid YAML syntax" + }, + { + "name": "remediation-guide.md:block_19", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "remediation-guide.md:block_20", + "passed": true, + "message": "Valid Python syntax" + }, + { + "name": "`databricks-serverless-migration`", + "passed": true, + "message": "Link to '../databricks-serverless-migration/SKILL.md' exists" + }, + { + "name": "remediation-guide.md`", + "passed": true, + "message": "Link to 'references/remediation-guide.md' exists" + }, + { + "name": "pattern-catalog.md`", + "passed": true, + "message": "Link to 'references/pattern-catalog.md' exists" + }, + { + "name": 
"remediation-guide.md`", + "passed": true, + "message": "Link to 'references/remediation-guide.md' exists" + }, + { + "name": "`databricks-serverless-migration`", + "passed": true, + "message": "Link to '../databricks-serverless-migration/SKILL.md' exists" + }, + { + "name": "`databricks-dabs`", + "passed": true, + "message": "Link to '../databricks-dabs/SKILL.md' exists" + }, + { + "name": "`databricks-jobs`", + "passed": true, + "message": "Link to '../databricks-jobs/SKILL.md' exists" + }, + { + "name": "`databricks-core`", + "passed": true, + "message": "Link to '../databricks-core/SKILL.md' exists" + }, + { + "name": "Pattern catalog", + "passed": true, + "message": "Link to 'references/pattern-catalog.md' exists" + }, + { + "name": "Remediation guide", + "passed": true, + "message": "Link to 'references/remediation-guide.md' exists" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_exists", + "passed": true, + "message": "Passed in 0.001s" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_not_empty", + "passed": true, + "message": "Passed in 0.001s" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_has_frontmatter", + "passed": true, + "message": "Passed in 0.001s" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillContent::test_no_todo_placeholders", + "passed": true, + "message": "Passed in 0.000s" + } + ], + "passed": 43, + "total": 43 + }, + "duration_ms": 449, + "last_run_at": "2026-05-29T10:47:39.734319+00:00", + "is_carried_forward": true + }, + "static": { + "level": "static", + "score": 0.86, + "passed": true, + "num_feedbacks": 11, + "feedbacks": [ + { + "name": "static/tool_accuracy", + "value": "pass", + "rationale": "No tool references to check (derived from L1)", + "source": "CODE" + }, + { + "name": 
"static/examples_valid", + "value": "pass", + "rationale": "Derived from L1: 29/29 code blocks valid. Score: 10.0/10", + "source": "CODE" + }, + { + "name": "static/security/secrets_scan", + "value": "pass", + "rationale": "No hardcoded secrets detected", + "source": "CODE" + }, + { + "name": "static/self_contained", + "value": "pass", + "rationale": "Score: 7/10. The skill provides comprehensive context about the antipatterns, detection rules, remediation patterns, and decision trees. However, it references a `scripts/preflight.py` file that is central to the skill's operation but is NOT included in the skill content. The agent cannot verify or run the scanner without this file. Similarly, it references parent skills (`databricks-serverless-migration`, `databricks-core`) without including their content, though it does summarize the relevant boundaries. The pattern catalog and remediation guide are fully included as reference files, which is excellent. Recommendation: Include the `scripts/preflight.py` source (or at minimum its interface/usage documentation) so an agent can verify the scanner exists and understand its implementation. Without it, the agent must trust that the script exists at the referenced path.", + "source": "LLM_JUDGE" + }, + { + "name": "static/no_conflicts", + "value": "pass", + "rationale": "Score: 9/10. Instructions are consistent throughout. The boundary between parent skill (intra-task) and this sub-skill (cross-task) is stated clearly and repeated in the table. The remediation guide's 'What NOT to do' section aligns perfectly with the pattern catalog. The 'When /local_disk0/tmp IS fine' section correctly reinforces the core rule without contradicting it. The decision tree in the remediation guide is consistent with the numbered fix priority in the main SKILL.md.", + "source": "LLM_JUDGE" + }, + { + "name": "static/security", + "value": "pass", + "rationale": "Score: 9/10. No hardcoded tokens or secrets. 
The SQL example uses placeholder `` for grants. Permission notes are included for each fix pattern (WRITE VOLUME, CAN_EDIT, etc.). The false-positive escape hatch section appropriately warns 'Prefer fixing the antipattern when possible; this is an explicit opt-out, not a recommendation.'", + "source": "LLM_JUDGE" + }, + { + "name": "static/llm_navigable_structure", + "value": "pass", + "rationale": "Score: 9/10. Excellent hierarchical structure: clear headings (When to use, Quick start, Interpreting output, Core rule, Pattern catalog, Remediation summary, What this skill does NOT cover, Related skills). Tables are used effectively for severity/exit-code mapping and skill boundary delineation. The pattern catalog uses consistent formatting (ID, Severity, What it matches, Example, Fix, Detection rule) for each pattern. The remediation guide has a decision tree at the top for quick navigation.", + "source": "LLM_JUDGE" + }, + { + "name": "static/actionable_instructions", + "value": "pass", + "rationale": "Score: 8/10. The Quick start section provides concrete CLI invocations for all modes (`--notebook`, `--dir`, `--job-yaml`, `--job-id`, `--run-id`, `--json`). The remediation guide provides complete before/after code blocks with SQL setup commands. The ENV001 handling gives a numbered 3-step action plan. However, some instructions reference tools that may not exist (e.g., '/jira-actions skill or /support-escalation if available') and the agent cannot verify the scanner script exists. Recommendation: Add a verification step at the beginning: 'First, confirm scripts/preflight.py exists at the expected path relative to this skill file. If not found, the detection rules can still be applied manually by searching for the patterns described in the catalog.'", + "source": "LLM_JUDGE" + }, + { + "name": "static/scoped_clearly", + "value": "pass", + "rationale": "Score: 10/10. Exceptionally clear scoping. 
The skill explicitly states what it covers (cross-task file handoff antipattern only) and what it does NOT cover (ENVIRONMENT_SETUP_ERROR, single-notebook migration). The boundary table between parent and sub-skill is precise. The 'When to use this skill' section lists 5 specific triggers. The 'What this skill does NOT cover' section explicitly addresses the other failure mode from the same BSI thread and explains why it's excluded.", + "source": "LLM_JUDGE" + }, + { + "name": "static/error_handling", + "value": "pass", + "rationale": "Score: 7/10. Exit codes are clearly documented (0, 1, 2). The ENV001 pattern routes platform-side errors to escalation with a 3-step mitigation plan. The false-positive escape hatch provides a workaround. However, there's no guidance for: what happens if `databricks CLI` is not authenticated when using `--job-id`/`--run-id` modes, what if the scanner itself crashes, what if referenced notebooks in a DAB YAML can't be found, or what if the Volume doesn't exist yet when the fix is applied. Recommendation: Add a 'Troubleshooting' section covering: (1) CLI auth failures in --job-id/--run-id mode (point to databricks-core for auth setup), (2) scanner errors when referenced notebooks are missing (expected behavior: skip with warning?), (3) runtime errors when the target Volume doesn't exist (include the CREATE VOLUME prerequisite check).", + "source": "LLM_JUDGE" + }, + { + "name": "static/no_hallucination_triggers", + "value": "pass", + "rationale": "Score: 7/10. The skill references `scripts/preflight.py` extensively but this file is not provided \u2014 an agent might attempt to run it and fail, or worse, try to create it from the description. References to `/jira-actions` and `/support-escalation` skills are hedged with 'if available', which is good. 
The `dbutils.notebook.entry_point.getDbutils().notebook().getContext().jobId().get()` chain in the remediation guide is a real but fragile API that may not work in all contexts (e.g., interactive notebooks). The `dbutils.task_values.set` 'legacy spelling' mentioned in FANOUT001 may not actually exist. Recommendation: Clarify that `scripts/preflight.py` must be present in the skill's directory structure (or provide it). For the jobId context chain, add a note: 'This API is available only inside a job run; for interactive testing, hardcode a test run_id.' Verify whether `dbutils.task_values.set` is a real legacy API or remove the reference.", + "source": "LLM_JUDGE" + } + ], + "task_results": null, + "artifacts": null, + "metadata": { + "overall_score": 8.6, + "criteria": { + "tool_accuracy": 10.0, + "examples_valid": 10.0, + "self_contained": 7.0, + "no_conflicts": 9.0, + "security": 9.0, + "llm_navigable_structure": 9.0, + "actionable_instructions": 8.0, + "scoped_clearly": 10.0, + "error_handling": 7.0, + "no_hallucination_triggers": 7.0 + }, + "recommendations": [ + "Include the `scripts/preflight.py` source (or at minimum its interface/usage documentation) so an agent can verify the scanner exists and understand its implementation. Without it, the agent must trust that the script exists at the referenced path.", + "Add a verification step at the beginning: 'First, confirm scripts/preflight.py exists at the expected path relative to this skill file. 
If not found, the detection rules can still be applied manually by searching for the patterns described in the catalog.'", + "Add a 'Troubleshooting' section covering: (1) CLI auth failures in --job-id/--run-id mode (point to databricks-core for auth setup), (2) scanner errors when referenced notebooks are missing (expected behavior: skip with warning?), (3) runtime errors when the target Volume doesn't exist (include the CREATE VOLUME prerequisite check).", + "Clarify that `scripts/preflight.py` must be present in the skill's directory structure (or provide it). For the jobId context chain, add a note: 'This API is available only inside a job run; for interactive testing, hardcode a test run_id.' Verify whether `dbutils.task_values.set` is a real legacy API or remove the reference." + ], + "dimensions_evaluated": 10, + "dimensions_total": 10, + "coverage_factor": 1.0, + "priority_fixes": [] + }, + "trace_ids": [], + "details": {}, + "duration_ms": 34709, + "last_run_at": "2026-05-29T10:48:15.850730+00:00", + "is_carried_forward": true + }, + "integration": { + "level": "L2", + "score": 0.8, + "passed": true, + "num_feedbacks": 5, + "feedbacks": [ + { + "name": "integration/connectivity/workspace", + "value": "pass", + "rationale": "Workspace configured: https://adb-984752964297111.11.azuredatabricks.net", + "source": "CODE" + }, + { + "name": "integration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_workspace_reachable", + "value": "pass", + "rationale": "Passed in 1.652s", + "source": "CODE" + }, + { + "name": "integration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_catalog_exists", + "value": "pass", + "rationale": "Passed in 7.870s", + "source": "CODE" + }, + { + "name": "integration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_schema_exists", + "value": "fail", + 
"rationale": "FAILED: AssertionError: Schema 'skill_test' not found in main\nassert 'skill_test' in ['_inspire', '_silver_agent', 'abac_demo', 'abac_demo_ms', 'abhijeet_rao_tooling', 'abhijeet_test', ...]\nskills/databricks-serverless-storage-check/eval/tests/test_integration.py:28: in test_schema_exists\n assert test_schema in schemas, f\"Schema '{test_schema}' not found in {test_catalog}\"\nE AssertionError: Schema 'skill_test' not found in main\nE assert 'skill_test' in ['_inspire', '_silver_agent', 'abac_demo', 'abac_demo_ms', 'abhijeet_rao_tooling', 'abhijeet_test', ...]", + "source": "CODE" + }, + { + "name": "integration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestWarehouse::test_warehouse_accessible", + "value": "pass", + "rationale": "Passed in 0.167s", + "source": "CODE" + } + ], + "task_results": null, + "artifacts": null, + "metadata": { + "user_integration_tests": 4, + "connectivity_checks": 1 + }, + "trace_ids": [], + "details": { + "checks": [ + { + "name": "workspace", + "passed": true, + "message": "Workspace configured: https://adb-984752964297111.11.azuredatabricks.net" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_workspace_reachable", + "passed": true, + "message": "Passed in 1.652s" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_catalog_exists", + "passed": true, + "message": "Passed in 7.870s" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_schema_exists", + "passed": false, + "message": "FAILED: AssertionError: Schema 'skill_test' not found in main\nassert 'skill_test' in ['_inspire', '_silver_agent', 'abac_demo', 'abac_demo_ms', 'abhijeet_rao_tooling', 'abhijeet_test', ...]\nskills/databricks-serverless-storage-check/eval/tests/test_integration.py:28: in test_schema_exists\n 
assert test_schema in schemas, f\"Schema '{test_schema}' not found in {test_catalog}\"\nE AssertionError: Schema 'skill_test' not found in main\nE assert 'skill_test' in ['_inspire', '_silver_agent', 'abac_demo', 'abac_demo_ms', 'abhijeet_rao_tooling', 'abhijeet_test', ...]" + }, + { + "name": "skills.databricks-serverless-storage-check.eval.tests.test_integration.TestWarehouse::test_warehouse_accessible", + "passed": true, + "message": "Passed in 0.167s" + } + ], + "passed": 4, + "total": 5 + }, + "duration_ms": 10383, + "last_run_at": "2026-05-29T11:02:12.624608+00:00", + "is_carried_forward": false + } + }, + "suggestions": [] +} \ No newline at end of file diff --git a/skills/databricks-serverless-storage-check/eval/ground_truth.yaml b/skills/databricks-serverless-storage-check/eval/ground_truth.yaml new file mode 100644 index 0000000..fc483de --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/ground_truth.yaml @@ -0,0 +1,154 @@ +# ground_truth.yaml — Test cases for `databricks-serverless-storage-check`. +# +# Goal: verify the skill is correctly invoked (parent reparented to +# databricks-serverless-migration) and that the preflight scanner is run on +# the right inputs with the right remediation guidance. +# +# The cases below cover the documented invocation triggers from SKILL.md: +# 1. Permission-denied / trustedTemp error symptom +# 2. Fan-out via dbutils.notebook.run (pre-deploy review) +# 3. Small-payload handoff (taskValues instead of a file) +# 4. 
Boundary case: single-notebook migration should NOT invoke this skill +# (that's the parent databricks-serverless-migration skill's job) + +version: "4" + +test_cases: + # ───────────────────────────────────────────────────────────────────── + # Case 1 — Classic BSI symptom: trustedTemp permission denied + # ───────────────────────────────────────────────────────────────────── + - id: storage-check_001_trusted_temp_permission_denied + inputs: + prompt: | + My serverless job is failing with this error and I can't figure out + what's going on: + + INTERNAL_ERROR: [Errno 13] Permission denied: + '/local_disk0/spark-abc123/trustedTemp-xyz789/tmp7b2' + + The parent notebook writes a parquet file to /local_disk0 and then + calls dbutils.notebook.run on a child notebook that reads it back. + Same code works fine on classic compute. What's wrong? + expectations: + expected_facts: + - "cross-task" + - "/Volumes" + - "local disk" + assertions: + - "Identifies the failure as a cross-task local-disk handoff antipattern, not a generic permission issue" + - "Recommends moving the handoff to UC Volumes (or /Workspace as fallback) rather than fixing local-disk permissions" + - "Explains that on serverless each task may run on a different node so /local_disk0 is per-task only" + - "Does NOT recommend changing chmod or filesystem ACLs as a fix (that misses the root cause)" + expected_patterns: + - '(?i)/Volumes/' + - '(?i)cross[- ]task|fan[- ]out|sibling' + trace_expectations: + required_tools: [] + metadata: + category: error_diagnosis + difficulty: intermediate + regression_intent: "Without this skill, agent typically blames file permissions or suggests running chmod on the local-disk path. With it, agent recognizes the BSI signature and routes to UC Volumes handoff." 
+ generation_session_id: "legacy_v4" + sources: + - type: manual + reference: "skills/databricks-serverless-storage-check/SKILL.md — cross-task local-disk handoff antipattern and UC Volumes remediation" + + # ───────────────────────────────────────────────────────────────────── + # Case 2 — Pre-deploy review of a fan-out job + # ───────────────────────────────────────────────────────────────────── + - id: storage-check_002_fanout_preflight + inputs: + prompt: | + I'm about to deploy a multi-task serverless job. The parent task is + a notebook that does feature engineering and writes a delta table to + /local_disk0/features/, then it kicks off 4 child notebooks in + parallel via dbutils.notebook.run, each reading from that same path + and training a different model. Can you sanity-check this before I + deploy? + expectations: + expected_facts: + - "/Volumes" + - "fan-out" + - "cross-task" + assertions: + - "Flags the parent-writes-/local_disk0 child-reads pattern as a serverless blocker that will fail under parallel execution" + - "Recommends moving the shared features path off /local_disk0 to /Volumes/<catalog>/<schema>/<volume>/<path> before deploying" + - "Mentions that /local_disk0/tmp is fine for INTRA-task scratch but not for cross-task sharing" + - "Does NOT block the user from using serverless compute entirely — only blocks the handoff pattern" + expected_patterns: + - '(?i)/Volumes/' + - '(?i)blocker|will fail|serverless' + trace_expectations: + required_tools: [] + metadata: + category: pre_deploy_review + difficulty: intermediate + regression_intent: "Without this skill, agent likely approves the design or only complains about generic serverless config; with it, agent calls the fan-out pattern out specifically." 
+ generation_session_id: "legacy_v4" + sources: + - type: manual + reference: "skills/databricks-serverless-storage-check/SKILL.md — fan-out pattern and pre-deploy review guidance" + + # ───────────────────────────────────────────────────────────────────── + # Case 3 — taskValues alternative + # ───────────────────────────────────────────────────────────────────── + - id: storage-check_003_small_payload_handoff + inputs: + prompt: | + On my serverless multi-task job, the parent notebook computes a + config dict (about 2 KB of JSON: model hyperparameters and a list + of feature names) and writes it to /tmp/config.json so the child + notebook can read it. The child fails with permission denied + intermittently. What's the right way to pass this between tasks? + expectations: + expected_facts: + - "taskValues" + - "dbutils.jobs.taskValues" + assertions: + - "Recommends dbutils.jobs.taskValues.set / .get as the preferred fix for this small payload, not a file handoff" + - "Explains that the payload is well under the ~48 KB taskValues limit so no file is needed at all" + - "Calls out /tmp as per-task-only on serverless (not durable across tasks)" + expected_patterns: + - '(?i)taskValues|task_values' + trace_expectations: + required_tools: [] + metadata: + category: remediation_choice + difficulty: easy + regression_intent: "Without this skill, agent usually suggests UC Volumes for everything. With it, agent recognizes small-payload case and recommends taskValues (no file)." 
+ generation_session_id: "legacy_v4" + sources: + - type: manual + reference: "skills/databricks-serverless-storage-check/SKILL.md — taskValues as preferred fix for small cross-task payloads under 48 KB" + + # ───────────────────────────────────────────────────────────────────── + # Case 4 — Boundary case: single-notebook migration should NOT call this + # ───────────────────────────────────────────────────────────────────── + - id: storage-check_004_single_notebook_boundary + inputs: + prompt: | + I'm migrating one notebook from classic DBR 13.3 to serverless. It + reads a parquet from DBFS, runs some pandas transforms with + intermediate files in /local_disk0/tmp/, and writes the result back + to DBFS. No sub-notebooks, no child tasks, just this one notebook + running end to end. What do I need to change? + expectations: + expected_facts: + - "/Volumes" + - "/local_disk0/tmp" + assertions: + - "Recommends replacing the DBFS read/write with UC Volumes paths (parent skill's responsibility)" + - "Keeps /local_disk0/tmp for the intra-task pandas scratch files — does NOT incorrectly flag this as a cross-task antipattern" + - "Treats this as a single-notebook migration owned by databricks-serverless-migration, not a fan-out problem" + expected_patterns: + - '(?i)/local_disk0/tmp|local_disk0/tmp' + trace_expectations: + required_tools: [] + metadata: + category: boundary + difficulty: hard + regression_intent: "Tests that the storage-check skill correctly bounds its scope — it should NOT misfire on single-notebook intra-task scratch use of /local_disk0/tmp, which is the parent migration skill's correct recommendation." 
+ generation_session_id: "legacy_v4" + sources: + - type: manual + reference: "skills/databricks-serverless-storage-check/SKILL.md — scope boundary: intra-task /local_disk0/tmp is correct, cross-task handoff is the antipattern" diff --git a/skills/databricks-serverless-storage-check/eval/manifest.yaml b/skills/databricks-serverless-storage-check/eval/manifest.yaml new file mode 100644 index 0000000..7e56327 --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/manifest.yaml @@ -0,0 +1,66 @@ +skill_name: databricks-serverless-storage-check +tool_modules: [] +description: "Evaluation config for databricks-serverless-storage-check" + +# Test directory — where L1/L2 look for pytest tests. +# Default: eval/tests/ (relative to skill dir). +# test_dir: /absolute/path/to/tests + +# Workflow-style ground_truth: cases share the per-side cwd so case 2 can +# see case 1's artifacts. Default false (per-case isolation). Set to true +# for multi-step workflows like create → deploy → verify. See CLAUDE.md +# "Workflow-style ground_truth" section for the trade-off. +# shared_cwd: false + +scorers: + default_guidelines: + - "Response must address user's request completely" + - "Code must follow documented best practices" + +# ── L3 static audit tuning ───────────────────────────────────────────── +# static_audit: +# focus_areas: [security, no_hallucination_triggers, actionable] +# skip_dimensions: [mcp_e2e] +# severity_overrides: +# security: critical + +# ── Comparison judge rubric + anti-bias (used by /api/eval/compare) ──── +# comparison_judge: +# rubric_file: comparison_rubric.md # optional; falls back to DEFAULT_RUBRIC +# dimensions: [correctness, user_experience, safety, code_quality] +# anti_bias: +# - "Do not count tool calls. Fewer calls are not inherently better." +# - "Do not reward shorter output." 
+ +# ── Custom MLflow judges (L4 + L5) ───────────────────────────────────── +# Skill-specific LLM and rule judges layered on top of the built-in dimension +# judges. Each judge becomes an MLflow trace assessment + a row in +# task_results[i].custom_judge_assessments. Built-ins (response_grounded_in_tools, +# agent_used_required_tools, dataset_or_pipeline_created) ship for free; turn +# any of them off via custom_judges_disable below. +# +# custom_judges: +# - name: prefers_materialized_view +# type: llm # llm | rule +# level: output # thinking | output | both +# feedback_value_type: yes_no # yes_no (default) | bool +# instructions: | +# Examine { inputs }, { outputs }, and the { trace }. +# The user described a HISTORICAL BATCH workload, not streaming. +# Did the agent recommend a Materialized View instead of a Streaming Table? +# Answer 'yes' if the agent proposed an MV (or justified an ST). 'no' otherwise. +# Quote the exact phrase from the response that supports your verdict. +# model: databricks:/databricks-claude-opus-4-6 # optional override +# +# - name: pipeline_runs_serverless +# type: rule +# level: output +# scorer_ref: judges.pipeline_runs_serverless # see /eval/judges.py +# +# custom_judges_disable: [] # e.g. [response_grounded_in_tools] + +# ── MCP end-to-end probes — L3 invokes these tools against real Databricks +# mcp_e2e_probes: +# - tool: execute_sql +# params: { sql_query: "SELECT 1" } +# expect_success: true diff --git a/skills/databricks-serverless-storage-check/eval/output_instructions.md b/skills/databricks-serverless-storage-check/eval/output_instructions.md new file mode 100644 index 0000000..0912eb7 --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/output_instructions.md @@ -0,0 +1,66 @@ +# Output Evaluation Criteria — databricks-serverless-storage-check + +L5 grades the WITH-skill response against these expectations (plus per-case `expected_facts` / `assertions` in `ground_truth.yaml`). 
The goal of this rubric is to distinguish a real, scoped use of the storage-check skill from a generic "anything about serverless" reply. + +## Expected Artifacts + +Every WITH-skill response on a real storage-check trigger should produce: + +- A diagnosis that names the antipattern explicitly (cross-task local-disk handoff) by symptom or pattern ID (`FANOUT001`–`FANOUT006`, `ENV001`) +- A recommended fix tier from the priority ladder: **UC Volumes → /Workspace → taskValues → keep local for intra-task only** +- A concrete code snippet (Python or DAB YAML) showing the before/after rewrite — not just prose +- The severity of the finding (Blocker / Warning / Info) so the user can decide whether to deploy +- Where appropriate, an invocation of `scripts/preflight.py` (or the explicit reason it was skipped, e.g. paste-only error trace) + +## Mandatory Facts + +Defined per test case in `ground_truth.yaml::test_cases[].expectations.expected_facts`. Common cross-case mandatory tokens: + +- `/Volumes/` — the preferred handoff target. Should appear in every fix recommendation except taskValues-only cases. +- `cross-task` or `fan-out` — the antipattern label. Without one of these phrases the diagnosis is ambiguous. +- `taskValues` — when the payload is small (well under 48 KB), this must surface as the preferred fix over a file handoff. + +## Negative Signals (the agent should NOT do these) + +These are skill-specific anti-patterns that distinguish a misuse of the skill from a correct invocation: + +- Recommend `chmod` / file ACL changes — that misdiagnoses the failure as a permission issue when it is actually a cross-node visibility issue. +- Tell the user to switch back to classic compute — the skill exists to keep them on serverless; rolling back is not the fix. +- Recommend UC Volumes for genuinely intra-task scratch (`/local_disk0/tmp` is correct there, and the parent migration skill explicitly recommends it). 
Misfiring the antipattern on a non-handoff scratch path is a regression. +- Generate code fixes for `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` — that is the `ENV001` info-only finding. Route to support escalation instead. +- Recommend writing to DBFS as the cross-task handoff target — DBFS mounts are deprecated and not the documented fix. +- Block deploy entirely with no actionable remediation — the skill must always end with at least one fix snippet the user can apply. + +## Comparison Approach + +The semantic grader handles the WITH vs WITHOUT comparison. Reference fixture remediation snippets live in `eval/source_of_truth/` (empty by default — drop in expected DAB rewrites for higher-fidelity L5 grading once a baseline exists). + +## Skill Invocation Verification + +For the user's stated goal of "verify the skill is correctly invoked": + +- **L4** inspects whether the agent loaded the storage-check `SKILL.md` for cases 1–3 and did NOT misfire on case 4 (boundary). The four cases collectively exercise positive triggers (error symptom, fan-out, small-payload) and a negative boundary (single-notebook intra-task). +- **L5** WITH/WITHOUT compares answers: case 4's NEEDS_SKILL count should be near zero (correct scoping — the skill should not "add value" when not needed); cases 1–3's POSITIVE count should be high (skill provides real diagnostic lift over a baseline agent). +- A correctly integrated skill should also reference the parent (`databricks-serverless-migration`) when the user's workload is unmigrated, demonstrating the hierarchy works as intended. 
+ +## Severity Calibration Reference + +When L5 classifies an assertion, use this hierarchy: + +- POSITIVE — the WITH response named the pattern, surfaced the severity, and produced a runnable fix snippet +- NEEDS_SKILL — the WITHOUT response missed the diagnosis but WITH caught it (this is the lift the skill is paying for) +- REGRESSION — the WITHOUT response was better than WITH (skill made things worse — investigate) +- NEUTRAL — both responses arrived at substantively the same fix (skill is dead weight for this case) + +## Per-Case Acceptance Bar + +Each test case in `ground_truth.yaml` maps to a concrete acceptance bar that L5 should hold the WITH-skill response to: + +- **Case 1 (trustedTemp permission denied)**: Must produce a UC Volumes fix snippet AND explain the per-node visibility issue. A reply that only mentions "use Volumes" without explaining why local disk fails is incomplete. +- **Case 2 (fan-out preflight)**: Must produce an explicit deploy decision (block / allow with changes / allow as-is) backed by the scanner's severity tier. Generic "looks risky" without a verdict is incomplete. +- **Case 3 (small payload)**: Must recommend `taskValues` as the primary fix and explain the 48 KB ceiling. Recommending Volumes for a 2 KB JSON payload is over-engineering and should be flagged as a partial regression. +- **Case 4 (single-notebook boundary)**: Must NOT recommend moving `/local_disk0/tmp` for intra-task scratch. Doing so is a false positive — the antipattern is cross-task, not intra-task, and confusing the two is the most common skill misuse. + +## Cross-Reference With Pattern Catalog + +The agent's response should cross-reference the in-skill pattern catalog (`references/pattern-catalog.md`) when discussing fixes. 
A response that quotes the catalog by pattern ID demonstrates the skill is loaded and applied; a response that produces correct fixes but never names a pattern ID is providing the right answer for the wrong reasons and should score lower on attribution. diff --git a/skills/databricks-serverless-storage-check/eval/report.html b/skills/databricks-serverless-storage-check/eval/report.html new file mode 100644 index 0000000..df2721d --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/report.html @@ -0,0 +1,865 @@ + + + + +Skill Evaluation: databricks-serverless-storage-check + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skill Evaluation: databricks-serverless-storage-check + 2026-05-29 12:02 | MLflow: 18e1d8edab42450590804ea904cc7a57 + +
+ +
+
Summary
+
+
Composite Score
89%
+
Levels Run
3
+
Duration
18s
+
Total Checks
59
+
+
+ +
+
Level Scores
+
+
+ L1 unit +
+ 100% +
+
+ L2 integration +
+ 80% +
+
+ L3 static +
+ 86% +
+
+ L4 thinking +
+ -- +
+
+ L5 output +
+ -- +
+
+ + +
+
Level 1: UNIT
+
+
+ + Unit + 100% + FROM PREVIOUS RUN +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StatusCheckDetailsSource
PASSunit/python_syntax/references/pattern-catalog.md:block_1Valid Python syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_2Valid Python syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_3Valid Python syntaxCODE
PASSunit/yaml_syntax/references/pattern-catalog.md:block_4Valid YAML syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_5Valid Python syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_6Valid Python syntaxCODE
PASSunit/yaml_syntax/references/pattern-catalog.md:block_7Valid YAML syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_8Valid Python syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_10Valid Python syntaxCODE
PASSunit/python_syntax/references/pattern-catalog.md:block_11Valid Python syntaxCODE
PASSunit/sql_syntax/references/remediation-guide.md:block_2Valid SQL syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_3Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_4Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_5Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_6Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_7Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_8Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_9Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_10Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_11Valid Python syntaxCODE
PASSunit/yaml_syntax/references/remediation-guide.md:block_12Valid YAML syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_13Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_14Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_15Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_16Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_17Valid Python syntaxCODE
PASSunit/yaml_syntax/references/remediation-guide.md:block_18Valid YAML syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_19Valid Python syntaxCODE
PASSunit/python_syntax/references/remediation-guide.md:block_20Valid Python syntaxCODE
PASSunit/link/SKILL.md/`databricks-serverless-migration`Link to '../databricks-serverless-migration/SKILL.md' existsCODE
PASSunit/link/SKILL.md/`references/remediation-guide.md`Link to 'references/remediation-guide.md' existsCODE
PASSunit/link/SKILL.md/`references/pattern-catalog.md`Link to 'references/pattern-catalog.md' existsCODE
PASSunit/link/SKILL.md/`references/remediation-guide.md`Link to 'references/remediation-guide.md' existsCODE
PASSunit/link/SKILL.md/`databricks-serverless-migration`Link to '../databricks-serverless-migration/SKILL.md' existsCODE
PASSunit/link/SKILL.md/`databricks-dabs`Link to '../databricks-dabs/SKILL.md' existsCODE
PASSunit/link/SKILL.md/`databricks-jobs`Link to '../databricks-jobs/SKILL.md' existsCODE
PASSunit/link/SKILL.md/`databricks-core`Link to '../databricks-core/SKILL.md' existsCODE
PASSunit/link/SKILL.md/Pattern catalogLink to 'references/pattern-catalog.md' existsCODE
PASSunit/link/SKILL.md/Remediation guideLink to 'references/remediation-guide.md' existsCODE
PASSunit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_existsPassed in 0.001sCODE
PASSunit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_not_emptyPassed in 0.001sCODE
PASSunit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillStructure::test_skill_md_has_frontmatterPassed in 0.001sCODE
PASSunit/pytest/skills.databricks-serverless-storage-check.eval.tests.test_unit.TestSkillContent::test_no_todo_placeholdersPassed in 0.000sCODE
+
+
+
+
Level 2: INTEGRATION
+
+
+ + Integration + 80% + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StatusCheckDetailsSource
PASSintegration/connectivity/workspaceWorkspace configured: https://adb-984752964297111.11.azuredatabricks.netCODE
PASSintegration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_workspace_reachablePassed in 1.652sCODE
PASSintegration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_catalog_existsPassed in 7.870sCODE
FAILintegration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestDatabricksConnectivity::test_schema_existsFAILED: AssertionError: Schema 'skill_test' not found in main +assert 'skill_test' in ['_inspire', '_silver_agent', 'abac_demo', 'abac_demo_ms', 'abhijeet_rao_tooling', 'abhijeet_test', ...] +skills/databricks-serverless-storage-check/eval/tests/test_integration.py:28: in test_schema_exists + assertCODE
PASSintegration/pytest/skills.databricks-serverless-storage-check.eval.tests.test_integration.TestWarehouse::test_warehouse_accessiblePassed in 0.167sCODE
+
+
+
+
Level 3: STATIC
+
+
+ + Static + 86% + FROM PREVIOUS RUN +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StatusCheckDetailsSource
PASSstatic/tool_accuracyNo tool references to check (derived from L1)CODE
PASSstatic/examples_validDerived from L1: 29/29 code blocks valid. Score: 10.0/10CODE
PASSstatic/security/secrets_scanNo hardcoded secrets detectedCODE
PASSstatic/self_containedScore: 7/10. The skill provides comprehensive context about the antipatterns, detection rules, remediation patterns, and decision trees. However, it references a `scripts/preflight.py` file that is central to the skill's operation but is NOT included in the skill content. The agent cannot verify or LLM
PASSstatic/no_conflictsScore: 9/10. Instructions are consistent throughout. The boundary between parent skill (intra-task) and this sub-skill (cross-task) is stated clearly and repeated in the table. The remediation guide's 'What NOT to do' section aligns perfectly with the pattern catalog. The 'When /local_disk0/tmp IS fLLM
PASSstatic/securityScore: 9/10. No hardcoded tokens or secrets. The SQL example uses placeholder `<workspace-user-or-sp>` for grants. Permission notes are included for each fix pattern (WRITE VOLUME, CAN_EDIT, etc.). The false-positive escape hatch section appropriately warns 'Prefer fixing the antipattern when possibLLM
PASSstatic/llm_navigable_structureScore: 9/10. Excellent hierarchical structure: clear headings (When to use, Quick start, Interpreting output, Core rule, Pattern catalog, Remediation summary, What this skill does NOT cover, Related skills). Tables are used effectively for severity/exit-code mapping and skill boundary delineation. TLLM
PASSstatic/actionable_instructionsScore: 8/10. The Quick start section provides concrete CLI invocations for all modes (`--notebook`, `--dir`, `--job-yaml`, `--job-id`, `--run-id`, `--json`). The remediation guide provides complete before/after code blocks with SQL setup commands. The ENV001 handling gives a numbered 3-step action pLLM
PASSstatic/scoped_clearlyScore: 10/10. Exceptionally clear scoping. The skill explicitly states what it covers (cross-task file handoff antipattern only) and what it does NOT cover (ENVIRONMENT_SETUP_ERROR, single-notebook migration). The boundary table between parent and sub-skill is precise. The 'When to use this skill' sLLM
PASSstatic/error_handlingScore: 7/10. Exit codes are clearly documented (0, 1, 2). The ENV001 pattern routes platform-side errors to escalation with a 3-step mitigation plan. The false-positive escape hatch provides a workaround. However, there's no guidance for: what happens if `databricks CLI` is not authenticated when usLLM
PASSstatic/no_hallucination_triggersScore: 7/10. The skill references `scripts/preflight.py` extensively but this file is not provided — an agent might attempt to run it and fail, or worse, try to create it from the description. References to `/jira-actions` and `/support-escalation` skills are hedged with 'if available', which is gooLLM
+
+
+
+
Level 4: THINKING
+
Thinking + NOT RUN
+
+
+
Level 5: OUTPUT
+
Output + NOT RUN
+
+ + \ No newline at end of file diff --git a/skills/databricks-serverless-storage-check/eval/tests/conftest.py b/skills/databricks-serverless-storage-check/eval/tests/conftest.py new file mode 100644 index 0000000..68967de --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/tests/conftest.py @@ -0,0 +1,72 @@ +"""Shared fixtures for databricks-serverless-storage-check evaluation tests. + +Unit tests (no marker) run at L1. Integration tests (@pytest.mark.integration) run at L2. +Environment variables are injected by the evaluator for integration tests: + DATABRICKS_HOST, DATABRICKS_PROFILE, TEST_CATALOG, TEST_SCHEMA, WAREHOUSE_ID +""" + +import os +from pathlib import Path + +import pytest + + +def pytest_configure(config): + """Register custom markers.""" + config.addinivalue_line("markers", "integration: marks tests as integration tests (require Databricks)") + + +@pytest.fixture(scope="session") +def skill_dir(): + """Path to the skill directory (parent of eval/).""" + return Path(__file__).resolve().parents[2] + + +@pytest.fixture(scope="session") +def skill_md(skill_dir): + """Contents of SKILL.md.""" + skill_path = skill_dir / "SKILL.md" + if skill_path.exists(): + return skill_path.read_text() + return "" + + +@pytest.fixture(scope="session") +def databricks_configured(): + """Skip tests if Databricks is not configured.""" + if not os.environ.get("DATABRICKS_HOST"): + pytest.skip("DATABRICKS_HOST not set — run stf auth first") + return True + + +@pytest.fixture(scope="session") +def workspace_client(databricks_configured): + """Create a Databricks WorkspaceClient from environment.""" + from databricks.sdk import WorkspaceClient + client = WorkspaceClient( + host=os.environ.get("DATABRICKS_HOST"), + profile=os.environ.get("DATABRICKS_PROFILE"), + ) + client.current_user.me() + return client + + +@pytest.fixture(scope="session") +def test_catalog(): + """Test catalog from environment.""" + return os.environ.get("TEST_CATALOG", "main") + + 
+@pytest.fixture(scope="session") +def test_schema(): + """Test schema from environment.""" + return os.environ.get("TEST_SCHEMA", "default") + + +@pytest.fixture(scope="session") +def warehouse_id(): + """SQL warehouse ID from environment.""" + wid = os.environ.get("WAREHOUSE_ID") + if not wid: + pytest.skip("WAREHOUSE_ID not set") + return wid diff --git a/skills/databricks-serverless-storage-check/eval/tests/test_integration.py b/skills/databricks-serverless-storage-check/eval/tests/test_integration.py new file mode 100644 index 0000000..50e4e05 --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/tests/test_integration.py @@ -0,0 +1,39 @@ +"""Integration tests for databricks-serverless-storage-check. + +These tests run at L2 and require a real Databricks workspace. +Environment variables are injected by the evaluator: + DATABRICKS_HOST, DATABRICKS_PROFILE, TEST_CATALOG, TEST_SCHEMA, WAREHOUSE_ID +""" + +import pytest + + +@pytest.mark.integration +class TestDatabricksConnectivity: + """Verify basic Databricks workspace connectivity.""" + + def test_workspace_reachable(self, workspace_client): + """Can authenticate and reach the workspace.""" + user = workspace_client.current_user.me() + assert user.user_name is not None + + def test_catalog_exists(self, workspace_client, test_catalog): + """The configured test catalog exists.""" + catalogs = [c.name for c in workspace_client.catalogs.list()] + assert test_catalog in catalogs, f"Catalog '{test_catalog}' not found" + + def test_schema_exists(self, workspace_client, test_catalog, test_schema): + """The configured test schema exists.""" + schemas = [s.name for s in workspace_client.schemas.list(catalog_name=test_catalog)] + assert test_schema in schemas, f"Schema '{test_schema}' not found in {test_catalog}" + + +@pytest.mark.integration +class TestWarehouse: + """Verify SQL warehouse accessibility.""" + + def test_warehouse_accessible(self, workspace_client, warehouse_id): + """The configured SQL 
warehouse is accessible.""" + warehouse = workspace_client.warehouses.get(warehouse_id) + assert warehouse is not None + assert warehouse.state is not None diff --git a/skills/databricks-serverless-storage-check/eval/tests/test_unit.py b/skills/databricks-serverless-storage-check/eval/tests/test_unit.py new file mode 100644 index 0000000..9ea1169 --- /dev/null +++ b/skills/databricks-serverless-storage-check/eval/tests/test_unit.py @@ -0,0 +1,38 @@ +"""Unit tests for databricks-serverless-storage-check. + +These tests run at L1 (no Databricks or external services needed). +""" + +from pathlib import Path + +import yaml + + +class TestSkillStructure: + """Validate the skill directory has required files and structure.""" + + def test_skill_md_exists(self, skill_dir): + """SKILL.md must exist in the skill directory.""" + assert (skill_dir / "SKILL.md").exists(), "SKILL.md not found" + + def test_skill_md_not_empty(self, skill_md): + """SKILL.md must have meaningful content.""" + assert len(skill_md.strip()) > 100, "SKILL.md is too short or empty" + + def test_skill_md_has_frontmatter(self, skill_md): + """SKILL.md should have YAML frontmatter with name and description.""" + assert skill_md.startswith("---"), "SKILL.md missing YAML frontmatter (---)" + parts = skill_md.split("---", 2) + assert len(parts) >= 3, "SKILL.md frontmatter not properly closed" + frontmatter = yaml.safe_load(parts[1]) + assert frontmatter is not None, "Frontmatter is empty" + + +class TestSkillContent: + """Validate skill content quality.""" + + def test_no_todo_placeholders(self, skill_md): + """SKILL.md should not contain unresolved TODO placeholders.""" + # TODO: Uncomment when skill is fully authored + # assert "TODO" not in skill_md, "SKILL.md contains unresolved TODOs" + pass diff --git a/skills/databricks-serverless-storage-check/eval/thinking_instructions.md b/skills/databricks-serverless-storage-check/eval/thinking_instructions.md new file mode 100644 index 0000000..e890ad3 --- 
/dev/null +++ b/skills/databricks-serverless-storage-check/eval/thinking_instructions.md @@ -0,0 +1,56 @@ +# Thinking Evaluation Criteria — databricks-serverless-storage-check + +These layer on top of the four generic L4 dimension judges (Efficiency, Clarity, Recovery, Completeness). They are skill-specific signals that the storage-check skill is being applied correctly, not generic "did the agent answer politely." + +## Efficiency + +The skill ships an executable preflight scanner. Spending agent tokens to re-derive what the scanner detects is wasted work. + +- Prefer running `scripts/preflight.py` against the supplied input (notebook, dir, job-yaml, job-id, or run-id) over reading every file by hand. The whole point of the skill is the scanner. +- For a paste of an error trace (no code), it is correct to skip the scanner and route directly to the pattern catalog plus remediation guide — do not request the user's full notebook just to confirm a symptom that is already diagnostic. +- One read of `SKILL.md` is enough; do not re-read it between every step. Once the pattern is identified, jump to `references/remediation-guide.md` for the fix. +- Do not invoke `databricks` CLI commands unless `--job-id` / `--run-id` modes are explicitly being used; for paste-in cases, the scanner runs locally with no CLI dependency. + +## Clarity + +Output language has to map cleanly onto the user's mental model of "what happens when I deploy this." + +- Name the pattern by ID when the scanner finds one (`FANOUT001`, `FANOUT006`, `ENV001`, etc.) so the user can cross-reference the pattern catalog. +- When recommending a fix, surface the severity tier (Blocker / Warning / Info) explicitly — users need to know whether they can deploy as-is or must change code first. +- Distinguish intra-task scratch (`/local_disk0/tmp` is fine, owned by parent skill) from cross-task handoff (must move off local disk, this skill's domain). Conflating these two is the most common diagnostic error. 
+- When the parent skill (`databricks-serverless-migration`) is the right destination, say so explicitly and hand off rather than papering over an unmigrated workload. + +## Recovery + +This skill has well-defined "do not try to fix" boundaries — respect them. + +- If the user reports `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT`, do NOT try to fix it — route to support escalation (this is the `ENV001` info-only finding). Attempts to "fix" this with code changes are off-scope and will fail. +- If `--job-id` mode fails (no CLI, no permission, bad profile), fall back to asking the user to paste the notebook source rather than aborting the entire flow. +- If a fix attempt does not pass the scanner on re-run, re-classify the pattern; do not loop on the same fix. The pattern catalog has seven distinct findings (`FANOUT001`–`FANOUT006` plus `ENV001`) — re-running the same Volumes rewrite against a `FANOUT006` hardcoded-path issue is wasted effort. +- When unsure, surface the scanner JSON output verbatim — do not invent severities or pattern IDs. + +## Completeness + +A correct invocation of this skill produces a small, fixed shape of output regardless of the input mode. + +- A complete answer for an error-symptom prompt includes: (1) the pattern name, (2) why it fails on serverless specifically, (3) the recommended fix in priority order (Volumes > Workspace > taskValues), (4) a code snippet for the chosen fix. +- A complete answer for a pre-deploy review includes: scanner output summary (counts per severity), the specific files/lines flagged, and an explicit go/no-go recommendation for the deploy. +- Always invoke the parent `databricks-serverless-migration` skill first when the user has not yet migrated the notebook from classic — do not jump straight to storage-check on an unmigrated workload, because the cross-task handoff antipattern is a deploy-time concern that only matters once single-notebook migration is done.
+- For boundary cases (single-notebook, intra-task scratch only), explicitly say "this is not a cross-task antipattern" so the user understands why no fix is needed here. Silence is interpreted as agreement. + +## Hierarchy Awareness + +This skill is a niche sub-skill of `databricks-serverless-migration` (the parent in the integrated hierarchy). The agent should treat the relationship explicitly: + +- Reference the parent skill by name when the user's workload is unmigrated — the parent owns the four-step Ingest → Analyze → Test → Validate lifecycle; this skill plugs in at "Test" for multi-task hardening. +- Do not re-derive single-notebook migration guidance — quote or link to the parent skill instead of duplicating its content. +- When the parent skill's per-task scratch guidance (`/local_disk0/tmp` is fine inside a task) is correct for the case, defer to it; do not override. + +## Scanner Output Hygiene + +When the preflight scanner is run, the agent must treat its output as authoritative: + +- Surface the scanner exit code (0 / 1 / 2) and translate it to a deploy decision (clean / warnings / blockers). +- Quote the scanner's pattern ID and severity verbatim instead of paraphrasing. +- If the scanner was run with `--json`, parse the JSON and present the findings as a small table rather than dumping the raw payload. +- Never silently suppress info-level findings — `ENV001` in particular needs to be surfaced even when other findings are zero, because it routes the user to a different remediation path. diff --git a/skills/databricks-serverless-storage-check/references/pattern-catalog.md b/skills/databricks-serverless-storage-check/references/pattern-catalog.md new file mode 100644 index 0000000..78a34fd --- /dev/null +++ b/skills/databricks-serverless-storage-check/references/pattern-catalog.md @@ -0,0 +1,198 @@ +# Pattern Catalog + +All detection rules used by `scripts/preflight.py`.
Each pattern has a stable ID, a severity, a description of what it matches, an example that triggers it, the recommended fix, and the underlying detection rule. + +The preflight scanner is intentionally conservative — false-negatives on heavily dynamic code (paths built from many variables at runtime) are expected. False positives should be rare. If you hit one, the scanner accepts the finding being ignored at the call site; please file an issue with a minimal repro. + +## Severity scale + +| Severity | Meaning | +|----------|---------| +| **Blocker** | The job WILL fail on serverless under realistic execution. Must fix before deploying. Contributes to exit code `2`. | +| **Warning** | The job is likely to fail under parallel execution or fan-out. Should fix. Contributes to exit code `1`. | +| **Info** | Awareness-only or escalation routing (e.g. env-sync error). Does not affect exit code. | + +## Patterns + +### FANOUT001 — Local-disk path passed to a child call + +**Severity**: Blocker + +**What it matches**: A string literal (or a variable bound to a string literal) starting with `/local_disk0`, `/tmp`, `/dbfs/tmp`, or containing `trustedTemp` is passed as an argument to one of: + +- `dbutils.notebook.run(notebook, timeout, args)` +- `dbutils.jobs.taskValues.set(key=..., value=path)` +- `dbutils.task_values.set(...)` (legacy spelling) + +The path may be passed directly, inside a dict literal (`{"handoff_path": tmp}`), or inside a list/tuple/set literal. + +**Example that triggers it**: + +```python +tmp = "/local_disk0/scratch/output.parquet" +df.write.parquet(tmp) +dbutils.notebook.run("./child", 600, {"handoff_path": tmp}) +``` + +**Fix**: + +```python +import uuid +handoff = f"/Volumes/main/analytics/handoffs/{uuid.uuid4()}.parquet" +df.write.parquet(handoff) +dbutils.notebook.run("./child", 600, {"handoff_path": handoff}) +``` + +**Detection rule**: AST visitor on `ast.Call`. 
`_call_qualname` matches `dbutils.notebook.run`, `dbutils.jobs.taskValues.set`, or `dbutils.task_values.set`. `_string_args` resolves Name nodes via the cell's variable map and recurses into dict/list literals to find string values. Each resolved string is tested with `is_local_disk_path()`. + +### FANOUT002 — Child notebook reads from a local-disk path + +**Severity**: Blocker + +**What it matches**: A notebook that pulls parameters via `dbutils.widgets.get` or `dbutils.jobs.taskValues.get` (suggesting it's a child) also performs a read (`open(path)`, `pd.read_*`, `spark.read.*`) from a `/local_disk0`, `/tmp`, or `trustedTemp` path. + +**Example that triggers it**: + +```python +dbutils.widgets.text("handoff_path", "") +path = dbutils.widgets.get("handoff_path") +df = pd.read_parquet("/local_disk0/scratch/input.parquet") +``` + +**Fix**: The parent must write to durable storage (`/Volumes` or `/Workspace`), and the child must read from the same. Pass the durable path via the parameter. + +**Detection rule**: The scanner flags a cell as `is_likely_child` if any cell in the notebook uses `dbutils.widgets.get`, `dbutils.jobs.taskValues.get`, or `dbutils.task_values.get`. In a likely-child notebook, any read target (resolved via `_read_targets_in_cell`) matching `is_local_disk_path()` triggers this finding. + +### FANOUT003 — Sibling tasks share a local-disk path + +**Severity**: Warning + +**What it matches**: A DAB job YAML defines two or more sibling tasks (or a task and one of its descendants) whose referenced notebooks both touch the same `/local_disk0`, `/tmp`, or `trustedTemp` path. 
+ +**Example that triggers it**: + +```yaml +resources: + jobs: + my_job: + tasks: + - task_key: producer + notebook_task: + notebook_path: ./producer.py + - task_key: consumer + depends_on: [{ task_key: producer }] + notebook_task: + notebook_path: ./consumer.py +``` + +```python +shared = "/tmp/foo.parquet" +pd.DataFrame({"x": [1]}).to_parquet(shared) +``` + +```python +shared = "/tmp/foo.parquet" +df = pd.read_parquet(shared) +``` + +**Fix**: Move the shared artifact to `/Volumes/...` or `/Workspace/...` and update both notebooks. + +**Detection rule**: `scan_job_yaml` resolves each task's `notebook_path`, runs the per-cell scanner on every referenced notebook, and collects the set of local-disk paths each notebook touches (writes, reads, child-call args, or bare string literals). When two or more task keys overlap on the same path, the finding fires. + +### FANOUT004 — `pipeline_task` downstream of a local-temp-writing notebook + +**Severity**: Warning + +**What it matches**: A DAB task with `pipeline_task: ...` whose `depends_on` includes a `notebook_task` that writes to a local-disk path. + +**Example that triggers it**: + +```yaml +tasks: + - task_key: prep + notebook_task: + notebook_path: ./prep.py # writes to /tmp/staging.parquet + - task_key: run_pipeline + depends_on: [{ task_key: prep }] + pipeline_task: + pipeline_id: 12345 +``` + +**Fix**: Have `prep` write to a UC Volume that the pipeline ingests via Auto Loader or a streaming table, or materialize the prep output as a table the pipeline reads from. + +**Detection rule**: For each task with `is_pipeline_task == True`, if any upstream `depends_on` task's notebook contains a local-disk write (recorded in `notebook_local_paths`), the finding fires. + +### FANOUT005 — `dbutils.fs.cp` from local path to local path + +**Severity**: Info + +**What it matches**: A `dbutils.fs.cp(src, dst)` or `dbutils.fs.mv(src, dst)` call where both arguments resolve to local-disk paths. 
+ +**Example that triggers it**: + +```python +dbutils.fs.cp("/local_disk0/staging/x.parquet", "/tmp/cache/x.parquet") +``` + +**Fix**: Safe within a single task. If the notebook is invoked by a multi-task job, change one side to `/Volumes/...` or `/Workspace/...` so the destination is visible to other tasks. + +**Detection rule**: AST visitor matches `dbutils.fs.cp` and `dbutils.fs.mv` calls with two string args that both pass `is_local_disk_path()`. + +### FANOUT006 — Hardcoded BSI trustedTemp signature + +**Severity**: Blocker + +**What it matches**: Any string anywhere in the source that matches the regex `/local_disk0/spark-[A-Za-z0-9\-]+/trustedTemp[A-Za-z0-9\-]*`. This is the exact path family that produced the original BSI failure: + +``` +/local_disk0/spark-d6bae111-42bd-4f54-9136-a4e9fbdec3d6/trustedTemp-55adadbe-d9ed-4278-a751-868797c1562f/tmpc58fz4pv +``` + +**Example that triggers it**: + +```python +tmp = "/local_disk0/spark-abc/trustedTemp-def/handoff.parquet" +``` + +**Fix**: Never hardcode a `trustedTemp` path. The full path is a runtime-internal Spark scratch location; if you depend on it from another task, the path will exist on a different node from where you wrote it. Use `/Volumes/...` or `/Workspace/...` for any cross-task data. + +**Detection rule**: A tree-walk over every string `Constant` node in every cell tests `is_bsi_signature()` (which uses the `BSI_TRUSTED_TEMP_RE` regex). Triggers regardless of whether the string is in an assignment, a call arg, or a free expression. + +### ENV001 — `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` in run output + +**Severity**: Info + +**What it matches**: `--run-id` mode only. The run's error trace contains `ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT` (often accompanied by "Virtual environment changed while syncing"). + +**Why this is info-only**: Per the BSI thread (Philip Nord), this error is a rare, platform-side intermittent issue. 
There is no customer-side fix this skill can apply. The scanner emits this finding to route the user to escalation rather than mislead them into a code change. + +**Fix**: + +1. Open a Databricks engineering support ticket (use `/jira-actions` or `/support-escalation`) with the run ID and error trace. +2. As a mitigation, reduce dependency setup during child notebook startup. Move heavy `%pip install` into the parent or into a job-level environment spec where possible. +3. Add task retries — the error is usually transient and the next run typically succeeds. + +**Detection rule**: `--run-id` mode shells out to `databricks jobs get-run-output` and tests the combined `error` + `error_trace` text against `ENV_SYNC_RE`. + +## False-positive escape hatch + +If a finding is genuinely safe in your workload (rare, but possible — e.g. you have a single-task notebook where `/local_disk0` use is fine), the simplest mitigation is to wrap the path construction so the literal doesn't appear in source: + +```python +# Hidden from the static scanner; only do this when you've verified the +# context is genuinely single-task. +import os +LOCAL_SCRATCH = os.environ.get("LOCAL_SCRATCH_ROOT", "/local_disk0/tmp") +``` + +The scanner does not resolve `os.environ.get()`, so paths constructed this way are skipped. Prefer fixing the antipattern when possible; this is an explicit opt-out, not a recommendation. + +## Adding a new pattern + +To add a new detection rule: + +1. Add the rule logic to `_NotebookScanner` (cell-scoped) or `scan_job_yaml` (DAB-scoped) in `scripts/preflight.py`. +2. Append a new entry to this catalog with: ID (next `FANOUT###` or topical prefix), severity, what-it-matches, example, fix, detection rule. +3. Add a unit test in `scripts/test_preflight.py` that exercises a triggering fixture and asserts the expected finding. +4. Update the summary table in `SKILL.md`. +5. Run `python3 scripts/test_preflight.py` — must still pass cleanly. 
diff --git a/skills/databricks-serverless-storage-check/references/remediation-guide.md b/skills/databricks-serverless-storage-check/references/remediation-guide.md new file mode 100644 index 0000000..515d4dd --- /dev/null +++ b/skills/databricks-serverless-storage-check/references/remediation-guide.md @@ -0,0 +1,272 @@ +# Remediation Guide + +Concrete before/after patterns for fixing the antipatterns flagged by `scripts/preflight.py`. Choose the fix that matches your handoff payload size and governance requirements. + +## Decision tree + +``` +What is the handoff? + + Small scalar or JSON (< ~48 KB per value) + → use dbutils.jobs.taskValues (no file at all) + + A file + Need UC governance / large files / Delta tables? + → /Volumes/<catalog>/<schema>/<volume>/handoff/... (PREFERRED) + Smaller files, no UC required, simpler permissions? + → /Workspace/Shared/<job>/handoff/... (FALLBACK) + + Same-task scratch only + → /local_disk0/tmp/... is FINE (and recommended) +``` + +## Fix 1 — UC Volumes handoff (preferred) + +Use a Volume for any cross-task file. Volumes are durable, cross-node, UC-governed, and work for any file size.
+ +### Setup (one-time, per workload) + +```sql +CREATE VOLUME IF NOT EXISTS main.analytics.job_handoffs; +GRANT WRITE VOLUME ON VOLUME main.analytics.job_handoffs TO `<workspace-user-or-sp>`; +GRANT READ VOLUME ON VOLUME main.analytics.job_handoffs TO `<workspace-user-or-sp>`; +``` + +### Before — broken (FANOUT001 + FANOUT006) + +```python +import pandas as pd + +tmp = "/local_disk0/spark-abc/trustedTemp-def/handoff.parquet" +pd.DataFrame({"x": [1, 2, 3]}).to_parquet(tmp) + +dbutils.notebook.run("./child", 600, {"handoff_path": tmp}) +``` + +### After — durable Volumes handoff + +```python +import pandas as pd + +run_id = dbutils.notebook.entry_point.getDbutils().notebook().getContext().jobId().get() +handoff = f"/Volumes/main/analytics/job_handoffs/run_{run_id}/data.parquet" + +dbutils.fs.mkdirs(f"/Volumes/main/analytics/job_handoffs/run_{run_id}") +pd.DataFrame({"x": [1, 2, 3]}).to_parquet(handoff) + +dbutils.notebook.run("./child", 600, {"handoff_path": handoff}) +``` + +### Cleanup (optional, end-of-job task) + +```python +# Remove this run's handoff directory at the end of the job +import shutil +run_dir = f"/Volumes/main/analytics/job_handoffs/run_{run_id}" +shutil.rmtree(run_dir, ignore_errors=True) +``` + +### Permission notes + +- The job's run-as identity needs `WRITE VOLUME` on the producing side and `READ VOLUME` on the consuming side. +- For ad-hoc development from a notebook, the calling user needs the same grants. +- Volume paths are accessible from Python (`open`, `pd.read_*`, `shutil.*`), Spark (`spark.read.*`), and shell commands (`cat`, `ls`). +- Lifecycle: Volumes persist until you delete them. Plan for cleanup if your job produces many handoff directories. + +## Fix 2 — `/Workspace` handoff (fallback) + +Use `/Workspace` files when UC is not available or when the file is small and ephemeral. Files written under `/Workspace` are durable and visible across nodes, but subject to workspace storage quotas and not designed for high-throughput I/O.
+ +### Before — broken + +```python +import json + +tmp = "/tmp/config.json" +with open(tmp, "w") as f: + json.dump({"feature_flags": ["a", "b"]}, f) + +dbutils.notebook.run("./apply_config", 600, {"config_path": tmp}) +``` + +### After — Workspace handoff + +```python +import json, os + +handoff_dir = "/Workspace/Shared/my_job/handoff" +os.makedirs(handoff_dir, exist_ok=True) +handoff = f"{handoff_dir}/config.json" + +with open(handoff, "w") as f: + json.dump({"feature_flags": ["a", "b"]}, f) + +dbutils.notebook.run("./apply_config", 600, {"config_path": handoff}) +``` + +### Permission notes + +- The run-as identity needs **CAN_EDIT** on `/Workspace/Shared/my_job/` (or whichever folder you write to). +- `/Workspace` is workspace-scoped: the same path is **not** visible from a different workspace. +- Keep files under `/Workspace` modest in size (megabytes, not gigabytes). For large data, use Volumes. + +## Fix 3 — `dbutils.jobs.taskValues` for small payloads + +If the handoff is a scalar, a small dict, or a small JSON blob, skip the file entirely. Task values are designed for this and avoid all the storage concerns. 
+ +### Before — broken + +```python +# In parent task +import json +status = {"records_processed": 12345, "skipped": 2} +with open("/tmp/status.json", "w") as f: + json.dump(status, f) +``` + +```python +# In child task +import json +with open("/tmp/status.json") as f: # FANOUT002: child reads from /tmp + status = json.load(f) +``` + +### After — taskValues handoff + +```python +# In parent task +dbutils.jobs.taskValues.set(key="records_processed", value=12345) +dbutils.jobs.taskValues.set(key="status", value={"records_processed": 12345, "skipped": 2}) +``` + +```python +# In child task — referencing the parent task by key +records = dbutils.jobs.taskValues.get( + taskKey="parent_task", + key="records_processed", + debugValue=0, +) +status = dbutils.jobs.taskValues.get( + taskKey="parent_task", + key="status", + debugValue={}, +) +``` + +### Limits + +- Per-task-value: 48 KB serialized JSON +- Per-run total across all task values: 5 MB +- Types: any JSON-serializable Python value (str, int, float, bool, list, dict, None) +- The `debugValue` is required and is used when running the notebook interactively (outside a job) + +## Fix 4 — `pipeline_task` downstream of a notebook (FANOUT004) + +When a pipeline task depends on a notebook task, don't try to hand off via a local-disk path. The pipeline runs in its own context. 
+ +### Before — broken + +```yaml +tasks: + - task_key: prep + notebook_task: + notebook_path: ./prep_data.py # writes /tmp/staging.parquet + - task_key: run_pipeline + depends_on: [{ task_key: prep }] + pipeline_task: + pipeline_id: 12345 # tries to read /tmp/staging.parquet +``` + +### After — Volumes-based handoff + +Update the notebook: + +```python +# prep_data.py +dest = "/Volumes/main/raw/staging/run_42/data.parquet" +df.write.format("parquet").mode("overwrite").save(dest) +``` + +Update the pipeline to read from the volume: + +```python +# In the pipeline notebook (DLT / SDP) +import dlt + +@dlt.table +def staging(): + return spark.read.format("parquet").load("/Volumes/main/raw/staging/run_42/data.parquet") +``` + +For incremental ingest, prefer Auto Loader over a single-path read: + +```python +@dlt.table +def staging(): + return ( + spark.readStream.format("cloudFiles") + .option("cloudFiles.format", "parquet") + .load("/Volumes/main/raw/staging/") + ) +``` + +## What NOT to do — anti-examples + +These are the exact patterns the scanner exists to catch. Do not use any of them for cross-task data. 
+ +### Anti-pattern 1: parent writes to trustedTemp, child reads + +```python +# Parent +tmp = "/local_disk0/spark-abc/trustedTemp-def/handoff.parquet" # FANOUT006 +df.write.parquet(tmp) # writes to local node only +dbutils.notebook.run("./child", 600, {"handoff_path": tmp}) # FANOUT001 +``` + +```python +# Child +path = dbutils.widgets.get("handoff_path") +df = pd.read_parquet(path) # FANOUT002 — child likely runs on a different node +``` + +### Anti-pattern 2: sibling tasks share `/tmp` + +```yaml +tasks: + - task_key: producer + notebook_task: { notebook_path: ./producer.py } # writes /tmp/foo.parquet + - task_key: consumer + depends_on: [{ task_key: producer }] + notebook_task: { notebook_path: ./consumer.py } # reads /tmp/foo.parquet — FANOUT003 +``` + +### Anti-pattern 3: cleanup that depends on local state across tasks + +```python +# Final cleanup task +import shutil +shutil.rmtree("/local_disk0/scratch/") # only cleans this node; other nodes are untouched +``` + +The "cleanup" task may run on a node that never saw the scratch directory. Either move scratch to `/Volumes` and clean that, or skip the cleanup task entirely (local disk is reclaimed when the task ends). + +## When `/local_disk0/tmp` IS fine + +For completeness: local-disk paths are correct, and recommended, for **per-task scratch** that doesn't outlive the task. + +```python +# OK on serverless: temporary intermediate inside a single task +scratch = "/local_disk0/tmp/intermediate.parquet" +df.write.parquet(scratch) +# ... use scratch later in the SAME task ... +post = spark.read.parquet(scratch) +``` + +The boundary is: does another task — child notebook, sibling task, pipeline — need to read this file? If yes, it must live on `/Volumes` or `/Workspace`. If no, `/local_disk0/tmp` is the right answer. 
+ +## Reference + +- [Unity Catalog volumes overview](https://docs.databricks.com/en/connect/unity-catalog/volumes.html) +- [Workspace files](https://docs.databricks.com/en/files/workspace.html) +- [`dbutils.jobs.taskValues`](https://docs.databricks.com/en/dev-tools/databricks-utils.html#task-values-utility-dbutilsjobstaskvalues) +- [Serverless compute limitations](https://docs.databricks.com/en/compute/serverless/limitations) diff --git a/skills/databricks-serverless-storage-check/scripts/preflight.py b/skills/databricks-serverless-storage-check/scripts/preflight.py new file mode 100644 index 0000000..7d6b64d --- /dev/null +++ b/skills/databricks-serverless-storage-check/scripts/preflight.py @@ -0,0 +1,1240 @@ +#!/usr/bin/env python3 +"""Serverless storage preflight: detect cross-task local-disk handoffs. + +Scans Databricks notebooks, directories, DAB job YAML, or remote jobs/runs +for the antipattern where one task writes to /local_disk0, /tmp, or a +trustedTemp directory and another task reads from it. On serverless +compute, tasks may run on different nodes, so these handoffs fail with +`INTERNAL_ERROR: [Errno 13] Permission denied`. + +Stdlib only. Optional `databricks` CLI for --job-id / --run-id modes. 
+ +Usage: + preflight.py --notebook PATH [--json] + preflight.py --dir PATH [--json] + preflight.py --job-yaml PATH [--json] + preflight.py --job-id ID --profile NAME [--json] + preflight.py --run-id ID --profile NAME [--json] + +Exit codes: + 0 clean (or info-only findings) + 1 warnings found + 2 blockers found +""" + +from __future__ import annotations + +import argparse +import ast +import json +import re +import subprocess +import sys +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Iterable, Iterator + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Local-disk path roots that are unsafe for cross-task sharing on serverless. +LOCAL_DISK_PREFIXES = ( + "/local_disk0", + "/tmp", + "/dbfs/tmp", + "dbfs:/tmp", +) + +# Exact BSI signature: /local_disk0/spark-/trustedTemp-/... +BSI_TRUSTED_TEMP_RE = re.compile( + r"/local_disk0/spark-[A-Za-z0-9\-]+/trustedTemp[A-Za-z0-9\-]*" +) + +# Generic trustedTemp anywhere in the path. +TRUSTED_TEMP_RE = re.compile(r"trustedTemp[A-Za-z0-9\-]*") + +# Durable, cross-node storage roots. Paths starting with these are safe. +SAFE_PREFIXES = ("/Volumes/", "/Workspace/") + +# Calls that move data to a child task / sibling. If a local-disk path +# flows into one of these, that's a cross-task handoff. +CHILD_CALL_NAMES = { + "dbutils.notebook.run", + "dbutils.jobs.taskValues.set", + "dbutils.task_values.set", +} + +# Calls that pull from a parent task. If the value is then used as a path +# starting with /local_disk0 or /tmp, the parent must have written it there. +PARENT_PULL_NAMES = { + "dbutils.widgets.get", + "dbutils.jobs.taskValues.get", + "dbutils.task_values.get", +} + +# Env-sync error signature (run-id mode only). 
ENV_SYNC_RE = re.compile(
    r"ENVIRONMENT_SETUP_ERROR\.PYTHON_NOTEBOOK_ENVIRONMENT"
)

# Databricks notebook cell delimiter for .py source format. Trailing
# whitespace is restricted to non-newline characters: a plain `\s*` also
# swallows the newline that ends the delimiter line whenever a blank line
# follows, which made every later cell report line numbers one too low.
PY_CELL_DELIM_RE = re.compile(r"^# COMMAND -+[^\S\n]*$", re.MULTILINE)
PY_MAGIC_RE = re.compile(r"^# MAGIC %(\w+)\s*(.*)$", re.MULTILINE)

# Prefix Databricks puts in front of every line of a magic cell.
_MAGIC_PREFIX_RE = re.compile(r"^# MAGIC ?")
# `%python` / `%py` directive left over once the `# MAGIC ` prefix is removed.
_PYTHON_DIRECTIVE_RE = re.compile(r"^%(?:python|py)\b[^\S\n]*")

# Attribute names that tell reads from writes in Spark-style call chains
# (`spark.read.parquet(...)` vs `df.write.parquet(...)`).
_READ_MARKERS = frozenset({"read", "readStream"})
_WRITE_MARKERS = frozenset({"write", "writeStream"})

# Trailing `# comment` on a YAML line (only used by the stdlib fallback parser).
_YAML_COMMENT_RE = re.compile(r"(?:^|\s)#.*$")

# "Permission denied" on a local-disk path inside a run's error trace.
_PERMISSION_DENIED_RE = re.compile(
    r"Permission denied:\s*['\"]?(/local_disk0[^'\"\s]*|/tmp/[^'\"\s]*)"
)


# ---------------------------------------------------------------------------
# Finding model
# ---------------------------------------------------------------------------

SEVERITY_BLOCKER = "blocker"
SEVERITY_WARNING = "warning"
SEVERITY_INFO = "info"

SEVERITY_ORDER = {
    SEVERITY_BLOCKER: 2,
    SEVERITY_WARNING: 1,
    SEVERITY_INFO: 0,
}


@dataclass
class Finding:
    """One scanner result: which pattern fired, where, and how to fix it."""

    pattern_id: str
    severity: str
    file: str
    line: int  # 0 when the finding is not tied to a source line
    snippet: str
    message: str
    fix: str

    def to_dict(self) -> dict:
        """Return the finding as a plain dict (used for --json output)."""
        return asdict(self)


# ---------------------------------------------------------------------------
# Path classification helpers
# ---------------------------------------------------------------------------


def is_local_disk_path(value: str) -> bool:
    """True if the string looks like a local-disk path on Databricks compute.

    A value qualifies when it contains a trustedTemp component anywhere, or
    when it equals / is nested under one of LOCAL_DISK_PREFIXES.
    """
    if not isinstance(value, str) or not value:
        return False
    if TRUSTED_TEMP_RE.search(value):
        return True
    return any(
        value == prefix or value.startswith(prefix + "/")
        for prefix in LOCAL_DISK_PREFIXES
    )


def is_bsi_signature(value: str) -> bool:
    """True if the string matches the exact BSI trustedTemp signature."""
    return bool(isinstance(value, str) and BSI_TRUSTED_TEMP_RE.search(value))


def is_safe_path(value: str) -> bool:
    """True if the string is a durable, cross-node storage path."""
    return isinstance(value, str) and any(
        value.startswith(p) for p in SAFE_PREFIXES
    )


# ---------------------------------------------------------------------------
# Notebook source extraction
# ---------------------------------------------------------------------------


@dataclass
class PythonCell:
    """A Python code block extracted from a notebook, with its source offset."""

    code: str
    start_line: int  # 1-indexed line in the original file


def extract_python_cells(file_path: Path) -> list[PythonCell]:
    """Return Python code cells from a .py or .ipynb notebook.

    For .py (Databricks source format), splits on `# COMMAND -----` and
    keeps only cells that are Python (no leading `# MAGIC %sql/%scala/%r`).
    For .ipynb, returns cells with `cell_type == "code"`. Cells that start
    with a non-Python magic (`%sql`, `%pip`, ...) are skipped.
    """
    suffix = file_path.suffix.lower()
    text = file_path.read_text(encoding="utf-8", errors="replace")

    if suffix == ".ipynb":
        return _extract_ipynb_cells(text)
    return _extract_py_cells(text)


def _extract_py_cells(text: str) -> list[PythonCell]:
    """Split Databricks .py source into Python cells with accurate line offsets.

    `start_line` is the 1-indexed file line of the first non-blank-prefixed
    line of the cell, so `start_line + ast_lineno - 1` is the real file line.
    """
    cells: list[PythonCell] = []
    line = 1  # file line on which the current part starts
    for part in PY_CELL_DELIM_RE.split(text):
        stripped = part.lstrip("\n")
        leading = len(part) - len(stripped)
        # Skip cells whose first non-blank line is a magic that isn't %python.
        first_nonblank = next(
            (ln for ln in stripped.splitlines() if ln.strip()),
            "",
        )
        magic = PY_MAGIC_RE.match(first_nonblank)
        is_foreign_magic = bool(magic) and magic.group(1) not in ("python", "py")
        if not is_foreign_magic:
            # Strip the `# MAGIC ` prefix and blank out the `%python`
            # directive (keeping the line so numbering stays aligned);
            # otherwise ast.parse rejects the cell and it is never scanned.
            cleaned = "\n".join(
                _PYTHON_DIRECTIVE_RE.sub("", _MAGIC_PREFIX_RE.sub("", ln))
                for ln in stripped.splitlines()
            )
            if cleaned.strip():
                cells.append(PythonCell(code=cleaned, start_line=line + leading))
        line += part.count("\n")
    return cells


def _extract_ipynb_cells(text: str) -> list[PythonCell]:
    """Extract Python code cells from .ipynb JSON.

    Jupyter files have no meaningful file line for a cell, so `start_line`
    is a synthetic running counter over cell sources. Returns [] when the
    text is not valid JSON or not a notebook-shaped object.
    """
    try:
        nb = json.loads(text)
    except json.JSONDecodeError:
        return []
    if not isinstance(nb, dict):
        return []
    raw_cells = nb.get("cells", [])
    if not isinstance(raw_cells, list):
        return []

    cells: list[PythonCell] = []
    synthetic_line = 1
    for cell in raw_cells:
        if not isinstance(cell, dict) or cell.get("cell_type") != "code":
            synthetic_line += 1
            continue
        source = cell.get("source", "")
        if isinstance(source, list):
            source = "".join(source)
        first_nonblank = next(
            (ln for ln in source.splitlines() if ln.strip()),
            "",
        )
        if first_nonblank.startswith("%") and not first_nonblank.startswith(
            "%python"
        ):
            synthetic_line += source.count("\n") + 1
            continue
        cleaned = "\n".join(
            ln[len("%python") :] if ln.startswith("%python") else ln
            for ln in source.splitlines()
        )
        if cleaned.strip():
            cells.append(PythonCell(code=cleaned, start_line=synthetic_line))
        synthetic_line += source.count("\n") + 1
    return cells


# ---------------------------------------------------------------------------
# AST visitor
# ---------------------------------------------------------------------------


def _attr_chain(node: ast.AST) -> str | None:
    """Return a dotted name for an attribute chain like a.b.c, else None.

    Intermediate calls are looked through, so the callee of
    `df.write.format("parquet").save(p)` is reported as
    `df.write.format.save`. Without this, builder-style Spark reads and
    writes (the most common form) had no name and were never inspected.
    Returns None when the chain is not rooted in a plain name.
    """
    parts: list[str] = []
    cur = node
    while True:
        if isinstance(cur, ast.Attribute):
            parts.append(cur.attr)
            cur = cur.value
        elif isinstance(cur, ast.Call):
            cur = cur.func
        else:
            break
    if isinstance(cur, ast.Name):
        parts.append(cur.id)
        return ".".join(reversed(parts))
    return None


def _call_qualname(node: ast.Call) -> str | None:
    """Return a dotted callable name like `dbutils.notebook.run`, else None."""
    if isinstance(node.func, ast.Attribute):
        return _attr_chain(node.func)
    if isinstance(node.func, ast.Name):
        return node.func.id
    return None


def _io_direction(segments: list[str]) -> str | None:
    """Return "read"/"write" from the marker closest to the final call name.

    `segments` is a dotted call name split on "."; the last element (the
    method itself) is ignored. Returns None when no marker is present.
    """
    for seg in reversed(segments[:-1]):
        if seg in _READ_MARKERS:
            return "read"
        if seg in _WRITE_MARKERS:
            return "write"
    return None


def _open_mode(node: ast.Call) -> str:
    """Return the literal mode of an `open(...)` call.

    Looks at the second positional argument, then at `mode=`. Falls back to
    "r" (open's default) when the mode is absent or not a string literal.
    """
    candidates: list[ast.AST] = []
    if len(node.args) >= 2:
        candidates.append(node.args[1])
    candidates.extend(kw.value for kw in node.keywords if kw.arg == "mode")
    for cand in candidates:
        if isinstance(cand, ast.Constant) and isinstance(cand.value, str):
            return cand.value
    return "r"


def _resolve_string(
    node: ast.AST, var_map: dict[str, str]
) -> str | None:
    """Return a string value for a constant or a Name bound to a constant."""
    if isinstance(node, ast.Constant) and isinstance(node.value, str):
        return node.value
    if isinstance(node, ast.Name) and node.id in var_map:
        return var_map[node.id]
    return None


def _string_args(
    node: ast.Call, var_map: dict[str, str] | None = None
) -> list[tuple[str, int]]:
    """Return (value, line) for every string positional/keyword arg.

    Resolves Name nodes against `var_map` (string assignments in the cell)
    and recurses into dict/list/tuple/set literals so paths passed via
    `{"k": tmp}` or `[tmp]` are still detected.
    """
    vm = var_map or {}
    out: list[tuple[str, int]] = []

    def _collect(value_node: ast.AST, lineno: int) -> None:
        s = _resolve_string(value_node, vm)
        if s is not None:
            out.append((s, lineno))
            return
        if isinstance(value_node, (ast.List, ast.Tuple, ast.Set)):
            for elt in value_node.elts:
                _collect(elt, getattr(elt, "lineno", lineno))
        elif isinstance(value_node, ast.Dict):
            for k, v in zip(value_node.keys, value_node.values):
                # `k` is None for `**spread` entries.
                if k is not None:
                    _collect(k, getattr(k, "lineno", lineno))
                if v is not None:
                    _collect(v, getattr(v, "lineno", lineno))

    for arg in node.args:
        _collect(arg, arg.lineno)
    for kw in node.keywords:
        if kw.value is not None:
            _collect(kw.value, kw.value.lineno)
    return out


def _build_var_map(tree: ast.AST) -> dict[str, str]:
    """Build name -> string-literal map from top-level and nested Assigns.

    Flow-insensitive: when a name is assigned several literals, the one
    visited last by `ast.walk` wins.
    """
    out: dict[str, str] = {}
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign):
            continue
        if not (
            isinstance(node.value, ast.Constant) and isinstance(node.value.value, str)
        ):
            continue
        for target in node.targets:
            if isinstance(target, ast.Name):
                out[target.id] = node.value.value
    return out


def _all_string_constants(tree: ast.AST) -> Iterator[tuple[str, int]]:
    """Yield (value, lineno) for every string Constant anywhere in the tree."""
    for node in ast.walk(tree):
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            yield node.value, node.lineno


class _NotebookScanner(ast.NodeVisitor):
    """Collects local-disk writes, child calls, and parent pulls in a cell."""

    def __init__(self, cell: PythonCell, file_path: str):
        self.cell = cell
        self.file = file_path
        self.var_map: dict[str, str] = {}
        self.local_writes: list[tuple[str, int, str]] = []  # (path, line, snippet)
        self.child_calls: list[tuple[str, int, str, str]] = []  # (path, line, snippet, callname)
        self.parent_reads: list[tuple[str, int, str]] = []  # (path, line, snippet)
        self.fs_cp_local_to_local: list[tuple[str, str, int, str]] = []  # (src, dst, line, snippet)
        self.bsi_hits: list[tuple[str, int, str]] = []  # (path, line, snippet)
        self.all_local_paths: set[str] = set()

    # ---- entrypoint ----
    def scan(self) -> None:
        """Parse the cell and populate the collections; no-op on SyntaxError."""
        try:
            tree = ast.parse(self.cell.code)
        except SyntaxError:
            return
        self.var_map = _build_var_map(tree)
        # Walk every string constant in the cell once. This catches BSI
        # signatures bound to variables (e.g. `tmp = "/local_disk0/.../trustedTemp/..."`)
        # and seeds the "this cell touches these local paths" set used by
        # the DAB sibling-sharing analysis.
        for value, lineno in _all_string_constants(tree):
            if is_local_disk_path(value):
                self.all_local_paths.add(value)
            if is_bsi_signature(value):
                self.bsi_hits.append(
                    (value, self._real_line(lineno), self._snippet(lineno))
                )
        # Also include resolved variable values.
        for value in self.var_map.values():
            if is_local_disk_path(value):
                self.all_local_paths.add(value)
        self.visit(tree)

    # ---- helpers ----
    def _real_line(self, lineno: int) -> int:
        """Translate a cell-relative AST line number into a file line."""
        return self.cell.start_line + lineno - 1

    def _snippet(self, lineno: int) -> str:
        """Return the stripped source line at a cell-relative line number."""
        lines = self.cell.code.splitlines()
        if 1 <= lineno <= len(lines):
            return lines[lineno - 1].strip()
        return ""

    # ---- visitors ----
    def visit_Call(self, node: ast.Call) -> None:
        callname = _call_qualname(node)
        strings = _string_args(node, self.var_map)

        # Child calls (parent writes flowing out)
        if callname in CHILD_CALL_NAMES:
            for s, ln in strings:
                if is_local_disk_path(s):
                    self.child_calls.append(
                        (s, self._real_line(ln), self._snippet(ln), callname)
                    )

        # File writes to local-disk paths (open(..., "w"), pandas to_*, spark.write.*)
        write_path = _detect_write_target(node, callname, self.var_map)
        if write_path is not None:
            value, lineno = write_path
            if is_local_disk_path(value):
                self.local_writes.append(
                    (value, self._real_line(lineno), self._snippet(lineno))
                )
                self.all_local_paths.add(value)

        # dbutils.fs.cp / mv local-to-local (heuristic: first two string args)
        if callname in ("dbutils.fs.cp", "dbutils.fs.mv"):
            cp_strings = [s for s, _ in strings]
            if (
                len(cp_strings) >= 2
                and is_local_disk_path(cp_strings[0])
                and is_local_disk_path(cp_strings[1])
            ):
                ln = strings[0][1]
                self.fs_cp_local_to_local.append(
                    (
                        cp_strings[0],
                        cp_strings[1],
                        self._real_line(ln),
                        self._snippet(ln),
                    )
                )

        self.generic_visit(node)

    def visit_Assign(self, node: ast.Assign) -> None:
        # Placeholder for `x = dbutils.widgets.get("path"); open(x)` dataflow.
        # There is no dataflow tracking; reads of local-disk literals are
        # picked up by _read_targets_in_cell instead.
        self.generic_visit(node)


def _detect_write_target(
    node: ast.Call, callname: str | None, var_map: dict[str, str]
) -> tuple[str, int] | None:
    """Return (path_string, lineno) if the call writes to a path, else None.

    Resolves Name args via `var_map` so writes through a local variable
    (e.g. `tmp = "/local_disk0/..."; pd.DataFrame(...).to_parquet(tmp)`)
    are still detected. `lineno` is relative to the cell.
    """
    if callname is None:
        return None

    def _resolve(arg: ast.AST) -> str | None:
        return _resolve_string(arg, var_map)

    # open(path, "w"|"wb"|"a"|"x"|"r+"...), mode given positionally or by keyword
    if callname == "open" and node.args:
        mode = _open_mode(node)
        if any(c in mode for c in ("w", "a", "x", "+")):
            s = _resolve(node.args[0])
            if s is not None:
                return s, node.args[0].lineno

    # spark.write.* / DataFrame.write.* (heuristic: any call whose name ends
    # in .save / .saveAsTable / .parquet / .csv / .json / .text / .orc).
    # The format names double as reader methods, so a chain that goes
    # through `.read` / `.readStream` is excluded.
    write_terminals = {
        "save",
        "saveAsTable",
        "parquet",
        "csv",
        "json",
        "text",
        "orc",
    }
    segments = callname.split(".")
    last = segments[-1]
    if (
        last in write_terminals
        and node.args
        and _io_direction(segments) != "read"
    ):
        s = _resolve(node.args[0])
        if s is not None:
            return s, node.args[0].lineno

    # pandas: df.to_csv, df.to_parquet, df.to_json, df.to_pickle
    if last.startswith("to_") and node.args:
        s = _resolve(node.args[0])
        if s is not None:
            return s, node.args[0].lineno

    # shutil.copy / copyfile / move (dest is arg 1)
    if callname in ("shutil.copy", "shutil.copyfile", "shutil.move") and len(node.args) >= 2:
        s = _resolve(node.args[1])
        if s is not None:
            return s, node.args[1].lineno

    # dbutils.fs.put(path, contents, overwrite?)
    if callname == "dbutils.fs.put" and node.args:
        s = _resolve(node.args[0])
        if s is not None:
            return s, node.args[0].lineno

    return None


# ---------------------------------------------------------------------------
# Per-file analysis
# ---------------------------------------------------------------------------


def _read_targets_in_cell(scanner: _NotebookScanner) -> list[tuple[str, int, str]]:
    """Best-effort detection of reads from local-disk string literals.

    Catches open(path, "r"), pd.read_*, spark.read.* with string args.
    Resolves Name args via the scanner's var_map. Every detected path is
    also added to `scanner.all_local_paths`. Returns (path, file_line,
    snippet) tuples.
    """
    out: list[tuple[str, int, str]] = []
    try:
        tree = ast.parse(scanner.cell.code)
    except SyntaxError:
        return out

    read_terminals = {
        "parquet",
        "csv",
        "json",
        "text",
        "orc",
        "table",
        "load",
    }
    vm = scanner.var_map

    for node in ast.walk(tree):
        if not isinstance(node, ast.Call):
            continue
        callname = _call_qualname(node)
        if callname is None:
            continue
        segments = callname.split(".")
        last = segments[-1]

        if callname == "open":
            mode = _open_mode(node)
            is_read = "r" in mode and not any(c in mode for c in ("w", "a", "x"))
        elif last in read_terminals:
            # `df.write.parquet(p)` shares its terminal with the reader API;
            # only count it as a read when the chain is not a writer chain.
            is_read = _io_direction(segments) != "write"
        else:
            is_read = last.startswith("read_")
        if not is_read:
            continue

        # Only the first string argument is treated as the path.
        for s, ln in _string_args(node, vm)[:1]:
            if is_local_disk_path(s):
                out.append((s, scanner._real_line(ln), scanner._snippet(ln)))
                scanner.all_local_paths.add(s)
    return out


def scan_notebook(file_path: Path) -> list[Finding]:
    """Scan a single notebook and emit FANOUT findings.

    Emits FANOUT006 (BSI signature), FANOUT001 (local path passed to a
    child call), FANOUT002 (local read in a likely child notebook) and
    FANOUT005 (dbutils.fs.cp/mv between two local paths).
    """
    findings: list[Finding] = []
    rel = str(file_path)
    cells = extract_python_cells(file_path)

    cell_scanners: list[_NotebookScanner] = []
    for cell in cells:
        scanner = _NotebookScanner(cell, rel)
        scanner.scan()
        cell_scanners.append(scanner)

    # The caller is not visible statically, so a notebook is treated as a
    # likely child when any cell pulls a widget / task value. Computed once:
    # it does not depend on the cell being reported.
    is_likely_child = any(
        re.search(r"dbutils\.(widgets|jobs\.taskValues|task_values)\.get", c.code)
        for c in cells
    )

    for scanner in cell_scanners:
        # FANOUT006 — BSI signature (always blocker, regardless of context)
        for path, line, snippet in scanner.bsi_hits:
            findings.append(
                Finding(
                    pattern_id="FANOUT006",
                    severity=SEVERITY_BLOCKER,
                    file=rel,
                    line=line,
                    snippet=snippet,
                    message=(
                        f"Hardcoded path matches the exact BSI trustedTemp "
                        f"signature: {path!r}. This is a known-bad cross-node "
                        f"path on serverless."
                    ),
                    fix=(
                        "Replace with /Volumes/<catalog>/<schema>/<volume>/"
                        "handoff/<run_id>/... or /Workspace/Shared/<project>/...; "
                        "see references/remediation-guide.md."
                    ),
                )
            )

        # FANOUT001 — local-disk path passed to a child call
        for path, line, snippet, callname in scanner.child_calls:
            findings.append(
                Finding(
                    pattern_id="FANOUT001",
                    severity=SEVERITY_BLOCKER,
                    file=rel,
                    line=line,
                    snippet=snippet,
                    message=(
                        f"Local-disk path {path!r} passed to {callname}. "
                        f"Child tasks may run on a different node and will "
                        f"hit Permission denied."
                    ),
                    fix=(
                        "Write the handoff to /Volumes/<catalog>/<schema>/"
                        "<volume>/... or /Workspace/Shared/... and pass that "
                        "path instead. For small payloads, use "
                        "dbutils.jobs.taskValues with no file."
                    ),
                )
            )

        # FANOUT002 — local-disk read in a notebook that looks like a child
        if is_likely_child:
            for path, line, snippet in _read_targets_in_cell(scanner):
                findings.append(
                    Finding(
                        pattern_id="FANOUT002",
                        severity=SEVERITY_BLOCKER,
                        file=rel,
                        line=line,
                        snippet=snippet,
                        message=(
                            f"Child notebook reads from local-disk path "
                            f"{path!r}. On serverless, the parent task that "
                            f"wrote this file may have run on a different node."
                        ),
                        fix=(
                            "Have the parent write to /Volumes/... or "
                            "/Workspace/... and read from there. For scalars "
                            "and small JSON, use dbutils.jobs.taskValues."
                        ),
                    )
                )

        # FANOUT005 — dbutils.fs.cp local→local in a multi-task context (heuristic)
        for src, dst, line, snippet in scanner.fs_cp_local_to_local:
            findings.append(
                Finding(
                    pattern_id="FANOUT005",
                    severity=SEVERITY_INFO,
                    file=rel,
                    line=line,
                    snippet=snippet,
                    message=(
                        f"dbutils.fs.cp from {src!r} to {dst!r} — both on local "
                        f"disk. Safe within a single task only."
                    ),
                    fix=(
                        "If this notebook is invoked by a multi-task job, use "
                        "/Volumes/... or /Workspace/... for cross-task data."
                    ),
                )
            )

    return findings


def scan_path(target: Path) -> list[Finding]:
    """Scan a single notebook or a directory of notebooks.

    Only regular files ending in .py / .ipynb are scanned; anything else
    (including a target that does not exist) yields no findings.
    """
    findings: list[Finding] = []
    if target.is_file():
        if target.suffix.lower() in (".py", ".ipynb"):
            findings.extend(scan_notebook(target))
        return findings
    if target.is_dir():
        for path in sorted(target.rglob("*")):
            # is_file() guards against directories named like `pkg.py`.
            if path.suffix.lower() in (".py", ".ipynb") and path.is_file():
                findings.extend(scan_notebook(path))
    return findings


# ---------------------------------------------------------------------------
# DAB YAML analysis
# ---------------------------------------------------------------------------


def _try_load_yaml(text: str) -> dict | None:
    """Parse YAML with PyYAML when it is installed; None if unavailable or invalid.

    Note: `safe_load` can return non-dict values for non-mapping documents;
    callers must check the type.
    """
    try:
        import yaml  # type: ignore
    except ImportError:
        return None
    try:
        return yaml.safe_load(text)
    except Exception:
        return None


def _leading_spaces(line: str) -> int:
    """Count leading spaces. Treats a tab as 4 spaces (good enough for DABs)."""
    n = 0
    for ch in line:
        if ch == " ":
            n += 1
        elif ch == "\t":
            n += 4
        else:
            break
    return n


def _strip_yaml_comment(line: str) -> str:
    """Drop a trailing `# comment` and trailing whitespace from a YAML line.

    Heuristic: a `#` preceded by whitespace inside a quoted scalar is also
    treated as a comment. Acceptable for the fallback parser.
    """
    return _YAML_COMMENT_RE.sub("", line).rstrip()


def _minimal_yaml_tasks(text: str) -> list[dict]:
    """Stdlib-only fallback: extract a flat task list from a DAB YAML.

    Indent-aware. The top-level task indent is the column of the first
    `- task_key:` line under `tasks:`. Any subsequent `- task_key:` line
    at a DEEPER indent is treated as a depends_on entry, not a new task.

    Also understands trailing comments, sequences written at the same
    indent as their parent key, and the flow-style shorthands
    `depends_on: [{ task_key: x }]` and
    `notebook_task: { notebook_path: ./nb.py }`.
    """
    tasks: list[dict] = []
    in_tasks = False
    tasks_indent: int | None = None  # indent of `tasks:` keyword
    task_item_indent: int | None = None  # indent of `- task_key:` lines
    cur: dict | None = None
    in_depends = False
    depends_indent: int | None = None

    for raw in text.splitlines():
        line = _strip_yaml_comment(raw)
        if not line.strip():
            continue
        indent = _leading_spaces(line)
        is_item = line.lstrip().startswith("-")

        # `tasks:` declaration
        if re.match(r"^\s*tasks\s*:\s*$", line):
            if cur is not None:
                tasks.append(cur)
            in_tasks = True
            tasks_indent = indent
            task_item_indent = None
            cur = None
            in_depends = False
            continue

        if not in_tasks:
            continue

        # Left the tasks: block. A `- item` at the same indent as `tasks:`
        # is still inside it (YAML allows non-indented sequences).
        if tasks_indent is not None and (
            indent < tasks_indent or (indent == tasks_indent and not is_item)
        ):
            if cur is not None:
                tasks.append(cur)
                cur = None
            in_tasks = False
            in_depends = False
            continue

        # New top-level task entry
        m = re.match(r"^(\s*)-\s*task_key\s*:\s*(\S+)\s*$", line)
        if m and (task_item_indent is None or indent == task_item_indent):
            if cur is not None:
                tasks.append(cur)
            task_item_indent = indent
            cur = {"task_key": m.group(2).strip("\"'"), "depends_on": []}
            in_depends = False
            continue

        if cur is None:
            continue

        # Enter depends_on (block style) or consume it inline (flow style)
        m = re.match(r"^\s*depends_on\s*:\s*(.*)$", line)
        if m:
            inline = m.group(1).strip()
            if inline:
                cur["depends_on"].extend(
                    key.strip("\"'")
                    for key in re.findall(r"task_key\s*:\s*([^\s,}\]]+)", inline)
                )
                in_depends = False
            else:
                in_depends = True
                depends_indent = indent
            continue
        if (
            in_depends
            and depends_indent is not None
            and (indent < depends_indent or (indent == depends_indent and not is_item))
        ):
            in_depends = False

        # depends_on entries: `- task_key: X`
        if in_depends:
            m = re.match(r"^\s*-\s*task_key\s*:\s*(\S+)\s*$", line)
            if m:
                cur["depends_on"].append(m.group(1).strip("\"'"))
            continue

        # Task-level keys (block or flow style)
        m = re.search(r"(?:^|[\s{,])notebook_path\s*:\s*([^\s,}]+)", line)
        if m:
            cur["notebook_path"] = m.group(1).strip("\"'")
        m = re.search(r"(?:^|[\s{,])pipeline_id\s*:\s*([^\s,}]+)", line)
        if m:
            cur["pipeline_id"] = m.group(1)
        if re.match(r"^\s*pipeline_task\s*:", line):
            cur["is_pipeline_task"] = True

    if cur is not None:
        tasks.append(cur)
    return tasks


def _tasks_from_loaded(doc: dict) -> list[dict]:
    """Extract task dicts from a loaded DAB YAML doc.

    Reads `resources.jobs.*.tasks`; a document with a top-level `tasks`
    list (a bare job definition) is treated as a single job as well.
    """
    out: list[dict] = []
    if not isinstance(doc, dict):
        return out

    job_defs: list = []
    resources = doc.get("resources") or {}
    jobs = (resources.get("jobs") or {}) if isinstance(resources, dict) else {}
    if isinstance(jobs, dict):
        job_defs.extend(jobs.values())
    if isinstance(doc.get("tasks"), list):
        job_defs.append(doc)

    for job_def in job_defs:
        if not isinstance(job_def, dict):
            continue
        for task in job_def.get("tasks") or []:
            if not isinstance(task, dict):
                continue
            entry = {
                "task_key": task.get("task_key"),
                "depends_on": [
                    d.get("task_key")
                    for d in (task.get("depends_on") or [])
                    if isinstance(d, dict)
                ],
            }
            notebook = task.get("notebook_task") or {}
            if isinstance(notebook, dict) and "notebook_path" in notebook:
                entry["notebook_path"] = notebook["notebook_path"]
            if "pipeline_task" in task:
                entry["is_pipeline_task"] = True
            out.append(entry)
    return out


def _resolve_notebook(nb: str, roots: list[Path]) -> Path | None:
    """Map a task's notebook_path to a local file, or None if not found.

    A leading `./` prefix (or leading `/`, for workspace-absolute paths) is
    removed and the remainder is tried under each root, as written and with
    `.py` / `.ipynb` appended. `str.lstrip("./")` is deliberately avoided:
    it strips characters, not a prefix, and mangles `../x` and `.hidden/x`.
    """
    rel = nb[2:] if nb.startswith("./") else nb.lstrip("/")
    if not rel:
        return None
    for root in roots:
        for ext in ("", ".py", ".ipynb"):
            candidate = (root / (rel + ext)).resolve()
            if candidate.is_file():
                return candidate
    return None


def scan_job_yaml(yaml_path: Path) -> list[Finding]:
    """Scan a DAB job YAML for sibling-task local-disk sharing patterns.

    Emits the per-notebook findings of every referenced notebook (once per
    notebook), plus FANOUT003 (several tasks touch the same local path) and
    FANOUT004 (a pipeline_task depends on a notebook task that touches
    local disk).
    """
    findings: list[Finding] = []
    text = yaml_path.read_text(encoding="utf-8", errors="replace")

    doc = _try_load_yaml(text)
    tasks = _tasks_from_loaded(doc) if isinstance(doc, dict) else []
    if not tasks:
        # PyYAML missing, document unparsable, or tasks not under
        # resources.jobs: fall back to the stdlib parser.
        tasks = _minimal_yaml_tasks(text)

    # Resolve referenced notebooks (relative to the YAML's parent dir or
    # to the bundle root, taking the simplest interpretation). Each task is
    # paired with its own notebook, so tasks are never associated through
    # substring matches on paths.
    base = yaml_path.parent
    roots = [base, base.parent]
    task_notebooks: list[tuple[dict, Path]] = []
    for task in tasks:
        nb = task.get("notebook_path")
        if not isinstance(nb, str) or not nb:
            continue
        resolved = _resolve_notebook(nb, roots)
        if resolved is not None:
            task_notebooks.append((task, resolved))

    # Scan each distinct notebook once for any local-disk paths it touches
    # (writes, reads, child-call args, or bare string literals).
    notebook_local_paths: dict[Path, set[str]] = {}
    for _, nb_path in task_notebooks:
        if nb_path in notebook_local_paths:
            continue
        paths: set[str] = set()
        for cell in extract_python_cells(nb_path):
            scanner = _NotebookScanner(cell, str(nb_path))
            scanner.scan()
            _read_targets_in_cell(scanner)  # populates scanner.all_local_paths
            paths.update(scanner.all_local_paths)
        notebook_local_paths[nb_path] = paths

        # Per-notebook findings still apply when scanning a job.
        findings.extend(scan_notebook(nb_path))

    # FANOUT003 — sibling tasks share a local-disk path
    path_to_tasks: dict[str, list[str]] = {}
    for task, nb_path in task_notebooks:
        key = task.get("task_key")
        if not key:
            continue
        for p in notebook_local_paths[nb_path]:
            path_to_tasks.setdefault(p, []).append(str(key))

    # Sorted so the output order does not depend on set/hash ordering.
    for path in sorted(path_to_tasks):
        unique_keys = sorted(set(path_to_tasks[path]))
        if len(unique_keys) > 1:
            findings.append(
                Finding(
                    pattern_id="FANOUT003",
                    severity=SEVERITY_WARNING,
                    file=str(yaml_path),
                    line=0,
                    snippet=f"tasks: {', '.join(unique_keys)}",
                    message=(
                        f"Multiple sibling tasks reference local-disk path "
                        f"{path!r}. On serverless, these tasks may run on "
                        f"different nodes and cannot share local files."
                    ),
                    fix=(
                        "Move the shared artifact to /Volumes/... or "
                        "/Workspace/... and update both tasks to use that path."
                    ),
                )
            )

    # FANOUT004 — pipeline_task downstream of a notebook_task that touches
    # local disk (any local path in the notebook counts, not only writes).
    notebook_wrote_local = {
        task.get("task_key")
        for task, nb_path in task_notebooks
        if task.get("task_key") and notebook_local_paths[nb_path]
    }
    for task in tasks:
        if not task.get("is_pipeline_task"):
            continue
        upstream = task.get("depends_on") or []
        if any(u in notebook_wrote_local for u in upstream):
            findings.append(
                Finding(
                    pattern_id="FANOUT004",
                    severity=SEVERITY_WARNING,
                    file=str(yaml_path),
                    line=0,
                    snippet=f"pipeline_task {task.get('task_key')} depends_on {upstream}",
                    message=(
                        f"pipeline_task {task.get('task_key')!r} depends on a "
                        f"notebook_task that wrote to local disk. The pipeline "
                        f"will not see those files."
                    ),
                    fix=(
                        "Have the upstream notebook write to /Volumes/... and "
                        "configure the pipeline to read from that location."
                    ),
                )
            )

    return findings


# ---------------------------------------------------------------------------
# Remote modes (--job-id, --run-id) — shell out to databricks CLI
# ---------------------------------------------------------------------------


def _databricks_cli(args: list[str], profile: str) -> str:
    """Run `databricks` CLI with the given profile, return stdout.

    Raises:
        RuntimeError: the CLI is not installed, timed out (60 s), or exited
            with a non-zero status. Launch failures are converted so callers
            only ever have to handle RuntimeError.
    """
    cmd = ["databricks"] + args + ["--profile", profile, "--output", "json"]
    try:
        result = subprocess.run(
            cmd, check=False, capture_output=True, text=True, timeout=60
        )
    except FileNotFoundError as exc:
        raise RuntimeError(
            "databricks CLI not found on PATH; it is required for "
            "--job-id / --run-id modes"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"databricks CLI timed out after 60s: {' '.join(cmd)}"
        ) from exc
    if result.returncode != 0:
        raise RuntimeError(
            f"databricks CLI failed: {' '.join(cmd)}\n{result.stderr}"
        )
    return result.stdout


def scan_remote_job(job_id: str, profile: str) -> list[Finding]:
    """Pull notebook source for every task in a remote job and scan.

    Notebooks are exported into a private temporary directory that is
    removed afterwards. Findings report the workspace notebook path rather
    than the scratch file. A notebook that cannot be exported produces an
    info-level FANOUT000 finding instead of aborting the scan.
    """
    # Local imports: only this mode needs them.
    import shutil
    import tempfile

    # The current Databricks CLI takes the job ID as a positional argument.
    raw = _databricks_cli(["jobs", "get", str(job_id)], profile)
    job = json.loads(raw)
    tasks = (job.get("settings") or {}).get("tasks") or []

    findings: list[Finding] = []
    scanned: set[str] = set()
    # mkdtemp gives an unpredictable, owner-only directory; a fixed
    # /tmp/preflight-job-<id> path is guessable and built from user input.
    tmp_dir = Path(tempfile.mkdtemp(prefix="preflight-job-"))
    try:
        for index, task in enumerate(tasks):
            nb = (task.get("notebook_task") or {}).get("notebook_path")
            if not nb or nb in scanned:
                continue
            scanned.add(nb)
            local = tmp_dir / f"task-{index}.py"
            try:
                _databricks_cli(
                    [
                        "workspace",
                        "export",
                        nb,
                        "--format",
                        "SOURCE",
                        "--file",
                        str(local),
                    ],
                    profile,
                )
            except RuntimeError as exc:
                findings.append(
                    Finding(
                        pattern_id="FANOUT000",
                        severity=SEVERITY_INFO,
                        file=nb,
                        line=0,
                        snippet="",
                        message=f"Could not export {nb}: {exc}",
                        fix="Verify the notebook path and your CLI permissions.",
                    )
                )
                continue
            for finding in scan_notebook(local):
                finding.file = nb
                findings.append(finding)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    return findings


def scan_run_output(run_id: str, profile: str) -> list[Finding]:
    """Pull run output and classify the error trace as fan-out vs env-sync.

    Each distinct path is reported once, even when the trace repeats it.
    """
    # The current Databricks CLI takes the run ID as a positional argument.
    raw = _databricks_cli(["jobs", "get-run-output", str(run_id)], profile)
    payload = json.loads(raw)
    error = (payload.get("error") or "") + "\n" + (payload.get("error_trace") or "")

    findings: list[Finding] = []
    if ENV_SYNC_RE.search(error):
        findings.append(
            Finding(
                pattern_id="ENV001",
                severity=SEVERITY_INFO,
                file=f"run/{run_id}",
                line=0,
                snippet="ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT",
                message=(
                    "The run failed with the rare, platform-side env-sync "
                    "error. This skill does not fix this — escalate to "
                    "Databricks engineering support."
                ),
                fix=(
                    "Open an ES ticket (use /jira-actions or /support-"
                    "escalation) with the run ID and full error trace. As a "
                    "mitigation, reduce dependency setup during child "
                    "notebook startup and add task retries."
                ),
            )
        )

    # dict.fromkeys de-duplicates while keeping first-seen order.
    for hit in dict.fromkeys(BSI_TRUSTED_TEMP_RE.findall(error)):
        findings.append(
            Finding(
                pattern_id="FANOUT006",
                severity=SEVERITY_BLOCKER,
                file=f"run/{run_id}",
                line=0,
                snippet=hit,
                message=(
                    f"Run output contains the BSI trustedTemp signature "
                    f"{hit!r}. This is the cross-task local-disk antipattern."
                ),
                fix=(
                    "Locate the task that wrote to /local_disk0/spark-.../"
                    "trustedTemp-... and rewrite the handoff to use "
                    "/Volumes/... or /Workspace/..."
                ),
            )
        )

    # Generic permission-denied on local-disk path
    denied_paths = dict.fromkeys(
        m.group(1) for m in _PERMISSION_DENIED_RE.finditer(error)
    )
    for path in denied_paths:
        # Skip if already covered by FANOUT006 above.
        if BSI_TRUSTED_TEMP_RE.search(path):
            continue
        findings.append(
            Finding(
                pattern_id="FANOUT001",
                severity=SEVERITY_BLOCKER,
                file=f"run/{run_id}",
                line=0,
                snippet=f"Permission denied: {path}",
                message=(
                    f"Run failed with Permission denied on local-disk path "
                    f"{path!r}. Likely a cross-task handoff."
                ),
                fix=(
                    "Identify the writing task and move the handoff to "
                    "/Volumes/... or /Workspace/..."
                ),
            )
        )

    return findings


# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------


def format_human(findings: list[Finding]) -> str:
    """Render findings as a severity-grouped plain-text report."""
    if not findings:
        return "No serverless storage issues found.\n"

    by_sev: dict[str, list[Finding]] = {
        SEVERITY_BLOCKER: [],
        SEVERITY_WARNING: [],
        SEVERITY_INFO: [],
    }
    for f in findings:
        by_sev[f.severity].append(f)

    out: list[str] = []
    out.append(
        f"Serverless storage preflight: {len(findings)} finding(s) "
        f"({len(by_sev[SEVERITY_BLOCKER])} blocker, "
        f"{len(by_sev[SEVERITY_WARNING])} warning, "
        f"{len(by_sev[SEVERITY_INFO])} info)"
    )
    out.append("=" * 72)

    label = {
        SEVERITY_BLOCKER: "BLOCKER",
        SEVERITY_WARNING: "WARNING",
        SEVERITY_INFO: "INFO",
    }
    for sev in (SEVERITY_BLOCKER, SEVERITY_WARNING, SEVERITY_INFO):
        items = by_sev[sev]
        if not items:
            continue
        out.append("")
        out.append(f"[{label[sev]}] {len(items)} finding(s)")
        out.append("-" * 72)
        for f in items:
            # line == 0 means "no specific line" (YAML- and run-level findings).
            location = (
                f"{f.file}:{f.line}" if f.line else f.file
            )
            out.append(f" [{f.pattern_id}] {location}")
            if f.snippet:
                out.append(f" > {f.snippet}")
            out.append(f" {f.message}")
            out.append(f" Fix: {f.fix}")
            out.append("")
    return "\n".join(out)


def format_json(findings: list[Finding]) -> str:
    """Render findings plus per-severity counts as an indented JSON document."""
    payload = {
        "findings": [f.to_dict() for f in findings],
        "summary": {
            "blocker": sum(1 for f in findings if f.severity == SEVERITY_BLOCKER),
            "warning": sum(1 for f in findings if f.severity == SEVERITY_WARNING),
            "info": sum(1 for f in findings if f.severity == SEVERITY_INFO),
            "total": len(findings),
        },
    }
    return json.dumps(payload, indent=2)


def exit_code_for(findings: list[Finding]) -> int:
    """Map findings to the process exit code: 2 blockers, 1 warnings, else 0."""
    if any(f.severity == SEVERITY_BLOCKER for f in findings):
        return 2
    if any(f.severity == SEVERITY_WARNING for f in findings):
        return 1
    return 0


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser; exactly one scan mode must be given."""
    p = argparse.ArgumentParser(
        prog="preflight.py",
        description=(
            "Detect cross-task local-disk handoffs in Databricks serverless "
            "jobs and notebooks."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = p.add_mutually_exclusive_group(required=True)
    mode.add_argument("--notebook", type=Path, help="Scan a single .py or .ipynb")
    mode.add_argument("--dir", type=Path, help="Recursively scan a directory")
    mode.add_argument("--job-yaml", type=Path, help="Scan a DAB job YAML")
    mode.add_argument("--job-id", type=str, help="Scan a remote job by ID")
    mode.add_argument("--run-id", type=str, help="Classify a failed run's error trace")
    p.add_argument(
        "--profile",
        type=str,
        default="DEFAULT",
        help="Databricks CLI profile (required for --job-id / --run-id)",
    )
    p.add_argument("--json", action="store_true", help="Machine-readable output")
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the selected scan, print the report, return the exit code."""
    args = build_parser().parse_args(argv)
    findings: list[Finding] = []

    if args.notebook:
        findings = scan_path(args.notebook)
    elif args.dir:
        findings = scan_path(args.dir)
    elif args.job_yaml:
        findings = scan_job_yaml(args.job_yaml)
    elif args.job_id:
        findings = scan_remote_job(args.job_id, args.profile)
    elif args.run_id:
        findings = scan_run_output(args.run_id, args.profile)

    # Most severe first, then stable by location and pattern.
    findings.sort(
        key=lambda f: (
            -SEVERITY_ORDER[f.severity],
            f.file,
            f.line,
            f.pattern_id,
        )
    )

    if args.json:
        print(format_json(findings))
    else:
        print(format_human(findings))

    return exit_code_for(findings)


if __name__ == "__main__":
    sys.exit(main())
b/skills/databricks-serverless-storage-check/scripts/test_preflight.py new file mode 100644 index 0000000..269459e --- /dev/null +++ b/skills/databricks-serverless-storage-check/scripts/test_preflight.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +"""Self-test fixtures and assertions for preflight.py. + +Run with: + python3 scripts/test_preflight.py + +Exits 0 if all assertions pass, 1 otherwise. Uses only the stdlib + +preflight.py itself; no test framework dependency. +""" + +from __future__ import annotations + +import json +import sys +import tempfile +from pathlib import Path + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) + +import preflight # noqa: E402 + + +# --------------------------------------------------------------------------- +# Fixture content +# --------------------------------------------------------------------------- + +BSI_PARENT_NOTEBOOK = """\ +# BSI-pattern parent: writes to trustedTemp and hands off via dbutils.notebook.run +import pandas as pd + +tmp = "/local_disk0/spark-d6bae111-42bd-4f54/trustedTemp-55adadbe/handoff.parquet" +pd.DataFrame({"x": [1, 2, 3]}).to_parquet(tmp) + +dbutils.notebook.run("./child", 600, {"handoff_path": tmp}) +""" + +BSI_CHILD_NOTEBOOK = """\ +# BSI-pattern child: reads from a /local_disk0 path passed in as a widget +import pandas as pd + +dbutils.widgets.text("handoff_path", "") +path = dbutils.widgets.get("handoff_path") +df = pd.read_parquet("/local_disk0/spark-d6bae111-42bd-4f54/trustedTemp-55adadbe/handoff.parquet") +print(df) +""" + +CLEAN_VOLUMES_NOTEBOOK = """\ +# Clean: uses /Volumes for cross-task handoff +import pandas as pd + +handoff = "/Volumes/main/analytics/handoffs/run_42/data.parquet" +pd.DataFrame({"x": [1, 2, 3]}).to_parquet(handoff) + +dbutils.notebook.run("./child", 600, {"handoff_path": handoff}) +""" + +DAB_YAML_SHARED_TMP = """\ +resources: + jobs: + my_job: + name: my_job + tasks: + - task_key: producer + notebook_task: + notebook_path: ./producer.py + - 
task_key: consumer + depends_on: + - task_key: producer + notebook_task: + notebook_path: ./consumer.py +""" + +PRODUCER_NOTEBOOK = """\ +import pandas as pd +shared = "/tmp/foo.parquet" +pd.DataFrame({"x": [1]}).to_parquet(shared) +""" + +CONSUMER_NOTEBOOK = """\ +import pandas as pd +shared = "/tmp/foo.parquet" +df = pd.read_parquet(shared) +""" + +ENV_SYNC_RUN_OUTPUT = json.dumps( + { + "error": "ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT", + "error_trace": "Virtual environment changed while syncing", + } +) + + +# --------------------------------------------------------------------------- +# Assertion helpers +# --------------------------------------------------------------------------- + + +class TestFailure(Exception): + pass + + +def expect(cond: bool, msg: str) -> None: + if not cond: + raise TestFailure(msg) + + +def has_finding(findings, pattern_id: str) -> bool: + return any(f.pattern_id == pattern_id for f in findings) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_bsi_pattern_blockers(): + """BSI repro: parent + child notebooks together trigger 001, 002, 006.""" + with tempfile.TemporaryDirectory() as td: + d = Path(td) + (d / "parent.py").write_text(BSI_PARENT_NOTEBOOK) + (d / "child.py").write_text(BSI_CHILD_NOTEBOOK) + + findings = preflight.scan_path(d) + expect( + has_finding(findings, "FANOUT001"), + f"expected FANOUT001 in BSI parent, got: {[f.pattern_id for f in findings]}", + ) + expect( + has_finding(findings, "FANOUT002"), + f"expected FANOUT002 in BSI child, got: {[f.pattern_id for f in findings]}", + ) + expect( + has_finding(findings, "FANOUT006"), + f"expected FANOUT006 for trustedTemp signature, got: " + f"{[f.pattern_id for f in findings]}", + ) + expect( + preflight.exit_code_for(findings) == 2, + f"expected exit code 2 for blockers, got {preflight.exit_code_for(findings)}", + ) + + +def 
test_clean_volumes_notebook(): + """Notebook using /Volumes produces zero findings, exit 0.""" + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(CLEAN_VOLUMES_NOTEBOOK) + path = Path(f.name) + try: + findings = preflight.scan_path(path) + expect( + findings == [], + f"expected no findings, got: {[(f.pattern_id, f.snippet) for f in findings]}", + ) + expect( + preflight.exit_code_for(findings) == 0, + "expected exit 0 on clean notebook", + ) + finally: + path.unlink() + + +def test_dab_yaml_shared_tmp(): + """DAB YAML with sibling tasks reading/writing /tmp triggers FANOUT003.""" + with tempfile.TemporaryDirectory() as td: + d = Path(td) + (d / "producer.py").write_text(PRODUCER_NOTEBOOK) + (d / "consumer.py").write_text(CONSUMER_NOTEBOOK) + yaml = d / "my_job.job.yml" + yaml.write_text(DAB_YAML_SHARED_TMP) + + findings = preflight.scan_job_yaml(yaml) + expect( + has_finding(findings, "FANOUT003"), + f"expected FANOUT003 for sibling-shared /tmp, got: " + f"{[f.pattern_id for f in findings]}", + ) + # Must be at least warning severity, not silent. + expect( + preflight.exit_code_for(findings) >= 1, + "expected exit code >= 1 for sibling-shared /tmp", + ) + + +def test_env_sync_run_classification(): + """Run-output mode produces ENV001 for env-sync error trace.""" + # We don't shell out — we call scan_run_output's inner classification + # by invoking the regex path directly through a tiny shim. 
+ error_text = ( + "ENVIRONMENT_SETUP_ERROR.PYTHON_NOTEBOOK_ENVIRONMENT\n" + "Virtual environment changed while syncing" + ) + expect( + preflight.ENV_SYNC_RE.search(error_text) is not None, + "ENV_SYNC_RE failed to match canonical env-sync error", + ) + + +def test_bsi_signature_regex(): + """BSI trustedTemp regex matches the exact thread signature.""" + canonical = ( + "/local_disk0/spark-d6bae111-42bd-4f54-9136-a4e9fbdec3d6/" + "trustedTemp-55adadbe-d9ed-4278-a751-868797c1562f/tmpc58fz4pv" + ) + expect( + preflight.is_bsi_signature(canonical), + f"is_bsi_signature() failed on canonical BSI path: {canonical}", + ) + expect( + not preflight.is_bsi_signature("/Volumes/main/x/y.parquet"), + "is_bsi_signature() false-positive on a Volumes path", + ) + + +def test_exit_code_resolution(): + """exit_code_for follows blocker > warning > info > clean ordering.""" + expect(preflight.exit_code_for([]) == 0, "empty findings should exit 0") + info = preflight.Finding("X", "info", "f", 1, "s", "m", "fix") + warn = preflight.Finding("X", "warning", "f", 1, "s", "m", "fix") + block = preflight.Finding("X", "blocker", "f", 1, "s", "m", "fix") + expect(preflight.exit_code_for([info]) == 0, "info-only should exit 0") + expect(preflight.exit_code_for([warn]) == 1, "warning should exit 1") + expect(preflight.exit_code_for([block]) == 2, "blocker should exit 2") + expect( + preflight.exit_code_for([info, warn, block]) == 2, + "mixed severities should exit at highest (2)", + ) + + +def test_json_output_shape(): + """--json output has findings and summary keys with correct counts.""" + findings = [ + preflight.Finding("A", "blocker", "f", 1, "s", "m", "fix"), + preflight.Finding("B", "warning", "f", 2, "s", "m", "fix"), + preflight.Finding("C", "info", "f", 3, "s", "m", "fix"), + ] + payload = json.loads(preflight.format_json(findings)) + expect("findings" in payload, "JSON output missing 'findings'") + expect("summary" in payload, "JSON output missing 'summary'") + 
expect(payload["summary"]["blocker"] == 1, "wrong blocker count") + expect(payload["summary"]["warning"] == 1, "wrong warning count") + expect(payload["summary"]["info"] == 1, "wrong info count") + expect(payload["summary"]["total"] == 3, "wrong total count") + + +# --------------------------------------------------------------------------- +# Test runner +# --------------------------------------------------------------------------- + + +TESTS = [ + test_bsi_pattern_blockers, + test_clean_volumes_notebook, + test_dab_yaml_shared_tmp, + test_env_sync_run_classification, + test_bsi_signature_regex, + test_exit_code_resolution, + test_json_output_shape, +] + + +def main() -> int: + passed = 0 + failed: list[tuple[str, str]] = [] + for test in TESTS: + try: + test() + except TestFailure as exc: + failed.append((test.__name__, str(exc))) + except Exception as exc: # noqa: BLE001 + failed.append((test.__name__, f"unexpected error: {exc!r}")) + else: + passed += 1 + print(f"PASS {test.__name__}") + + print() + print(f"{passed}/{len(TESTS)} passed") + if failed: + print() + for name, msg in failed: + print(f"FAIL {name}") + print(f" {msg}") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main())