From d03d38d260275d936fa0edb6af8b341c2fb8ed36 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 1 Aug 2025 10:03:24 +0200 Subject: [PATCH 1/4] Update based on new templates and add a script for future updates --- contrib/data_engineering/.gitignore | 8 ++ .../data_engineering/.vscode/__builtins__.pyi | 3 + .../data_engineering/.vscode/extensions.json | 7 ++ .../data_engineering/.vscode/settings.json | 21 ++++ contrib/data_engineering/README.md | 86 ++++++++++++++ contrib/data_engineering/assets/README.md | 4 + contrib/data_engineering/conftest.py | 40 +++++++ contrib/data_engineering/databricks.yml | 46 ++++++++ contrib/data_engineering/pyproject.toml | 20 ++++ contrib/data_engineering/scripts/add_asset.py | 46 ++++++++ contrib/data_engineering/scripts/test.py | 17 +++ dbt_sql/.gitignore | 1 + dbt_sql/.vscode/settings.json | 1 - dbt_sql/README.md | 2 +- dbt_sql/databricks.yml | 10 +- dbt_sql/dbt_profiles/profiles.yml | 70 ++++++------ dbt_sql/dbt_project.yml | 2 +- dbt_sql/profile_template.yml | 4 +- .../{dbt_sql_job.yml => dbt_sql.job.yml} | 23 ++-- default_python/.vscode/settings.json | 1 - default_python/README.md | 48 +++++--- default_python/conftest.py | 76 ++++++++++++ default_python/databricks.yml | 13 ++- default_python/pyproject.toml | 35 ++++++ default_python/pytest.ini | 3 - default_python/requirements-dev.txt | 29 ----- .../resources/default_python.job.yml | 45 ++++++++ ...peline.yml => default_python.pipeline.yml} | 5 +- .../resources/default_python_job.yml | 53 --------- default_python/scratch/exploration.ipynb | 5 +- default_python/setup.py | 37 ------ default_python/src/default_python/__init__.py | 1 - default_python/src/default_python/main.py | 23 ++-- default_python/src/dlt_pipeline.ipynb | 8 +- default_python/src/notebook.ipynb | 4 +- default_python/tests/main_test.py | 6 +- default_sql/.vscode/settings.json | 3 +- default_sql/databricks.yml | 12 +- ...ql_sql_job.yml => default_sql_sql.job.yml} | 0 default_sql/src/orders_daily.sql | 2 +- 
default_sql/src/orders_raw.sql | 2 +- lakeflow_pipelines_python/.gitignore | 8 ++ .../.vscode/__builtins__.pyi | 3 + .../.vscode/extensions.json | 7 ++ .../.vscode/settings.json | 21 ++++ lakeflow_pipelines_python/README.md | 41 +++++++ lakeflow_pipelines_python/databricks.yml | 47 ++++++++ .../README.md | 22 ++++ .../lakeflow_pipelines_python.job.yml | 19 +++ .../lakeflow_pipelines_python.pipeline.yml | 12 ++ .../sample_trips_lakeflow_pipelines_python.py | 13 +++ .../sample_zones_lakeflow_pipelines_python.py | 13 +++ .../utilities/utils.py | 8 ++ lakeflow_pipelines_sql/.gitignore | 8 ++ .../.vscode/__builtins__.pyi | 3 + .../.vscode/extensions.json | 7 ++ lakeflow_pipelines_sql/.vscode/settings.json | 21 ++++ lakeflow_pipelines_sql/README.md | 41 +++++++ lakeflow_pipelines_sql/databricks.yml | 47 ++++++++ .../lakeflow_pipelines_sql_pipeline/README.md | 21 ++++ .../lakeflow_pipelines_sql.pipeline.yml | 12 ++ .../sample_trips_lakeflow_pipelines_sql.sql | 9 ++ .../sample_zones_lakeflow_pipelines_sql.sql | 10 ++ scripts/update_from_templates.sh | 108 ++++++++++++++++++ 64 files changed, 1087 insertions(+), 236 deletions(-) create mode 100644 contrib/data_engineering/.gitignore create mode 100644 contrib/data_engineering/.vscode/__builtins__.pyi create mode 100644 contrib/data_engineering/.vscode/extensions.json create mode 100644 contrib/data_engineering/.vscode/settings.json create mode 100644 contrib/data_engineering/README.md create mode 100644 contrib/data_engineering/assets/README.md create mode 100644 contrib/data_engineering/conftest.py create mode 100644 contrib/data_engineering/databricks.yml create mode 100644 contrib/data_engineering/pyproject.toml create mode 100644 contrib/data_engineering/scripts/add_asset.py create mode 100644 contrib/data_engineering/scripts/test.py rename dbt_sql/resources/{dbt_sql_job.yml => dbt_sql.job.yml} (55%) create mode 100644 default_python/conftest.py create mode 100644 default_python/pyproject.toml delete mode 100644 
default_python/pytest.ini delete mode 100644 default_python/requirements-dev.txt create mode 100644 default_python/resources/default_python.job.yml rename default_python/resources/{default_python_pipeline.yml => default_python.pipeline.yml} (67%) delete mode 100644 default_python/resources/default_python_job.yml delete mode 100644 default_python/setup.py rename default_sql/resources/{default_sql_sql_job.yml => default_sql_sql.job.yml} (100%) create mode 100644 lakeflow_pipelines_python/.gitignore create mode 100644 lakeflow_pipelines_python/.vscode/__builtins__.pyi create mode 100644 lakeflow_pipelines_python/.vscode/extensions.json create mode 100644 lakeflow_pipelines_python/.vscode/settings.json create mode 100644 lakeflow_pipelines_python/README.md create mode 100644 lakeflow_pipelines_python/databricks.yml create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py create mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py create mode 100644 lakeflow_pipelines_sql/.gitignore create mode 100644 lakeflow_pipelines_sql/.vscode/__builtins__.pyi create mode 100644 lakeflow_pipelines_sql/.vscode/extensions.json create mode 100644 lakeflow_pipelines_sql/.vscode/settings.json create mode 100644 lakeflow_pipelines_sql/README.md create mode 100644 lakeflow_pipelines_sql/databricks.yml create mode 100644 
lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md create mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml create mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql create mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql create mode 100755 scripts/update_from_templates.sh diff --git a/contrib/data_engineering/.gitignore b/contrib/data_engineering/.gitignore new file mode 100644 index 00000000..f6a3b5ff --- /dev/null +++ b/contrib/data_engineering/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +**/!explorations/README.md diff --git a/contrib/data_engineering/.vscode/__builtins__.pyi b/contrib/data_engineering/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/contrib/data_engineering/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/contrib/data_engineering/.vscode/extensions.json b/contrib/data_engineering/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/contrib/data_engineering/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/contrib/data_engineering/.vscode/settings.json b/contrib/data_engineering/.vscode/settings.json new file mode 100644 index 00000000..edff7df9 --- /dev/null +++ b/contrib/data_engineering/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# 
COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["assets/etl_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/contrib/data_engineering/README.md b/contrib/data_engineering/README.md new file mode 100644 index 00000000..35578330 --- /dev/null +++ b/contrib/data_engineering/README.md @@ -0,0 +1,86 @@ +# data_engineering + +The 'data_engineering' project was generated by using the data-engineering template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. We recommend the UV package manager to install project dependencies. It's a drop-in replacement for `pip`. + See https://docs.astral.sh/uv/getting-started/installation/ for full installation instructions, + or run: + ``` + $ pip install uv + ``` + +4. Install all project dependencies: + ``` + $ uv sync + ``` + + See the "Running unit tests" below for more on testing. + +5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + +## Adding assets such as pipelines and jobs + +By default, the data-engineering template does not include any assets. + +1. 
To add an asset, run the `add-asset` script: + ``` + $ uv run add-asset + ``` + + or, if you don't use UV, use + + ``` + $ export TYPE=etl-pipeline + $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/$TYPE + ``` + +2. Optionally, run all tests on serverless compute after adding an asset: + ``` + $ uv run test + ``` + +## Deploying assets + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +## Running unit tests + +1. Run tests on a serverless environment using: + ``` + $ uv run test + ``` + +2. Optionally, to run unit tests in a different environment, such as on a cluster, + please refer to the documentation of DB connect at + https://docs.databricks.com/en/dev-tools/databricks-connect/python/install.html diff --git a/contrib/data_engineering/assets/README.md b/contrib/data_engineering/assets/README.md new file mode 100644 index 00000000..f6c8907f --- /dev/null +++ b/contrib/data_engineering/assets/README.md @@ -0,0 +1,4 @@ +This folder is reserved for Databricks Asset Bundles definitions. + +New jobs and pipelines should follow conventions from the 'data-engineering' template. +See https://github.com/databricks/bundle-examples/blob/main/contrib/templates/data-engineering/README.md. diff --git a/contrib/data_engineering/conftest.py b/contrib/data_engineering/conftest.py new file mode 100644 index 00000000..2b7f5db1 --- /dev/null +++ b/contrib/data_engineering/conftest.py @@ -0,0 +1,40 @@ +# conftest.py is used to configure pytest.
+# This file is in the root since it affects all tests through this bundle. +# It makes sure all 'assets/*' directories are added to `sys.path` so that +# tests can import them. +import os +import sys +import dlt +import pathlib +import pytest +import warnings +from pyspark.sql import SparkSession +from databricks.connect import DatabricksSession + +# Dynamically find and add all `assets/*` directories to `sys.path` +for path in pathlib.Path(pathlib.Path(__file__).parent / "assets").glob("*"): + resolved_path = str(path.resolve()) + if resolved_path not in sys.path: + sys.path.append(resolved_path) + +# For older databricks-connect, work around issues importing SparkSession +# and errors when SPARK_REMOTE is set. +SparkSession.builder = DatabricksSession.builder +os.environ.pop("SPARK_REMOTE", None) + +# Make dlt.views in 'sources/dev' available for tests +warnings.filterwarnings( + "ignore", + message="This is a stub that only contains the interfaces to Delta Live Tables.*", + category=UserWarning, +) +dlt.enable_local_execution() +dlt.view = lambda func=None, *args, **kwargs: func or (lambda f: f) + + +# Provide a 'spark' fixture for tests and make sure the session is eagerly initialized +@pytest.fixture(scope="session", autouse=True) +def spark() -> SparkSession: + if hasattr(DatabricksSession.builder, "validateSession"): + return DatabricksSession.builder.validateSession().getOrCreate() + return DatabricksSession.builder.getOrCreate() diff --git a/contrib/data_engineering/databricks.yml b/contrib/data_engineering/databricks.yml new file mode 100644 index 00000000..0577aa4b --- /dev/null +++ b/contrib/data_engineering/databricks.yml @@ -0,0 +1,46 @@ +# This is a Databricks asset bundle definition for data_engineering. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
+bundle: + name: data_engineering + +include: + - assets/*.yml + - assets/*/*.yml + +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + notifications: [] + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly specify /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: user@company.com + level: CAN_MANAGE + run_as: + user_name: user@company.com + variables: + catalog: catalog + schema: default + notifications: [user@company.com] \ No newline at end of file diff --git a/contrib/data_engineering/pyproject.toml b/contrib/data_engineering/pyproject.toml new file mode 100644 index 00000000..966b1abb --- /dev/null +++ b/contrib/data_engineering/pyproject.toml @@ -0,0 +1,20 @@ +[project] +name = "my_data_project" +version = "0.1.0" +description = "Databricks ETL pipeline project" +requires-python = "==3.10.*" +dependencies = [ + "databricks-dlt", + "pytest", + "databricks-connect==15.1.*", +] + +[project.scripts] +add-asset = "scripts.add_asset:main" +test = "scripts.test:main" + +[tool.uv] +package = true + +[tool.setuptools.packages.find] +include = ["scripts"] \ No newline at end of file diff --git a/contrib/data_engineering/scripts/add_asset.py b/contrib/data_engineering/scripts/add_asset.py new 
file mode 100644 index 00000000..931db611 --- /dev/null +++ b/contrib/data_engineering/scripts/add_asset.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# +# add_asset.py is used to initialize a new asset from the data-engineering template. +# +import sys +import subprocess +from typing import Literal + +VALID_ASSETS = ["etl-pipeline", "job", "ingest-pipeline"] +AssetType = Literal["etl-pipeline", "job", "ingest-pipeline"] + + +def init_bundle(asset_type: AssetType) -> None: + cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type}" + subprocess.run(cmd, shell=True) + + +def show_menu() -> AssetType: + print("\nSelect asset type to initialize:") + for i, asset in enumerate(VALID_ASSETS, 1): + print(f"{i}. {asset}") + + while True: + try: + choice = int(input("\nEnter number (1-3): ")) + if 1 <= choice <= len(VALID_ASSETS): + return VALID_ASSETS[choice - 1] + print("Invalid choice. Please try again.") + except ValueError: + print("Please enter a number.") + + +def main(): + if len(sys.argv) > 1: + asset_type = sys.argv[1] + if asset_type not in VALID_ASSETS: + print(f"Error: Asset type must be one of {VALID_ASSETS}") + sys.exit(1) + else: + asset_type = show_menu() + + init_bundle(asset_type) + + +if __name__ == "__main__": + main() diff --git a/contrib/data_engineering/scripts/test.py b/contrib/data_engineering/scripts/test.py new file mode 100644 index 00000000..4748c81d --- /dev/null +++ b/contrib/data_engineering/scripts/test.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# +# test.py runs the unit tests for this project using pytest and serverless compute. +# To use a different form of compute, instead use 'uv run pytest' or +# use your IDE's testing panel. When using VS Code, consider using the Databricks extension. 
+# +import os +import subprocess + + +def main(): + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + subprocess.run(["pytest"], check=True) + + +if __name__ == "__main__": + main() diff --git a/dbt_sql/.gitignore b/dbt_sql/.gitignore index cced6581..23116291 100644 --- a/dbt_sql/.gitignore +++ b/dbt_sql/.gitignore @@ -11,4 +11,5 @@ scratch/** # dbt target/ dbt_packages/ +dbt_modules/ logs/ diff --git a/dbt_sql/.vscode/settings.json b/dbt_sql/.vscode/settings.json index 82860360..a9355ea8 100644 --- a/dbt_sql/.vscode/settings.json +++ b/dbt_sql/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ diff --git a/dbt_sql/README.md b/dbt_sql/README.md index 7c2e6069..f012a898 100644 --- a/dbt_sql/README.md +++ b/dbt_sql/README.md @@ -121,7 +121,7 @@ You can find that job by opening your workpace and clicking on **Workflows**. You can also deploy to your production target directly from the command-line. The warehouse, catalog, and schema for that target are configured in databricks.yml. -When deploying to this target, note that the default job at resources/dbt_sql_job.yml +When deploying to this target, note that the default job at resources/dbt_sql.job.yml has a schedule set that runs every day. The schedule is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). diff --git a/dbt_sql/databricks.yml b/dbt_sql/databricks.yml index 5741351a..62de07b7 100644 --- a/dbt_sql/databricks.yml +++ b/dbt_sql/databricks.yml @@ -3,20 +3,22 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
bundle: name: dbt_sql + uuid: 5e5ca8d5-0388-473e-84a1-1414ed89c5df include: - resources/*.yml + - resources/*/*.yml # Deployment targets. # The default schema, catalog, etc. for dbt are defined in dbt_profiles/profiles.yml targets: dev: - default: true # The default target uses 'mode: development' to create a development copy. # - Deployed resources get prefixed with '[dev my_user_name]' # - Any job schedules and triggers are paused by default. # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. mode: development + default: true workspace: host: https://company.databricks.com @@ -24,10 +26,8 @@ targets: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/dbt_sql/dbt_profiles/profiles.yml b/dbt_sql/dbt_profiles/profiles.yml index 34e98a59..0d757ae6 100644 --- a/dbt_sql/dbt_profiles/profiles.yml +++ b/dbt_sql/dbt_profiles/profiles.yml @@ -1,38 +1,38 @@ # This file defines dbt profiles for deployed dbt jobs. dbt_sql: - target: dev # default target - outputs: - - # Doing local development with the dbt CLI? - # Then you should create your own profile in your .dbt/profiles.yml using 'dbt init' - # (See README.md) - - # The default target when deployed with the Databricks CLI - # N.B. 
when you use dbt from the command line, it uses the profile from .dbt/profiles.yml - dev: - type: databricks - method: http - catalog: main - schema: "{{ var('dev_schema') }}" - - http_path: /sql/1.0/warehouses/abcdef1234567890 - - # The workspace host / token are provided by Databricks - # see databricks.yml for the workspace host used for 'dev' - host: "{{ env_var('DBT_HOST') }}" - token: "{{ env_var('DBT_ACCESS_TOKEN') }}" - - # The production target when deployed with the Databricks CLI - prod: - type: databricks - method: http - catalog: main - schema: default - - http_path: /sql/1.0/warehouses/abcdef1234567890 - - # The workspace host / token are provided by Databricks - # see databricks.yml for the workspace host used for 'prod' - host: "{{ env_var('DBT_HOST') }}" - token: "{{ env_var('DBT_ACCESS_TOKEN') }}" + target: dev # default target + outputs: + + # Doing local development with the dbt CLI? + # Then you should create your own profile in your .dbt/profiles.yml using 'dbt init' + # (See README.md) + + # The default target when deployed with the Databricks CLI + # N.B. 
when you use dbt from the command line, it uses the profile from .dbt/profiles.yml + dev: + type: databricks + method: http + catalog: catalog + schema: "{{ var('dev_schema') }}" + + http_path: /sql/1.0/warehouses/abcdef1234567890 + + # The workspace host / token are provided by Databricks + # see databricks.yml for the workspace host used for 'dev' + host: "{{ env_var('DBT_HOST') }}" + token: "{{ env_var('DBT_ACCESS_TOKEN') }}" + + # The production target when deployed with the Databricks CLI + prod: + type: databricks + method: http + catalog: catalog + schema: default + + http_path: /sql/1.0/warehouses/abcdef1234567890 + + # The workspace host / token are provided by Databricks + # see databricks.yml for the workspace host used for 'prod' + host: "{{ env_var('DBT_HOST') }}" + token: "{{ env_var('DBT_ACCESS_TOKEN') }}" diff --git a/dbt_sql/dbt_project.yml b/dbt_sql/dbt_project.yml index 0c979c75..947412e3 100644 --- a/dbt_sql/dbt_project.yml +++ b/dbt_sql/dbt_project.yml @@ -15,7 +15,7 @@ seed-paths: ["src/seeds"] macro-paths: ["src/macros"] snapshot-paths: ["src/snapshots"] -clean-targets: # directories to be removed by `dbt clean` +clean-targets: # directories to be removed by `dbt clean` - "target" - "dbt_packages" diff --git a/dbt_sql/profile_template.yml b/dbt_sql/profile_template.yml index 1ddbeaf3..e6f2e69d 100644 --- a/dbt_sql/profile_template.yml +++ b/dbt_sql/profile_template.yml @@ -5,7 +5,7 @@ fixed: type: databricks prompts: host: - default: myworkspace.databricks.com + default: company.databricks.com token: hint: 'personal access token to use, dapiXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' hide_input: true @@ -14,7 +14,7 @@ prompts: default: /sql/1.0/warehouses/abcdef1234567890 catalog: hint: 'initial catalog' - default: main + default: catalog schema: hint: 'personal schema where dbt will build objects during development, example: user_name' threads: diff --git a/dbt_sql/resources/dbt_sql_job.yml b/dbt_sql/resources/dbt_sql.job.yml similarity index 55% 
rename from dbt_sql/resources/dbt_sql_job.yml rename to dbt_sql/resources/dbt_sql.job.yml index a25dc1ab..db1d1d43 100644 --- a/dbt_sql/resources/dbt_sql_job.yml +++ b/dbt_sql/resources/dbt_sql.job.yml @@ -9,26 +9,25 @@ resources: interval: 1 unit: DAYS - email_notifications: - on_failure: - - user@company.com + #email_notifications: + # on_failure: + # - your_email@example.com tasks: - task_key: dbt - dbt_task: project_directory: ../ # The default schema, catalog, etc. are defined in ../dbt_profiles/profiles.yml profiles_directory: dbt_profiles/ commands: - # The dbt commands to run (see also dbt_profiles/profiles.yml; dev_schema is used in the dev profile) - - 'dbt deps --target=${bundle.target}' - - 'dbt seed --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' - - 'dbt run --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' + # The dbt commands to run (see also dbt_profiles/profiles.yml; dev_schema is used in the dev profile) + - 'dbt deps --target=${bundle.target}' + - 'dbt seed --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' + - 'dbt run --target=${bundle.target} --vars "{ dev_schema: ${workspace.current_user.short_name} }"' libraries: - - pypi: - package: dbt-databricks>=1.8.0,<2.0.0 + - pypi: + package: dbt-databricks>=1.8.0,<2.0.0 new_cluster: spark_version: 15.4.x-scala2.12 @@ -36,7 +35,7 @@ resources: data_security_mode: SINGLE_USER num_workers: 0 spark_conf: - spark.master: "local[*, 4]" - spark.databricks.cluster.profile: singleNode + spark.master: "local[*, 4]" + spark.databricks.cluster.profile: singleNode custom_tags: ResourceClass: SingleNode diff --git a/default_python/.vscode/settings.json b/default_python/.vscode/settings.json index f19498da..8ee87c30 100644 --- a/default_python/.vscode/settings.json +++ b/default_python/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": 
"${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ diff --git a/default_python/README.md b/default_python/README.md index 3f11022c..c4f3a2b6 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -2,16 +2,39 @@ The 'default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format use for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + + +Dependencies for this project should be installed using UV: -2. Authenticate to your Databricks workspace, if you have not done so already: +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. 
To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -21,27 +44,24 @@ The 'default_python' project was generated by using the default-python template. This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workspace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` Note that the default job from the template has a schedule that runs every day - (defined in resources/default_python_job.yml). The schedule + (defined in resources/default_python.job.yml). The schedule is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for - **Databricks Connect** for instructions on running the included Python code from a different IDE. - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/default_python/conftest.py b/default_python/conftest.py new file mode 100644 index 00000000..cf1d0978 --- /dev/null +++ b/default_python/conftest.py @@ -0,0 +1,76 @@ +"""This file configures pytest.
+ +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +def add_all_resources_to_sys_path(): + """Add all resources/* directories to sys.path for module discovery.""" + resources = pathlib.Path(__file__).with_name("resources") + resource_dirs = filter(pathlib.Path.is_dir, resources.iterdir()) + seen: dict[str, pathlib.Path] = {} + for resource in resource_dirs: + sys.path.append(str(resource.resolve())) + for py in resource.rglob("*.py"): + mod = ".".join(py.relative_to(resource).with_suffix("").parts) + if mod in seen: + raise ImportError(f"Duplicate module '{mod}' found:\n {seen[mod]}\n {py}") + seen[mod] = py + + +def enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with allow_stderr_output(config): + 
add_all_resources_to_sys_path() + enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() + + +@pytest.fixture(scope="session") +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests.""" + return DatabricksSession.builder.getOrCreate() diff --git a/default_python/databricks.yml b/default_python/databricks.yml index 0e9d0ced..079edb90 100644 --- a/default_python/databricks.yml +++ b/default_python/databricks.yml @@ -2,9 +2,16 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: default_python + uuid: 87d5a23e-7bc7-4f52-98ee-e374b67d5681 + +artifacts: + python_artifact: + type: whl + build: uv build --wheel include: - resources/*.yml + - resources/*/*.yml targets: dev: @@ -21,10 +28,8 @@ targets: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/default_python/pyproject.toml b/default_python/pyproject.toml new file mode 100644 index 00000000..dda79245 --- /dev/null +++ b/default_python/pyproject.toml @@ -0,0 +1,35 @@ +[project] +name = "default_python" +version = "0.0.1" +authors = [{ name = "user@company.com" }] +requires-python = ">= 3.11" + +[dependency-groups] +dev = [ + "pytest", + + # Code completion support for DLT, also install databricks-connect + "databricks-dlt", + + # databricks-connect can be used to run parts of this project locally. + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", +] + +[tool.pytest.ini_options] +pythonpath = "src" +testpaths = [ + "tests", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/default_python"] + +[project.scripts] +main = "default_python.main:main" diff --git a/default_python/pytest.ini b/default_python/pytest.ini deleted file mode 100644 index 80432c22..00000000 --- a/default_python/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -testpaths = tests -pythonpath = src diff --git a/default_python/requirements-dev.txt b/default_python/requirements-dev.txt deleted file mode 100644 index 0ffbf6ae..00000000 --- a/default_python/requirements-dev.txt +++ /dev/null @@ -1,29 +0,0 @@ -## requirements-dev.txt: dependencies for local development. 
-## -## For defining dependencies used by jobs in Databricks Workflows, see -## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - -## Add code completion support for DLT -databricks-dlt - -## pytest is the default package used for testing -pytest - -## Dependencies for building wheel files -setuptools -wheel - -## databricks-connect can be used to run parts of this project locally. -## See https://docs.databricks.com/dev-tools/databricks-connect.html. -## -## databricks-connect is automatically installed if you're using Databricks -## extension for Visual Studio Code -## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). -## -## To manually install databricks-connect, either follow the instructions -## at https://docs.databricks.com/dev-tools/databricks-connect.html -## to install the package system-wide. Or uncomment the line below to install a -## version of db-connect that corresponds to the Databricks Runtime version used -## for this project. -# -# databricks-connect>=15.4,<15.5 diff --git a/default_python/resources/default_python.job.yml b/default_python/resources/default_python.job.yml new file mode 100644 index 00000000..0504090a --- /dev/null +++ b/default_python/resources/default_python.job.yml @@ -0,0 +1,45 @@ +# The main job for default_python. 
+resources: + jobs: + default_python_job: + name: default_python_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + tasks: + - task_key: notebook_task + notebook_task: + notebook_path: ../src/notebook.ipynb + + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_pipeline.id} + + - task_key: main_task + depends_on: + - task_key: refresh_pipeline + environment_key: default + python_wheel_task: + package_name: default_python + entry_point: main + + # A list of task execution environment specifications that can be referenced by tasks of this job. + environments: + - environment_key: default + + # Full documentation of this spec can be found at: + # https://docs.databricks.com/api/workspace/jobs/create#environments-spec + spec: + client: "2" + dependencies: + - ../dist/*.whl diff --git a/default_python/resources/default_python_pipeline.yml b/default_python/resources/default_python.pipeline.yml similarity index 67% rename from default_python/resources/default_python_pipeline.yml rename to default_python/resources/default_python.pipeline.yml index be61e864..ea7cdc02 100644 --- a/default_python/resources/default_python_pipeline.yml +++ b/default_python/resources/default_python.pipeline.yml @@ -4,10 +4,11 @@ resources: default_python_pipeline: name: default_python_pipeline catalog: main - target: default_python_${bundle.environment} + schema: default_python_${bundle.target} + serverless: true libraries: - notebook: path: ../src/dlt_pipeline.ipynb configuration: - bundle.sourcePath: /Workspace/${workspace.file_path}/src + bundle.sourcePath: ${workspace.file_path}/src diff --git a/default_python/resources/default_python_job.yml b/default_python/resources/default_python_job.yml deleted 
file mode 100644 index cbe9382a..00000000 --- a/default_python/resources/default_python_job.yml +++ /dev/null @@ -1,53 +0,0 @@ -# The main job for default_python. -resources: - jobs: - default_python_job: - name: default_python_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - email_notifications: - on_failure: - - user@company.com - - tasks: - - task_key: notebook_task - job_cluster_key: job_cluster - notebook_task: - notebook_path: ../src/notebook.ipynb - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.default_python_pipeline.id} - - - task_key: main_task - depends_on: - - task_key: refresh_pipeline - job_cluster_key: job_cluster - python_wheel_task: - package_name: default_python - entry_point: main - libraries: - # By default we just include the .whl file generated for the default_python package. - # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - # for more information on how to add other libraries. - - whl: ../dist/*.whl - - job_clusters: - - job_cluster_key: job_cluster - new_cluster: - spark_version: 15.4.x-scala2.12 - # node_type_id is the cluster node type to use. - # Typical node types on AWS include i3.xlarge; - # Standard_D3_v2 on Azure; - # n1-standard-4 on Google Cloud. 
- node_type_id: i3.xlarge - autoscale: - min_workers: 1 - max_workers: 4 diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb index 8be3b84e..57a9c978 100644 --- a/default_python/scratch/exploration.ipynb +++ b/default_python/scratch/exploration.ipynb @@ -28,10 +28,11 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../src')\n", + "\n", + "sys.path.append(\"../src\")\n", "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.get_taxis().show(10)" ] } ], diff --git a/default_python/setup.py b/default_python/setup.py deleted file mode 100644 index 6ba5b7b4..00000000 --- a/default_python/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -setup.py configuration script describing how to build and package this project. - -This file is primarily used by the setuptools library and typically should not -be executed directly. See README.md for how to deploy, test, and run -the default_python project. -""" -from setuptools import setup, find_packages - -import sys -sys.path.append('./src') - -import datetime -import default_python - -setup( - name="default_python", - # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.) - # to ensure that changes to wheel package are picked up when used on all-purpose clusters - version=default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"), - url="https://databricks.com", - author="user@company.com", - description="wheel file based on default_python/src", - packages=find_packages(where='./src'), - package_dir={'': 'src'}, - entry_points={ - "packages": [ - "main=default_python.main:main" - ] - }, - install_requires=[ - # Dependencies in case the output wheel file is used as a library dependency. 
- # For defining dependencies, when this package is used in Databricks, see: - # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - "setuptools" - ], -) diff --git a/default_python/src/default_python/__init__.py b/default_python/src/default_python/__init__.py index f102a9ca..e69de29b 100644 --- a/default_python/src/default_python/__init__.py +++ b/default_python/src/default_python/__init__.py @@ -1 +0,0 @@ -__version__ = "0.0.1" diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py index c514c6dc..04e8be4d 100644 --- a/default_python/src/default_python/main.py +++ b/default_python/src/default_python/main.py @@ -1,21 +1,14 @@ -from pyspark.sql import SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") +def find_all_taxis() -> DataFrame: + return spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/default_python/src/dlt_pipeline.ipynb b/default_python/src/dlt_pipeline.ipynb index 4216a065..34e1895e 100644 --- a/default_python/src/dlt_pipeline.ipynb +++ b/default_python/src/dlt_pipeline.ipynb @@ -14,7 +14,7 @@ "source": [ "# DLT pipeline\n", "\n", - "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/default_python_pipeline.yml." 
+ "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/default_python.pipeline.yml." ] }, { @@ -34,6 +34,7 @@ "# Import DLT and src/default_python\n", "import dlt\n", "import sys\n", + "\n", "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", "from pyspark.sql.functions import expr\n", "from default_python import main" @@ -55,11 +56,12 @@ "source": [ "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis(spark)\n", + " return main.find_all_taxis()\n", + "\n", "\n", "@dlt.table\n", "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" + " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" ] } ], diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb index daa4e08c..fd49e5b9 100644 --- a/default_python/src/notebook.ipynb +++ b/default_python/src/notebook.ipynb @@ -14,7 +14,7 @@ "source": [ "# Default notebook\n", "\n", - "This default notebook is executed using Databricks Workflows as defined in resources/default_python_job.yml." + "This default notebook is executed using Databricks Workflows as defined in resources/default_python.job.yml." 
] }, { @@ -46,7 +46,7 @@ "source": [ "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py index 66b6f0a2..66c27024 100644 --- a/default_python/tests/main_test.py +++ b/default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from default_python.main import get_taxis, get_spark +from default_python import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5 diff --git a/default_sql/.vscode/settings.json b/default_sql/.vscode/settings.json index 0c9d4119..1b55565d 100644 --- a/default_sql/.vscode/settings.json +++ b/default_sql/.vscode/settings.json @@ -1,6 +1,5 @@ { "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ @@ -17,7 +16,7 @@ "sqltools.connections": [ { "connectionMethod": "VS Code Extension (beta)", - "catalog": "main", + "catalog": "catalog", "previewLimit": 50, "driver": "Databricks", "name": "databricks", diff --git a/default_sql/databricks.yml b/default_sql/databricks.yml index 19d68680..715c2ebe 100644 --- a/default_sql/databricks.yml +++ b/default_sql/databricks.yml @@ -2,9 +2,11 @@ # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: default_sql + uuid: 853cd9bc-631c-4d4f-bca0-3195c7540854 include: - resources/*.yml + - resources/*/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. 
variables: @@ -27,21 +29,19 @@ targets: host: https://company.databricks.com variables: warehouse_id: abcdef1234567890 - catalog: main + catalog: catalog schema: ${workspace.current_user.short_name} prod: mode: production workspace: host: https://company.databricks.com - # We explicitly specify /Users/user@company.com to make sure we only have a single copy. - root_path: /Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. + root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} variables: warehouse_id: abcdef1234567890 - catalog: main + catalog: catalog schema: default permissions: - user_name: user@company.com level: CAN_MANAGE - run_as: - user_name: user@company.com diff --git a/default_sql/resources/default_sql_sql_job.yml b/default_sql/resources/default_sql_sql.job.yml similarity index 100% rename from default_sql/resources/default_sql_sql_job.yml rename to default_sql/resources/default_sql_sql.job.yml diff --git a/default_sql/src/orders_daily.sql b/default_sql/src/orders_daily.sql index 18d6d296..101bdd67 100644 --- a/default_sql/src/orders_daily.sql +++ b/default_sql/src/orders_daily.sql @@ -1,4 +1,4 @@ --- This query is executed using Databricks Workflows (see resources/default_sql_sql_job.yml) +-- This query is executed using Databricks Workflows (see resources/default_sql_sql.job.yml) USE CATALOG {{catalog}}; USE IDENTIFIER({{schema}}); diff --git a/default_sql/src/orders_raw.sql b/default_sql/src/orders_raw.sql index 88fe4298..dfb1ce69 100644 --- a/default_sql/src/orders_raw.sql +++ b/default_sql/src/orders_raw.sql @@ -1,4 +1,4 @@ --- This query is executed using Databricks Workflows (see resources/default_sql_sql_job.yml) +-- This query is executed using Databricks Workflows (see resources/default_sql_sql.job.yml) -- -- The streaming table below ingests all JSON files in 
/databricks-datasets/retail-org/sales_orders/ -- See also https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-streaming-table.html diff --git a/lakeflow_pipelines_python/.gitignore b/lakeflow_pipelines_python/.gitignore new file mode 100644 index 00000000..f6a3b5ff --- /dev/null +++ b/lakeflow_pipelines_python/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +**/!explorations/README.md diff --git a/lakeflow_pipelines_python/.vscode/__builtins__.pyi b/lakeflow_pipelines_python/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/lakeflow_pipelines_python/.vscode/extensions.json b/lakeflow_pipelines_python/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/lakeflow_pipelines_python/.vscode/settings.json b/lakeflow_pipelines_python/.vscode/settings.json new file mode 100644 index 00000000..47d90b62 --- /dev/null +++ b/lakeflow_pipelines_python/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/lakeflow_pipelines_python_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/lakeflow_pipelines_python/README.md b/lakeflow_pipelines_python/README.md new file mode 100644 index 00000000..e727cdbc --- /dev/null +++ b/lakeflow_pipelines_python/README.md @@ -0,0 +1,41 @@ +# lakeflow_pipelines_python + +The 'lakeflow_pipelines_python' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. 
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/lakeflow_pipelines_python/databricks.yml b/lakeflow_pipelines_python/databricks.yml new file mode 100644 index 00000000..5438327d --- /dev/null +++ b/lakeflow_pipelines_python/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for lakeflow_pipelines_python. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: lakeflow_pipelines_python + uuid: 87a174ba-60e4-4867-a140-1936bc9b00de + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: user@company.com + level: CAN_MANAGE + variables: + catalog: catalog + schema: default + notifications: [user@company.com] diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md new file mode 100644 index 00000000..5e845f08 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md @@ -0,0 +1,22 @@ +# lakeflow_pipelines_python_pipeline + +This folder defines all source code for the lakeflow_pipelines_python_pipeline pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `utilities` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_lakeflow_pipelines_python.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml new file mode 100644 index 00000000..c003b37f --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml @@ -0,0 +1,19 @@ +# The job that triggers lakeflow_pipelines_python_pipeline. +resources: + jobs: + lakeflow_pipelines_python_job: + name: lakeflow_pipelines_python_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_pipelines_python_pipeline.id} diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml new file mode 100644 index 00000000..3db75519 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + lakeflow_pipelines_python_pipeline: + name: lakeflow_pipelines_python_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." 
+ libraries: + - glob: + include: transformations/** diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py new file mode 100644 index 00000000..f0db7161 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col +from utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_trips_lakeflow_pipelines_python(): + return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py new file mode 100644 index 00000000..a978db9b --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_lakeflow_pipelines_python(): + # Read from the "sample_trips" table, then sum all the fares + return spark.read.table("sample_trips_lakeflow_pipelines_python").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py new file mode 100644 index 00000000..ff039898 --- /dev/null +++ b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py @@ -0,0 +1,8 @@ +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType + + +@udf(returnType=FloatType()) +def distance_km(distance_miles): + """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" + return distance_miles * 1.60934 diff --git a/lakeflow_pipelines_sql/.gitignore b/lakeflow_pipelines_sql/.gitignore new file mode 100644 index 00000000..f6a3b5ff --- /dev/null +++ b/lakeflow_pipelines_sql/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +**/!explorations/README.md diff --git a/lakeflow_pipelines_sql/.vscode/__builtins__.pyi b/lakeflow_pipelines_sql/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/lakeflow_pipelines_sql/.vscode/extensions.json b/lakeflow_pipelines_sql/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git 
a/lakeflow_pipelines_sql/.vscode/settings.json b/lakeflow_pipelines_sql/.vscode/settings.json new file mode 100644 index 00000000..d0c85bb8 --- /dev/null +++ b/lakeflow_pipelines_sql/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/lakeflow_pipelines_sql_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/lakeflow_pipelines_sql/README.md b/lakeflow_pipelines_sql/README.md new file mode 100644 index 00000000..b4a17db3 --- /dev/null +++ b/lakeflow_pipelines_sql/README.md @@ -0,0 +1,41 @@ +# lakeflow_pipelines_sql + +The 'lakeflow_pipelines_sql' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. 
To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/lakeflow_pipelines_sql/databricks.yml b/lakeflow_pipelines_sql/databricks.yml new file mode 100644 index 00000000..4beb0c58 --- /dev/null +++ b/lakeflow_pipelines_sql/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for lakeflow_pipelines_sql. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: lakeflow_pipelines_sql + uuid: 295000fc-1ea8-4f43-befe-d5fb9f7d4ad4 + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: https://company.databricks.com + variables: + catalog: catalog + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: https://company.databricks.com + # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
+ root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: user@company.com + level: CAN_MANAGE + variables: + catalog: catalog + schema: default + notifications: [user@company.com] diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md new file mode 100644 index 00000000..d01f290a --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md @@ -0,0 +1,21 @@ +# lakeflow_pipelines_sql_pipeline + +This folder defines all source code for the 'lakeflow_pipelines_sql_pipeline' pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_lakeflow_pipelines_sql.sql" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml new file mode 100644 index 00000000..781c9fd6 --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + lakeflow_pipelines_sql_pipeline: + name: lakeflow_pipelines_sql_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." + libraries: + - glob: + include: transformations/** diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql new file mode 100644 index 00000000..09dda0bf --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_lakeflow_pipelines_sql AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql new file mode 100644 index 00000000..5f5c567d --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. 
+
+CREATE MATERIALIZED VIEW sample_zones_lakeflow_pipelines_sql AS
+SELECT
+    pickup_zip,
+    SUM(fare_amount) AS total_fare
+FROM sample_trips_lakeflow_pipelines_sql
+GROUP BY pickup_zip
diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh
new file mode 100755
index 00000000..add4a182
--- /dev/null
+++ b/scripts/update_from_templates.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+set -euo pipefail
+
+function cleanup() {
+  cd "$1"
+
+  # Replace specific names with company.databricks.com, user@company.com, user_name
+  find . -type f -exec sed -i '' -E 's|e2[^[:space:]]*\.com|company.databricks.com|g' {} \;
+  find . -type f -exec sed -i '' -E 's|[A-Za-z0-9._%+-]+@databricks\.com|user@company.com|g' {} \;
+  find . -type f -exec sed -i '' -e "s|$CURRENT_USER_NAME|user_name|g" {} \;
+
+  cd ..
+}
+
+function init_bundle() {
+  local TEMPLATE_NAME="$1"
+  local CONFIG_JSON="$2"
+
+  # Extract project_name from JSON
+  local PROJECT_NAME=$(echo "$CONFIG_JSON" | grep -o '"project_name"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4)
+
+  # Use 'cli' if available, otherwise fall back to 'databricks'
+  local CLI_CMD="databricks"
+  if command -v cli >/dev/null 2>&1; then
+    CLI_CMD="cli"
+  fi
+
+  echo
+  echo "# $PROJECT_NAME"
+
+  rm -rf "$PROJECT_NAME"
+  echo "$CONFIG_JSON" > /tmp/config.json
+  $CLI_CMD bundle init "$TEMPLATE_NAME" --config-file /tmp/config.json
+  cleanup "$PROJECT_NAME"
+}
+
+# Check and extract the host from the databrickscfg file
+if [ ! -f ~/.databrickscfg ]; then
+  echo "Error: ~/.databrickscfg not found." >&2
+  exit 1
+fi
+
+DATABRICKS_HOST=$(grep -A1 '\[DEFAULT\]' ~/.databrickscfg | grep 'host' | awk -F'=' '{print $2}' | xargs || true)
+if [ ! "$DATABRICKS_HOST" ]; then
+  echo "Error: expected ~/.databrickscfg file with a [DEFAULT] section with the first line of the form 'host=...'." >&2
+  exit 1
+fi
+
+# Default "$1" to empty: under 'set -u' a bare "$1" aborts the script when no argument is given.
+if [ -n "${1:-}" ]; then
+  CURRENT_USER_NAME="$1"
+else
+  read -p "Enter the current user name (e.g., 'lennart_kats'): " CURRENT_USER_NAME
+  if [ ! "$CURRENT_USER_NAME" ]; then
+    echo "Error: current user name is required." >&2
+    exit 1
+  fi
+fi
+
+cd $(dirname $0)/..
+
+# init_bundle "default-python" '{
+#     "project_name": "default_python",
+#     "include_notebook": "yes",
+#     "include_dlt": "yes",
+#     "include_python": "yes",
+#     "serverless": "yes"
+# }'
+
+# init_bundle "default-sql" '{
+#     "project_name": "default_sql",
+#     "http_path": "/sql/1.0/warehouses/abcdef1234567890",
+#     "default_catalog": "catalog",
+#     "personal_schemas": "yes, automatically use a schema based on the current user name during development"
+# }'
+
+# init_bundle "dbt-sql" '{
+#     "project_name": "dbt_sql",
+#     "http_path": "/sql/1.0/warehouses/abcdef1234567890",
+#     "default_catalog": "catalog",
+#     "personal_schemas": "yes, use a schema based on the current user name during development"
+# }'
+
+# init_bundle "lakeflow-pipelines" '{
+#     "project_name": "lakeflow_pipelines_sql",
+#     "default_catalog": "catalog",
+#     "personal_schemas": "yes",
+#     "language": "sql"
+# }'
+
+
+# init_bundle "lakeflow-pipelines" '{
+#     "project_name": "lakeflow_pipelines_python",
+#     "default_catalog": "catalog",
+#     "personal_schemas": "yes",
+#     "language": "python"
+# }'
+
+cd contrib
+(
+  init_bundle "templates/data-engineering" '{
+    "project_name": "data_engineering",
+    "default_catalog": "catalog",
+    "personal_schemas": "yes, use a schema based on the current user name during development"
+  }'
+)
+cd ..
\ No newline at end of file From e761cbb2bdfe43c092b31150426feae8cfb9ad45 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 1 Aug 2025 10:11:32 +0200 Subject: [PATCH 2/4] Cleanup --- scripts/update_from_templates.sh | 62 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh index add4a182..a8c8d7e3 100755 --- a/scripts/update_from_templates.sh +++ b/scripts/update_from_templates.sh @@ -60,42 +60,42 @@ fi cd $(dirname $0)/.. -# init_bundle "default-python" '{ -# "project_name": "default_python", -# "include_notebook": "yes", -# "include_dlt": "yes", -# "include_python": "yes", -# "serverless": "yes" -# }' +init_bundle "default-python" '{ + "project_name": "default_python", + "include_notebook": "yes", + "include_dlt": "yes", + "include_python": "yes", + "serverless": "yes" +}' -# init_bundle "default-sql" '{ -# "project_name": "default_sql", -# "http_path": "/sql/1.0/warehouses/abcdef1234567890", -# "default_catalog": "catalog", -# "personal_schemas": "yes, automatically use a schema based on the current user name during development" -# }' +init_bundle "default-sql" '{ + "project_name": "default_sql", + "http_path": "/sql/1.0/warehouses/abcdef1234567890", + "default_catalog": "catalog", + "personal_schemas": "yes, automatically use a schema based on the current user name during development" +}' -# init_bundle "dbt-sql" '{ -# "project_name": "dbt_sql", -# "http_path": "/sql/1.0/warehouses/abcdef1234567890", -# "default_catalog": "catalog", -# "personal_schemas": "yes, use a schema based on the current user name during development" -# }' +init_bundle "dbt-sql" '{ + "project_name": "dbt_sql", + "http_path": "/sql/1.0/warehouses/abcdef1234567890", + "default_catalog": "catalog", + "personal_schemas": "yes, use a schema based on the current user name during development" +}' -# init_bundle "lakeflow-pipelines" '{ -# "project_name": "lakeflow_pipelines_sql", 
-# "default_catalog": "catalog", -# "personal_schemas": "yes", -# "language": "sql" -# }' +init_bundle "lakeflow-pipelines" '{ + "project_name": "lakeflow_pipelines_sql", + "default_catalog": "catalog", + "personal_schemas": "yes", + "language": "sql" +}' -# init_bundle "lakeflow-pipelines" '{ -# "project_name": "lakeflow_pipelines_python", -# "default_catalog": "catalog", -# "personal_schemas": "yes", -# "language": "python" -# }' +init_bundle "lakeflow-pipelines" '{ + "project_name": "lakeflow_pipelines_python", + "default_catalog": "catalog", + "personal_schemas": "yes", + "language": "python" +}' cd contrib ( From ef7ee782915b484cb38d09db9e38a88fa4eca64e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 1 Aug 2025 10:17:27 +0200 Subject: [PATCH 3/4] Use hardcoded UUIDs --- .../lakeflow_pipelines_sql.job.yml | 19 +++++++++++++++++ scripts/update_from_templates.sh | 21 +++++++++++-------- 2 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml new file mode 100644 index 00000000..32ba1ce4 --- /dev/null +++ b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml @@ -0,0 +1,19 @@ +# The job that triggers lakeflow_pipelines_sql_pipeline. 
+resources: + jobs: + lakeflow_pipelines_sql_job: + name: lakeflow_pipelines_sql_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_pipelines_sql_pipeline.id} diff --git a/scripts/update_from_templates.sh b/scripts/update_from_templates.sh index a8c8d7e3..a8b74fcf 100755 --- a/scripts/update_from_templates.sh +++ b/scripts/update_from_templates.sh @@ -4,18 +4,21 @@ set -euo pipefail function cleanup() { cd "$1" + local BUNDLE_UUID="$2" # Replace specific names with company.databricks.com, user@company.com, user_name find . -type f -exec sed -i '' -E 's|e2[^[:space:]]*\.com|company.databricks.com|g' {} \; find . -type f -exec sed -i '' -E 's|[A-Za-z0-9._%+-]+@databricks\.com|user@company.com|g' {} \; - find . -type f -exec sed -i '' -e "s|$CURRENT_USER_NAME|user_name|g" {} \; + find . -type f -exec sed -i '' -e "s|$CURRENT_USER_NAME|user_name|g" {} \; + find . -type f -exec sed -i '' -E "s|^([[:space:]]*uuid:[[:space:]]*)[^[:space:]]*[[:space:]]*$|\\1$BUNDLE_UUID|g" {} \; cd .. } function init_bundle() { local TEMPLATE_NAME="$1" - local CONFIG_JSON="$2" + local BUNDLE_UUID="${2:-}" + local CONFIG_JSON="$3" # Extract project_name from JSON local PROJECT_NAME=$(echo "$CONFIG_JSON" | grep -o '"project_name"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4) @@ -32,7 +35,7 @@ function init_bundle() { rm -rf "$PROJECT_NAME" echo "$CONFIG_JSON" > /tmp/config.json $CLI_CMD bundle init "$TEMPLATE_NAME" --config-file /tmp/config.json - cleanup "$PROJECT_NAME" + cleanup "$PROJECT_NAME" "$BUNDLE_UUID" } # Check and extract the host from the databrickscfg file @@ -60,7 +63,7 @@ fi cd $(dirname $0)/.. 
-init_bundle "default-python" '{ +init_bundle "default-python" "87d5a23e-7bc7-4f52-98ee-e374b67d5681" '{ "project_name": "default_python", "include_notebook": "yes", "include_dlt": "yes", @@ -68,21 +71,21 @@ init_bundle "default-python" '{ "serverless": "yes" }' -init_bundle "default-sql" '{ +init_bundle "default-sql" "853cd9bc-631c-4d4f-bca0-3195c7540854" '{ "project_name": "default_sql", "http_path": "/sql/1.0/warehouses/abcdef1234567890", "default_catalog": "catalog", "personal_schemas": "yes, automatically use a schema based on the current user name during development" }' -init_bundle "dbt-sql" '{ +init_bundle "dbt-sql" "5e5ca8d5-0388-473e-84a1-1414ed89c5df" '{ "project_name": "dbt_sql", "http_path": "/sql/1.0/warehouses/abcdef1234567890", "default_catalog": "catalog", "personal_schemas": "yes, use a schema based on the current user name during development" }' -init_bundle "lakeflow-pipelines" '{ +init_bundle "lakeflow-pipelines" "295000fc-1ea8-4f43-befe-d5fb9f7d4ad4" '{ "project_name": "lakeflow_pipelines_sql", "default_catalog": "catalog", "personal_schemas": "yes", @@ -90,7 +93,7 @@ init_bundle "lakeflow-pipelines" '{ }' -init_bundle "lakeflow-pipelines" '{ +init_bundle "lakeflow-pipelines" "87a174ba-60e4-4867-a140-1936bc9b00de" '{ "project_name": "lakeflow_pipelines_python", "default_catalog": "catalog", "personal_schemas": "yes", @@ -99,7 +102,7 @@ init_bundle "lakeflow-pipelines" '{ cd contrib ( - init_bundle "templates/data-engineering" '{ + init_bundle "templates/data-engineering" "e5f6g7h8-i9j0-1234-efgh-567890123456" '{ "project_name": "data_engineering", "default_catalog": "catalog", "personal_schemas": "yes, use a schema based on the current user name during development" From 88c51249047d7f69c073a25841f30edafd8a0c5b Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 1 Aug 2025 10:21:43 +0200 Subject: [PATCH 4/4] Add ruff exception --- .ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/.ruff.toml b/.ruff.toml index 6845a35c..25bebc2a 
100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -5,4 +5,5 @@ exclude = [ "default_python/*", "default_sql/*", "mlops_stacks/*", + "lakeflow_pipelines_python/*", ]