From 4f13497e5dafe81409399b88fc3479fda241e8a9 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 26 Sep 2025 12:15:42 +0200 Subject: [PATCH] Update default templates --- default_python/README.md | 50 +++++++---- default_python/pyproject.toml | 20 ++--- .../resources/default_python.job.yml | 2 +- .../resources/default_python.pipeline.yml | 2 +- default_python/scratch/exploration.ipynb | 2 +- default_python/src/default_python/main.py | 19 +--- default_python/src/dlt_pipeline.ipynb | 90 ------------------- default_python/src/notebook.ipynb | 2 +- default_python/tests/main_test.py | 6 +- lakeflow_pipelines_python/.gitignore | 2 + .../.vscode/extensions.json | 4 +- .../.vscode/settings.json | 28 ++++-- lakeflow_pipelines_python/README.md | 49 ++++++---- lakeflow_pipelines_python/databricks.yml | 11 +-- .../README.md | 22 ----- .../lakeflow_pipelines_python.job.yml | 19 ---- .../lakeflow_pipelines_python.pipeline.yml | 12 --- .../sample_trips_lakeflow_pipelines_python.py | 13 --- .../sample_zones_lakeflow_pipelines_python.py | 13 --- .../utilities/utils.py | 8 -- lakeflow_pipelines_sql/.gitignore | 2 + .../.vscode/extensions.json | 4 +- lakeflow_pipelines_sql/.vscode/settings.json | 28 ++++-- lakeflow_pipelines_sql/README.md | 49 ++++++---- lakeflow_pipelines_sql/databricks.yml | 11 +-- .../lakeflow_pipelines_sql_pipeline/README.md | 21 ----- .../lakeflow_pipelines_sql.job.yml | 19 ---- .../lakeflow_pipelines_sql.pipeline.yml | 12 --- .../sample_trips_lakeflow_pipelines_sql.sql | 9 -- .../sample_zones_lakeflow_pipelines_sql.sql | 10 --- 30 files changed, 175 insertions(+), 364 deletions(-) delete mode 100644 default_python/src/dlt_pipeline.ipynb delete mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md delete mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml delete mode 100644 
lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml delete mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py delete mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py delete mode 100644 lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py delete mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md delete mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml delete mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml delete mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql delete mode 100644 lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql diff --git a/default_python/README.md b/default_python/README.md index 74a1f42d..f5a6a220 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -2,18 +2,39 @@ The 'default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format used for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. 
+ +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + + +Dependencies for this project should be installed using uv: -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. -2. Authenticate to your Databricks workspace, if you have not done so already: +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +44,9 @@ The 'default_python' project was generated by using the default-python template. This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workspace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,17 +56,12 @@ The 'default_python' project was generated by using the default-python template. is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. 
Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/default_python/pyproject.toml b/default_python/pyproject.toml index d32e108a..279d7f32 100644 --- a/default_python/pyproject.toml +++ b/default_python/pyproject.toml @@ -2,26 +2,20 @@ name = "default_python" version = "0.0.1" authors = [{ name = "user@company.com" }] -requires-python = ">= 3.11" +requires-python = ">=3.10,<=3.13" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", - # Code completion support for DLT, also install databricks-connect + # Code completion support for Lakeflow Declarative Pipelines, also install databricks-connect "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). - # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. 
- # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect>=15.4,<15.5", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] diff --git a/default_python/resources/default_python.job.yml b/default_python/resources/default_python.job.yml index 0504090a..d99eb4dd 100644 --- a/default_python/resources/default_python.job.yml +++ b/default_python/resources/default_python.job.yml @@ -40,6 +40,6 @@ resources: # Full documentation of this spec can be found at: # https://docs.databricks.com/api/workspace/jobs/create#environments-spec spec: - client: "2" + environment_version: "2" dependencies: - ../dist/*.whl diff --git a/default_python/resources/default_python.pipeline.yml b/default_python/resources/default_python.pipeline.yml index ea7cdc02..7954922b 100644 --- a/default_python/resources/default_python.pipeline.yml +++ b/default_python/resources/default_python.pipeline.yml @@ -8,7 +8,7 @@ resources: serverless: true libraries: - notebook: - path: ../src/dlt_pipeline.ipynb + path: ../src/pipeline.ipynb configuration: bundle.sourcePath: ${workspace.file_path}/src diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb index f7832011..57a9c978 100644 --- a/default_python/scratch/exploration.ipynb +++ b/default_python/scratch/exploration.ipynb @@ -32,7 +32,7 @@ "sys.path.append(\"../src\")\n", "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py index 5ae344c7..04e8be4d 100644 --- a/default_python/src/default_python/main.py +++ b/default_python/src/default_python/main.py @@ -1,24 +1,13 @@ -from pyspark.sql import 
SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: +def find_all_taxis() -> DataFrame: return spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) if __name__ == "__main__": diff --git a/default_python/src/dlt_pipeline.ipynb b/default_python/src/dlt_pipeline.ipynb deleted file mode 100644 index eb93d319..00000000 --- a/default_python/src/dlt_pipeline.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# DLT pipeline\n", - "\n", - "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/default_python.pipeline.yml." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Import DLT and src/default_python\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from default_python import main" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "@dlt.view\n", - "def taxi_raw():\n", - " return main.get_taxis(spark)\n", - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "dlt_pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb index fe99fd32..fd49e5b9 100644 --- a/default_python/src/notebook.ipynb +++ b/default_python/src/notebook.ipynb @@ -46,7 +46,7 @@ "source": [ "from default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py index 66b6f0a2..66c27024 100644 --- a/default_python/tests/main_test.py +++ 
b/default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from default_python.main import get_taxis, get_spark +from default_python import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5 diff --git a/lakeflow_pipelines_python/.gitignore b/lakeflow_pipelines_python/.gitignore index f6a3b5ff..e566c51f 100644 --- a/lakeflow_pipelines_python/.gitignore +++ b/lakeflow_pipelines_python/.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/lakeflow_pipelines_python/.vscode/extensions.json b/lakeflow_pipelines_python/.vscode/extensions.json index 5d15eba3..75a111a6 100644 --- a/lakeflow_pipelines_python/.vscode/extensions.json +++ b/lakeflow_pipelines_python/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/lakeflow_pipelines_python/.vscode/settings.json b/lakeflow_pipelines_python/.vscode/settings.json index 47d90b62..c49593bc 100644 --- a/lakeflow_pipelines_python/.vscode/settings.json +++ b/lakeflow_pipelines_python/.vscode/settings.json @@ -1,19 +1,37 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." 
], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/lakeflow_pipelines_python_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, diff --git a/lakeflow_pipelines_python/README.md b/lakeflow_pipelines_python/README.md index e727cdbc..b4270849 100644 --- a/lakeflow_pipelines_python/README.md +++ b/lakeflow_pipelines_python/README.md @@ -2,38 +2,53 @@ The 'lakeflow_pipelines_python' project was generated by using the Lakeflow Pipelines template. -## Setup +* `lib/`: Python source code for this project. +* `lib/shared`: Shared source code across all jobs/pipelines/etc. +* `resources/pipelines_python_etl`: Pipeline code and assets for the pipelines_python_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. 
Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks auth login - ``` +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -## Deploying resources +# Using this project using the CLI -1. To deploy a development copy of this project, type: +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) -2. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` + This deploys everything that's defined for this project. + For example, the default template would deploy a pipeline called + `[dev yourname] pipelines_python_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -3. Use the "summary" comand to review everything that was deployed: +3. 
Similarly, to deploy a production copy, type: ``` - $ databricks bundle summary + $ databricks bundle deploy --target prod ``` + Note that the default template includes a job that runs the pipeline every day + (defined in resources/pipelines_python_etl/pipelines_python_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 4. To run a job or pipeline, use the "run" command: ``` diff --git a/lakeflow_pipelines_python/databricks.yml b/lakeflow_pipelines_python/databricks.yml index 5438327d..44beb468 100644 --- a/lakeflow_pipelines_python/databricks.yml +++ b/lakeflow_pipelines_python/databricks.yml @@ -14,8 +14,6 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: @@ -30,18 +28,15 @@ targets: variables: catalog: catalog schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: https://company.databricks.com # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: catalog + schema: prod permissions: - user_name: user@company.com level: CAN_MANAGE - variables: - catalog: catalog - schema: default - notifications: [user@company.com] diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md deleted file mode 100644 index 5e845f08..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# lakeflow_pipelines_python_pipeline - -This folder defines all source code for the lakeflow_pipelines_python_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. -- `data_sources` (optional): View definitions describing the source data for this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_lakeflow_pipelines_python.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. -* Use `Run file` to run and preview a single transformation. -* Use `Run pipeline` to run _all_ transformations in the entire pipeline. -* Use `+ Add` in the file browser to add a new data set definition. -* Use `Schedule` to run the pipeline on a schedule! - -For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml deleted file mode 100644 index c003b37f..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.job.yml +++ /dev/null @@ -1,19 +0,0 @@ -# The job that triggers lakeflow_pipelines_python_pipeline. -resources: - jobs: - lakeflow_pipelines_python_job: - name: lakeflow_pipelines_python_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - email_notifications: - on_failure: ${var.notifications} - - tasks: - - task_key: refresh_pipeline - pipeline_task: - pipeline_id: ${resources.pipelines.lakeflow_pipelines_python_pipeline.id} diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml deleted file mode 100644 index 3db75519..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/lakeflow_pipelines_python.pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - lakeflow_pipelines_python_pipeline: - name: lakeflow_pipelines_python_pipeline - serverless: true - channel: "PREVIEW" - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." 
- libraries: - - glob: - include: transformations/** diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py deleted file mode 100644 index f0db7161..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_trips_lakeflow_pipelines_python.py +++ /dev/null @@ -1,13 +0,0 @@ -import dlt -from pyspark.sql.functions import col -from utilities import utils - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. - - -@dlt.table -def sample_trips_lakeflow_pipelines_python(): - return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py deleted file mode 100644 index a978db9b..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/transformations/sample_zones_lakeflow_pipelines_python.py +++ /dev/null @@ -1,13 +0,0 @@ -import dlt -from pyspark.sql.functions import col, sum - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. 
- - -@dlt.table -def sample_zones_lakeflow_pipelines_python(): - # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_lakeflow_pipelines_python").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py b/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py deleted file mode 100644 index ff039898..00000000 --- a/lakeflow_pipelines_python/resources/lakeflow_pipelines_python_pipeline/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/lakeflow_pipelines_sql/.gitignore b/lakeflow_pipelines_sql/.gitignore index f6a3b5ff..e566c51f 100644 --- a/lakeflow_pipelines_sql/.gitignore +++ b/lakeflow_pipelines_sql/.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/lakeflow_pipelines_sql/.vscode/extensions.json b/lakeflow_pipelines_sql/.vscode/extensions.json index 5d15eba3..75a111a6 100644 --- a/lakeflow_pipelines_sql/.vscode/extensions.json +++ b/lakeflow_pipelines_sql/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/lakeflow_pipelines_sql/.vscode/settings.json b/lakeflow_pipelines_sql/.vscode/settings.json index d0c85bb8..c49593bc 100644 --- a/lakeflow_pipelines_sql/.vscode/settings.json +++ b/lakeflow_pipelines_sql/.vscode/settings.json @@ -1,19 +1,37 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", 
"jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/lakeflow_pipelines_sql_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, diff --git a/lakeflow_pipelines_sql/README.md b/lakeflow_pipelines_sql/README.md index b4a17db3..20eba12a 100644 --- a/lakeflow_pipelines_sql/README.md +++ b/lakeflow_pipelines_sql/README.md @@ -2,38 +2,53 @@ The 'lakeflow_pipelines_sql' project was generated by using the Lakeflow Pipelines template. -## Setup +* `lib/`: Python source code for this project. +* `lib/shared`: Shared source code across all jobs/pipelines/etc. +* `resources/pipelines_sql_etl`: Pipeline code and assets for the pipelines_sql_etl pipeline. 
+* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks auth login - ``` +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -## Deploying resources +# Using this project using the CLI -1. To deploy a development copy of this project, type: +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) -2. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` + This deploys everything that's defined for this project. + For example, the default template would deploy a pipeline called + `[dev yourname] pipelines_sql_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -3. Use the "summary" comand to review everything that was deployed: +3. 
Similarly, to deploy a production copy, type: ``` - $ databricks bundle summary + $ databricks bundle deploy --target prod ``` + Note that the default template includes a job that runs the pipeline every day + (defined in resources/pipelines_sql_etl/pipelines_sql_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 4. To run a job or pipeline, use the "run" command: ``` diff --git a/lakeflow_pipelines_sql/databricks.yml b/lakeflow_pipelines_sql/databricks.yml index 4beb0c58..4c4d7a91 100644 --- a/lakeflow_pipelines_sql/databricks.yml +++ b/lakeflow_pipelines_sql/databricks.yml @@ -14,8 +14,6 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: @@ -30,18 +28,15 @@ targets: variables: catalog: catalog schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: https://company.databricks.com # We explicitly deploy to /Workspace/Users/user@company.com to make sure we only have a single copy. 
root_path: /Workspace/Users/user@company.com/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: catalog + schema: prod permissions: - user_name: user@company.com level: CAN_MANAGE - variables: - catalog: catalog - schema: default - notifications: [user@company.com] diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md deleted file mode 100644 index d01f290a..00000000 --- a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# lakeflow_pipelines_sql_pipeline - -This folder defines all source code for the 'lakeflow_pipelines_sql_pipeline' pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `data_sources` (optional): View definitions describing the source data for this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_lakeflow_pipelines_sql.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. -* Use `Run file` to run and preview a single transformation. -* Use `Run pipeline` to run _all_ transformations in the entire pipeline. -* Use `+ Add` in the file browser to add a new data set definition. -* Use `Schedule` to run the pipeline on a schedule! - -For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml deleted file mode 100644 index 32ba1ce4..00000000 --- a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.job.yml +++ /dev/null @@ -1,19 +0,0 @@ -# The job that triggers lakeflow_pipelines_sql_pipeline. -resources: - jobs: - lakeflow_pipelines_sql_job: - name: lakeflow_pipelines_sql_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - email_notifications: - on_failure: ${var.notifications} - - tasks: - - task_key: refresh_pipeline - pipeline_task: - pipeline_id: ${resources.pipelines.lakeflow_pipelines_sql_pipeline.id} diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml deleted file mode 100644 index 781c9fd6..00000000 --- a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/lakeflow_pipelines_sql.pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - lakeflow_pipelines_sql_pipeline: - name: lakeflow_pipelines_sql_pipeline - serverless: true - channel: "PREVIEW" - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." 
- libraries: - - glob: - include: transformations/** diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql deleted file mode 100644 index 09dda0bf..00000000 --- a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_trips_lakeflow_pipelines_sql.sql +++ /dev/null @@ -1,9 +0,0 @@ --- This file defines a sample transformation. --- Edit the sample below or add new transformations --- using "+ Add" in the file browser. - -CREATE MATERIALIZED VIEW sample_trips_lakeflow_pipelines_sql AS -SELECT - pickup_zip, - fare_amount -FROM samples.nyctaxi.trips diff --git a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql b/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql deleted file mode 100644 index 5f5c567d..00000000 --- a/lakeflow_pipelines_sql/resources/lakeflow_pipelines_sql_pipeline/transformations/sample_zones_lakeflow_pipelines_sql.sql +++ /dev/null @@ -1,10 +0,0 @@ --- This file defines a sample transformation. --- Edit the sample below or add new transformations --- using "+ Add" in the file browser. - -CREATE MATERIALIZED VIEW sample_zones_lakeflow_pipelines_sql AS -SELECT - pickup_zip, - SUM(fare_amount) AS total_fare -FROM sample_trips_lakeflow_pipelines_sql -GROUP BY pickup_zip